blake3-1.8.1/.cargo/config.toml000064400000000000000000000000531046102023000143050ustar 00000000000000[target.wasm32-wasip1] runner = "wasmtime" blake3-1.8.1/.cargo_vcs_info.json0000644000000001360000000000100122040ustar { "git": { "sha1": "ad639b126ef9b5f3b131093363cc3bb6bba4c3bf" }, "path_in_vcs": "" }blake3-1.8.1/.git-blame-ignore-revs000064400000000000000000000001141046102023000150700ustar 00000000000000# CMakeLists.txt whitespace fixups 3e14f865d30271c74fc68d417af488ea91b66d48 blake3-1.8.1/.github/workflows/build_b3sum.py000064400000000000000000000021721046102023000171550ustar 00000000000000#! /usr/bin/env python3 from pathlib import Path import platform import shutil import subprocess import sys ROOT = Path(__file__).parent.parent.parent RUST_TARGET = sys.argv[1] subprocess.run( ["cargo", "build", "--target", sys.argv[1], "--release"], cwd=ROOT / "b3sum" ) if platform.system() == "Windows": original_exe_name = "b3sum.exe" else: original_exe_name = "b3sum" if platform.system() == "Windows": new_exe_name = "b3sum_windows_x64_bin.exe" elif platform.system() == "Darwin": new_exe_name = "b3sum_macos_x64_bin" elif platform.system() == "Linux": new_exe_name = "b3sum_linux_x64_bin" else: raise RuntimeError("Unexpected platform: " + platform.system()) # Copy the built binary so that it has the upload name we want. out_dir = ROOT / "b3sum/target" / RUST_TARGET / "release" original_exe_path = str(out_dir / original_exe_name) new_exe_path = str(out_dir / new_exe_name) print("copying", repr(original_exe_path), "to", repr(new_exe_path)) shutil.copyfile(original_exe_path, new_exe_path) # This lets the subsequent upload step get the filepath. 
print("::set-output name=bin_path::" + new_exe_path) blake3-1.8.1/.github/workflows/ci.yml000064400000000000000000000445661046102023000155260ustar 00000000000000name: tests on: push: branches: - "*" # not on tags pull_request: env: BLAKE3_CI: "1" RUSTFLAGS: "-D warnings" RUST_BACKTRACE: "1" jobs: library_tests: name: ${{ matrix.target.name }} ${{ matrix.channel }} runs-on: ${{ matrix.target.os }} strategy: fail-fast: false matrix: target: [ { "os": "ubuntu-latest", "toolchain": "x86_64-unknown-linux-gnu", "name": "Linux GNU" }, { "os": "macOS-latest", "toolchain": "aarch64-apple-darwin", "name": "macOS" }, { "os": "windows-latest", "toolchain": "x86_64-pc-windows-msvc", "name": "Windows MSVC" }, { "os": "windows-latest", "toolchain": "x86_64-pc-windows-gnu", "name": "Windows GNU" } ] channel: ["stable", "beta", "nightly"] steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} # Print the compiler version, for debugging. - name: print compiler version run: cargo run --quiet working-directory: ./tools/compiler_version # Print out instruction set support, for debugging. - name: print instruction set support run: cargo run --quiet working-directory: ./tools/instruction_set_support # Default tests plus Rayon and trait implementations. - run: cargo test --features=rayon,traits-preview,serde,zeroize # Same but with only one thread in the Rayon pool. This can find deadlocks. - name: "again with RAYON_NUM_THREADS=1" run: cargo test --features=rayon,traits-preview,serde,zeroize env: RAYON_NUM_THREADS: 1 # The mmap feature by itself (update_mmap_rayon is omitted). - run: cargo test --features=mmap # All public features put together. - run: cargo test --features=mmap,rayon,traits-preview,serde,zeroize # no_std tests. 
- run: cargo test --no-default-features # A matrix of different test settings: # - debug vs release # - assembly vs Rust+C intrinsics vs pure Rust intrinsics # - different levels of SIMD support # # Full SIMD support. - run: cargo test --features= - run: cargo test --features=prefer_intrinsics - run: cargo test --features=pure - run: cargo test --features= --release - run: cargo test --features=prefer_intrinsics --release - run: cargo test --features=pure --release # No AVX-512. - run: cargo test --features=no_avx512 - run: cargo test --features=no_avx512,prefer_intrinsics - run: cargo test --features=no_avx512,pure - run: cargo test --features=no_avx512 --release - run: cargo test --features=no_avx512,prefer_intrinsics --release - run: cargo test --features=no_avx512,pure --release # No AVX2. - run: cargo test --features=no_avx512,no_avx2 - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics - run: cargo test --features=no_avx512,no_avx2,pure - run: cargo test --features=no_avx512,no_avx2 --release - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics --release - run: cargo test --features=no_avx512,no_avx2,pure --release # No SSE4.1 - run: cargo test --features=no_avx512,no_avx2,no_sse41 - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure - run: cargo test --features=no_avx512,no_avx2,no_sse41 --release - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics --release - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure --release # No SSE2 - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2 - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,pure - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2 --release - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics --release - run: cargo test 
--features=no_avx512,no_avx2,no_sse41,no_sse2,pure --release # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains. - run: cargo test --benches --features=rayon env: RUSTC_BOOTSTRAP: 1 # Test vectors. - name: test vectors run: cargo test working-directory: ./test_vectors - name: test vectors intrinsics run: cargo test --features=prefer_intrinsics working-directory: ./test_vectors - name: test vectors pure run: cargo test --features=pure working-directory: ./test_vectors # Test C code. - name: cargo test C bindings assembly run: cargo test working-directory: ./c/blake3_c_rust_bindings - name: cargo test C bindings intrinsics run: cargo test --features=prefer_intrinsics working-directory: ./c/blake3_c_rust_bindings - name: cargo test C bindings no AVX-512 run: cargo test working-directory: ./c/blake3_c_rust_bindings env: CFLAGS: -DBLAKE3_NO_AVX512 - name: cargo test C bindings no AVX2 run: cargo test working-directory: ./c/blake3_c_rust_bindings env: CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 - name: cargo test C bindings no SSE41 run: cargo test working-directory: ./c/blake3_c_rust_bindings env: CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_SSE41 - name: cargo test C bindings no SSE2 run: cargo test working-directory: ./c/blake3_c_rust_bindings env: CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_SSE2 # Reference impl doc test. - name: reference impl doc test run: cargo test working-directory: ./reference_impl msrv_build: name: MSRV build ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: ["ubuntu-latest", "macOS-latest", "windows-latest"] steps: - uses: actions/checkout@v4 # The current MSRV. This crate doesn't have an official MSRV policy, # but in practice we'll probably do what libc does: # https://github.com/rust-lang/libs-team/issues/72. # This test target is here so that we notice if we accidentally bump # the MSRV, but it's not a promise that we won't bump it. 
- uses: dtolnay/rust-toolchain@1.66.1 - run: cargo build --features=mmap,rayon,traits-preview,serde,zeroize b3sum_tests: name: b3sum ${{ matrix.target.name }} ${{ matrix.channel }} runs-on: ${{ matrix.target.os }} strategy: fail-fast: false matrix: target: [ { "os": "ubuntu-latest", "toolchain": "x86_64-unknown-linux-gnu", "name": "Linux GNU" }, { "os": "macOS-latest", "toolchain": "aarch64-apple-darwin", "name": "macOS" }, { "os": "windows-latest", "toolchain": "x86_64-pc-windows-msvc", "name": "Windows MSVC" }, { "os": "windows-latest", "toolchain": "x86_64-pc-windows-gnu", "name": "Windows GNU" } ] channel: ["stable", "beta", "nightly"] steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} # Test b3sum. - name: test b3sum run: cargo test working-directory: ./b3sum - name: test b3sum --no-default-features run: cargo test --no-default-features working-directory: ./b3sum cross_tests: name: cross ${{ matrix.arch }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: arch: - i586-unknown-linux-musl - i686-unknown-linux-musl - armv7-unknown-linux-gnueabihf - aarch64-unknown-linux-gnu # Big-endian targets. See https://twitter.com/burntsushi5/status/1695483429997945092. - powerpc64-unknown-linux-gnu - s390x-unknown-linux-gnu steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - run: cargo install cross # Test the portable implementation on everything. - run: cross test --target ${{ matrix.arch }} # Test building for ancient i386 processors without guaranteed SSE2 support. - run: cross rustc --target ${{ matrix.arch }} -- -C target-cpu=i386 if: startsWith(matrix.arch, 'i586-') || startsWith(matrix.arch, 'i686-') # Test the NEON implementation on ARM targets. 
- run: cross test --target ${{ matrix.arch }} --features=neon if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-') # NEON is enabled by default on aarch64, disabling it through the no_neon feature. - run: cross test --target ${{ matrix.arch }} --features=no_neon if: startsWith(matrix.arch, 'aarch64-') # Test vectors. Note that this uses a hacky script due to path dependency limitations. - run: ./test_vectors/cross_test.sh --target ${{ matrix.arch }} # C code. Same issue with the hacky script. - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }} - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }} --features=neon if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-') wasm_tests: name: WASM tests runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: targets: wasm32-wasip1 - name: install Wasmtime run: | curl https://wasmtime.dev/install.sh -sSf | bash echo PATH: $PATH mkdir -p ~/.local/bin ln -s ~/.wasmtime/bin/wasmtime ~/.local/bin/wasmtime - run: cargo test --target wasm32-wasip1 - run: cargo test --target wasm32-wasip1 --no-default-features - run: cargo test --target wasm32-wasip1 --features wasm32_simd - run: cargo test --target wasm32-wasip1 --no-default-features --features wasm32_simd - run: cargo test --target wasm32-wasip1 --benches --features=wasm32_simd env: RUSTC_BOOTSTRAP: 1 - name: test vectors w/o SIMD run: cargo test --target wasm32-wasip1 working-directory: ./test_vectors - name: test vectors w/ SIMD run: cargo test --target wasm32-wasip1 --features wasm32_simd working-directory: ./test_vectors cargo_xwin_test: name: cargo xwin test runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - run: docker run -v $(pwd):/io -w /io messense/cargo-xwin cargo xwin test --target x86_64-pc-windows-msvc --features=mmap,rayon,traits-preview,serde,zeroize # Currently only on x86. 
c_tests: name: C tests SIMD=${{ matrix.simd }} TBB=${{ matrix.use_tbb }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: use_tbb: ["OFF", "ON"] simd: ["x86-intrinsics", "amd64-asm"] steps: - uses: actions/checkout@v4 - run: | sudo apt-get update sudo apt-get install ninja-build libtbb-dev libtbb12 # Test the intrinsics-based and assembly-based implementations. - run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" cmake --build c/build --target test cat c/build/Testing/Temporary/LastTest.log - run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" -DBLAKE3_NO_SSE2=1 cmake --build c/build --target test cat c/build/Testing/Temporary/LastTest.log - run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" -DBLAKE3_NO_SSE2=1 -DBLAKE3_NO_SSE41=1 cmake --build c/build --target test cat c/build/Testing/Temporary/LastTest.log - run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" "-DBLAKE3_NO_SSE2=1" "-DBLAKE3_NO_SSE41=1" "-DBLAKE3_NO_AVX2=1" cmake --build c/build --target test cat c/build/Testing/Temporary/LastTest.log - run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" "-DBLAKE3_NO_SSE2=1" "-DBLAKE3_NO_SSE41=1" "-DBLAKE3_NO_AVX2=1" "-DBLAKE3_NO_AVX512=1" cmake --build c/build --target test cat c/build/Testing/Temporary/LastTest.log # Test with TBB disabled/enabled. - run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_USE_TBB=${{ matrix.use_tbb }}" cmake --build c/build --target test cat c/build/Testing/Temporary/LastTest.log # Build the example with TBB disabled/enabled. 
- run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON -DBLAKE3_EXAMPLES=ON "-DBLAKE3_USE_TBB=${{ matrix.use_tbb }}" cmake --build c/build --target blake3-example # Note that this jobs builds AArch64 binaries from an x86_64 host. build_apple_silicon: name: build for Apple Silicon runs-on: macOS-latest strategy: fail-fast: false steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: targets: aarch64-apple-darwin - name: build blake3 run: cargo build --target aarch64-apple-darwin - name: build b3sum run: cargo build --target aarch64-apple-darwin working-directory: ./b3sum build_tinycc: name: build with the Tiny C Compiler runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: install TCC run: sudo apt-get install -y tcc - name: compile run: > tcc -shared -O3 -o libblake3.so \ -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512 \ blake3.c blake3_dispatch.c blake3_portable.c working-directory: ./c # See https://github.com/BLAKE3-team/BLAKE3/issues/271 for why we test this. # Note that this isn't guaranteed to execute on an AVX-512-supporting server, # but hopefully at least some of the time it will. gcc54: name: "compile and test with GCC 5.4" runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: addnab/docker-run-action@v3 with: image: gcc:5.4 options: -v ${{ github.workspace }}:/work run: | cat /proc/cpuinfo curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal cd /work ~/.cargo/bin/cargo test --features prefer_intrinsics # CMake build test (Library only). 
cmake_current_build: name: CMake ${{ matrix.os }} CC=${{ matrix.toolchain.cc }} CXX=${{ matrix.toolchain.cxx }} TBB=${{ matrix.use_tbb }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: cmakeVersion: [latest] ninjaVersion: [latest] os: [ubuntu-latest, macOS-latest, windows-latest] toolchain: [ { cc: cl, cxx: cl }, { cc: clang, cxx: clang++ }, { cc: clang-cl, cxx: clang-cl }, { cc: gcc, cxx: g++ }, ] use_tbb: [OFF, ON] exclude: - os: macOS-latest toolchain: { cc: cl, cxx: cl } - os: macOS-latest toolchain: { cc: clang-cl, cxx: clang-cl } - os: ubuntu-latest toolchain: { cc: cl, cxx: cl } - os: ubuntu-latest toolchain: { cc: clang-cl, cxx: clang-cl } - os: windows-latest toolchain: { cc: clang, cxx: clang++ } use_tbb: ON - os: windows-latest toolchain: { cc: gcc, cxx: g++ } use_tbb: ON steps: - uses: actions/checkout@v4 - uses: lukka/get-cmake@5f6e04f5267c8133f1273bf2103583fc72c46b17 with: cmakeVersion: ${{ matrix.cmakeVersion }} ninjaVersion: ${{ matrix.ninjaVersion }} - if: matrix.os == 'macOS-latest' name: Install dependencies on macOS run: | brew update brew install tbb - if: matrix.os == 'ubuntu-latest' name: Install dependencies on Linux run: | sudo apt-get update sudo apt-get install libtbb-dev libtbb12 - name: CMake generation, build, install run: | ${{ matrix.os != 'windows-latest' || '& "C:/Program Files/Microsoft Visual Studio/2022/Enterprise/Common7/Tools/Launch-VsDevShell.ps1" -Arch amd64 -SkipAutomaticLocation' }} cmake -S c -B c/build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/target -DCMAKE_C_COMPILER=${{ matrix.toolchain.cc }} -DCMAKE_CXX_COMPILER=${{ matrix.toolchain.cxx }} -DBLAKE3_USE_TBB=${{ matrix.use_tbb }} -DBLAKE3_FETCH_TBB=${{ matrix.os == 'windows-latest' && 'YES' || 'NO' }} -DBLAKE3_EXAMPLES=ON cmake --build c/build --target install cmake_3-9_build: name: CMake 3.9.6 ubuntu-latest runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: 
lukka/get-cmake@5f6e04f5267c8133f1273bf2103583fc72c46b17 with: cmakeVersion: 3.9.6 - name: Create build directory run: mkdir c/build - name: CMake generation run: cmake .. -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/target working-directory: c/build - name: CMake build / install run: make install working-directory: c/build miri_smoketest: name: Miri smoketest runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@nightly with: components: miri # Currently the test search "miri" only matches "test_miri_smoketest", but # we might add more. If this accidentally picks up anything incompatible or # slow, we can narrow it. - run: cargo miri test miri tbb_rust_bindings_tests: name: TBB test bindings ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: ["ubuntu-latest", "macOS-latest"] steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - name: install TBB if: matrix.os == 'ubuntu-latest' run: | sudo apt-get update sudo apt-get install libtbb-dev libtbb12 - name: install TBB if: matrix.os == 'macOS-latest' run: | brew install tbb echo "CXXFLAGS=-I$(brew --prefix)/include $CPPFLAGS" >> $GITHUB_ENV echo "RUSTFLAGS=-L$(brew --prefix)/lib $RUSTFLAGS" >> $GITHUB_ENV - name: cargo test C bindings with TBB run: cargo test --features=tbb working-directory: ./c/blake3_c_rust_bindings blake3-1.8.1/.github/workflows/tag.yml000064400000000000000000000025031046102023000156670ustar 00000000000000name: publish_b3sum_binaries on: push: tags: - "*" env: BLAKE3_CI: "1" RUSTFLAGS: "-D warnings" jobs: cargo_tests: name: ${{ matrix.target.name }} runs-on: ${{ matrix.target.os }} strategy: fail-fast: false matrix: target: [ { "os": "ubuntu-latest", "rust-target": "x86_64-unknown-linux-musl", "name": "Linux" }, { "os": "macOS-latest", "rust-target": "x86_64-apple-darwin", "name": "macOS" }, { "os": "windows-latest", "rust-target": "x86_64-pc-windows-msvc", "name": "Windows" }, ] steps: - uses: 
actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: "3.x" - run: pip install PyGithub - run: sudo apt-get install musl-tools if: matrix.target.os == 'ubuntu-latest' - uses: dtolnay/rust-toolchain@stable with: targets: ${{ matrix.target.rust-target }} - name: build b3sum id: build_b3sum run: python -u .github/workflows/build_b3sum.py ${{ matrix.target.rust-target }} - name: upload release asset env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TAG: ${{ github.ref }} run: python -u .github/workflows/upload_github_release_asset.py ${{ steps.build_b3sum.outputs.bin_path }} blake3-1.8.1/.github/workflows/upload_github_release_asset.py000075500000000000000000000044561046102023000225040ustar 00000000000000#! /usr/bin/env python3 import github import os import sys import time RETRIES = 10 g = github.Github(os.environ["GITHUB_TOKEN"]) tag_name = os.environ["GITHUB_TAG"] tag_prefix = "refs/tags/" if tag_name.startswith(tag_prefix): tag_name = tag_name[len(tag_prefix) :] assert len(sys.argv) == 2 asset_path = sys.argv[1] asset_name = os.path.basename(asset_path) repo = g.get_repo(os.environ["GITHUB_REPOSITORY"]) tags = list(repo.get_tags()) for tag in tags: if tag.name == tag_name: break else: raise RuntimeError("no tag named " + repr(tag_name)) try: print("Creating GitHub release for tag " + repr(tag_name) + "...") repo.create_git_release(tag_name, tag_name, tag.commit.commit.message) except github.GithubException as github_error: if github_error.data["errors"][0]["code"] == "already_exists": print("Release for tag " + repr(tag_name) + " already exists.") else: raise def get_release(): for i in range(RETRIES): releases = list(repo.get_releases()) for release in releases: if release.tag_name == tag_name: return release print(f"Release for tag {repr(tag_name)} not found. 
Retrying...") time.sleep(1) raise RuntimeError("no release for tag " + repr(tag_name)) release = get_release() print("Uploading " + repr(asset_path) + "...") for i in range(RETRIES): try: print("Upload attempt #{} of {}...".format(i + 1, RETRIES)) release.upload_asset(asset_path) break except github.GithubException as github_error: # Unfortunately the asset upload API is flaky. Even worse, it often # partially succeeds, returning an error to the caller but leaving the # release in a state where subsequent uploads of the same asset will # fail with an "already_exists" error. (Though the asset is not visible # on github.com, so we can't just declare victory and move on.) If we # detect this case, explicitly delete the asset and continue retrying. print(github_error) for asset in release.get_assets(): if asset.name == asset_name: print("Found uploaded asset after failure. Deleting...") asset.delete_asset() else: raise RuntimeError("All upload attempts failed.") print("Success!") blake3-1.8.1/.gitignore000064400000000000000000000000221046102023000127560ustar 00000000000000Cargo.lock target blake3-1.8.1/CONTRIBUTING.md000064400000000000000000000022201046102023000132210ustar 00000000000000# Contributing We welcome and encourage third-party contributions to BLAKE3, be it reports of issues encountered while using the software or proposals of patches. ## Bug reports Bugs and other problems should be reported on [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues). If you report a bug, please: * Check that it's not already reported in the [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues). * Provide information to help us diagnose and ideally reproduce the bug. ## Patches We encourage you to fix a bug via a [GitHub Pull request](https://github.com/BLAKE3/BLAKE3/pulls), preferably after creating a related issue and referring it in the PR. If you contribute code and submit a patch, please note the following: * We use Rust's stable branch for developing BLAKE3. 
* Pull requests should target the `master` branch. * Try to follow the established Rust [style guidelines](https://doc.rust-lang.org/1.0.0/style/). Also please make sure to create new unit tests covering your code additions. You can execute the tests by running: ```bash cargo test ``` All third-party contributions will be recognized in the list of contributors. blake3-1.8.1/Cargo.lock0000644000000362750000000000100101740ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 4 [[package]] name = "arrayref" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" [[package]] name = "arrayvec" version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" dependencies = [ "zeroize", ] [[package]] name = "bitflags" version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" [[package]] name = "blake3" version = "1.8.1" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "ciborium", "constant_time_eq", "digest", "hex", "hmac", "memmap2", "page_size", "rand", "rand_chacha", "rayon-core", "serde", "serde_json", "tempfile", "zeroize", ] [[package]] name = "block-buffer" version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" dependencies = [ "generic-array", ] [[package]] name = "cc" version = "1.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fcb57c740ae1daf453ae85f16e37396f672b039e00d9d866e07ddb24e328e3a" dependencies = [ "shlex", ] [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "ciborium" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" dependencies = [ "ciborium-io", "ciborium-ll", "serde", ] [[package]] name = "ciborium-io" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" [[package]] name = "ciborium-ll" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" dependencies = [ "ciborium-io", "half", ] [[package]] name = "constant_time_eq" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" [[package]] name = "crossbeam-deque" version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" dependencies = [ "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-utils" version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" [[package]] name = "crypto-common" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" dependencies = [ "generic-array", "typenum", ] [[package]] name = "digest" version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", "subtle", ] [[package]] name = "errno" version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", "windows-sys", ] [[package]] name = "fastrand" version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "generic-array" version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", ] [[package]] name = "getrandom" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" dependencies = [ "cfg-if", "libc", "r-efi", "wasi", ] [[package]] name = "half" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7db2ff139bba50379da6aa0766b52fdcb62cb5b263009b09ed58ba604e14bbd1" dependencies = [ "cfg-if", "crunchy", ] [[package]] name = "hex" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "hmac" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" dependencies = [ "digest", ] [[package]] name = "itoa" version = "1.0.15" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "libc" version = "0.2.171" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" [[package]] name = "linux-raw-sys" version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe7db12097d22ec582439daf8618b8fdd1a7bef6270e9af3b1ebcd30893cf413" [[package]] name = "memchr" version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "memmap2" version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" dependencies = [ "libc", ] [[package]] name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "page_size" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da" dependencies = [ "libc", "winapi", ] [[package]] name = "ppv-lite86" version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ "zerocopy", ] [[package]] name = "proc-macro2" version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" 
dependencies = [ "proc-macro2", ] [[package]] name = "r-efi" version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" [[package]] name = "rand" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" dependencies = [ "rand_chacha", "rand_core", "zerocopy", ] [[package]] name = "rand_chacha" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", "rand_core", ] [[package]] name = "rand_core" version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ "getrandom", ] [[package]] name = "rayon-core" version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", "crossbeam-utils", ] [[package]] name = "rustix" version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d97817398dd4bb2e6da002002db259209759911da105da92bec29ccb12cf58bf" dependencies = [ "bitflags", "errno", "libc", "linux-raw-sys", "windows-sys", ] [[package]] name = "ryu" version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" [[package]] name = "serde" version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "serde_json" version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ "itoa", "memchr", "ryu", "serde", ] [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "subtle" version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "tempfile" version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7437ac7763b9b123ccf33c338a5cc1bac6f69b45a136c19bdd8a65e3916435bf" dependencies = [ "fastrand", "getrandom", "once_cell", "rustix", "windows-sys", ] [[package]] name = "typenum" version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "version_check" version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "wasi" version = "0.14.2+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" dependencies = [ "wit-bindgen-rt", ] [[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_gnullvm", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "wit-bindgen-rt" version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ "bitflags", ] [[package]] name = "zerocopy" version = "0.8.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" version = "0.8.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "zeroize" version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" 
blake3-1.8.1/Cargo.toml0000644000000047250000000000100102120ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" name = "blake3" version = "1.8.1" authors = [ "Jack O'Connor ", "Samuel Neves", ] build = "build.rs" autolib = false autobins = false autoexamples = false autotests = false autobenches = false description = "the BLAKE3 hash function" documentation = "https://docs.rs/blake3" readme = "README.md" license = "CC0-1.0 OR Apache-2.0 OR Apache-2.0 WITH LLVM-exception" repository = "https://github.com/BLAKE3-team/BLAKE3" [package.metadata.docs.rs] features = [ "mmap", "rayon", "serde", "zeroize", ] [features] default = ["std"] digest = ["dep:digest"] mmap = [ "std", "dep:memmap2", ] neon = [] no_avx2 = [] no_avx512 = [] no_neon = [] no_sse2 = [] no_sse41 = [] prefer_intrinsics = [] pure = [] rayon = [ "dep:rayon-core", "std", ] std = [] traits-preview = ["dep:digest"] wasm32_simd = [] zeroize = [ "dep:zeroize", "arrayvec/zeroize", ] [lib] name = "blake3" path = "src/lib.rs" [[bench]] name = "bench" path = "benches/bench.rs" [dependencies.arrayref] version = "0.3.5" [dependencies.arrayvec] version = "0.7.4" default-features = false [dependencies.cfg-if] version = "1.0.0" [dependencies.constant_time_eq] version = "0.3.1" default-features = false [dependencies.digest] version = "0.10.1" features = ["mac"] optional = true [dependencies.memmap2] version = "0.9" optional = true [dependencies.rayon-core] version = "1.12.1" optional = true [dependencies.serde] version = "1.0" features = ["derive"] optional = true default-features = 
false [dependencies.zeroize] version = "1" optional = true default-features = false [dev-dependencies.ciborium] version = "0.2.2" [dev-dependencies.hex] version = "0.4.2" [dev-dependencies.hmac] version = "0.12.0" [dev-dependencies.page_size] version = "0.6.0" [dev-dependencies.rand] version = "0.9.0" [dev-dependencies.rand_chacha] version = "0.9.0" [dev-dependencies.serde_json] version = "1.0.107" [dev-dependencies.tempfile] version = "3.8.0" [build-dependencies.cc] version = "1.1.12" blake3-1.8.1/Cargo.toml.orig000064400000000000000000000140201046102023000136600ustar 00000000000000[package] name = "blake3" version = "1.8.1" authors = ["Jack O'Connor ", "Samuel Neves"] description = "the BLAKE3 hash function" repository = "https://github.com/BLAKE3-team/BLAKE3" license = "CC0-1.0 OR Apache-2.0 OR Apache-2.0 WITH LLVM-exception" documentation = "https://docs.rs/blake3" readme = "README.md" edition = "2021" [features] default = ["std"] # The NEON implementation does not participate in dynamic feature detection, # which is currently x86-only. If "neon" is on, NEON support is assumed. Note # that AArch64 always supports NEON, but support on ARMv7 varies. The NEON # implementation uses C intrinsics and requires a C compiler. neon = [] # The Wasm SIMD implementation does not participate in dynamic feature detection, # which is currently x86-only. If "wasm_simd" is on, Wasm SIMD support is assumed. # Note that not all Wasm implementations support the Wasm SIMD specification. # This may become the default in the future. wasm32_simd = [] # This crate uses libstd for std::io trait implementations, and also for # runtime CPU feature detection. This feature is enabled by default. If you use # --no-default-features, the only way to use the SIMD implementations in this # crate is to enable the corresponding instruction sets statically for the # entire build, with e.g. RUSTFLAGS="-C target-cpu=native". 
std = [] # The `rayon` feature (disabled by default, but enabled for docs.rs) adds the # `update_rayon` and (in combination with `mmap` below) `update_mmap_rayon` # methods, for multithreaded hashing. However, even if this feature is enabled, # all other APIs remain single-threaded. # # Implementation detail: We take a dependency on rayon-core instead of rayon, # because it builds faster and still includes all the APIs we need. rayon = ["dep:rayon-core", "std"] # The `mmap` feature (disabled by default, but enabled for docs.rs) adds the # `update_mmap` and (in combination with `rayon` above) `update_mmap_rayon` # helper methods for memory-mapped IO. mmap = ["std", "dep:memmap2"] # Implement the zeroize::Zeroize trait for types in this crate. zeroize = ["dep:zeroize", "arrayvec/zeroize"] # This crate implements traits from the RustCrypto project, exposed here as the # "traits-preview" feature. However, these traits aren't stable, and they're # expected to change in incompatible ways before they reach 1.0. For that # reason, this crate makes no SemVer guarantees for this feature, and callers # who use it should expect breaking changes between patch versions of this # crate. (The "*-preview" feature name follows the conventions of the RustCrypto # "signature" crate.) traits-preview = ["dep:digest"] # ---------- Features below this line are undocumented and unstable. ---------- # The following features are mainly intended for testing and benchmarking, and # they might change or disappear at any time without a major version bump. # It wasn't originally intended to expose "digest" as its own feature, but the # traits-preview feature above predated the "dep:" syntax in Cargo. Version # 1.5.2 of this crate started using "dep:" syntax, but that broke some callers # in the wild (https://solana.stackexchange.com/q/17787/29050). This feature # unbreaks those callers. When Cargo gains the ability to deprecate features, # this feature will be deprecated. 
Note that the relevant trait implementations # are still gated by "traits-preview". digest = ["dep:digest"] # By default on x86_64, this crate uses Samuel Neves' hand-written assembly # implementations for SSE4.1, AVX2, and AVX512. (These provide both the best # runtime performance, and the fastest build times.) And by default on 32-bit # x86, this crate uses Rust intrinsics implementations for SSE4.1 and AVX2, and # a C intrinsics implementation for AVX-512. In both cases, if a C compiler is # not detected, or if AVX-512 support is missing from the detected compiler, # build.rs automatically falls back to a pure Rust build. This feature forces # that fallback, for testing purposes. (Note that in CI testing, we set the # BLAKE3_CI environment variable, which instructs build.rs to error out rather # than doing an automatic fallback.) pure = [] # As described above, on x86_64 this crate use assembly implementations by # default. Enabling the "prefer_intrinsics" feature makes this crate use # intrinsics implementations on both 32-bit and 64-bit x86, again for testing # purposes. prefer_intrinsics = [] # Disable individual instruction sets. CI testing uses these flags to simulate # different levels of hardware SIMD support. Note that code for the # corresponding instruction set is still compiled; only detection is disabled. # # As noted above, these flags are *for testing only* and are not stable. It's # possible that some users might find that their particular use case performs # better if e.g. AVX-512 is disabled, because of issues like CPU downclocking. # If that comes up, and if disabling the instruction set here at the feature # level turns out to be the right approach, then we can design a stable # feature. Until then, we reserve the right to break these features in a patch # release. 
no_sse2 = [] no_sse41 = [] no_avx2 = [] no_avx512 = [] no_neon = [] [package.metadata.docs.rs] # Document the rayon/mmap methods and the Serialize/Deserialize/Zeroize impls on docs.rs. features = ["mmap", "rayon", "serde", "zeroize"] [dependencies] arrayref = "0.3.5" arrayvec = { version = "0.7.4", default-features = false } constant_time_eq = { version = "0.3.1", default-features = false } cfg-if = "1.0.0" digest = { version = "0.10.1", features = [ "mac" ], optional = true } memmap2 = { version = "0.9", optional = true } rayon-core = { version = "1.12.1", optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } zeroize = { version = "1", default-features = false, optional = true } [dev-dependencies] hmac = "0.12.0" hex = "0.4.2" page_size = "0.6.0" rand = "0.9.0" rand_chacha = "0.9.0" reference_impl = { path = "./reference_impl" } tempfile = "3.8.0" serde_json = "1.0.107" ciborium = "0.2.2" [build-dependencies] cc = "1.1.12" blake3-1.8.1/LICENSE_A2000064400000000000000000000261411046102023000123270ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright 2019 Jack O'Connor and Samuel Neves Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. blake3-1.8.1/LICENSE_A2LLVM000064400000000000000000000277531046102023000130340ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. 
Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright 2019 Jack O'Connor and Samuel Neves Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ---- LLVM Exceptions to the Apache 2.0 License ---- As an exception, if, as a result of your compiling your source code, portions of this Software are embedded into an Object form of such source code, you may redistribute such embedded portions in such Object form without complying with the conditions of Sections 4(a), 4(b) and 4(d) of the License. In addition, if you combine or link compiled forms of this Software with software that is licensed under the GPLv2 ("Combined Software") and if a court of competent jurisdiction determines that the patent provision (Section 3), the indemnity provision (Section 9) or other Section of the License conflicts with the conditions of the GPLv2, you may retroactively and prospectively choose to deem waived or otherwise exclude such Section(s) of the License, but only in their entirety and only with respect to the Combined Software. blake3-1.8.1/LICENSE_CC0000064400000000000000000000156101046102023000124310ustar 00000000000000Creative Commons Legal Code CC0 1.0 Universal CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. 
CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER. Statement of Purpose The laws of most jurisdictions throughout the world automatically confer exclusive Copyright and Related Rights (defined below) upon the creator and subsequent owner(s) (each and all, an "owner") of an original work of authorship and/or a database (each, a "Work"). Certain owners wish to permanently relinquish those rights to a Work for the purpose of contributing to a commons of creative, cultural and scientific works ("Commons") that the public can reliably and without fear of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free culture and the further production of creative, cultural and scientific works, or to gain reputation or greater distribution for their Work in part through the use and efforts of others. For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person associating CC0 with a Work (the "Affirmer"), to the extent that he or she is an owner of Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and publicly distribute the Work under its terms, with knowledge of his or her Copyright and Related Rights in the Work and the meaning and intended legal effect of CC0 on those rights. 1. Copyright and Related Rights. A Work made available under CC0 may be protected by copyright and related or neighboring rights ("Copyright and Related Rights"). Copyright and Related Rights include, but are not limited to, the following: i. 
the right to reproduce, adapt, distribute, perform, display, communicate, and translate a Work; ii. moral rights retained by the original author(s) and/or performer(s); iii. publicity and privacy rights pertaining to a person's image or likeness depicted in a Work; iv. rights protecting against unfair competition in regards to a Work, subject to the limitations in paragraph 4(a), below; v. rights protecting the extraction, dissemination, use and reuse of data in a Work; vi. database rights (such as those arising under Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, and under any national implementation thereof, including any amended or successor version of such directive); and vii. other similar, equivalent or corresponding rights throughout the world based on applicable law or treaty, and any national implementations thereof. 2. Waiver. To the greatest extent permitted by, but not in contravention of, applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and unconditionally waives, abandons, and surrenders all of Affirmer's Copyright and Related Rights and associated claims and causes of action, whether now known or unknown (including existing as well as future claims and causes of action), in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each member of the public at large and to the detriment of Affirmer's heirs and successors, fully intending that such Waiver shall not be subject to revocation, rescission, cancellation, termination, or any other legal or equitable action to disrupt the quiet enjoyment of the Work by the public as contemplated by Affirmer's express Statement of Purpose. 3. Public License Fallback. Should any part of the Waiver for any reason be judged legally invalid or ineffective under applicable law, then the Waiver shall be preserved to the maximum extent permitted taking into account Affirmer's express Statement of Purpose. In addition, to the extent the Waiver is so judged Affirmer hereby grants to each affected person a royalty-free, non transferable, non sublicensable, non exclusive, irrevocable and unconditional license to exercise Affirmer's Copyright and Related Rights in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "License"). The License shall be deemed effective as of the date CC0 was applied by Affirmer to the Work. Should any part of the License for any reason be judged legally invalid or ineffective under applicable law, such partial invalidity or ineffectiveness shall not invalidate the remainder of the License, and in such case Affirmer hereby affirms that he or she will not (i) exercise any of his or her remaining Copyright and Related Rights in the Work or (ii) assert any associated claims and causes of action with respect to the Work, in either case contrary to Affirmer's express Statement of Purpose. 4. Limitations and Disclaimers. a. 
No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected by this document. b. Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work, express, implied, statutory or otherwise, including without limitation warranties of title, merchantability, fitness for a particular purpose, non infringement, or the absence of latent or other defects, accuracy, or the present or absence of errors, whether or not discoverable, all to the greatest extent permissible under applicable law. c. Affirmer disclaims responsibility for clearing rights of other persons that may apply to the Work or any use thereof, including without limitation any person's Copyright and Related Rights in the Work. Further, Affirmer disclaims responsibility for obtaining any necessary consents, permissions or other rights required for any use of the Work. d. Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work. blake3-1.8.1/README.md000064400000000000000000000220461046102023000122570ustar 00000000000000# BLAKE3 BLAKE3 is a cryptographic hash function that is: - **Much faster** than MD5, SHA-1, SHA-2, SHA-3, and BLAKE2. - **Secure**, unlike MD5 and SHA-1. And secure against length extension, unlike SHA-2. - **Highly parallelizable** across any number of threads and SIMD lanes, because it's a Merkle tree on the inside. - Capable of **verified streaming** and **incremental updates**, again because it's a Merkle tree. - A **PRF**, **MAC**, **KDF**, and **XOF**, as well as a regular hash. - **One algorithm with no variants**, which is fast on x86-64 and also on smaller architectures. The [chart below](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/benchmarks/bar_chart.py) is an example benchmark of 16 KiB inputs on a Cascade Lake-SP 8275CL server CPU from 2019. 
For more detailed benchmarks, see the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).

performance graph

BLAKE3 is based on an optimized instance of the established hash function [BLAKE2](https://blake2.net) and on the [original Bao tree mode](https://github.com/oconnor663/bao/blob/master/docs/spec_0.9.1.md). The specifications and design rationale are available in the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). The default output size is 256 bits. The current version of [Bao](https://github.com/oconnor663/bao) implements verified streaming with BLAKE3. This repository is the official implementation of BLAKE3. It includes: * The [`blake3`](https://crates.io/crates/blake3) Rust crate, which includes optimized implementations for SSE2, SSE4.1, AVX2, AVX-512, NEON, and WASM, with automatic runtime CPU feature detection on x86. The `rayon` feature provides multithreading. * The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which provides a command line interface. It uses multithreading by default, making it an order of magnitude faster than e.g. `sha256sum` on typical desktop hardware. * The [C implementation](c), which like the Rust implementation includes SIMD optimizations (all except WASM), CPU feature detection on x86, and optional multithreading. See [`c/README.md`](c/README.md). * The [Rust reference implementation](reference_impl/reference_impl.rs), which is discussed in Section 5.1 of the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). This implementation is much smaller and simpler than the optimized ones above. If you want to see how BLAKE3 works, or you're writing a port that doesn't need multithreading or SIMD optimizations, start here. Ports of the reference implementation to other languages are hosted in separate repositories ([C](https://github.com/oconnor663/blake3_reference_impl_c), [Python](https://github.com/oconnor663/pure_python_blake3)). 
* A [set of test vectors](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json) that covers extended outputs, all three modes, and a variety of input lengths. * [![Actions Status](https://github.com/BLAKE3-team/BLAKE3/workflows/tests/badge.svg)](https://github.com/BLAKE3-team/BLAKE3/actions) BLAKE3 was designed by: * [@oconnor663] (Jack O'Connor) * [@sneves] (Samuel Neves) * [@veorq] (Jean-Philippe Aumasson) * [@zookozcash] (Zooko) The development of BLAKE3 was sponsored by [Electric Coin Company](https://electriccoin.co). BLAKE3 is also [specified](https://c2sp.org/BLAKE3) in the [Community Cryptography Specification Project (C2SP)](https://c2sp.org). *NOTE: BLAKE3 is not a password hashing algorithm, because it's designed to be fast, whereas password hashing should not be fast. If you hash passwords to store the hashes or if you derive keys from passwords, we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).* ## Usage ### The `b3sum` utility The `b3sum` command line utility prints the BLAKE3 hashes of files or of standard input. Prebuilt binaries are available for Linux, Windows, and macOS (requiring the [unidentified developer workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac)) on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases). If you've [installed Rust and Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html), you can also build `b3sum` yourself with: ```bash cargo install b3sum ``` If `rustup` didn't configure your `PATH` for you, you might need to go looking for the installed binary in e.g. `~/.cargo/bin`. You can test out how fast BLAKE3 is on your machine by creating a big file and hashing it, for example: ```bash # Create a 1 GB file. head -c 1000000000 /dev/zero > /tmp/bigfile # Hash it with SHA-256. time openssl sha256 /tmp/bigfile # Hash it with BLAKE3. 
time b3sum /tmp/bigfile ``` ### The `blake3` crate [![docs.rs](https://docs.rs/blake3/badge.svg)](https://docs.rs/blake3) To use BLAKE3 from Rust code, add a dependency on the `blake3` crate to your `Cargo.toml`. Here's an example of hashing some input bytes: ```rust // Hash an input all at once. let hash1 = blake3::hash(b"foobarbaz"); // Hash an input incrementally. let mut hasher = blake3::Hasher::new(); hasher.update(b"foo"); hasher.update(b"bar"); hasher.update(b"baz"); let hash2 = hasher.finalize(); assert_eq!(hash1, hash2); // Extended output. OutputReader also implements Read and Seek. let mut output = [0; 1000]; let mut output_reader = hasher.finalize_xof(); output_reader.fill(&mut output); assert_eq!(hash1, output[..32]); // Print a hash as hex. println!("{}", hash1); ``` Besides `hash`, BLAKE3 provides two other modes, `keyed_hash` and `derive_key`. The `keyed_hash` mode takes a 256-bit key: ```rust // MAC an input all at once. let example_key = [42u8; 32]; let mac1 = blake3::keyed_hash(&example_key, b"example input"); // MAC incrementally. let mut hasher = blake3::Hasher::new_keyed(&example_key); hasher.update(b"example input"); let mac2 = hasher.finalize(); assert_eq!(mac1, mac2); ``` The `derive_key` mode takes a context string and some key material (not a password). The context string should be hardcoded, globally unique, and application-specific. A good default format for the context string is `"[application] [commit timestamp] [purpose]"`: ```rust // Derive a couple of subkeys for different purposes. 
const EMAIL_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:10:44 email key"; const API_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:11:21 API key"; let input_key_material = b"usually at least 32 random bytes, not a password"; let email_key = blake3::derive_key(EMAIL_CONTEXT, input_key_material); let api_key = blake3::derive_key(API_CONTEXT, input_key_material); assert_ne!(email_key, api_key); ``` ### The C implementation See [`c/README.md`](c/README.md). ### Other implementations We post links to third-party bindings and implementations on the [@BLAKE3team Twitter account](https://twitter.com/BLAKE3team) whenever we hear about them. Some highlights include [an optimized Go implementation](https://github.com/zeebo/blake3), [Wasm bindings for Node.js and browsers](https://github.com/connor4312/blake3), [binary wheels for Python](https://github.com/oconnor663/blake3-py), [.NET bindings](https://github.com/xoofx/Blake3.NET), and [JNI bindings](https://github.com/sken77/BLAKE3jni). ## Contributing Please see [CONTRIBUTING.md](CONTRIBUTING.md). ## Licenses This work is released into the public domain with [CC0 1.0](./LICENSE_CC0). 
Alternatively, it is licensed under any of the following: * [Apache 2.0](./LICENSE_A2) * [Apache 2.0 with LLVM exceptions](./LICENSE_A2LLVM) ## Adoption & deployment Here's a (non-exhaustive) list of protocols and software that use BLAKE3: * [Alephium](https://github.com/alephium/alephium/blob/master/crypto/src/main/scala/org/alephium/crypto/Blake3.scala) * [Bazel](https://github.com/bazelbuild/bazel/releases/tag/6.4.0) * [Chia](https://github.com/Chia-Network/chia-blockchain/blob/main/CHANGELOG.md#10beta8-aka-beta-18---2020-07-16) * [IPFS](https://github.com/ipfs/go-verifcid/issues/13) * [Farcaster](https://www.farcaster.xyz/) * [LLVM](https://reviews.llvm.org/D121510) * [Nix](https://github.com/NixOS/nix/pull/12379) * [Nym](https://github.com/nymtech/nym/blob/59056a22c5e6b01a38da2124662bd1fa3c8abef2/common/nymsphinx/params/src/lib.rs#L5) * [OpenZFS](https://github.com/openzfs/zfs/) * [Redox](https://www.redox-os.org/news/pkgar-introduction/) * [Saito](https://saito.tech/) * [Skale](https://github.com/skalenetwork/skale-consensus/pull/284) * [Solana](https://docs.rs/solana-program/1.9.5/solana_program/blake3/index.html) * [Tekken 8](https://en.bandainamcoent.eu/tekken/tekken-8) * [Wasmer](https://github.com/wasmerio/wasmer/blob/4f935a8c162bf604df223003e434e4f7ca253688/lib/cache/src/hash.rs#L21) ## Miscellany - [@veorq] and [@oconnor663] did [an interview with Cryptography FM](https://www.cryptography.fm/3). - [@oconnor663] did [an interview with Saito](https://www.youtube.com/watch?v=cJkmIt7yN_E). 
[@oconnor663]: https://github.com/oconnor663 [@sneves]: https://github.com/sneves [@veorq]: https://github.com/veorq [@zookozcash]: https://github.com/zookozcash blake3-1.8.1/benches/bench.rs000064400000000000000000000333501046102023000140340ustar 00000000000000#![feature(test)] extern crate test; use arrayref::array_ref; use arrayvec::ArrayVec; use blake3::platform::{Platform, MAX_SIMD_DEGREE}; use blake3::OUT_LEN; use blake3::{BLOCK_LEN, CHUNK_LEN}; use rand::prelude::*; use test::Bencher; const KIB: usize = 1024; // This struct randomizes two things: // 1. The actual bytes of input. // 2. The page offset the input starts at. pub struct RandomInput { buf: Vec, len: usize, offsets: Vec, offset_index: usize, } impl RandomInput { pub fn new(b: &mut Bencher, len: usize) -> Self { b.bytes += len as u64; let page_size: usize = page_size::get(); let mut buf = vec![0u8; len + page_size]; let mut rng = rand::rng(); rng.fill_bytes(&mut buf); let mut offsets: Vec = (0..page_size).collect(); offsets.shuffle(&mut rng); Self { buf, len, offsets, offset_index: 0, } } pub fn get(&mut self) -> &[u8] { let offset = self.offsets[self.offset_index]; self.offset_index += 1; if self.offset_index >= self.offsets.len() { self.offset_index = 0; } &self.buf[offset..][..self.len] } } fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) { let mut state = [1u32; 8]; let mut r = RandomInput::new(b, 64); let input = array_ref!(r.get(), 0, 64); b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0)); } #[bench] fn bench_single_compression_portable(b: &mut Bencher) { bench_single_compression_fn(b, Platform::portable()); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_single_compression_sse2(b: &mut Bencher) { if let Some(platform) = Platform::sse2() { bench_single_compression_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_single_compression_sse41(b: &mut Bencher) { if let Some(platform) 
= Platform::sse41() { bench_single_compression_fn(b, platform); } } #[bench] #[cfg(blake3_avx512_ffi)] fn bench_single_compression_avx512(b: &mut Bencher) { if let Some(platform) = Platform::avx512() { bench_single_compression_fn(b, platform); } } fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) { let degree = platform.simd_degree(); let mut inputs = Vec::new(); for _ in 0..degree { inputs.push(RandomInput::new(b, CHUNK_LEN)); } b.iter(|| { let input_arrays: ArrayVec<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE> = inputs .iter_mut() .take(degree) .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) .collect(); let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; platform.hash_many( &input_arrays[..], &[0; 8], 0, blake3::IncrementCounter::Yes, 0, 0, 0, &mut out, ); }); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_chunks_sse2(b: &mut Bencher) { if let Some(platform) = Platform::sse2() { bench_many_chunks_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_chunks_sse41(b: &mut Bencher) { if let Some(platform) = Platform::sse41() { bench_many_chunks_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_chunks_avx2(b: &mut Bencher) { if let Some(platform) = Platform::avx2() { bench_many_chunks_fn(b, platform); } } #[bench] #[cfg(blake3_avx512_ffi)] fn bench_many_chunks_avx512(b: &mut Bencher) { if let Some(platform) = Platform::avx512() { bench_many_chunks_fn(b, platform); } } #[bench] #[cfg(blake3_neon)] fn bench_many_chunks_neon(b: &mut Bencher) { bench_many_chunks_fn(b, Platform::neon().unwrap()); } #[bench] #[cfg(blake3_wasm32_simd)] fn bench_many_chunks_wasm(b: &mut Bencher) { bench_many_chunks_fn(b, Platform::wasm32_simd().unwrap()); } // TODO: When we get const generics we can unify this with the chunks code. 
fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) { let degree = platform.simd_degree(); let mut inputs = Vec::new(); for _ in 0..degree { inputs.push(RandomInput::new(b, BLOCK_LEN)); } b.iter(|| { let input_arrays: ArrayVec<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE> = inputs .iter_mut() .take(degree) .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) .collect(); let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; platform.hash_many( &input_arrays[..], &[0; 8], 0, blake3::IncrementCounter::No, 0, 0, 0, &mut out, ); }); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_sse2(b: &mut Bencher) { if let Some(platform) = Platform::sse2() { bench_many_parents_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_sse41(b: &mut Bencher) { if let Some(platform) = Platform::sse41() { bench_many_parents_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_avx2(b: &mut Bencher) { if let Some(platform) = Platform::avx2() { bench_many_parents_fn(b, platform); } } #[bench] #[cfg(blake3_avx512_ffi)] fn bench_many_parents_avx512(b: &mut Bencher) { if let Some(platform) = Platform::avx512() { bench_many_parents_fn(b, platform); } } #[bench] #[cfg(blake3_neon)] fn bench_many_parents_neon(b: &mut Bencher) { bench_many_parents_fn(b, Platform::neon().unwrap()); } #[bench] #[cfg(blake3_wasm32_simd)] fn bench_many_parents_wasm(b: &mut Bencher) { bench_many_parents_fn(b, Platform::wasm32_simd().unwrap()); } fn bench_atonce(b: &mut Bencher, len: usize) { let mut input = RandomInput::new(b, len); b.iter(|| blake3::hash(input.get())); } #[bench] fn bench_atonce_0001_block(b: &mut Bencher) { bench_atonce(b, BLOCK_LEN); } #[bench] fn bench_atonce_0001_kib(b: &mut Bencher) { bench_atonce(b, 1 * KIB); } #[bench] fn bench_atonce_0002_kib(b: &mut Bencher) { bench_atonce(b, 2 * KIB); } #[bench] fn bench_atonce_0004_kib(b: &mut Bencher) { bench_atonce(b, 4 * 
KIB); } #[bench] fn bench_atonce_0008_kib(b: &mut Bencher) { bench_atonce(b, 8 * KIB); } #[bench] fn bench_atonce_0016_kib(b: &mut Bencher) { bench_atonce(b, 16 * KIB); } #[bench] fn bench_atonce_0032_kib(b: &mut Bencher) { bench_atonce(b, 32 * KIB); } #[bench] fn bench_atonce_0064_kib(b: &mut Bencher) { bench_atonce(b, 64 * KIB); } #[bench] fn bench_atonce_0128_kib(b: &mut Bencher) { bench_atonce(b, 128 * KIB); } #[bench] fn bench_atonce_0256_kib(b: &mut Bencher) { bench_atonce(b, 256 * KIB); } #[bench] fn bench_atonce_0512_kib(b: &mut Bencher) { bench_atonce(b, 512 * KIB); } #[bench] fn bench_atonce_1024_kib(b: &mut Bencher) { bench_atonce(b, 1024 * KIB); } fn bench_incremental(b: &mut Bencher, len: usize) { let mut input = RandomInput::new(b, len); b.iter(|| blake3::Hasher::new().update(input.get()).finalize()); } #[bench] fn bench_incremental_0001_block(b: &mut Bencher) { bench_incremental(b, BLOCK_LEN); } #[bench] fn bench_incremental_0001_kib(b: &mut Bencher) { bench_incremental(b, 1 * KIB); } #[bench] fn bench_incremental_0002_kib(b: &mut Bencher) { bench_incremental(b, 2 * KIB); } #[bench] fn bench_incremental_0004_kib(b: &mut Bencher) { bench_incremental(b, 4 * KIB); } #[bench] fn bench_incremental_0008_kib(b: &mut Bencher) { bench_incremental(b, 8 * KIB); } #[bench] fn bench_incremental_0016_kib(b: &mut Bencher) { bench_incremental(b, 16 * KIB); } #[bench] fn bench_incremental_0032_kib(b: &mut Bencher) { bench_incremental(b, 32 * KIB); } #[bench] fn bench_incremental_0064_kib(b: &mut Bencher) { bench_incremental(b, 64 * KIB); } #[bench] fn bench_incremental_0128_kib(b: &mut Bencher) { bench_incremental(b, 128 * KIB); } #[bench] fn bench_incremental_0256_kib(b: &mut Bencher) { bench_incremental(b, 256 * KIB); } #[bench] fn bench_incremental_0512_kib(b: &mut Bencher) { bench_incremental(b, 512 * KIB); } #[bench] fn bench_incremental_1024_kib(b: &mut Bencher) { bench_incremental(b, 1024 * KIB); } fn bench_reference(b: &mut Bencher, len: usize) { let mut 
input = RandomInput::new(b, len); b.iter(|| { let mut hasher = reference_impl::Hasher::new(); hasher.update(input.get()); let mut out = [0; 32]; hasher.finalize(&mut out); out }); } #[bench] fn bench_reference_0001_block(b: &mut Bencher) { bench_reference(b, BLOCK_LEN); } #[bench] fn bench_reference_0001_kib(b: &mut Bencher) { bench_reference(b, 1 * KIB); } #[bench] fn bench_reference_0002_kib(b: &mut Bencher) { bench_reference(b, 2 * KIB); } #[bench] fn bench_reference_0004_kib(b: &mut Bencher) { bench_reference(b, 4 * KIB); } #[bench] fn bench_reference_0008_kib(b: &mut Bencher) { bench_reference(b, 8 * KIB); } #[bench] fn bench_reference_0016_kib(b: &mut Bencher) { bench_reference(b, 16 * KIB); } #[bench] fn bench_reference_0032_kib(b: &mut Bencher) { bench_reference(b, 32 * KIB); } #[bench] fn bench_reference_0064_kib(b: &mut Bencher) { bench_reference(b, 64 * KIB); } #[bench] fn bench_reference_0128_kib(b: &mut Bencher) { bench_reference(b, 128 * KIB); } #[bench] fn bench_reference_0256_kib(b: &mut Bencher) { bench_reference(b, 256 * KIB); } #[bench] fn bench_reference_0512_kib(b: &mut Bencher) { bench_reference(b, 512 * KIB); } #[bench] fn bench_reference_1024_kib(b: &mut Bencher) { bench_reference(b, 1024 * KIB); } #[cfg(feature = "rayon")] fn bench_rayon(b: &mut Bencher, len: usize) { let mut input = RandomInput::new(b, len); b.iter(|| blake3::Hasher::new().update_rayon(input.get()).finalize()); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0001_block(b: &mut Bencher) { bench_rayon(b, BLOCK_LEN); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0001_kib(b: &mut Bencher) { bench_rayon(b, 1 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0002_kib(b: &mut Bencher) { bench_rayon(b, 2 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0004_kib(b: &mut Bencher) { bench_rayon(b, 4 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0008_kib(b: &mut Bencher) { bench_rayon(b, 8 * KIB); } #[bench] #[cfg(feature = "rayon")] fn 
bench_rayon_0016_kib(b: &mut Bencher) { bench_rayon(b, 16 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0032_kib(b: &mut Bencher) { bench_rayon(b, 32 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0064_kib(b: &mut Bencher) { bench_rayon(b, 64 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0128_kib(b: &mut Bencher) { bench_rayon(b, 128 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0256_kib(b: &mut Bencher) { bench_rayon(b, 256 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0512_kib(b: &mut Bencher) { bench_rayon(b, 512 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_1024_kib(b: &mut Bencher) { bench_rayon(b, 1024 * KIB); } // This checks that update() splits up its input in increasing powers of 2, so // that it can recover a high degree of parallelism when the number of bytes // hashed so far is uneven. The performance of this benchmark should be // reasonably close to bench_incremental_0064_kib, within 80% or so. When we // had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69), // performance was less than half. 
#[bench] fn bench_two_updates(b: &mut Bencher) { let len = 65536; let mut input = RandomInput::new(b, len); b.iter(|| { let mut hasher = blake3::Hasher::new(); let input = input.get(); hasher.update(&input[..1]); hasher.update(&input[1..]); hasher.finalize() }); } fn bench_xof(b: &mut Bencher, len: usize) { b.bytes = len as u64; let mut output = [0u8; 64 * BLOCK_LEN]; let output_slice = &mut output[..len]; let mut xof = blake3::Hasher::new().finalize_xof(); b.iter(|| xof.fill(output_slice)); } #[bench] fn bench_xof_01_block(b: &mut Bencher) { bench_xof(b, 1 * BLOCK_LEN); } #[bench] fn bench_xof_02_blocks(b: &mut Bencher) { bench_xof(b, 2 * BLOCK_LEN); } #[bench] fn bench_xof_03_blocks(b: &mut Bencher) { bench_xof(b, 3 * BLOCK_LEN); } #[bench] fn bench_xof_04_blocks(b: &mut Bencher) { bench_xof(b, 4 * BLOCK_LEN); } #[bench] fn bench_xof_05_blocks(b: &mut Bencher) { bench_xof(b, 5 * BLOCK_LEN); } #[bench] fn bench_xof_06_blocks(b: &mut Bencher) { bench_xof(b, 6 * BLOCK_LEN); } #[bench] fn bench_xof_07_blocks(b: &mut Bencher) { bench_xof(b, 7 * BLOCK_LEN); } #[bench] fn bench_xof_08_blocks(b: &mut Bencher) { bench_xof(b, 8 * BLOCK_LEN); } #[bench] fn bench_xof_09_blocks(b: &mut Bencher) { bench_xof(b, 9 * BLOCK_LEN); } #[bench] fn bench_xof_10_blocks(b: &mut Bencher) { bench_xof(b, 10 * BLOCK_LEN); } #[bench] fn bench_xof_11_blocks(b: &mut Bencher) { bench_xof(b, 11 * BLOCK_LEN); } #[bench] fn bench_xof_12_blocks(b: &mut Bencher) { bench_xof(b, 12 * BLOCK_LEN); } #[bench] fn bench_xof_13_blocks(b: &mut Bencher) { bench_xof(b, 13 * BLOCK_LEN); } #[bench] fn bench_xof_14_blocks(b: &mut Bencher) { bench_xof(b, 14 * BLOCK_LEN); } #[bench] fn bench_xof_15_blocks(b: &mut Bencher) { bench_xof(b, 15 * BLOCK_LEN); } #[bench] fn bench_xof_16_blocks(b: &mut Bencher) { bench_xof(b, 16 * BLOCK_LEN); } #[bench] fn bench_xof_32_blocks(b: &mut Bencher) { bench_xof(b, 32 * BLOCK_LEN); } #[bench] fn bench_xof_64_blocks(b: &mut Bencher) { bench_xof(b, 64 * BLOCK_LEN); } 
blake3-1.8.1/build.rs000064400000000000000000000316541046102023000124520ustar 00000000000000use std::env; fn defined(var: &str) -> bool { println!("cargo:rerun-if-env-changed={}", var); env::var_os(var).is_some() } fn is_pure() -> bool { defined("CARGO_FEATURE_PURE") } fn should_prefer_intrinsics() -> bool { defined("CARGO_FEATURE_PREFER_INTRINSICS") } fn is_neon() -> bool { defined("CARGO_FEATURE_NEON") } fn is_no_neon() -> bool { defined("CARGO_FEATURE_NO_NEON") } fn is_wasm32_simd() -> bool { defined("CARGO_FEATURE_WASM32_SIMD") } fn is_ci() -> bool { defined("BLAKE3_CI") } fn warn(warning: &str) { assert!(!warning.contains("\n")); println!("cargo:warning={}", warning); if is_ci() { println!("cargo:warning=Warnings in CI are treated as errors. Build failed."); std::process::exit(1); } } fn target_components() -> Vec { let target = env::var("TARGET").unwrap(); target.split("-").map(|s| s.to_string()).collect() } fn is_x86_64() -> bool { target_components()[0] == "x86_64" } fn is_windows_target() -> bool { env::var("CARGO_CFG_TARGET_OS").unwrap() == "windows" } fn use_msvc_asm() -> bool { const MSVC_NAMES: &[&str] = &["", "cl", "cl.exe"]; let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap_or_default(); let target_windows_msvc = target_os == "windows" && target_env == "msvc"; let host_triple = env::var("HOST").unwrap_or_default(); let target_triple = env::var("TARGET").unwrap_or_default(); let cross_compiling = host_triple != target_triple; let cc = env::var("CC").unwrap_or_default().to_ascii_lowercase(); if !target_windows_msvc { // We are not building for Windows with the MSVC toolchain. false } else if !cross_compiling && MSVC_NAMES.contains(&&*cc) { // We are building on Windows with the MSVC toolchain (and not cross-compiling for another architecture or target). true } else { // We are cross-compiling to Windows with the MSVC toolchain. 
let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default(); let cc = env::var(format!("CC_{target_arch}_{target_vendor}_windows_msvc")) .unwrap_or_default() .to_ascii_lowercase(); // Check if we are using the MSVC compiler. MSVC_NAMES.contains(&&*cc) } } fn is_x86_32() -> bool { let arch = &target_components()[0]; arch == "i386" || arch == "i586" || arch == "i686" } fn is_arm() -> bool { is_armv7() || is_aarch64() || target_components()[0] == "arm" } fn is_aarch64() -> bool { target_components()[0] == "aarch64" } fn is_armv7() -> bool { target_components()[0] == "armv7" } fn is_wasm32() -> bool { target_components()[0] == "wasm32" } fn endianness() -> String { let endianness = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap(); assert!(endianness == "little" || endianness == "big"); endianness } fn is_little_endian() -> bool { endianness() == "little" } fn is_big_endian() -> bool { endianness() == "big" } // Windows targets may be using the MSVC toolchain or the MinGW toolchain. The // right compiler flags to use depend on the toolchain. (And we don't want to // use flag_if_supported, because we don't want features to be silently // disabled by old compilers.) fn is_windows_msvc() -> bool { // Some targets are only two components long, so check in steps. let second_component = &target_components()[1]; (second_component == "pc" || second_component == "win7") && target_components()[2] == "windows" && target_components()[3] == "msvc" } // MinGW toolchain uses 2 different targets depending on the main compiler. // Target for a general MinGW toolchain ends with `-gnu` (GCC is used as C // compiler). Target for a LLVM-MinGW toolchain (Clang is used as C compiler) // ends with `-gnullvm`. fn is_windows_gnu() -> bool { // Some targets are only two components long, so check in steps. 
let second_component = &target_components()[1]; (second_component == "pc" || second_component == "win7") && target_components()[2] == "windows" && target_components()[3] != "msvc" } fn new_build() -> cc::Build { let mut build = cc::Build::new(); if !is_windows_msvc() { build.flag("-std=c11"); } // Do NOT trigger a rebuild any time the env changes (e.g. $PATH). // This prevents all downstream crates from being rebuilt when `cargo check` // or `cargo build` are run in different environments, like Rust Analyzer // vs. in the terminal vs. in a Git pre-commit hook. build.emit_rerun_if_env_changed(false); build } #[derive(PartialEq)] enum CCompilerSupport { NoCompiler, NoAVX512, YesAVX512, } use CCompilerSupport::*; fn c_compiler_support() -> CCompilerSupport { let build = new_build(); let flags_checked; let support_result: Result = if is_windows_msvc() { flags_checked = "/arch:AVX512"; build.is_flag_supported("/arch:AVX512") } else { // Check for both of the flags we use. If -mavx512f works, then -mavx512vl // will probably always work too, but we might as well be thorough. flags_checked = "-mavx512f and -mavx512vl"; match build.is_flag_supported("-mavx512f") { Ok(true) => build.is_flag_supported("-mavx512vl"), false_or_error => false_or_error, } }; match support_result { Ok(true) => YesAVX512, Ok(false) => { warn(&format!( "The C compiler {:?} does not support {}.", build.get_compiler().path(), flags_checked, )); NoAVX512 } Err(e) => { println!("{:?}", e); warn(&format!( "No C compiler {:?} detected.", build.get_compiler().path() )); NoCompiler } } } fn build_sse2_sse41_avx2_rust_intrinsics() { // No C code to compile here. Set the cfg flags that enable the Rust SSE2, // SSE4.1, and AVX2 intrinsics modules. The regular Cargo build will compile // them. 
println!("cargo:rustc-cfg=blake3_sse2_rust"); println!("cargo:rustc-cfg=blake3_sse41_rust"); println!("cargo:rustc-cfg=blake3_avx2_rust"); } fn build_sse2_sse41_avx2_assembly() { // Build the assembly implementations for SSE4.1 and AVX2. This is // preferred, but it only supports x86_64. assert!(is_x86_64()); println!("cargo:rustc-cfg=blake3_sse2_ffi"); println!("cargo:rustc-cfg=blake3_sse41_ffi"); println!("cargo:rustc-cfg=blake3_avx2_ffi"); let mut build = new_build(); if is_windows_target() { if use_msvc_asm() { build.file("c/blake3_sse2_x86-64_windows_msvc.asm"); build.file("c/blake3_sse41_x86-64_windows_msvc.asm"); build.file("c/blake3_avx2_x86-64_windows_msvc.asm"); } else { build.file("c/blake3_sse2_x86-64_windows_gnu.S"); build.file("c/blake3_sse41_x86-64_windows_gnu.S"); build.file("c/blake3_avx2_x86-64_windows_gnu.S"); } } else { // All non-Windows implementations are assumed to support // Linux-style assembly. These files do contain a small // explicit workaround for macOS also. build.file("c/blake3_sse2_x86-64_unix.S"); build.file("c/blake3_sse41_x86-64_unix.S"); build.file("c/blake3_avx2_x86-64_unix.S"); } build.compile("blake3_sse2_sse41_avx2_assembly"); } fn build_avx512_c_intrinsics() { // This is required on 32-bit x86 targets, since the assembly // implementation doesn't support those. println!("cargo:rustc-cfg=blake3_avx512_ffi"); let mut build = new_build(); build.file("c/blake3_avx512.c"); if is_windows_msvc() { build.flag("/arch:AVX512"); } else { build.flag("-mavx512f"); build.flag("-mavx512vl"); } if is_windows_gnu() { // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782. build.flag("-fno-asynchronous-unwind-tables"); } build.compile("blake3_avx512_intrinsics"); } fn build_avx512_assembly() { // Build the assembly implementation for AVX-512. This is preferred, but it // only supports x86_64. 
assert!(is_x86_64()); println!("cargo:rustc-cfg=blake3_avx512_ffi"); let mut build = new_build(); let mut is_msvc = false; if is_windows_target() { if use_msvc_asm() { build.file("c/blake3_avx512_x86-64_windows_msvc.asm"); is_msvc = true; } else { build.file("c/blake3_avx512_x86-64_windows_gnu.S"); } } else { build.file("c/blake3_avx512_x86-64_unix.S"); } // Older versions of Clang require these flags, even for assembly. See // https://github.com/BLAKE3-team/BLAKE3/issues/79. if !is_msvc { build.flag("-mavx512f"); build.flag("-mavx512vl"); } build.compile("blake3_avx512_assembly"); } fn build_neon_c_intrinsics() { let mut build = new_build(); // Note that blake3_neon.c normally depends on the blake3_portable.c // for the single-instance compression function, but we expose // portable.rs over FFI instead. See ffi_neon.rs. build.file("c/blake3_neon.c"); // ARMv7 platforms that support NEON generally need the following // flags. AArch64 supports NEON by default and does not support -mpfu. if is_armv7() { build.flag("-mfpu=neon-vfpv4"); build.flag("-mfloat-abi=hard"); } build.compile("blake3_neon"); } fn build_wasm32_simd() { assert!(is_wasm32()); // No C code to compile here. Set the cfg flags that enable the Wasm SIMD. // The regular Cargo build will compile it. println!("cargo:rustc-cfg=blake3_wasm32_simd"); } fn main() -> Result<(), Box> { // As of Rust 1.80, unrecognized config names are warnings. Give Cargo all of our config names. let all_cfgs = [ "blake3_sse2_ffi", "blake3_sse2_rust", "blake3_sse41_ffi", "blake3_sse41_rust", "blake3_avx2_ffi", "blake3_avx2_rust", "blake3_avx512_ffi", "blake3_neon", "blake3_wasm32_simd", ]; for cfg_name in all_cfgs { // TODO: Switch this whole file to the new :: syntax when our MSRV reaches 1.77. 
// https://doc.rust-lang.org/cargo/reference/build-scripts.html#outputs-of-the-build-script println!("cargo:rustc-check-cfg=cfg({cfg_name}, values(none()))"); } if is_pure() && is_neon() { panic!("It doesn't make sense to enable both \"pure\" and \"neon\"."); } if is_no_neon() && is_neon() { panic!("It doesn't make sense to enable both \"no_neon\" and \"neon\"."); } if is_x86_64() || is_x86_32() { let support = c_compiler_support(); if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler { build_sse2_sse41_avx2_rust_intrinsics(); } else { // We assume that all C compilers can assemble SSE4.1 and AVX2. We // don't explicitly check for support. build_sse2_sse41_avx2_assembly(); } if is_pure() || support == NoCompiler || support == NoAVX512 { // The binary will not include any AVX-512 code. } else if is_x86_32() || should_prefer_intrinsics() { build_avx512_c_intrinsics(); } else { build_avx512_assembly(); } } if is_neon() && is_big_endian() { panic!("The NEON implementation doesn't support big-endian ARM.") } if (is_arm() && is_neon()) || (!is_no_neon() && !is_pure() && is_aarch64() && is_little_endian()) { println!("cargo:rustc-cfg=blake3_neon"); build_neon_c_intrinsics(); } if is_wasm32() && is_wasm32_simd() { build_wasm32_simd(); } // The `cc` crate doesn't automatically emit rerun-if directives for the // environment variables it supports, in particular for $CC. We expect to // do a lot of benchmarking across different compilers, so we explicitly // add the variables that we're likely to need. println!("cargo:rerun-if-env-changed=CC"); println!("cargo:rerun-if-env-changed=CFLAGS"); // Ditto for source files, though these shouldn't change as often. for file in std::fs::read_dir("c")? 
{ println!( "cargo:rerun-if-changed={}", file?.path().to_str().expect("utf-8") ); } // When compiling with clang-cl for windows, it adds .asm files to the root // which we need to delete so cargo doesn't get angry if is_windows_target() && !use_msvc_asm() { let _ = std::fs::remove_file("blake3_avx2_x86-64_windows_gnu.asm"); let _ = std::fs::remove_file("blake3_avx512_x86-64_windows_gnu.asm"); let _ = std::fs::remove_file("blake3_sse2_x86-64_windows_gnu.asm"); let _ = std::fs::remove_file("blake3_sse41_x86-64_windows_gnu.asm"); } Ok(()) } blake3-1.8.1/c/.gitignore000064400000000000000000000000751046102023000132100ustar 00000000000000blake3 example example_tbb build/ *.o CMakeUserPresets.json blake3-1.8.1/c/CMakeLists.txt000064400000000000000000000334271046102023000137670ustar 00000000000000cmake_minimum_required(VERSION 3.9 FATAL_ERROR) # respect C_EXTENSIONS OFF without explicitly setting C_STANDARD if (POLICY CMP0128) cmake_policy(SET CMP0128 NEW) endif() # mark_as_advanced does not implicitly create UNINITIALIZED cache entries if (POLICY CMP0102) cmake_policy(SET CMP0102 NEW) endif() project(libblake3 VERSION 1.8.1 DESCRIPTION "BLAKE3 C implementation" LANGUAGES C CXX ASM ) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") option(BLAKE3_USE_TBB "Enable oneTBB parallelism" OFF) option(BLAKE3_FETCH_TBB "Allow fetching oneTBB from GitHub if not found on system" OFF) include(CTest) include(FeatureSummary) include(GNUInstallDirs) add_subdirectory(dependencies) # architecture lists for which to enable assembly / SIMD sources set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) set(BLAKE3_X86_NAMES i686 x86 X86) set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) # default SIMD compiler flag configuration (can be overriden by toolchains or CLI) if(MSVC) set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2") # MSVC has no dedicated sse4.1 flag (see 
https://learn.microsoft.com/en-us/cpp/build/reference/arch-x86?view=msvc-170) set(BLAKE3_CFLAGS_SSE4.1 "/arch:AVX" CACHE STRING "the compiler flags to enable SSE4.1") set(BLAKE3_CFLAGS_AVX2 "/arch:AVX2" CACHE STRING "the compiler flags to enable AVX2") set(BLAKE3_CFLAGS_AVX512 "/arch:AVX512" CACHE STRING "the compiler flags to enable AVX512") set(BLAKE3_AMD64_ASM_SOURCES blake3_avx2_x86-64_windows_msvc.asm blake3_avx512_x86-64_windows_msvc.asm blake3_sse2_x86-64_windows_msvc.asm blake3_sse41_x86-64_windows_msvc.asm ) elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang" OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang") set(BLAKE3_CFLAGS_SSE2 "-msse2" CACHE STRING "the compiler flags to enable SSE2") set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1") set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2") set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512") if (WIN32 OR CYGWIN) set(BLAKE3_AMD64_ASM_SOURCES blake3_avx2_x86-64_windows_gnu.S blake3_avx512_x86-64_windows_gnu.S blake3_sse2_x86-64_windows_gnu.S blake3_sse41_x86-64_windows_gnu.S ) elseif(UNIX) set(BLAKE3_AMD64_ASM_SOURCES blake3_avx2_x86-64_unix.S blake3_avx512_x86-64_unix.S blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S ) endif() if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) # 32-bit ARMv8 needs NEON to be enabled explicitly set(BLAKE3_CFLAGS_NEON "-mfpu=neon" CACHE STRING "the compiler flags to enable NEON") endif() endif() mark_as_advanced(BLAKE3_CFLAGS_SSE2 BLAKE3_CFLAGS_SSE4.1 BLAKE3_CFLAGS_AVX2 BLAKE3_CFLAGS_AVX512 BLAKE3_CFLAGS_NEON) mark_as_advanced(BLAKE3_AMD64_ASM_SOURCES) message(STATUS "BLAKE3 SIMD configuration: ${CMAKE_C_COMPILER_ARCHITECTURE_ID}") if(MSVC AND DEFINED CMAKE_C_COMPILER_ARCHITECTURE_ID) if(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Xx]86") set(BLAKE3_SIMD_TYPE "x86-intrinsics" CACHE STRING "the 
SIMD acceleration type to use") elseif(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Xx]64") set(BLAKE3_SIMD_TYPE "amd64-asm" CACHE STRING "the SIMD acceleration type to use") elseif(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Aa][Rr][Mm]64") set(BLAKE3_SIMD_TYPE "neon-intrinsics" CACHE STRING "the SIMD acceleration type to use") else() set(BLAKE3_SIMD_TYPE "none" CACHE STRING "the SIMD acceleration type to use") endif() elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES) set(BLAKE3_SIMD_TYPE "amd64-asm" CACHE STRING "the SIMD acceleration type to use") elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES AND DEFINED BLAKE3_CFLAGS_SSE2 AND DEFINED BLAKE3_CFLAGS_SSE4.1 AND DEFINED BLAKE3_CFLAGS_AVX2 AND DEFINED BLAKE3_CFLAGS_AVX512) set(BLAKE3_SIMD_TYPE "x86-intrinsics" CACHE STRING "the SIMD acceleration type to use") elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES OR ANDROID_ABI STREQUAL "armeabi-v7a" OR BLAKE3_USE_NEON_INTRINSICS) AND (DEFINED BLAKE3_CFLAGS_NEON OR CMAKE_SIZEOF_VOID_P EQUAL 8)) set(BLAKE3_SIMD_TYPE "neon-intrinsics" CACHE STRING "the SIMD acceleration type to use") else() set(BLAKE3_SIMD_TYPE "none" CACHE STRING "the SIMD acceleration type to use") endif() mark_as_advanced(BLAKE3_SIMD_TYPE) # library target add_library(blake3 blake3.c blake3_dispatch.c blake3_portable.c ) add_library(BLAKE3::blake3 ALIAS blake3) # library configuration set(BLAKE3_PKGCONFIG_CFLAGS) if (BUILD_SHARED_LIBS) target_compile_definitions(blake3 PUBLIC BLAKE3_DLL PRIVATE BLAKE3_DLL_EXPORTS ) list(APPEND BLAKE3_PKGCONFIG_CFLAGS -DBLAKE3_DLL) endif() target_include_directories(blake3 PUBLIC $ $ ) set_target_properties(blake3 PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 0 C_VISIBILITY_PRESET hidden C_EXTENSIONS OFF ) target_compile_features(blake3 PUBLIC c_std_99) if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.12) target_compile_features(blake3 PUBLIC cxx_std_20) # else: add it further below through `BLAKE3_CMAKE_CXXFLAGS_*` endif() # ensure C_EXTENSIONS OFF is 
respected without overriding CMAKE_C_STANDARD # which may be set by the user or toolchain file if (NOT POLICY CMP0128 AND NOT DEFINED CMAKE_C_STANDARD) set_target_properties(blake3 PROPERTIES C_STANDARD 99) endif() # optional SIMD sources if(BLAKE3_SIMD_TYPE STREQUAL "amd64-asm") if (NOT DEFINED BLAKE3_AMD64_ASM_SOURCES) message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to 'amd64-asm' but no assembly sources are available for the target architecture.") endif() set(BLAKE3_SIMD_AMD64_ASM ON) if(MSVC) enable_language(ASM_MASM) endif() target_sources(blake3 PRIVATE ${BLAKE3_AMD64_ASM_SOURCES}) elseif(BLAKE3_SIMD_TYPE STREQUAL "x86-intrinsics") if (NOT DEFINED BLAKE3_CFLAGS_SSE2 OR NOT DEFINED BLAKE3_CFLAGS_SSE4.1 OR NOT DEFINED BLAKE3_CFLAGS_AVX2 OR NOT DEFINED BLAKE3_CFLAGS_AVX512) message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to 'x86-intrinsics' but no compiler flags are available for the target architecture.") endif() set(BLAKE3_SIMD_X86_INTRINSICS ON) target_sources(blake3 PRIVATE blake3_avx2.c blake3_avx512.c blake3_sse2.c blake3_sse41.c ) set_source_files_properties(blake3_avx2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX2}") set_source_files_properties(blake3_avx512.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX512}") set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}") set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}") elseif(BLAKE3_SIMD_TYPE STREQUAL "neon-intrinsics") set(BLAKE3_SIMD_NEON_INTRINSICS ON) target_sources(blake3 PRIVATE blake3_neon.c ) target_compile_definitions(blake3 PRIVATE BLAKE3_USE_NEON=1 ) if (DEFINED BLAKE3_CFLAGS_NEON) set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}") endif() elseif(BLAKE3_SIMD_TYPE STREQUAL "none") target_compile_definitions(blake3 PRIVATE BLAKE3_USE_NEON=0 BLAKE3_NO_SSE2 BLAKE3_NO_SSE41 BLAKE3_NO_AVX2 BLAKE3_NO_AVX512 ) else() message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to an unknown value: 
'${BLAKE3_SIMD_TYPE}'") endif() if(BLAKE3_USE_TBB) find_package(TBB 2021.11.0 QUIET) if(NOT TBB_FOUND AND NOT TARGET TBB::tbb) message(WARNING "oneTBB not found; disabling BLAKE3_USE_TBB\n" "Enable BLAKE3_FETCH_TBB to automatically fetch and build oneTBB" ) set(BLAKE3_USE_TBB OFF) else() target_sources(blake3 PRIVATE blake3_tbb.cpp) target_link_libraries(blake3 PUBLIC # Make shared TBB a transitive dependency. The consuming program is technically not required # to link TBB in order for libblake3 to function but we do this in order to prevent the # possibility of multiple separate TBB runtimes being linked into a final program in case # the consuming program also happens to already use TBB. TBB::tbb) target_compile_definitions(blake3 PUBLIC BLAKE3_USE_TBB) endif() list(APPEND PKG_CONFIG_REQUIRES "tbb >= ${TBB_VERSION}") list(APPEND BLAKE3_PKGCONFIG_CFLAGS -DBLAKE3_USE_TBB) endif() if(BLAKE3_USE_TBB) # Define some scratch variables for building appropriate flags per compiler if(CMAKE_VERSION VERSION_LESS 3.12) set(APPEND BLAKE3_CXX_STANDARD_FLAGS_GNU -std=c++20) set(APPEND BLAKE3_CXX_STANDARD_FLAGS_MSVC /std:c++20) endif() set(BLAKE3_CXXFLAGS_GNU "-fno-exceptions;-fno-rtti;${BLAKE3_CXX_STANDARD_FLAGS_GNU}" CACHE STRING "C++ flags used for compiling private BLAKE3 library components with GNU-like compiler frontends.") set(BLAKE3_CXXFLAGS_MSVC "/EHs-c-;/GR-;${BLAKE3_CXX_STANDARD_FLAGS_MSVC}" CACHE STRING "C++ flags used for compiling private BLAKE3 library components with MSVC-like compiler frontends.") # Get the C++ compiler name without extension get_filename_component(BLAKE3_CMAKE_CXX_COMPILER_NAME "${CMAKE_CXX_COMPILER}" NAME_WE) # Strip any trailing versioning from the C++ compiler name string(REGEX MATCH "^(clang\\+\\+|clang-cl)" BLAKE3_CMAKE_CXX_COMPILER_NAME "${BLAKE3_CMAKE_CXX_COMPILER_NAME}") # TODO: Simplify with CMAKE_CXX_COMPILER_FRONTEND_VARIANT once min CMake version is 3.14. 
if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") target_compile_options(blake3 PRIVATE $<$:${BLAKE3_CXXFLAGS_GNU}>) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") if(BLAKE3_CMAKE_CXX_COMPILER_NAME STREQUAL "clang++") target_compile_options(blake3 PRIVATE $<$:${BLAKE3_CXXFLAGS_GNU}>) elseif(BLAKE3_CMAKE_CXX_COMPILER_NAME STREQUAL "clang-cl") target_compile_options(blake3 PRIVATE $<$:${BLAKE3_CXXFLAGS_MSVC}>) endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") target_compile_options(blake3 PRIVATE $<$:${BLAKE3_CXXFLAGS_GNU}>) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") target_compile_options(blake3 PRIVATE $<$:${BLAKE3_CXXFLAGS_MSVC}>) endif() # Undefine scratch variables unset(BLAKE3_CXX_STANDARD_FLAGS_GNU) unset(BLAKE3_CXX_STANDARD_FLAGS_MSVC) unset(BLAKE3_CMAKE_CXX_COMPILER_NAME) unset(BLAKE3_CXXFLAGS_GNU) unset(BLAKE3_CXXFLAGS_MSVC) endif() # cmake install support install(FILES blake3.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") install(TARGETS blake3 EXPORT blake3-targets ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" ) install(EXPORT blake3-targets NAMESPACE BLAKE3:: DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3" ) include(CMakePackageConfigHelpers) configure_package_config_file(blake3-config.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/blake3-config.cmake" INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3" ) write_basic_package_version_file( "${CMAKE_CURRENT_BINARY_DIR}/blake3-config-version.cmake" VERSION ${libblake3_VERSION} COMPATIBILITY SameMajorVersion ) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/blake3-config.cmake" "${CMAKE_CURRENT_BINARY_DIR}/blake3-config-version.cmake" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3" ) # Function for joining paths known from most languages # # SPDX-License-Identifier: (MIT OR CC0-1.0) # Copyright 2020 Jan Tojnar # https://github.com/jtojnar/cmake-snips # # Modelled after Python’s os.path.join # 
https://docs.python.org/3.7/library/os.path.html#os.path.join # Windows not supported function(join_paths joined_path first_path_segment) set(temp_path "${first_path_segment}") foreach(current_segment IN LISTS ARGN) if(NOT ("${current_segment}" STREQUAL "")) if(IS_ABSOLUTE "${current_segment}") set(temp_path "${current_segment}") else() set(temp_path "${temp_path}/${current_segment}") endif() endif() endforeach() set(${joined_path} "${temp_path}" PARENT_SCOPE) endfunction() # In-place rewrite a list of strings `requires` into a comma separated string. # # TODO: Replace function with list(JOIN) when updating to CMake 3.12 function(join_pkg_config_requires requires) set(_requires "${${requires}}") # avoid shadowing issues, e.g. "${requires}"=len list(LENGTH "${requires}" len) set(idx 1) foreach(req IN LISTS _requires) string(APPEND acc "${req}") if(idx LESS len) string(APPEND acc ", ") endif() math(EXPR idx "${idx} + 1") endforeach() set("${requires}" "${acc}" PARENT_SCOPE) endfunction() # pkg-config support join_pkg_config_requires(PKG_CONFIG_REQUIRES) join_paths(PKG_CONFIG_INSTALL_LIBDIR "\${prefix}" "${CMAKE_INSTALL_LIBDIR}") join_paths(PKG_CONFIG_INSTALL_INCLUDEDIR "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}") configure_file(libblake3.pc.in libblake3.pc @ONLY) install(FILES "${CMAKE_BINARY_DIR}/libblake3.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") # print feature summary # add_feature_info cannot directly use the BLAKE3_SIMD_TYPE :( add_feature_info("AMD64 assembly" BLAKE3_SIMD_AMD64_ASM "The library uses hand written amd64 SIMD assembly.") add_feature_info("x86 SIMD intrinsics" BLAKE3_SIMD_X86_INTRINSICS "The library uses x86 SIMD intrinsics.") add_feature_info("NEON SIMD intrinsics" BLAKE3_SIMD_NEON_INTRINSICS "The library uses NEON SIMD intrinsics.") add_feature_info("oneTBB parallelism" BLAKE3_USE_TBB "The library uses oneTBB parallelism.") feature_summary(WHAT ENABLED_FEATURES) if(BLAKE3_EXAMPLES) include(BLAKE3/Examples) endif() if(BLAKE3_TESTING) 
include(BLAKE3/Testing) endif() blake3-1.8.1/c/CMakePresets.json000064400000000000000000000034701046102023000144430ustar 00000000000000{ "version": 3, "cmakeMinimumRequired": { "major": 3, "minor": 22, "patch": 0 }, "configurePresets": [ { "name": "base", "hidden": true, "binaryDir": "${sourceDir}/build/${presetName}" }, { "name": "msvc", "hidden": true, "generator": "Visual Studio 17 2022", "vendor": { "microsoft.com/VisualStudioSettings/CMake/1.0": { "hostOS": [ "Windows" ] } } }, { "name": "x64-windows-msvc", "inherits": [ "msvc", "base" ], "architecture": "x64" }, { "name": "x86-windows-msvc", "inherits": [ "msvc", "base" ], "architecture": "Win32" }, { "name": "arm64-windows-msvc", "inherits": [ "msvc", "base" ], "architecture": "ARM64" } ], "buildPresets": [ { "name": "x64-windows-msvc-debug", "configurePreset": "x64-windows-msvc", "configuration": "Debug" }, { "name": "x64-windows-msvc-release", "configurePreset": "x64-windows-msvc", "configuration": "RelWithDebInfo" }, { "name": "x86-windows-msvc-debug", "configurePreset": "x86-windows-msvc", "configuration": "Debug" }, { "name": "x86-windows-msvc-release", "configurePreset": "x86-windows-msvc", "configuration": "RelWithDebInfo" } ] }blake3-1.8.1/c/Makefile.testing000064400000000000000000000040111046102023000143260ustar 00000000000000# This Makefile is only for testing. C callers should follow the instructions # in ./README.md to incorporate these C files into their existing build. 
NAME=blake3 CC=gcc CFLAGS=-O3 -Wall -Wextra -std=c11 -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2 -fPIE -fvisibility=hidden LDFLAGS=-pie -Wl,-z,relro,-z,now TARGETS= ASM_TARGETS= EXTRAFLAGS=-Wa,--noexecstack ifdef BLAKE3_NO_SSE2 EXTRAFLAGS += -DBLAKE3_NO_SSE2 else TARGETS += blake3_sse2.o ASM_TARGETS += blake3_sse2_x86-64_unix.S endif ifdef BLAKE3_NO_SSE41 EXTRAFLAGS += -DBLAKE3_NO_SSE41 else TARGETS += blake3_sse41.o ASM_TARGETS += blake3_sse41_x86-64_unix.S endif ifdef BLAKE3_NO_AVX2 EXTRAFLAGS += -DBLAKE3_NO_AVX2 else TARGETS += blake3_avx2.o ASM_TARGETS += blake3_avx2_x86-64_unix.S endif ifdef BLAKE3_NO_AVX512 EXTRAFLAGS += -DBLAKE3_NO_AVX512 else TARGETS += blake3_avx512.o ASM_TARGETS += blake3_avx512_x86-64_unix.S endif ifdef BLAKE3_USE_NEON EXTRAFLAGS += -DBLAKE3_USE_NEON=1 TARGETS += blake3_neon.o endif ifdef BLAKE3_NO_NEON EXTRAFLAGS += -DBLAKE3_USE_NEON=0 endif all: blake3.c blake3_dispatch.c blake3_portable.c main.c $(TARGETS) $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS) blake3_sse2.o: blake3_sse2.c $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse2 blake3_sse41.o: blake3_sse41.c $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse4.1 blake3_avx2.o: blake3_avx2.c $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx2 blake3_avx512.o: blake3_avx512.c $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx512f -mavx512vl blake3_neon.o: blake3_neon.c $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ test: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined test: all ./test.py asm: blake3.c blake3_dispatch.c blake3_portable.c main.c $(ASM_TARGETS) $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS) test_asm: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined test_asm: asm ./test.py example: example.c blake3.c blake3_dispatch.c blake3_portable.c $(ASM_TARGETS) $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $@ $(LDFLAGS) clean: rm -f $(NAME) *.o blake3-1.8.1/c/README.md000064400000000000000000000325141046102023000125020ustar 00000000000000The official C implementation 
of BLAKE3. # Example An example program that hashes bytes from standard input and prints the result: ```c #include "blake3.h" #include #include #include #include int main(void) { // Initialize the hasher. blake3_hasher hasher; blake3_hasher_init(&hasher); // Read input bytes from stdin. unsigned char buf[65536]; while (1) { ssize_t n = read(STDIN_FILENO, buf, sizeof(buf)); if (n > 0) { blake3_hasher_update(&hasher, buf, n); } else if (n == 0) { break; // end of file } else { fprintf(stderr, "read failed: %s\n", strerror(errno)); return 1; } } // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. uint8_t output[BLAKE3_OUT_LEN]; blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); // Print the hash as hexadecimal. for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { printf("%02x", output[i]); } printf("\n"); return 0; } ``` The code above is included in this directory as `example.c`. If you're on x86\_64 with a Unix-like OS, you can compile a working binary like this: ```bash gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \ blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ blake3_avx512_x86-64_unix.S ``` # API ## The Struct ```c typedef struct { // private fields } blake3_hasher; ``` An incremental BLAKE3 hashing state, which can accept any number of updates. This implementation doesn't allocate any heap memory, but `sizeof(blake3_hasher)` itself is relatively large, currently 1912 bytes on x86-64. This size can be reduced by restricting the maximum input length, as described in Section 5.4 of [the BLAKE3 spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf), but this implementation doesn't currently support that strategy. ## Common API Functions ```c void blake3_hasher_init( blake3_hasher *self); ``` Initialize a `blake3_hasher` in the default hashing mode. 
--- ```c void blake3_hasher_update( blake3_hasher *self, const void *input, size_t input_len); ``` Add input to the hasher. This can be called any number of times. This function is always single-threaded; for multithreading see `blake3_hasher_update_tbb` below. --- ```c void blake3_hasher_finalize( const blake3_hasher *self, uint8_t *out, size_t out_len); ``` Finalize the hasher and return an output of any length, given in bytes. This doesn't modify the hasher itself, and it's possible to finalize again after adding more input. The constant `BLAKE3_OUT_LEN` provides the default output length, 32 bytes, which is recommended for most callers. See the [Security Notes](#security-notes) below. ## Less Common API Functions ```c void blake3_hasher_init_keyed( blake3_hasher *self, const uint8_t key[BLAKE3_KEY_LEN]); ``` Initialize a `blake3_hasher` in the keyed hashing mode. The key must be exactly 32 bytes. --- ```c void blake3_hasher_init_derive_key( blake3_hasher *self, const char *context); ``` Initialize a `blake3_hasher` in the key derivation mode. The context string is given as an initialization parameter, and afterwards input key material should be given with `blake3_hasher_update`. The context string is a null-terminated C string which should be **hardcoded, globally unique, and application-specific**. The context string should not include any dynamic input like salts, nonces, or identifiers read from a database at runtime. A good default format for the context string is `"[application] [commit timestamp] [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`. This function is intended for application code written in C. For language bindings, see `blake3_hasher_init_derive_key_raw` below. 
--- ```c void blake3_hasher_init_derive_key_raw( blake3_hasher *self, const void *context, size_t context_len); ``` As `blake3_hasher_init_derive_key` above, except that the context string is given as a pointer to an array of arbitrary bytes with a provided length. This is intended for writing language bindings, where C string conversion would add unnecessary overhead and new error cases. Unicode strings should be encoded as UTF-8. Application code in C should prefer `blake3_hasher_init_derive_key`, which takes the context as a C string. If you need to use arbitrary bytes as a context string in application code, consider whether you're violating the requirement that context strings should be hardcoded. --- ```c void blake3_hasher_update_tbb( blake3_hasher *self, const void *input, size_t input_len); ``` Add input to the hasher, using [oneTBB] to process large inputs using multiple threads. This can be called any number of times. This gives the same result as `blake3_hasher_update` above. [oneTBB]: https://uxlfoundation.github.io/oneTBB/ NOTE: This function is only enabled when the library is compiled with CMake option `BLAKE3_USE_TBB` and when the oneTBB library is detected on the host system. See the building instructions for further details. To get any performance benefit from multithreading, the input buffer needs to be large. As a rule of thumb on x86_64, `blake3_hasher_update_tbb` is _slower_ than `blake3_hasher_update` for inputs under 128 KiB. That threshold varies quite a lot across different processors, and it's important to benchmark your specific use case. Hashing large files with this function usually requires [memory-mapping](https://en.wikipedia.org/wiki/Memory-mapped_file), since reading a file into memory in a single-threaded loop takes longer than hashing the resulting buffer. Note that hashing a memory-mapped file with this function produces a "random" pattern of disk reads, which can be slow on spinning disks. 
Again it's important to benchmark your specific use case. This implementation doesn't require configuration of thread resources and will use as many cores as possible by default. More fine-grained control of resources is possible using the [oneTBB] API. --- ```c void blake3_hasher_finalize_seek( const blake3_hasher *self, uint64_t seek, uint8_t *out, size_t out_len); ``` The same as `blake3_hasher_finalize`, but with an additional `seek` parameter for the starting byte position in the output stream. To efficiently stream a large output without allocating memory, call this function in a loop, incrementing `seek` by the output length each time. --- ```c void blake3_hasher_reset( blake3_hasher *self); ``` Reset the hasher to its initial state, prior to any calls to `blake3_hasher_update`. Currently this is no different from calling `blake3_hasher_init` or similar again. # Security Notes Outputs shorter than the default length of 32 bytes (256 bits) provide less security. An N-bit BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2 bits of collision resistance, for any N up to 256. Longer outputs don't provide any additional security. Avoid relying on the secrecy of the output offset, that is, the `seek` argument of `blake3_hasher_finalize_seek`. [_Block-Cipher-Based Tree Hashing_ by Aldo Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows both the message and the key (if any) can easily determine the offset of an extended output. For comparison, AES-CTR has a similar property: if you know the key, you can decrypt a block from an unknown position in the output stream to recover its block index. Callers with strong secret keys aren't affected in practice, but secret offsets are a [design smell](https://en.wikipedia.org/wiki/Design_smell) in any case. # Building The easiest and most complete method of compiling this library is with CMake. This is the method described in the next section. 
Toward the end of the building section there are more in depth notes about compiling manually and things that are useful to understand if you need to integrate this library with another build system. ## CMake The minimum version of CMake is 3.9. The following invocations will compile and install `libblake3`. With recent CMake: ```bash cmake -S c -B c/build "-DCMAKE_INSTALL_PREFIX=/usr/local" cmake --build c/build --target install ``` With an older CMake: ```bash cd c mkdir build cd build cmake .. "-DCMAKE_INSTALL_PREFIX=/usr/local" cmake --build . --target install ``` The following options are available when compiling with CMake: - `BLAKE3_USE_TBB`: Enable oneTBB parallelism (Requires a C++20 capable compiler) - `BLAKE3_FETCH_TBB`: Allow fetching oneTBB from GitHub (only if not found on system) - `BLAKE3_EXAMPLES`: Compile and install example programs Options can be enabled like this: ```bash cmake -S c -B c/build "-DCMAKE_INSTALL_PREFIX=/usr/local" -DBLAKE3_USE_TBB=1 -DBLAKE3_FETCH_TBB=1 ``` ## Building manually We try to keep the build simple enough that you can compile this library "by hand", and it's expected that many callers will integrate it with their pre-existing build systems. See the `gcc` one-liner in the "Example" section above. ### x86 Dynamic dispatch is enabled by default on x86. The implementation will query the CPU at runtime to detect SIMD support, and it will use the widest instruction set available. By default, `blake3_dispatch.c` expects to be linked with code for five different instruction sets: portable C, SSE2, SSE4.1, AVX2, and AVX-512. For each of the x86 SIMD instruction sets, four versions are available: three flavors of assembly (Unix, Windows MSVC, and Windows GNU) and one version using C intrinsics. The assembly versions are generally preferred. They perform better, they perform more consistently across different compilers, and they build more quickly. 
On the other hand, the assembly versions are x86\_64-only, and you need to select the right flavor for your target platform. Here's an example of building a shared library on x86\_64 Linux using the assembly implementations: ```bash gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ blake3_avx512_x86-64_unix.S ``` When building the intrinsics-based implementations, you need to build each implementation separately, with the corresponding instruction set explicitly enabled in the compiler. Here's the same shared library using the intrinsics-based implementations: ```bash gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o ``` Note above that building `blake3_avx512.c` requires both `-mavx512f` and `-mavx512vl` under GCC and Clang. Under MSVC, the single `/arch:AVX512` flag is sufficient. The MSVC equivalent of `-mavx2` is `/arch:AVX2`. MSVC enables SSE2 and SSE4.1 by default, and it doesn't have a corresponding flag. If you want to omit SIMD code entirely, you need to explicitly disable each instruction set. Here's an example of building a shared library on x86 with only portable code: ```bash gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \ -DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c ``` ### ARM NEON The NEON implementation is enabled by default on AArch64, but not on other ARM targets, since not all of them support it. To enable it, set `BLAKE3_USE_NEON=1`. 
Here's an example of building a shared library on ARM Linux with NEON support: ```bash gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=1 blake3.c blake3_dispatch.c \ blake3_portable.c blake3_neon.c ``` To explicitly disable using NEON instructions on AArch64, set `BLAKE3_USE_NEON=0`. ```bash gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=0 blake3.c blake3_dispatch.c \ blake3_portable.c ``` Note that on some targets (ARMv7 in particular), extra flags may be required to activate NEON support in the compiler. If you see an error like... ``` /usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed in call to always_inline ‘vaddq_u32’: target specific option mismatch ``` ...then you may need to add something like `-mfpu=neon-vfpv4 -mfloat-abi=hard`. ### Other Platforms The portable implementation should work on most other architectures. For example: ```bash gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c ``` ### Multithreading Multithreading is available using [oneTBB], by compiling the optional C++ support file [`blake3_tbb.cpp`](./blake3_tbb.cpp). For an example of using `mmap` (non-Windows) and `blake3_hasher_update_tbb` to get large-file performance on par with [`b3sum`](../b3sum), see [`example_tbb.c`](./example_tbb.c). You can build it like this: ```bash g++ -c -O3 -fno-exceptions -fno-rtti -DBLAKE3_USE_TBB -o blake3_tbb.o blake3_tbb.cpp gcc -O3 -o example_tbb -lstdc++ -ltbb -DBLAKE3_USE_TBB blake3_tbb.o example_tbb.c blake3.c \ blake3_dispatch.c blake3_portable.c blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S \ blake3_avx2_x86-64_unix.S blake3_avx512_x86-64_unix.S ``` NOTE: `-fno-exceptions` or equivalent is required to compile `blake3_tbb.cpp`, and public API methods with external C linkage are marked `noexcept`. Compiling that file with exceptions enabled will fail. Compiling with RTTI disabled isn't required but is recommended for code size. 
blake3-1.8.1/c/blake3-config.cmake.in000064400000000000000000000004251046102023000152320ustar 00000000000000@PACKAGE_INIT@ include(CMakeFindDependencyMacro) # Remember TBB option state set(BLAKE3_USE_TBB @BLAKE3_USE_TBB@) if(BLAKE3_USE_TBB) find_dependency(TBB @TBB_VERSION@) endif() include("${CMAKE_CURRENT_LIST_DIR}/blake3-targets.cmake") check_required_components(blake3) blake3-1.8.1/c/blake3.c000064400000000000000000000703611046102023000125320ustar 00000000000000#include #include #include #include "blake3.h" #include "blake3_impl.h" const char *blake3_version(void) { return BLAKE3_VERSION_STRING; } INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], uint8_t flags) { memcpy(self->cv, key, BLAKE3_KEY_LEN); self->chunk_counter = 0; memset(self->buf, 0, BLAKE3_BLOCK_LEN); self->buf_len = 0; self->blocks_compressed = 0; self->flags = flags; } INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], uint64_t chunk_counter) { memcpy(self->cv, key, BLAKE3_KEY_LEN); self->chunk_counter = chunk_counter; self->blocks_compressed = 0; memset(self->buf, 0, BLAKE3_BLOCK_LEN); self->buf_len = 0; } INLINE size_t chunk_state_len(const blake3_chunk_state *self) { return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + ((size_t)self->buf_len); } INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, const uint8_t *input, size_t input_len) { size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); if (take > input_len) { take = input_len; } uint8_t *dest = self->buf + ((size_t)self->buf_len); memcpy(dest, input, take); self->buf_len += (uint8_t)take; return take; } INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { if (self->blocks_compressed == 0) { return CHUNK_START; } else { return 0; } } typedef struct { uint32_t input_cv[8]; uint64_t counter; uint8_t block[BLAKE3_BLOCK_LEN]; uint8_t block_len; uint8_t flags; } output_t; INLINE output_t make_output(const uint32_t input_cv[8], const uint8_t 
block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { output_t ret; memcpy(ret.input_cv, input_cv, 32); memcpy(ret.block, block, BLAKE3_BLOCK_LEN); ret.block_len = block_len; ret.counter = counter; ret.flags = flags; return ret; } // Chaining values within a given chunk (specifically the compress_in_place // interface) are represented as words. This avoids unnecessary bytes<->words // conversion overhead in the portable implementation. However, the hash_many // interface handles both user input and parent node blocks, so it accepts // bytes. For that reason, chaining values in the CV stack are represented as // bytes. INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { uint32_t cv_words[8]; memcpy(cv_words, self->input_cv, 32); blake3_compress_in_place(cv_words, self->block, self->block_len, self->counter, self->flags); store_cv_words(cv, cv_words); } INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, size_t out_len) { if (out_len == 0) { return; } uint64_t output_block_counter = seek / 64; size_t offset_within_block = seek % 64; uint8_t wide_buf[64]; if(offset_within_block) { blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); const size_t available_bytes = 64 - offset_within_block; const size_t bytes = out_len > available_bytes ? 
available_bytes : out_len; memcpy(out, wide_buf + offset_within_block, bytes); out += bytes; out_len -= bytes; output_block_counter += 1; } if(out_len / 64) { blake3_xof_many(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64); } output_block_counter += out_len / 64; out += out_len & -64; out_len -= out_len & -64; if(out_len) { blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); memcpy(out, wide_buf, out_len); } } INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, size_t input_len) { if (self->buf_len > 0) { size_t take = chunk_state_fill_buf(self, input, input_len); input += take; input_len -= take; if (input_len > 0) { blake3_compress_in_place( self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self)); self->blocks_compressed += 1; self->buf_len = 0; memset(self->buf, 0, BLAKE3_BLOCK_LEN); } } while (input_len > BLAKE3_BLOCK_LEN) { blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self)); self->blocks_compressed += 1; input += BLAKE3_BLOCK_LEN; input_len -= BLAKE3_BLOCK_LEN; } chunk_state_fill_buf(self, input, input_len); } INLINE output_t chunk_state_output(const blake3_chunk_state *self) { uint8_t block_flags = self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, block_flags); } INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], const uint32_t key[8], uint8_t flags) { return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); } // Given some input larger than one chunk, return the number of bytes that // should go in the left subtree. This is the largest power-of-2 number of // chunks that leaves at least 1 byte for the right subtree. 
INLINE size_t left_subtree_len(size_t input_len) { // Subtract 1 to reserve at least one byte for the right side. input_len // should always be greater than BLAKE3_CHUNK_LEN. size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN; return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; } // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time // on a single thread. Write out the chunk chaining values and return the // number of chunks hashed. These chunks are never the root and never empty; // those cases use a different codepath. INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out) { #if defined(BLAKE3_TESTING) assert(0 < input_len); assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); #endif const uint8_t *chunks_array[MAX_SIMD_DEGREE]; size_t input_position = 0; size_t chunks_array_len = 0; while (input_len - input_position >= BLAKE3_CHUNK_LEN) { chunks_array[chunks_array_len] = &input[input_position]; input_position += BLAKE3_CHUNK_LEN; chunks_array_len += 1; } blake3_hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, true, flags, CHUNK_START, CHUNK_END, out); // Hash the remaining partial chunk, if there is one. Note that the empty // chunk (meaning the empty message) is a different codepath. if (input_len > input_position) { uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; blake3_chunk_state chunk_state; chunk_state_init(&chunk_state, key, flags); chunk_state.chunk_counter = counter; chunk_state_update(&chunk_state, &input[input_position], input_len - input_position); output_t output = chunk_state_output(&chunk_state); output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); return chunks_array_len + 1; } else { return chunks_array_len; } } // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time // on a single thread. 
Write out the parent chaining values and return the // number of parents hashed. (If there's an odd input chaining value left over, // return it as an additional output.) These parents are never the root and // never empty; those cases use a different codepath. INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t *out) { #if defined(BLAKE3_TESTING) assert(2 <= num_chaining_values); assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); #endif const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; size_t parents_array_len = 0; while (num_chaining_values - (2 * parents_array_len) >= 2) { parents_array[parents_array_len] = &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; parents_array_len += 1; } blake3_hash_many(parents_array, parents_array_len, 1, key, 0, // Parents always use counter 0. false, flags | PARENT, 0, // Parents have no start flags. 0, // Parents have no end flags. out); // If there's an odd child left over, it becomes an output. if (num_chaining_values > 2 * parents_array_len) { memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], BLAKE3_OUT_LEN); return parents_array_len + 1; } else { return parents_array_len; } } // The wide helper function returns (writes out) an array of chaining values // and returns the length of that array. The number of chaining values returned // is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, // if the input is shorter than that many chunks. The reason for maintaining a // wide array of chaining values going back up the tree, is to allow the // implementation to hash as many parents in parallel as possible. // // As a special case when the SIMD degree is 1, this function will still return // at least 2 outputs. This guarantees that this function doesn't perform the // root compression. 
(If it did, it would use the wrong flags, and also we // wouldn't be able to implement extendable output.) Note that this function is // not used when the whole input is only 1 chunk long; that's a different // codepath. // // Why not just have the caller split the input on the first update(), instead // of implementing this special rule? Because we don't want to limit SIMD or // multi-threading parallelism for that update(). size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out, bool use_tbb) { // Note that the single chunk case does *not* bump the SIMD degree up to 2 // when it is 1. If this implementation adds multi-threading in the future, // this gives us the option of multi-threading even the 2-chunk case, which // can help performance on smaller platforms. if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, out); } // With more than simd_degree chunks, we need to recurse. Start by dividing // the input into left and right subtrees. (Note that this is only optimal // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree // of 3 or something, we'll need a more complicated strategy.) size_t left_input_len = left_subtree_len(input_len); size_t right_input_len = input_len - left_input_len; const uint8_t *right_input = &input[left_input_len]; uint64_t right_chunk_counter = chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to // account for the special case of returning 2 outputs when the SIMD degree // is 1. uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; size_t degree = blake3_simd_degree(); if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { // The special case: We always use a degree of at least two, to make // sure there are two outputs. 
Except, as noted above, at the chunk // level, where we allow degree=1. (Note that the 1-chunk-input case is // a different codepath.) degree = 2; } uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; // Recurse! size_t left_n = -1; size_t right_n = -1; #if defined(BLAKE3_USE_TBB) blake3_compress_subtree_wide_join_tbb( key, flags, use_tbb, // left-hand side input, left_input_len, chunk_counter, cv_array, &left_n, // right-hand side right_input, right_input_len, right_chunk_counter, right_cvs, &right_n); #else left_n = blake3_compress_subtree_wide( input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb); right_n = blake3_compress_subtree_wide(right_input, right_input_len, key, right_chunk_counter, flags, right_cvs, use_tbb); #endif // BLAKE3_USE_TBB // The special case again. If simd_degree=1, then we'll have left_n=1 and // right_n=1. Rather than compressing them into a single output, return // them directly, to make sure we always have at least two outputs. if (left_n == 1) { memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); return 2; } // Otherwise, do one layer of parent node compression. size_t num_chaining_values = left_n + right_n; return compress_parents_parallel(cv_array, num_chaining_values, key, flags, out); } // Hash a subtree with compress_subtree_wide(), and then condense the resulting // list of chaining values down to a single parent node. Don't compress that // last parent node, however. Instead, return its message bytes (the // concatenated chaining values of its children). This is necessary when the // first call to update() supplies a complete subtree, because the topmost // parent node of that subtree could end up being the root. It's also necessary // for extended output in the general case. // // As with compress_subtree_wide(), this function is not used on inputs of 1 // chunk or less. That's a different codepath. 
INLINE void compress_subtree_to_parent_node(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN], bool use_tbb) { #if defined(BLAKE3_TESTING) assert(input_len > BLAKE3_CHUNK_LEN); #endif uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array, use_tbb); assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because // as we just asserted, num_cvs will always be <=2 in that case. But GCC // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is // set then it emits incorrect warnings here. We tried a few different // hacks to silence these, but in the end our hacks just produced different // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of // desperation, we ifdef out this entire loop when we know it's not needed. #if MAX_SIMD_DEGREE_OR_2 > 2 // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input, // compress_subtree_wide() returns more than 2 chaining values. Condense // them into 2 by forming parent nodes repeatedly. 
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; while (num_cvs > 2) { num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); } #endif memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); } INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], uint8_t flags) { memcpy(self->key, key, BLAKE3_KEY_LEN); chunk_state_init(&self->chunk, key, flags); self->cv_stack_len = 0; } void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } void blake3_hasher_init_keyed(blake3_hasher *self, const uint8_t key[BLAKE3_KEY_LEN]) { uint32_t key_words[8]; load_key_words(key, key_words); hasher_init_base(self, key_words, KEYED_HASH); } void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, size_t context_len) { blake3_hasher context_hasher; hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); blake3_hasher_update(&context_hasher, context, context_len); uint8_t context_key[BLAKE3_KEY_LEN]; blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); uint32_t context_key_words[8]; load_key_words(context_key, context_key_words); hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); } void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { blake3_hasher_init_derive_key_raw(self, context, strlen(context)); } // As described in hasher_push_cv() below, we do "lazy merging", delaying // merges until right before the next CV is about to be added. This is // different from the reference implementation. Another difference is that we // aren't always merging 1 chunk at a time. Instead, each CV might represent // any power-of-two number of chunks, as long as the smaller-above-larger stack // order is maintained. 
Instead of the "count the trailing 0-bits" algorithm // described in the spec, we use a "count the total number of 1-bits" variant // that doesn't require us to retain the subtree size of the CV on top of the // stack. The principle is the same: each CV that should remain in the stack is // represented by a 1-bit in the total number of chunks (or bytes) so far. INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { size_t post_merge_stack_len = (size_t)popcnt(total_len); while (self->cv_stack_len > post_merge_stack_len) { uint8_t *parent_node = &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; output_t output = parent_output(parent_node, self->key, self->chunk.flags); output_chaining_value(&output, parent_node); self->cv_stack_len -= 1; } } // In reference_impl.rs, we merge the new CV with existing CVs from the stack // before pushing it. We can do that because we know more input is coming, so // we know none of the merges are root. // // This setting is different. We want to feed as much input as possible to // compress_subtree_wide(), without setting aside anything for the chunk_state. // If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once // as a single subtree, if at all possible. // // This leads to two problems: // 1) This 64 KiB input might be the only call that ever gets made to update. // In this case, the root node of the 64 KiB subtree would be the root node // of the whole tree, and it would need to be ROOT finalized. We can't // compress it until we know. // 2) This 64 KiB input might complete a larger tree, whose root node is // similarly going to be the root of the whole tree. For example, maybe // we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the // node at the root of the 256 KiB subtree until we know how to finalize it. // // The second problem is solved with "lazy merging". 
That is, when we're about // to add a CV to the stack, we don't merge it with anything first, as the // reference impl does. Instead we do merges using the *previous* CV that was // added, which is sitting on top of the stack, and we put the new CV // (unmerged) on top of the stack afterwards. This guarantees that we never // merge the root node until finalize(). // // Solving the first problem requires an additional tool, // compress_subtree_to_parent_node(). That function always returns the top // *two* chaining values of the subtree it's compressing. We then do lazy // merging with each of them separately, so that the second CV will always // remain unmerged. (That also helps us support extendable output when we're // hashing an input all-at-once.) INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter) { hasher_merge_cv_stack(self, chunk_counter); memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN); self->cv_stack_len += 1; } INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input, size_t input_len, bool use_tbb) { // Explicitly checking for zero avoids causing UB by passing a null pointer // to memcpy. This comes up in practice with things like: // std::vector v; // blake3_hasher_update(&hasher, v.data(), v.size()); if (input_len == 0) { return; } const uint8_t *input_bytes = (const uint8_t *)input; // If we have some partial chunk bytes in the internal chunk_state, we need // to finish that chunk first. if (chunk_state_len(&self->chunk) > 0) { size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); if (take > input_len) { take = input_len; } chunk_state_update(&self->chunk, input_bytes, take); input_bytes += take; input_len -= take; // If we've filled the current chunk and there's more coming, finalize this // chunk and proceed. In this case we know it's not the root. 
if (input_len > 0) { output_t output = chunk_state_output(&self->chunk); uint8_t chunk_cv[32]; output_chaining_value(&output, chunk_cv); hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); } else { return; } } // Now the chunk_state is clear, and we have more input. If there's more than // a single chunk (so, definitely not the root chunk), hash the largest whole // subtree we can, with the full benefits of SIMD (and maybe in the future, // multi-threading) parallelism. Two restrictions: // - The subtree has to be a power-of-2 number of chunks. Only subtrees along // the right edge can be incomplete, and we don't know where the right edge // is going to be until we get to finalize(). // - The subtree must evenly divide the total number of chunks up until this // point (if total is not 0). If the current incomplete subtree is only // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have // to complete the current subtree first. // Because we might need to break up the input to form powers of 2, or to // evenly divide what we already have, this part runs in a loop. while (input_len > BLAKE3_CHUNK_LEN) { size_t subtree_len = round_down_to_power_of_2(input_len); uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; // Shrink the subtree_len until it evenly divides the count so far. We know // that subtree_len itself is a power of 2, so we can use a bitmasking // trick instead of an actual remainder operation. (Note that if the caller // consistently passes power-of-2 inputs of the same size, as is hopefully // typical, this loop condition will always fail, and subtree_len will // always be the full length of the input.) // // An aside: We don't have to shrink subtree_len quite this much. For // example, if count_so_far is 1, we could pass 2 chunks to // compress_subtree_to_parent_node. 
Since we'll get 2 CVs back, we'll still // get the right answer in the end, and we might get to use 2-way SIMD // parallelism. The problem with this optimization, is that it gets us // stuck always hashing 2 chunks. The total number of chunks will remain // odd, and we'll never graduate to higher degrees of parallelism. See // https://github.com/BLAKE3-team/BLAKE3/issues/69. while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { subtree_len /= 2; } // The shrunken subtree_len might now be 1 chunk long. If so, hash that one // chunk by itself. Otherwise, compress the subtree into a pair of CVs. uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; if (subtree_len <= BLAKE3_CHUNK_LEN) { blake3_chunk_state chunk_state; chunk_state_init(&chunk_state, self->key, self->chunk.flags); chunk_state.chunk_counter = self->chunk.chunk_counter; chunk_state_update(&chunk_state, input_bytes, subtree_len); output_t output = chunk_state_output(&chunk_state); uint8_t cv[BLAKE3_OUT_LEN]; output_chaining_value(&output, cv); hasher_push_cv(self, cv, chunk_state.chunk_counter); } else { // This is the high-performance happy path, though getting here depends // on the caller giving us a long enough input. uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, self->chunk.chunk_counter, self->chunk.flags, cv_pair, use_tbb); hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], self->chunk.chunk_counter + (subtree_chunks / 2)); } self->chunk.chunk_counter += subtree_chunks; input_bytes += subtree_len; input_len -= subtree_len; } // If there's any remaining input less than a full chunk, add it to the chunk // state. In that case, also do a final merge loop to make sure the subtree // stack doesn't contain any unmerged pairs. The remaining input means we // know these merges are non-root. 
This merge loop isn't strictly necessary // here, because hasher_push_chunk_cv already does its own merge loop, but it // simplifies blake3_hasher_finalize below. if (input_len > 0) { chunk_state_update(&self->chunk, input_bytes, input_len); hasher_merge_cv_stack(self, self->chunk.chunk_counter); } } void blake3_hasher_update(blake3_hasher *self, const void *input, size_t input_len) { bool use_tbb = false; blake3_hasher_update_base(self, input, input_len, use_tbb); } #if defined(BLAKE3_USE_TBB) void blake3_hasher_update_tbb(blake3_hasher *self, const void *input, size_t input_len) { bool use_tbb = true; blake3_hasher_update_base(self, input, input_len, use_tbb); } #endif // BLAKE3_USE_TBB void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, size_t out_len) { blake3_hasher_finalize_seek(self, 0, out, out_len); } void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, uint8_t *out, size_t out_len) { // Explicitly checking for zero avoids causing UB by passing a null pointer // to memcpy. This comes up in practice with things like: // std::vector v; // blake3_hasher_finalize(&hasher, v.data(), v.size()); if (out_len == 0) { return; } // If the subtree stack is empty, then the current chunk is the root. if (self->cv_stack_len == 0) { output_t output = chunk_state_output(&self->chunk); output_root_bytes(&output, seek, out, out_len); return; } // If there are any bytes in the chunk state, finalize that chunk and do a // roll-up merge between that chunk hash and every subtree in the stack. In // this case, the extra merge loop at the end of blake3_hasher_update // guarantees that none of the subtrees in the stack need to be merged with // each other first. Otherwise, if there are no bytes in the chunk state, // then the top of the stack is a chunk hash, and we start the merge from // that. 
output_t output; size_t cvs_remaining; if (chunk_state_len(&self->chunk) > 0) { cvs_remaining = self->cv_stack_len; output = chunk_state_output(&self->chunk); } else { // There are always at least 2 CVs in the stack in this case. cvs_remaining = self->cv_stack_len - 2; output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, self->chunk.flags); } while (cvs_remaining > 0) { cvs_remaining -= 1; uint8_t parent_block[BLAKE3_BLOCK_LEN]; memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); output_chaining_value(&output, &parent_block[32]); output = parent_output(parent_block, self->key, self->chunk.flags); } output_root_bytes(&output, seek, out, out_len); } void blake3_hasher_reset(blake3_hasher *self) { chunk_state_reset(&self->chunk, self->key, 0); self->cv_stack_len = 0; } blake3-1.8.1/c/blake3.h000064400000000000000000000055251046102023000125370ustar 00000000000000#ifndef BLAKE3_H #define BLAKE3_H #include #include #if !defined(BLAKE3_API) # if defined(_WIN32) || defined(__CYGWIN__) # if defined(BLAKE3_DLL) # if defined(BLAKE3_DLL_EXPORTS) # define BLAKE3_API __declspec(dllexport) # else # define BLAKE3_API __declspec(dllimport) # endif # define BLAKE3_PRIVATE # else # define BLAKE3_API # define BLAKE3_PRIVATE # endif # elif __GNUC__ >= 4 # define BLAKE3_API __attribute__((visibility("default"))) # define BLAKE3_PRIVATE __attribute__((visibility("hidden"))) # else # define BLAKE3_API # define BLAKE3_PRIVATE # endif #endif #ifdef __cplusplus extern "C" { #endif #define BLAKE3_VERSION_STRING "1.8.1" #define BLAKE3_KEY_LEN 32 #define BLAKE3_OUT_LEN 32 #define BLAKE3_BLOCK_LEN 64 #define BLAKE3_CHUNK_LEN 1024 #define BLAKE3_MAX_DEPTH 54 // This struct is a private implementation detail. It has to be here because // it's part of blake3_hasher below. 
typedef struct { uint32_t cv[8]; uint64_t chunk_counter; uint8_t buf[BLAKE3_BLOCK_LEN]; uint8_t buf_len; uint8_t blocks_compressed; uint8_t flags; } blake3_chunk_state; typedef struct { uint32_t key[8]; blake3_chunk_state chunk; uint8_t cv_stack_len; // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk // requires a 4th entry, rather than merging everything down to 1, because we // don't know whether more input is coming. This is different from how the // reference implementation does things. uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; } blake3_hasher; BLAKE3_API const char *blake3_version(void); BLAKE3_API void blake3_hasher_init(blake3_hasher *self); BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self, const uint8_t key[BLAKE3_KEY_LEN]); BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, size_t context_len); BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input, size_t input_len); #if defined(BLAKE3_USE_TBB) BLAKE3_API void blake3_hasher_update_tbb(blake3_hasher *self, const void *input, size_t input_len); #endif // BLAKE3_USE_TBB BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, size_t out_len); BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, uint8_t *out, size_t out_len); BLAKE3_API void blake3_hasher_reset(blake3_hasher *self); #ifdef __cplusplus } #endif #endif /* BLAKE3_H */ blake3-1.8.1/c/blake3_avx2.c000064400000000000000000000302441046102023000134660ustar 00000000000000#include "blake3_impl.h" #include #define DEGREE 8 INLINE __m256i loadu(const uint8_t src[32]) { return _mm256_loadu_si256((const __m256i *)src); } INLINE void storeu(__m256i src, uint8_t dest[16]) { _mm256_storeu_si256((__m256i *)dest, src); } INLINE __m256i 
addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } // Note that clang-format doesn't like the name "xor" for some reason. INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } INLINE __m256i rot16(__m256i x) { return _mm256_shuffle_epi8( x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); } INLINE __m256i rot12(__m256i x) { return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); } INLINE __m256i rot8(__m256i x) { return _mm256_shuffle_epi8( x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); } INLINE __m256i rot7(__m256i x) { return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); } INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); v[0] = addv(v[0], v[4]); v[1] = addv(v[1], v[5]); v[2] = addv(v[2], v[6]); v[3] = addv(v[3], v[7]); v[12] = xorv(v[12], v[0]); v[13] = xorv(v[13], v[1]); v[14] = xorv(v[14], v[2]); v[15] = xorv(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = addv(v[8], v[12]); v[9] = addv(v[9], v[13]); v[10] = addv(v[10], v[14]); v[11] = addv(v[11], v[15]); v[4] = xorv(v[4], v[8]); v[5] = xorv(v[5], v[9]); v[6] = xorv(v[6], v[10]); v[7] = xorv(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); v[0] = addv(v[0], v[4]); v[1] = addv(v[1], v[5]); 
v[2] = addv(v[2], v[6]); v[3] = addv(v[3], v[7]); v[12] = xorv(v[12], v[0]); v[13] = xorv(v[13], v[1]); v[14] = xorv(v[14], v[2]); v[15] = xorv(v[15], v[3]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[15] = rot8(v[15]); v[8] = addv(v[8], v[12]); v[9] = addv(v[9], v[13]); v[10] = addv(v[10], v[14]); v[11] = addv(v[11], v[15]); v[4] = xorv(v[4], v[8]); v[5] = xorv(v[5], v[9]); v[6] = xorv(v[6], v[10]); v[7] = xorv(v[7], v[11]); v[4] = rot7(v[4]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); v[0] = addv(v[0], v[5]); v[1] = addv(v[1], v[6]); v[2] = addv(v[2], v[7]); v[3] = addv(v[3], v[4]); v[15] = xorv(v[15], v[0]); v[12] = xorv(v[12], v[1]); v[13] = xorv(v[13], v[2]); v[14] = xorv(v[14], v[3]); v[15] = rot16(v[15]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[10] = addv(v[10], v[15]); v[11] = addv(v[11], v[12]); v[8] = addv(v[8], v[13]); v[9] = addv(v[9], v[14]); v[5] = xorv(v[5], v[10]); v[6] = xorv(v[6], v[11]); v[7] = xorv(v[7], v[8]); v[4] = xorv(v[4], v[9]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[4] = rot12(v[4]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); v[0] = addv(v[0], v[5]); v[1] = addv(v[1], v[6]); v[2] = addv(v[2], v[7]); v[3] = addv(v[3], v[4]); v[15] = xorv(v[15], v[0]); v[12] = xorv(v[12], v[1]); v[13] = xorv(v[13], v[2]); v[14] = xorv(v[14], v[3]); v[15] = rot8(v[15]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[10] = addv(v[10], v[15]); v[11] = addv(v[11], v[12]); v[8] = addv(v[8], v[13]); v[9] = addv(v[9], v[14]); v[5] = xorv(v[5], v[10]); v[6] = xorv(v[6], v[11]); v[7] = xorv(v[7], v[8]); 
v[4] = xorv(v[4], v[9]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[4] = rot7(v[4]); } INLINE void transpose_vecs(__m256i vecs[DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high // is 22/33/66/77. __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is // 11/33. __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); // Interleave 128-bit lanes. 
vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); } INLINE void transpose_msg_vecs(const uint8_t *const *inputs, size_t block_offset, __m256i out[16]) { out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); for (size_t i = 0; i < 8; ++i) { _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); } transpose_vecs(&out[0]); transpose_vecs(&out[8]); } INLINE void load_counters(uint64_t counter, bool increment_counter, __m256i *out_lo, __m256i *out_hi) { const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter); const __m256i add0 = _mm256_set_epi32(7, 6, 
5, 4, 3, 2, 1, 0); const __m256i add1 = _mm256_and_si256(mask, add0); __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1); __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); __m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry); *out_lo = l; *out_hi = h; } static void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m256i h_vecs[8] = { set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), }; __m256i counter_low_vec, counter_high_vec; load_counters(counter, increment_counter, &counter_low_vec, &counter_high_vec); uint8_t block_flags = flags | flags_start; for (size_t block = 0; block < blocks; block++) { if (block + 1 == blocks) { block_flags |= flags_end; } __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); __m256i block_flags_vec = set1(block_flags); __m256i msg_vecs[16]; transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); __m256i v[16] = { h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn(v, msg_vecs, 0); round_fn(v, msg_vecs, 1); round_fn(v, msg_vecs, 2); round_fn(v, msg_vecs, 3); round_fn(v, msg_vecs, 4); round_fn(v, msg_vecs, 5); round_fn(v, msg_vecs, 6); h_vecs[0] = xorv(v[0], v[8]); h_vecs[1] = xorv(v[1], v[9]); h_vecs[2] = xorv(v[2], v[10]); h_vecs[3] = xorv(v[3], v[11]); h_vecs[4] = xorv(v[4], v[12]); h_vecs[5] = xorv(v[5], v[13]); h_vecs[6] = xorv(v[6], v[14]); h_vecs[7] = xorv(v[7], v[15]); block_flags = flags; } transpose_vecs(h_vecs); storeu(h_vecs[0], &out[0 * sizeof(__m256i)]); storeu(h_vecs[1], &out[1 * sizeof(__m256i)]); 
storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); storeu(h_vecs[3], &out[3 * sizeof(__m256i)]); storeu(h_vecs[4], &out[4 * sizeof(__m256i)]); storeu(h_vecs[5], &out[5 * sizeof(__m256i)]); storeu(h_vecs[6], &out[6 * sizeof(__m256i)]); storeu(h_vecs[7], &out[7 * sizeof(__m256i)]); } #if !defined(BLAKE3_NO_SSE41) void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #else void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #endif void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { while (num_inputs >= DEGREE) { blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += DEGREE; } inputs += DEGREE; num_inputs -= DEGREE; out = &out[DEGREE * BLAKE3_OUT_LEN]; } #if !defined(BLAKE3_NO_SSE41) blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); #else blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); #endif } blake3-1.8.1/c/blake3_avx2_x86-64_unix.S000064400000000000000000002010021046102023000154350ustar 00000000000000#if defined(__ELF__) && defined(__linux__) .section .note.GNU-stack,"",%progbits #endif #if defined(__ELF__) && defined(__CET__) && defined(__has_include) #if __has_include() #include #endif #endif #if !defined(_CET_ENDBR) #define _CET_ENDBR #endif .intel_syntax noprefix .global _blake3_hash_many_avx2 .global blake3_hash_many_avx2 
#ifdef __APPLE__ .text #else .section .text #endif .p2align 6 _blake3_hash_many_avx2: blake3_hash_many_avx2: _CET_ENDBR push r15 push r14 push r13 push r12 push rbx push rbp mov rbp, rsp sub rsp, 680 and rsp, 0xFFFFFFFFFFFFFFC0 neg r9d vmovd xmm0, r9d vpbroadcastd ymm0, xmm0 vmovdqa ymmword ptr [rsp+0x280], ymm0 vpand ymm1, ymm0, ymmword ptr [ADD0+rip] vpand ymm2, ymm0, ymmword ptr [ADD1+rip] vmovdqa ymmword ptr [rsp+0x220], ymm2 vmovd xmm2, r8d vpbroadcastd ymm2, xmm2 vpaddd ymm2, ymm2, ymm1 vmovdqa ymmword ptr [rsp+0x240], ymm2 vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] vpcmpgtd ymm2, ymm1, ymm2 shr r8, 32 vmovd xmm3, r8d vpbroadcastd ymm3, xmm3 vpsubd ymm3, ymm3, ymm2 vmovdqa ymmword ptr [rsp+0x260], ymm3 shl rdx, 6 mov qword ptr [rsp+0x2A0], rdx cmp rsi, 8 jc 3f 2: vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+0x4] vpbroadcastd ymm2, dword ptr [rcx+0x8] vpbroadcastd ymm3, dword ptr [rcx+0xC] vpbroadcastd ymm4, dword ptr [rcx+0x10] vpbroadcastd ymm5, dword ptr [rcx+0x14] vpbroadcastd ymm6, dword ptr [rcx+0x18] vpbroadcastd ymm7, dword ptr [rcx+0x1C] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x20] mov r13, qword ptr [rdi+0x28] mov r14, qword ptr [rdi+0x30] mov r15, qword ptr [rdi+0x38] movzx eax, byte ptr [rbp+0x38] movzx ebx, byte ptr [rbp+0x40] or eax, ebx xor edx, edx .p2align 5 9: movzx ebx, byte ptr [rbp+0x48] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+0x2A0] cmove eax, ebx mov dword ptr [rsp+0x200], eax vmovups xmm8, xmmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x40] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 vmovups xmm11, xmmword ptr 
[r11+rdx-0x40] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x20], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x40], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x60], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-0x30] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x30] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x80], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0xA0], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0xC0], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0xE0], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x20] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x20] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x100], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x120], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x140], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x160], ymm11 
vmovups xmm8, xmmword ptr [r8+rdx-0x10] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x10] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x180], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x1A0], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x1C0], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x1E0], ymm11 vpbroadcastd ymm15, dword ptr [rsp+0x200] prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] vpxor ymm15, ymm3, ymm15 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 
vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa 
ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor 
ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword 
ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd 
ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, 
ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 
ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, 
ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 
vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] 
vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, 
ymm2, ymmword ptr [rsp+0x100] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, 
ymmword ptr [rsp+0x1A0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 
7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, 
ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpsrld ymm8, 
ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+0x38] jne 9b mov rbx, qword ptr [rbp+0x50] vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0xCC vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0xCC vblendps ymm3, ymm12, ymm9, 0xCC vperm2f128 ymm12, ymm1, ymm2, 0x20 vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0xCC vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 0x20 vmovups ymmword ptr [rbx+0x20], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0xCC vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 0xCC vblendps ymm14, ymm14, ymm13, 0xCC vperm2f128 ymm8, ymm10, ymm14, 0x20 vmovups ymmword ptr [rbx+0x40], ymm8 vblendps ymm15, ymm13, ymm15, 0xCC vperm2f128 ymm13, ymm6, ymm15, 0x20 vmovups ymmword ptr [rbx+0x60], ymm13 vperm2f128 ymm9, ymm1, ymm2, 0x31 vperm2f128 ymm11, ymm3, ymm4, 0x31 vmovups ymmword ptr [rbx+0x80], ymm9 vperm2f128 ymm14, ymm10, ymm14, 0x31 vperm2f128 ymm15, ymm6, ymm15, 0x31 vmovups ymmword ptr [rbx+0xA0], ymm11 vmovups ymmword ptr [rbx+0xC0], ymm14 vmovups ymmword ptr [rbx+0xE0], ymm15 vmovdqa ymm0, ymmword ptr [rsp+0x220] vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] vmovdqa ymmword ptr [rsp+0x240], ymm1 vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] vpcmpgtd ymm2, ymm0, ymm2 vmovdqa ymm0, ymmword ptr [rsp+0x260] vpsubd ymm2, ymm0, ymm2 vmovdqa ymmword ptr [rsp+0x260], ymm2 add rdi, 64 add rbx, 256 mov qword ptr [rbp+0x50], rbx 
sub rsi, 8 cmp rsi, 8 jnc 2b test rsi, rsi jnz 3f 4: vzeroupper mov rsp, rbp pop rbp pop rbx pop r12 pop r13 pop r14 pop r15 ret .p2align 5 3: mov rbx, qword ptr [rbp+0x50] mov r15, qword ptr [rsp+0x2A0] movzx r13d, byte ptr [rbp+0x38] movzx r12d, byte ptr [rbp+0x48] test rsi, 0x4 je 3f vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovdqa ymm8, ymm0 vmovdqa ymm9, ymm1 vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] vpunpckldq ymm14, ymm12, ymm13 vpunpckhdq ymm15, ymm12, ymm13 vpermq ymm14, ymm14, 0x50 vpermq ymm15, ymm15, 0x50 vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] vpblendd ymm14, ymm14, ymm12, 0x44 vpblendd ymm15, ymm15, ymm12, 0x44 vmovdqa ymmword ptr [rsp], ymm14 vmovdqa ymmword ptr [rsp+0x20], ymm15 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x200], eax vmovups ymm2, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm3, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 vshufps ymm4, ymm2, ymm3, 136 vshufps ymm5, ymm2, ymm3, 221 vmovups ymm2, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 vmovups ymm3, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm2, ymm3, 136 vshufps ymm7, ymm2, ymm3, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 vmovups ymm10, ymmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 vmovups ymm11, ymmword ptr [r10+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 vshufps ymm12, ymm10, ymm11, 136 vshufps ymm13, ymm10, ymm11, 221 vmovups ymm10, ymmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr 
[r11+rdx-0x20], 0x01 vmovups ymm11, ymmword ptr [r10+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 vshufps ymm14, ymm10, ymm11, 136 vshufps ymm15, ymm10, ymm11, 221 vpshufd ymm14, ymm14, 0x93 vpshufd ymm15, ymm15, 0x93 prefetcht0 [r8+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r11+rdx+0x80] vpbroadcastd ymm2, dword ptr [rsp+0x200] vmovdqa ymm3, ymmword ptr [rsp] vmovdqa ymm11, ymmword ptr [rsp+0x20] vpblendd ymm3, ymm3, ymm2, 0x88 vpblendd ymm11, ymm11, ymm2, 0x88 vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vmovdqa ymm10, ymm2 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm8, ymm8, ymm12 vmovdqa ymmword ptr [rsp+0x40], ymm4 nop vmovdqa ymmword ptr [rsp+0x60], ymm12 nop vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm5 vpaddd ymm8, ymm8, ymm13 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vmovdqa ymmword ptr [rsp+0x80], ymm5 vmovdqa ymmword ptr [rsp+0xA0], ymm13 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 0x93 vpshufd ymm8, ymm8, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm11, ymm11, 0x4E vpshufd ymm2, ymm2, 0x39 vpshufd ymm10, ymm10, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm8, ymm8, ymm14 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, 
ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm7 vpaddd ymm8, ymm8, ymm15 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 0x39 vpshufd ymm8, ymm8, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm11, ymm11, 0x4E vpshufd ymm2, ymm2, 0x93 vpshufd ymm10, ymm10, 0x93 dec al je 9f vmovdqa ymm4, ymmword ptr [rsp+0x40] vmovdqa ymm5, ymmword ptr [rsp+0x80] vshufps ymm12, ymm4, ymm5, 214 vpshufd ymm13, ymm4, 0x0F vpshufd ymm4, ymm12, 0x39 vshufps ymm12, ymm6, ymm7, 250 vpblendd ymm13, ymm13, ymm12, 0xAA vpunpcklqdq ymm12, ymm7, ymm5 vpblendd ymm12, ymm12, ymm6, 0x88 vpshufd ymm12, ymm12, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E vmovdqa ymmword ptr [rsp+0x40], ymm13 vmovdqa ymmword ptr [rsp+0x80], ymm12 vmovdqa ymm12, ymmword ptr [rsp+0x60] vmovdqa ymm13, ymmword ptr [rsp+0xA0] vshufps ymm5, ymm12, ymm13, 214 vpshufd ymm6, ymm12, 0x0F vpshufd ymm12, ymm5, 0x39 vshufps ymm5, ymm14, ymm15, 250 vpblendd ymm6, ymm6, ymm5, 0xAA vpunpcklqdq ymm5, ymm15, ymm13 vpblendd ymm5, ymm5, ymm14, 0x88 vpshufd ymm5, ymm5, 0x78 vpunpckhdq ymm13, ymm13, ymm15 vpunpckldq ymm14, ymm14, ymm13 vpshufd ymm15, ymm14, 0x1E vmovdqa ymm13, ymm6 vmovdqa ymm14, ymm5 vmovdqa ymm5, ymmword ptr [rsp+0x40] vmovdqa ymm6, ymmword ptr [rsp+0x80] jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, 
ymm1, ymm3 vpxor ymm8, ymm8, ymm10 vpxor ymm9, ymm9, ymm11 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovdqu xmmword ptr [rbx+0x40], xmm8 vmovdqu xmmword ptr [rbx+0x50], xmm9 vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 vmovaps xmm8, xmmword ptr [rsp+0x280] vmovaps xmm0, xmmword ptr [rsp+0x240] vmovaps xmm1, xmmword ptr [rsp+0x250] vmovaps xmm2, xmmword ptr [rsp+0x260] vmovaps xmm3, xmmword ptr [rsp+0x270] vblendvps xmm0, xmm0, xmm1, xmm8 vblendvps xmm2, xmm2, xmm3, xmm8 vmovaps xmmword ptr [rsp+0x240], xmm0 vmovaps xmmword ptr [rsp+0x260], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 3: test rsi, 0x2 je 3f vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovd xmm13, dword ptr [rsp+0x240] vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovd xmm14, dword ptr [rsp+0x244] vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vinserti128 ymm13, ymm13, xmm14, 0x01 vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x200], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vpbroadcastd ymm8, dword ptr [rsp+0x200] vpblendd ymm3, ymm13, ymm8, 0x88 vmovups ymm8, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 
0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd ymm0, ymm0, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd ymm0, ymm0, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x93 dec al jz 9f vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0x0F vpshufd ymm4, ymm8, 0x39 vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 0xAA vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 0x88 vpshufd ymm8, ymm8, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovaps ymm8, ymmword ptr [rsp+0x280] vmovaps ymm0, ymmword ptr [rsp+0x240] vmovups ymm1, ymmword ptr [rsp+0x248] vmovaps ymm2, ymmword ptr [rsp+0x260] vmovups ymm3, 
ymmword ptr [rsp+0x268] vblendvps ymm0, ymm0, ymm1, ymm8 vblendvps ymm2, ymm2, ymm3, ymm8 vmovaps ymmword ptr [rsp+0x240], ymm0 vmovaps ymmword ptr [rsp+0x260], ymm2 add rbx, 64 add rdi, 16 sub rsi, 2 3: test rsi, 0x1 je 4b vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+0x10] vmovd xmm3, dword ptr [rsp+0x240] vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovdqa xmm14, xmmword ptr [ROT16+rip] vmovdqa xmm15, xmmword ptr [ROT8+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] vmovdqa xmm3, xmm13 vpinsrd xmm3, xmm3, eax, 3 vmovups xmm8, xmmword ptr [r8+rdx-0x40] vmovups xmm9, xmmword ptr [r8+rdx-0x30] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vmovups xmm9, xmmword ptr [r8+rdx-0x10] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor 
xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 jmp 4b #ifdef __APPLE__ .static_data #else .section .rodata #endif .p2align 6 ADD0: .long 0, 1, 2, 3, 4, 5, 6, 7 ADD1: .long 8, 8, 8, 8, 8, 8, 8, 8 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 ROT16: .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ROT8: .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A blake3-1.8.1/c/blake3_avx2_x86-64_windows_gnu.S000064400000000000000000002022571046102023000170320ustar 00000000000000.intel_syntax noprefix .global _blake3_hash_many_avx2 .global blake3_hash_many_avx2 .section .text .p2align 6 _blake3_hash_many_avx2: blake3_hash_many_avx2: push r15 push r14 push r13 push r12 push rsi push 
rdi push rbx push rbp mov rbp, rsp sub rsp, 880 and rsp, 0xFFFFFFFFFFFFFFC0 vmovdqa xmmword ptr [rsp+0x2D0], xmm6 vmovdqa xmmword ptr [rsp+0x2E0], xmm7 vmovdqa xmmword ptr [rsp+0x2F0], xmm8 vmovdqa xmmword ptr [rsp+0x300], xmm9 vmovdqa xmmword ptr [rsp+0x310], xmm10 vmovdqa xmmword ptr [rsp+0x320], xmm11 vmovdqa xmmword ptr [rsp+0x330], xmm12 vmovdqa xmmword ptr [rsp+0x340], xmm13 vmovdqa xmmword ptr [rsp+0x350], xmm14 vmovdqa xmmword ptr [rsp+0x360], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+0x68] movzx r9, byte ptr [rbp+0x70] neg r9d vmovd xmm0, r9d vpbroadcastd ymm0, xmm0 vmovdqa ymmword ptr [rsp+0x260], ymm0 vpand ymm1, ymm0, ymmword ptr [ADD0+rip] vpand ymm2, ymm0, ymmword ptr [ADD1+rip] vmovdqa ymmword ptr [rsp+0x2A0], ymm2 vmovd xmm2, r8d vpbroadcastd ymm2, xmm2 vpaddd ymm2, ymm2, ymm1 vmovdqa ymmword ptr [rsp+0x220], ymm2 vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] vpcmpgtd ymm2, ymm1, ymm2 shr r8, 32 vmovd xmm3, r8d vpbroadcastd ymm3, xmm3 vpsubd ymm3, ymm3, ymm2 vmovdqa ymmword ptr [rsp+0x240], ymm3 shl rdx, 6 mov qword ptr [rsp+0x2C0], rdx cmp rsi, 8 jc 3f 2: vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+0x4] vpbroadcastd ymm2, dword ptr [rcx+0x8] vpbroadcastd ymm3, dword ptr [rcx+0xC] vpbroadcastd ymm4, dword ptr [rcx+0x10] vpbroadcastd ymm5, dword ptr [rcx+0x14] vpbroadcastd ymm6, dword ptr [rcx+0x18] vpbroadcastd ymm7, dword ptr [rcx+0x1C] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x20] mov r13, qword ptr [rdi+0x28] mov r14, qword ptr [rdi+0x30] mov r15, qword ptr [rdi+0x38] movzx eax, byte ptr [rbp+0x78] movzx ebx, byte ptr [rbp+0x80] or eax, ebx xor edx, edx .p2align 5 9: movzx ebx, byte ptr [rbp+0x88] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+0x2C0] cmove eax, ebx mov dword ptr [rsp+0x200], eax vmovups xmm8, xmmword ptr [r8+rdx-0x40] 
vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x40] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x40] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x20], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x40], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x60], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-0x30] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x30] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x80], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0xA0], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0xC0], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0xE0], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x20] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x20] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 
vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x100], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x120], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x140], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x160], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-0x10] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x10] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x180], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x1A0], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x1C0], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x1E0], ymm11 vpbroadcastd ymm15, dword ptr [rsp+0x200] prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm0, ymmword ptr [rsp+0x220] vpxor ymm13, ymm1, ymmword ptr [rsp+0x240] vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] vpxor ymm15, ymm3, ymm15 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr 
[BLAKE3_IV_0+rip] vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr 
[ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor 
ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, 
ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, 
ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword 
ptr [rsp+0x160] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] 
vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, 
ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, 
ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 
vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 
vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor 
ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, 
ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 
vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, 
ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+0x78] jne 9b mov rbx, qword ptr [rbp+0x90] vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0xCC vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0xCC vblendps ymm3, ymm12, ymm9, 0xCC vperm2f128 ymm12, ymm1, ymm2, 0x20 vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0xCC vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 0x20 vmovups ymmword ptr [rbx+0x20], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0xCC vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 0xCC vblendps ymm14, ymm14, ymm13, 0xCC vperm2f128 ymm8, ymm10, ymm14, 0x20 vmovups ymmword ptr [rbx+0x40], ymm8 vblendps ymm15, ymm13, ymm15, 0xCC vperm2f128 ymm13, ymm6, ymm15, 0x20 vmovups ymmword ptr [rbx+0x60], ymm13 vperm2f128 ymm9, ymm1, ymm2, 0x31 vperm2f128 ymm11, ymm3, ymm4, 0x31 vmovups ymmword ptr [rbx+0x80], ymm9 vperm2f128 ymm14, ymm10, ymm14, 0x31 vperm2f128 ymm15, ymm6, ymm15, 0x31 vmovups ymmword ptr [rbx+0xA0], ymm11 vmovups ymmword ptr [rbx+0xC0], ymm14 vmovups ymmword ptr [rbx+0xE0], ymm15 vmovdqa ymm0, ymmword ptr [rsp+0x2A0] vpaddd ymm1, ymm0, ymmword ptr 
[rsp+0x220] vmovdqa ymmword ptr [rsp+0x220], ymm1 vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] vpcmpgtd ymm2, ymm0, ymm2 vmovdqa ymm0, ymmword ptr [rsp+0x240] vpsubd ymm2, ymm0, ymm2 vmovdqa ymmword ptr [rsp+0x240], ymm2 add rdi, 64 add rbx, 256 mov qword ptr [rbp+0x90], rbx sub rsi, 8 cmp rsi, 8 jnc 2b test rsi, rsi jnz 3f 4: vzeroupper vmovdqa xmm6, xmmword ptr [rsp+0x2D0] vmovdqa xmm7, xmmword ptr [rsp+0x2E0] vmovdqa xmm8, xmmword ptr [rsp+0x2F0] vmovdqa xmm9, xmmword ptr [rsp+0x300] vmovdqa xmm10, xmmword ptr [rsp+0x310] vmovdqa xmm11, xmmword ptr [rsp+0x320] vmovdqa xmm12, xmmword ptr [rsp+0x330] vmovdqa xmm13, xmmword ptr [rsp+0x340] vmovdqa xmm14, xmmword ptr [rsp+0x350] vmovdqa xmm15, xmmword ptr [rsp+0x360] mov rsp, rbp pop rbp pop rbx pop rdi pop rsi pop r12 pop r13 pop r14 pop r15 ret .p2align 5 3: mov rbx, qword ptr [rbp+0x90] mov r15, qword ptr [rsp+0x2C0] movzx r13d, byte ptr [rbp+0x78] movzx r12d, byte ptr [rbp+0x88] test rsi, 0x4 je 3f vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovdqa ymm8, ymm0 vmovdqa ymm9, ymm1 vbroadcasti128 ymm12, xmmword ptr [rsp+0x220] vbroadcasti128 ymm13, xmmword ptr [rsp+0x240] vpunpckldq ymm14, ymm12, ymm13 vpunpckhdq ymm15, ymm12, ymm13 vpermq ymm14, ymm14, 0x50 vpermq ymm15, ymm15, 0x50 vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] vpblendd ymm14, ymm14, ymm12, 0x44 vpblendd ymm15, ymm15, ymm12, 0x44 vmovdqa ymmword ptr [rsp], ymm14 vmovdqa ymmword ptr [rsp+0x20], ymm15 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x200], eax vmovups ymm2, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm3, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm3, ymm3, xmmword ptr 
[r9+rdx-0x30], 0x01 vshufps ymm4, ymm2, ymm3, 136 vshufps ymm5, ymm2, ymm3, 221 vmovups ymm2, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 vmovups ymm3, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm2, ymm3, 136 vshufps ymm7, ymm2, ymm3, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 vmovups ymm10, ymmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 vmovups ymm11, ymmword ptr [r10+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 vshufps ymm12, ymm10, ymm11, 136 vshufps ymm13, ymm10, ymm11, 221 vmovups ymm10, ymmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 vmovups ymm11, ymmword ptr [r10+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 vshufps ymm14, ymm10, ymm11, 136 vshufps ymm15, ymm10, ymm11, 221 vpshufd ymm14, ymm14, 0x93 vpshufd ymm15, ymm15, 0x93 vpbroadcastd ymm2, dword ptr [rsp+0x200] vmovdqa ymm3, ymmword ptr [rsp] vmovdqa ymm11, ymmword ptr [rsp+0x20] vpblendd ymm3, ymm3, ymm2, 0x88 vpblendd ymm11, ymm11, ymm2, 0x88 vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vmovdqa ymm10, ymm2 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm8, ymm8, ymm12 vmovdqa ymmword ptr [rsp+0x40], ymm4 nop vmovdqa ymmword ptr [rsp+0x60], ymm12 nop vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm5 vpaddd ymm8, ymm8, ymm13 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vmovdqa ymmword ptr [rsp+0x80], ymm5 vmovdqa ymmword ptr [rsp+0xA0], ymm13 vpxor ymm3, ymm3, ymm0 vpxor ymm11, 
ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 0x93 vpshufd ymm8, ymm8, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm11, ymm11, 0x4E vpshufd ymm2, ymm2, 0x39 vpshufd ymm10, ymm10, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm8, ymm8, ymm14 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm7 vpaddd ymm8, ymm8, ymm15 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 0x39 vpshufd ymm8, ymm8, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm11, ymm11, 0x4E vpshufd ymm2, ymm2, 0x93 vpshufd ymm10, ymm10, 0x93 dec al je 9f vmovdqa ymm4, ymmword ptr [rsp+0x40] vmovdqa ymm5, ymmword ptr [rsp+0x80] vshufps ymm12, ymm4, ymm5, 214 vpshufd ymm13, ymm4, 0x0F vpshufd ymm4, ymm12, 0x39 vshufps ymm12, ymm6, ymm7, 250 vpblendd ymm13, ymm13, ymm12, 0xAA vpunpcklqdq ymm12, ymm7, ymm5 vpblendd ymm12, ymm12, ymm6, 0x88 vpshufd ymm12, ymm12, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E 
vmovdqa ymmword ptr [rsp+0x40], ymm13 vmovdqa ymmword ptr [rsp+0x80], ymm12 vmovdqa ymm12, ymmword ptr [rsp+0x60] vmovdqa ymm13, ymmword ptr [rsp+0xA0] vshufps ymm5, ymm12, ymm13, 214 vpshufd ymm6, ymm12, 0x0F vpshufd ymm12, ymm5, 0x39 vshufps ymm5, ymm14, ymm15, 250 vpblendd ymm6, ymm6, ymm5, 0xAA vpunpcklqdq ymm5, ymm15, ymm13 vpblendd ymm5, ymm5, ymm14, 0x88 vpshufd ymm5, ymm5, 0x78 vpunpckhdq ymm13, ymm13, ymm15 vpunpckldq ymm14, ymm14, ymm13 vpshufd ymm15, ymm14, 0x1E vmovdqa ymm13, ymm6 vmovdqa ymm14, ymm5 vmovdqa ymm5, ymmword ptr [rsp+0x40] vmovdqa ymm6, ymmword ptr [rsp+0x80] jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 vpxor ymm8, ymm8, ymm10 vpxor ymm9, ymm9, ymm11 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovdqu xmmword ptr [rbx+0x40], xmm8 vmovdqu xmmword ptr [rbx+0x50], xmm9 vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 vmovaps xmm8, xmmword ptr [rsp+0x260] vmovaps xmm0, xmmword ptr [rsp+0x220] vmovaps xmm1, xmmword ptr [rsp+0x230] vmovaps xmm2, xmmword ptr [rsp+0x240] vmovaps xmm3, xmmword ptr [rsp+0x250] vblendvps xmm0, xmm0, xmm1, xmm8 vblendvps xmm2, xmm2, xmm3, xmm8 vmovaps xmmword ptr [rsp+0x220], xmm0 vmovaps xmmword ptr [rsp+0x240], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 3: test rsi, 0x2 je 3f vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovd xmm13, dword ptr [rsp+0x220] vpinsrd xmm13, xmm13, dword ptr [rsp+0x240], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovd xmm14, dword ptr [rsp+0x224] vpinsrd xmm14, xmm14, dword ptr [rsp+0x244], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vinserti128 ymm13, ymm13, xmm14, 0x01 vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] 
movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x200], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vpbroadcastd ymm8, dword ptr [rsp+0x200] vpblendd ymm3, ymm13, ymm8, 0x88 vmovups ymm8, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd ymm0, ymm0, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd ymm0, ymm0, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x93 dec al jz 9f vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0x0F vpshufd ymm4, ymm8, 0x39 vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 
0xAA vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 0x88 vpshufd ymm8, ymm8, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovaps ymm8, ymmword ptr [rsp+0x260] vmovaps ymm0, ymmword ptr [rsp+0x220] vmovups ymm1, ymmword ptr [rsp+0x228] vmovaps ymm2, ymmword ptr [rsp+0x240] vmovups ymm3, ymmword ptr [rsp+0x248] vblendvps ymm0, ymm0, ymm1, ymm8 vblendvps ymm2, ymm2, ymm3, ymm8 vmovaps ymmword ptr [rsp+0x220], ymm0 vmovaps ymmword ptr [rsp+0x240], ymm2 add rbx, 64 add rdi, 16 sub rsi, 2 3: test rsi, 0x1 je 4b vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+0x10] vmovd xmm3, dword ptr [rsp+0x220] vpinsrd xmm3, xmm3, dword ptr [rsp+0x240], 1 vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovdqa xmm14, xmmword ptr [ROT16+rip] vmovdqa xmm15, xmmword ptr [ROT8+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] vmovdqa xmm3, xmm13 vpinsrd xmm3, xmm3, eax, 3 vmovups xmm8, xmmword ptr [r8+rdx-0x40] vmovups xmm9, xmmword ptr [r8+rdx-0x30] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vmovups xmm9, xmmword ptr [r8+rdx-0x10] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxor 
xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 jmp 4b .section .rdata .p2align 6 ADD0: .long 0, 1, 2, 3, 4, 5, 6, 7 ADD1: .long 8, 8, 8, 8, 8, 8, 8, 8 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 ROT16: .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 
14, 15, 12, 13 ROT8: .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A blake3-1.8.1/c/blake3_avx2_x86-64_windows_msvc.asm000064400000000000000000002006341046102023000175640ustar 00000000000000public _blake3_hash_many_avx2 public blake3_hash_many_avx2 _TEXT SEGMENT ALIGN(16) 'CODE' ALIGN 16 blake3_hash_many_avx2 PROC _blake3_hash_many_avx2 PROC push r15 push r14 push r13 push r12 push rsi push rdi push rbx push rbp mov rbp, rsp sub rsp, 880 and rsp, 0FFFFFFFFFFFFFFC0H vmovdqa xmmword ptr [rsp+2D0H], xmm6 vmovdqa xmmword ptr [rsp+2E0H], xmm7 vmovdqa xmmword ptr [rsp+2F0H], xmm8 vmovdqa xmmword ptr [rsp+300H], xmm9 vmovdqa xmmword ptr [rsp+310H], xmm10 vmovdqa xmmword ptr [rsp+320H], xmm11 vmovdqa xmmword ptr [rsp+330H], xmm12 vmovdqa xmmword ptr [rsp+340H], xmm13 vmovdqa xmmword ptr [rsp+350H], xmm14 vmovdqa xmmword ptr [rsp+360H], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+68H] movzx r9, byte ptr [rbp+70H] neg r9d vmovd xmm0, r9d vpbroadcastd ymm0, xmm0 vmovdqa ymmword ptr [rsp+260H], ymm0 vpand ymm1, ymm0, ymmword ptr [ADD0] vpand ymm2, ymm0, ymmword ptr [ADD1] vmovdqa ymmword ptr [rsp+2A0H], ymm2 vmovd xmm2, r8d vpbroadcastd ymm2, xmm2 vpaddd ymm2, ymm2, ymm1 vmovdqa ymmword ptr [rsp+220H], ymm2 vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK] vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK] vpcmpgtd ymm2, ymm1, ymm2 shr r8, 32 vmovd xmm3, r8d vpbroadcastd ymm3, xmm3 vpsubd ymm3, ymm3, ymm2 vmovdqa ymmword ptr [rsp+240H], ymm3 shl rdx, 6 mov qword ptr [rsp+2C0H], rdx cmp rsi, 8 jc final7blocks outerloop8: vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+4H] vpbroadcastd ymm2, dword ptr [rcx+8H] vpbroadcastd ymm3, dword ptr [rcx+0CH] vpbroadcastd ymm4, dword ptr [rcx+10H] vpbroadcastd ymm5, dword ptr [rcx+14H] vpbroadcastd ymm6, 
dword ptr [rcx+18H] vpbroadcastd ymm7, dword ptr [rcx+1CH] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] mov r12, qword ptr [rdi+20H] mov r13, qword ptr [rdi+28H] mov r14, qword ptr [rdi+30H] mov r15, qword ptr [rdi+38H] movzx eax, byte ptr [rbp+78H] movzx ebx, byte ptr [rbp+80H] or eax, ebx xor edx, edx ALIGN 16 innerloop8: movzx ebx, byte ptr [rbp+88H] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+2C0H] cmove eax, ebx mov dword ptr [rsp+200H], eax vmovups xmm8, xmmword ptr [r8+rdx-40H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H vmovups xmm9, xmmword ptr [r9+rdx-40H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-40H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H vmovups xmm11, xmmword ptr [r11+rdx-40H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+20H], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+40H], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+60H], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-30H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H vmovups xmm9, xmmword ptr [r9+rdx-30H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-30H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H vmovups xmm11, xmmword ptr [r11+rdx-30H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+80H], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0A0H], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0C0H], ymm10 
vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0E0H], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-20H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H vmovups xmm9, xmmword ptr [r9+rdx-20H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-20H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H vmovups xmm11, xmmword ptr [r11+rdx-20H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+100H], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+120H], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+140H], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+160H], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-10H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H vmovups xmm9, xmmword ptr [r9+rdx-10H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-10H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H vmovups xmm11, xmmword ptr [r11+rdx-10H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+180H], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+1A0H], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+1C0H], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+1E0H], ymm11 vpbroadcastd ymm15, dword ptr [rsp+200H] prefetcht0 byte ptr [r8+rdx+80H] prefetcht0 byte ptr [r12+rdx+80H] prefetcht0 byte ptr [r9+rdx+80H] prefetcht0 byte ptr [r13+rdx+80H] prefetcht0 byte ptr [r10+rdx+80H] prefetcht0 byte ptr [r14+rdx+80H] prefetcht0 byte ptr [r11+rdx+80H] prefetcht0 byte ptr [r15+rdx+80H] vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, 
ymmword ptr [rsp+40H] vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm0, ymmword ptr [rsp+220H] vpxor ymm13, ymm1, ymmword ptr [rsp+240H] vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN] vpxor ymm15, ymm3, ymm15 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0] vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1] vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2] vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3] vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, 
ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+100H] vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 
vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] vpaddd ymm2, ymm2, ymmword ptr [rsp+0E0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, 
ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword 
ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, 
ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] vpaddd ymm2, ymm2, ymmword ptr [rsp+160H] vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0A0H] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor 
ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 
vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, 
ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+180H] vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] vpaddd ymm2, ymm2, ymmword ptr [rsp+140H] vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, 
ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, 
ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 
vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] vpaddd ymm2, ymm2, ymmword ptr [rsp+0C0H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 
vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+1E0H] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 
vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+200H], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, 
ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+78H] jne innerloop8 mov rbx, qword ptr [rbp+90H] vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0CCH vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0CCH vblendps ymm3, ymm12, ymm9, 0CCH vperm2f128 ymm12, ymm1, ymm2, 20H vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0CCH vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 20H vmovups ymmword ptr [rbx+20H], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0CCH vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 0CCH vblendps ymm14, ymm14, ymm13, 0CCH vperm2f128 ymm8, ymm10, ymm14, 20H vmovups ymmword ptr [rbx+40H], ymm8 vblendps ymm15, ymm13, ymm15, 0CCH vperm2f128 ymm13, ymm6, ymm15, 20H vmovups ymmword ptr [rbx+60H], ymm13 vperm2f128 ymm9, ymm1, ymm2, 31H vperm2f128 ymm11, ymm3, ymm4, 31H 
vmovups ymmword ptr [rbx+80H], ymm9 vperm2f128 ymm14, ymm10, ymm14, 31H vperm2f128 ymm15, ymm6, ymm15, 31H vmovups ymmword ptr [rbx+0A0H], ymm11 vmovups ymmword ptr [rbx+0C0H], ymm14 vmovups ymmword ptr [rbx+0E0H], ymm15 vmovdqa ymm0, ymmword ptr [rsp+2A0H] vpaddd ymm1, ymm0, ymmword ptr [rsp+220H] vmovdqa ymmword ptr [rsp+220H], ymm1 vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK] vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK] vpcmpgtd ymm2, ymm0, ymm2 vmovdqa ymm0, ymmword ptr [rsp+240H] vpsubd ymm2, ymm0, ymm2 vmovdqa ymmword ptr [rsp+240H], ymm2 add rdi, 64 add rbx, 256 mov qword ptr [rbp+90H], rbx sub rsi, 8 cmp rsi, 8 jnc outerloop8 test rsi, rsi jnz final7blocks unwind: vzeroupper vmovdqa xmm6, xmmword ptr [rsp+2D0H] vmovdqa xmm7, xmmword ptr [rsp+2E0H] vmovdqa xmm8, xmmword ptr [rsp+2F0H] vmovdqa xmm9, xmmword ptr [rsp+300H] vmovdqa xmm10, xmmword ptr [rsp+310H] vmovdqa xmm11, xmmword ptr [rsp+320H] vmovdqa xmm12, xmmword ptr [rsp+330H] vmovdqa xmm13, xmmword ptr [rsp+340H] vmovdqa xmm14, xmmword ptr [rsp+350H] vmovdqa xmm15, xmmword ptr [rsp+360H] mov rsp, rbp pop rbp pop rbx pop rdi pop rsi pop r12 pop r13 pop r14 pop r15 ret ALIGN 16 final7blocks: mov rbx, qword ptr [rbp+90H] mov r15, qword ptr [rsp+2C0H] movzx r13d, byte ptr [rbp+78H] movzx r12d, byte ptr [rbp+88H] test rsi, 4H je final3blocks vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+10H] vmovdqa ymm8, ymm0 vmovdqa ymm9, ymm1 vbroadcasti128 ymm12, xmmword ptr [rsp+220H] vbroadcasti128 ymm13, xmmword ptr [rsp+240H] vpunpckldq ymm14, ymm12, ymm13 vpunpckhdq ymm15, ymm12, ymm13 vpermq ymm14, ymm14, 50H vpermq ymm15, ymm15, 50H vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN] vpblendd ymm14, ymm14, ymm12, 44H vpblendd ymm15, ymm15, ymm12, 44H vmovdqa ymmword ptr [rsp], ymm14 vmovdqa ymmword ptr [rsp+20H], ymm15 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx 
ALIGN 16 innerloop4: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+200H], eax vmovups ymm2, ymmword ptr [r8+rdx-40H] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-40H], 01H vmovups ymm3, ymmword ptr [r8+rdx-30H] vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-30H], 01H vshufps ymm4, ymm2, ymm3, 136 vshufps ymm5, ymm2, ymm3, 221 vmovups ymm2, ymmword ptr [r8+rdx-20H] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-20H], 01H vmovups ymm3, ymmword ptr [r8+rdx-10H] vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-10H], 01H vshufps ymm6, ymm2, ymm3, 136 vshufps ymm7, ymm2, ymm3, 221 vpshufd ymm6, ymm6, 93H vpshufd ymm7, ymm7, 93H vmovups ymm10, ymmword ptr [r10+rdx-40H] vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-40H], 01H vmovups ymm11, ymmword ptr [r10+rdx-30H] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-30H], 01H vshufps ymm12, ymm10, ymm11, 136 vshufps ymm13, ymm10, ymm11, 221 vmovups ymm10, ymmword ptr [r10+rdx-20H] vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-20H], 01H vmovups ymm11, ymmword ptr [r10+rdx-10H] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-10H], 01H vshufps ymm14, ymm10, ymm11, 136 vshufps ymm15, ymm10, ymm11, 221 vpshufd ymm14, ymm14, 93H vpshufd ymm15, ymm15, 93H vpbroadcastd ymm2, dword ptr [rsp+200H] vmovdqa ymm3, ymmword ptr [rsp] vmovdqa ymm11, ymmword ptr [rsp+20H] vpblendd ymm3, ymm3, ymm2, 88H vpblendd ymm11, ymm11, ymm2, 88H vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] vmovdqa ymm10, ymm2 mov al, 7 roundloop4: vpaddd ymm0, ymm0, ymm4 vpaddd ymm8, ymm8, ymm12 vmovdqa ymmword ptr [rsp+40H], ymm4 nop vmovdqa ymmword ptr [rsp+60H], ymm12 nop vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 
12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm5 vpaddd ymm8, ymm8, ymm13 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vmovdqa ymmword ptr [rsp+80H], ymm5 vmovdqa ymmword ptr [rsp+0A0H], ymm13 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 93H vpshufd ymm8, ymm8, 93H vpshufd ymm3, ymm3, 4EH vpshufd ymm11, ymm11, 4EH vpshufd ymm2, ymm2, 39H vpshufd ymm10, ymm10, 39H vpaddd ymm0, ymm0, ymm6 vpaddd ymm8, ymm8, ymm14 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm7 vpaddd ymm8, ymm8, ymm15 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 39H vpshufd ymm8, ymm8, 39H vpshufd ymm3, ymm3, 4EH vpshufd ymm11, ymm11, 4EH vpshufd ymm2, ymm2, 93H vpshufd ymm10, ymm10, 93H dec al je endroundloop4 vmovdqa ymm4, ymmword ptr [rsp+40H] vmovdqa ymm5, ymmword ptr [rsp+80H] vshufps ymm12, ymm4, ymm5, 214 vpshufd ymm13, ymm4, 0FH vpshufd ymm4, ymm12, 39H vshufps 
ymm12, ymm6, ymm7, 250 vpblendd ymm13, ymm13, ymm12, 0AAH vpunpcklqdq ymm12, ymm7, ymm5 vpblendd ymm12, ymm12, ymm6, 88H vpshufd ymm12, ymm12, 78H vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 1EH vmovdqa ymmword ptr [rsp+40H], ymm13 vmovdqa ymmword ptr [rsp+80H], ymm12 vmovdqa ymm12, ymmword ptr [rsp+60H] vmovdqa ymm13, ymmword ptr [rsp+0A0H] vshufps ymm5, ymm12, ymm13, 214 vpshufd ymm6, ymm12, 0FH vpshufd ymm12, ymm5, 39H vshufps ymm5, ymm14, ymm15, 250 vpblendd ymm6, ymm6, ymm5, 0AAH vpunpcklqdq ymm5, ymm15, ymm13 vpblendd ymm5, ymm5, ymm14, 88H vpshufd ymm5, ymm5, 78H vpunpckhdq ymm13, ymm13, ymm15 vpunpckldq ymm14, ymm14, ymm13 vpshufd ymm15, ymm14, 1EH vmovdqa ymm13, ymm6 vmovdqa ymm14, ymm5 vmovdqa ymm5, ymmword ptr [rsp+40H] vmovdqa ymm6, ymmword ptr [rsp+80H] jmp roundloop4 endroundloop4: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 vpxor ymm8, ymm8, ymm10 vpxor ymm9, ymm9, ymm11 mov eax, r13d cmp rdx, r15 jne innerloop4 vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+10H], xmm1 vextracti128 xmmword ptr [rbx+20H], ymm0, 01H vextracti128 xmmword ptr [rbx+30H], ymm1, 01H vmovdqu xmmword ptr [rbx+40H], xmm8 vmovdqu xmmword ptr [rbx+50H], xmm9 vextracti128 xmmword ptr [rbx+60H], ymm8, 01H vextracti128 xmmword ptr [rbx+70H], ymm9, 01H vmovaps xmm8, xmmword ptr [rsp+260H] vmovaps xmm0, xmmword ptr [rsp+220H] vmovaps xmm1, xmmword ptr [rsp+230H] vmovaps xmm2, xmmword ptr [rsp+240H] vmovaps xmm3, xmmword ptr [rsp+250H] vblendvps xmm0, xmm0, xmm1, xmm8 vblendvps xmm2, xmm2, xmm3, xmm8 vmovaps xmmword ptr [rsp+220H], xmm0 vmovaps xmmword ptr [rsp+240H], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 final3blocks: test rsi, 2H je final1blocks vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+10H] vmovd xmm13, dword ptr [rsp+220H] vpinsrd xmm13, xmm13, dword ptr [rsp+240H], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 vmovd xmm14, dword ptr [rsp+224H] vpinsrd xmm14, xmm14, dword ptr [rsp+244H], 1 
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 vinserti128 ymm13, ymm13, xmm14, 01H vbroadcasti128 ymm14, xmmword ptr [ROT16] vbroadcasti128 ymm15, xmmword ptr [ROT8] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx ALIGN 16 innerloop2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+200H], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] vpbroadcastd ymm8, dword ptr [rsp+200H] vpblendd ymm3, ymm13, ymm8, 88H vmovups ymm8, ymmword ptr [r8+rdx-40H] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H vmovups ymm9, ymmword ptr [r8+rdx-30H] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-20H] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H vmovups ymm9, ymmword ptr [r8+rdx-10H] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 93H vpshufd ymm7, ymm7, 93H mov al, 7 roundloop2: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd ymm0, ymm0, 93H vpshufd ymm3, ymm3, 4EH vpshufd ymm2, ymm2, 39H vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd 
ymm0, ymm0, 39H vpshufd ymm3, ymm3, 4EH vpshufd ymm2, ymm2, 93H dec al jz endroundloop2 vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0FH vpshufd ymm4, ymm8, 39H vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 0AAH vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 88H vpshufd ymm8, ymm8, 78H vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 1EH vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp roundloop2 endroundloop2: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne innerloop2 vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+10H], xmm1 vextracti128 xmmword ptr [rbx+20H], ymm0, 01H vextracti128 xmmword ptr [rbx+30H], ymm1, 01H vmovaps ymm8, ymmword ptr [rsp+260H] vmovaps ymm0, ymmword ptr [rsp+220H] vmovups ymm1, ymmword ptr [rsp+228H] vmovaps ymm2, ymmword ptr [rsp+240H] vmovups ymm3, ymmword ptr [rsp+248H] vblendvps ymm0, ymm0, ymm1, ymm8 vblendvps ymm2, ymm2, ymm3, ymm8 vmovaps ymmword ptr [rsp+220H], ymm0 vmovaps ymmword ptr [rsp+240H], ymm2 add rbx, 64 add rdi, 16 sub rsi, 2 final1blocks: test rsi, 1H je unwind vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+10H] vmovd xmm3, dword ptr [rsp+220H] vpinsrd xmm3, xmm3, dword ptr [rsp+240H], 1 vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN], 2 vmovdqa xmm14, xmmword ptr [ROT16] vmovdqa xmm15, xmmword ptr [ROT8] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx ALIGN 16 innerloop1: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vmovdqa xmm2, xmmword ptr [BLAKE3_IV] vmovdqa xmm3, xmm13 vpinsrd xmm3, xmm3, eax, 3 vmovups xmm8, xmmword ptr [r8+rdx-40H] vmovups xmm9, xmmword ptr [r8+rdx-30H] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-20H] vmovups xmm9, xmmword ptr [r8+rdx-10H] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 93H vpshufd xmm7, xmm7, 93H mov al, 7 roundloop1: vpaddd xmm0, xmm0, xmm4 
vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 93H vpshufd xmm3, xmm3, 4EH vpshufd xmm2, xmm2, 39H vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 39H vpshufd xmm3, xmm3, 4EH vpshufd xmm2, xmm2, 93H dec al jz endroundloop1 vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0FH vpshufd xmm4, xmm8, 39H vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0AAH vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 88H vpshufd xmm8, xmm8, 78H vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 1EH vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp roundloop1 endroundloop1: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne innerloop1 vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+10H], xmm1 jmp unwind _blake3_hash_many_avx2 ENDP blake3_hash_many_avx2 ENDP _TEXT ENDS _RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ALIGN 64 ADD0: dd 0, 1, 2, 3, 4, 5, 6, 7 ADD1: dd 8 dup (8) BLAKE3_IV_0: dd 8 dup (6A09E667H) BLAKE3_IV_1: dd 8 dup (0BB67AE85H) BLAKE3_IV_2: dd 8 dup (3C6EF372H) BLAKE3_IV_3: dd 8 dup (0A54FF53AH) BLAKE3_BLOCK_LEN: dd 8 dup (64) ROT16: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ROT8: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 
CMP_MSB_MASK: dd 8 dup(80000000H) BLAKE3_IV: dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH _RDATA ENDS END blake3-1.8.1/c/blake3_avx512.c000064400000000000000000001525001046102023000136340ustar 00000000000000#include "blake3_impl.h" #include #define _mm_shuffle_ps2(a, b, c) \ (_mm_castps_si128( \ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) INLINE __m128i loadu_128(const uint8_t src[16]) { return _mm_loadu_si128((void*)src); } INLINE __m256i loadu_256(const uint8_t src[32]) { return _mm256_loadu_si256((void*)src); } INLINE __m512i loadu_512(const uint8_t src[64]) { return _mm512_loadu_si512((void*)src); } INLINE void storeu_128(__m128i src, uint8_t dest[16]) { _mm_storeu_si128((void*)dest, src); } INLINE void storeu_256(__m256i src, uint8_t dest[16]) { _mm256_storeu_si256((void*)dest, src); } INLINE void storeu_512(__m512i src, uint8_t dest[16]) { _mm512_storeu_si512((void*)dest, src); } INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); } INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } INLINE __m128i 
rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); }

/* 32-bit right-rotates at each vector width, built on the AVX-512 VPRORD
 * instruction. BLAKE3's G function uses the rotation constants 16, 12, 8, 7;
 * rot16_* is defined earlier in this file. */
INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); }

INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); }

INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); }

INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); }

INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); }

INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); }

INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); }

INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); }

/*
 * ----------------------------------------------------------------------------
 * compress_avx512
 * ----------------------------------------------------------------------------
 */

// First half of the G mixing function, applied to all four columns (or
// diagonals) at once. `m` carries one message word per 32-bit lane. Rotations
// used here are 16 and 12.
INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = add_128(add_128(*row0, m), *row1);
  *row3 = xor_128(*row3, *row0);
  *row3 = rot16_128(*row3);
  *row2 = add_128(*row2, *row3);
  *row1 = xor_128(*row1, *row2);
  *row1 = rot12_128(*row1);
}

// Second half of the G mixing function; rotations 8 and 7.
INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = add_128(add_128(*row0, m), *row1);
  *row3 = xor_128(*row3, *row0);
  *row3 = rot8_128(*row3);
  *row2 = add_128(*row2, *row3);
  *row1 = xor_128(*row1, *row2);
  *row1 = rot7_128(*row1);
}

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}

// Inverse of diagonalize: rotates the rows back into column orientation.
INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}

// Runs all 7 BLAKE3 rounds over a single block, leaving the final 16-word
// state in `rows` (4 words per vector). Callers finish the compression by
// XOR-ing rows together (see the two blake3_compress_* functions below).
// Instead of indexing a message schedule table, each round shuffles the
// previous round's message vectors in registers.
INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags) {
  rows[0] = loadu_128((uint8_t *)&cv[0]);
  rows[1] = loadu_128((uint8_t *)&cv[4]);
  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
  rows[3] = set4(counter_low(counter), counter_high(counter),
                 (uint32_t)block_len, (uint32_t)flags);

  __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]);
  __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]);
  __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]);
  __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]);

  __m128i t0, t1, t2, t3, tt;

  // Round 1. The first round permutes the message words from the original
  // input order, into the groups that get mixed in parallel.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 2. This round and all following rounds apply a fixed permutation
  // to the message words from the round before.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 3
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 4
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 5
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 6
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 7. The last round skips the message permutation, since its output
  // would never be used.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
}

// Full 64-byte (XOF) compression output: the low 32 bytes are the new chaining
// value, and the high 32 bytes additionally feed the previous CV back in.
void blake3_compress_xof_avx512(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[64]) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu_128(xor_128(rows[0], rows[2]), &out[0]);
  storeu_128(xor_128(rows[1], rows[3]), &out[16]);
  storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]);
  storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]);
}

// Truncated compression: updates `cv` in place with the first 8 output words.
void blake3_compress_in_place_avx512(uint32_t cv[8],
                                     const uint8_t block[BLAKE3_BLOCK_LEN],
                                     uint8_t block_len, uint64_t counter,
                                     uint8_t flags) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]);
  storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]);
}

/*
 * ----------------------------------------------------------------------------
 * hash4_avx512
 * ----------------------------------------------------------------------------
 */

// One round of the compression function for 4 inputs at once, in transposed
// ("vertical") form: each __m128i holds one state word from each of the 4
// lanes. Message words are selected via the MSG_SCHEDULE table for round `r`.
// The first half mixes columns, the second half mixes diagonals.
INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[15] = rot16_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot12_128(v[4]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[15] = rot8_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot7_128(v[4]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);
  // Diagonal half of the round: note the rotated operand pairings.
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot16_128(v[15]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[10] = add_128(v[10], v[15]);
  v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  v[4] = rot12_128(v[4]);
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot8_128(v[15]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[10] = add_128(v[10], v[15]);
  v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);
  v[4] = rot7_128(v[4]);
}

// 4x4 transpose of 32-bit words across four 128-bit vectors.
INLINE void transpose_vecs_128(__m128i vecs[4]) {
  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

  // Interleave 64-bit lanes.
  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

  vecs[0] = abcd_0;
  vecs[1] = abcd_1;
  vecs[2] = abcd_2;
  vecs[3] = abcd_3;
}

// Loads one 64-byte block from each of 4 inputs and transposes the 16 message
// words into vertical form (out[w] holds word w from each input). Also
// prefetches ahead of the current block.
INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
                                size_t block_offset, __m128i out[16]) {
  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
  for (size_t i = 0; i < 4; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs_128(&out[0]);
  transpose_vecs_128(&out[4]);
  transpose_vecs_128(&out[8]);
  transpose_vecs_128(&out[12]);
}

// Produces the low and high 32-bit halves of 4 consecutive 64-bit counters
// (counter+0..counter+3), or 4 copies of `counter` when increment_counter is
// false (the deltas are masked off).
INLINE void load_counters4(uint64_t counter, bool increment_counter,
                           __m128i *out_lo, __m128i *out_hi) {
  uint64_t mask = (increment_counter ? ~0 : 0);
  __m256i mask_vec = _mm256_set1_epi64x(mask);
  __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3);
  deltas = _mm256_and_si256(mask_vec, deltas);
  __m256i counters =
      _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas);
  *out_lo = _mm256_cvtepi64_epi32(counters);
  *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
}

// Hashes 4 equal-length inputs in parallel (4 lanes of 128-bit vectors),
// writing 4 concatenated 32-byte chaining values to `out`. flags_start /
// flags_end are OR-ed into the first / last block's flags respectively.
static void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
                                const uint32_t key[8], uint64_t counter,
                                bool increment_counter, uint8_t flags,
                                uint8_t flags_start, uint8_t flags_end,
                                uint8_t *out) {
  __m128i h_vecs[8] = {
      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
  };
  __m128i counter_low_vec, counter_high_vec;
  load_counters4(counter, increment_counter, &counter_low_vec,
                 &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
    __m128i block_flags_vec = set1_128(block_flags);
    __m128i msg_vecs[16];
    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    __m128i v[16] = {
        h_vecs[0],       h_vecs[1],        h_vecs[2],     h_vecs[3],
        h_vecs[4],       h_vecs[5],        h_vecs[6],     h_vecs[7],
        set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    round_fn4(v, msg_vecs, 0);
    round_fn4(v, msg_vecs, 1);
    round_fn4(v, msg_vecs, 2);
    round_fn4(v, msg_vecs, 3);
    round_fn4(v, msg_vecs, 4);
    round_fn4(v, msg_vecs, 5);
    round_fn4(v, msg_vecs, 6);
    h_vecs[0] = xor_128(v[0], v[8]);
    h_vecs[1] = xor_128(v[1], v[9]);
    h_vecs[2] = xor_128(v[2], v[10]);
    h_vecs[3] = xor_128(v[3], v[11]);
    h_vecs[4] = xor_128(v[4], v[12]);
    h_vecs[5] = xor_128(v[5], v[13]);
    h_vecs[6] = xor_128(v[6], v[14]);
    h_vecs[7] = xor_128(v[7], v[15]);

    block_flags = flags;
  }

  transpose_vecs_128(&h_vecs[0]);
  transpose_vecs_128(&h_vecs[4]);
  // The first four vecs now contain the first half of each output, and the
  // second four vecs contain the second half of each output.
  storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]);
  storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]);
  storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]);
  storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]);
  storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]);
  storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]);
  storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]);
  storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
}

// Expands 4 consecutive 64-byte XOF output blocks (counters counter+0..+3)
// for one chaining value and message block, writing 256 bytes to `out`.
static void blake3_xof4_avx512(const uint32_t cv[8],
                               const uint8_t block[BLAKE3_BLOCK_LEN],
                               uint8_t block_len, uint64_t counter,
                               uint8_t flags, uint8_t out[4 * 64]) {
  __m128i h_vecs[8] = {
      set1_128(cv[0]), set1_128(cv[1]), set1_128(cv[2]), set1_128(cv[3]),
      set1_128(cv[4]), set1_128(cv[5]), set1_128(cv[6]), set1_128(cv[7]),
  };
  uint32_t block_words[16];
  load_block_words(block, block_words);
  __m128i msg_vecs[16];
  for (size_t i = 0; i < 16; i++) {
    msg_vecs[i] = set1_128(block_words[i]);
  }
  __m128i counter_low_vec, counter_high_vec;
  load_counters4(counter, true, &counter_low_vec, &counter_high_vec);
  __m128i block_len_vec = set1_128(block_len);
  __m128i block_flags_vec = set1_128(flags);
  __m128i v[16] = {
      h_vecs[0],       h_vecs[1],        h_vecs[2],     h_vecs[3],
      h_vecs[4],       h_vecs[5],        h_vecs[6],     h_vecs[7],
      set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
      counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
  };
  round_fn4(v, msg_vecs, 0);
  round_fn4(v, msg_vecs, 1);
  round_fn4(v, msg_vecs, 2);
  round_fn4(v, msg_vecs, 3);
  round_fn4(v, msg_vecs, 4);
  round_fn4(v, msg_vecs, 5);
  round_fn4(v, msg_vecs, 6);
  // Full 64-byte outputs: low half v[i]^v[i+8], high half v[i+8]^cv[i].
  for (size_t i = 0; i < 8; i++) {
    v[i] = xor_128(v[i], v[i+8]);
    v[i+8] = xor_128(v[i+8], h_vecs[i]);
  }
  transpose_vecs_128(&v[0]);
  transpose_vecs_128(&v[4]);
  transpose_vecs_128(&v[8]);
  transpose_vecs_128(&v[12]);
  for (size_t i = 0; i < 4; i++) {
    storeu_128(v[i+ 0], &out[(4*i+0) * sizeof(__m128i)]);
    storeu_128(v[i+ 4], &out[(4*i+1) * sizeof(__m128i)]);
    storeu_128(v[i+ 8], &out[(4*i+2) * sizeof(__m128i)]);
    storeu_128(v[i+12], &out[(4*i+3) * sizeof(__m128i)]);
  }
}

/*
 * ----------------------------------------------------------------------------
 * hash8_avx512
 * ----------------------------------------------------------------------------
 */

// 8-lane (256-bit) counterpart of round_fn4; structure is identical.
INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) {
  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = add_256(v[0], v[4]);
  v[1] = add_256(v[1], v[5]);
  v[2] = add_256(v[2], v[6]);
  v[3] = add_256(v[3], v[7]);
  v[12] = xor_256(v[12], v[0]);
  v[13] = xor_256(v[13], v[1]);
  v[14] = xor_256(v[14], v[2]);
  v[15] = xor_256(v[15], v[3]);
  v[12] = rot16_256(v[12]);
  v[13] = rot16_256(v[13]);
  v[14] = rot16_256(v[14]);
  v[15] = rot16_256(v[15]);
  v[8] = add_256(v[8], v[12]);
  v[9] = add_256(v[9], v[13]);
  v[10] = add_256(v[10], v[14]);
  v[11] = add_256(v[11], v[15]);
  v[4] = xor_256(v[4], v[8]);
  v[5] = xor_256(v[5], v[9]);
  v[6] = xor_256(v[6], v[10]);
  v[7] = xor_256(v[7], v[11]);
  v[4] = rot12_256(v[4]);
  v[5] = rot12_256(v[5]);
  v[6] = rot12_256(v[6]);
  v[7] = rot12_256(v[7]);
  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = add_256(v[0], v[4]);
  v[1] = add_256(v[1], v[5]);
  v[2] = add_256(v[2], v[6]);
  v[3] = add_256(v[3], v[7]);
  v[12] = xor_256(v[12], v[0]);
  v[13] = xor_256(v[13], v[1]);
  v[14] = xor_256(v[14], v[2]);
  v[15] = xor_256(v[15], v[3]);
  v[12] = rot8_256(v[12]);
  v[13] = rot8_256(v[13]);
  v[14] = rot8_256(v[14]);
  v[15] = rot8_256(v[15]);
  v[8] = add_256(v[8], v[12]);
  v[9] = add_256(v[9], v[13]);
  v[10] = add_256(v[10], v[14]);
  v[11] = add_256(v[11], v[15]);
  v[4] = xor_256(v[4], v[8]);
  v[5] = xor_256(v[5], v[9]);
  v[6] = xor_256(v[6], v[10]);
  v[7] = xor_256(v[7], v[11]);
  v[4] = rot7_256(v[4]);
  v[5] = rot7_256(v[5]);
  v[6] = rot7_256(v[6]);
  v[7] = rot7_256(v[7]);
  // Diagonal half of the round.
  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = add_256(v[0], v[5]);
  v[1] = add_256(v[1], v[6]);
  v[2] = add_256(v[2], v[7]);
  v[3] = add_256(v[3], v[4]);
  v[15] = xor_256(v[15], v[0]);
  v[12] = xor_256(v[12], v[1]);
  v[13] = xor_256(v[13], v[2]);
  v[14] = xor_256(v[14], v[3]);
  v[15] = rot16_256(v[15]);
  v[12] = rot16_256(v[12]);
  v[13] = rot16_256(v[13]);
  v[14] = rot16_256(v[14]);
  v[10] = add_256(v[10], v[15]);
  v[11] = add_256(v[11], v[12]);
  v[8] = add_256(v[8], v[13]);
  v[9] = add_256(v[9], v[14]);
  v[5] = xor_256(v[5], v[10]);
  v[6] = xor_256(v[6], v[11]);
  v[7] = xor_256(v[7], v[8]);
  v[4] = xor_256(v[4], v[9]);
  v[5] = rot12_256(v[5]);
  v[6] = rot12_256(v[6]);
  v[7] = rot12_256(v[7]);
  v[4] = rot12_256(v[4]);
  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = add_256(v[0], v[5]);
  v[1] = add_256(v[1], v[6]);
  v[2] = add_256(v[2], v[7]);
  v[3] = add_256(v[3], v[4]);
  v[15] = xor_256(v[15], v[0]);
  v[12] = xor_256(v[12], v[1]);
  v[13] = xor_256(v[13], v[2]);
  v[14] = xor_256(v[14], v[3]);
  v[15] = rot8_256(v[15]);
  v[12] = rot8_256(v[12]);
  v[13] = rot8_256(v[13]);
  v[14] = rot8_256(v[14]);
  v[10] = add_256(v[10], v[15]);
  v[11] = add_256(v[11], v[12]);
  v[8] = add_256(v[8], v[13]);
  v[9] = add_256(v[9], v[14]);
  v[5] = xor_256(v[5], v[10]);
  v[6] = xor_256(v[6], v[11]);
  v[7] = xor_256(v[7], v[8]);
  v[4] = xor_256(v[4], v[9]);
  v[5] = rot7_256(v[5]);
  v[6] = rot7_256(v[6]);
  v[7] = rot7_256(v[7]);
  v[4] = rot7_256(v[4]);
}

// 8x8 transpose of 32-bit words across eight 256-bit vectors.
INLINE void transpose_vecs_256(__m256i vecs[8]) {
  // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
  // is 22/33/66/77.
  __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
  __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
  __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
  __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
  __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
  __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);

  // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
  // 11/33.
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
  __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
  __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
  __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
  __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
  __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
  __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);

  // Interleave 128-bit lanes.
  vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
  vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
  vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
  vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
  vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
  vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
  vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
  vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
}

// Loads one 64-byte block from each of 8 inputs and transposes the 16 message
// words into vertical form. Also prefetches ahead of the current block.
INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
                                size_t block_offset, __m256i out[16]) {
  out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
  out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
  out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
  out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
  out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
  out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
  out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
  out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
  out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
  out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
  out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
  out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
  out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
  out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
  out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
  out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
  for (size_t i = 0; i < 8; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs_256(&out[0]);
  transpose_vecs_256(&out[8]);
}

// Low/high 32-bit halves of 8 consecutive 64-bit counters, or 8 copies of
// `counter` when increment_counter is false.
INLINE void load_counters8(uint64_t counter, bool increment_counter,
                           __m256i *out_lo, __m256i *out_hi) {
  uint64_t mask = (increment_counter ? ~0 : 0);
  __m512i mask_vec = _mm512_set1_epi64(mask);
  __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
  deltas = _mm512_and_si512(mask_vec, deltas);
  __m512i counters =
      _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas);
  *out_lo = _mm512_cvtepi64_epi32(counters);
  *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
}

// Hashes 8 equal-length inputs in parallel, writing 8 concatenated 32-byte
// chaining values to `out`. Mirrors blake3_hash4_avx512 at 256-bit width.
static void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
                                const uint32_t key[8], uint64_t counter,
                                bool increment_counter, uint8_t flags,
                                uint8_t flags_start, uint8_t flags_end,
                                uint8_t *out) {
  __m256i h_vecs[8] = {
      set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
      set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
  };
  __m256i counter_low_vec, counter_high_vec;
  load_counters8(counter, increment_counter, &counter_low_vec,
                 &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN);
    __m256i block_flags_vec = set1_256(block_flags);
    __m256i msg_vecs[16];
    transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    __m256i v[16] = {
        h_vecs[0],       h_vecs[1],        h_vecs[2],     h_vecs[3],
        h_vecs[4],       h_vecs[5],        h_vecs[6],     h_vecs[7],
        set1_256(IV[0]), set1_256(IV[1]),  set1_256(IV[2]), set1_256(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    round_fn8(v, msg_vecs, 0);
    round_fn8(v, msg_vecs, 1);
    round_fn8(v, msg_vecs, 2);
    round_fn8(v, msg_vecs, 3);
    round_fn8(v, msg_vecs, 4);
    round_fn8(v, msg_vecs, 5);
    round_fn8(v, msg_vecs, 6);
    h_vecs[0] = xor_256(v[0], v[8]);
    h_vecs[1] = xor_256(v[1], v[9]);
    h_vecs[2] = xor_256(v[2], v[10]);
    h_vecs[3] = xor_256(v[3], v[11]);
    h_vecs[4] = xor_256(v[4], v[12]);
    h_vecs[5] = xor_256(v[5], v[13]);
    h_vecs[6] = xor_256(v[6], v[14]);
    h_vecs[7] = xor_256(v[7], v[15]);

    block_flags = flags;
  }

  transpose_vecs_256(h_vecs);
  storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]);
  storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]);
  storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]);
  storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]);
  storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]);
  storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]);
  storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]);
  storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
}

// Expands 8 consecutive 64-byte XOF output blocks (512 bytes) for one
// chaining value and message block. Mirrors blake3_xof4_avx512.
static void blake3_xof8_avx512(const uint32_t cv[8],
                               const uint8_t block[BLAKE3_BLOCK_LEN],
                               uint8_t block_len, uint64_t counter,
                               uint8_t flags, uint8_t out[8 * 64]) {
  __m256i h_vecs[8] = {
      set1_256(cv[0]), set1_256(cv[1]), set1_256(cv[2]), set1_256(cv[3]),
      set1_256(cv[4]), set1_256(cv[5]), set1_256(cv[6]), set1_256(cv[7]),
  };
  uint32_t block_words[16];
  load_block_words(block, block_words);
  __m256i msg_vecs[16];
  for (size_t i = 0; i < 16; i++) {
    msg_vecs[i] = set1_256(block_words[i]);
  }
  __m256i counter_low_vec, counter_high_vec;
  load_counters8(counter, true, &counter_low_vec, &counter_high_vec);
  __m256i block_len_vec = set1_256(block_len);
  __m256i block_flags_vec = set1_256(flags);
  __m256i v[16] = {
      h_vecs[0],       h_vecs[1],        h_vecs[2],     h_vecs[3],
      h_vecs[4],       h_vecs[5],        h_vecs[6],     h_vecs[7],
      set1_256(IV[0]), set1_256(IV[1]),  set1_256(IV[2]), set1_256(IV[3]),
      counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
  };
  round_fn8(v, msg_vecs, 0);
  round_fn8(v, msg_vecs, 1);
  round_fn8(v, msg_vecs, 2);
  round_fn8(v, msg_vecs, 3);
  round_fn8(v, msg_vecs, 4);
  round_fn8(v, msg_vecs, 5);
  round_fn8(v, msg_vecs, 6);
  for (size_t i = 0; i < 8; i++) {
    v[i] = xor_256(v[i], v[i+8]);
    v[i+8] = xor_256(v[i+8], h_vecs[i]);
  }
  transpose_vecs_256(&v[0]);
  transpose_vecs_256(&v[8]);
  for (size_t i = 0; i < 8; i++) {
    storeu_256(v[i+0], &out[(2*i+0) * sizeof(__m256i)]);
    storeu_256(v[i+8], &out[(2*i+1) * sizeof(__m256i)]);
  }
}

/*
 * ----------------------------------------------------------------------------
 * hash16_avx512
 * ----------------------------------------------------------------------------
 */

// 16-lane (512-bit) counterpart of round_fn4; structure is identical.
INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) {
  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = add_512(v[0], v[4]);
  v[1] = add_512(v[1], v[5]);
  v[2] = add_512(v[2], v[6]);
  v[3] = add_512(v[3], v[7]);
  v[12] = xor_512(v[12], v[0]);
  v[13] = xor_512(v[13], v[1]);
  v[14] = xor_512(v[14], v[2]);
  v[15] = xor_512(v[15], v[3]);
  v[12] = rot16_512(v[12]);
  v[13] = rot16_512(v[13]);
  v[14] = rot16_512(v[14]);
  v[15] = rot16_512(v[15]);
  v[8] = add_512(v[8], v[12]);
  v[9] = add_512(v[9], v[13]);
  v[10] = add_512(v[10], v[14]);
  v[11] = add_512(v[11], v[15]);
  v[4] = xor_512(v[4], v[8]);
  v[5] = xor_512(v[5], v[9]);
  v[6] = xor_512(v[6], v[10]);
  v[7] = xor_512(v[7], v[11]);
  v[4] = rot12_512(v[4]);
  v[5] = rot12_512(v[5]);
  v[6] = rot12_512(v[6]);
  v[7] = rot12_512(v[7]);
  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = add_512(v[0], v[4]);
  v[1] = add_512(v[1], v[5]);
  v[2] = add_512(v[2], v[6]);
  v[3] = add_512(v[3], v[7]);
  v[12] = xor_512(v[12], v[0]);
  v[13] = xor_512(v[13], v[1]);
  v[14] = xor_512(v[14], v[2]);
  v[15] = xor_512(v[15], v[3]);
  v[12] = rot8_512(v[12]);
  v[13] = rot8_512(v[13]);
  v[14] = rot8_512(v[14]);
  v[15] = rot8_512(v[15]);
  v[8] = add_512(v[8], v[12]);
  v[9] = add_512(v[9], v[13]);
  v[10] = add_512(v[10], v[14]);
  v[11] = add_512(v[11], v[15]);
  v[4] = xor_512(v[4], v[8]);
  v[5] = xor_512(v[5], v[9]);
  v[6] = xor_512(v[6], v[10]);
  v[7] = xor_512(v[7], v[11]);
  v[4] = rot7_512(v[4]);
  v[5] = rot7_512(v[5]);
  v[6] = rot7_512(v[6]);
  v[7] = rot7_512(v[7]);
  // Diagonal half of the round.
  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = add_512(v[0], v[5]);
  v[1] = add_512(v[1], v[6]);
  v[2] = add_512(v[2], v[7]);
  v[3] = add_512(v[3], v[4]);
  v[15] = xor_512(v[15], v[0]);
  v[12] = xor_512(v[12], v[1]);
  v[13] = xor_512(v[13], v[2]);
  v[14] = xor_512(v[14], v[3]);
  v[15] = rot16_512(v[15]);
  v[12] = rot16_512(v[12]);
  v[13] = rot16_512(v[13]);
  v[14] = rot16_512(v[14]);
  v[10] = add_512(v[10], v[15]);
  v[11] = add_512(v[11], v[12]);
  v[8] = add_512(v[8], v[13]);
  v[9] = add_512(v[9], v[14]);
  v[5] = xor_512(v[5], v[10]);
  v[6] = xor_512(v[6], v[11]);
  v[7] = xor_512(v[7], v[8]);
  v[4] = xor_512(v[4], v[9]);
  v[5] = rot12_512(v[5]);
  v[6] = rot12_512(v[6]);
  v[7] = rot12_512(v[7]);
  v[4] = rot12_512(v[4]);
  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = add_512(v[0], v[5]);
  v[1] = add_512(v[1], v[6]);
  v[2] = add_512(v[2], v[7]);
  v[3] = add_512(v[3], v[4]);
  v[15] = xor_512(v[15], v[0]);
  v[12] = xor_512(v[12], v[1]);
  v[13] = xor_512(v[13], v[2]);
  v[14] = xor_512(v[14], v[3]);
  v[15] = rot8_512(v[15]);
  v[12] = rot8_512(v[12]);
  v[13] = rot8_512(v[13]);
  v[14] = rot8_512(v[14]);
  v[10] = add_512(v[10], v[15]);
  v[11] = add_512(v[11], v[12]);
  v[8] = add_512(v[8], v[13]);
  v[9] = add_512(v[9], v[14]);
  v[5] = xor_512(v[5], v[10]);
  v[6] = xor_512(v[6], v[11]);
  v[7] = xor_512(v[7], v[8]);
  v[4] = xor_512(v[4], v[9]);
  v[5] = rot7_512(v[5]);
  v[6] = rot7_512(v[6]);
  v[7] = rot7_512(v[7]);
  v[4] = rot7_512(v[4]);
}

// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order
#define LO_IMM8 0x88

INLINE __m512i unpack_lo_128(__m512i a, __m512i b) {
  return _mm512_shuffle_i32x4(a, b, LO_IMM8);
}

// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order
#define HI_IMM8 0xdd

INLINE __m512i unpack_hi_128(__m512i a, __m512i b) {
  return _mm512_shuffle_i32x4(a, b, HI_IMM8);
}

// 16x16 transpose of 32-bit words across sixteen 512-bit vectors.
INLINE void transpose_vecs_512(__m512i vecs[16]) {
  // Interleave 32-bit lanes. The _0 unpack is lanes
  // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes
  // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15.
  __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
  __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
  __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
  __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
  __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
  __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
  __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
  __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
  __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
  __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
  __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
  __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
  __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
  __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
  __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
  __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);

  // Interleave 64-bit lanes. The _0 unpack is lanes
  // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
  // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
  // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
  // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15.
  __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
  __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
  __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
  __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
  __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
  __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
  __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
  __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
  __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
  __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
  __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
  __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
  __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
  __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
  __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
  __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);

  // Interleave 128-bit lanes. The _0 unpack is
  // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is
  // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on.
  __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0);
  __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1);
  __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2);
  __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3);
  __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0);
  __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1);
  __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2);
  __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3);
  __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0);
  __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1);
  __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2);
  __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3);
  __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0);
  __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1);
  __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2);
  __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3);

  // Interleave 128-bit lanes again for the final outputs.
  vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0);
  vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1);
  vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2);
  vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3);
  vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4);
  vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5);
  vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6);
  vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7);
  vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0);
  vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1);
  vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2);
  vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3);
  vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4);
  vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5);
  vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6);
  vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7);
}

// Loads one 64-byte block from each of 16 inputs (each block fills one whole
// 512-bit vector) and transposes the 16 message words into vertical form.
// Also prefetches ahead of the current block.
INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
                                 size_t block_offset, __m512i out[16]) {
  out[0] = loadu_512(&inputs[0][block_offset]);
  out[1] = loadu_512(&inputs[1][block_offset]);
  out[2] = loadu_512(&inputs[2][block_offset]);
  out[3] = loadu_512(&inputs[3][block_offset]);
  out[4] = loadu_512(&inputs[4][block_offset]);
  out[5] = loadu_512(&inputs[5][block_offset]);
  out[6] = loadu_512(&inputs[6][block_offset]);
  out[7] = loadu_512(&inputs[7][block_offset]);
  out[8] = loadu_512(&inputs[8][block_offset]);
  out[9] = loadu_512(&inputs[9][block_offset]);
  out[10] = loadu_512(&inputs[10][block_offset]);
  out[11] = loadu_512(&inputs[11][block_offset]);
  out[12] = loadu_512(&inputs[12][block_offset]);
  out[13] = loadu_512(&inputs[13][block_offset]);
  out[14] = loadu_512(&inputs[14][block_offset]);
  out[15] = loadu_512(&inputs[15][block_offset]);
  for (size_t i = 0; i < 16; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs_512(out);
}

INLINE void load_counters16(uint64_t counter, bool increment_counter,
                            __m512i *out_lo, __m512i *out_hi) {
  const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
  const __m512i deltas = _mm512_set_epi32(15,
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); const __m512i masked_deltas = _mm512_and_si512(deltas, mask); const __m512i low_words = _mm512_add_epi32( _mm512_set1_epi32((int32_t)counter), masked_deltas); // The carry bit is 1 if the high bit of the word was 1 before addition and is // 0 after. // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to // compute the carry bits here, and originally we did, but that intrinsic is // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271. const __m512i carries = _mm512_srli_epi32( _mm512_andnot_si512( low_words, // 0 after (gets inverted by andnot) _mm512_set1_epi32((int32_t)counter)), // and 1 before 31); const __m512i high_words = _mm512_add_epi32( _mm512_set1_epi32((int32_t)(counter >> 32)), carries); *out_lo = low_words; *out_hi = high_words; } static void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m512i h_vecs[8] = { set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), }; __m512i counter_low_vec, counter_high_vec; load_counters16(counter, increment_counter, &counter_low_vec, &counter_high_vec); uint8_t block_flags = flags | flags_start; for (size_t block = 0; block < blocks; block++) { if (block + 1 == blocks) { block_flags |= flags_end; } __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN); __m512i block_flags_vec = set1_512(block_flags); __m512i msg_vecs[16]; transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); __m512i v[16] = { h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn16(v, msg_vecs, 0); round_fn16(v, msg_vecs, 1); 
    round_fn16(v, msg_vecs, 2);
    round_fn16(v, msg_vecs, 3);
    round_fn16(v, msg_vecs, 4);
    round_fn16(v, msg_vecs, 5);
    round_fn16(v, msg_vecs, 6);
    // Feed-forward: the new chaining value is the xor of the two state halves.
    h_vecs[0] = xor_512(v[0], v[8]);
    h_vecs[1] = xor_512(v[1], v[9]);
    h_vecs[2] = xor_512(v[2], v[10]);
    h_vecs[3] = xor_512(v[3], v[11]);
    h_vecs[4] = xor_512(v[4], v[12]);
    h_vecs[5] = xor_512(v[5], v[13]);
    h_vecs[6] = xor_512(v[6], v[14]);
    h_vecs[7] = xor_512(v[7], v[15]);
    // Only the first block carries flags_start; reset for subsequent blocks.
    block_flags = flags;
  }

  // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8
  // state vectors. Pad the matrix with zeros. After transposition, store the
  // lower half of each vector.
  __m512i padded[16] = {
      h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
      h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
      set1_512(0), set1_512(0), set1_512(0), set1_512(0),
      set1_512(0), set1_512(0), set1_512(0), set1_512(0),
  };
  transpose_vecs_512(padded);
  // Store the low 256 bits (the 8 meaningful words) of each transposed
  // vector: 32 bytes of output per lane.
  _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[0]));
  _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[1]));
  _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[2]));
  _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[3]));
  _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[4]));
  _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[5]));
  _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[6]));
  _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[7]));
  _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[8]));
  _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[9]));
  _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[10]));
  _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[11]));
  _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[12]));
  _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[13]));
  _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[14]));
  _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1,
                           _mm512_castsi512_si256(padded[15]));
}

// Extended output (XOF): compress the same single block 16 times with 16
// consecutive counter values, producing 16 * 64 bytes of output. Unlike
// hash16 above, the full 64-byte state (both halves) is emitted per lane.
static void blake3_xof16_avx512(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[16 * 64]) {
  __m512i h_vecs[8] = {
      set1_512(cv[0]), set1_512(cv[1]), set1_512(cv[2]), set1_512(cv[3]),
      set1_512(cv[4]), set1_512(cv[5]), set1_512(cv[6]), set1_512(cv[7]),
  };
  uint32_t block_words[16];
  load_block_words(block, block_words);
  // The message is identical in every lane; only the counter differs.
  __m512i msg_vecs[16];
  for (size_t i = 0; i < 16; i++) {
    msg_vecs[i] = set1_512(block_words[i]);
  }
  __m512i counter_low_vec, counter_high_vec;
  load_counters16(counter, true, &counter_low_vec, &counter_high_vec);
  __m512i block_len_vec = set1_512(block_len);
  __m512i block_flags_vec = set1_512(flags);
  __m512i v[16] = {
      h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
      h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
      set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
      counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
  };
  round_fn16(v, msg_vecs, 0);
  round_fn16(v, msg_vecs, 1);
  round_fn16(v, msg_vecs, 2);
  round_fn16(v, msg_vecs, 3);
  round_fn16(v, msg_vecs, 4);
  round_fn16(v, msg_vecs, 5);
  round_fn16(v, msg_vecs, 6);
  // XOF finalization: low half is v ^ (v+8); high half is (v+8) ^ input cv.
  for (size_t i = 0; i < 8; i++) {
    v[i] = xor_512(v[i], v[i+8]);
    v[i+8] = xor_512(v[i+8], h_vecs[i]);
  }
  transpose_vecs_512(&v[0]);
  for (size_t i = 0; i < 16; i++) {
    storeu_512(v[i], &out[i * sizeof(__m512i)]);
  }
}

/*
 * ----------------------------------------------------------------------------
 * hash_many_avx512
 *
---------------------------------------------------------------------------- */ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); uint8_t block_flags = flags | flags_start; while (blocks > 0) { if (blocks == 1) { block_flags |= flags_end; } blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; } memcpy(out, cv, BLAKE3_OUT_LEN); } void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { while (num_inputs >= 16) { blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += 16; } inputs += 16; num_inputs -= 16; out = &out[16 * BLAKE3_OUT_LEN]; } while (num_inputs >= 8) { blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += 8; } inputs += 8; num_inputs -= 8; out = &out[8 * BLAKE3_OUT_LEN]; } while (num_inputs >= 4) { blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += 4; } inputs += 4; num_inputs -= 4; out = &out[4 * BLAKE3_OUT_LEN]; } while (num_inputs > 0) { hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += 1; } inputs += 1; num_inputs -= 1; out = &out[BLAKE3_OUT_LEN]; } } void blake3_xof_many_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t* out, size_t outblocks) { while (outblocks >= 16) { 
blake3_xof16_avx512(cv, block, block_len, counter, flags, out); counter += 16; outblocks -= 16; out += 16 * BLAKE3_BLOCK_LEN; } while (outblocks >= 8) { blake3_xof8_avx512(cv, block, block_len, counter, flags, out); counter += 8; outblocks -= 8; out += 8 * BLAKE3_BLOCK_LEN; } while (outblocks >= 4) { blake3_xof4_avx512(cv, block, block_len, counter, flags, out); counter += 4; outblocks -= 4; out += 4 * BLAKE3_BLOCK_LEN; } while (outblocks > 0) { blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); counter += 1; outblocks -= 1; out += BLAKE3_BLOCK_LEN; } } blake3-1.8.1/c/blake3_avx512_x86-64_unix.S000064400000000000000000004736101046102023000156230ustar 00000000000000#if defined(__ELF__) && defined(__linux__) .section .note.GNU-stack,"",%progbits #endif #if defined(__ELF__) && defined(__CET__) && defined(__has_include) #if __has_include() #include #endif #endif #if !defined(_CET_ENDBR) #define _CET_ENDBR #endif .intel_syntax noprefix .global _blake3_hash_many_avx512 .global blake3_hash_many_avx512 .global blake3_compress_in_place_avx512 .global _blake3_compress_in_place_avx512 .global blake3_compress_xof_avx512 .global _blake3_compress_xof_avx512 .global blake3_xof_many_avx512 .global _blake3_xof_many_avx512 #ifdef __APPLE__ .text #else .section .text #endif .p2align 6 _blake3_hash_many_avx512: blake3_hash_many_avx512: _CET_ENDBR push r15 push r14 push r13 push r12 push rbx push rbp mov rbp, rsp sub rsp, 144 and rsp, 0xFFFFFFFFFFFFFFC0 neg r9 kmovw k1, r9d vmovd xmm0, r8d vpbroadcastd ymm0, xmm0 shr r8, 32 vmovd xmm1, r8d vpbroadcastd ymm1, xmm1 vmovdqa ymm4, ymm1 vmovdqa ymm5, ymm1 vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] vpcmpltud k2, ymm2, ymm0 vpcmpltud k3, ymm3, ymm0 vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} knotw k2, k1 vmovdqa32 ymm2 {k2}, ymm0 vmovdqa32 ymm3 {k2}, ymm0 vmovdqa32 ymm4 {k2}, ymm1 vmovdqa32 ymm5 {k2}, ymm1 vmovdqa 
ymmword ptr [rsp], ymm2 vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 shl rdx, 6 mov qword ptr [rsp+0x80], rdx cmp rsi, 16 jc 3f 2: vpbroadcastd zmm0, dword ptr [rcx] vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] movzx eax, byte ptr [rbp+0x38] movzx ebx, byte ptr [rbp+0x40] or eax, ebx xor edx, edx .p2align 5 9: movzx ebx, byte ptr [rbp+0x48] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+0x80] cmove eax, ebx mov dword ptr [rsp+0x88], eax mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x40] mov r13, qword ptr [rdi+0x48] mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm8, zmm16, zmm17 vpunpckhqdq zmm9, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm10, zmm18, zmm19 vpunpckhqdq zmm11, zmm18, zmm19 mov r8, qword ptr [rdi+0x20] mov r9, qword ptr [rdi+0x28] mov r10, qword ptr [rdi+0x30] mov r11, qword ptr [rdi+0x38] mov r12, qword ptr [rdi+0x60] mov r13, qword ptr [rdi+0x68] mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] vinserti64x4 zmm17, 
zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm12, zmm16, zmm17 vpunpckhqdq zmm13, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm14, zmm18, zmm19 vpunpckhqdq zmm15, zmm18, zmm19 vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] vshufps zmm16, zmm8, zmm10, 136 vshufps zmm17, zmm12, zmm14, 136 vmovdqa32 zmm20, zmm16 vpermt2d zmm16, zmm27, zmm17 vpermt2d zmm20, zmm31, zmm17 vshufps zmm17, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm21, zmm17 vpermt2d zmm17, zmm27, zmm30 vpermt2d zmm21, zmm31, zmm30 vshufps zmm18, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm22, zmm18 vpermt2d zmm18, zmm27, zmm8 vpermt2d zmm22, zmm31, zmm8 vshufps zmm19, zmm9, zmm11, 221 vshufps zmm8, zmm13, zmm15, 221 vmovdqa32 zmm23, zmm19 vpermt2d zmm19, zmm27, zmm8 vpermt2d zmm23, zmm31, zmm8 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x40] mov r13, qword ptr [rdi+0x48] mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm8, zmm24, zmm25 vpunpckhqdq zmm9, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm10, zmm24, zmm25 vpunpckhqdq zmm11, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] 
prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] mov r8, qword ptr [rdi+0x20] mov r9, qword ptr [rdi+0x28] mov r10, qword ptr [rdi+0x30] mov r11, qword ptr [rdi+0x38] mov r12, qword ptr [rdi+0x60] mov r13, qword ptr [rdi+0x68] mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm12, zmm24, zmm25 vpunpckhqdq zmm13, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm14, zmm24, zmm25 vpunpckhqdq zmm15, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] vshufps zmm24, zmm8, zmm10, 136 vshufps zmm30, zmm12, zmm14, 136 vmovdqa32 zmm28, zmm24 vpermt2d zmm24, zmm27, zmm30 vpermt2d zmm28, zmm31, zmm30 vshufps zmm25, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm29, zmm25 vpermt2d zmm25, zmm27, zmm30 vpermt2d zmm29, zmm31, zmm30 vshufps zmm26, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm30, zmm26 vpermt2d zmm26, zmm27, zmm8 vpermt2d zmm30, zmm31, zmm8 vshufps zmm8, zmm9, zmm11, 221 vshufps zmm10, zmm13, zmm15, 221 vpermi2d zmm27, zmm8, zmm10 vpermi2d zmm31, zmm8, zmm10 vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] vmovdqa32 zmm12, zmmword ptr [rsp] vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] vpbroadcastd zmm15, dword 
ptr [rsp+0x22*0x4] vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm24 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 
12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm25 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm23 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, 
zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, 
zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm27 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm21 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, 
zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 
vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm24 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm28 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm26 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 
vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm25 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm24 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 
vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm22 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 
vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm31 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord 
zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpxord zmm0, zmm0, zmm8 vpxord zmm1, zmm1, zmm9 vpxord zmm2, zmm2, zmm10 vpxord zmm3, zmm3, zmm11 vpxord zmm4, zmm4, zmm12 vpxord zmm5, zmm5, zmm13 vpxord zmm6, zmm6, zmm14 vpxord zmm7, zmm7, zmm15 movzx eax, byte ptr [rbp+0x38] jne 9b mov rbx, qword ptr [rbp+0x50] vpunpckldq zmm16, zmm0, zmm1 vpunpckhdq zmm17, zmm0, zmm1 vpunpckldq zmm18, zmm2, zmm3 vpunpckhdq zmm19, zmm2, zmm3 vpunpckldq zmm20, zmm4, zmm5 vpunpckhdq zmm21, zmm4, zmm5 vpunpckldq zmm22, zmm6, zmm7 vpunpckhdq zmm23, zmm6, zmm7 vpunpcklqdq zmm0, zmm16, zmm18 vpunpckhqdq zmm1, zmm16, zmm18 vpunpcklqdq zmm2, zmm17, zmm19 vpunpckhqdq zmm3, zmm17, zmm19 vpunpcklqdq zmm4, zmm20, zmm22 vpunpckhqdq zmm5, zmm20, zmm22 vpunpcklqdq zmm6, zmm21, zmm23 vpunpckhqdq zmm7, zmm21, zmm23 vshufi32x4 zmm16, zmm0, zmm4, 0x88 vshufi32x4 zmm17, zmm1, zmm5, 0x88 vshufi32x4 zmm18, zmm2, zmm6, 0x88 vshufi32x4 
zmm19, zmm3, zmm7, 0x88 vshufi32x4 zmm20, zmm0, zmm4, 0xDD vshufi32x4 zmm21, zmm1, zmm5, 0xDD vshufi32x4 zmm22, zmm2, zmm6, 0xDD vshufi32x4 zmm23, zmm3, zmm7, 0xDD vshufi32x4 zmm0, zmm16, zmm17, 0x88 vshufi32x4 zmm1, zmm18, zmm19, 0x88 vshufi32x4 zmm2, zmm20, zmm21, 0x88 vshufi32x4 zmm3, zmm22, zmm23, 0x88 vshufi32x4 zmm4, zmm16, zmm17, 0xDD vshufi32x4 zmm5, zmm18, zmm19, 0xDD vshufi32x4 zmm6, zmm20, zmm21, 0xDD vshufi32x4 zmm7, zmm22, zmm23, 0xDD vmovdqu32 zmmword ptr [rbx], zmm0 vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 vmovdqa32 zmm0, zmmword ptr [rsp] vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] vmovdqa32 zmm2, zmm0 vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} vpcmpltud k2, zmm2, zmm0 vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} vmovdqa32 zmmword ptr [rsp], zmm2 vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 add rdi, 128 add rbx, 512 mov qword ptr [rbp+0x50], rbx sub rsi, 16 cmp rsi, 16 jnc 2b test rsi, rsi jnz 3f 4: vzeroupper mov rsp, rbp pop rbp pop rbx pop r12 pop r13 pop r14 pop r15 ret .p2align 6 3: test esi, 0x8 je 3f vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+0x4] vpbroadcastd ymm2, dword ptr [rcx+0x8] vpbroadcastd ymm3, dword ptr [rcx+0xC] vpbroadcastd ymm4, dword ptr [rcx+0x10] vpbroadcastd ymm5, dword ptr [rcx+0x14] vpbroadcastd ymm6, dword ptr [rcx+0x18] vpbroadcastd ymm7, dword ptr [rcx+0x1C] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x20] mov r13, qword ptr [rdi+0x28] mov r14, qword ptr [rdi+0x30] mov r15, qword ptr [rdi+0x38] movzx eax, byte ptr [rbp+0x38] movzx ebx, byte ptr [rbp+0x40] or eax, ebx xor edx, edx 2: movzx ebx, byte ptr [rbp+0x48] or ebx, eax add rdx, 
64 cmp rdx, qword ptr [rsp+0x80] cmove eax, ebx mov dword ptr [rsp+0x88], eax vmovups xmm8, xmmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x40] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x40] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm16, ymm12, ymm14, 136 vshufps ymm17, ymm12, ymm14, 221 vshufps ymm18, ymm13, ymm15, 136 vshufps ymm19, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x30] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x30] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm20, ymm12, ymm14, 136 vshufps ymm21, ymm12, ymm14, 221 vshufps ymm22, ymm13, ymm15, 136 vshufps ymm23, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x20] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x20] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm24, ymm12, ymm14, 136 vshufps ymm25, ymm12, ymm14, 221 vshufps ymm26, ymm13, ymm15, 136 vshufps ymm27, ymm13, 
ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x10] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x10] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm28, ymm12, ymm14, 136 vshufps ymm29, ymm12, ymm14, 221 vshufps ymm30, ymm13, ymm15, 136 vshufps ymm31, ymm13, ymm15, 221 vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] vmovdqa ymm12, ymmword ptr [rsp] vmovdqa ymm13, ymmword ptr [rsp+0x40] vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] vpbroadcastd ymm15, dword ptr [rsp+0x88] vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 
vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm24 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm23 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord 
ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord 
ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm27 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, 
ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm21 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm31 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm19 vpaddd ymm3, ymm3, ymm31 vpaddd 
ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm28 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, 
ymm2, ymm31 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm26 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, 
ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, 
ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm22 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord 
ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm31 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm19 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord 
ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+0x38] jne 2b mov rbx, qword ptr [rbp+0x50] vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0xCC vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0xCC vblendps ymm3, ymm12, ymm9, 0xCC vperm2f128 ymm12, ymm1, ymm2, 0x20 vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0xCC vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 0x20 vmovups ymmword ptr [rbx+0x20], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0xCC vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 0xCC vblendps ymm14, ymm14, ymm13, 0xCC vperm2f128 ymm8, ymm10, ymm14, 0x20 vmovups ymmword ptr [rbx+0x40], ymm8 vblendps ymm15, ymm13, ymm15, 0xCC vperm2f128 ymm13, ymm6, ymm15, 0x20 vmovups ymmword ptr [rbx+0x60], ymm13 vperm2f128 ymm9, ymm1, ymm2, 0x31 vperm2f128 ymm11, ymm3, ymm4, 0x31 vmovups ymmword ptr [rbx+0x80], ymm9 vperm2f128 ymm14, ymm10, ymm14, 0x31 vperm2f128 ymm15, ymm6, ymm15, 0x31 vmovups ymmword ptr [rbx+0xA0], ymm11 vmovups ymmword ptr [rbx+0xC0], ymm14 vmovups ymmword ptr [rbx+0xE0], ymm15 vmovdqa ymm0, ymmword ptr [rsp] vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] vmovdqa ymmword ptr [rsp], ymm0 vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 add rbx, 256 mov qword ptr [rbp+0x50], rbx add rdi, 64 sub rsi, 8 3: mov rbx, qword ptr [rbp+0x50] mov r15, qword ptr [rsp+0x80] movzx r13, byte ptr [rbp+0x38] movzx r12, byte ptr 
[rbp+0x48] test esi, 0x4 je 3f vbroadcasti32x4 zmm0, xmmword ptr [rcx] vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] vmovdqa xmm12, xmmword ptr [rsp] vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] vpunpckldq xmm14, xmm12, xmm13 vpunpckhdq xmm15, xmm12, xmm13 vpermq ymm14, ymm14, 0xDC vpermq ymm15, ymm15, 0xDC vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] vinserti64x4 zmm13, zmm14, ymm15, 0x01 mov eax, 17476 kmovw k2, eax vpblendmd zmm13 {k2}, zmm13, zmm12 vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov eax, 43690 kmovw k3, eax mov eax, 34952 kmovw k4, eax movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x88], eax vmovdqa32 zmm2, zmm15 vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] vpblendmd zmm3 {k4}, zmm13, zmm8 vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 vmovups zmm9, zmmword ptr [r8+rdx-0x30] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 vshufps zmm4, zmm8, zmm9, 136 vshufps zmm5, zmm8, zmm9, 221 vmovups zmm8, zmmword ptr [r8+rdx-0x20] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 vmovups zmm9, zmmword ptr [r8+rdx-0x10] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 vshufps zmm6, zmm8, zmm9, 136 vshufps zmm7, zmm8, zmm9, 221 vpshufd zmm6, zmm6, 
0x93 vpshufd zmm7, zmm7, 0x93 mov al, 7 9: vpaddd zmm0, zmm0, zmm4 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm5 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 0x93 vpshufd zmm3, zmm3, 0x4E vpshufd zmm2, zmm2, 0x39 vpaddd zmm0, zmm0, zmm6 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm7 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 0x39 vpshufd zmm3, zmm3, 0x4E vpshufd zmm2, zmm2, 0x93 dec al jz 9f vshufps zmm8, zmm4, zmm5, 214 vpshufd zmm9, zmm4, 0x0F vpshufd zmm4, zmm8, 0x39 vshufps zmm8, zmm6, zmm7, 250 vpblendmd zmm9 {k3}, zmm9, zmm8 vpunpcklqdq zmm8, zmm7, zmm5 vpblendmd zmm8 {k4}, zmm8, zmm6 vpshufd zmm8, zmm8, 0x78 vpunpckhdq zmm5, zmm5, zmm7 vpunpckldq zmm6, zmm6, zmm5 vpshufd zmm7, zmm6, 0x1E vmovdqa32 zmm5, zmm9 vmovdqa32 zmm6, zmm8 jmp 9b 9: vpxord zmm0, zmm0, zmm2 vpxord zmm1, zmm1, zmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+0x40] vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+0x40], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 3: test esi, 0x2 je 3f vbroadcasti128 ymm0, xmmword 
ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovd xmm13, dword ptr [rsp] vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovd xmm14, dword ptr [rsp+0x4] vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vinserti128 ymm13, ymm13, xmm14, 0x01 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x88], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vpbroadcastd ymm8, dword ptr [rsp+0x88] vpblendd ymm3, ymm13, ymm8, 0x88 vmovups ymm8, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 7 vpshufd ymm0, ymm0, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 7 
vpshufd ymm0, ymm0, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x93 dec al jz 9f vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0x0F vpshufd ymm4, ymm8, 0x39 vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 0xAA vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 0x88 vpshufd ymm8, ymm8, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 add rbx, 64 add rdi, 16 sub rsi, 2 3: test esi, 0x1 je 4b vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+0x10] vmovd xmm14, dword ptr [rsp] vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vpinsrd xmm3, xmm14, eax, 3 vmovdqa xmm2, xmm15 vmovups xmm8, xmmword ptr [r8+rdx-0x40] vmovups xmm9, xmmword ptr [r8+rdx-0x30] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vmovups xmm9, xmmword ptr [r8+rdx-0x10] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxord 
xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 jmp 4b /* NOTE(review): everything above is the tail of the multi-input hashing routine that begins before this region; its labels (2b, 4b) resolve outside this span. */ .p2align 6 /* blake3_compress_in_place_avx512(cv=[rdi], block=[rsi], block_len=dl, counter=rcx, flags=r8b): run one BLAKE3 compression of the 64-byte block and overwrite the 32-byte chaining value at [rdi] with the first half of the result. */ _blake3_compress_in_place_avx512: blake3_compress_in_place_avx512: _CET_ENDBR vmovdqu xmm0, xmmword ptr [rdi] vmovdqu xmm1, xmmword ptr [rdi+0x10] movzx eax, r8b movzx edx, dl shl rax, 32 add rdx, rax vmovq xmm3, rcx vmovq xmm4, rdx vpunpcklqdq xmm3, xmm3, xmm4 /* xmm3 = (counter_lo, counter_hi, block_len, flags): fourth row of the 4x4 state */ vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] vmovups xmm8, xmmword ptr [rsi] vmovups xmm9, xmmword ptr [rsi+0x10] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [rsi+0x20] vmovups xmm9, xmmword ptr [rsi+0x30] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 /* al counts the 7 rounds */ 9: /* one round: column step, rows diagonalized via vpshufd, diagonal step, rows restored */ vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3,
xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f /* between rounds (all but the last): permute the message words held in xmm4..xmm7 for the next round */ vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 /* feed-forward: xor the two state halves to form the new CV, stored back in place over [rdi] */ vmovdqu xmmword ptr [rdi], xmm0 vmovdqu xmmword ptr [rdi+0x10], xmm1 ret .p2align 6 /* blake3_compress_xof_avx512(cv=[rdi], block=[rsi], block_len=dl, counter=rcx, flags=r8b, out=[r9]): identical compression to the in-place variant above, but writes the full 64-byte extended output to [r9] and does not modify [rdi]. */ _blake3_compress_xof_avx512: blake3_compress_xof_avx512: _CET_ENDBR vmovdqu xmm0, xmmword ptr [rdi] vmovdqu xmm1, xmmword ptr [rdi+0x10] movzx eax, r8b movzx edx, dl shl rax, 32 add rdx, rax vmovq xmm3, rcx vmovq xmm4, rdx vpunpcklqdq xmm3, xmm3, xmm4 vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] vmovups xmm8, xmmword ptr [rsi] vmovups xmm9, xmmword ptr [rsi+0x10] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [rsi+0x20] vmovups xmm9, xmmword ptr [rsi+0x30] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 /* 7 rounds, same structure as above */ 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7
vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 vpxor xmm2, xmm2, [rdi] vpxor xmm3, xmm3, [rdi+0x10] /* bytes 32..63 of the XOF output: low state half xored with the input CV; all 64 bytes go to [r9] */ vmovdqu xmmword ptr [r9], xmm0 vmovdqu xmmword ptr [r9+0x10], xmm1 vmovdqu xmmword ptr [r9+0x20], xmm2 vmovdqu xmmword ptr [r9+0x30], xmm3 ret /* NOTE(review): blake3_xof_many_avx512 starts here; its wide multi-block path continues past this region. */ .p2align 6 blake3_xof_many_avx512: _blake3_xof_many_avx512: _CET_ENDBR mov r10,QWORD PTR [rsp+0x8] cmp r10,0x1 ja 2f vmovdqu xmm0,XMMWORD PTR [rdi] vmovdqu xmm1,XMMWORD PTR [rdi+0x10] movzx eax,r8b movzx edx,dl shl rax,0x20 add rdx,rax vmovq xmm3,rcx vmovq xmm4,rdx vpunpcklqdq xmm3,xmm3,xmm4 vmovaps xmm2,XMMWORD PTR [BLAKE3_IV+rip] vmovups xmm8,XMMWORD PTR [rsi] vmovups xmm9,XMMWORD PTR [rsi+0x10] vshufps xmm4,xmm8,xmm9,0x88 vshufps xmm5,xmm8,xmm9,0xdd vmovups xmm8,XMMWORD PTR [rsi+0x20] vmovups xmm9,XMMWORD PTR [rsi+0x30] vshufps xmm6,xmm8,xmm9,0x88 vshufps xmm7,xmm8,xmm9,0xdd vpshufd xmm6,xmm6,0x93 vpshufd xmm7,xmm7,0x93 mov al,0x7 3: vpaddd xmm0,xmm0,xmm4 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x10 vpaddd xmm2,xmm2,xmm3 vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0xc vpaddd xmm0,xmm0,xmm5 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x8 vpaddd xmm2,xmm2,xmm3
vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0x7 vpshufd xmm0,xmm0,0x93 vpshufd xmm3,xmm3,0x4e vpshufd xmm2,xmm2,0x39 vpaddd xmm0,xmm0,xmm6 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x10 vpaddd xmm2,xmm2,xmm3 vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0xc vpaddd xmm0,xmm0,xmm7 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x8 vpaddd xmm2,xmm2,xmm3 vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0x7 vpshufd xmm0,xmm0,0x39 vpshufd xmm3,xmm3,0x4e vpshufd xmm2,xmm2,0x93 dec al je 3f vshufps xmm8,xmm4,xmm5,0xd6 vpshufd xmm9,xmm4,0xf vpshufd xmm4,xmm8,0x39 vshufps xmm8,xmm6,xmm7,0xfa vpblendd xmm9,xmm9,xmm8,0xaa vpunpcklqdq xmm8,xmm7,xmm5 vpblendd xmm8,xmm8,xmm6,0x88 vpshufd xmm8,xmm8,0x78 vpunpckhdq xmm5,xmm5,xmm7 vpunpckldq xmm6,xmm6,xmm5 vpshufd xmm7,xmm6,0x1e vmovdqa xmm5,xmm9 vmovdqa xmm6,xmm8 jmp 3b 3: vpxor xmm0,xmm0,xmm2 vpxor xmm1,xmm1,xmm3 vpxor xmm2,xmm2,XMMWORD PTR [rdi] vpxor xmm3,xmm3,XMMWORD PTR [rdi+0x10] vmovdqu XMMWORD PTR [r9],xmm0 vmovdqu XMMWORD PTR [r9+0x10],xmm1 vmovdqu XMMWORD PTR [r9+0x20],xmm2 vmovdqu XMMWORD PTR [r9+0x30],xmm3 ret .p2align 6 2: push rbp mov rbp,rsp sub rsp,0x90 and rsp,0xffffffffffffffc0 vpbroadcastd zmm0,ecx shr rcx,0x20 vpbroadcastd zmm1,ecx vpaddd zmm2,zmm0,ZMMWORD PTR [ADD0+rip] vpcmpltud k1,zmm2,zmm0 vpaddd zmm1{k1},zmm1,DWORD PTR [ADD1+rip]{1to16} vmovdqa32 ZMMWORD PTR [rsp],zmm2 vmovdqa32 ZMMWORD PTR [rsp+0x40],zmm1 cmp r10,0x10 jb 2f 3: vpbroadcastd zmm16,DWORD PTR [rsi] vpbroadcastd zmm17,DWORD PTR [rsi+0x4] vpbroadcastd zmm18,DWORD PTR [rsi+0x8] vpbroadcastd zmm19,DWORD PTR [rsi+0xc] vpbroadcastd zmm20,DWORD PTR [rsi+0x10] vpbroadcastd zmm21,DWORD PTR [rsi+0x14] vpbroadcastd zmm22,DWORD PTR [rsi+0x18] vpbroadcastd zmm23,DWORD PTR [rsi+0x1c] vpbroadcastd zmm24,DWORD PTR [rsi+0x20] vpbroadcastd zmm25,DWORD PTR [rsi+0x24] vpbroadcastd zmm26,DWORD PTR [rsi+0x28] vpbroadcastd zmm27,DWORD PTR [rsi+0x2c] vpbroadcastd zmm28,DWORD PTR [rsi+0x30] vpbroadcastd zmm29,DWORD PTR [rsi+0x34] vpbroadcastd zmm30,DWORD 
PTR [rsi+0x38] vpbroadcastd zmm31,DWORD PTR [rsi+0x3c] vpbroadcastd zmm0,DWORD PTR [rdi] vpbroadcastd zmm1,DWORD PTR [rdi+0x4] vpbroadcastd zmm2,DWORD PTR [rdi+0x8] vpbroadcastd zmm3,DWORD PTR [rdi+0xc] vpbroadcastd zmm4,DWORD PTR [rdi+0x10] vpbroadcastd zmm5,DWORD PTR [rdi+0x14] vpbroadcastd zmm6,DWORD PTR [rdi+0x18] vpbroadcastd zmm7,DWORD PTR [rdi+0x1c] vpbroadcastd zmm8,DWORD PTR [BLAKE3_IV_0+rip] vpbroadcastd zmm9,DWORD PTR [BLAKE3_IV_1+rip] vpbroadcastd zmm10,DWORD PTR [BLAKE3_IV_2+rip] vpbroadcastd zmm11,DWORD PTR [BLAKE3_IV_3+rip] vmovdqa32 zmm12,ZMMWORD PTR [rsp] vmovdqa32 zmm13,ZMMWORD PTR [rsp+0x40] vpbroadcastd zmm14,edx vpbroadcastd zmm15,r8d vpaddd zmm0,zmm0,zmm16 vpaddd zmm1,zmm1,zmm18 vpaddd zmm2,zmm2,zmm20 vpaddd zmm3,zmm3,zmm22 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm17 vpaddd zmm1,zmm1,zmm19 vpaddd zmm2,zmm2,zmm21 vpaddd zmm3,zmm3,zmm23 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm24 vpaddd zmm1,zmm1,zmm26 vpaddd 
zmm2,zmm2,zmm28 vpaddd zmm3,zmm3,zmm30 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm25 vpaddd zmm1,zmm1,zmm27 vpaddd zmm2,zmm2,zmm29 vpaddd zmm3,zmm3,zmm31 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpaddd zmm0,zmm0,zmm18 vpaddd zmm1,zmm1,zmm19 vpaddd zmm2,zmm2,zmm23 vpaddd zmm3,zmm3,zmm20 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm22 vpaddd zmm1,zmm1,zmm26 vpaddd zmm2,zmm2,zmm16 vpaddd zmm3,zmm3,zmm29 vpaddd zmm0,zmm0,zmm4 vpaddd 
zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm17 vpaddd zmm1,zmm1,zmm28 vpaddd zmm2,zmm2,zmm25 vpaddd zmm3,zmm3,zmm31 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm27 vpaddd zmm1,zmm1,zmm21 vpaddd zmm2,zmm2,zmm30 vpaddd zmm3,zmm3,zmm24 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpaddd zmm0,zmm0,zmm19 vpaddd zmm1,zmm1,zmm26 vpaddd zmm2,zmm2,zmm29 vpaddd zmm3,zmm3,zmm23 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord 
zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm20 vpaddd zmm1,zmm1,zmm28 vpaddd zmm2,zmm2,zmm18 vpaddd zmm3,zmm3,zmm30 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm22 vpaddd zmm1,zmm1,zmm25 vpaddd zmm2,zmm2,zmm27 vpaddd zmm3,zmm3,zmm24 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm21 vpaddd zmm1,zmm1,zmm16 vpaddd zmm2,zmm2,zmm31 vpaddd zmm3,zmm3,zmm17 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord 
zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpaddd zmm0,zmm0,zmm26 vpaddd zmm1,zmm1,zmm28 vpaddd zmm2,zmm2,zmm30 vpaddd zmm3,zmm3,zmm29 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm23 vpaddd zmm1,zmm1,zmm25 vpaddd zmm2,zmm2,zmm19 vpaddd zmm3,zmm3,zmm31 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm20 vpaddd zmm1,zmm1,zmm27 vpaddd zmm2,zmm2,zmm21 vpaddd zmm3,zmm3,zmm17 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord 
zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm16 vpaddd zmm1,zmm1,zmm18 vpaddd zmm2,zmm2,zmm24 vpaddd zmm3,zmm3,zmm22 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpaddd zmm0,zmm0,zmm28 vpaddd zmm1,zmm1,zmm25 vpaddd zmm2,zmm2,zmm31 vpaddd zmm3,zmm3,zmm30 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm29 vpaddd zmm1,zmm1,zmm27 vpaddd zmm2,zmm2,zmm26 vpaddd zmm3,zmm3,zmm24 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd 
zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm23 vpaddd zmm1,zmm1,zmm21 vpaddd zmm2,zmm2,zmm16 vpaddd zmm3,zmm3,zmm22 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm18 vpaddd zmm1,zmm1,zmm19 vpaddd zmm2,zmm2,zmm17 vpaddd zmm3,zmm3,zmm20 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpaddd zmm0,zmm0,zmm25 vpaddd zmm1,zmm1,zmm27 vpaddd zmm2,zmm2,zmm24 vpaddd zmm3,zmm3,zmm31 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord 
zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm30 vpaddd zmm1,zmm1,zmm21 vpaddd zmm2,zmm2,zmm28 vpaddd zmm3,zmm3,zmm17 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm29 vpaddd zmm1,zmm1,zmm16 vpaddd zmm2,zmm2,zmm18 vpaddd zmm3,zmm3,zmm20 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm19 vpaddd zmm1,zmm1,zmm26 vpaddd zmm2,zmm2,zmm22 vpaddd zmm3,zmm3,zmm23 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord 
zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpaddd zmm0,zmm0,zmm27 vpaddd zmm1,zmm1,zmm21 vpaddd zmm2,zmm2,zmm17 vpaddd zmm3,zmm3,zmm24 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vprord zmm15,zmm15,0x10 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0xc vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc vpaddd zmm0,zmm0,zmm31 vpaddd zmm1,zmm1,zmm16 vpaddd zmm2,zmm2,zmm25 vpaddd zmm3,zmm3,zmm22 vpaddd zmm0,zmm0,zmm4 vpaddd zmm1,zmm1,zmm5 vpaddd zmm2,zmm2,zmm6 vpaddd zmm3,zmm3,zmm7 vpxord zmm12,zmm12,zmm0 vpxord zmm13,zmm13,zmm1 vpxord zmm14,zmm14,zmm2 vpxord zmm15,zmm15,zmm3 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vprord zmm15,zmm15,0x8 vpaddd zmm8,zmm8,zmm12 vpaddd zmm9,zmm9,zmm13 vpaddd zmm10,zmm10,zmm14 vpaddd zmm11,zmm11,zmm15 vpxord zmm4,zmm4,zmm8 vpxord zmm5,zmm5,zmm9 vpxord zmm6,zmm6,zmm10 vpxord zmm7,zmm7,zmm11 vprord zmm4,zmm4,0x7 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vpaddd zmm0,zmm0,zmm30 vpaddd zmm1,zmm1,zmm18 vpaddd zmm2,zmm2,zmm19 vpaddd zmm3,zmm3,zmm23 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x10 vprord zmm12,zmm12,0x10 vprord zmm13,zmm13,0x10 vprord zmm14,zmm14,0x10 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0xc vprord zmm6,zmm6,0xc vprord zmm7,zmm7,0xc 
vprord zmm4,zmm4,0xc vpaddd zmm0,zmm0,zmm26 vpaddd zmm1,zmm1,zmm28 vpaddd zmm2,zmm2,zmm20 vpaddd zmm3,zmm3,zmm29 vpaddd zmm0,zmm0,zmm5 vpaddd zmm1,zmm1,zmm6 vpaddd zmm2,zmm2,zmm7 vpaddd zmm3,zmm3,zmm4 vpxord zmm15,zmm15,zmm0 vpxord zmm12,zmm12,zmm1 vpxord zmm13,zmm13,zmm2 vpxord zmm14,zmm14,zmm3 vprord zmm15,zmm15,0x8 vprord zmm12,zmm12,0x8 vprord zmm13,zmm13,0x8 vprord zmm14,zmm14,0x8 vpaddd zmm10,zmm10,zmm15 vpaddd zmm11,zmm11,zmm12 vpaddd zmm8,zmm8,zmm13 vpaddd zmm9,zmm9,zmm14 vpxord zmm5,zmm5,zmm10 vpxord zmm6,zmm6,zmm11 vpxord zmm7,zmm7,zmm8 vpxord zmm4,zmm4,zmm9 vprord zmm5,zmm5,0x7 vprord zmm6,zmm6,0x7 vprord zmm7,zmm7,0x7 vprord zmm4,zmm4,0x7 vpxord zmm0,zmm0,zmm8 vpxord zmm1,zmm1,zmm9 vpxord zmm2,zmm2,zmm10 vpxord zmm3,zmm3,zmm11 vpxord zmm4,zmm4,zmm12 vpxord zmm5,zmm5,zmm13 vpxord zmm6,zmm6,zmm14 vpxord zmm7,zmm7,zmm15 vpxord zmm8,zmm8,DWORD PTR [rdi]{1to16} vpxord zmm9,zmm9,DWORD PTR [rdi+0x4]{1to16} vpxord zmm10,zmm10,DWORD PTR [rdi+0x8]{1to16} vpxord zmm11,zmm11,DWORD PTR [rdi+0xc]{1to16} vpxord zmm12,zmm12,DWORD PTR [rdi+0x10]{1to16} vpxord zmm13,zmm13,DWORD PTR [rdi+0x14]{1to16} vpxord zmm14,zmm14,DWORD PTR [rdi+0x18]{1to16} vpxord zmm15,zmm15,DWORD PTR [rdi+0x1c]{1to16} vpunpckldq zmm16,zmm0,zmm1 vpunpckhdq zmm17,zmm0,zmm1 vpunpckldq zmm18,zmm2,zmm3 vpunpckhdq zmm19,zmm2,zmm3 vpunpckldq zmm20,zmm4,zmm5 vpunpckhdq zmm21,zmm4,zmm5 vpunpckldq zmm22,zmm6,zmm7 vpunpckhdq zmm23,zmm6,zmm7 vpunpckldq zmm24,zmm8,zmm9 vpunpckhdq zmm25,zmm8,zmm9 vpunpckldq zmm26,zmm10,zmm11 vpunpckhdq zmm27,zmm10,zmm11 vpunpckldq zmm28,zmm12,zmm13 vpunpckhdq zmm29,zmm12,zmm13 vpunpckldq zmm30,zmm14,zmm15 vpunpckhdq zmm31,zmm14,zmm15 vpunpcklqdq zmm0,zmm16,zmm18 vpunpckhqdq zmm1,zmm16,zmm18 vpunpcklqdq zmm2,zmm17,zmm19 vpunpckhqdq zmm3,zmm17,zmm19 vpunpcklqdq zmm4,zmm20,zmm22 vpunpckhqdq zmm5,zmm20,zmm22 vpunpcklqdq zmm6,zmm21,zmm23 vpunpckhqdq zmm7,zmm21,zmm23 vpunpcklqdq zmm8,zmm24,zmm26 vpunpckhqdq zmm9,zmm24,zmm26 vpunpcklqdq zmm10,zmm25,zmm27 vpunpckhqdq zmm11,zmm25,zmm27 
vpunpcklqdq zmm12,zmm28,zmm30 vpunpckhqdq zmm13,zmm28,zmm30 vpunpcklqdq zmm14,zmm29,zmm31 vpunpckhqdq zmm15,zmm29,zmm31 vshufi32x4 zmm16,zmm0,zmm4,0x88 vshufi32x4 zmm17,zmm1,zmm5,0x88 vshufi32x4 zmm18,zmm2,zmm6,0x88 vshufi32x4 zmm19,zmm3,zmm7,0x88 vshufi32x4 zmm20,zmm0,zmm4,0xdd vshufi32x4 zmm21,zmm1,zmm5,0xdd vshufi32x4 zmm22,zmm2,zmm6,0xdd vshufi32x4 zmm23,zmm3,zmm7,0xdd vshufi32x4 zmm24,zmm8,zmm12,0x88 vshufi32x4 zmm25,zmm9,zmm13,0x88 vshufi32x4 zmm26,zmm10,zmm14,0x88 vshufi32x4 zmm27,zmm11,zmm15,0x88 vshufi32x4 zmm28,zmm8,zmm12,0xdd vshufi32x4 zmm29,zmm9,zmm13,0xdd vshufi32x4 zmm30,zmm10,zmm14,0xdd vshufi32x4 zmm31,zmm11,zmm15,0xdd vshufi32x4 zmm0,zmm16,zmm24,0x88 vshufi32x4 zmm1,zmm17,zmm25,0x88 vshufi32x4 zmm2,zmm18,zmm26,0x88 vshufi32x4 zmm3,zmm19,zmm27,0x88 vshufi32x4 zmm4,zmm20,zmm28,0x88 vshufi32x4 zmm5,zmm21,zmm29,0x88 vshufi32x4 zmm6,zmm22,zmm30,0x88 vshufi32x4 zmm7,zmm23,zmm31,0x88 vshufi32x4 zmm8,zmm16,zmm24,0xdd vshufi32x4 zmm9,zmm17,zmm25,0xdd vshufi32x4 zmm10,zmm18,zmm26,0xdd vshufi32x4 zmm11,zmm19,zmm27,0xdd vshufi32x4 zmm12,zmm20,zmm28,0xdd vshufi32x4 zmm13,zmm21,zmm29,0xdd vshufi32x4 zmm14,zmm22,zmm30,0xdd vshufi32x4 zmm15,zmm23,zmm31,0xdd vmovdqu32 ZMMWORD PTR [r9],zmm0 vmovdqu32 ZMMWORD PTR [r9+0x40],zmm1 vmovdqu32 ZMMWORD PTR [r9+0x80],zmm2 vmovdqu32 ZMMWORD PTR [r9+0xc0],zmm3 vmovdqu32 ZMMWORD PTR [r9+0x100],zmm4 vmovdqu32 ZMMWORD PTR [r9+0x140],zmm5 vmovdqu32 ZMMWORD PTR [r9+0x180],zmm6 vmovdqu32 ZMMWORD PTR [r9+0x1c0],zmm7 vmovdqu32 ZMMWORD PTR [r9+0x200],zmm8 vmovdqu32 ZMMWORD PTR [r9+0x240],zmm9 vmovdqu32 ZMMWORD PTR [r9+0x280],zmm10 vmovdqu32 ZMMWORD PTR [r9+0x2c0],zmm11 vmovdqu32 ZMMWORD PTR [r9+0x300],zmm12 vmovdqu32 ZMMWORD PTR [r9+0x340],zmm13 vmovdqu32 ZMMWORD PTR [r9+0x380],zmm14 vmovdqu32 ZMMWORD PTR [r9+0x3c0],zmm15 vmovdqa32 zmm0,ZMMWORD PTR [rsp] vmovdqa32 zmm1,ZMMWORD PTR [rsp+0x40] vpaddd zmm2,zmm0,DWORD PTR [ADD16+rip]{1to16} vpcmpltud k1,zmm2,zmm0 vpaddd zmm1{k1},zmm1,DWORD PTR [ADD1+rip]{1to16} vmovdqa32 ZMMWORD PTR 
[rsp],zmm2 vmovdqa32 ZMMWORD PTR [rsp+0x40],zmm1 add r9,0x400 sub r10,0x10 cmp r10,0x10 jae 3b test r10,r10 jne 2f 9: vzeroupper mov rsp,rbp pop rbp ret 2: test r10,0x8 je 2f vpbroadcastd ymm16,DWORD PTR [rsi] vpbroadcastd ymm17,DWORD PTR [rsi+0x4] vpbroadcastd ymm18,DWORD PTR [rsi+0x8] vpbroadcastd ymm19,DWORD PTR [rsi+0xc] vpbroadcastd ymm20,DWORD PTR [rsi+0x10] vpbroadcastd ymm21,DWORD PTR [rsi+0x14] vpbroadcastd ymm22,DWORD PTR [rsi+0x18] vpbroadcastd ymm23,DWORD PTR [rsi+0x1c] vpbroadcastd ymm24,DWORD PTR [rsi+0x20] vpbroadcastd ymm25,DWORD PTR [rsi+0x24] vpbroadcastd ymm26,DWORD PTR [rsi+0x28] vpbroadcastd ymm27,DWORD PTR [rsi+0x2c] vpbroadcastd ymm28,DWORD PTR [rsi+0x30] vpbroadcastd ymm29,DWORD PTR [rsi+0x34] vpbroadcastd ymm30,DWORD PTR [rsi+0x38] vpbroadcastd ymm31,DWORD PTR [rsi+0x3c] vpbroadcastd ymm0,DWORD PTR [rdi] vpbroadcastd ymm1,DWORD PTR [rdi+0x4] vpbroadcastd ymm2,DWORD PTR [rdi+0x8] vpbroadcastd ymm3,DWORD PTR [rdi+0xc] vpbroadcastd ymm4,DWORD PTR [rdi+0x10] vpbroadcastd ymm5,DWORD PTR [rdi+0x14] vpbroadcastd ymm6,DWORD PTR [rdi+0x18] vpbroadcastd ymm7,DWORD PTR [rdi+0x1c] vpbroadcastd ymm8,DWORD PTR [BLAKE3_IV_0+rip] vpbroadcastd ymm9,DWORD PTR [BLAKE3_IV_1+rip] vpbroadcastd ymm10,DWORD PTR [BLAKE3_IV_2+rip] vpbroadcastd ymm11,DWORD PTR [BLAKE3_IV_3+rip] vmovdqa ymm12,YMMWORD PTR [rsp] vmovdqa ymm13,YMMWORD PTR [rsp+0x40] vpbroadcastd ymm14,edx vpbroadcastd ymm15,r8d vpaddd ymm0,ymm0,ymm16 vpaddd ymm1,ymm1,ymm18 vpaddd ymm2,ymm2,ymm20 vpaddd ymm3,ymm3,ymm22 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord 
ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm17 vpaddd ymm1,ymm1,ymm19 vpaddd ymm2,ymm2,ymm21 vpaddd ymm3,ymm3,ymm23 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm24 vpaddd ymm1,ymm1,ymm26 vpaddd ymm2,ymm2,ymm28 vpaddd ymm3,ymm3,ymm30 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm25 vpaddd ymm1,ymm1,ymm27 vpaddd ymm2,ymm2,ymm29 vpaddd ymm3,ymm3,ymm31 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpaddd ymm0,ymm0,ymm18 
vpaddd ymm1,ymm1,ymm19 vpaddd ymm2,ymm2,ymm23 vpaddd ymm3,ymm3,ymm20 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm22 vpaddd ymm1,ymm1,ymm26 vpaddd ymm2,ymm2,ymm16 vpaddd ymm3,ymm3,ymm29 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm17 vpaddd ymm1,ymm1,ymm28 vpaddd ymm2,ymm2,ymm25 vpaddd ymm3,ymm3,ymm31 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm27 vpaddd ymm1,ymm1,ymm21 vpaddd ymm2,ymm2,ymm30 vpaddd ymm3,ymm3,ymm24 vpaddd 
ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpaddd ymm0,ymm0,ymm19 vpaddd ymm1,ymm1,ymm26 vpaddd ymm2,ymm2,ymm29 vpaddd ymm3,ymm3,ymm23 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm20 vpaddd ymm1,ymm1,ymm28 vpaddd ymm2,ymm2,ymm18 vpaddd ymm3,ymm3,ymm30 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm22 vpaddd ymm1,ymm1,ymm25 vpaddd ymm2,ymm2,ymm27 vpaddd ymm3,ymm3,ymm24 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 
vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm21 vpaddd ymm1,ymm1,ymm16 vpaddd ymm2,ymm2,ymm31 vpaddd ymm3,ymm3,ymm17 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpaddd ymm0,ymm0,ymm26 vpaddd ymm1,ymm1,ymm28 vpaddd ymm2,ymm2,ymm30 vpaddd ymm3,ymm3,ymm29 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm23 vpaddd ymm1,ymm1,ymm25 vpaddd ymm2,ymm2,ymm19 vpaddd ymm3,ymm3,ymm31 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 
vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm20 vpaddd ymm1,ymm1,ymm27 vpaddd ymm2,ymm2,ymm21 vpaddd ymm3,ymm3,ymm17 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm16 vpaddd ymm1,ymm1,ymm18 vpaddd ymm2,ymm2,ymm24 vpaddd ymm3,ymm3,ymm22 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpaddd ymm0,ymm0,ymm28 vpaddd ymm1,ymm1,ymm25 vpaddd ymm2,ymm2,ymm31 vpaddd ymm3,ymm3,ymm30 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord 
ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm29 vpaddd ymm1,ymm1,ymm27 vpaddd ymm2,ymm2,ymm26 vpaddd ymm3,ymm3,ymm24 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm23 vpaddd ymm1,ymm1,ymm21 vpaddd ymm2,ymm2,ymm16 vpaddd ymm3,ymm3,ymm22 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm18 vpaddd ymm1,ymm1,ymm19 vpaddd ymm2,ymm2,ymm17 vpaddd ymm3,ymm3,ymm20 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd 
ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpaddd ymm0,ymm0,ymm25 vpaddd ymm1,ymm1,ymm27 vpaddd ymm2,ymm2,ymm24 vpaddd ymm3,ymm3,ymm31 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm30 vpaddd ymm1,ymm1,ymm21 vpaddd ymm2,ymm2,ymm28 vpaddd ymm3,ymm3,ymm17 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm29 vpaddd ymm1,ymm1,ymm16 vpaddd ymm2,ymm2,ymm18 vpaddd ymm3,ymm3,ymm20 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord 
ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm19 vpaddd ymm1,ymm1,ymm26 vpaddd ymm2,ymm2,ymm22 vpaddd ymm3,ymm3,ymm23 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpaddd ymm0,ymm0,ymm27 vpaddd ymm1,ymm1,ymm21 vpaddd ymm2,ymm2,ymm17 vpaddd ymm3,ymm3,ymm24 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vprord ymm15,ymm15,0x10 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord ymm7,ymm7,ymm11 vprord ymm4,ymm4,0xc vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vpaddd ymm0,ymm0,ymm31 vpaddd ymm1,ymm1,ymm16 vpaddd ymm2,ymm2,ymm25 vpaddd ymm3,ymm3,ymm22 vpaddd ymm0,ymm0,ymm4 vpaddd ymm1,ymm1,ymm5 vpaddd ymm2,ymm2,ymm6 vpaddd ymm3,ymm3,ymm7 vpxord ymm12,ymm12,ymm0 vpxord ymm13,ymm13,ymm1 vpxord ymm14,ymm14,ymm2 vpxord ymm15,ymm15,ymm3 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vprord ymm15,ymm15,0x8 vpaddd ymm8,ymm8,ymm12 vpaddd ymm9,ymm9,ymm13 vpaddd ymm10,ymm10,ymm14 vpaddd ymm11,ymm11,ymm15 vpxord ymm4,ymm4,ymm8 vpxord ymm5,ymm5,ymm9 vpxord ymm6,ymm6,ymm10 vpxord 
ymm7,ymm7,ymm11 vprord ymm4,ymm4,0x7 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vpaddd ymm0,ymm0,ymm30 vpaddd ymm1,ymm1,ymm18 vpaddd ymm2,ymm2,ymm19 vpaddd ymm3,ymm3,ymm23 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x10 vprord ymm12,ymm12,0x10 vprord ymm13,ymm13,0x10 vprord ymm14,ymm14,0x10 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0xc vprord ymm6,ymm6,0xc vprord ymm7,ymm7,0xc vprord ymm4,ymm4,0xc vpaddd ymm0,ymm0,ymm26 vpaddd ymm1,ymm1,ymm28 vpaddd ymm2,ymm2,ymm20 vpaddd ymm3,ymm3,ymm29 vpaddd ymm0,ymm0,ymm5 vpaddd ymm1,ymm1,ymm6 vpaddd ymm2,ymm2,ymm7 vpaddd ymm3,ymm3,ymm4 vpxord ymm15,ymm15,ymm0 vpxord ymm12,ymm12,ymm1 vpxord ymm13,ymm13,ymm2 vpxord ymm14,ymm14,ymm3 vprord ymm15,ymm15,0x8 vprord ymm12,ymm12,0x8 vprord ymm13,ymm13,0x8 vprord ymm14,ymm14,0x8 vpaddd ymm10,ymm10,ymm15 vpaddd ymm11,ymm11,ymm12 vpaddd ymm8,ymm8,ymm13 vpaddd ymm9,ymm9,ymm14 vpxord ymm5,ymm5,ymm10 vpxord ymm6,ymm6,ymm11 vpxord ymm7,ymm7,ymm8 vpxord ymm4,ymm4,ymm9 vprord ymm5,ymm5,0x7 vprord ymm6,ymm6,0x7 vprord ymm7,ymm7,0x7 vprord ymm4,ymm4,0x7 vpxor ymm0,ymm0,ymm8 vpxor ymm1,ymm1,ymm9 vpxor ymm2,ymm2,ymm10 vpxor ymm3,ymm3,ymm11 vpxor ymm4,ymm4,ymm12 vpxor ymm5,ymm5,ymm13 vpxor ymm6,ymm6,ymm14 vpxor ymm7,ymm7,ymm15 vpxord ymm8,ymm8,DWORD PTR [rdi]{1to8} vpxord ymm9,ymm9,DWORD PTR [rdi+0x4]{1to8} vpxord ymm10,ymm10,DWORD PTR [rdi+0x8]{1to8} vpxord ymm11,ymm11,DWORD PTR [rdi+0xc]{1to8} vpxord ymm12,ymm12,DWORD PTR [rdi+0x10]{1to8} vpxord ymm13,ymm13,DWORD PTR [rdi+0x14]{1to8} vpxord ymm14,ymm14,DWORD PTR [rdi+0x18]{1to8} vpxord ymm15,ymm15,DWORD PTR [rdi+0x1c]{1to8} vpunpckldq ymm16,ymm0,ymm1 vpunpckhdq ymm17,ymm0,ymm1 vpunpckldq ymm18,ymm2,ymm3 
vpunpckhdq ymm19,ymm2,ymm3 vpunpckldq ymm20,ymm4,ymm5 vpunpckhdq ymm21,ymm4,ymm5 vpunpckldq ymm22,ymm6,ymm7 vpunpckhdq ymm23,ymm6,ymm7 vpunpckldq ymm24,ymm8,ymm9 vpunpckhdq ymm25,ymm8,ymm9 vpunpckldq ymm26,ymm10,ymm11 vpunpckhdq ymm27,ymm10,ymm11 vpunpckldq ymm28,ymm12,ymm13 vpunpckhdq ymm29,ymm12,ymm13 vpunpckldq ymm30,ymm14,ymm15 vpunpckhdq ymm31,ymm14,ymm15 vpunpcklqdq ymm0,ymm16,ymm18 vpunpckhqdq ymm1,ymm16,ymm18 vpunpcklqdq ymm2,ymm17,ymm19 vpunpckhqdq ymm3,ymm17,ymm19 vpunpcklqdq ymm4,ymm20,ymm22 vpunpckhqdq ymm5,ymm20,ymm22 vpunpcklqdq ymm6,ymm21,ymm23 vpunpckhqdq ymm7,ymm21,ymm23 vpunpcklqdq ymm8,ymm24,ymm26 vpunpckhqdq ymm9,ymm24,ymm26 vpunpcklqdq ymm10,ymm25,ymm27 vpunpckhqdq ymm11,ymm25,ymm27 vpunpcklqdq ymm12,ymm28,ymm30 vpunpckhqdq ymm13,ymm28,ymm30 vpunpcklqdq ymm14,ymm29,ymm31 vpunpckhqdq ymm15,ymm29,ymm31 vshufi32x4 ymm16,ymm0,ymm4,0x0 vshufi32x4 ymm17,ymm8,ymm12,0x0 vshufi32x4 ymm18,ymm1,ymm5,0x0 vshufi32x4 ymm19,ymm9,ymm13,0x0 vshufi32x4 ymm20,ymm2,ymm6,0x0 vshufi32x4 ymm21,ymm10,ymm14,0x0 vshufi32x4 ymm22,ymm3,ymm7,0x0 vshufi32x4 ymm23,ymm11,ymm15,0x0 vshufi32x4 ymm24,ymm0,ymm4,0x3 vshufi32x4 ymm25,ymm8,ymm12,0x3 vshufi32x4 ymm26,ymm1,ymm5,0x3 vshufi32x4 ymm27,ymm9,ymm13,0x3 vshufi32x4 ymm28,ymm2,ymm6,0x3 vshufi32x4 ymm29,ymm10,ymm14,0x3 vshufi32x4 ymm30,ymm3,ymm7,0x3 vshufi32x4 ymm31,ymm11,ymm15,0x3 vmovdqu32 YMMWORD PTR [r9],ymm16 vmovdqu32 YMMWORD PTR [r9+0x20],ymm17 vmovdqu32 YMMWORD PTR [r9+0x40],ymm18 vmovdqu32 YMMWORD PTR [r9+0x60],ymm19 vmovdqu32 YMMWORD PTR [r9+0x80],ymm20 vmovdqu32 YMMWORD PTR [r9+0xa0],ymm21 vmovdqu32 YMMWORD PTR [r9+0xc0],ymm22 vmovdqu32 YMMWORD PTR [r9+0xe0],ymm23 vmovdqu32 YMMWORD PTR [r9+0x100],ymm24 vmovdqu32 YMMWORD PTR [r9+0x120],ymm25 vmovdqu32 YMMWORD PTR [r9+0x140],ymm26 vmovdqu32 YMMWORD PTR [r9+0x160],ymm27 vmovdqu32 YMMWORD PTR [r9+0x180],ymm28 vmovdqu32 YMMWORD PTR [r9+0x1a0],ymm29 vmovdqu32 YMMWORD PTR [r9+0x1c0],ymm30 vmovdqu32 YMMWORD PTR [r9+0x1e0],ymm31 vmovdqa ymm0,YMMWORD PTR [rsp+0x20] vmovdqa 
ymm1,YMMWORD PTR [rsp+0x60] vmovdqa YMMWORD PTR [rsp],ymm0 vmovdqa YMMWORD PTR [rsp+0x40],ymm1 add r9,0x200 sub r10,0x8 2: test r10,0x4 je 2f vbroadcasti32x4 zmm0,XMMWORD PTR [rdi] vbroadcasti32x4 zmm1,XMMWORD PTR [rdi+0x10] vbroadcasti32x4 zmm2,XMMWORD PTR [BLAKE3_IV+rip] vmovdqa xmm12,XMMWORD PTR [rsp] vmovdqa xmm13,XMMWORD PTR [rsp+0x40] vpunpckldq xmm14,xmm12,xmm13 vpunpckhdq xmm15,xmm12,xmm13 vpermq ymm14,ymm14,0xdc vpermq ymm15,ymm15,0xdc vpbroadcastd zmm12,edx vinserti64x4 zmm13,zmm14,ymm15,0x1 mov eax,0x4444 kmovw k2,eax vpblendmd zmm13{k2},zmm13,zmm12 vpbroadcastd zmm15,r8d mov eax,0x8888 kmovw k4,eax vpblendmd zmm3{k4},zmm13,zmm15 mov eax,0xaaaa kmovw k3,eax vbroadcasti32x4 zmm8,XMMWORD PTR [rsi] vbroadcasti32x4 zmm9,XMMWORD PTR [rsi+0x10] vshufps zmm4,zmm8,zmm9,0x88 vshufps zmm5,zmm8,zmm9,0xdd vbroadcasti32x4 zmm8,XMMWORD PTR [rsi+0x20] vbroadcasti32x4 zmm9,XMMWORD PTR [rsi+0x30] vshufps zmm6,zmm8,zmm9,0x88 vshufps zmm7,zmm8,zmm9,0xdd vpshufd zmm6,zmm6,0x93 vpshufd zmm7,zmm7,0x93 mov al,0x7 3: vpaddd zmm0,zmm0,zmm4 vpaddd zmm0,zmm0,zmm1 vpxord zmm3,zmm3,zmm0 vprord zmm3,zmm3,0x10 vpaddd zmm2,zmm2,zmm3 vpxord zmm1,zmm1,zmm2 vprord zmm1,zmm1,0xc vpaddd zmm0,zmm0,zmm5 vpaddd zmm0,zmm0,zmm1 vpxord zmm3,zmm3,zmm0 vprord zmm3,zmm3,0x8 vpaddd zmm2,zmm2,zmm3 vpxord zmm1,zmm1,zmm2 vprord zmm1,zmm1,0x7 vpshufd zmm0,zmm0,0x93 vpshufd zmm3,zmm3,0x4e vpshufd zmm2,zmm2,0x39 vpaddd zmm0,zmm0,zmm6 vpaddd zmm0,zmm0,zmm1 vpxord zmm3,zmm3,zmm0 vprord zmm3,zmm3,0x10 vpaddd zmm2,zmm2,zmm3 vpxord zmm1,zmm1,zmm2 vprord zmm1,zmm1,0xc vpaddd zmm0,zmm0,zmm7 vpaddd zmm0,zmm0,zmm1 vpxord zmm3,zmm3,zmm0 vprord zmm3,zmm3,0x8 vpaddd zmm2,zmm2,zmm3 vpxord zmm1,zmm1,zmm2 vprord zmm1,zmm1,0x7 vpshufd zmm0,zmm0,0x39 vpshufd zmm3,zmm3,0x4e vpshufd zmm2,zmm2,0x93 dec al je 3f vshufps zmm8,zmm4,zmm5,0xd6 vpshufd zmm9,zmm4,0xf vpshufd zmm4,zmm8,0x39 vshufps zmm8,zmm6,zmm7,0xfa vpblendmd zmm9{k3},zmm9,zmm8 vpunpcklqdq zmm8,zmm7,zmm5 vpblendmd zmm8{k4},zmm8,zmm6 vpshufd zmm8,zmm8,0x78 
vpunpckhdq zmm5,zmm5,zmm7 vpunpckldq zmm6,zmm6,zmm5 vpshufd zmm7,zmm6,0x1e vmovdqa32 zmm5,zmm9 vmovdqa32 zmm6,zmm8 jmp 3b 3: vpxord zmm0,zmm0,zmm2 vpxord zmm1,zmm1,zmm3 vbroadcasti32x4 zmm8,XMMWORD PTR [rdi] vbroadcasti32x4 zmm9,XMMWORD PTR [rdi+0x10] vpxord zmm2,zmm2,zmm8 vpxord zmm3,zmm3,zmm9 vmovdqu XMMWORD PTR [r9],xmm0 vmovdqu XMMWORD PTR [r9+0x10],xmm1 vmovdqu XMMWORD PTR [r9+0x20],xmm2 vmovdqu XMMWORD PTR [r9+0x30],xmm3 vextracti128 XMMWORD PTR [r9+0x40],ymm0,0x1 vextracti128 XMMWORD PTR [r9+0x50],ymm1,0x1 vextracti128 XMMWORD PTR [r9+0x60],ymm2,0x1 vextracti128 XMMWORD PTR [r9+0x70],ymm3,0x1 vextracti32x4 XMMWORD PTR [r9+0x80],zmm0,0x2 vextracti32x4 XMMWORD PTR [r9+0x90],zmm1,0x2 vextracti32x4 XMMWORD PTR [r9+0xa0],zmm2,0x2 vextracti32x4 XMMWORD PTR [r9+0xb0],zmm3,0x2 vextracti32x4 XMMWORD PTR [r9+0xc0],zmm0,0x3 vextracti32x4 XMMWORD PTR [r9+0xd0],zmm1,0x3 vextracti32x4 XMMWORD PTR [r9+0xe0],zmm2,0x3 vextracti32x4 XMMWORD PTR [r9+0xf0],zmm3,0x3 vmovdqa xmm0,XMMWORD PTR [rsp+0x10] vmovdqa xmm1,XMMWORD PTR [rsp+0x50] vmovdqa XMMWORD PTR [rsp],xmm0 vmovdqa XMMWORD PTR [rsp+0x40],xmm1 add r9,0x100 sub r10,0x4 2: test r10,0x2 je 2f vbroadcasti128 ymm0,XMMWORD PTR [rdi] vbroadcasti128 ymm1,XMMWORD PTR [rdi+0x10] vmovd xmm13,DWORD PTR [rsp] vpinsrd xmm13,xmm13,DWORD PTR [rsp+0x40],0x1 vpinsrd xmm13,xmm13,edx,0x2 vmovd xmm14,DWORD PTR [rsp+0x4] vpinsrd xmm14,xmm14,DWORD PTR [rsp+0x44],0x1 vpinsrd xmm14,xmm14,edx,0x2 vinserti128 ymm13,ymm13,xmm14,0x1 vbroadcasti128 ymm2,XMMWORD PTR [BLAKE3_IV+rip] vpbroadcastd ymm8,r8d vpblendd ymm3,ymm13,ymm8,0x88 vbroadcasti128 ymm8,XMMWORD PTR [rsi] vbroadcasti128 ymm9,XMMWORD PTR [rsi+0x10] vshufps ymm4,ymm8,ymm9,0x88 vshufps ymm5,ymm8,ymm9,0xdd vbroadcasti128 ymm8,XMMWORD PTR [rsi+0x20] vbroadcasti128 ymm9,XMMWORD PTR [rsi+0x30] vshufps ymm6,ymm8,ymm9,0x88 vshufps ymm7,ymm8,ymm9,0xdd vpshufd ymm6,ymm6,0x93 vpshufd ymm7,ymm7,0x93 mov al,0x7 3: vpaddd ymm0,ymm0,ymm4 vpaddd ymm0,ymm0,ymm1 vpxord ymm3,ymm3,ymm0 vprord 
ymm3,ymm3,0x10 vpaddd ymm2,ymm2,ymm3 vpxord ymm1,ymm1,ymm2 vprord ymm1,ymm1,0xc vpaddd ymm0,ymm0,ymm5 vpaddd ymm0,ymm0,ymm1 vpxord ymm3,ymm3,ymm0 vprord ymm3,ymm3,0x8 vpaddd ymm2,ymm2,ymm3 vpxord ymm1,ymm1,ymm2 vprord ymm1,ymm1,0x7 vpshufd ymm0,ymm0,0x93 vpshufd ymm3,ymm3,0x4e vpshufd ymm2,ymm2,0x39 vpaddd ymm0,ymm0,ymm6 vpaddd ymm0,ymm0,ymm1 vpxord ymm3,ymm3,ymm0 vprord ymm3,ymm3,0x10 vpaddd ymm2,ymm2,ymm3 vpxord ymm1,ymm1,ymm2 vprord ymm1,ymm1,0xc vpaddd ymm0,ymm0,ymm7 vpaddd ymm0,ymm0,ymm1 vpxord ymm3,ymm3,ymm0 vprord ymm3,ymm3,0x8 vpaddd ymm2,ymm2,ymm3 vpxord ymm1,ymm1,ymm2 vprord ymm1,ymm1,0x7 vpshufd ymm0,ymm0,0x39 vpshufd ymm3,ymm3,0x4e vpshufd ymm2,ymm2,0x93 dec al je 3f vshufps ymm8,ymm4,ymm5,0xd6 vpshufd ymm9,ymm4,0xf vpshufd ymm4,ymm8,0x39 vshufps ymm8,ymm6,ymm7,0xfa vpblendd ymm9,ymm9,ymm8,0xaa vpunpcklqdq ymm8,ymm7,ymm5 vpblendd ymm8,ymm8,ymm6,0x88 vpshufd ymm8,ymm8,0x78 vpunpckhdq ymm5,ymm5,ymm7 vpunpckldq ymm6,ymm6,ymm5 vpshufd ymm7,ymm6,0x1e vmovdqa ymm5,ymm9 vmovdqa ymm6,ymm8 jmp 3b 3: vpxor ymm0,ymm0,ymm2 vpxor ymm1,ymm1,ymm3 vbroadcasti128 ymm8,XMMWORD PTR [rdi] vbroadcasti128 ymm9,XMMWORD PTR [rdi+0x10] vpxor ymm2,ymm2,ymm8 vpxor ymm3,ymm3,ymm9 vmovdqu XMMWORD PTR [r9],xmm0 vmovdqu XMMWORD PTR [r9+0x10],xmm1 vmovdqu XMMWORD PTR [r9+0x20],xmm2 vmovdqu XMMWORD PTR [r9+0x30],xmm3 vextracti128 XMMWORD PTR [r9+0x40],ymm0,0x1 vextracti128 XMMWORD PTR [r9+0x50],ymm1,0x1 vextracti128 XMMWORD PTR [r9+0x60],ymm2,0x1 vextracti128 XMMWORD PTR [r9+0x70],ymm3,0x1 vmovdqu xmm0,XMMWORD PTR [rsp+0x8] vmovdqu xmm1,XMMWORD PTR [rsp+0x48] vmovdqa XMMWORD PTR [rsp],xmm0 vmovdqa XMMWORD PTR [rsp+0x40],xmm1 add r9,0x80 sub r10,0x2 2: test r10,0x1 je 9b vmovdqu xmm0,XMMWORD PTR [rdi] vmovdqu xmm1,XMMWORD PTR [rdi+0x10] vmovd xmm14,DWORD PTR [rsp] vpinsrd xmm14,xmm14,DWORD PTR [rsp+0x40],0x1 vpinsrd xmm14,xmm14,edx,0x2 vmovdqa xmm2,XMMWORD PTR [BLAKE3_IV+rip] vpinsrd xmm3,xmm14,r8d,0x3 vmovups xmm8,XMMWORD PTR [rsi] vmovups xmm9,XMMWORD PTR [rsi+0x10] vshufps 
xmm4,xmm8,xmm9,0x88 vshufps xmm5,xmm8,xmm9,0xdd vmovups xmm8,XMMWORD PTR [rsi+0x20] vmovups xmm9,XMMWORD PTR [rsi+0x30] vshufps xmm6,xmm8,xmm9,0x88 vshufps xmm7,xmm8,xmm9,0xdd vpshufd xmm6,xmm6,0x93 vpshufd xmm7,xmm7,0x93 mov al,0x7 3: vpaddd xmm0,xmm0,xmm4 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x10 vpaddd xmm2,xmm2,xmm3 vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0xc vpaddd xmm0,xmm0,xmm5 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x8 vpaddd xmm2,xmm2,xmm3 vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0x7 vpshufd xmm0,xmm0,0x93 vpshufd xmm3,xmm3,0x4e vpshufd xmm2,xmm2,0x39 vpaddd xmm0,xmm0,xmm6 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x10 vpaddd xmm2,xmm2,xmm3 vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0xc vpaddd xmm0,xmm0,xmm7 vpaddd xmm0,xmm0,xmm1 vpxord xmm3,xmm3,xmm0 vprord xmm3,xmm3,0x8 vpaddd xmm2,xmm2,xmm3 vpxord xmm1,xmm1,xmm2 vprord xmm1,xmm1,0x7 vpshufd xmm0,xmm0,0x39 vpshufd xmm3,xmm3,0x4e vpshufd xmm2,xmm2,0x93 dec al je 3f vshufps xmm8,xmm4,xmm5,0xd6 vpshufd xmm9,xmm4,0xf vpshufd xmm4,xmm8,0x39 vshufps xmm8,xmm6,xmm7,0xfa vpblendd xmm9,xmm9,xmm8,0xaa vpunpcklqdq xmm8,xmm7,xmm5 vpblendd xmm8,xmm8,xmm6,0x88 vpshufd xmm8,xmm8,0x78 vpunpckhdq xmm5,xmm5,xmm7 vpunpckldq xmm6,xmm6,xmm5 vpshufd xmm7,xmm6,0x1e vmovdqa xmm5,xmm9 vmovdqa xmm6,xmm8 jmp 3b 3: vpxor xmm0,xmm0,xmm2 vpxor xmm1,xmm1,xmm3 vpxor xmm2,xmm2,XMMWORD PTR [rdi] vpxor xmm3,xmm3,XMMWORD PTR [rdi+0x10] vmovdqu XMMWORD PTR [r9],xmm0 vmovdqu XMMWORD PTR [r9+0x10],xmm1 vmovdqu XMMWORD PTR [r9+0x20],xmm2 vmovdqu XMMWORD PTR [r9+0x30],xmm3 jmp 9b #ifdef __APPLE__ .static_data #else .section .rodata #endif .p2align 6 INDEX0: .long 0, 1, 2, 3, 16, 17, 18, 19 .long 8, 9, 10, 11, 24, 25, 26, 27 INDEX1: .long 4, 5, 6, 7, 20, 21, 22, 23 .long 12, 13, 14, 15, 28, 29, 30, 31 ADD0: .long 0, 1, 2, 3, 4, 5, 6, 7 .long 8, 9, 10, 11, 12, 13, 14, 15 ADD1: .long 1 ADD16: .long 16 BLAKE3_BLOCK_LEN: .long 64 .p2align 6 BLAKE3_IV: BLAKE3_IV_0: .long 0x6A09E667 
BLAKE3_IV_1: .long 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A blake3-1.8.1/c/blake3_avx512_x86-64_windows_gnu.S000064400000000000000000002615731046102023000172060ustar 00000000000000.intel_syntax noprefix .global _blake3_hash_many_avx512 .global blake3_hash_many_avx512 .global blake3_compress_in_place_avx512 .global _blake3_compress_in_place_avx512 .global blake3_compress_xof_avx512 .global _blake3_compress_xof_avx512 .section .text .p2align 6 _blake3_hash_many_avx512: blake3_hash_many_avx512: push r15 push r14 push r13 push r12 push rdi push rsi push rbx push rbp mov rbp, rsp sub rsp, 304 and rsp, 0xFFFFFFFFFFFFFFC0 vmovdqa xmmword ptr [rsp+0x90], xmm6 vmovdqa xmmword ptr [rsp+0xA0], xmm7 vmovdqa xmmword ptr [rsp+0xB0], xmm8 vmovdqa xmmword ptr [rsp+0xC0], xmm9 vmovdqa xmmword ptr [rsp+0xD0], xmm10 vmovdqa xmmword ptr [rsp+0xE0], xmm11 vmovdqa xmmword ptr [rsp+0xF0], xmm12 vmovdqa xmmword ptr [rsp+0x100], xmm13 vmovdqa xmmword ptr [rsp+0x110], xmm14 vmovdqa xmmword ptr [rsp+0x120], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+0x68] movzx r9, byte ptr [rbp+0x70] neg r9 kmovw k1, r9d vmovd xmm0, r8d vpbroadcastd ymm0, xmm0 shr r8, 32 vmovd xmm1, r8d vpbroadcastd ymm1, xmm1 vmovdqa ymm4, ymm1 vmovdqa ymm5, ymm1 vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] vpcmpltud k2, ymm2, ymm0 vpcmpltud k3, ymm3, ymm0 vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} knotw k2, k1 vmovdqa32 ymm2 {k2}, ymm0 vmovdqa32 ymm3 {k2}, ymm0 vmovdqa32 ymm4 {k2}, ymm1 vmovdqa32 ymm5 {k2}, ymm1 vmovdqa ymmword ptr [rsp], ymm2 vmovdqa ymmword ptr [rsp+0x20], ymm3 vmovdqa ymmword ptr [rsp+0x40], ymm4 vmovdqa ymmword ptr [rsp+0x60], ymm5 shl rdx, 6 mov qword ptr [rsp+0x80], rdx cmp rsi, 16 jc 3f 2: vpbroadcastd zmm0, dword ptr [rcx] vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] vpbroadcastd zmm3, dword 
ptr [rcx+0x3*0x4] vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] movzx eax, byte ptr [rbp+0x78] movzx ebx, byte ptr [rbp+0x80] or eax, ebx xor edx, edx .p2align 5 9: movzx ebx, byte ptr [rbp+0x88] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+0x80] cmove eax, ebx mov dword ptr [rsp+0x88], eax mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x40] mov r13, qword ptr [rdi+0x48] mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm8, zmm16, zmm17 vpunpckhqdq zmm9, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm10, zmm18, zmm19 vpunpckhqdq zmm11, zmm18, zmm19 mov r8, qword ptr [rdi+0x20] mov r9, qword ptr [rdi+0x28] mov r10, qword ptr [rdi+0x30] mov r11, qword ptr [rdi+0x38] mov r12, qword ptr [rdi+0x60] mov r13, qword ptr [rdi+0x68] mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm12, zmm16, zmm17 vpunpckhqdq zmm13, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm14, zmm18, 
zmm19 vpunpckhqdq zmm15, zmm18, zmm19 vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] vshufps zmm16, zmm8, zmm10, 136 vshufps zmm17, zmm12, zmm14, 136 vmovdqa32 zmm20, zmm16 vpermt2d zmm16, zmm27, zmm17 vpermt2d zmm20, zmm31, zmm17 vshufps zmm17, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm21, zmm17 vpermt2d zmm17, zmm27, zmm30 vpermt2d zmm21, zmm31, zmm30 vshufps zmm18, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm22, zmm18 vpermt2d zmm18, zmm27, zmm8 vpermt2d zmm22, zmm31, zmm8 vshufps zmm19, zmm9, zmm11, 221 vshufps zmm8, zmm13, zmm15, 221 vmovdqa32 zmm23, zmm19 vpermt2d zmm19, zmm27, zmm8 vpermt2d zmm23, zmm31, zmm8 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x40] mov r13, qword ptr [rdi+0x48] mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm8, zmm24, zmm25 vpunpckhqdq zmm9, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm10, zmm24, zmm25 vpunpckhqdq zmm11, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] mov r8, qword ptr [rdi+0x20] mov r9, qword ptr [rdi+0x28] mov r10, qword ptr [rdi+0x30] mov r11, qword ptr [rdi+0x38] mov r12, qword ptr [rdi+0x60] mov r13, qword ptr [rdi+0x68] mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] 
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm12, zmm24, zmm25 vpunpckhqdq zmm13, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm14, zmm24, zmm25 vpunpckhqdq zmm15, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] vshufps zmm24, zmm8, zmm10, 136 vshufps zmm30, zmm12, zmm14, 136 vmovdqa32 zmm28, zmm24 vpermt2d zmm24, zmm27, zmm30 vpermt2d zmm28, zmm31, zmm30 vshufps zmm25, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm29, zmm25 vpermt2d zmm25, zmm27, zmm30 vpermt2d zmm29, zmm31, zmm30 vshufps zmm26, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm30, zmm26 vpermt2d zmm26, zmm27, zmm8 vpermt2d zmm30, zmm31, zmm8 vshufps zmm8, zmm9, zmm11, 221 vshufps zmm10, zmm13, zmm15, 221 vpermi2d zmm27, zmm8, zmm10 vpermi2d zmm31, zmm8, zmm10 vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] vmovdqa32 zmm12, zmmword ptr [rsp] vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 
16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm24 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm25 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, 
zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm23 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 
vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 
vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm27 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm21 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 
vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm24 vpaddd zmm3, zmm3, 
zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm28 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm26 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm21 
vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm25 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm24 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 
vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm22 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 
vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm31 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, 
zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpxord zmm0, zmm0, zmm8 vpxord zmm1, zmm1, zmm9 vpxord zmm2, zmm2, zmm10 vpxord zmm3, zmm3, zmm11 vpxord zmm4, zmm4, zmm12 vpxord zmm5, zmm5, zmm13 vpxord zmm6, zmm6, zmm14 vpxord zmm7, zmm7, zmm15 movzx eax, byte ptr [rbp+0x78] jne 9b mov rbx, qword ptr [rbp+0x90] vpunpckldq zmm16, zmm0, zmm1 vpunpckhdq zmm17, zmm0, zmm1 vpunpckldq zmm18, zmm2, zmm3 vpunpckhdq zmm19, zmm2, zmm3 vpunpckldq zmm20, zmm4, zmm5 vpunpckhdq zmm21, zmm4, zmm5 vpunpckldq zmm22, zmm6, zmm7 vpunpckhdq zmm23, zmm6, zmm7 vpunpcklqdq zmm0, zmm16, zmm18 vpunpckhqdq zmm1, zmm16, zmm18 vpunpcklqdq zmm2, zmm17, zmm19 vpunpckhqdq zmm3, zmm17, zmm19 vpunpcklqdq zmm4, zmm20, zmm22 vpunpckhqdq zmm5, zmm20, zmm22 vpunpcklqdq zmm6, zmm21, zmm23 vpunpckhqdq zmm7, zmm21, zmm23 vshufi32x4 zmm16, zmm0, zmm4, 0x88 vshufi32x4 zmm17, zmm1, zmm5, 0x88 vshufi32x4 zmm18, zmm2, zmm6, 0x88 vshufi32x4 zmm19, zmm3, zmm7, 0x88 vshufi32x4 zmm20, zmm0, zmm4, 0xDD vshufi32x4 zmm21, zmm1, zmm5, 0xDD vshufi32x4 zmm22, zmm2, zmm6, 0xDD vshufi32x4 zmm23, zmm3, zmm7, 0xDD vshufi32x4 zmm0, zmm16, zmm17, 0x88 vshufi32x4 zmm1, zmm18, zmm19, 0x88 vshufi32x4 zmm2, zmm20, zmm21, 0x88 vshufi32x4 zmm3, zmm22, zmm23, 0x88 vshufi32x4 zmm4, zmm16, zmm17, 0xDD vshufi32x4 zmm5, 
zmm18, zmm19, 0xDD vshufi32x4 zmm6, zmm20, zmm21, 0xDD vshufi32x4 zmm7, zmm22, zmm23, 0xDD vmovdqu32 zmmword ptr [rbx], zmm0 vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 vmovdqa32 zmm0, zmmword ptr [rsp] vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] vmovdqa32 zmm2, zmm0 vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} vpcmpltud k2, zmm2, zmm0 vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} vmovdqa32 zmmword ptr [rsp], zmm2 vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 add rdi, 128 add rbx, 512 mov qword ptr [rbp+0x90], rbx sub rsi, 16 cmp rsi, 16 jnc 2b test rsi, rsi jne 3f 4: vzeroupper vmovdqa xmm6, xmmword ptr [rsp+0x90] vmovdqa xmm7, xmmword ptr [rsp+0xA0] vmovdqa xmm8, xmmword ptr [rsp+0xB0] vmovdqa xmm9, xmmword ptr [rsp+0xC0] vmovdqa xmm10, xmmword ptr [rsp+0xD0] vmovdqa xmm11, xmmword ptr [rsp+0xE0] vmovdqa xmm12, xmmword ptr [rsp+0xF0] vmovdqa xmm13, xmmword ptr [rsp+0x100] vmovdqa xmm14, xmmword ptr [rsp+0x110] vmovdqa xmm15, xmmword ptr [rsp+0x120] mov rsp, rbp pop rbp pop rbx pop rsi pop rdi pop r12 pop r13 pop r14 pop r15 ret .p2align 6 3: test esi, 0x8 je 3f vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+0x4] vpbroadcastd ymm2, dword ptr [rcx+0x8] vpbroadcastd ymm3, dword ptr [rcx+0xC] vpbroadcastd ymm4, dword ptr [rcx+0x10] vpbroadcastd ymm5, dword ptr [rcx+0x14] vpbroadcastd ymm6, dword ptr [rcx+0x18] vpbroadcastd ymm7, dword ptr [rcx+0x1C] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x20] mov r13, qword ptr [rdi+0x28] mov r14, qword ptr [rdi+0x30] mov r15, qword ptr [rdi+0x38] movzx eax, byte ptr [rbp+0x78] movzx ebx, byte ptr [rbp+0x80] or eax, ebx xor edx, edx 2: movzx ebx, byte 
ptr [rbp+0x88] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+0x80] cmove eax, ebx mov dword ptr [rsp+0x88], eax vmovups xmm8, xmmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x40] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x40] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm16, ymm12, ymm14, 136 vshufps ymm17, ymm12, ymm14, 221 vshufps ymm18, ymm13, ymm15, 136 vshufps ymm19, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x30] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x30] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm20, ymm12, ymm14, 136 vshufps ymm21, ymm12, ymm14, 221 vshufps ymm22, ymm13, ymm15, 136 vshufps ymm23, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x20] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x20] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm24, ymm12, ymm14, 136 vshufps ymm25, ymm12, ymm14, 221 vshufps ymm26, 
ymm13, ymm15, 136 vshufps ymm27, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x10] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x10] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm28, ymm12, ymm14, 136 vshufps ymm29, ymm12, ymm14, 221 vshufps ymm30, ymm13, ymm15, 136 vshufps ymm31, ymm13, ymm15, 221 vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] vmovdqa ymm12, ymmword ptr [rsp] vmovdqa ymm13, ymmword ptr [rsp+0x40] vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] vpbroadcastd ymm15, dword ptr [rsp+0x88] vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, 
ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm24 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm23 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, 
ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, 
ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm27 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 
vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm21 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm31 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, 
ymm19 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm28 
vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm31 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm26 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 
12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, 
ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm22 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, 
ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm31 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm19 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, 
ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+0x78] jne 2b mov rbx, qword ptr [rbp+0x90] vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0xCC vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0xCC vblendps ymm3, ymm12, ymm9, 0xCC vperm2f128 ymm12, ymm1, ymm2, 0x20 vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0xCC vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 0x20 vmovups ymmword ptr [rbx+0x20], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0xCC vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 0xCC vblendps ymm14, ymm14, ymm13, 0xCC vperm2f128 ymm8, ymm10, ymm14, 0x20 vmovups ymmword ptr [rbx+0x40], ymm8 vblendps ymm15, ymm13, ymm15, 0xCC vperm2f128 ymm13, ymm6, ymm15, 0x20 vmovups ymmword ptr [rbx+0x60], ymm13 vperm2f128 ymm9, ymm1, ymm2, 0x31 vperm2f128 ymm11, ymm3, ymm4, 0x31 vmovups ymmword ptr [rbx+0x80], ymm9 vperm2f128 ymm14, ymm10, ymm14, 0x31 vperm2f128 ymm15, ymm6, ymm15, 0x31 vmovups ymmword ptr [rbx+0xA0], ymm11 vmovups ymmword ptr [rbx+0xC0], ymm14 vmovups ymmword ptr [rbx+0xE0], ymm15 vmovdqa ymm0, ymmword ptr [rsp] vmovdqa ymm2, ymmword ptr [rsp+0x40] vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] vmovdqa ymmword ptr [rsp], ymm0 vmovdqa ymmword ptr [rsp+0x40], ymm2 add rbx, 256 mov qword ptr [rbp+0x90], rbx add rdi, 64 sub rsi, 8 3: mov rbx, qword ptr [rbp+0x90] mov r15, qword ptr [rsp+0x80] movzx r13, byte 
ptr [rbp+0x78] movzx r12, byte ptr [rbp+0x88] test esi, 0x4 je 3f vbroadcasti32x4 zmm0, xmmword ptr [rcx] vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] vmovdqa xmm12, xmmword ptr [rsp] vmovdqa xmm13, xmmword ptr [rsp+0x40] vpunpckldq xmm14, xmm12, xmm13 vpunpckhdq xmm15, xmm12, xmm13 vpermq ymm14, ymm14, 0xDC vpermq ymm15, ymm15, 0xDC vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] vinserti64x4 zmm13, zmm14, ymm15, 0x01 mov eax, 17476 kmovw k2, eax vpblendmd zmm13 {k2}, zmm13, zmm12 vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov eax, 43690 kmovw k3, eax mov eax, 34952 kmovw k4, eax movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x88], eax vmovdqa32 zmm2, zmm15 vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] vpblendmd zmm3 {k4}, zmm13, zmm8 vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 vmovups zmm9, zmmword ptr [r8+rdx-0x30] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 vshufps zmm4, zmm8, zmm9, 136 vshufps zmm5, zmm8, zmm9, 221 vmovups zmm8, zmmword ptr [r8+rdx-0x20] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 vmovups zmm9, zmmword ptr [r8+rdx-0x10] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 vshufps zmm6, zmm8, zmm9, 136 vshufps zmm7, zmm8, 
zmm9, 221 vpshufd zmm6, zmm6, 0x93 vpshufd zmm7, zmm7, 0x93 mov al, 7 9: vpaddd zmm0, zmm0, zmm4 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm5 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 0x93 vpshufd zmm3, zmm3, 0x4E vpshufd zmm2, zmm2, 0x39 vpaddd zmm0, zmm0, zmm6 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm7 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 0x39 vpshufd zmm3, zmm3, 0x4E vpshufd zmm2, zmm2, 0x93 dec al jz 9f vshufps zmm8, zmm4, zmm5, 214 vpshufd zmm9, zmm4, 0x0F vpshufd zmm4, zmm8, 0x39 vshufps zmm8, zmm6, zmm7, 250 vpblendmd zmm9 {k3}, zmm9, zmm8 vpunpcklqdq zmm8, zmm7, zmm5 vpblendmd zmm8 {k4}, zmm8, zmm6 vpshufd zmm8, zmm8, 0x78 vpunpckhdq zmm5, zmm5, zmm7 vpunpckldq zmm6, zmm6, zmm5 vpshufd zmm7, zmm6, 0x1E vmovdqa32 zmm5, zmm9 vmovdqa32 zmm6, zmm8 jmp 9b 9: vpxord zmm0, zmm0, zmm2 vpxord zmm1, zmm1, zmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+0x40] vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+0x40], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 3: test esi, 0x2 je 
3f vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovd xmm13, dword ptr [rsp] vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovd xmm14, dword ptr [rsp+0x4] vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vinserti128 ymm13, ymm13, xmm14, 0x01 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x88], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vpbroadcastd ymm8, dword ptr [rsp+0x88] vpblendd ymm3, ymm13, ymm8, 0x88 vmovups ymm8, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 7 vpshufd ymm0, ymm0, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, 
ymm1, ymm2 vprord ymm1, ymm1, 7 vpshufd ymm0, ymm0, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x93 dec al jz 9f vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0x0F vpshufd ymm4, ymm8, 0x39 vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 0xAA vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 0x88 vpshufd ymm8, ymm8, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+0x40] vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+0x40], xmm2 add rbx, 64 add rdi, 16 sub rsi, 2 3: test esi, 0x1 je 4b vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+0x10] vmovd xmm14, dword ptr [rsp] vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vpinsrd xmm3, xmm14, eax, 3 vmovdqa xmm2, xmm15 vmovups xmm8, xmmword ptr [r8+rdx-0x40] vmovups xmm9, xmmword ptr [r8+rdx-0x30] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vmovups xmm9, xmmword ptr [r8+rdx-0x10] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd 
xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 jmp 4b .p2align 6 _blake3_compress_in_place_avx512: blake3_compress_in_place_avx512: sub rsp, 72 vmovdqa xmmword ptr [rsp], xmm6 vmovdqa xmmword ptr [rsp+0x10], xmm7 vmovdqa xmmword ptr [rsp+0x20], xmm8 vmovdqa xmmword ptr [rsp+0x30], xmm9 vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+0x10] movzx eax, byte ptr [rsp+0x70] movzx r8d, r8b shl rax, 32 add r8, rax vmovq xmm3, r9 vmovq xmm4, r8 vpunpcklqdq xmm3, xmm3, xmm4 vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] vmovups xmm8, xmmword ptr [rdx] vmovups xmm9, xmmword ptr [rdx+0x10] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [rdx+0x20] vmovups xmm9, xmmword ptr [rdx+0x30] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, 
xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 vmovdqu xmmword ptr [rcx], xmm0 vmovdqu xmmword ptr [rcx+0x10], xmm1 vmovdqa xmm6, xmmword ptr [rsp] vmovdqa xmm7, xmmword ptr [rsp+0x10] vmovdqa xmm8, xmmword ptr [rsp+0x20] vmovdqa xmm9, xmmword ptr [rsp+0x30] add rsp, 72 ret .p2align 6 _blake3_compress_xof_avx512: blake3_compress_xof_avx512: sub rsp, 72 vmovdqa xmmword ptr [rsp], xmm6 vmovdqa xmmword ptr [rsp+0x10], xmm7 vmovdqa xmmword ptr [rsp+0x20], xmm8 vmovdqa xmmword ptr [rsp+0x30], xmm9 vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+0x10] movzx eax, byte ptr [rsp+0x70] movzx r8d, r8b mov r10, qword ptr [rsp+0x78] shl rax, 32 add r8, rax vmovq xmm3, r9 vmovq xmm4, r8 vpunpcklqdq xmm3, xmm3, xmm4 vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] vmovups xmm8, xmmword ptr [rdx] vmovups xmm9, xmmword ptr [rdx+0x10] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 
221 vmovups xmm8, xmmword ptr [rdx+0x20] vmovups xmm9, xmmword ptr [rdx+0x30] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 vpxor xmm2, xmm2, xmmword ptr [rcx] vpxor xmm3, xmm3, xmmword ptr [rcx+0x10] vmovdqu xmmword ptr [r10], xmm0 vmovdqu xmmword ptr [r10+0x10], xmm1 vmovdqu xmmword ptr [r10+0x20], xmm2 vmovdqu xmmword ptr [r10+0x30], xmm3 vmovdqa xmm6, xmmword ptr [rsp] vmovdqa xmm7, xmmword ptr [rsp+0x10] vmovdqa xmm8, xmmword ptr [rsp+0x20] vmovdqa xmm9, xmmword ptr [rsp+0x30] add rsp, 72 ret .section .rdata .p2align 6 INDEX0: .long 0, 1, 2, 3, 16, 17, 18, 19 .long 8, 9, 10, 11, 24, 25, 26, 27 INDEX1: .long 4, 5, 6, 7, 20, 21, 22, 23 .long 12, 13, 14, 15, 28, 29, 30, 31 ADD0: .long 0, 1, 2, 
3, 4, 5, 6, 7 .long 8, 9, 10, 11, 12, 13, 14, 15 ADD1: .long 1 ADD16: .long 16 BLAKE3_BLOCK_LEN: .long 64 .p2align 6 BLAKE3_IV: BLAKE3_IV_0: .long 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A blake3-1.8.1/c/blake3_avx512_x86-64_windows_msvc.asm000064400000000000000000002627251046102023000177430ustar 00000000000000public _blake3_hash_many_avx512 public blake3_hash_many_avx512 public blake3_compress_in_place_avx512 public _blake3_compress_in_place_avx512 public blake3_compress_xof_avx512 public _blake3_compress_xof_avx512 _TEXT SEGMENT ALIGN(16) 'CODE' ALIGN 16 blake3_hash_many_avx512 PROC _blake3_hash_many_avx512 PROC push r15 push r14 push r13 push r12 push rdi push rsi push rbx push rbp mov rbp, rsp sub rsp, 304 and rsp, 0FFFFFFFFFFFFFFC0H vmovdqa xmmword ptr [rsp+90H], xmm6 vmovdqa xmmword ptr [rsp+0A0H], xmm7 vmovdqa xmmword ptr [rsp+0B0H], xmm8 vmovdqa xmmword ptr [rsp+0C0H], xmm9 vmovdqa xmmword ptr [rsp+0D0H], xmm10 vmovdqa xmmword ptr [rsp+0E0H], xmm11 vmovdqa xmmword ptr [rsp+0F0H], xmm12 vmovdqa xmmword ptr [rsp+100H], xmm13 vmovdqa xmmword ptr [rsp+110H], xmm14 vmovdqa xmmword ptr [rsp+120H], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+68H] movzx r9, byte ptr [rbp+70H] neg r9 kmovw k1, r9d vmovd xmm0, r8d vpbroadcastd ymm0, xmm0 shr r8, 32 vmovd xmm1, r8d vpbroadcastd ymm1, xmm1 vmovdqa ymm4, ymm1 vmovdqa ymm5, ymm1 vpaddd ymm2, ymm0, ymmword ptr [ADD0] vpaddd ymm3, ymm0, ymmword ptr [ADD0+32] vpcmpud k2, ymm2, ymm0, 1 vpcmpud k3, ymm3, ymm0, 1 ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. 
vpbroadcastd ymm6, dword ptr [ADD1] vpaddd ymm4 {k2}, ymm4, ymm6 vpaddd ymm5 {k3}, ymm5, ymm6 ; vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1] {1to8} ; vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1] {1to8} knotw k2, k1 vmovdqa32 ymm2 {k2}, ymm0 vmovdqa32 ymm3 {k2}, ymm0 vmovdqa32 ymm4 {k2}, ymm1 vmovdqa32 ymm5 {k2}, ymm1 vmovdqa ymmword ptr [rsp], ymm2 vmovdqa ymmword ptr [rsp+20H], ymm3 vmovdqa ymmword ptr [rsp+40H], ymm4 vmovdqa ymmword ptr [rsp+60H], ymm5 shl rdx, 6 mov qword ptr [rsp+80H], rdx cmp rsi, 16 jc final15blocks outerloop16: vpbroadcastd zmm0, dword ptr [rcx] vpbroadcastd zmm1, dword ptr [rcx+1H*4H] vpbroadcastd zmm2, dword ptr [rcx+2H*4H] vpbroadcastd zmm3, dword ptr [rcx+3H*4H] vpbroadcastd zmm4, dword ptr [rcx+4H*4H] vpbroadcastd zmm5, dword ptr [rcx+5H*4H] vpbroadcastd zmm6, dword ptr [rcx+6H*4H] vpbroadcastd zmm7, dword ptr [rcx+7H*4H] movzx eax, byte ptr [rbp+78H] movzx ebx, byte ptr [rbp+80H] or eax, ebx xor edx, edx ALIGN 16 innerloop16: movzx ebx, byte ptr [rbp+88H] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+80H] cmove eax, ebx mov dword ptr [rsp+88H], eax mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] mov r12, qword ptr [rdi+40H] mov r13, qword ptr [rdi+48H] mov r14, qword ptr [rdi+50H] mov r15, qword ptr [rdi+58H] vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H vpunpcklqdq zmm8, zmm16, zmm17 vpunpckhqdq zmm9, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H vpunpcklqdq zmm10, zmm18, zmm19 vpunpckhqdq zmm11, zmm18, zmm19 mov r8, qword ptr [rdi+20H] mov r9, qword ptr [rdi+28H] mov r10, qword ptr [rdi+30H] mov r11, qword ptr [rdi+38H] mov r12, qword ptr 
[rdi+60H] mov r13, qword ptr [rdi+68H] mov r14, qword ptr [rdi+70H] mov r15, qword ptr [rdi+78H] vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H vpunpcklqdq zmm12, zmm16, zmm17 vpunpckhqdq zmm13, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H vpunpcklqdq zmm14, zmm18, zmm19 vpunpckhqdq zmm15, zmm18, zmm19 vmovdqa32 zmm27, zmmword ptr [INDEX0] vmovdqa32 zmm31, zmmword ptr [INDEX1] vshufps zmm16, zmm8, zmm10, 136 vshufps zmm17, zmm12, zmm14, 136 vmovdqa32 zmm20, zmm16 vpermt2d zmm16, zmm27, zmm17 vpermt2d zmm20, zmm31, zmm17 vshufps zmm17, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm21, zmm17 vpermt2d zmm17, zmm27, zmm30 vpermt2d zmm21, zmm31, zmm30 vshufps zmm18, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm22, zmm18 vpermt2d zmm18, zmm27, zmm8 vpermt2d zmm22, zmm31, zmm8 vshufps zmm19, zmm9, zmm11, 221 vshufps zmm8, zmm13, zmm15, 221 vmovdqa32 zmm23, zmm19 vpermt2d zmm19, zmm27, zmm8 vpermt2d zmm23, zmm31, zmm8 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] mov r12, qword ptr [rdi+40H] mov r13, qword ptr [rdi+48H] mov r14, qword ptr [rdi+50H] mov r15, qword ptr [rdi+58H] vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H vpunpcklqdq zmm8, zmm24, zmm25 vpunpckhqdq zmm9, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] vinserti64x4 zmm25, zmm25, ymmword ptr 
[r15+rdx-1H*20H], 01H vpunpcklqdq zmm10, zmm24, zmm25 vpunpckhqdq zmm11, zmm24, zmm25 prefetcht0 byte ptr [r8+rdx+80H] prefetcht0 byte ptr [r12+rdx+80H] prefetcht0 byte ptr [r9+rdx+80H] prefetcht0 byte ptr [r13+rdx+80H] prefetcht0 byte ptr [r10+rdx+80H] prefetcht0 byte ptr [r14+rdx+80H] prefetcht0 byte ptr [r11+rdx+80H] prefetcht0 byte ptr [r15+rdx+80H] mov r8, qword ptr [rdi+20H] mov r9, qword ptr [rdi+28H] mov r10, qword ptr [rdi+30H] mov r11, qword ptr [rdi+38H] mov r12, qword ptr [rdi+60H] mov r13, qword ptr [rdi+68H] mov r14, qword ptr [rdi+70H] mov r15, qword ptr [rdi+78H] vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H vpunpcklqdq zmm12, zmm24, zmm25 vpunpckhqdq zmm13, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H vpunpcklqdq zmm14, zmm24, zmm25 vpunpckhqdq zmm15, zmm24, zmm25 prefetcht0 byte ptr [r8+rdx+80H] prefetcht0 byte ptr [r12+rdx+80H] prefetcht0 byte ptr [r9+rdx+80H] prefetcht0 byte ptr [r13+rdx+80H] prefetcht0 byte ptr [r10+rdx+80H] prefetcht0 byte ptr [r14+rdx+80H] prefetcht0 byte ptr [r11+rdx+80H] prefetcht0 byte ptr [r15+rdx+80H] vshufps zmm24, zmm8, zmm10, 136 vshufps zmm30, zmm12, zmm14, 136 vmovdqa32 zmm28, zmm24 vpermt2d zmm24, zmm27, zmm30 vpermt2d zmm28, zmm31, zmm30 vshufps zmm25, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm29, zmm25 vpermt2d zmm25, zmm27, zmm30 vpermt2d zmm29, zmm31, zmm30 vshufps zmm26, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm30, zmm26 vpermt2d zmm26, zmm27, zmm8 vpermt2d zmm30, zmm31, zmm8 vshufps zmm8, zmm9, zmm11, 221 vshufps zmm10, zmm13, zmm15, 221 vpermi2d zmm27, zmm8, zmm10 vpermi2d zmm31, zmm8, zmm10 vpbroadcastd zmm8, dword ptr 
[BLAKE3_IV_0] vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1] vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2] vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3] vmovdqa32 zmm12, zmmword ptr [rsp] vmovdqa32 zmm13, zmmword ptr [rsp+1H*40H] vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN] vpbroadcastd zmm15, dword ptr [rsp+22H*4H] vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm24 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 
16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm25 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm23 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, 
zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 
vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm27 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm21 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 
vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 
vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm24 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm28 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, 
zmm26 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm25 
vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm24 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 
12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm22 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm31 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, 
zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpxord zmm0, zmm0, zmm8 vpxord zmm1, zmm1, zmm9 vpxord zmm2, zmm2, zmm10 vpxord zmm3, zmm3, zmm11 vpxord zmm4, zmm4, zmm12 vpxord zmm5, zmm5, zmm13 vpxord zmm6, zmm6, zmm14 vpxord zmm7, zmm7, zmm15 movzx eax, byte ptr [rbp+78H] jne innerloop16 mov rbx, qword ptr [rbp+90H] vpunpckldq zmm16, zmm0, zmm1 vpunpckhdq zmm17, zmm0, zmm1 vpunpckldq zmm18, zmm2, zmm3 vpunpckhdq zmm19, zmm2, zmm3 vpunpckldq zmm20, zmm4, zmm5 vpunpckhdq zmm21, zmm4, zmm5 vpunpckldq zmm22, zmm6, zmm7 vpunpckhdq zmm23, zmm6, zmm7 vpunpcklqdq zmm0, zmm16, zmm18 vpunpckhqdq zmm1, zmm16, 
zmm18 vpunpcklqdq zmm2, zmm17, zmm19 vpunpckhqdq zmm3, zmm17, zmm19 vpunpcklqdq zmm4, zmm20, zmm22 vpunpckhqdq zmm5, zmm20, zmm22 vpunpcklqdq zmm6, zmm21, zmm23 vpunpckhqdq zmm7, zmm21, zmm23 vshufi32x4 zmm16, zmm0, zmm4, 88H vshufi32x4 zmm17, zmm1, zmm5, 88H vshufi32x4 zmm18, zmm2, zmm6, 88H vshufi32x4 zmm19, zmm3, zmm7, 88H vshufi32x4 zmm20, zmm0, zmm4, 0DDH vshufi32x4 zmm21, zmm1, zmm5, 0DDH vshufi32x4 zmm22, zmm2, zmm6, 0DDH vshufi32x4 zmm23, zmm3, zmm7, 0DDH vshufi32x4 zmm0, zmm16, zmm17, 88H vshufi32x4 zmm1, zmm18, zmm19, 88H vshufi32x4 zmm2, zmm20, zmm21, 88H vshufi32x4 zmm3, zmm22, zmm23, 88H vshufi32x4 zmm4, zmm16, zmm17, 0DDH vshufi32x4 zmm5, zmm18, zmm19, 0DDH vshufi32x4 zmm6, zmm20, zmm21, 0DDH vshufi32x4 zmm7, zmm22, zmm23, 0DDH vmovdqu32 zmmword ptr [rbx], zmm0 vmovdqu32 zmmword ptr [rbx+1H*40H], zmm1 vmovdqu32 zmmword ptr [rbx+2H*40H], zmm2 vmovdqu32 zmmword ptr [rbx+3H*40H], zmm3 vmovdqu32 zmmword ptr [rbx+4H*40H], zmm4 vmovdqu32 zmmword ptr [rbx+5H*40H], zmm5 vmovdqu32 zmmword ptr [rbx+6H*40H], zmm6 vmovdqu32 zmmword ptr [rbx+7H*40H], zmm7 vmovdqa32 zmm0, zmmword ptr [rsp] vmovdqa32 zmm1, zmmword ptr [rsp+1H*40H] vmovdqa32 zmm2, zmm0 ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. 
vpbroadcastd zmm4, dword ptr [ADD16] vpbroadcastd zmm5, dword ptr [ADD1] vpaddd zmm2{k1}, zmm0, zmm4 ; vpaddd zmm2{k1}, zmm0, dword ptr [ADD16] ; {1to16} vpcmpud k2, zmm2, zmm0, 1 vpaddd zmm1 {k2}, zmm1, zmm5 ; vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1] ; {1to16} vmovdqa32 zmmword ptr [rsp], zmm2 vmovdqa32 zmmword ptr [rsp+1H*40H], zmm1 add rdi, 128 add rbx, 512 mov qword ptr [rbp+90H], rbx sub rsi, 16 cmp rsi, 16 jnc outerloop16 test rsi, rsi jne final15blocks unwind: vzeroupper vmovdqa xmm6, xmmword ptr [rsp+90H] vmovdqa xmm7, xmmword ptr [rsp+0A0H] vmovdqa xmm8, xmmword ptr [rsp+0B0H] vmovdqa xmm9, xmmword ptr [rsp+0C0H] vmovdqa xmm10, xmmword ptr [rsp+0D0H] vmovdqa xmm11, xmmword ptr [rsp+0E0H] vmovdqa xmm12, xmmword ptr [rsp+0F0H] vmovdqa xmm13, xmmword ptr [rsp+100H] vmovdqa xmm14, xmmword ptr [rsp+110H] vmovdqa xmm15, xmmword ptr [rsp+120H] mov rsp, rbp pop rbp pop rbx pop rsi pop rdi pop r12 pop r13 pop r14 pop r15 ret ALIGN 16 final15blocks: test esi, 8H je final7blocks vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+4H] vpbroadcastd ymm2, dword ptr [rcx+8H] vpbroadcastd ymm3, dword ptr [rcx+0CH] vpbroadcastd ymm4, dword ptr [rcx+10H] vpbroadcastd ymm5, dword ptr [rcx+14H] vpbroadcastd ymm6, dword ptr [rcx+18H] vpbroadcastd ymm7, dword ptr [rcx+1CH] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] mov r12, qword ptr [rdi+20H] mov r13, qword ptr [rdi+28H] mov r14, qword ptr [rdi+30H] mov r15, qword ptr [rdi+38H] movzx eax, byte ptr [rbp+78H] movzx ebx, byte ptr [rbp+80H] or eax, ebx xor edx, edx innerloop8: movzx ebx, byte ptr [rbp+88H] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+80H] cmove eax, ebx mov dword ptr [rsp+88H], eax vmovups xmm8, xmmword ptr [r8+rdx-40H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H vmovups xmm9, xmmword ptr [r9+rdx-40H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups 
xmm10, xmmword ptr [r10+rdx-40H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H vmovups xmm11, xmmword ptr [r11+rdx-40H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm16, ymm12, ymm14, 136 vshufps ymm17, ymm12, ymm14, 221 vshufps ymm18, ymm13, ymm15, 136 vshufps ymm19, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-30H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H vmovups xmm9, xmmword ptr [r9+rdx-30H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-30H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H vmovups xmm11, xmmword ptr [r11+rdx-30H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm20, ymm12, ymm14, 136 vshufps ymm21, ymm12, ymm14, 221 vshufps ymm22, ymm13, ymm15, 136 vshufps ymm23, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-20H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H vmovups xmm9, xmmword ptr [r9+rdx-20H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-20H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H vmovups xmm11, xmmword ptr [r11+rdx-20H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm24, ymm12, ymm14, 136 vshufps ymm25, ymm12, ymm14, 221 vshufps ymm26, ymm13, ymm15, 136 vshufps ymm27, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-10H] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H vmovups xmm9, xmmword ptr [r9+rdx-10H] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-10H] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H vmovups 
xmm11, xmmword ptr [r11+rdx-10H] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm28, ymm12, ymm14, 136 vshufps ymm29, ymm12, ymm14, 221 vshufps ymm30, ymm13, ymm15, 136 vshufps ymm31, ymm13, ymm15, 221 vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0] vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1] vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2] vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3] vmovdqa ymm12, ymmword ptr [rsp] vmovdqa ymm13, ymmword ptr [rsp+40H] vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN] vpbroadcastd ymm15, dword ptr [rsp+88H] vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, 
ymm24 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm23 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, 
ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord 
ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm27 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord 
ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm21 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm31 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm19 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord 
ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm28 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm31 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, 
ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm26 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, 
ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, 
ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm22 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm31 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord 
ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm19 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+78H] jne innerloop8 mov rbx, qword ptr [rbp+90H] 
vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0CCH vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0CCH vblendps ymm3, ymm12, ymm9, 0CCH vperm2f128 ymm12, ymm1, ymm2, 20H vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0CCH vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 20H vmovups ymmword ptr [rbx+20H], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0CCH vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 0CCH vblendps ymm14, ymm14, ymm13, 0CCH vperm2f128 ymm8, ymm10, ymm14, 20H vmovups ymmword ptr [rbx+40H], ymm8 vblendps ymm15, ymm13, ymm15, 0CCH vperm2f128 ymm13, ymm6, ymm15, 20H vmovups ymmword ptr [rbx+60H], ymm13 vperm2f128 ymm9, ymm1, ymm2, 31H vperm2f128 ymm11, ymm3, ymm4, 31H vmovups ymmword ptr [rbx+80H], ymm9 vperm2f128 ymm14, ymm10, ymm14, 31H vperm2f128 ymm15, ymm6, ymm15, 31H vmovups ymmword ptr [rbx+0A0H], ymm11 vmovups ymmword ptr [rbx+0C0H], ymm14 vmovups ymmword ptr [rbx+0E0H], ymm15 vmovdqa ymm0, ymmword ptr [rsp] vmovdqa ymm2, ymmword ptr [rsp+40H] vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+1H*20H] vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+3H*20H] vmovdqa ymmword ptr [rsp], ymm0 vmovdqa ymmword ptr [rsp+40H], ymm2 add rbx, 256 mov qword ptr [rbp+90H], rbx add rdi, 64 sub rsi, 8 final7blocks: mov rbx, qword ptr [rbp+90H] mov r15, qword ptr [rsp+80H] movzx r13, byte ptr [rbp+78H] movzx r12, byte ptr [rbp+88H] test esi, 4H je final3blocks vbroadcasti32x4 zmm0, xmmword ptr [rcx] vbroadcasti32x4 zmm1, xmmword ptr [rcx+1H*10H] vmovdqa xmm12, xmmword ptr [rsp] vmovdqa xmm13, xmmword ptr [rsp+40H] vpunpckldq xmm14, xmm12, xmm13 vpunpckhdq xmm15, xmm12, xmm13 vpermq ymm14, ymm14, 0DCH vpermq ymm15, ymm15, 0DCH vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN] vinserti64x4 zmm13, zmm14, ymm15, 01H mov eax, 
17476 kmovw k2, eax vpblendmd zmm13 {k2}, zmm13, zmm12 vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] mov eax, 43690 kmovw k3, eax mov eax, 34952 kmovw k4, eax movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx ALIGN 16 innerloop4: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+88H], eax vmovdqa32 zmm2, zmm15 vpbroadcastd zmm8, dword ptr [rsp+22H*4H] vpblendmd zmm3 {k4}, zmm13, zmm8 vmovups zmm8, zmmword ptr [r8+rdx-1H*40H] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-4H*10H], 01H vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-4H*10H], 02H vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-4H*10H], 03H vmovups zmm9, zmmword ptr [r8+rdx-30H] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-3H*10H], 01H vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-3H*10H], 02H vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-3H*10H], 03H vshufps zmm4, zmm8, zmm9, 136 vshufps zmm5, zmm8, zmm9, 221 vmovups zmm8, zmmword ptr [r8+rdx-20H] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-2H*10H], 01H vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-2H*10H], 02H vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-2H*10H], 03H vmovups zmm9, zmmword ptr [r8+rdx-10H] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-1H*10H], 01H vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-1H*10H], 02H vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-1H*10H], 03H vshufps zmm6, zmm8, zmm9, 136 vshufps zmm7, zmm8, zmm9, 221 vpshufd zmm6, zmm6, 93H vpshufd zmm7, zmm7, 93H mov al, 7 roundloop4: vpaddd zmm0, zmm0, zmm4 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm5 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 93H vpshufd zmm3, zmm3, 4EH vpshufd zmm2, zmm2, 39H vpaddd 
zmm0, zmm0, zmm6 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm7 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 39H vpshufd zmm3, zmm3, 4EH vpshufd zmm2, zmm2, 93H dec al jz endroundloop4 vshufps zmm8, zmm4, zmm5, 214 vpshufd zmm9, zmm4, 0FH vpshufd zmm4, zmm8, 39H vshufps zmm8, zmm6, zmm7, 250 vpblendmd zmm9 {k3}, zmm9, zmm8 vpunpcklqdq zmm8, zmm7, zmm5 vpblendmd zmm8 {k4}, zmm8, zmm6 vpshufd zmm8, zmm8, 78H vpunpckhdq zmm5, zmm5, zmm7 vpunpckldq zmm6, zmm6, zmm5 vpshufd zmm7, zmm6, 1EH vmovdqa32 zmm5, zmm9 vmovdqa32 zmm6, zmm8 jmp roundloop4 endroundloop4: vpxord zmm0, zmm0, zmm2 vpxord zmm1, zmm1, zmm3 mov eax, r13d cmp rdx, r15 jne innerloop4 vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+10H], xmm1 vextracti128 xmmword ptr [rbx+20H], ymm0, 01H vextracti128 xmmword ptr [rbx+30H], ymm1, 01H vextracti32x4 xmmword ptr [rbx+4H*10H], zmm0, 02H vextracti32x4 xmmword ptr [rbx+5H*10H], zmm1, 02H vextracti32x4 xmmword ptr [rbx+6H*10H], zmm0, 03H vextracti32x4 xmmword ptr [rbx+7H*10H], zmm1, 03H vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+40H] vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+1H*10H] vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+5H*10H] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+40H], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 final3blocks: test esi, 2H je final1block vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+10H] vmovd xmm13, dword ptr [rsp] vpinsrd xmm13, xmm13, dword ptr [rsp+40H], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 vmovd xmm14, dword ptr [rsp+4H] vpinsrd xmm14, xmm14, dword ptr [rsp+44H], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 vinserti128 ymm13, ymm13, xmm14, 01H mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] movzx eax, byte ptr 
[rbp+80H] or eax, r13d xor edx, edx ALIGN 16 innerloop2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+88H], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] vpbroadcastd ymm8, dword ptr [rsp+88H] vpblendd ymm3, ymm13, ymm8, 88H vmovups ymm8, ymmword ptr [r8+rdx-40H] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H vmovups ymm9, ymmword ptr [r8+rdx-30H] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-20H] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H vmovups ymm9, ymmword ptr [r8+rdx-10H] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 93H vpshufd ymm7, ymm7, 93H mov al, 7 roundloop2: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 7 vpshufd ymm0, ymm0, 93H vpshufd ymm3, ymm3, 4EH vpshufd ymm2, ymm2, 39H vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 7 vpshufd ymm0, ymm0, 39H vpshufd ymm3, ymm3, 4EH vpshufd ymm2, ymm2, 93H dec al jz endroundloop2 vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0FH vpshufd ymm4, ymm8, 39H vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 0AAH vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 88H vpshufd ymm8, ymm8, 78H vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 1EH vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp 
roundloop2 endroundloop2: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne innerloop2 vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+10H], xmm1 vextracti128 xmmword ptr [rbx+20H], ymm0, 01H vextracti128 xmmword ptr [rbx+30H], ymm1, 01H vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+40H] vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+8H] vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+48H] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+40H], xmm2 add rbx, 64 add rdi, 16 sub rsi, 2 final1block: test esi, 1H je unwind vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+10H] vmovd xmm14, dword ptr [rsp] vpinsrd xmm14, xmm14, dword ptr [rsp+40H], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 vmovdqa xmm15, xmmword ptr [BLAKE3_IV] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx ALIGN 16 innerloop1: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vpinsrd xmm3, xmm14, eax, 3 vmovdqa xmm2, xmm15 vmovups xmm8, xmmword ptr [r8+rdx-40H] vmovups xmm9, xmmword ptr [r8+rdx-30H] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-20H] vmovups xmm9, xmmword ptr [r8+rdx-10H] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 93H vpshufd xmm7, xmm7, 93H mov al, 7 roundloop1: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 93H vpshufd xmm3, xmm3, 4EH vpshufd xmm2, xmm2, 39H vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 
vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 39H vpshufd xmm3, xmm3, 4EH vpshufd xmm2, xmm2, 93H dec al jz endroundloop1 vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0FH vpshufd xmm4, xmm8, 39H vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0AAH vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 88H vpshufd xmm8, xmm8, 78H vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 1EH vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp roundloop1 endroundloop1: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne innerloop1 vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+10H], xmm1 jmp unwind _blake3_hash_many_avx512 ENDP blake3_hash_many_avx512 ENDP ALIGN 16 blake3_compress_in_place_avx512 PROC _blake3_compress_in_place_avx512 PROC sub rsp, 72 vmovdqa xmmword ptr [rsp], xmm6 vmovdqa xmmword ptr [rsp+10H], xmm7 vmovdqa xmmword ptr [rsp+20H], xmm8 vmovdqa xmmword ptr [rsp+30H], xmm9 vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+10H] movzx eax, byte ptr [rsp+70H] movzx r8d, r8b shl rax, 32 add r8, rax vmovq xmm3, r9 vmovq xmm4, r8 vpunpcklqdq xmm3, xmm3, xmm4 vmovaps xmm2, xmmword ptr [BLAKE3_IV] vmovups xmm8, xmmword ptr [rdx] vmovups xmm9, xmmword ptr [rdx+10H] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [rdx+20H] vmovups xmm9, xmmword ptr [rdx+30H] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 93H vpshufd xmm7, xmm7, 93H mov al, 7 @@: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 93H vpshufd xmm3, xmm3, 4EH vpshufd xmm2, xmm2, 39H vpaddd xmm0, xmm0, xmm6 
vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 39H vpshufd xmm3, xmm3, 4EH vpshufd xmm2, xmm2, 93H dec al jz @F vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0FH vpshufd xmm4, xmm8, 39H vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0AAH vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 88H vpshufd xmm8, xmm8, 78H vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 1EH vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp @B @@: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 vmovdqu xmmword ptr [rcx], xmm0 vmovdqu xmmword ptr [rcx+10H], xmm1 vmovdqa xmm6, xmmword ptr [rsp] vmovdqa xmm7, xmmword ptr [rsp+10H] vmovdqa xmm8, xmmword ptr [rsp+20H] vmovdqa xmm9, xmmword ptr [rsp+30H] add rsp, 72 ret _blake3_compress_in_place_avx512 ENDP blake3_compress_in_place_avx512 ENDP ALIGN 16 blake3_compress_xof_avx512 PROC _blake3_compress_xof_avx512 PROC sub rsp, 72 vmovdqa xmmword ptr [rsp], xmm6 vmovdqa xmmword ptr [rsp+10H], xmm7 vmovdqa xmmword ptr [rsp+20H], xmm8 vmovdqa xmmword ptr [rsp+30H], xmm9 vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+10H] movzx eax, byte ptr [rsp+70H] movzx r8d, r8b mov r10, qword ptr [rsp+78H] shl rax, 32 add r8, rax vmovq xmm3, r9 vmovq xmm4, r8 vpunpcklqdq xmm3, xmm3, xmm4 vmovaps xmm2, xmmword ptr [BLAKE3_IV] vmovups xmm8, xmmword ptr [rdx] vmovups xmm9, xmmword ptr [rdx+10H] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [rdx+20H] vmovups xmm9, xmmword ptr [rdx+30H] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 93H vpshufd xmm7, xmm7, 93H mov al, 7 @@: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd 
xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 93H vpshufd xmm3, xmm3, 4EH vpshufd xmm2, xmm2, 39H vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 39H vpshufd xmm3, xmm3, 4EH vpshufd xmm2, xmm2, 93H dec al jz @F vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0FH vpshufd xmm4, xmm8, 39H vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0AAH vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 88H vpshufd xmm8, xmm8, 78H vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 1EH vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp @B @@: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 vpxor xmm2, xmm2, xmmword ptr [rcx] vpxor xmm3, xmm3, xmmword ptr [rcx+10H] vmovdqu xmmword ptr [r10], xmm0 vmovdqu xmmword ptr [r10+10H], xmm1 vmovdqu xmmword ptr [r10+20H], xmm2 vmovdqu xmmword ptr [r10+30H], xmm3 vmovdqa xmm6, xmmword ptr [rsp] vmovdqa xmm7, xmmword ptr [rsp+10H] vmovdqa xmm8, xmmword ptr [rsp+20H] vmovdqa xmm9, xmmword ptr [rsp+30H] add rsp, 72 ret _blake3_compress_xof_avx512 ENDP blake3_compress_xof_avx512 ENDP _TEXT ENDS _RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ALIGN 64 INDEX0: dd 0, 1, 2, 3, 16, 17, 18, 19 dd 8, 9, 10, 11, 24, 25, 26, 27 INDEX1: dd 4, 5, 6, 7, 20, 21, 22, 23 dd 12, 13, 14, 15, 28, 29, 30, 31 ADD0: dd 0, 1, 2, 3, 4, 5, 6, 7 dd 8, 9, 10, 11, 12, 13, 14, 15 ADD1: dd 1 ADD16: dd 16 BLAKE3_BLOCK_LEN: dd 64 ALIGN 64 BLAKE3_IV: BLAKE3_IV_0: dd 06A09E667H BLAKE3_IV_1: dd 0BB67AE85H BLAKE3_IV_2: dd 03C6EF372H BLAKE3_IV_3: dd 0A54FF53AH 
_RDATA ENDS END blake3-1.8.1/c/blake3_dispatch.c000064400000000000000000000217151046102023000144100ustar 00000000000000#include #include #include #include "blake3_impl.h" #if defined(_MSC_VER) #include #endif #if defined(IS_X86) #if defined(_MSC_VER) #include #elif defined(__GNUC__) #include #else #undef IS_X86 /* Unimplemented! */ #endif #endif #if !defined(BLAKE3_ATOMICS) #if defined(__has_include) #if __has_include() && !defined(_MSC_VER) #define BLAKE3_ATOMICS 1 #else #define BLAKE3_ATOMICS 0 #endif /* __has_include() && !defined(_MSC_VER) */ #else #define BLAKE3_ATOMICS 0 #endif /* defined(__has_include) */ #endif /* BLAKE3_ATOMICS */ #if BLAKE3_ATOMICS #define ATOMIC_INT _Atomic int #define ATOMIC_LOAD(x) x #define ATOMIC_STORE(x, y) x = y #elif defined(_MSC_VER) #define ATOMIC_INT LONG #define ATOMIC_LOAD(x) InterlockedOr(&x, 0) #define ATOMIC_STORE(x, y) InterlockedExchange(&x, y) #else #define ATOMIC_INT int #define ATOMIC_LOAD(x) x #define ATOMIC_STORE(x, y) x = y #endif #define MAYBE_UNUSED(x) (void)((x)) #if defined(IS_X86) static uint64_t xgetbv(void) { #if defined(_MSC_VER) return _xgetbv(0); #else uint32_t eax = 0, edx = 0; __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); return ((uint64_t)edx << 32) | eax; #endif } static void cpuid(uint32_t out[4], uint32_t id) { #if defined(_MSC_VER) __cpuid((int *)out, id); #elif defined(__i386__) || defined(_M_IX86) __asm__ __volatile__("movl %%ebx, %1\n" "cpuid\n" "xchgl %1, %%ebx\n" : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id)); #else __asm__ __volatile__("cpuid\n" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id)); #endif } static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { #if defined(_MSC_VER) __cpuidex((int *)out, id, sid); #elif defined(__i386__) || defined(_M_IX86) __asm__ __volatile__("movl %%ebx, %1\n" "cpuid\n" "xchgl %1, %%ebx\n" : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id), "c"(sid)); #else __asm__ 
__volatile__("cpuid\n" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(id), "c"(sid)); #endif } #endif enum cpu_feature { SSE2 = 1 << 0, SSSE3 = 1 << 1, SSE41 = 1 << 2, AVX = 1 << 3, AVX2 = 1 << 4, AVX512F = 1 << 5, AVX512VL = 1 << 6, /* ... */ UNDEFINED = 1 << 30 }; #if !defined(BLAKE3_TESTING) static /* Allow the variable to be controlled manually for testing */ #endif ATOMIC_INT g_cpu_features = UNDEFINED; #if !defined(BLAKE3_TESTING) static #endif enum cpu_feature get_cpu_features(void) { /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */ enum cpu_feature features = ATOMIC_LOAD(g_cpu_features); if (features != UNDEFINED) { return features; } else { #if defined(IS_X86) uint32_t regs[4] = {0}; uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; (void)edx; features = 0; cpuid(regs, 0); const int max_id = *eax; cpuid(regs, 1); #if defined(__amd64__) || defined(_M_X64) features |= SSE2; #else if (*edx & (1UL << 26)) features |= SSE2; #endif if (*ecx & (1UL << 9)) features |= SSSE3; if (*ecx & (1UL << 19)) features |= SSE41; if (*ecx & (1UL << 27)) { // OSXSAVE const uint64_t mask = xgetbv(); if ((mask & 6) == 6) { // SSE and AVX states if (*ecx & (1UL << 28)) features |= AVX; if (max_id >= 7) { cpuidex(regs, 7, 0); if (*ebx & (1UL << 5)) features |= AVX2; if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm if (*ebx & (1UL << 31)) features |= AVX512VL; if (*ebx & (1UL << 16)) features |= AVX512F; } } } } ATOMIC_STORE(g_cpu_features, features); return features; #else /* How to detect NEON? 
*/ return 0; #endif } } void blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); MAYBE_UNUSED(features); #if !defined(BLAKE3_NO_AVX512) if (features & AVX512VL) { blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); return; } #endif #if !defined(BLAKE3_NO_SSE41) if (features & SSE41) { blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); return; } #endif #if !defined(BLAKE3_NO_SSE2) if (features & SSE2) { blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); return; } #endif #endif blake3_compress_in_place_portable(cv, block, block_len, counter, flags); } void blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) { #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); MAYBE_UNUSED(features); #if !defined(BLAKE3_NO_AVX512) if (features & AVX512VL) { blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); return; } #endif #if !defined(BLAKE3_NO_SSE41) if (features & SSE41) { blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); return; } #endif #if !defined(BLAKE3_NO_SSE2) if (features & SSE2) { blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); return; } #endif #endif blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); } void blake3_xof_many(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64], size_t outblocks) { if (outblocks == 0) { // The current assembly implementation always outputs at least 1 block. 
return; } #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); MAYBE_UNUSED(features); #if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512) if (features & AVX512VL) { blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks); return; } #endif #endif for(size_t i = 0; i < outblocks; ++i) { blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i); } } void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); MAYBE_UNUSED(features); #if !defined(BLAKE3_NO_AVX512) if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; } #endif #if !defined(BLAKE3_NO_AVX2) if (features & AVX2) { blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; } #endif #if !defined(BLAKE3_NO_SSE41) if (features & SSE41) { blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; } #endif #if !defined(BLAKE3_NO_SSE2) if (features & SSE2) { blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; } #endif #endif #if BLAKE3_USE_NEON == 1 blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; #endif blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); } // The dynamically detected SIMD degree of the current platform. 
size_t blake3_simd_degree(void) { #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); MAYBE_UNUSED(features); #if !defined(BLAKE3_NO_AVX512) if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { return 16; } #endif #if !defined(BLAKE3_NO_AVX2) if (features & AVX2) { return 8; } #endif #if !defined(BLAKE3_NO_SSE41) if (features & SSE41) { return 4; } #endif #if !defined(BLAKE3_NO_SSE2) if (features & SSE2) { return 4; } #endif #endif #if BLAKE3_USE_NEON == 1 return 4; #endif return 1; } blake3-1.8.1/c/blake3_impl.h000064400000000000000000000273761046102023000135700ustar 00000000000000#ifndef BLAKE3_IMPL_H #define BLAKE3_IMPL_H #include #include #include #include #include #include "blake3.h" #ifdef __cplusplus extern "C" { #endif // internal flags enum blake3_flags { CHUNK_START = 1 << 0, CHUNK_END = 1 << 1, PARENT = 1 << 2, ROOT = 1 << 3, KEYED_HASH = 1 << 4, DERIVE_KEY_CONTEXT = 1 << 5, DERIVE_KEY_MATERIAL = 1 << 6, }; // This C implementation tries to support recent versions of GCC, Clang, and // MSVC. 
#if defined(_MSC_VER) #define INLINE static __forceinline #else #define INLINE static inline __attribute__((always_inline)) #endif #ifdef __cplusplus #define NOEXCEPT noexcept #else #define NOEXCEPT #endif #if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC) #define IS_X86 #define IS_X86_64 #endif #if defined(__i386__) || defined(_M_IX86) #define IS_X86 #define IS_X86_32 #endif #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) #define IS_AARCH64 #endif #if defined(IS_X86) #if defined(_MSC_VER) #include #endif #endif #if !defined(BLAKE3_USE_NEON) // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness #if defined(IS_AARCH64) #if defined(__ARM_BIG_ENDIAN) #define BLAKE3_USE_NEON 0 #else #define BLAKE3_USE_NEON 1 #endif #else #define BLAKE3_USE_NEON 0 #endif #endif #if defined(IS_X86) #define MAX_SIMD_DEGREE 16 #elif BLAKE3_USE_NEON == 1 #define MAX_SIMD_DEGREE 4 #else #define MAX_SIMD_DEGREE 1 #endif // There are some places where we want a static size that's equal to the // MAX_SIMD_DEGREE, but also at least 2. #define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL}; static const uint8_t MSG_SCHEDULE[7][16] = { {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, }; /* Find index of the highest set bit */ /* x is assumed to be nonzero. 
*/ static unsigned int highest_one(uint64_t x) { #if defined(__GNUC__) || defined(__clang__) return 63 ^ (unsigned int)__builtin_clzll(x); #elif defined(_MSC_VER) && defined(IS_X86_64) unsigned long index; _BitScanReverse64(&index, x); return index; #elif defined(_MSC_VER) && defined(IS_X86_32) if(x >> 32) { unsigned long index; _BitScanReverse(&index, (unsigned long)(x >> 32)); return 32 + index; } else { unsigned long index; _BitScanReverse(&index, (unsigned long)x); return index; } #else unsigned int c = 0; if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } if(x & 0x000000000000000cULL) { x >>= 2; c += 2; } if(x & 0x0000000000000002ULL) { c += 1; } return c; #endif } // Count the number of 1 bits. INLINE unsigned int popcnt(uint64_t x) { #if defined(__GNUC__) || defined(__clang__) return (unsigned int)__builtin_popcountll(x); #else unsigned int count = 0; while (x != 0) { count += 1; x &= x - 1; } return count; #endif } // Largest power of two less than or equal to x. As a special case, returns 1 // when x is 0. 
INLINE uint64_t round_down_to_power_of_2(uint64_t x) { return 1ULL << highest_one(x | 1); } INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } INLINE uint32_t counter_high(uint64_t counter) { return (uint32_t)(counter >> 32); } INLINE uint32_t load32(const void *src) { const uint8_t *p = (const uint8_t *)src; return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); } INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], uint32_t key_words[8]) { key_words[0] = load32(&key[0 * 4]); key_words[1] = load32(&key[1 * 4]); key_words[2] = load32(&key[2 * 4]); key_words[3] = load32(&key[3 * 4]); key_words[4] = load32(&key[4 * 4]); key_words[5] = load32(&key[5 * 4]); key_words[6] = load32(&key[6 * 4]); key_words[7] = load32(&key[7 * 4]); } INLINE void load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN], uint32_t block_words[16]) { for (size_t i = 0; i < 16; i++) { block_words[i] = load32(&block[i * 4]); } } INLINE void store32(void *dst, uint32_t w) { uint8_t *p = (uint8_t *)dst; p[0] = (uint8_t)(w >> 0); p[1] = (uint8_t)(w >> 8); p[2] = (uint8_t)(w >> 16); p[3] = (uint8_t)(w >> 24); } INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { store32(&bytes_out[0 * 4], cv_words[0]); store32(&bytes_out[1 * 4], cv_words[1]); store32(&bytes_out[2 * 4], cv_words[2]); store32(&bytes_out[3 * 4], cv_words[3]); store32(&bytes_out[4 * 4], cv_words[4]); store32(&bytes_out[5 * 4], cv_words[5]); store32(&bytes_out[6 * 4], cv_words[6]); store32(&bytes_out[7 * 4], cv_words[7]); } void blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); void blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); void blake3_xof_many(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, 
uint8_t flags, uint8_t out[64], size_t outblocks); void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); size_t blake3_simd_degree(void); BLAKE3_PRIVATE size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out, bool use_tbb); #if defined(BLAKE3_USE_TBB) BLAKE3_PRIVATE void blake3_compress_subtree_wide_join_tbb( // shared params const uint32_t key[8], uint8_t flags, bool use_tbb, // left-hand side params const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter, uint8_t *l_cvs, size_t *l_n, // right-hand side params const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter, uint8_t *r_cvs, size_t *r_n) NOEXCEPT; #endif // Declarations for implementation-specific functions. void blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); void blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #if defined(IS_X86) #if !defined(BLAKE3_NO_SSE2) void blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); void blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool 
increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #endif #if !defined(BLAKE3_NO_SSE41) void blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); void blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #endif #if !defined(BLAKE3_NO_AVX2) void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #endif #if !defined(BLAKE3_NO_AVX512) void blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); void blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #if !defined(_WIN32) void blake3_xof_many_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t* out, size_t outblocks); #endif #endif #endif #if BLAKE3_USE_NEON == 1 void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); #endif #ifdef __cplusplus } #endif 
#endif /* BLAKE3_IMPL_H */ blake3-1.8.1/c/blake3_neon.c000064400000000000000000000330111046102023000135400ustar 00000000000000#include "blake3_impl.h" #include #ifdef __ARM_BIG_ENDIAN #error "This implementation only supports little-endian ARM." // It might be that all we need for big-endian support here is to get the loads // and stores right, but step zero would be finding a way to test it in CI. #endif INLINE uint32x4_t loadu_128(const uint8_t src[16]) { // vld1q_u32 has alignment requirements. Don't use it. return vreinterpretq_u32_u8(vld1q_u8(src)); } INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { // vst1q_u32 has alignment requirements. Don't use it. vst1q_u8(dest, vreinterpretq_u8_u32(src)); } INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) { return vaddq_u32(a, b); } INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) { return veorq_u32(a, b); } INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); } INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { uint32_t array[4] = {a, b, c, d}; return vld1q_u32(array); } INLINE uint32x4_t rot16_128(uint32x4_t x) { // The straightforward implementation would be two shifts and an or, but that's // slower on microarchitectures we've tested. See // https://github.com/BLAKE3-team/BLAKE3/pull/319. // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x))); } INLINE uint32x4_t rot12_128(uint32x4_t x) { // See comment in rot16_128. // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12); } INLINE uint32x4_t rot8_128(uint32x4_t x) { // See comment in rot16_128. 
// return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8)); #if defined(__clang__) return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12)); #elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700 static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12}; return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8)); #else return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8); #endif } INLINE uint32x4_t rot7_128(uint32x4_t x) { // See comment in rot16_128. // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7)); return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7); } // TODO: compress_neon // TODO: hash2_neon /* * ---------------------------------------------------------------------------- * hash4_neon * ---------------------------------------------------------------------------- */ INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) { v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); v[0] = add_128(v[0], v[4]); v[1] = add_128(v[1], v[5]); v[2] = add_128(v[2], v[6]); v[3] = add_128(v[3], v[7]); v[12] = xor_128(v[12], v[0]); v[13] = xor_128(v[13], v[1]); v[14] = xor_128(v[14], v[2]); v[15] = xor_128(v[15], v[3]); v[12] = rot16_128(v[12]); v[13] = rot16_128(v[13]); v[14] = rot16_128(v[14]); v[15] = rot16_128(v[15]); v[8] = add_128(v[8], v[12]); v[9] = add_128(v[9], v[13]); v[10] = add_128(v[10], v[14]); v[11] = add_128(v[11], v[15]); v[4] = xor_128(v[4], v[8]); v[5] = xor_128(v[5], v[9]); v[6] = xor_128(v[6], v[10]); v[7] = xor_128(v[7], v[11]); v[4] = rot12_128(v[4]); v[5] = rot12_128(v[5]); v[6] = rot12_128(v[6]); v[7] = rot12_128(v[7]); v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); v[2] = add_128(v[2], 
m[(size_t)MSG_SCHEDULE[r][5]]); v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); v[0] = add_128(v[0], v[4]); v[1] = add_128(v[1], v[5]); v[2] = add_128(v[2], v[6]); v[3] = add_128(v[3], v[7]); v[12] = xor_128(v[12], v[0]); v[13] = xor_128(v[13], v[1]); v[14] = xor_128(v[14], v[2]); v[15] = xor_128(v[15], v[3]); v[12] = rot8_128(v[12]); v[13] = rot8_128(v[13]); v[14] = rot8_128(v[14]); v[15] = rot8_128(v[15]); v[8] = add_128(v[8], v[12]); v[9] = add_128(v[9], v[13]); v[10] = add_128(v[10], v[14]); v[11] = add_128(v[11], v[15]); v[4] = xor_128(v[4], v[8]); v[5] = xor_128(v[5], v[9]); v[6] = xor_128(v[6], v[10]); v[7] = xor_128(v[7], v[11]); v[4] = rot7_128(v[4]); v[5] = rot7_128(v[5]); v[6] = rot7_128(v[6]); v[7] = rot7_128(v[7]); v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); v[0] = add_128(v[0], v[5]); v[1] = add_128(v[1], v[6]); v[2] = add_128(v[2], v[7]); v[3] = add_128(v[3], v[4]); v[15] = xor_128(v[15], v[0]); v[12] = xor_128(v[12], v[1]); v[13] = xor_128(v[13], v[2]); v[14] = xor_128(v[14], v[3]); v[15] = rot16_128(v[15]); v[12] = rot16_128(v[12]); v[13] = rot16_128(v[13]); v[14] = rot16_128(v[14]); v[10] = add_128(v[10], v[15]); v[11] = add_128(v[11], v[12]); v[8] = add_128(v[8], v[13]); v[9] = add_128(v[9], v[14]); v[5] = xor_128(v[5], v[10]); v[6] = xor_128(v[6], v[11]); v[7] = xor_128(v[7], v[8]); v[4] = xor_128(v[4], v[9]); v[5] = rot12_128(v[5]); v[6] = rot12_128(v[6]); v[7] = rot12_128(v[7]); v[4] = rot12_128(v[4]); v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); v[0] = add_128(v[0], v[5]); v[1] = add_128(v[1], v[6]); v[2] = add_128(v[2], v[7]); v[3] = add_128(v[3], v[4]); v[15] = xor_128(v[15], v[0]); v[12] = 
xor_128(v[12], v[1]); v[13] = xor_128(v[13], v[2]); v[14] = xor_128(v[14], v[3]); v[15] = rot8_128(v[15]); v[12] = rot8_128(v[12]); v[13] = rot8_128(v[13]); v[14] = rot8_128(v[14]); v[10] = add_128(v[10], v[15]); v[11] = add_128(v[11], v[12]); v[8] = add_128(v[8], v[13]); v[9] = add_128(v[9], v[14]); v[5] = xor_128(v[5], v[10]); v[6] = xor_128(v[6], v[11]); v[7] = xor_128(v[7], v[8]); v[4] = xor_128(v[4], v[9]); v[5] = rot7_128(v[5]); v[6] = rot7_128(v[6]); v[7] = rot7_128(v[7]); v[4] = rot7_128(v[4]); } INLINE void transpose_vecs_128(uint32x4_t vecs[4]) { // Individually transpose the four 2x2 sub-matrices in each corner. uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]); uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]); // Swap the top-right and bottom-left 2x2s (which just got transposed). vecs[0] = vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0])); vecs[1] = vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1])); vecs[2] = vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0])); vecs[3] = vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1])); } INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, size_t block_offset, uint32x4_t out[16]) { out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]); out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]); out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]); out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]); out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]); out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]); out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]); out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]); out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]); out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]); out[10] = loadu_128(&inputs[2][block_offset + 
2 * sizeof(uint32x4_t)]); out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]); out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]); out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]); out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]); out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]); transpose_vecs_128(&out[0]); transpose_vecs_128(&out[4]); transpose_vecs_128(&out[8]); transpose_vecs_128(&out[12]); } INLINE void load_counters4(uint64_t counter, bool increment_counter, uint32x4_t *out_low, uint32x4_t *out_high) { uint64_t mask = (increment_counter ? ~0 : 0); *out_low = set4( counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3))); *out_high = set4( counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3))); } void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { uint32x4_t h_vecs[8] = { set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), }; uint32x4_t counter_low_vec, counter_high_vec; load_counters4(counter, increment_counter, &counter_low_vec, &counter_high_vec); uint8_t block_flags = flags | flags_start; for (size_t block = 0; block < blocks; block++) { if (block + 1 == blocks) { block_flags |= flags_end; } uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN); uint32x4_t block_flags_vec = set1_128(block_flags); uint32x4_t msg_vecs[16]; transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); uint32x4_t v[16] = { h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), 
set1_128(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn4(v, msg_vecs, 0); round_fn4(v, msg_vecs, 1); round_fn4(v, msg_vecs, 2); round_fn4(v, msg_vecs, 3); round_fn4(v, msg_vecs, 4); round_fn4(v, msg_vecs, 5); round_fn4(v, msg_vecs, 6); h_vecs[0] = xor_128(v[0], v[8]); h_vecs[1] = xor_128(v[1], v[9]); h_vecs[2] = xor_128(v[2], v[10]); h_vecs[3] = xor_128(v[3], v[11]); h_vecs[4] = xor_128(v[4], v[12]); h_vecs[5] = xor_128(v[5], v[13]); h_vecs[6] = xor_128(v[6], v[14]); h_vecs[7] = xor_128(v[7], v[15]); block_flags = flags; } transpose_vecs_128(&h_vecs[0]); transpose_vecs_128(&h_vecs[4]); // The first four vecs now contain the first half of each output, and the // second four vecs contain the second half of each output. storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]); storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]); storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]); storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]); storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]); storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]); storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]); storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]); } /* * ---------------------------------------------------------------------------- * hash_many_neon * ---------------------------------------------------------------------------- */ void blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); INLINE void hash_one_neon(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); uint8_t block_flags = flags | flags_start; while (blocks > 0) { if (blocks == 1) { block_flags |= flags_end; } // TODO: Implement compress_neon. 
However note that according to // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227, // compress_neon might not be any faster than compress_portable. blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; } memcpy(out, cv, BLAKE3_OUT_LEN); } void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { while (num_inputs >= 4) { blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += 4; } inputs += 4; num_inputs -= 4; out = &out[4 * BLAKE3_OUT_LEN]; } while (num_inputs > 0) { hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += 1; } inputs += 1; num_inputs -= 1; out = &out[BLAKE3_OUT_LEN]; } } blake3-1.8.1/c/blake3_portable.c000064400000000000000000000134011046102023000144120ustar 00000000000000#include "blake3_impl.h" #include INLINE uint32_t rotr32(uint32_t w, uint32_t c) { return (w >> c) | (w << (32 - c)); } INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, uint32_t x, uint32_t y) { state[a] = state[a] + state[b] + x; state[d] = rotr32(state[d] ^ state[a], 16); state[c] = state[c] + state[d]; state[b] = rotr32(state[b] ^ state[c], 12); state[a] = state[a] + state[b] + y; state[d] = rotr32(state[d] ^ state[a], 8); state[c] = state[c] + state[d]; state[b] = rotr32(state[b] ^ state[c], 7); } INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { // Select the message schedule based on the round. const uint8_t *schedule = MSG_SCHEDULE[round]; // Mix the columns. 
g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); // Mix the rows. g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); } INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { uint32_t block_words[16]; block_words[0] = load32(block + 4 * 0); block_words[1] = load32(block + 4 * 1); block_words[2] = load32(block + 4 * 2); block_words[3] = load32(block + 4 * 3); block_words[4] = load32(block + 4 * 4); block_words[5] = load32(block + 4 * 5); block_words[6] = load32(block + 4 * 6); block_words[7] = load32(block + 4 * 7); block_words[8] = load32(block + 4 * 8); block_words[9] = load32(block + 4 * 9); block_words[10] = load32(block + 4 * 10); block_words[11] = load32(block + 4 * 11); block_words[12] = load32(block + 4 * 12); block_words[13] = load32(block + 4 * 13); block_words[14] = load32(block + 4 * 14); block_words[15] = load32(block + 4 * 15); state[0] = cv[0]; state[1] = cv[1]; state[2] = cv[2]; state[3] = cv[3]; state[4] = cv[4]; state[5] = cv[5]; state[6] = cv[6]; state[7] = cv[7]; state[8] = IV[0]; state[9] = IV[1]; state[10] = IV[2]; state[11] = IV[3]; state[12] = counter_low(counter); state[13] = counter_high(counter); state[14] = (uint32_t)block_len; state[15] = (uint32_t)flags; round_fn(state, &block_words[0], 0); round_fn(state, &block_words[0], 1); round_fn(state, &block_words[0], 2); round_fn(state, &block_words[0], 3); round_fn(state, &block_words[0], 4); round_fn(state, &block_words[0], 5); round_fn(state, &block_words[0], 6); } void blake3_compress_in_place_portable(uint32_t 
cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { uint32_t state[16]; compress_pre(state, cv, block, block_len, counter, flags); cv[0] = state[0] ^ state[8]; cv[1] = state[1] ^ state[9]; cv[2] = state[2] ^ state[10]; cv[3] = state[3] ^ state[11]; cv[4] = state[4] ^ state[12]; cv[5] = state[5] ^ state[13]; cv[6] = state[6] ^ state[14]; cv[7] = state[7] ^ state[15]; } void blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) { uint32_t state[16]; compress_pre(state, cv, block, block_len, counter, flags); store32(&out[0 * 4], state[0] ^ state[8]); store32(&out[1 * 4], state[1] ^ state[9]); store32(&out[2 * 4], state[2] ^ state[10]); store32(&out[3 * 4], state[3] ^ state[11]); store32(&out[4 * 4], state[4] ^ state[12]); store32(&out[5 * 4], state[5] ^ state[13]); store32(&out[6 * 4], state[6] ^ state[14]); store32(&out[7 * 4], state[7] ^ state[15]); store32(&out[8 * 4], state[8] ^ cv[0]); store32(&out[9 * 4], state[9] ^ cv[1]); store32(&out[10 * 4], state[10] ^ cv[2]); store32(&out[11 * 4], state[11] ^ cv[3]); store32(&out[12 * 4], state[12] ^ cv[4]); store32(&out[13 * 4], state[13] ^ cv[5]); store32(&out[14 * 4], state[14] ^ cv[6]); store32(&out[15 * 4], state[15] ^ cv[7]); } INLINE void hash_one_portable(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); uint8_t block_flags = flags | flags_start; while (blocks > 0) { if (blocks == 1) { block_flags |= flags_end; } blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; } store_cv_words(out, cv); } void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t 
key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { while (num_inputs > 0) { hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += 1; } inputs += 1; num_inputs -= 1; out = &out[BLAKE3_OUT_LEN]; } } blake3-1.8.1/c/blake3_sse2.c000064400000000000000000000510151046102023000134610ustar 00000000000000#include "blake3_impl.h" #include #define DEGREE 4 #define _mm_shuffle_ps2(a, b, c) \ (_mm_castps_si128( \ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) INLINE __m128i loadu(const uint8_t src[16]) { return _mm_loadu_si128((const __m128i *)src); } INLINE void storeu(__m128i src, uint8_t dest[16]) { _mm_storeu_si128((__m128i *)dest, src); } INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } // Note that clang-format doesn't like the name "xor" for some reason. INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); } INLINE __m128i rot16(__m128i x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1); } INLINE __m128i rot12(__m128i x) { return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); } INLINE __m128i rot8(__m128i x) { return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8)); } INLINE __m128i rot7(__m128i x) { return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); } INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { *row0 = addv(addv(*row0, m), *row1); *row3 = xorv(*row3, *row0); *row3 = rot16(*row3); *row2 = addv(*row2, *row3); *row1 = xorv(*row1, *row2); *row1 = rot12(*row1); } INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { *row0 = addv(addv(*row0, m), 
*row1); *row3 = xorv(*row3, *row0); *row3 = rot8(*row3); *row2 = addv(*row2, *row3); *row1 = xorv(*row1, *row2); *row1 = rot7(*row1); } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. All the message loads below are adjusted to compensate for this. See // discussion at https://github.com/sneves/blake2-avx2/pull/4 INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); } INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); } INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) { const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); __m128i mask = _mm_set1_epi16(imm8); mask = _mm_and_si128(mask, bits); mask = _mm_cmpeq_epi16(mask, bits); return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); } INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { rows[0] = loadu((uint8_t *)&cv[0]); rows[1] = loadu((uint8_t *)&cv[4]); rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); rows[3] = set4(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags); __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); __m128i t0, t1, t2, t3, tt; // Round 1. The first round permutes the message words from the original // input order, into the groups that get mixed in parallel. 
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 2. This round and all following rounds apply a fixed permutation // to the message words from the round before. t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 3 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); 
diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 4 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 5 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); 
undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 6 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 7 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); } void blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); } void 
blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu(xorv(rows[0], rows[2]), &out[0]); storeu(xorv(rows[1], rows[3]), &out[16]); storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); } INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); v[0] = addv(v[0], v[4]); v[1] = addv(v[1], v[5]); v[2] = addv(v[2], v[6]); v[3] = addv(v[3], v[7]); v[12] = xorv(v[12], v[0]); v[13] = xorv(v[13], v[1]); v[14] = xorv(v[14], v[2]); v[15] = xorv(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = addv(v[8], v[12]); v[9] = addv(v[9], v[13]); v[10] = addv(v[10], v[14]); v[11] = addv(v[11], v[15]); v[4] = xorv(v[4], v[8]); v[5] = xorv(v[5], v[9]); v[6] = xorv(v[6], v[10]); v[7] = xorv(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); v[0] = addv(v[0], v[4]); v[1] = addv(v[1], v[5]); v[2] = addv(v[2], v[6]); v[3] = addv(v[3], v[7]); v[12] = xorv(v[12], v[0]); v[13] = xorv(v[13], v[1]); v[14] = xorv(v[14], v[2]); v[15] = xorv(v[15], v[3]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[15] = rot8(v[15]); v[8] = addv(v[8], v[12]); v[9] = addv(v[9], v[13]); v[10] = addv(v[10], v[14]); v[11] = addv(v[11], v[15]); v[4] = xorv(v[4], v[8]); v[5] = xorv(v[5], v[9]); v[6] = xorv(v[6], v[10]); v[7] = xorv(v[7], v[11]); v[4] = 
rot7(v[4]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); v[0] = addv(v[0], v[5]); v[1] = addv(v[1], v[6]); v[2] = addv(v[2], v[7]); v[3] = addv(v[3], v[4]); v[15] = xorv(v[15], v[0]); v[12] = xorv(v[12], v[1]); v[13] = xorv(v[13], v[2]); v[14] = xorv(v[14], v[3]); v[15] = rot16(v[15]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[10] = addv(v[10], v[15]); v[11] = addv(v[11], v[12]); v[8] = addv(v[8], v[13]); v[9] = addv(v[9], v[14]); v[5] = xorv(v[5], v[10]); v[6] = xorv(v[6], v[11]); v[7] = xorv(v[7], v[8]); v[4] = xorv(v[4], v[9]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[4] = rot12(v[4]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); v[0] = addv(v[0], v[5]); v[1] = addv(v[1], v[6]); v[2] = addv(v[2], v[7]); v[3] = addv(v[3], v[4]); v[15] = xorv(v[15], v[0]); v[12] = xorv(v[12], v[1]); v[13] = xorv(v[13], v[2]); v[14] = xorv(v[14], v[3]); v[15] = rot8(v[15]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[10] = addv(v[10], v[15]); v[11] = addv(v[11], v[12]); v[8] = addv(v[8], v[13]); v[9] = addv(v[9], v[14]); v[5] = xorv(v[5], v[10]); v[6] = xorv(v[6], v[11]); v[7] = xorv(v[7], v[8]); v[4] = xorv(v[4], v[9]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[4] = rot7(v[4]); } INLINE void transpose_vecs(__m128i vecs[DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is // 22/33. Note that this doesn't split the vector into two lanes, as the // AVX2 counterparts do. 
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); // Interleave 64-bit lanes. __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); vecs[0] = abcd_0; vecs[1] = abcd_1; vecs[2] = abcd_2; vecs[3] = abcd_3; } INLINE void transpose_msg_vecs(const uint8_t *const *inputs, size_t block_offset, __m128i out[16]) { out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); for (size_t i = 0; i < 4; ++i) { _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); } transpose_vecs(&out[0]); transpose_vecs(&out[4]); transpose_vecs(&out[8]); transpose_vecs(&out[12]); } INLINE void load_counters(uint64_t counter, bool increment_counter, __m128i *out_lo, __m128i *out_hi) { const __m128i mask = 
_mm_set1_epi32(-(int32_t)increment_counter); const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); const __m128i add1 = _mm_and_si128(mask, add0); __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); *out_lo = l; *out_hi = h; } static void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m128i h_vecs[8] = { set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), }; __m128i counter_low_vec, counter_high_vec; load_counters(counter, increment_counter, &counter_low_vec, &counter_high_vec); uint8_t block_flags = flags | flags_start; for (size_t block = 0; block < blocks; block++) { if (block + 1 == blocks) { block_flags |= flags_end; } __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); __m128i block_flags_vec = set1(block_flags); __m128i msg_vecs[16]; transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); __m128i v[16] = { h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn(v, msg_vecs, 0); round_fn(v, msg_vecs, 1); round_fn(v, msg_vecs, 2); round_fn(v, msg_vecs, 3); round_fn(v, msg_vecs, 4); round_fn(v, msg_vecs, 5); round_fn(v, msg_vecs, 6); h_vecs[0] = xorv(v[0], v[8]); h_vecs[1] = xorv(v[1], v[9]); h_vecs[2] = xorv(v[2], v[10]); h_vecs[3] = xorv(v[3], v[11]); h_vecs[4] = xorv(v[4], v[12]); h_vecs[5] = xorv(v[5], v[13]); h_vecs[6] = xorv(v[6], v[14]); h_vecs[7] = xorv(v[7], v[15]); block_flags = flags; } transpose_vecs(&h_vecs[0]); transpose_vecs(&h_vecs[4]); // The first four vecs 
now contain the first half of each output, and the // second four vecs contain the second half of each output. storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); } INLINE void hash_one_sse2(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); uint8_t block_flags = flags | flags_start; while (blocks > 0) { if (blocks == 1) { block_flags |= flags_end; } blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; } memcpy(out, cv, BLAKE3_OUT_LEN); } void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { while (num_inputs >= DEGREE) { blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += DEGREE; } inputs += DEGREE; num_inputs -= DEGREE; out = &out[DEGREE * BLAKE3_OUT_LEN]; } while (num_inputs > 0) { hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += 1; } inputs += 1; num_inputs -= 1; out = &out[BLAKE3_OUT_LEN]; } } blake3-1.8.1/c/blake3_sse2_x86-64_unix.S000064400000000000000000002063721046102023000154500ustar 00000000000000#if defined(__ELF__) && defined(__linux__) .section .note.GNU-stack,"",%progbits #endif #if defined(__ELF__) && defined(__CET__) && defined(__has_include) #if 
__has_include() #include #endif #endif #if !defined(_CET_ENDBR) #define _CET_ENDBR #endif .intel_syntax noprefix .global blake3_hash_many_sse2 .global _blake3_hash_many_sse2 .global blake3_compress_in_place_sse2 .global _blake3_compress_in_place_sse2 .global blake3_compress_xof_sse2 .global _blake3_compress_xof_sse2 #ifdef __APPLE__ .text #else .section .text #endif .p2align 6 _blake3_hash_many_sse2: blake3_hash_many_sse2: _CET_ENDBR push r15 push r14 push r13 push r12 push rbx push rbp mov rbp, rsp sub rsp, 360 and rsp, 0xFFFFFFFFFFFFFFC0 neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 0x00 movdqa xmmword ptr [rsp+0x130], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0+rip] pand xmm0, xmmword ptr [ADD1+rip] movdqa xmmword ptr [rsp+0x150], xmm0 movd xmm0, r8d pshufd xmm0, xmm0, 0x00 paddd xmm0, xmm1 movdqa xmmword ptr [rsp+0x110], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 0x00 psubd xmm2, xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 mov rbx, qword ptr [rbp+0x50] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+0x38] movzx r12d, byte ptr [rbp+0x48] cmp rsi, 4 jc 3f 2: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 0x00 pshufd xmm1, xmm3, 0x55 pshufd xmm2, xmm3, 0xAA pshufd xmm3, xmm3, 0xFF movdqu xmm7, xmmword ptr [rcx+0x10] pshufd xmm4, xmm7, 0x00 pshufd xmm5, xmm7, 0x55 pshufd xmm6, xmm7, 0xAA pshufd xmm7, xmm7, 0xFF mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 9: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-0x40] movdqu xmm9, xmmword ptr [r9+rdx-0x40] movdqu xmm10, xmmword ptr [r10+rdx-0x40] movdqu xmm11, xmmword ptr [r11+rdx-0x40] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 
punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+0x10], xmm9 movdqa xmmword ptr [rsp+0x20], xmm12 movdqa xmmword ptr [rsp+0x30], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x30] movdqu xmm9, xmmword ptr [r9+rdx-0x30] movdqu xmm10, xmmword ptr [r10+rdx-0x30] movdqu xmm11, xmmword ptr [r11+rdx-0x30] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x40], xmm8 movdqa xmmword ptr [rsp+0x50], xmm9 movdqa xmmword ptr [rsp+0x60], xmm12 movdqa xmmword ptr [rsp+0x70], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x20] movdqu xmm9, xmmword ptr [r9+rdx-0x20] movdqu xmm10, xmmword ptr [r10+rdx-0x20] movdqu xmm11, xmmword ptr [r11+rdx-0x20] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x80], xmm8 movdqa xmmword ptr [rsp+0x90], xmm9 movdqa xmmword ptr [rsp+0xA0], xmm12 movdqa xmmword ptr [rsp+0xB0], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x10] movdqu xmm9, xmmword ptr [r9+rdx-0x10] movdqu xmm10, xmmword ptr [r10+rdx-0x10] movdqu xmm11, xmmword ptr [r11+rdx-0x10] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0xC0], xmm8 movdqa xmmword ptr [rsp+0xD0], xmm9 movdqa xmmword ptr [rsp+0xE0], xmm12 movdqa xmmword ptr [rsp+0xF0], xmm13 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] 
movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] movdqa xmm12, xmmword ptr [rsp+0x110] movdqa xmm13, xmmword ptr [rsp+0x120] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] movd xmm15, eax pshufd xmm15, xmm15, 0x00 prefetcht0 [r8+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r11+rdx+0x80] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 
pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x80] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 
movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x70] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa 
xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 
por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld 
xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xB0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x50] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa 
xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 
por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 
7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xC0] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xA0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd 
xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, 
xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword 
ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0x60] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr 
[rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xF0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 
paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne 9b movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, 
xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword ptr [rbx], xmm0 movdqu xmmword ptr [rbx+0x20], xmm1 movdqu xmmword ptr [rbx+0x40], xmm9 movdqu xmmword ptr [rbx+0x60], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+0x10], xmm4 movdqu xmmword ptr [rbx+0x30], xmm5 movdqu xmmword ptr [rbx+0x50], xmm9 movdqu xmmword ptr [rbx+0x70], xmm7 movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm0, xmm1 paddd xmm1, xmmword ptr [rsp+0x150] movdqa xmmword ptr [rsp+0x110], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+0x120] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+0x120], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc 2b test rsi, rsi jnz 3f 4: mov rsp, rbp pop rbp pop rbx pop r12 pop r13 pop r14 pop r15 ret .p2align 5 3: test esi, 0x2 je 3f movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+0x110] movd xmm14, dword ptr [rsp+0x120] punpckldq xmm13, xmm14 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+0x114] movd xmm13, dword ptr [rsp+0x124] punpckldq xmm14, xmm13 movaps xmmword ptr [rsp+0x10], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword 
ptr [r8+rdx-0x10] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 0x93 movups xmm12, xmmword ptr [r9+rdx-0x40] movups xmm13, xmmword ptr [r9+rdx-0x30] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-0x20] movups xmm15, xmmword ptr [r9+rdx-0x10] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 0x93 shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 0x93 shl rax, 0x20 or rax, 0x40 movq xmm3, rax movdqa xmmword ptr [rsp+0x20], xmm3 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+0x10] punpcklqdq xmm3, xmmword ptr [rsp+0x20] punpcklqdq xmm11, xmmword ptr [rsp+0x20] mov al, 7 9: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+0x20], xmm4 movaps xmmword ptr [rsp+0x30], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 pshuflw xmm11, xmm11, 0xB1 pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+0x40], xmm5 movaps xmmword ptr [rsp+0x50], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x93 pshufd xmm8, xmm8, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x39 pshufd xmm10, xmm10, 0x39 paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0xB1 
pshufhw xmm3, xmm3, 0xB1 pshuflw xmm11, xmm11, 0xB1 pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x39 pshufd xmm8, xmm8, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x93 pshufd xmm10, xmm10, 0x93 dec al je 9f movdqa xmm12, xmmword ptr [rsp+0x20] movdqa xmm5, xmmword ptr [rsp+0x40] pshufd xmm13, xmm12, 0x0F shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm13, xmm12 movdqa xmmword ptr [rsp+0x20], xmm13 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 movdqa xmm13, xmm6 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm12, xmm13 pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmmword ptr [rsp+0x40], xmm12 movdqa xmm5, xmmword ptr [rsp+0x30] movdqa xmm13, xmmword ptr [rsp+0x50] pshufd xmm6, xmm5, 0x0F shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm6, xmm5 movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 movdqa xmmword ptr [rsp+0x30], xmm2 movdqa xmm2, xmm14 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] 
por xmm5, xmm2 movdqa xmm2, xmmword ptr [rsp+0x30] pshufd xmm5, xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 0x1E movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+0x20] movdqa xmm6, xmmword ptr [rsp+0x40] jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 movups xmmword ptr [rbx+0x20], xmm8 movups xmmword ptr [rbx+0x30], xmm9 mov eax, dword ptr [rsp+0x130] neg eax mov r10d, dword ptr [rsp+0x110+8*rax] mov r11d, dword ptr [rsp+0x120+8*rax] mov dword ptr [rsp+0x110], r10d mov dword ptr [rsp+0x120], r11d add rdi, 16 add rbx, 64 sub rsi, 2 3: test esi, 0x1 je 4b movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movd xmm13, dword ptr [rsp+0x110] movd xmm14, dword ptr [rsp+0x120] punpckldq xmm13, xmm14 mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] shl rax, 32 or rax, 64 movq xmm12, rax movdqa xmm3, xmm13 punpcklqdq xmm3, xmm12 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E 
pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm10, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 jmp 4b .p2align 6 blake3_compress_in_place_sse2: _blake3_compress_in_place_sse2: _CET_ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] shl r8, 32 add rdx, r8 movq xmm3, rcx movq xmm4, rdx punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rsi] movups xmm5, xmmword ptr [rsi+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rsi+0x20] movups xmm7, xmmword ptr [rsi+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld 
xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm10, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 movups xmmword ptr [rdi], xmm0 movups xmmword ptr [rdi+0x10], xmm1 ret .p2align 6 blake3_compress_xof_sse2: _blake3_compress_xof_sse2: _CET_ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movzx eax, r8b movzx edx, dl shl rax, 32 add rdx, rax movq xmm3, rcx movq xmm4, rdx punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rsi] movups xmm5, xmmword ptr [rsi+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rsi+0x20] movups xmm7, xmmword ptr [rsi+0x30] movaps xmm8, xmm6 shufps xmm6, 
xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm10, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: movdqu xmm4, xmmword ptr [rdi] movdqu xmm5, xmmword ptr [rdi+0x10] pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm2, xmm4 pxor xmm3, xmm5 movups xmmword ptr [r9], xmm0 movups xmmword ptr [r9+0x10], xmm1 movups xmmword ptr [r9+0x20], xmm2 movups xmmword ptr [r9+0x30], xmm3 ret #ifdef __APPLE__ .static_data #else .section .rodata #endif .p2align 6 BLAKE3_IV: .long 0x6A09E667, 
0xBB67AE85 .long 0x3C6EF372, 0xA54FF53A ADD0: .long 0, 1, 2, 3 ADD1: .long 4, 4, 4, 4 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 64, 64, 64, 64 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 PBLENDW_0x33_MASK: .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 PBLENDW_0xCC_MASK: .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF PBLENDW_0x3F_MASK: .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 PBLENDW_0xC0_MASK: .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF blake3-1.8.1/c/blake3_sse2_x86-64_windows_gnu.S000064400000000000000000002130421046102023000170200ustar 00000000000000.intel_syntax noprefix .global blake3_hash_many_sse2 .global _blake3_hash_many_sse2 .global blake3_compress_in_place_sse2 .global _blake3_compress_in_place_sse2 .global blake3_compress_xof_sse2 .global _blake3_compress_xof_sse2 .section .text .p2align 6 _blake3_hash_many_sse2: blake3_hash_many_sse2: push r15 push r14 push r13 push r12 push rsi push rdi push rbx push rbp mov rbp, rsp sub rsp, 528 and rsp, 0xFFFFFFFFFFFFFFC0 movdqa xmmword ptr [rsp+0x170], xmm6 movdqa xmmword ptr [rsp+0x180], xmm7 movdqa xmmword ptr [rsp+0x190], xmm8 movdqa xmmword ptr [rsp+0x1A0], xmm9 movdqa xmmword ptr [rsp+0x1B0], xmm10 movdqa xmmword ptr [rsp+0x1C0], xmm11 movdqa xmmword ptr [rsp+0x1D0], xmm12 movdqa xmmword ptr [rsp+0x1E0], xmm13 movdqa xmmword ptr [rsp+0x1F0], xmm14 movdqa xmmword ptr [rsp+0x200], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+0x68] movzx r9, byte ptr [rbp+0x70] neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 0x00 movdqa xmmword ptr [rsp+0x130], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0+rip] pand xmm0, xmmword ptr [ADD1+rip] movdqa xmmword ptr [rsp+0x150], xmm0 movd xmm0, r8d 
pshufd xmm0, xmm0, 0x00 paddd xmm0, xmm1 movdqa xmmword ptr [rsp+0x110], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 0x00 psubd xmm2, xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 mov rbx, qword ptr [rbp+0x90] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+0x78] movzx r12d, byte ptr [rbp+0x88] cmp rsi, 4 jc 3f 2: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 0x00 pshufd xmm1, xmm3, 0x55 pshufd xmm2, xmm3, 0xAA pshufd xmm3, xmm3, 0xFF movdqu xmm7, xmmword ptr [rcx+0x10] pshufd xmm4, xmm7, 0x00 pshufd xmm5, xmm7, 0x55 pshufd xmm6, xmm7, 0xAA pshufd xmm7, xmm7, 0xFF mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx 9: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-0x40] movdqu xmm9, xmmword ptr [r9+rdx-0x40] movdqu xmm10, xmmword ptr [r10+rdx-0x40] movdqu xmm11, xmmword ptr [r11+rdx-0x40] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+0x10], xmm9 movdqa xmmword ptr [rsp+0x20], xmm12 movdqa xmmword ptr [rsp+0x30], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x30] movdqu xmm9, xmmword ptr [r9+rdx-0x30] movdqu xmm10, xmmword ptr [r10+rdx-0x30] movdqu xmm11, xmmword ptr [r11+rdx-0x30] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x40], xmm8 movdqa xmmword ptr [rsp+0x50], xmm9 movdqa xmmword ptr 
[rsp+0x60], xmm12 movdqa xmmword ptr [rsp+0x70], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x20] movdqu xmm9, xmmword ptr [r9+rdx-0x20] movdqu xmm10, xmmword ptr [r10+rdx-0x20] movdqu xmm11, xmmword ptr [r11+rdx-0x20] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x80], xmm8 movdqa xmmword ptr [rsp+0x90], xmm9 movdqa xmmword ptr [rsp+0xA0], xmm12 movdqa xmmword ptr [rsp+0xB0], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x10] movdqu xmm9, xmmword ptr [r9+rdx-0x10] movdqu xmm10, xmmword ptr [r10+rdx-0x10] movdqu xmm11, xmmword ptr [r11+rdx-0x10] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0xC0], xmm8 movdqa xmmword ptr [rsp+0xD0], xmm9 movdqa xmmword ptr [rsp+0xE0], xmm12 movdqa xmmword ptr [rsp+0xF0], xmm13 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] movdqa xmm12, xmmword ptr [rsp+0x110] movdqa xmm13, xmmword ptr [rsp+0x120] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] movd xmm15, eax pshufd xmm15, xmm15, 0x00 prefetcht0 [r8+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r11+rdx+0x80] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 
pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x80] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 
pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x70] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw 
xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd 
xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd 
xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xB0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd 
xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x50] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 
pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 
movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xC0] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa 
xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xA0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por 
xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld 
xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 
movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0x60] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld 
xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xF0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 
psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne 9b movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword ptr [rbx], xmm0 movdqu xmmword ptr [rbx+0x20], xmm1 movdqu xmmword ptr [rbx+0x40], xmm9 movdqu xmmword ptr [rbx+0x60], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+0x10], xmm4 movdqu xmmword ptr [rbx+0x30], xmm5 movdqu xmmword ptr [rbx+0x50], xmm9 movdqu xmmword ptr [rbx+0x70], xmm7 movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm0, xmm1 
paddd xmm1, xmmword ptr [rsp+0x150] movdqa xmmword ptr [rsp+0x110], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+0x120] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+0x120], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc 2b test rsi, rsi jne 3f 4: movdqa xmm6, xmmword ptr [rsp+0x170] movdqa xmm7, xmmword ptr [rsp+0x180] movdqa xmm8, xmmword ptr [rsp+0x190] movdqa xmm9, xmmword ptr [rsp+0x1A0] movdqa xmm10, xmmword ptr [rsp+0x1B0] movdqa xmm11, xmmword ptr [rsp+0x1C0] movdqa xmm12, xmmword ptr [rsp+0x1D0] movdqa xmm13, xmmword ptr [rsp+0x1E0] movdqa xmm14, xmmword ptr [rsp+0x1F0] movdqa xmm15, xmmword ptr [rsp+0x200] mov rsp, rbp pop rbp pop rbx pop rdi pop rsi pop r12 pop r13 pop r14 pop r15 ret .p2align 5 3: test esi, 0x2 je 3f movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+0x110] movd xmm14, dword ptr [rsp+0x120] punpckldq xmm13, xmm14 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+0x114] movd xmm13, dword ptr [rsp+0x124] punpckldq xmm14, xmm13 movaps xmmword ptr [rsp+0x10], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 0x93 movups xmm12, xmmword ptr [r9+rdx-0x40] movups xmm13, xmmword ptr [r9+rdx-0x30] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-0x20] movups 
xmm15, xmmword ptr [r9+rdx-0x10] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 0x93 shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 0x93 shl rax, 0x20 or rax, 0x40 movq xmm3, rax movdqa xmmword ptr [rsp+0x20], xmm3 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+0x10] punpcklqdq xmm3, xmmword ptr [rsp+0x20] punpcklqdq xmm11, xmmword ptr [rsp+0x20] mov al, 7 9: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+0x20], xmm4 movaps xmmword ptr [rsp+0x30], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 pshuflw xmm11, xmm11, 0xB1 pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+0x40], xmm5 movaps xmmword ptr [rsp+0x50], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x93 pshufd xmm8, xmm8, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x39 pshufd xmm10, xmm10, 0x39 paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 pshuflw xmm11, xmm11, 0xB1 pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor 
xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x39 pshufd xmm8, xmm8, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x93 pshufd xmm10, xmm10, 0x93 dec al je 9f movdqa xmm12, xmmword ptr [rsp+0x20] movdqa xmm5, xmmword ptr [rsp+0x40] pshufd xmm13, xmm12, 0x0F shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm13, xmm12 movdqa xmmword ptr [rsp+0x20], xmm13 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 movdqa xmm13, xmm6 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm12, xmm13 pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmmword ptr [rsp+0x40], xmm12 movdqa xmm5, xmmword ptr [rsp+0x30] movdqa xmm13, xmmword ptr [rsp+0x50] pshufd xmm6, xmm5, 0x0F shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm6, xmm5 movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 movdqa xmmword ptr [rsp+0x30], xmm2 movdqa xmm2, xmm14 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm5, xmm2 movdqa xmm2, xmmword ptr [rsp+0x30] pshufd xmm5, xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 0x1E movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+0x20] movdqa xmm6, xmmword ptr [rsp+0x40] jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d 
cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 movups xmmword ptr [rbx+0x20], xmm8 movups xmmword ptr [rbx+0x30], xmm9 mov eax, dword ptr [rsp+0x130] neg eax mov r10d, dword ptr [rsp+0x110+8*rax] mov r11d, dword ptr [rsp+0x120+8*rax] mov dword ptr [rsp+0x110], r10d mov dword ptr [rsp+0x120], r11d add rdi, 16 add rbx, 64 sub rsi, 2 3: test esi, 0x1 je 4b movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movd xmm13, dword ptr [rsp+0x110] movd xmm14, dword ptr [rsp+0x120] punpckldq xmm13, xmm14 mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] shl rax, 32 or rax, 64 movq xmm12, rax movdqa xmm3, xmm13 punpcklqdq xmm3, xmm12 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd 
xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm10, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 jmp 4b .p2align 6 blake3_compress_in_place_sse2: _blake3_compress_in_place_sse2: sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+0x10], xmm7 movdqa xmmword ptr [rsp+0x20], xmm8 movdqa xmmword ptr [rsp+0x30], xmm9 movdqa xmmword ptr [rsp+0x40], xmm11 movdqa xmmword ptr [rsp+0x50], xmm14 movdqa xmmword ptr [rsp+0x60], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movzx eax, byte ptr [rsp+0xA0] movzx r8d, r8b shl rax, 32 add r8, rax movq xmm3, r9 movq xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rdx+0x20] movups xmm7, xmmword ptr [rdx+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd 
xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm14, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm14 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 movups xmmword ptr [rcx], xmm0 movups xmmword ptr [rcx+0x10], xmm1 movdqa xmm6, xmmword ptr [rsp] movdqa xmm7, xmmword ptr [rsp+0x10] movdqa xmm8, xmmword ptr [rsp+0x20] movdqa xmm9, xmmword ptr [rsp+0x30] movdqa xmm11, xmmword ptr [rsp+0x40] movdqa xmm14, xmmword ptr [rsp+0x50] movdqa xmm15, xmmword ptr [rsp+0x60] add rsp, 120 ret .p2align 6 _blake3_compress_xof_sse2: blake3_compress_xof_sse2: sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+0x10], xmm7 movdqa xmmword ptr [rsp+0x20], xmm8 movdqa xmmword ptr [rsp+0x30], xmm9 movdqa xmmword ptr [rsp+0x40], xmm11 movdqa xmmword ptr [rsp+0x50], xmm14 movdqa xmmword ptr [rsp+0x60], xmm15 
movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movzx eax, byte ptr [rsp+0xA0] movzx r8d, r8b mov r10, qword ptr [rsp+0xA8] shl rax, 32 add r8, rax movq xmm3, r9 movq xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rdx+0x20] movups xmm7, xmmword ptr [rdx+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm14, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm14 
pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: movdqu xmm4, xmmword ptr [rcx] movdqu xmm5, xmmword ptr [rcx+0x10] pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm2, xmm4 pxor xmm3, xmm5 movups xmmword ptr [r10], xmm0 movups xmmword ptr [r10+0x10], xmm1 movups xmmword ptr [r10+0x20], xmm2 movups xmmword ptr [r10+0x30], xmm3 movdqa xmm6, xmmword ptr [rsp] movdqa xmm7, xmmword ptr [rsp+0x10] movdqa xmm8, xmmword ptr [rsp+0x20] movdqa xmm9, xmmword ptr [rsp+0x30] movdqa xmm11, xmmword ptr [rsp+0x40] movdqa xmm14, xmmword ptr [rsp+0x50] movdqa xmm15, xmmword ptr [rsp+0x60] add rsp, 120 ret .section .rdata .p2align 6 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 .long 0x3C6EF372, 0xA54FF53A ADD0: .long 0, 1, 2, 3 ADD1: .long 4, 4, 4, 4 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 64, 64, 64, 64 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 PBLENDW_0x33_MASK: .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 PBLENDW_0xCC_MASK: .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF PBLENDW_0x3F_MASK: .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 PBLENDW_0xC0_MASK: .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF blake3-1.8.1/c/blake3_sse2_x86-64_windows_msvc.asm000064400000000000000000002125061046102023000175610ustar 00000000000000public _blake3_hash_many_sse2 public blake3_hash_many_sse2 public blake3_compress_in_place_sse2 public _blake3_compress_in_place_sse2 public blake3_compress_xof_sse2 public _blake3_compress_xof_sse2 _TEXT SEGMENT ALIGN(16) 'CODE' ALIGN 16 blake3_hash_many_sse2 PROC _blake3_hash_many_sse2 PROC push r15 push r14 push r13 push r12 push rsi push rdi push rbx push rbp mov rbp, rsp sub rsp, 528 and rsp, 
0FFFFFFFFFFFFFFC0H movdqa xmmword ptr [rsp+170H], xmm6 movdqa xmmword ptr [rsp+180H], xmm7 movdqa xmmword ptr [rsp+190H], xmm8 movdqa xmmword ptr [rsp+1A0H], xmm9 movdqa xmmword ptr [rsp+1B0H], xmm10 movdqa xmmword ptr [rsp+1C0H], xmm11 movdqa xmmword ptr [rsp+1D0H], xmm12 movdqa xmmword ptr [rsp+1E0H], xmm13 movdqa xmmword ptr [rsp+1F0H], xmm14 movdqa xmmword ptr [rsp+200H], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+68H] movzx r9, byte ptr [rbp+70H] neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 00H movdqa xmmword ptr [rsp+130H], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0] pand xmm0, xmmword ptr [ADD1] movdqa xmmword ptr [rsp+150H], xmm0 movd xmm0, r8d pshufd xmm0, xmm0, 00H paddd xmm0, xmm1 movdqa xmmword ptr [rsp+110H], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK] pxor xmm1, xmmword ptr [CMP_MSB_MASK] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 00H psubd xmm2, xmm1 movdqa xmmword ptr [rsp+120H], xmm2 mov rbx, qword ptr [rbp+90H] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+78H] movzx r12d, byte ptr [rbp+88H] cmp rsi, 4 jc final3blocks outerloop4: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 00H pshufd xmm1, xmm3, 55H pshufd xmm2, xmm3, 0AAH pshufd xmm3, xmm3, 0FFH movdqu xmm7, xmmword ptr [rcx+10H] pshufd xmm4, xmm7, 00H pshufd xmm5, xmm7, 55H pshufd xmm6, xmm7, 0AAH pshufd xmm7, xmm7, 0FFH mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx innerloop4: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-40H] movdqu xmm9, xmmword ptr [r9+rdx-40H] movdqu xmm10, xmmword ptr [r10+rdx-40H] movdqu xmm11, xmmword ptr [r11+rdx-40H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 
punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+10H], xmm9 movdqa xmmword ptr [rsp+20H], xmm12 movdqa xmmword ptr [rsp+30H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-30H] movdqu xmm9, xmmword ptr [r9+rdx-30H] movdqu xmm10, xmmword ptr [r10+rdx-30H] movdqu xmm11, xmmword ptr [r11+rdx-30H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+40H], xmm8 movdqa xmmword ptr [rsp+50H], xmm9 movdqa xmmword ptr [rsp+60H], xmm12 movdqa xmmword ptr [rsp+70H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-20H] movdqu xmm9, xmmword ptr [r9+rdx-20H] movdqu xmm10, xmmword ptr [r10+rdx-20H] movdqu xmm11, xmmword ptr [r11+rdx-20H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+80H], xmm8 movdqa xmmword ptr [rsp+90H], xmm9 movdqa xmmword ptr [rsp+0A0H], xmm12 movdqa xmmword ptr [rsp+0B0H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-10H] movdqu xmm9, xmmword ptr [r9+rdx-10H] movdqu xmm10, xmmword ptr [r10+rdx-10H] movdqu xmm11, xmmword ptr [r11+rdx-10H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0C0H], xmm8 movdqa xmmword ptr [rsp+0D0H], xmm9 movdqa xmmword ptr [rsp+0E0H], xmm12 movdqa xmmword ptr [rsp+0F0H], xmm13 movdqa xmm9, xmmword ptr [BLAKE3_IV_1] movdqa xmm10, xmmword ptr [BLAKE3_IV_2] movdqa xmm11, xmmword ptr [BLAKE3_IV_3] movdqa xmm12, 
xmmword ptr [rsp+110H] movdqa xmm13, xmmword ptr [rsp+120H] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] movd xmm15, eax pshufd xmm15, xmm15, 00H prefetcht0 byte ptr [r8+rdx+80H] prefetcht0 byte ptr [r9+rdx+80H] prefetcht0 byte ptr [r10+rdx+80H] prefetcht0 byte ptr [r11+rdx+80H] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+40H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [BLAKE3_IV_0] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+10H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+50H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 
psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+80H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+0C0H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+90H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+0D0H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 
movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+20H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+70H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+60H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por 
xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+10H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+90H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0B0H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp+0E0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 
pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+30H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+0D0H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+40H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+20H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 
psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+60H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr [rsp+0B0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+50H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0F0H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd 
xmm0, xmmword ptr [rsp+0A0H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+0E0H] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+70H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr [rsp+30H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+40H] paddd xmm1, 
xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+50H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+80H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0C0H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr 
[rsp+0F0H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0D0H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+0A0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+70H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+60H] paddd 
xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+20H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+10H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+90H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+80H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 
paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0E0H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp+0C0H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0D0H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+20H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 
pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+30H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+60H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0B0H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp+10H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, 
xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0F0H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+90H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0E0H] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+30H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0B1H pshufhw xmm15, xmm15, 0B1H pshuflw xmm12, 
xmm12, 0B1H pshufhw xmm12, xmm12, 0B1H pshuflw xmm13, xmm13, 0B1H pshufhw xmm13, xmm13, 0B1H pshuflw xmm14, xmm14, 0B1H pshufhw xmm14, xmm14, 0B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0A0H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+40H] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne innerloop4 movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword 
ptr [rbx], xmm0 movdqu xmmword ptr [rbx+20H], xmm1 movdqu xmmword ptr [rbx+40H], xmm9 movdqu xmmword ptr [rbx+60H], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+10H], xmm4 movdqu xmmword ptr [rbx+30H], xmm5 movdqu xmmword ptr [rbx+50H], xmm9 movdqu xmmword ptr [rbx+70H], xmm7 movdqa xmm1, xmmword ptr [rsp+110H] movdqa xmm0, xmm1 paddd xmm1, xmmword ptr [rsp+150H] movdqa xmmword ptr [rsp+110H], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK] pxor xmm1, xmmword ptr [CMP_MSB_MASK] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+120H] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+120H], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc outerloop4 test rsi, rsi jne final3blocks unwind: movdqa xmm6, xmmword ptr [rsp+170H] movdqa xmm7, xmmword ptr [rsp+180H] movdqa xmm8, xmmword ptr [rsp+190H] movdqa xmm9, xmmword ptr [rsp+1A0H] movdqa xmm10, xmmword ptr [rsp+1B0H] movdqa xmm11, xmmword ptr [rsp+1C0H] movdqa xmm12, xmmword ptr [rsp+1D0H] movdqa xmm13, xmmword ptr [rsp+1E0H] movdqa xmm14, xmmword ptr [rsp+1F0H] movdqa xmm15, xmmword ptr [rsp+200H] mov rsp, rbp pop rbp pop rbx pop rdi pop rsi pop r12 pop r13 pop r14 pop r15 ret ALIGN 16 final3blocks: test esi, 2H je final1block movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+110H] movd xmm14, dword ptr [rsp+120H] punpckldq xmm13, xmm14 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+114H] movd xmm13, dword ptr [rsp+124H] punpckldq xmm14, xmm13 movaps xmmword ptr [rsp+10H], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx innerloop2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV] movaps 
xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-40H] movups xmm5, xmmword ptr [r8+rdx-30H] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-20H] movups xmm7, xmmword ptr [r8+rdx-10H] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 93H movups xmm12, xmmword ptr [r9+rdx-40H] movups xmm13, xmmword ptr [r9+rdx-30H] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-20H] movups xmm15, xmmword ptr [r9+rdx-10H] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 93H shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 93H shl rax, 20H or rax, 40H movd xmm3, rax movdqa xmmword ptr [rsp+20H], xmm3 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+10H] punpcklqdq xmm3, xmmword ptr [rsp+20H] punpcklqdq xmm11, xmmword ptr [rsp+20H] mov al, 7 roundloop2: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+20H], xmm4 movaps xmmword ptr [rsp+30H], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H pshuflw xmm11, xmm11, 0B1H pshufhw xmm11, xmm11, 0B1H paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+40H], xmm5 movaps xmmword ptr [rsp+50H], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 93H pshufd xmm8, xmm8, 93H pshufd xmm3, 
xmm3, 4EH pshufd xmm11, xmm11, 4EH pshufd xmm2, xmm2, 39H pshufd xmm10, xmm10, 39H paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H pshuflw xmm11, xmm11, 0B1H pshufhw xmm11, xmm11, 0B1H paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 39H pshufd xmm8, xmm8, 39H pshufd xmm3, xmm3, 4EH pshufd xmm11, xmm11, 4EH pshufd xmm2, xmm2, 93H pshufd xmm10, xmm10, 93H dec al je endroundloop2 movdqa xmm12, xmmword ptr [rsp+20H] movdqa xmm5, xmmword ptr [rsp+40H] pshufd xmm13, xmm12, 0FH shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 39H movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK] pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK] por xmm13, xmm12 movdqa xmmword ptr [rsp+20H], xmm13 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 movdqa xmm13, xmm6 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK] pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK] por xmm12, xmm13 pshufd xmm12, xmm12, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmmword ptr [rsp+40H], xmm12 movdqa xmm5, xmmword ptr [rsp+30H] movdqa xmm13, xmmword ptr [rsp+50H] pshufd xmm6, xmm5, 0FH shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 39H movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK] pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK] por xmm6, xmm5 movdqa xmm5, 
xmm15 punpcklqdq xmm5, xmm13 movdqa xmmword ptr [rsp+30H], xmm2 movdqa xmm2, xmm14 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK] pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK] por xmm5, xmm2 movdqa xmm2, xmmword ptr [rsp+30H] pshufd xmm5, xmm5, 78H punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 1EH movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+20H] movdqa xmm6, xmmword ptr [rsp+40H] jmp roundloop2 endroundloop2: pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d cmp rdx, r15 jne innerloop2 movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+10H], xmm1 movups xmmword ptr [rbx+20H], xmm8 movups xmmword ptr [rbx+30H], xmm9 mov eax, dword ptr [rsp+130H] neg eax mov r10d, dword ptr [rsp+110H+8*rax] mov r11d, dword ptr [rsp+120H+8*rax] mov dword ptr [rsp+110H], r10d mov dword ptr [rsp+120H], r11d add rdi, 16 add rbx, 64 sub rsi, 2 final1block: test esi, 1H je unwind movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movd xmm13, dword ptr [rsp+110H] movd xmm14, dword ptr [rsp+120H] punpckldq xmm13, xmm14 mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx innerloop1: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV] shl rax, 32 or rax, 64 movd xmm12, rax movdqa xmm3, xmm13 punpcklqdq xmm3, xmm12 movups xmm4, xmmword ptr [r8+rdx-40H] movups xmm5, xmmword ptr [r8+rdx-30H] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-20H] movups xmm7, xmmword ptr [r8+rdx-10H] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H mov al, 7 roundloop1: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa 
xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 93H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 39H paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 39H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 93H dec al jz endroundloop1 movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0FH pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm10, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] por xmm8, xmm10 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp roundloop1 endroundloop1: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne innerloop1 movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+10H], xmm1 jmp unwind _blake3_hash_many_sse2 ENDP blake3_hash_many_sse2 ENDP blake3_compress_in_place_sse2 PROC _blake3_compress_in_place_sse2 PROC sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+10H], xmm7 movdqa xmmword ptr [rsp+20H], xmm8 movdqa xmmword ptr [rsp+30H], xmm9 movdqa xmmword ptr [rsp+40H], xmm11 movdqa xmmword ptr [rsp+50H], xmm14 movdqa xmmword ptr [rsp+60H], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm2, xmmword ptr [BLAKE3_IV] movzx eax, byte ptr [rsp+0A0H] movzx r8d, r8b shl rax, 32 add r8, rax movd xmm3, r9 
movd xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+10H] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rdx+20H] movups xmm7, xmmword ptr [rdx+30H] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H mov al, 7 @@: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 93H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 39H paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 39H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 93H dec al jz @F movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0FH pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm14, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] por xmm8, xmm14 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp @B @@: pxor xmm0, xmm2 pxor xmm1, xmm3 movups xmmword ptr [rcx], xmm0 movups xmmword ptr [rcx+10H], xmm1 movdqa xmm6, xmmword 
ptr [rsp] movdqa xmm7, xmmword ptr [rsp+10H] movdqa xmm8, xmmword ptr [rsp+20H] movdqa xmm9, xmmword ptr [rsp+30H] movdqa xmm11, xmmword ptr [rsp+40H] movdqa xmm14, xmmword ptr [rsp+50H] movdqa xmm15, xmmword ptr [rsp+60H] add rsp, 120 ret _blake3_compress_in_place_sse2 ENDP blake3_compress_in_place_sse2 ENDP ALIGN 16 blake3_compress_xof_sse2 PROC _blake3_compress_xof_sse2 PROC sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+10H], xmm7 movdqa xmmword ptr [rsp+20H], xmm8 movdqa xmmword ptr [rsp+30H], xmm9 movdqa xmmword ptr [rsp+40H], xmm11 movdqa xmmword ptr [rsp+50H], xmm14 movdqa xmmword ptr [rsp+60H], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm2, xmmword ptr [BLAKE3_IV] movzx eax, byte ptr [rsp+0A0H] movzx r8d, r8b mov r10, qword ptr [rsp+0A8H] shl rax, 32 add r8, rax movd xmm3, r9 movd xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+10H] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rdx+20H] movups xmm7, xmmword ptr [rdx+30H] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H mov al, 7 @@: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 93H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 39H paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0B1H pshufhw xmm3, xmm3, 0B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 
pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 39H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 93H dec al jz @F movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0FH pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm14, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] por xmm8, xmm14 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp @B @@: movdqu xmm4, xmmword ptr [rcx] movdqu xmm5, xmmword ptr [rcx+10H] pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm2, xmm4 pxor xmm3, xmm5 movups xmmword ptr [r10], xmm0 movups xmmword ptr [r10+10H], xmm1 movups xmmword ptr [r10+20H], xmm2 movups xmmword ptr [r10+30H], xmm3 movdqa xmm6, xmmword ptr [rsp] movdqa xmm7, xmmword ptr [rsp+10H] movdqa xmm8, xmmword ptr [rsp+20H] movdqa xmm9, xmmword ptr [rsp+30H] movdqa xmm11, xmmword ptr [rsp+40H] movdqa xmm14, xmmword ptr [rsp+50H] movdqa xmm15, xmmword ptr [rsp+60H] add rsp, 120 ret _blake3_compress_xof_sse2 ENDP blake3_compress_xof_sse2 ENDP _TEXT ENDS _RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ALIGN 64 BLAKE3_IV: dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH ADD0: dd 0, 1, 2, 3 ADD1: dd 4 dup (4) BLAKE3_IV_0: dd 4 dup (6A09E667H) BLAKE3_IV_1: dd 4 dup (0BB67AE85H) BLAKE3_IV_2: dd 4 dup (3C6EF372H) BLAKE3_IV_3: dd 4 dup (0A54FF53AH) BLAKE3_BLOCK_LEN: dd 4 dup (64) CMP_MSB_MASK: dd 8 dup(80000000H) PBLENDW_0x33_MASK: dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H PBLENDW_0xCC_MASK: dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH PBLENDW_0x3F_MASK: dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H PBLENDW_0xC0_MASK: dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH _RDATA ENDS END 
blake3-1.8.1/c/blake3_sse41.c000064400000000000000000000505151046102023000135500ustar 00000000000000#include "blake3_impl.h" #include #define DEGREE 4 #define _mm_shuffle_ps2(a, b, c) \ (_mm_castps_si128( \ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) INLINE __m128i loadu(const uint8_t src[16]) { return _mm_loadu_si128((const __m128i *)src); } INLINE void storeu(__m128i src, uint8_t dest[16]) { _mm_storeu_si128((__m128i *)dest, src); } INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } // Note that clang-format doesn't like the name "xor" for some reason. INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); } INLINE __m128i rot16(__m128i x) { return _mm_shuffle_epi8( x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); } INLINE __m128i rot12(__m128i x) { return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); } INLINE __m128i rot8(__m128i x) { return _mm_shuffle_epi8( x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); } INLINE __m128i rot7(__m128i x) { return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); } INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { *row0 = addv(addv(*row0, m), *row1); *row3 = xorv(*row3, *row0); *row3 = rot16(*row3); *row2 = addv(*row2, *row3); *row1 = xorv(*row1, *row2); *row1 = rot12(*row1); } INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { *row0 = addv(addv(*row0, m), *row1); *row3 = xorv(*row3, *row0); *row3 = rot8(*row3); *row2 = addv(*row2, *row3); *row1 = xorv(*row1, *row2); *row1 = rot7(*row1); } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. 
All the message loads below are adjusted to compensate for this. See // discussion at https://github.com/sneves/blake2-avx2/pull/4 INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); } INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); } INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { rows[0] = loadu((uint8_t *)&cv[0]); rows[1] = loadu((uint8_t *)&cv[4]); rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); rows[3] = set4(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags); __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); __m128i t0, t1, t2, t3, tt; // Round 1. The first round permutes the message words from the original // input order, into the groups that get mixed in parallel. 
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 2. This round and all following rounds apply a fixed permutation // to the message words from the round before. t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 3 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], 
&rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 4 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 5 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], 
&rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 6 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 7 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); } void blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); 
storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); } void blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu(xorv(rows[0], rows[2]), &out[0]); storeu(xorv(rows[1], rows[3]), &out[16]); storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); } INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); v[0] = addv(v[0], v[4]); v[1] = addv(v[1], v[5]); v[2] = addv(v[2], v[6]); v[3] = addv(v[3], v[7]); v[12] = xorv(v[12], v[0]); v[13] = xorv(v[13], v[1]); v[14] = xorv(v[14], v[2]); v[15] = xorv(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = addv(v[8], v[12]); v[9] = addv(v[9], v[13]); v[10] = addv(v[10], v[14]); v[11] = addv(v[11], v[15]); v[4] = xorv(v[4], v[8]); v[5] = xorv(v[5], v[9]); v[6] = xorv(v[6], v[10]); v[7] = xorv(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); v[0] = addv(v[0], v[4]); v[1] = addv(v[1], v[5]); v[2] = addv(v[2], v[6]); v[3] = addv(v[3], v[7]); v[12] = xorv(v[12], v[0]); v[13] = xorv(v[13], v[1]); v[14] = xorv(v[14], v[2]); v[15] = xorv(v[15], v[3]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[15] = rot8(v[15]); v[8] = addv(v[8], v[12]); v[9] = addv(v[9], v[13]); v[10] = addv(v[10], v[14]); v[11] = addv(v[11], v[15]); v[4] = xorv(v[4], v[8]); v[5] = xorv(v[5], v[9]); v[6] = 
xorv(v[6], v[10]); v[7] = xorv(v[7], v[11]); v[4] = rot7(v[4]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); v[0] = addv(v[0], v[5]); v[1] = addv(v[1], v[6]); v[2] = addv(v[2], v[7]); v[3] = addv(v[3], v[4]); v[15] = xorv(v[15], v[0]); v[12] = xorv(v[12], v[1]); v[13] = xorv(v[13], v[2]); v[14] = xorv(v[14], v[3]); v[15] = rot16(v[15]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[10] = addv(v[10], v[15]); v[11] = addv(v[11], v[12]); v[8] = addv(v[8], v[13]); v[9] = addv(v[9], v[14]); v[5] = xorv(v[5], v[10]); v[6] = xorv(v[6], v[11]); v[7] = xorv(v[7], v[8]); v[4] = xorv(v[4], v[9]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[4] = rot12(v[4]); v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); v[0] = addv(v[0], v[5]); v[1] = addv(v[1], v[6]); v[2] = addv(v[2], v[7]); v[3] = addv(v[3], v[4]); v[15] = xorv(v[15], v[0]); v[12] = xorv(v[12], v[1]); v[13] = xorv(v[13], v[2]); v[14] = xorv(v[14], v[3]); v[15] = rot8(v[15]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[10] = addv(v[10], v[15]); v[11] = addv(v[11], v[12]); v[8] = addv(v[8], v[13]); v[9] = addv(v[9], v[14]); v[5] = xorv(v[5], v[10]); v[6] = xorv(v[6], v[11]); v[7] = xorv(v[7], v[8]); v[4] = xorv(v[4], v[9]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[4] = rot7(v[4]); } INLINE void transpose_vecs(__m128i vecs[DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is // 22/33. Note that this doesn't split the vector into two lanes, as the // AVX2 counterparts do. 
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); // Interleave 64-bit lanes. __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); vecs[0] = abcd_0; vecs[1] = abcd_1; vecs[2] = abcd_2; vecs[3] = abcd_3; } INLINE void transpose_msg_vecs(const uint8_t *const *inputs, size_t block_offset, __m128i out[16]) { out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); for (size_t i = 0; i < 4; ++i) { _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); } transpose_vecs(&out[0]); transpose_vecs(&out[4]); transpose_vecs(&out[8]); transpose_vecs(&out[12]); } INLINE void load_counters(uint64_t counter, bool increment_counter, __m128i *out_lo, __m128i *out_hi) { const __m128i mask = 
_mm_set1_epi32(-(int32_t)increment_counter); const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); const __m128i add1 = _mm_and_si128(mask, add0); __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); *out_lo = l; *out_hi = h; } static void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m128i h_vecs[8] = { set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), }; __m128i counter_low_vec, counter_high_vec; load_counters(counter, increment_counter, &counter_low_vec, &counter_high_vec); uint8_t block_flags = flags | flags_start; for (size_t block = 0; block < blocks; block++) { if (block + 1 == blocks) { block_flags |= flags_end; } __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); __m128i block_flags_vec = set1(block_flags); __m128i msg_vecs[16]; transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); __m128i v[16] = { h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn(v, msg_vecs, 0); round_fn(v, msg_vecs, 1); round_fn(v, msg_vecs, 2); round_fn(v, msg_vecs, 3); round_fn(v, msg_vecs, 4); round_fn(v, msg_vecs, 5); round_fn(v, msg_vecs, 6); h_vecs[0] = xorv(v[0], v[8]); h_vecs[1] = xorv(v[1], v[9]); h_vecs[2] = xorv(v[2], v[10]); h_vecs[3] = xorv(v[3], v[11]); h_vecs[4] = xorv(v[4], v[12]); h_vecs[5] = xorv(v[5], v[13]); h_vecs[6] = xorv(v[6], v[14]); h_vecs[7] = xorv(v[7], v[15]); block_flags = flags; } transpose_vecs(&h_vecs[0]); transpose_vecs(&h_vecs[4]); // The first four vecs 
now contain the first half of each output, and the // second four vecs contain the second half of each output. storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); } INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); uint8_t block_flags = flags | flags_start; while (blocks > 0) { if (blocks == 1) { block_flags |= flags_end; } blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; } memcpy(out, cv, BLAKE3_OUT_LEN); } void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { while (num_inputs >= DEGREE) { blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += DEGREE; } inputs += DEGREE; num_inputs -= DEGREE; out = &out[DEGREE * BLAKE3_OUT_LEN]; } while (num_inputs > 0) { hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); if (increment_counter) { counter += 1; } inputs += 1; num_inputs -= 1; out = &out[BLAKE3_OUT_LEN]; } } blake3-1.8.1/c/blake3_sse41_x86-64_unix.S000064400000000000000000001673271046102023000155410ustar 00000000000000#if defined(__ELF__) && defined(__linux__) .section .note.GNU-stack,"",%progbits #endif #if defined(__ELF__) && defined(__CET__) && defined(__has_include) #if 
__has_include() #include #endif #endif #if !defined(_CET_ENDBR) #define _CET_ENDBR #endif .intel_syntax noprefix .global blake3_hash_many_sse41 .global _blake3_hash_many_sse41 .global blake3_compress_in_place_sse41 .global _blake3_compress_in_place_sse41 .global blake3_compress_xof_sse41 .global _blake3_compress_xof_sse41 #ifdef __APPLE__ .text #else .section .text #endif .p2align 6 _blake3_hash_many_sse41: blake3_hash_many_sse41: _CET_ENDBR push r15 push r14 push r13 push r12 push rbx push rbp mov rbp, rsp sub rsp, 360 and rsp, 0xFFFFFFFFFFFFFFC0 neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 0x00 movdqa xmmword ptr [rsp+0x130], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0+rip] pand xmm0, xmmword ptr [ADD1+rip] movdqa xmmword ptr [rsp+0x150], xmm0 movd xmm0, r8d pshufd xmm0, xmm0, 0x00 paddd xmm0, xmm1 movdqa xmmword ptr [rsp+0x110], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 0x00 psubd xmm2, xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 mov rbx, qword ptr [rbp+0x50] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+0x38] movzx r12d, byte ptr [rbp+0x48] cmp rsi, 4 jc 3f 2: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 0x00 pshufd xmm1, xmm3, 0x55 pshufd xmm2, xmm3, 0xAA pshufd xmm3, xmm3, 0xFF movdqu xmm7, xmmword ptr [rcx+0x10] pshufd xmm4, xmm7, 0x00 pshufd xmm5, xmm7, 0x55 pshufd xmm6, xmm7, 0xAA pshufd xmm7, xmm7, 0xFF mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 9: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-0x40] movdqu xmm9, xmmword ptr [r9+rdx-0x40] movdqu xmm10, xmmword ptr [r10+rdx-0x40] movdqu xmm11, xmmword ptr [r11+rdx-0x40] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, 
xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+0x10], xmm9 movdqa xmmword ptr [rsp+0x20], xmm12 movdqa xmmword ptr [rsp+0x30], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x30] movdqu xmm9, xmmword ptr [r9+rdx-0x30] movdqu xmm10, xmmword ptr [r10+rdx-0x30] movdqu xmm11, xmmword ptr [r11+rdx-0x30] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x40], xmm8 movdqa xmmword ptr [rsp+0x50], xmm9 movdqa xmmword ptr [rsp+0x60], xmm12 movdqa xmmword ptr [rsp+0x70], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x20] movdqu xmm9, xmmword ptr [r9+rdx-0x20] movdqu xmm10, xmmword ptr [r10+rdx-0x20] movdqu xmm11, xmmword ptr [r11+rdx-0x20] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x80], xmm8 movdqa xmmword ptr [rsp+0x90], xmm9 movdqa xmmword ptr [rsp+0xA0], xmm12 movdqa xmmword ptr [rsp+0xB0], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x10] movdqu xmm9, xmmword ptr [r9+rdx-0x10] movdqu xmm10, xmmword ptr [r10+rdx-0x10] movdqu xmm11, xmmword ptr [r11+rdx-0x10] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0xC0], xmm8 movdqa xmmword ptr [rsp+0xD0], xmm9 movdqa xmmword ptr [rsp+0xE0], xmm12 movdqa xmmword ptr [rsp+0xF0], xmm13 movdqa xmm9, xmmword ptr 
[BLAKE3_IV_1+rip] movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] movdqa xmm12, xmmword ptr [rsp+0x110] movdqa xmm13, xmmword ptr [rsp+0x120] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] movd xmm15, eax pshufd xmm15, xmm15, 0x00 prefetcht0 [r8+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r11+rdx+0x80] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld 
xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x80] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x70] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 
pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor 
xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa 
xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xB0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x50] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 
pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor 
xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 
por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xC0] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xA0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa 
xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa 
xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, 
xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0x60] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xF0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] 
pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor 
xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne 9b movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword ptr [rbx], xmm0 movdqu xmmword ptr [rbx+0x20], xmm1 movdqu xmmword ptr [rbx+0x40], xmm9 movdqu xmmword ptr [rbx+0x60], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+0x10], xmm4 movdqu xmmword ptr [rbx+0x30], xmm5 movdqu xmmword ptr [rbx+0x50], xmm9 movdqu xmmword ptr [rbx+0x70], xmm7 movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm0, xmm1 paddd xmm1, xmmword ptr [rsp+0x150] movdqa xmmword ptr [rsp+0x110], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+0x120] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+0x120], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc 2b test rsi, rsi jnz 3f 4: mov rsp, rbp pop rbp pop rbx pop r12 pop r13 pop r14 pop r15 ret .p2align 5 3: test esi, 0x2 je 3f movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+0x110] pinsrd xmm13, dword ptr [rsp+0x120], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+0x114] pinsrd xmm14, dword ptr [rsp+0x124], 1 pinsrd xmm14, dword ptr 
[BLAKE3_BLOCK_LEN+rip], 2 movaps xmmword ptr [rsp+0x10], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 0x93 movups xmm12, xmmword ptr [r9+rdx-0x40] movups xmm13, xmmword ptr [r9+rdx-0x30] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-0x20] movups xmm15, xmmword ptr [r9+rdx-0x10] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 0x93 shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 0x93 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+0x10] pinsrd xmm3, eax, 3 pinsrd xmm11, eax, 3 mov al, 7 9: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+0x20], xmm4 movaps xmmword ptr [rsp+0x30], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movaps xmm12, xmmword ptr [ROT16+rip] pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+0x40], xmm5 movaps xmmword ptr [rsp+0x50], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movaps xmm13, xmmword ptr [ROT8+rip] pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 
pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x93 pshufd xmm8, xmm8, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x39 pshufd xmm10, xmm10, 0x39 paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x39 pshufd xmm8, xmm8, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x93 pshufd xmm10, xmm10, 0x93 dec al je 9f movdqa xmm12, xmmword ptr [rsp+0x20] movdqa xmm5, xmmword ptr [rsp+0x40] pshufd xmm13, xmm12, 0x0F shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pblendw xmm13, xmm12, 0xCC movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 pblendw xmm12, xmm6, 0xC0 pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmmword ptr [rsp+0x20], xmm13 movdqa xmmword ptr [rsp+0x40], xmm12 movdqa xmm5, xmmword ptr [rsp+0x30] movdqa xmm13, xmmword ptr [rsp+0x50] pshufd xmm6, xmm5, 0x0F shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pblendw xmm6, xmm5, 0xCC movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 pblendw xmm5, xmm14, 0xC0 pshufd xmm5, xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 0x1E movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+0x20] movdqa xmm6, xmmword ptr [rsp+0x40] jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, 
xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 movups xmmword ptr [rbx+0x20], xmm8 movups xmmword ptr [rbx+0x30], xmm9 movdqa xmm0, xmmword ptr [rsp+0x130] movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm2, xmmword ptr [rsp+0x120] movdqu xmm3, xmmword ptr [rsp+0x118] movdqu xmm4, xmmword ptr [rsp+0x128] blendvps xmm1, xmm3, xmm0 blendvps xmm2, xmm4, xmm0 movdqa xmmword ptr [rsp+0x110], xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 add rdi, 16 add rbx, 64 sub rsi, 2 3: test esi, 0x1 je 4b movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movd xmm13, dword ptr [rsp+0x110] pinsrd xmm13, dword ptr [rsp+0x120], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 movaps xmm14, xmmword ptr [ROT8+rip] movaps xmm15, xmmword ptr [ROT16+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm3, xmm13 pinsrd xmm3, eax, 3 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 
12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0xCC movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0xC0 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 jmp 4b .p2align 6 blake3_compress_in_place_sse41: _blake3_compress_in_place_sse41: _CET_ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] shl r8, 32 add rdx, r8 movq xmm3, rcx movq xmm4, rdx punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rsi] movups xmm5, xmmword ptr [rsi+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rsi+0x20] movups xmm7, xmmword ptr [rsi+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 movaps xmm14, xmmword ptr [ROT8+rip] movaps xmm15, xmmword ptr [ROT16+rip] mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 
20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0xCC movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0xC0 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 movups xmmword ptr [rdi], xmm0 movups xmmword ptr [rdi+0x10], xmm1 ret .p2align 6 blake3_compress_xof_sse41: _blake3_compress_xof_sse41: _CET_ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movzx eax, r8b movzx edx, dl shl rax, 32 add rdx, rax movq xmm3, rcx movq xmm4, rdx punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rsi] movups xmm5, xmmword ptr [rsi+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rsi+0x20] movups xmm7, xmmword ptr [rsi+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 movaps xmm14, xmmword ptr [ROT8+rip] movaps xmm15, xmmword ptr [ROT16+rip] mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 
20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0xCC movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0xC0 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: movdqu xmm4, xmmword ptr [rdi] movdqu xmm5, xmmword ptr [rdi+0x10] pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm2, xmm4 pxor xmm3, xmm5 movups xmmword ptr [r9], xmm0 movups xmmword ptr [r9+0x10], xmm1 movups xmmword ptr [r9+0x20], xmm2 movups xmmword ptr [r9+0x30], xmm3 ret #ifdef __APPLE__ .static_data #else .section .rodata #endif .p2align 6 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 .long 0x3C6EF372, 0xA54FF53A ROT16: .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ROT8: .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 ADD0: .long 0, 1, 2, 3 ADD1: .long 4, 4, 4, 4 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 64, 64, 64, 64 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 blake3-1.8.1/c/blake3_sse41_x86-64_windows_gnu.S000064400000000000000000001737431046102023000171200ustar 00000000000000.intel_syntax noprefix .global blake3_hash_many_sse41 .global _blake3_hash_many_sse41 .global blake3_compress_in_place_sse41 .global _blake3_compress_in_place_sse41 .global blake3_compress_xof_sse41 .global _blake3_compress_xof_sse41 .section .text .p2align 6 
_blake3_hash_many_sse41: blake3_hash_many_sse41: push r15 push r14 push r13 push r12 push rsi push rdi push rbx push rbp mov rbp, rsp sub rsp, 528 and rsp, 0xFFFFFFFFFFFFFFC0 movdqa xmmword ptr [rsp+0x170], xmm6 movdqa xmmword ptr [rsp+0x180], xmm7 movdqa xmmword ptr [rsp+0x190], xmm8 movdqa xmmword ptr [rsp+0x1A0], xmm9 movdqa xmmword ptr [rsp+0x1B0], xmm10 movdqa xmmword ptr [rsp+0x1C0], xmm11 movdqa xmmword ptr [rsp+0x1D0], xmm12 movdqa xmmword ptr [rsp+0x1E0], xmm13 movdqa xmmword ptr [rsp+0x1F0], xmm14 movdqa xmmword ptr [rsp+0x200], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+0x68] movzx r9, byte ptr [rbp+0x70] neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 0x00 movdqa xmmword ptr [rsp+0x130], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0+rip] pand xmm0, xmmword ptr [ADD1+rip] movdqa xmmword ptr [rsp+0x150], xmm0 movd xmm0, r8d pshufd xmm0, xmm0, 0x00 paddd xmm0, xmm1 movdqa xmmword ptr [rsp+0x110], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 0x00 psubd xmm2, xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 mov rbx, qword ptr [rbp+0x90] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+0x78] movzx r12d, byte ptr [rbp+0x88] cmp rsi, 4 jc 3f 2: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 0x00 pshufd xmm1, xmm3, 0x55 pshufd xmm2, xmm3, 0xAA pshufd xmm3, xmm3, 0xFF movdqu xmm7, xmmword ptr [rcx+0x10] pshufd xmm4, xmm7, 0x00 pshufd xmm5, xmm7, 0x55 pshufd xmm6, xmm7, 0xAA pshufd xmm7, xmm7, 0xFF mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx 9: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-0x40] movdqu xmm9, xmmword ptr [r9+rdx-0x40] movdqu xmm10, xmmword ptr [r10+rdx-0x40] movdqu xmm11, xmmword ptr [r11+rdx-0x40] movdqa xmm12, xmm8 punpckldq xmm8, 
xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+0x10], xmm9 movdqa xmmword ptr [rsp+0x20], xmm12 movdqa xmmword ptr [rsp+0x30], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x30] movdqu xmm9, xmmword ptr [r9+rdx-0x30] movdqu xmm10, xmmword ptr [r10+rdx-0x30] movdqu xmm11, xmmword ptr [r11+rdx-0x30] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x40], xmm8 movdqa xmmword ptr [rsp+0x50], xmm9 movdqa xmmword ptr [rsp+0x60], xmm12 movdqa xmmword ptr [rsp+0x70], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x20] movdqu xmm9, xmmword ptr [r9+rdx-0x20] movdqu xmm10, xmmword ptr [r10+rdx-0x20] movdqu xmm11, xmmword ptr [r11+rdx-0x20] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x80], xmm8 movdqa xmmword ptr [rsp+0x90], xmm9 movdqa xmmword ptr [rsp+0xA0], xmm12 movdqa xmmword ptr [rsp+0xB0], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x10] movdqu xmm9, xmmword ptr [r9+rdx-0x10] movdqu xmm10, xmmword ptr [r10+rdx-0x10] movdqu xmm11, xmmword ptr [r11+rdx-0x10] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0xC0], xmm8 movdqa xmmword ptr [rsp+0xD0], xmm9 movdqa 
xmmword ptr [rsp+0xE0], xmm12 movdqa xmmword ptr [rsp+0xF0], xmm13 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] movdqa xmm12, xmmword ptr [rsp+0x110] movdqa xmm13, xmmword ptr [rsp+0x120] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] movd xmm15, eax pshufd xmm15, xmm15, 0x00 prefetcht0 [r8+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r11+rdx+0x80] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, 
xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x80] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x70] paddd xmm3, 
xmmword ptr [rsp+0x40] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd 
xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld 
xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xB0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x50] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0x10] paddd 
xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, 
xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa 
xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xC0] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xA0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd 
xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, 
xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 
20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0x60] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xF0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor 
xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor 
xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne 9b movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword ptr [rbx], xmm0 movdqu xmmword ptr [rbx+0x20], xmm1 movdqu xmmword ptr [rbx+0x40], xmm9 movdqu xmmword ptr [rbx+0x60], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+0x10], xmm4 movdqu xmmword ptr [rbx+0x30], xmm5 movdqu xmmword ptr [rbx+0x50], xmm9 movdqu xmmword ptr [rbx+0x70], xmm7 movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm0, xmm1 paddd xmm1, xmmword ptr [rsp+0x150] movdqa xmmword ptr [rsp+0x110], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+0x120] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+0x120], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc 2b test rsi, rsi jne 3f 4: movdqa xmm6, xmmword ptr [rsp+0x170] movdqa xmm7, xmmword ptr [rsp+0x180] movdqa xmm8, xmmword ptr [rsp+0x190] movdqa xmm9, xmmword ptr [rsp+0x1A0] movdqa xmm10, xmmword ptr [rsp+0x1B0] movdqa xmm11, xmmword ptr [rsp+0x1C0] movdqa xmm12, xmmword ptr [rsp+0x1D0] movdqa xmm13, xmmword ptr [rsp+0x1E0] movdqa xmm14, xmmword ptr [rsp+0x1F0] movdqa xmm15, 
xmmword ptr [rsp+0x200] mov rsp, rbp pop rbp pop rbx pop rdi pop rsi pop r12 pop r13 pop r14 pop r15 ret .p2align 5 3: test esi, 0x2 je 3f movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+0x110] pinsrd xmm13, dword ptr [rsp+0x120], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+0x114] pinsrd xmm14, dword ptr [rsp+0x124], 1 pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 movaps xmmword ptr [rsp+0x10], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 0x93 movups xmm12, xmmword ptr [r9+rdx-0x40] movups xmm13, xmmword ptr [r9+rdx-0x30] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-0x20] movups xmm15, xmmword ptr [r9+rdx-0x10] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 0x93 shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 0x93 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+0x10] pinsrd xmm3, eax, 3 pinsrd xmm11, eax, 3 mov al, 7 9: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+0x20], xmm4 movaps xmmword ptr [rsp+0x30], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movaps xmm12, xmmword ptr [ROT16+rip] pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 
psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+0x40], xmm5 movaps xmmword ptr [rsp+0x50], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movaps xmm13, xmmword ptr [ROT8+rip] pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x93 pshufd xmm8, xmm8, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x39 pshufd xmm10, xmm10, 0x39 paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x39 pshufd xmm8, xmm8, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x93 pshufd xmm10, xmm10, 0x93 dec al je 9f movdqa xmm12, xmmword ptr [rsp+0x20] movdqa xmm5, xmmword ptr [rsp+0x40] pshufd xmm13, xmm12, 0x0F shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pblendw xmm13, xmm12, 0xCC movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 pblendw xmm12, xmm6, 0xC0 pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmmword ptr [rsp+0x20], xmm13 movdqa xmmword ptr [rsp+0x40], xmm12 movdqa xmm5, xmmword ptr 
[rsp+0x30] movdqa xmm13, xmmword ptr [rsp+0x50] pshufd xmm6, xmm5, 0x0F shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pblendw xmm6, xmm5, 0xCC movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 pblendw xmm5, xmm14, 0xC0 pshufd xmm5, xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 0x1E movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+0x20] movdqa xmm6, xmmword ptr [rsp+0x40] jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 movups xmmword ptr [rbx+0x20], xmm8 movups xmmword ptr [rbx+0x30], xmm9 movdqa xmm0, xmmword ptr [rsp+0x130] movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm2, xmmword ptr [rsp+0x120] movdqu xmm3, xmmword ptr [rsp+0x118] movdqu xmm4, xmmword ptr [rsp+0x128] blendvps xmm1, xmm3, xmm0 blendvps xmm2, xmm4, xmm0 movdqa xmmword ptr [rsp+0x110], xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 add rdi, 16 add rbx, 64 sub rsi, 2 3: test esi, 0x1 je 4b movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movd xmm13, dword ptr [rsp+0x110] pinsrd xmm13, dword ptr [rsp+0x120], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 movaps xmm14, xmmword ptr [ROT8+rip] movaps xmm15, xmmword ptr [ROT16+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x80] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm3, xmm13 pinsrd xmm3, eax, 3 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb 
xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0xCC movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0xC0 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 jmp 4b .p2align 6 blake3_compress_in_place_sse41: _blake3_compress_in_place_sse41: sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+0x10], xmm7 movdqa xmmword ptr [rsp+0x20], xmm8 movdqa xmmword ptr [rsp+0x30], xmm9 movdqa xmmword ptr [rsp+0x40], xmm11 movdqa xmmword ptr [rsp+0x50], xmm14 movdqa xmmword ptr [rsp+0x60], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movzx eax, byte ptr [rsp+0xA0] movzx r8d, r8b shl rax, 32 add r8, rax movq xmm3, r9 movq xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, 
xmmword ptr [rdx+0x20] movups xmm7, xmmword ptr [rdx+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 movaps xmm14, xmmword ptr [ROT8+rip] movaps xmm15, xmmword ptr [ROT16+rip] mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0xCC movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0xC0 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 movups xmmword ptr [rcx], xmm0 movups xmmword ptr [rcx+0x10], xmm1 movdqa xmm6, xmmword ptr [rsp] movdqa xmm7, xmmword ptr [rsp+0x10] movdqa xmm8, xmmword ptr [rsp+0x20] movdqa xmm9, xmmword ptr [rsp+0x30] movdqa xmm11, xmmword ptr [rsp+0x40] movdqa xmm14, xmmword ptr [rsp+0x50] movdqa xmm15, xmmword ptr [rsp+0x60] add rsp, 120 ret .p2align 6 _blake3_compress_xof_sse41: blake3_compress_xof_sse41: sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+0x10], xmm7 movdqa xmmword ptr [rsp+0x20], xmm8 movdqa 
xmmword ptr [rsp+0x30], xmm9 movdqa xmmword ptr [rsp+0x40], xmm11 movdqa xmmword ptr [rsp+0x50], xmm14 movdqa xmmword ptr [rsp+0x60], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movzx eax, byte ptr [rsp+0xA0] movzx r8d, r8b mov r10, qword ptr [rsp+0xA8] shl rax, 32 add r8, rax movq xmm3, r9 movq xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rdx+0x20] movups xmm7, xmmword ptr [rdx+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 movaps xmm14, xmmword ptr [ROT8+rip] movaps xmm15, xmmword ptr [ROT16+rip] mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0xCC movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0xC0 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 
9: movdqu xmm4, xmmword ptr [rcx] movdqu xmm5, xmmword ptr [rcx+0x10] pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm2, xmm4 pxor xmm3, xmm5 movups xmmword ptr [r10], xmm0 movups xmmword ptr [r10+0x10], xmm1 movups xmmword ptr [r10+0x20], xmm2 movups xmmword ptr [r10+0x30], xmm3 movdqa xmm6, xmmword ptr [rsp] movdqa xmm7, xmmword ptr [rsp+0x10] movdqa xmm8, xmmword ptr [rsp+0x20] movdqa xmm9, xmmword ptr [rsp+0x30] movdqa xmm11, xmmword ptr [rsp+0x40] movdqa xmm14, xmmword ptr [rsp+0x50] movdqa xmm15, xmmword ptr [rsp+0x60] add rsp, 120 ret .section .rdata .p2align 6 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 .long 0x3C6EF372, 0xA54FF53A ROT16: .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ROT8: .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 ADD0: .long 0, 1, 2, 3 ADD1: .long 4, 4, 4, 4 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 64, 64, 64, 64 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 blake3-1.8.1/c/blake3_sse41_x86-64_windows_msvc.asm000064400000000000000000001733321046102023000176470ustar 00000000000000public _blake3_hash_many_sse41 public blake3_hash_many_sse41 public blake3_compress_in_place_sse41 public _blake3_compress_in_place_sse41 public blake3_compress_xof_sse41 public _blake3_compress_xof_sse41 _TEXT SEGMENT ALIGN(16) 'CODE' ALIGN 16 blake3_hash_many_sse41 PROC _blake3_hash_many_sse41 PROC push r15 push r14 push r13 push r12 push rsi push rdi push rbx push rbp mov rbp, rsp sub rsp, 528 and rsp, 0FFFFFFFFFFFFFFC0H movdqa xmmword ptr [rsp+170H], xmm6 movdqa xmmword ptr [rsp+180H], xmm7 movdqa xmmword ptr [rsp+190H], xmm8 movdqa xmmword ptr [rsp+1A0H], xmm9 movdqa xmmword ptr [rsp+1B0H], xmm10 movdqa xmmword ptr [rsp+1C0H], xmm11 movdqa xmmword ptr [rsp+1D0H], xmm12 movdqa 
xmmword ptr [rsp+1E0H], xmm13 movdqa xmmword ptr [rsp+1F0H], xmm14 movdqa xmmword ptr [rsp+200H], xmm15 mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, qword ptr [rbp+68H] movzx r9, byte ptr [rbp+70H] neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 00H movdqa xmmword ptr [rsp+130H], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0] pand xmm0, xmmword ptr [ADD1] movdqa xmmword ptr [rsp+150H], xmm0 movd xmm0, r8d pshufd xmm0, xmm0, 00H paddd xmm0, xmm1 movdqa xmmword ptr [rsp+110H], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK] pxor xmm1, xmmword ptr [CMP_MSB_MASK] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 00H psubd xmm2, xmm1 movdqa xmmword ptr [rsp+120H], xmm2 mov rbx, qword ptr [rbp+90H] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+78H] movzx r12d, byte ptr [rbp+88H] cmp rsi, 4 jc final3blocks outerloop4: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 00H pshufd xmm1, xmm3, 55H pshufd xmm2, xmm3, 0AAH pshufd xmm3, xmm3, 0FFH movdqu xmm7, xmmword ptr [rcx+10H] pshufd xmm4, xmm7, 00H pshufd xmm5, xmm7, 55H pshufd xmm6, xmm7, 0AAH pshufd xmm7, xmm7, 0FFH mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] mov r10, qword ptr [rdi+10H] mov r11, qword ptr [rdi+18H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx innerloop4: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-40H] movdqu xmm9, xmmword ptr [r9+rdx-40H] movdqu xmm10, xmmword ptr [r10+rdx-40H] movdqu xmm11, xmmword ptr [r11+rdx-40H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+10H], xmm9 movdqa xmmword ptr [rsp+20H], xmm12 movdqa xmmword ptr [rsp+30H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-30H] movdqu xmm9, xmmword ptr [r9+rdx-30H] movdqu xmm10, 
xmmword ptr [r10+rdx-30H] movdqu xmm11, xmmword ptr [r11+rdx-30H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+40H], xmm8 movdqa xmmword ptr [rsp+50H], xmm9 movdqa xmmword ptr [rsp+60H], xmm12 movdqa xmmword ptr [rsp+70H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-20H] movdqu xmm9, xmmword ptr [r9+rdx-20H] movdqu xmm10, xmmword ptr [r10+rdx-20H] movdqu xmm11, xmmword ptr [r11+rdx-20H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+80H], xmm8 movdqa xmmword ptr [rsp+90H], xmm9 movdqa xmmword ptr [rsp+0A0H], xmm12 movdqa xmmword ptr [rsp+0B0H], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-10H] movdqu xmm9, xmmword ptr [r9+rdx-10H] movdqu xmm10, xmmword ptr [r10+rdx-10H] movdqu xmm11, xmmword ptr [r11+rdx-10H] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0C0H], xmm8 movdqa xmmword ptr [rsp+0D0H], xmm9 movdqa xmmword ptr [rsp+0E0H], xmm12 movdqa xmmword ptr [rsp+0F0H], xmm13 movdqa xmm9, xmmword ptr [BLAKE3_IV_1] movdqa xmm10, xmmword ptr [BLAKE3_IV_2] movdqa xmm11, xmmword ptr [BLAKE3_IV_3] movdqa xmm12, xmmword ptr [rsp+110H] movdqa xmm13, xmmword ptr [rsp+120H] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] movd xmm15, eax pshufd xmm15, xmm15, 00H prefetcht0 byte ptr [r8+rdx+80H] prefetcht0 byte ptr [r9+rdx+80H] prefetcht0 byte ptr [r10+rdx+80H] prefetcht0 byte ptr 
[r11+rdx+80H] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+40H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [BLAKE3_IV_0] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+10H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+50H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+80H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+0C0H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr 
[ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+90H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+0D0H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+20H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+70H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, 
xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+60H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+10H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+90H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0B0H] paddd xmm1, xmmword ptr [rsp+50H] paddd 
xmm2, xmmword ptr [rsp+0E0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+30H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+0D0H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+40H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+20H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 
movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+60H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr [rsp+0B0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+50H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0F0H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld 
xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0A0H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+0E0H] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+70H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr [rsp+30H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+40H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+50H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm5 paddd 
xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+80H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0C0H] paddd xmm1, xmmword ptr [rsp+90H] paddd xmm2, xmmword ptr [rsp+0F0H] paddd xmm3, xmmword ptr [rsp+0E0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 
paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0D0H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+0A0H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+70H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 
movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+20H] paddd xmm1, xmmword ptr [rsp+30H] paddd xmm2, xmmword ptr [rsp+10H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+90H] paddd xmm1, xmmword ptr [rsp+0B0H] paddd xmm2, xmmword ptr [rsp+80H] paddd xmm3, xmmword ptr [rsp+0F0H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0E0H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp+0C0H] paddd xmm3, xmmword ptr [rsp+10H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor 
xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0D0H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+20H] paddd xmm3, xmmword ptr [rsp+40H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+30H] paddd xmm1, xmmword ptr [rsp+0A0H] paddd xmm2, xmmword ptr [rsp+60H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 
movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0B0H] paddd xmm1, xmmword ptr [rsp+50H] paddd xmm2, xmmword ptr [rsp+10H] paddd xmm3, xmmword ptr [rsp+80H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0F0H] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+90H] paddd xmm3, xmmword ptr [rsp+60H] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr 
[rsp+0E0H] paddd xmm1, xmmword ptr [rsp+20H] paddd xmm2, xmmword ptr [rsp+30H] paddd xmm3, xmmword ptr [rsp+70H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+100H], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0A0H] paddd xmm1, xmmword ptr [rsp+0C0H] paddd xmm2, xmmword ptr [rsp+40H] paddd xmm3, xmmword ptr [rsp+0D0H] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne innerloop4 movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 
punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword ptr [rbx], xmm0 movdqu xmmword ptr [rbx+20H], xmm1 movdqu xmmword ptr [rbx+40H], xmm9 movdqu xmmword ptr [rbx+60H], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+10H], xmm4 movdqu xmmword ptr [rbx+30H], xmm5 movdqu xmmword ptr [rbx+50H], xmm9 movdqu xmmword ptr [rbx+70H], xmm7 movdqa xmm1, xmmword ptr [rsp+110H] movdqa xmm0, xmm1 paddd xmm1, xmmword ptr [rsp+150H] movdqa xmmword ptr [rsp+110H], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK] pxor xmm1, xmmword ptr [CMP_MSB_MASK] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+120H] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+120H], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc outerloop4 test rsi, rsi jne final3blocks unwind: movdqa xmm6, xmmword ptr [rsp+170H] movdqa xmm7, xmmword ptr [rsp+180H] movdqa xmm8, xmmword ptr [rsp+190H] movdqa xmm9, xmmword ptr [rsp+1A0H] movdqa xmm10, xmmword ptr [rsp+1B0H] movdqa xmm11, xmmword ptr [rsp+1C0H] movdqa xmm12, xmmword ptr [rsp+1D0H] movdqa xmm13, xmmword ptr [rsp+1E0H] movdqa xmm14, xmmword ptr [rsp+1F0H] movdqa xmm15, xmmword ptr [rsp+200H] mov rsp, rbp pop rbp pop rbx pop rdi pop rsi pop r12 pop r13 pop r14 pop r15 ret ALIGN 16 final3blocks: test esi, 2H je final1block movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+110H] pinsrd xmm13, dword ptr [rsp+120H], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+114H] pinsrd xmm14, dword ptr [rsp+124H], 1 pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 movaps xmmword ptr [rsp+10H], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+8H] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx 
innerloop2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV] movaps xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-40H] movups xmm5, xmmword ptr [r8+rdx-30H] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-20H] movups xmm7, xmmword ptr [r8+rdx-10H] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 93H movups xmm12, xmmword ptr [r9+rdx-40H] movups xmm13, xmmword ptr [r9+rdx-30H] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-20H] movups xmm15, xmmword ptr [r9+rdx-10H] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 93H shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 93H movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+10H] pinsrd xmm3, eax, 3 pinsrd xmm11, eax, 3 mov al, 7 roundloop2: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+20H], xmm4 movaps xmmword ptr [rsp+30H], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movaps xmm12, xmmword ptr [ROT16] pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+40H], xmm5 movaps xmmword ptr [rsp+50H], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movaps xmm13, xmmword ptr [ROT8] pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 93H pshufd xmm8, xmm8, 93H pshufd xmm3, xmm3, 4EH pshufd xmm11, xmm11, 4EH pshufd xmm2, xmm2, 39H pshufd xmm10, xmm10, 39H 
paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 39H pshufd xmm8, xmm8, 39H pshufd xmm3, xmm3, 4EH pshufd xmm11, xmm11, 4EH pshufd xmm2, xmm2, 93H pshufd xmm10, xmm10, 93H dec al je endroundloop2 movdqa xmm12, xmmword ptr [rsp+20H] movdqa xmm5, xmmword ptr [rsp+40H] pshufd xmm13, xmm12, 0FH shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 39H movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pblendw xmm13, xmm12, 0CCH movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 pblendw xmm12, xmm6, 0C0H pshufd xmm12, xmm12, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmmword ptr [rsp+20H], xmm13 movdqa xmmword ptr [rsp+40H], xmm12 movdqa xmm5, xmmword ptr [rsp+30H] movdqa xmm13, xmmword ptr [rsp+50H] pshufd xmm6, xmm5, 0FH shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 39H movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pblendw xmm6, xmm5, 0CCH movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 pblendw xmm5, xmm14, 0C0H pshufd xmm5, xmm5, 78H punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 1EH movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+20H] movdqa xmm6, xmmword ptr [rsp+40H] jmp roundloop2 endroundloop2: pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d cmp rdx, r15 jne innerloop2 movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+10H], xmm1 movups xmmword ptr [rbx+20H], xmm8 
movups xmmword ptr [rbx+30H], xmm9 movdqa xmm0, xmmword ptr [rsp+130H] movdqa xmm1, xmmword ptr [rsp+110H] movdqa xmm2, xmmword ptr [rsp+120H] movdqu xmm3, xmmword ptr [rsp+118H] movdqu xmm4, xmmword ptr [rsp+128H] blendvps xmm1, xmm3, xmm0 blendvps xmm2, xmm4, xmm0 movdqa xmmword ptr [rsp+110H], xmm1 movdqa xmmword ptr [rsp+120H], xmm2 add rdi, 16 add rbx, 64 sub rsi, 2 final1block: test esi, 1H je unwind movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movd xmm13, dword ptr [rsp+110H] pinsrd xmm13, dword ptr [rsp+120H], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 movaps xmm14, xmmword ptr [ROT8] movaps xmm15, xmmword ptr [ROT16] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+80H] or eax, r13d xor edx, edx innerloop1: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV] movaps xmm3, xmm13 pinsrd xmm3, eax, 3 movups xmm4, xmmword ptr [r8+rdx-40H] movups xmm5, xmmword ptr [r8+rdx-30H] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-20H] movups xmm7, xmmword ptr [r8+rdx-10H] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H mov al, 7 roundloop1: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 93H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 39H paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, 
xmm11 pshufd xmm0, xmm0, 39H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 93H dec al jz endroundloop1 movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0FH pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0CCH movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0C0H pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp roundloop1 endroundloop1: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne innerloop1 movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+10H], xmm1 jmp unwind _blake3_hash_many_sse41 ENDP blake3_hash_many_sse41 ENDP blake3_compress_in_place_sse41 PROC _blake3_compress_in_place_sse41 PROC sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+10H], xmm7 movdqa xmmword ptr [rsp+20H], xmm8 movdqa xmmword ptr [rsp+30H], xmm9 movdqa xmmword ptr [rsp+40H], xmm11 movdqa xmmword ptr [rsp+50H], xmm14 movdqa xmmword ptr [rsp+60H], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm2, xmmword ptr [BLAKE3_IV] movzx eax, byte ptr [rsp+0A0H] movzx r8d, r8b shl rax, 32 add r8, rax movd xmm3, r9 movd xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+10H] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rdx+20H] movups xmm7, xmmword ptr [rdx+30H] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H movaps xmm14, xmmword ptr [ROT8] movaps xmm15, xmmword ptr [ROT16] mov al, 7 @@: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 93H 
pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 39H paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 39H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 93H dec al jz @F movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0FH pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0CCH movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0C0H pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp @B @@: pxor xmm0, xmm2 pxor xmm1, xmm3 movups xmmword ptr [rcx], xmm0 movups xmmword ptr [rcx+10H], xmm1 movdqa xmm6, xmmword ptr [rsp] movdqa xmm7, xmmword ptr [rsp+10H] movdqa xmm8, xmmword ptr [rsp+20H] movdqa xmm9, xmmword ptr [rsp+30H] movdqa xmm11, xmmword ptr [rsp+40H] movdqa xmm14, xmmword ptr [rsp+50H] movdqa xmm15, xmmword ptr [rsp+60H] add rsp, 120 ret _blake3_compress_in_place_sse41 ENDP blake3_compress_in_place_sse41 ENDP ALIGN 16 blake3_compress_xof_sse41 PROC _blake3_compress_xof_sse41 PROC sub rsp, 120 movdqa xmmword ptr [rsp], xmm6 movdqa xmmword ptr [rsp+10H], xmm7 movdqa xmmword ptr [rsp+20H], xmm8 movdqa xmmword ptr [rsp+30H], xmm9 movdqa xmmword ptr [rsp+40H], xmm11 movdqa xmmword ptr [rsp+50H], xmm14 movdqa xmmword ptr [rsp+60H], xmm15 movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+10H] movaps xmm2, xmmword ptr [BLAKE3_IV] movzx eax, byte ptr [rsp+0A0H] movzx r8d, r8b mov r10, qword ptr [rsp+0A8H] shl rax, 32 add r8, rax movd xmm3, r9 movd xmm4, r8 punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rdx] movups xmm5, xmmword ptr [rdx+10H] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword 
ptr [rdx+20H] movups xmm7, xmmword ptr [rdx+30H] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H movaps xmm14, xmmword ptr [ROT8] movaps xmm15, xmmword ptr [ROT16] mov al, 7 @@: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 93H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 39H paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 39H pshufd xmm3, xmm3, 4EH pshufd xmm2, xmm2, 93H dec al jz @F movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0FH pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0CCH movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0C0H pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 1EH movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp @B @@: movdqu xmm4, xmmword ptr [rcx] movdqu xmm5, xmmword ptr [rcx+10H] pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm2, xmm4 pxor xmm3, xmm5 movups xmmword ptr [r10], xmm0 movups xmmword ptr [r10+10H], xmm1 movups xmmword ptr [r10+20H], xmm2 movups xmmword ptr [r10+30H], xmm3 movdqa xmm6, xmmword ptr [rsp] movdqa xmm7, xmmword ptr [rsp+10H] movdqa xmm8, xmmword ptr [rsp+20H] movdqa xmm9, xmmword ptr [rsp+30H] movdqa xmm11, xmmword ptr [rsp+40H] movdqa xmm14, xmmword ptr [rsp+50H] movdqa xmm15, xmmword ptr [rsp+60H] add rsp, 120 ret _blake3_compress_xof_sse41 ENDP blake3_compress_xof_sse41 
ENDP _TEXT ENDS _RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ALIGN 64 BLAKE3_IV: dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH ADD0: dd 0, 1, 2, 3 ADD1: dd 4 dup (4) BLAKE3_IV_0: dd 4 dup (6A09E667H) BLAKE3_IV_1: dd 4 dup (0BB67AE85H) BLAKE3_IV_2: dd 4 dup (3C6EF372H) BLAKE3_IV_3: dd 4 dup (0A54FF53AH) BLAKE3_BLOCK_LEN: dd 4 dup (64) ROT16: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ROT8: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 CMP_MSB_MASK: dd 8 dup(80000000H) _RDATA ENDS END blake3-1.8.1/c/blake3_tbb.cpp000064400000000000000000000024251046102023000137150ustar 00000000000000#include #include #include #include "blake3_impl.h" static_assert(TBB_USE_EXCEPTIONS == 0, "This file should be compiled with C++ exceptions disabled."); extern "C" void blake3_compress_subtree_wide_join_tbb( // shared params const uint32_t key[8], uint8_t flags, bool use_tbb, // left-hand side params const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter, uint8_t *l_cvs, size_t *l_n, // right-hand side params const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter, uint8_t *r_cvs, size_t *r_n) noexcept { if (!use_tbb) { *l_n = blake3_compress_subtree_wide(l_input, l_input_len, key, l_chunk_counter, flags, l_cvs, use_tbb); *r_n = blake3_compress_subtree_wide(r_input, r_input_len, key, r_chunk_counter, flags, r_cvs, use_tbb); return; } oneapi::tbb::parallel_invoke( [=]() { *l_n = blake3_compress_subtree_wide( l_input, l_input_len, key, l_chunk_counter, flags, l_cvs, use_tbb); }, [=]() { *r_n = blake3_compress_subtree_wide( r_input, r_input_len, key, r_chunk_counter, flags, r_cvs, use_tbb); }); } blake3-1.8.1/c/cmake/BLAKE3/ContinuousIntegration.cmake000064400000000000000000000170431046102023000205600ustar 00000000000000cmake_minimum_required(VERSION 3.13 FATAL_ERROR) if(BUILD_SHARED_LIBS) message(FATAL_ERROR "BUILD_SHARED_LIBS is incompatible with BLAKE3_TESTING_CI") endif() include(CTest) # Declare a testing specific variant of 
the `blake3` library target. # # We use a separate library target in order to be able to perform compilation with various # combinations of features which are too noisy to specify in the main CMake config as options for # the normal `blake3` target. # # Initially this target has no properties but eventually we will populate them by copying all of the # relevant properties from the normal `blake3` target. add_library(blake3-testing blake3.c blake3_dispatch.c blake3_portable.c ) if(BLAKE3_USE_TBB AND TBB_FOUND) target_sources(blake3-testing PRIVATE blake3_tbb.cpp) endif() if(BLAKE3_SIMD_TYPE STREQUAL "amd64-asm") # Conditionally add amd64 asm files to `blake3-testing` sources if(MSVC) if(NOT BLAKE3_NO_AVX2) list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_windows_msvc.asm) endif() if(NOT BLAKE3_NO_AVX512) list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_windows_msvc.asm) endif() if(NOT BLAKE3_NO_SSE2) list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_windows_msvc.asm) endif() if(NOT BLAKE3_NO_SSE41) list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_windows_msvc.asm) endif() elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang" OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang") if (WIN32) if(NOT BLAKE3_NO_AVX2) list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_windows_gnu.S) endif() if(NOT BLAKE3_NO_AVX512) list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_windows_gnu.S) endif() if(NOT BLAKE3_NO_SSE2) list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_windows_gnu.S) endif() if(NOT BLAKE3_NO_SSE41) list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_windows_gnu.S) endif() elseif(UNIX) if(NOT BLAKE3_NO_AVX2) list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_unix.S) endif() if(NOT BLAKE3_NO_AVX512) list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_unix.S) endif() if(NOT BLAKE3_NO_SSE2) list(APPEND 
BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_unix.S) endif() if(NOT BLAKE3_NO_SSE41) list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_unix.S) endif() endif() endif() target_sources(blake3-testing PRIVATE ${BLAKE3_AMD64_ASM_SOURCES}) elseif(BLAKE3_SIMD_TYPE STREQUAL "x86-intrinsics") # Conditionally add amd64 C files to `blake3-testing` sources if (NOT DEFINED BLAKE3_CFLAGS_SSE2 OR NOT DEFINED BLAKE3_CFLAGS_SSE4.1 OR NOT DEFINED BLAKE3_CFLAGS_AVX2 OR NOT DEFINED BLAKE3_CFLAGS_AVX512) message(WARNING "BLAKE3_SIMD_TYPE is set to 'x86-intrinsics' but no compiler flags are available for the target architecture.") else() set(BLAKE3_SIMD_X86_INTRINSICS ON) endif() if(NOT BLAKE3_NO_AVX2) target_sources(blake3-testing PRIVATE blake3_avx2.c) set_source_files_properties(blake3_avx2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX2}") endif() if(NOT BLAKE3_NO_AVX512) target_sources(blake3-testing PRIVATE blake3_avx512.c) set_source_files_properties(blake3_avx512.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX512}") endif() if(NOT BLAKE3_NO_SSE2) target_sources(blake3-testing PRIVATE blake3_sse2.c) set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}") endif() if(NOT BLAKE3_NO_SSE41) target_sources(blake3-testing PRIVATE blake3_sse41.c) set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}") endif() elseif(BLAKE3_SIMD_TYPE STREQUAL "neon-intrinsics") # Conditionally add neon C files to `blake3-testing` sources target_sources(blake3-testing PRIVATE blake3_neon.c ) target_compile_definitions(blake3-testing PRIVATE BLAKE3_USE_NEON=1 ) if (DEFINED BLAKE3_CFLAGS_NEON) set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}") endif() elseif(BLAKE3_SIMD_TYPE STREQUAL "none") # Disable neon if simd type is "none". We check for individual amd64 features further below. 
target_compile_definitions(blake3-testing PRIVATE BLAKE3_USE_NEON=0 ) endif() if(BLAKE3_NO_AVX2) target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_AVX2) endif() if(BLAKE3_NO_AVX512) target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_AVX512) endif() if(BLAKE3_NO_SSE2) target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_SSE2) endif() if(BLAKE3_NO_SSE41) target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_SSE41) endif() target_compile_definitions(blake3-testing PUBLIC BLAKE3_TESTING) get_target_property(BLAKE3_COMPILE_DEFINITIONS blake3 COMPILE_DEFINITIONS) if(BLAKE3_COMPILE_DEFINITIONS) target_compile_definitions(blake3-testing PUBLIC ${BLAKE3_COMPILE_DEFINITIONS}) endif() get_target_property(BLAKE3_COMPILE_OPTIONS blake3 COMPILE_OPTIONS) if(BLAKE3_COMPILE_OPTIONS) target_compile_options(blake3-testing PRIVATE ${BLAKE3_COMPILE_OPTIONS} -O3 -Wall -Wextra -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2 -fPIE -fvisibility=hidden -fsanitize=address,undefined ) endif() get_target_property(BLAKE3_INCLUDE_DIRECTORIES blake3 INCLUDE_DIRECTORIES) if(BLAKE3_INCLUDE_DIRECTORIES) target_include_directories(blake3-testing PUBLIC $ $ ) endif() get_target_property(BLAKE3_LINK_LIBRARIES blake3 LINK_LIBRARIES) if(BLAKE3_LINK_LIBRARIES) target_link_libraries(blake3-testing PRIVATE ${BLAKE3_LINK_LIBRARIES}) endif() get_target_property(BLAKE3_LINK_OPTIONS blake3 LINK_OPTIONS) if(BLAKE3_LINK_OPTIONS) target_link_options(blake3-testing PRIVATE ${BLAKE3_LINK_OPTIONS} -fsanitize=address,undefined -pie -Wl,-z,relro,-z,now ) endif() # test asm target add_executable(blake3-asm-test main.c ) set_target_properties(blake3-asm-test PROPERTIES OUTPUT_NAME blake3 RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}) target_link_libraries(blake3-asm-test PRIVATE blake3-testing) target_compile_definitions(blake3-asm-test PRIVATE BLAKE3_TESTING) target_compile_options(blake3-asm-test PRIVATE -O3 -Wall -Wextra -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2 -fPIE 
-fvisibility=hidden -fsanitize=address,undefined ) target_link_options(blake3-asm-test PRIVATE -fsanitize=address,undefined -pie -Wl,-z,relro,-z,now ) add_test(NAME blake3-testing COMMAND "${CMAKE_CTEST_COMMAND}" --verbose --extra-verbose --build-and-test "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" --build-generator "${CMAKE_GENERATOR}" --build-makeprogram "${CMAKE_MAKE_PROGRAM}" --build-project libblake3 --build-target blake3-asm-test --build-options --fresh "-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}" "-DBLAKE3_TESTING=${BLAKE3_TESTING}" "-DBLAKE3_TESTING_CI=${BLAKE3_TESTING_CI}" "-DBLAKE3_USE_TBB=${BLAKE3_USE_TBB}" "-DBLAKE3_SIMD_TYPE=${BLAKE3_SIMD_TYPE}" "-DBLAKE3_NO_SSE2=${BLAKE3_NO_SSE2}" "-DBLAKE3_NO_SSE41=${BLAKE3_NO_SSE41}" "-DBLAKE3_NO_AVX2=${BLAKE3_NO_AVX2}" "-DBLAKE3_NO_AVX512=${BLAKE3_NO_AVX512}" --test-command "${CMAKE_SOURCE_DIR}/test.py" ) blake3-1.8.1/c/cmake/BLAKE3/Examples.cmake000064400000000000000000000005361046102023000157630ustar 00000000000000if(NOT WIN32) add_executable(blake3-example example.c) target_link_libraries(blake3-example PRIVATE blake3) install(TARGETS blake3-example) if(BLAKE3_USE_TBB) add_executable(blake3-example-tbb example_tbb.c) target_link_libraries(blake3-example-tbb PRIVATE blake3) install(TARGETS blake3-example-tbb) endif() endif() blake3-1.8.1/c/cmake/BLAKE3/Testing.cmake000064400000000000000000000001061046102023000156130ustar 00000000000000if(BLAKE3_TESTING_CI) include(BLAKE3/ContinuousIntegration) endif() blake3-1.8.1/c/dependencies/CMakeLists.txt000064400000000000000000000000651046102023000164050ustar 00000000000000if(BLAKE3_USE_TBB) add_subdirectory(tbb) endif() blake3-1.8.1/c/dependencies/tbb/CMakeLists.txt000064400000000000000000000014751046102023000171620ustar 00000000000000if(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") message(FATAL_ERROR "BLAKE3_USE_TBB requires building with Clang or MSVC on Windows") set(BLAKE3_USE_TBB OFF) endif() find_package(TBB 2021.11.0 QUIET) if(CMAKE_VERSION VERSION_GREATER_EQUAL 
3.11) include(FetchContent) if(NOT TBB_FOUND AND BLAKE3_FETCH_TBB) set(CMAKE_C_STANDARD 99) set(CMAKE_C_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_EXTENSIONS ON) option(TBB_TEST OFF "") option(TBBMALLOC_BUILD OFF "") mark_as_advanced(TBB_TEST) mark_as_advanced(TBBMALLOC_BUILD) FetchContent_Declare( TBB GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB GIT_TAG 0c0ff192a2304e114bc9e6557582dfba101360ff # v2022.0.0 GIT_SHALLOW TRUE ) FetchContent_MakeAvailable(TBB) endif() endif() blake3-1.8.1/c/example.c000064400000000000000000000015441046102023000130210ustar 00000000000000#include "blake3.h" #include #include #include #include int main(void) { // Initialize the hasher. blake3_hasher hasher; blake3_hasher_init(&hasher); // Read input bytes from stdin. unsigned char buf[65536]; while (1) { ssize_t n = read(STDIN_FILENO, buf, sizeof(buf)); if (n > 0) { blake3_hasher_update(&hasher, buf, n); } else if (n == 0) { break; // end of file } else { fprintf(stderr, "read failed: %s\n", strerror(errno)); return 1; } } // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. uint8_t output[BLAKE3_OUT_LEN]; blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); // Print the hash as hexadecimal. for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { printf("%02x", output[i]); } printf("\n"); return 0; } blake3-1.8.1/c/example_tbb.c000064400000000000000000000031311046102023000136420ustar 00000000000000#include "blake3.h" #include #include #include #include #include #include #include int main(int argc, char **argv) { // For each filepath argument, memory map it and hash it. for (int i = 1; i < argc; i++) { // Open and memory map the file. 
int fd = open(argv[i], O_RDONLY); if (fd == -1) { fprintf(stderr, "open failed: %s\n", strerror(errno)); return 1; } struct stat statbuf; if (fstat(fd, &statbuf) == -1) { fprintf(stderr, "stat failed: %s\n", strerror(errno)); return 1; } void *mapped = mmap(NULL, statbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (mapped == MAP_FAILED) { fprintf(stderr, "mmap failed: %s\n", strerror(errno)); return 1; } // Initialize the hasher. blake3_hasher hasher; blake3_hasher_init(&hasher); // Hash the mapped file using multiple threads. blake3_hasher_update_tbb(&hasher, mapped, statbuf.st_size); // Unmap and close the file. if (munmap(mapped, statbuf.st_size) == -1) { fprintf(stderr, "munmap failed: %s\n", strerror(errno)); return 1; } if (close(fd) == -1) { fprintf(stderr, "close failed: %s\n", strerror(errno)); return 1; } // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. uint8_t output[BLAKE3_OUT_LEN]; blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); // Print the hash as hexadecimal. for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { printf("%02x", output[i]); } printf("\n"); } } blake3-1.8.1/c/libblake3.pc.in000064400000000000000000000005201046102023000137740ustar 00000000000000prefix="@CMAKE_INSTALL_PREFIX@" exec_prefix="${prefix}" libdir="@PKG_CONFIG_INSTALL_LIBDIR@" includedir="@PKG_CONFIG_INSTALL_INCLUDEDIR@" Name: @PROJECT_NAME@ Description: @PROJECT_DESCRIPTION@ Version: @PROJECT_VERSION@ Requires: @PKG_CONFIG_REQUIRES@ Libs: -L"${libdir}" -lblake3 Cflags: -I"${includedir}" @BLAKE3_PKGCONFIG_CFLAGS@ blake3-1.8.1/c/main.c000064400000000000000000000077131046102023000123160ustar 00000000000000/* * This main file is intended for testing via `make test`. It does not build in * other settings. See README.md in this directory for examples of how to build * C code. 
*/ #include #include #include #include #include #include #include "blake3.h" #include "blake3_impl.h" #define HASH_MODE 0 #define KEYED_HASH_MODE 1 #define DERIVE_KEY_MODE 2 static void hex_char_value(uint8_t c, uint8_t *value, bool *valid) { if ('0' <= c && c <= '9') { *value = c - '0'; *valid = true; } else if ('a' <= c && c <= 'f') { *value = 10 + c - 'a'; *valid = true; } else { *valid = false; } } static int parse_key(char *hex_key, uint8_t out[BLAKE3_KEY_LEN]) { size_t hex_len = strlen(hex_key); if (hex_len != 64) { fprintf(stderr, "Expected a 64-char hexadecimal key, got %zu chars.\n", hex_len); return 1; } for (size_t i = 0; i < 64; i++) { uint8_t value; bool valid; hex_char_value(hex_key[i], &value, &valid); if (!valid) { fprintf(stderr, "Invalid hex char.\n"); return 1; } if (i % 2 == 0) { out[i / 2] = 0; value <<= 4; } out[i / 2] += value; } return 0; } /* A little repetition here */ enum cpu_feature { SSE2 = 1 << 0, SSSE3 = 1 << 1, SSE41 = 1 << 2, AVX = 1 << 3, AVX2 = 1 << 4, AVX512F = 1 << 5, AVX512VL = 1 << 6, /* ... 
*/ UNDEFINED = 1 << 30 }; extern enum cpu_feature g_cpu_features; enum cpu_feature get_cpu_features(void); int main(int argc, char **argv) { size_t out_len = BLAKE3_OUT_LEN; uint8_t key[BLAKE3_KEY_LEN]; char *context = ""; uint8_t mode = HASH_MODE; while (argc > 1) { if (argc <= 2) { fprintf(stderr, "Odd number of arguments.\n"); return 1; } if (strcmp("--length", argv[1]) == 0) { char *endptr = NULL; errno = 0; unsigned long long out_len_ll = strtoull(argv[2], &endptr, 10); if (errno != 0 || out_len_ll > SIZE_MAX || endptr == argv[2] || *endptr != 0) { fprintf(stderr, "Bad length argument.\n"); return 1; } out_len = (size_t)out_len_ll; } else if (strcmp("--keyed", argv[1]) == 0) { mode = KEYED_HASH_MODE; int ret = parse_key(argv[2], key); if (ret != 0) { return ret; } } else if (strcmp("--derive-key", argv[1]) == 0) { mode = DERIVE_KEY_MODE; context = argv[2]; } else { fprintf(stderr, "Unknown flag.\n"); return 1; } argc -= 2; argv += 2; } /* * We're going to hash the input multiple times, so we need to buffer it all. * This is just for test cases, so go ahead and assume that the input is less * than 1 MiB. */ size_t buf_capacity = 1 << 20; uint8_t *buf = malloc(buf_capacity); assert(buf != NULL); size_t buf_len = 0; while (1) { size_t n = fread(&buf[buf_len], 1, buf_capacity - buf_len, stdin); if (n == 0) { break; } buf_len += n; assert(buf_len < buf_capacity); } const int mask = get_cpu_features(); int feature = 0; do { fprintf(stderr, "Testing 0x%08X\n", feature); g_cpu_features = feature; blake3_hasher hasher; switch (mode) { case HASH_MODE: blake3_hasher_init(&hasher); break; case KEYED_HASH_MODE: blake3_hasher_init_keyed(&hasher, key); break; case DERIVE_KEY_MODE: blake3_hasher_init_derive_key(&hasher, context); break; default: abort(); } blake3_hasher_update(&hasher, buf, buf_len); /* TODO: An incremental output reader API to avoid this allocation. 
*/ uint8_t *out = malloc(out_len); if (out_len > 0 && out == NULL) { fprintf(stderr, "malloc() failed.\n"); return 1; } blake3_hasher_finalize(&hasher, out, out_len); for (size_t i = 0; i < out_len; i++) { printf("%02x", out[i]); } printf("\n"); free(out); feature = (feature - mask) & mask; } while (feature != 0); free(buf); return 0; } blake3-1.8.1/c/test.py000075500000000000000000000071061046102023000125560ustar 00000000000000#! /usr/bin/env python3 from binascii import hexlify import json from os import path import subprocess HERE = path.dirname(__file__) TEST_VECTORS_PATH = path.join(HERE, "..", "test_vectors", "test_vectors.json") TEST_VECTORS = json.load(open(TEST_VECTORS_PATH)) def run_blake3(args, input): output = subprocess.run([path.join(HERE, "blake3")] + args, input=input, stdout=subprocess.PIPE, check=True) return output.stdout.decode().strip() # Fill the input with a repeating byte pattern. We use a cycle length of 251, # because that's the largest prime number less than 256. This makes it unlikely # to swapping any two adjacent input blocks or chunks will give the same # answer. def make_test_input(length): i = 0 buf = bytearray() while len(buf) < length: buf.append(i) i = (i + 1) % 251 return buf def main(): for case in TEST_VECTORS["cases"]: input_len = case["input_len"] input = make_test_input(input_len) hex_key = hexlify(TEST_VECTORS["key"].encode()) context_string = TEST_VECTORS["context_string"] expected_hash_xof = case["hash"] expected_hash = expected_hash_xof[:64] expected_keyed_hash_xof = case["keyed_hash"] expected_keyed_hash = expected_keyed_hash_xof[:64] expected_derive_key_xof = case["derive_key"] expected_derive_key = expected_derive_key_xof[:64] # Test the default hash. test_hash = run_blake3([], input) for line in test_hash.splitlines(): assert expected_hash == line, \ "hash({}): {} != {}".format(input_len, expected_hash, line) # Test the extended hash. 
xof_len = len(expected_hash_xof) // 2 test_hash_xof = run_blake3(["--length", str(xof_len)], input) for line in test_hash_xof.splitlines(): assert expected_hash_xof == line, \ "hash_xof({}): {} != {}".format( input_len, expected_hash_xof, line) # Test the default keyed hash. test_keyed_hash = run_blake3(["--keyed", hex_key], input) for line in test_keyed_hash.splitlines(): assert expected_keyed_hash == line, \ "keyed_hash({}): {} != {}".format( input_len, expected_keyed_hash, line) # Test the extended keyed hash. xof_len = len(expected_keyed_hash_xof) // 2 test_keyed_hash_xof = run_blake3( ["--keyed", hex_key, "--length", str(xof_len)], input) for line in test_keyed_hash_xof.splitlines(): assert expected_keyed_hash_xof == line, \ "keyed_hash_xof({}): {} != {}".format( input_len, expected_keyed_hash_xof, line) # Test the default derive key. test_derive_key = run_blake3(["--derive-key", context_string], input) for line in test_derive_key.splitlines(): assert expected_derive_key == line, \ "derive_key({}): {} != {}".format( input_len, expected_derive_key, line) # Test the extended derive key. 
xof_len = len(expected_derive_key_xof) // 2 test_derive_key_xof = run_blake3( ["--derive-key", context_string, "--length", str(xof_len)], input) for line in test_derive_key_xof.splitlines(): assert expected_derive_key_xof == line, \ "derive_key_xof({}): {} != {}".format( input_len, expected_derive_key_xof, line) if __name__ == "__main__": main() blake3-1.8.1/media/B3.svg000064400000000000000000000075161046102023000130510ustar 00000000000000 image/svg+xml blake3-1.8.1/media/BLAKE3.svg000064400000000000000000000152121046102023000134760ustar 00000000000000 image/svg+xml blake3-1.8.1/media/speed.svg000064400000000000000000001334251046102023000137040ustar 00000000000000 blake3-1.8.1/src/ffi_avx2.rs000064400000000000000000000031761046102023000136440ustar 00000000000000use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; // Note that there is no AVX2 implementation of compress_in_place or // compress_xof. // Unsafe because this may only be called on platforms supporting AVX2. pub unsafe fn hash_many( inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { unsafe { // The Rust hash_many implementations do bounds checking on the `out` // array, but the C implementations don't. Even though this is an unsafe // function, assert the bounds here. 
assert!(out.len() >= inputs.len() * OUT_LEN); ffi::blake3_hash_many_avx2( inputs.as_ptr() as *const *const u8, inputs.len(), N / BLOCK_LEN, key.as_ptr(), counter, increment_counter.yes(), flags, flags_start, flags_end, out.as_mut_ptr(), ) } } pub mod ffi { extern "C" { pub fn blake3_hash_many_avx2( inputs: *const *const u8, num_inputs: usize, blocks: usize, key: *const u32, counter: u64, increment_counter: bool, flags: u8, flags_start: u8, flags_end: u8, out: *mut u8, ); } } #[cfg(test)] mod test { use super::*; #[test] fn test_hash_many() { if !crate::platform::avx2_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.8.1/src/ffi_avx512.rs000064400000000000000000000077711046102023000140170ustar 00000000000000use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; // Unsafe because this may only be called on platforms supporting AVX-512. pub unsafe fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { unsafe { ffi::blake3_compress_in_place_avx512( cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags, ) } } // Unsafe because this may only be called on platforms supporting AVX-512. pub unsafe fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { unsafe { let mut out = [0u8; 64]; ffi::blake3_compress_xof_avx512( cv.as_ptr(), block.as_ptr(), block_len, counter, flags, out.as_mut_ptr(), ); out } } // Unsafe because this may only be called on platforms supporting AVX-512. pub unsafe fn hash_many( inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { unsafe { // The Rust hash_many implementations do bounds checking on the `out` // array, but the C implementations don't. Even though this is an unsafe // function, assert the bounds here. 
assert!(out.len() >= inputs.len() * OUT_LEN); ffi::blake3_hash_many_avx512( inputs.as_ptr() as *const *const u8, inputs.len(), N / BLOCK_LEN, key.as_ptr(), counter, increment_counter.yes(), flags, flags_start, flags_end, out.as_mut_ptr(), ) } } // Unsafe because this may only be called on platforms supporting AVX-512. #[cfg(unix)] pub unsafe fn xof_many( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, out: &mut [u8], ) { unsafe { debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only"); ffi::blake3_xof_many_avx512( cv.as_ptr(), block.as_ptr(), block_len, counter, flags, out.as_mut_ptr(), out.len() / BLOCK_LEN, ); } } pub mod ffi { extern "C" { pub fn blake3_compress_in_place_avx512( cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8, ); pub fn blake3_compress_xof_avx512( cv: *const u32, block: *const u8, block_len: u8, counter: u64, flags: u8, out: *mut u8, ); pub fn blake3_hash_many_avx512( inputs: *const *const u8, num_inputs: usize, blocks: usize, key: *const u32, counter: u64, increment_counter: bool, flags: u8, flags_start: u8, flags_end: u8, out: *mut u8, ); #[cfg(unix)] pub fn blake3_xof_many_avx512( cv: *const u32, block: *const u8, block_len: u8, counter: u64, flags: u8, out: *mut u8, outblocks: usize, ); } } #[cfg(test)] mod test { use super::*; #[test] fn test_compress() { if !crate::platform::avx512_detected() { return; } crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] fn test_hash_many() { if !crate::platform::avx512_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } #[cfg(unix)] #[test] fn test_xof_many() { if !crate::platform::avx512_detected() { return; } crate::test::test_xof_many_fn(xof_many); } } blake3-1.8.1/src/ffi_neon.rs000064400000000000000000000042231046102023000137150ustar 00000000000000use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; // Unsafe because this may only be called on platforms supporting NEON. 
pub unsafe fn hash_many( inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { // The Rust hash_many implementations do bounds checking on the `out` // array, but the C implementations don't. Even though this is an unsafe // function, assert the bounds here. assert!(out.len() >= inputs.len() * OUT_LEN); ffi::blake3_hash_many_neon( inputs.as_ptr() as *const *const u8, inputs.len(), N / BLOCK_LEN, key.as_ptr(), counter, increment_counter.yes(), flags, flags_start, flags_end, out.as_mut_ptr(), ) } // blake3_neon.c normally depends on blake3_portable.c, because the NEON // implementation only provides 4x compression, and it relies on the portable // implementation for 1x compression. However, we expose the portable Rust // implementation here instead, to avoid linking in unnecessary code. #[no_mangle] pub extern "C" fn blake3_compress_in_place_portable( cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8, ) { unsafe { crate::portable::compress_in_place( &mut *(cv as *mut [u32; 8]), &*(block as *const [u8; 64]), block_len, counter, flags, ) } } pub mod ffi { extern "C" { pub fn blake3_hash_many_neon( inputs: *const *const u8, num_inputs: usize, blocks: usize, key: *const u32, counter: u64, increment_counter: bool, flags: u8, flags_start: u8, flags_end: u8, out: *mut u8, ); } } #[cfg(test)] mod test { use super::*; #[test] fn test_hash_many() { // This entire file is gated on feature="neon", so NEON support is // assumed here. crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.8.1/src/ffi_sse2.rs000064400000000000000000000057571046102023000136470ustar 00000000000000use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; // Unsafe because this may only be called on platforms supporting SSE2. 
pub unsafe fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { unsafe { ffi::blake3_compress_in_place_sse2( cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags, ) } } // Unsafe because this may only be called on platforms supporting SSE2. pub unsafe fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { unsafe { let mut out = [0u8; 64]; ffi::blake3_compress_xof_sse2( cv.as_ptr(), block.as_ptr(), block_len, counter, flags, out.as_mut_ptr(), ); out } } // Unsafe because this may only be called on platforms supporting SSE2. pub unsafe fn hash_many( inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { unsafe { // The Rust hash_many implementations do bounds checking on the `out` // array, but the C implementations don't. Even though this is an unsafe // function, assert the bounds here. 
assert!(out.len() >= inputs.len() * OUT_LEN); ffi::blake3_hash_many_sse2( inputs.as_ptr() as *const *const u8, inputs.len(), N / BLOCK_LEN, key.as_ptr(), counter, increment_counter.yes(), flags, flags_start, flags_end, out.as_mut_ptr(), ) } } pub mod ffi { extern "C" { pub fn blake3_compress_in_place_sse2( cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8, ); pub fn blake3_compress_xof_sse2( cv: *const u32, block: *const u8, block_len: u8, counter: u64, flags: u8, out: *mut u8, ); pub fn blake3_hash_many_sse2( inputs: *const *const u8, num_inputs: usize, blocks: usize, key: *const u32, counter: u64, increment_counter: bool, flags: u8, flags_start: u8, flags_end: u8, out: *mut u8, ); } } #[cfg(test)] mod test { use super::*; #[test] fn test_compress() { if !crate::platform::sse2_detected() { return; } crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] fn test_hash_many() { if !crate::platform::sse2_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.8.1/src/ffi_sse41.rs000064400000000000000000000057751046102023000137320ustar 00000000000000use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; // Unsafe because this may only be called on platforms supporting SSE4.1. pub unsafe fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { unsafe { ffi::blake3_compress_in_place_sse41( cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags, ) } } // Unsafe because this may only be called on platforms supporting SSE4.1. pub unsafe fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { unsafe { let mut out = [0u8; 64]; ffi::blake3_compress_xof_sse41( cv.as_ptr(), block.as_ptr(), block_len, counter, flags, out.as_mut_ptr(), ); out } } // Unsafe because this may only be called on platforms supporting SSE4.1. 
pub unsafe fn hash_many( inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { unsafe { // The Rust hash_many implementations do bounds checking on the `out` // array, but the C implementations don't. Even though this is an unsafe // function, assert the bounds here. assert!(out.len() >= inputs.len() * OUT_LEN); ffi::blake3_hash_many_sse41( inputs.as_ptr() as *const *const u8, inputs.len(), N / BLOCK_LEN, key.as_ptr(), counter, increment_counter.yes(), flags, flags_start, flags_end, out.as_mut_ptr(), ) } } pub mod ffi { extern "C" { pub fn blake3_compress_in_place_sse41( cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8, ); pub fn blake3_compress_xof_sse41( cv: *const u32, block: *const u8, block_len: u8, counter: u64, flags: u8, out: *mut u8, ); pub fn blake3_hash_many_sse41( inputs: *const *const u8, num_inputs: usize, blocks: usize, key: *const u32, counter: u64, increment_counter: bool, flags: u8, flags_start: u8, flags_end: u8, out: *mut u8, ); } } #[cfg(test)] mod test { use super::*; #[test] fn test_compress() { if !crate::platform::sse41_detected() { return; } crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] fn test_hash_many() { if !crate::platform::sse41_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.8.1/src/guts.rs000064400000000000000000000027431046102023000131210ustar 00000000000000//! Deprecated in favor of [`hazmat`](crate::hazmat) pub use crate::{BLOCK_LEN, CHUNK_LEN}; #[derive(Clone, Debug)] pub struct ChunkState(crate::ChunkState); impl ChunkState { // Currently this type only supports the regular hash mode. If an // incremental user needs keyed_hash or derive_key, we can add that. 
pub fn new(chunk_counter: u64) -> Self { Self(crate::ChunkState::new( crate::IV, chunk_counter, 0, crate::platform::Platform::detect(), )) } #[inline] pub fn len(&self) -> usize { self.0.count() } #[inline] pub fn update(&mut self, input: &[u8]) -> &mut Self { self.0.update(input); self } pub fn finalize(&self, is_root: bool) -> crate::Hash { let output = self.0.output(); if is_root { output.root_hash() } else { output.chaining_value().into() } } } // As above, this currently assumes the regular hash mode. If an incremental // user needs keyed_hash or derive_key, we can add that. pub fn parent_cv( left_child: &crate::Hash, right_child: &crate::Hash, is_root: bool, ) -> crate::Hash { let output = crate::parent_node_output( left_child.as_bytes(), right_child.as_bytes(), crate::IV, 0, crate::platform::Platform::detect(), ); if is_root { output.root_hash() } else { output.chaining_value().into() } } blake3-1.8.1/src/hazmat.rs000064400000000000000000000704521046102023000134250ustar 00000000000000//! Low-level tree manipulations and other sharp tools //! //! The target audience for this module is projects like [Bao](https://github.com/oconnor663/bao), //! which work directly with the interior hashes ("chaining values") of BLAKE3 chunks and subtrees. //! For example, you could use these functions to implement a BitTorrent-like protocol using the //! BLAKE3 tree structure, or to hash an input that's distributed across different machines. These //! use cases are advanced, and most applications don't need this module. Also: //! //!
//! //! **Warning:** This module is *hazardous material*. If you've heard folks say *don't roll your //! own crypto,* this is the sort of thing they're talking about. These functions have complicated //! requirements, and any mistakes will give you garbage output and/or break the security //! properties that BLAKE3 is supposed to have. Read section 2.1 of [the BLAKE3 //! paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) to understand the //! tree structure you need to maintain. Test your code against [`blake3::hash`](../fn.hash.html) //! and make sure you can get the same outputs for [lots of different //! inputs](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json). //! //!
//! //! On the other hand: //! //!
//! //! **Encouragement:** Playing with these functions is a great way to learn how BLAKE3 works on the //! inside. Have fun! //! //!
//! //! The main entrypoint for this module is the [`HasherExt`] trait, particularly the //! [`set_input_offset`](HasherExt::set_input_offset) and //! [`finalize_non_root`](HasherExt::finalize_non_root) methods. These let you compute the chaining //! values of individual chunks or subtrees. You then combine these chaining values into larger //! subtrees using [`merge_subtrees_non_root`] and finally (once at the very top) //! [`merge_subtrees_root`] or [`merge_subtrees_root_xof`]. //! //! # Examples //! //! Here's an example of computing all the interior hashes in a 3-chunk tree: //! //! ```text //! root //! / \ //! parent \ //! / \ \ //! chunk0 chunk1 chunk2 //! ``` //! //! ``` //! # fn main() { //! use blake3::{Hasher, CHUNK_LEN}; //! use blake3::hazmat::{merge_subtrees_non_root, merge_subtrees_root, Mode}; //! use blake3::hazmat::HasherExt; // an extension trait for Hasher //! //! let chunk0 = [b'a'; CHUNK_LEN]; //! let chunk1 = [b'b'; CHUNK_LEN]; //! let chunk2 = [b'c'; 42]; // The final chunk can be short. //! //! // Compute the non-root hashes ("chaining values") of all three chunks. Chunks or subtrees //! // that don't begin at the start of the input use `set_input_offset` to say where they begin. //! let chunk0_cv = Hasher::new() //! // .set_input_offset(0) is the default. //! .update(&chunk0) //! .finalize_non_root(); //! let chunk1_cv = Hasher::new() //! .set_input_offset(CHUNK_LEN as u64) //! .update(&chunk1) //! .finalize_non_root(); //! let chunk2_cv = Hasher::new() //! .set_input_offset(2 * CHUNK_LEN as u64) //! .update(&chunk2) //! .finalize_non_root(); //! //! // Join the first two chunks with a non-root parent node and compute its chaining value. //! let parent_cv = merge_subtrees_non_root(&chunk0_cv, &chunk1_cv, Mode::Hash); //! //! // Join that parent node and the third chunk with a root parent node and compute the hash. //! let root_hash = merge_subtrees_root(&parent_cv, &chunk2_cv, Mode::Hash); //! //! 
// Double check that we got the right answer. //! let mut combined_input = Vec::new(); //! combined_input.extend_from_slice(&chunk0); //! combined_input.extend_from_slice(&chunk1); //! combined_input.extend_from_slice(&chunk2); //! assert_eq!(root_hash, blake3::hash(&combined_input)); //! # } //! ``` //! //! Hashing many chunks together is important for performance, because it allows the implementation //! to use SIMD parallelism internally. ([AVX-512](https://en.wikipedia.org/wiki/AVX-512) for //! example needs 16 chunks to really get going.) We can reproduce `parent_cv` by hashing `chunk0` //! and `chunk1` at the same time: //! //! ``` //! # fn main() { //! # use blake3::{Hasher, CHUNK_LEN}; //! # use blake3::hazmat::{Mode, HasherExt, merge_subtrees_non_root, merge_subtrees_root}; //! # let chunk0 = [b'a'; CHUNK_LEN]; //! # let chunk1 = [b'b'; CHUNK_LEN]; //! # let chunk0_cv = Hasher::new().update(&chunk0).finalize_non_root(); //! # let chunk1_cv = Hasher::new().set_input_offset(CHUNK_LEN as u64).update(&chunk1).finalize_non_root(); //! # let parent_cv = merge_subtrees_non_root(&chunk0_cv, &chunk1_cv, Mode::Hash); //! # let mut combined_input = Vec::new(); //! # combined_input.extend_from_slice(&chunk0); //! # combined_input.extend_from_slice(&chunk1); //! let left_subtree_cv = Hasher::new() //! // .set_input_offset(0) is the default. //! .update(&combined_input[..2 * CHUNK_LEN]) //! .finalize_non_root(); //! assert_eq!(left_subtree_cv, parent_cv); //! //! // Using multiple updates gives the same answer, though it's not as efficient. //! let mut subtree_hasher = Hasher::new(); //! // Again, .set_input_offset(0) is the default. //! subtree_hasher.update(&chunk0); //! subtree_hasher.update(&chunk1); //! assert_eq!(left_subtree_cv, subtree_hasher.finalize_non_root()); //! # } //! ``` //! //! However, hashing multiple chunks together **must** respect the overall tree structure. Hashing //! 
`chunk0` and `chunk1` together is valid, but hashing `chunk1` and `chunk2` together is //! incorrect and gives a garbage result that will never match a standard BLAKE3 hash. The //! implementation includes a few best-effort asserts to catch some of these mistakes, but these //! checks aren't guaranteed. For example, this second call to `update` currently panics: //! //! ```should_panic //! # fn main() { //! # use blake3::{Hasher, CHUNK_LEN}; //! # use blake3::hazmat::HasherExt; //! # let chunk0 = [b'a'; CHUNK_LEN]; //! # let chunk1 = [b'b'; CHUNK_LEN]; //! # let chunk2 = [b'c'; 42]; //! let oops = Hasher::new() //! .set_input_offset(CHUNK_LEN as u64) //! .update(&chunk1) //! // PANIC: "the subtree starting at 1024 contains at most 1024 bytes" //! .update(&chunk2) //! .finalize_non_root(); //! # } //! ``` //! //! For more on valid tree structures, see the docs for and [`left_subtree_len`] and //! [`max_subtree_len`], and see section 2.1 of [the BLAKE3 //! paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). Note that the //! merging functions ([`merge_subtrees_root`] and friends) don't know the shape of the left and //! right subtrees you're giving them, and they can't help you catch mistakes. The best way to //! catch mistakes with these is to compare your root output to the [`blake3::hash`](crate::hash) //! of the same input. use crate::platform::Platform; use crate::{CVWords, Hasher, CHUNK_LEN, IV, KEY_LEN, OUT_LEN}; /// Extension methods for [`Hasher`]. This is the main entrypoint to the `hazmat` module. pub trait HasherExt { /// Similar to [`Hasher::new_derive_key`] but using a pre-hashed [`ContextKey`] from /// [`hash_derive_key_context`]. 
/// /// The [`hash_derive_key_context`] function is _only_ valid source of the [`ContextKey`] /// /// # Example /// /// ``` /// use blake3::Hasher; /// use blake3::hazmat::HasherExt; /// /// let context_key = blake3::hazmat::hash_derive_key_context("foo"); /// let mut hasher = Hasher::new_from_context_key(&context_key); /// hasher.update(b"bar"); /// let derived_key = *hasher.finalize().as_bytes(); /// /// assert_eq!(derived_key, blake3::derive_key("foo", b"bar")); /// ``` fn new_from_context_key(context_key: &ContextKey) -> Self; /// Configure the `Hasher` to process a chunk or subtree starting at `offset` bytes into the /// whole input. /// /// You must call this function before processing any input with [`update`](Hasher::update) or /// similar. This step isn't required for the first chunk, or for a subtree that includes the /// first chunk (i.e. when the `offset` is zero), but it's required for all other chunks and /// subtrees. /// /// The starting input offset of a subtree implies a maximum possible length for that subtree. /// See [`max_subtree_len`] and section 2.1 of [the BLAKE3 /// paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). Note that only /// subtrees along the right edge of the whole tree can have a length less than their maximum /// possible length. /// /// See the [module level examples](index.html#examples). /// /// # Panics /// /// This function panics if the `Hasher` has already accepted any input with /// [`update`](Hasher::update) or similar. /// /// This should always be paired with [`finalize_non_root`](HasherExt::finalize_non_root). It's /// never correct to use a non-zero input offset with [`finalize`](Hasher::finalize) or /// [`finalize_xof`](Hasher::finalize_xof). The `offset` must also be a multiple of /// `CHUNK_LEN`. Violating either of these rules will currently fail an assertion and panic, /// but this is not guaranteed. 
fn set_input_offset(&mut self, offset: u64) -> &mut Self; /// Finalize the non-root hash ("chaining value") of the current chunk or subtree. /// /// Afterwards you can merge subtree chaining values into parent nodes using /// [`merge_subtrees_non_root`] and ultimately into the root node with either /// [`merge_subtrees_root`] (similar to [`Hasher::finalize`]) or [`merge_subtrees_root_xof`] /// (similar to [`Hasher::finalize_xof`]). /// /// See the [module level examples](index.html#examples), particularly the discussion of valid /// tree structures. fn finalize_non_root(&self) -> ChainingValue; } impl HasherExt for Hasher { fn new_from_context_key(context_key: &[u8; KEY_LEN]) -> Hasher { let context_key_words = crate::platform::words_from_le_bytes_32(context_key); Hasher::new_internal(&context_key_words, crate::DERIVE_KEY_MATERIAL) } fn set_input_offset(&mut self, offset: u64) -> &mut Hasher { assert_eq!(self.count(), 0, "hasher has already accepted input"); assert_eq!( offset % CHUNK_LEN as u64, 0, "offset ({offset}) must be a chunk boundary (divisible by {CHUNK_LEN})", ); let counter = offset / CHUNK_LEN as u64; self.chunk_state.chunk_counter = counter; self.initial_chunk_counter = counter; self } fn finalize_non_root(&self) -> ChainingValue { assert_ne!(self.count(), 0, "empty subtrees are never valid"); self.final_output().chaining_value() } } /// The maximum length of a subtree in bytes, given its starting offset in bytes /// /// If you try to hash more than this many bytes as one subtree, you'll end up merging parent nodes /// that shouldn't be merged, and your output will be garbage. [`Hasher::update`] will currently /// panic in this case, but this is not guaranteed. /// /// For input offset zero (the default), there is no maximum length, and this function returns /// `None`. For all other offsets it returns `Some`. Note that valid offsets must be a multiple of /// [`CHUNK_LEN`] (1024); it's not possible to start hashing a chunk in the middle. 
/// /// In the example tree below, chunks are numbered by their _0-based index_. The subtree that /// _starts_ with chunk 3, i.e. `input_offset = 3 * CHUNK_LEN`, includes only that one chunk, so /// its max length is `Some(CHUNK_LEN)`. The subtree that starts with chunk 6 includes chunk 7 but /// not chunk 8, so its max length is `Some(2 * CHUNK_LEN)`. The subtree that starts with chunk 12 /// includes chunks 13, 14, and 15, but if the tree were bigger it would not include chunk 16, so /// its max length is `Some(4 * CHUNK_LEN)`. One way to think about the rule here is that, if you /// go beyond the max subtree length from a given starting offset, you start dealing with subtrees /// that include chunks _to the left_ of where you started. /// /// ```text /// root /// / \ /// . . /// / \ / \ /// . . . . /// / \ / \ / \ / \ /// . . . . . . . . /// / \ / \ / \ / \ / \ / \ / \ / \ /// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 /// ``` /// /// The general rule turns out to be that for a subtree starting at a 0-based chunk index N greater /// than zero, the maximum number of chunks in that subtree is the largest power-of-two that /// divides N, which is given by `1 << N.trailing_zeros()`. /// /// This function can be useful for writing tests or debug assertions, but it's actually rare to /// use this for real control flow. Callers who split their input recursively using /// [`left_subtree_len`] will automatically satisfy the `max_subtree_len` bound and don't /// necessarily need to check. It's also common to choose some fixed power-of-two subtree size, say /// 64 chunks, and divide your input up into slices of that fixed length (with the final slice /// possibly short). This approach also automatically satisfies the `max_subtree_len` bound and /// doesn't need to check. Proving that this is true can be an interesting exercise. Note that /// chunks 0, 4, 8, and 12 all begin subtrees of at least 4 chunks in the example tree above. 
/// /// # Panics /// /// This function currently panics if `input_offset` is not a multiple of `CHUNK_LEN`. This is not /// guaranteed. #[inline(always)] pub fn max_subtree_len(input_offset: u64) -> Option { if input_offset == 0 { return None; } assert_eq!(input_offset % CHUNK_LEN as u64, 0); let counter = input_offset / CHUNK_LEN as u64; let max_chunks = 1 << counter.trailing_zeros(); Some(max_chunks * CHUNK_LEN as u64) } #[test] fn test_max_subtree_len() { assert_eq!(max_subtree_len(0), None); // (chunk index, max chunks) let cases = [ (1, 1), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 1), (8, 8), ]; for (chunk_index, max_chunks) in cases { let input_offset = chunk_index * CHUNK_LEN as u64; assert_eq!( max_subtree_len(input_offset), Some(max_chunks * CHUNK_LEN as u64), ); } } /// Given the length in bytes of either a complete input or a subtree input, return the number of /// bytes that belong to its left child subtree. The rest belong to its right child subtree. /// /// Concretely, this function returns the largest power-of-two number of bytes that's strictly less /// than `input_len`. This leads to a tree where all left subtrees are "complete" and at least as /// large as their sibling right subtrees, as specified in section 2.1 of [the BLAKE3 /// paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). For example, if an /// input is exactly two chunks, its left and right subtrees both get one chunk. But if an input is /// two chunks plus one more byte, then its left subtree gets two chunks, and its right subtree /// only gets one byte. /// /// This function isn't meaningful for one chunk of input, because chunks don't have children. It /// currently panics in debug mode if `input_len <= CHUNK_LEN`. 
/// /// # Example /// /// Hash a input of random length as two subtrees: /// /// ``` /// # #[cfg(feature = "std")] { /// use blake3::hazmat::{left_subtree_len, merge_subtrees_root, HasherExt, Mode}; /// use blake3::{Hasher, CHUNK_LEN}; /// /// // Generate a random-length input. Note that to be split into two subtrees, the input length /// // must be greater than CHUNK_LEN. /// let input_len = rand::random_range(CHUNK_LEN + 1..1_000_000); /// let mut input = vec![0; input_len]; /// rand::fill(&mut input[..]); /// /// // Compute the left and right subtree hashes and then the root hash. left_subtree_len() tells /// // us exactly where to split the input. Any other split would either panic (if we're lucky) or /// // lead to an incorrect root hash. /// let left_len = left_subtree_len(input_len as u64) as usize; /// let left_subtree_cv = Hasher::new() /// .update(&input[..left_len]) /// .finalize_non_root(); /// let right_subtree_cv = Hasher::new() /// .set_input_offset(left_len as u64) /// .update(&input[left_len..]) /// .finalize_non_root(); /// let root_hash = merge_subtrees_root(&left_subtree_cv, &right_subtree_cv, Mode::Hash); /// /// // Double check the answer. /// assert_eq!(root_hash, blake3::hash(&input)); /// # } /// ``` #[inline(always)] pub fn left_subtree_len(input_len: u64) -> u64 { debug_assert!(input_len > CHUNK_LEN as u64); // Note that .next_power_of_two() is greater than *or equal*. ((input_len + 1) / 2).next_power_of_two() } #[test] fn test_left_subtree_len() { assert_eq!(left_subtree_len(1025), 1024); for boundary_case in [2, 4, 8, 16, 32, 64] { let input_len = boundary_case * CHUNK_LEN as u64; assert_eq!(left_subtree_len(input_len - 1), input_len / 2); assert_eq!(left_subtree_len(input_len), input_len / 2); assert_eq!(left_subtree_len(input_len + 1), input_len); } } /// The `mode` argument to [`merge_subtrees_root`] and friends /// /// See the [module level examples](index.html#examples). 
#[derive(Copy, Clone, Debug)] pub enum Mode<'a> { /// Corresponding to [`hash`](crate::hash) Hash, /// Corresponding to [`keyed_hash`](crate::hash) KeyedHash(&'a [u8; KEY_LEN]), /// Corresponding to [`derive_key`](crate::hash) /// /// The [`ContextKey`] comes from [`hash_derive_key_context`]. DeriveKeyMaterial(&'a ContextKey), } impl<'a> Mode<'a> { fn key_words(&self) -> CVWords { match self { Mode::Hash => *IV, Mode::KeyedHash(key) => crate::platform::words_from_le_bytes_32(key), Mode::DeriveKeyMaterial(cx_key) => crate::platform::words_from_le_bytes_32(cx_key), } } fn flags_byte(&self) -> u8 { match self { Mode::Hash => 0, Mode::KeyedHash(_) => crate::KEYED_HASH, Mode::DeriveKeyMaterial(_) => crate::DERIVE_KEY_MATERIAL, } } } /// "Chaining value" is the academic term for a non-root or non-final hash. /// /// Besides just sounding fancy, it turns out there are [security /// reasons](https://jacko.io/tree_hashing.html) to be careful about the difference between /// (root/final) hashes and (non-root/non-final) chaining values. pub type ChainingValue = [u8; OUT_LEN]; fn merge_subtrees_inner( left_child: &ChainingValue, right_child: &ChainingValue, mode: Mode, ) -> crate::Output { crate::parent_node_output( &left_child, &right_child, &mode.key_words(), mode.flags_byte(), Platform::detect(), ) } /// Compute a non-root parent node chaining value from two child chaining values. /// /// See the [module level examples](index.html#examples), particularly the discussion of valid tree /// structures. The left and right child chaining values can come from either /// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or other calls to /// `merge_subtrees_non_root`. "Chaining value" is the academic term for a non-root or non-final /// hash. 
pub fn merge_subtrees_non_root( left_child: &ChainingValue, right_child: &ChainingValue, mode: Mode, ) -> ChainingValue { merge_subtrees_inner(left_child, right_child, mode).chaining_value() } /// Compute a root hash from two child chaining values. /// /// See the [module level examples](index.html#examples), particularly the discussion of valid tree /// structures. The left and right child chaining values can come from either /// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or [`merge_subtrees_non_root`]. /// "Chaining value" is the academic term for a non-root or non-final hash. /// /// Note that inputs of [`CHUNK_LEN`] or less don't produce any parent nodes and can't be hashed /// using this function. In that case you must get the root hash from [`Hasher::finalize`] (or just /// [`blake3::hash`](crate::hash)). pub fn merge_subtrees_root( left_child: &ChainingValue, right_child: &ChainingValue, mode: Mode, ) -> crate::Hash { merge_subtrees_inner(left_child, right_child, mode).root_hash() } /// Build a root [`OutputReader`](crate::OutputReader) from two child chaining values. /// /// See also the [module level examples](index.html#examples), particularly the discussion of valid /// tree structures. The left and right child chaining values can come from either /// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or [`merge_subtrees_non_root`]. /// "Chaining value" is the academic term for a non-root or non-final hash. /// /// Note that inputs of [`CHUNK_LEN`] or less don't produce any parent nodes and can't be hashed /// using this function. In that case you must get the `OutputReader` from /// [`Hasher::finalize_xof`]. /// /// # Example /// /// ``` /// use blake3::hazmat::{merge_subtrees_root_xof, HasherExt, Mode}; /// use blake3::{Hasher, CHUNK_LEN}; /// /// // Hash a 2-chunk subtree in steps. Note that only /// // the final chunk can be shorter than CHUNK_LEN. 
/// let chunk0 = &[42; CHUNK_LEN]; /// let chunk1 = b"hello world"; /// let chunk0_cv = Hasher::new() /// .update(chunk0) /// .finalize_non_root(); /// let chunk1_cv = Hasher::new() /// .set_input_offset(CHUNK_LEN as u64) /// .update(chunk1) /// .finalize_non_root(); /// /// // Obtain a blake3::OutputReader at the root and extract 1000 bytes. /// let mut output_reader = merge_subtrees_root_xof(&chunk0_cv, &chunk1_cv, Mode::Hash); /// let mut output_bytes = [0; 1_000]; /// output_reader.fill(&mut output_bytes); /// /// // Double check the answer. /// let mut hasher = Hasher::new(); /// hasher.update(chunk0); /// hasher.update(chunk1); /// let mut expected = [0; 1_000]; /// hasher.finalize_xof().fill(&mut expected); /// assert_eq!(output_bytes, expected); /// ``` pub fn merge_subtrees_root_xof( left_child: &ChainingValue, right_child: &ChainingValue, mode: Mode, ) -> crate::OutputReader { crate::OutputReader::new(merge_subtrees_inner(left_child, right_child, mode)) } /// An alias to distinguish [`hash_derive_key_context`] outputs from other keys. pub type ContextKey = [u8; KEY_LEN]; /// Hash a [`derive_key`](crate::derive_key) context string and return a [`ContextKey`]. /// /// The _only_ valid uses for the returned [`ContextKey`] are [`Hasher::new_from_context_key`] and /// [`Mode::DeriveKeyMaterial`] (together with the merge subtree functions). 
/// /// # Example /// /// ``` /// use blake3::Hasher; /// use blake3::hazmat::HasherExt; /// /// let context_key = blake3::hazmat::hash_derive_key_context("foo"); /// let mut hasher = Hasher::new_from_context_key(&context_key); /// hasher.update(b"bar"); /// let derived_key = *hasher.finalize().as_bytes(); /// /// assert_eq!(derived_key, blake3::derive_key("foo", b"bar")); /// ``` pub fn hash_derive_key_context(context: &str) -> ContextKey { crate::hash_all_at_once::( context.as_bytes(), IV, crate::DERIVE_KEY_CONTEXT, ) .root_hash() .0 } #[cfg(test)] mod test { use super::*; #[test] #[should_panic] fn test_empty_subtree_should_panic() { Hasher::new().finalize_non_root(); } #[test] #[should_panic] fn test_unaligned_offset_should_panic() { Hasher::new().set_input_offset(1); } #[test] #[should_panic] fn test_hasher_already_accepted_input_should_panic() { Hasher::new().update(b"x").set_input_offset(0); } #[test] #[should_panic] fn test_too_much_input_should_panic() { Hasher::new() .set_input_offset(CHUNK_LEN as u64) .update(&[0; CHUNK_LEN + 1]); } #[test] #[should_panic] fn test_set_input_offset_cant_finalize() { Hasher::new().set_input_offset(CHUNK_LEN as u64).finalize(); } #[test] #[should_panic] fn test_set_input_offset_cant_finalize_xof() { Hasher::new() .set_input_offset(CHUNK_LEN as u64) .finalize_xof(); } #[test] fn test_grouped_hash() { const MAX_CHUNKS: usize = (crate::test::TEST_CASES_MAX + 1) / CHUNK_LEN; let mut input_buf = [0; crate::test::TEST_CASES_MAX]; crate::test::paint_test_input(&mut input_buf); for subtree_chunks in [1, 2, 4, 8, 16, 32] { #[cfg(feature = "std")] dbg!(subtree_chunks); let subtree_len = subtree_chunks * CHUNK_LEN; for &case in crate::test::TEST_CASES { if case <= subtree_len { continue; } #[cfg(feature = "std")] dbg!(case); let input = &input_buf[..case]; let expected_hash = crate::hash(input); // Collect all the group chaining values. 
let mut chaining_values = arrayvec::ArrayVec::::new(); let mut subtree_offset = 0; while subtree_offset < input.len() { let take = core::cmp::min(subtree_len, input.len() - subtree_offset); let subtree_input = &input[subtree_offset..][..take]; let subtree_cv = Hasher::new() .set_input_offset(subtree_offset as u64) .update(subtree_input) .finalize_non_root(); chaining_values.push(subtree_cv); subtree_offset += take; } // Compress all the chaining_values together, layer by layer. assert!(chaining_values.len() >= 2); while chaining_values.len() > 2 { let n = chaining_values.len(); // Merge each side-by-side pair in place, overwriting the front half of the // array with the merged results. This moves us "up one level" in the tree. for i in 0..(n / 2) { chaining_values[i] = merge_subtrees_non_root( &chaining_values[2 * i], &chaining_values[2 * i + 1], Mode::Hash, ); } // If there's an odd CV out, it moves up. if n % 2 == 1 { chaining_values[n / 2] = chaining_values[n - 1]; } chaining_values.truncate(n / 2 + n % 2); } assert_eq!(chaining_values.len(), 2); let root_hash = merge_subtrees_root(&chaining_values[0], &chaining_values[1], Mode::Hash); assert_eq!(expected_hash, root_hash); } } } #[test] fn test_keyed_hash_xof() { let group0 = &[42; 4096]; let group1 = &[43; 4095]; let mut input = [0; 8191]; input[..4096].copy_from_slice(group0); input[4096..].copy_from_slice(group1); let key = &[44; 32]; let mut expected_output = [0; 100]; Hasher::new_keyed(&key) .update(&input) .finalize_xof() .fill(&mut expected_output); let mut hazmat_output = [0; 100]; let left = Hasher::new_keyed(key).update(group0).finalize_non_root(); let right = Hasher::new_keyed(key) .set_input_offset(group0.len() as u64) .update(group1) .finalize_non_root(); merge_subtrees_root_xof(&left, &right, Mode::KeyedHash(&key)).fill(&mut hazmat_output); assert_eq!(expected_output, hazmat_output); } #[test] fn test_derive_key() { let context = "foo"; let mut input = [0; 1025]; crate::test::paint_test_input(&mut 
input); let expected = crate::derive_key(context, &input); let cx_key = hash_derive_key_context(context); let left = Hasher::new_from_context_key(&cx_key) .update(&input[..1024]) .finalize_non_root(); let right = Hasher::new_from_context_key(&cx_key) .set_input_offset(1024) .update(&input[1024..]) .finalize_non_root(); let derived_key = merge_subtrees_root(&left, &right, Mode::DeriveKeyMaterial(&cx_key)).0; assert_eq!(expected, derived_key); } } blake3-1.8.1/src/io.rs000064400000000000000000000062541046102023000125470ustar 00000000000000//! Helper functions for efficient IO. #[cfg(feature = "std")] pub(crate) fn copy_wide( mut reader: impl std::io::Read, hasher: &mut crate::Hasher, ) -> std::io::Result { let mut buffer = [0; 65536]; let mut total = 0; loop { match reader.read(&mut buffer) { Ok(0) => return Ok(total), Ok(n) => { hasher.update(&buffer[..n]); total += n as u64; } // see test_update_reader_interrupted Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, Err(e) => return Err(e), } } } // Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or // if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it // fails, return the error. // // SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like // str::from_utf8 on them and then have them change out from under you. Letting a safe caller get // their hands on an mmap, or even a &[u8] that's backed by an mmap, is unsound. However, because // this function is crate-private, we can guarantee that all can ever happen in the event of a race // condition is that we either hash nonsense bytes or crash with SIGBUS or similar, neither of // which should risk memory corruption in a safe caller. // // PARANOIA: But a data race...is a data race...is a data race...right? 
Even if we know that no // platform in the "real world" is ever going to do anything other than compute the "wrong answer" // if we race on this mmap while we hash it, aren't we still supposed to feel bad about doing this? // Well, maybe. This is IO, and IO gets special carve-outs in the memory model. Consider a // memory-mapped register that returns random 32-bit words. (This is actually realistic if you have // a hardware RNG.) It's probably sound to construct a *const i32 pointing to that register and do // some raw pointer reads from it. Those reads should be volatile if you don't want the compiler to // coalesce them, but either way the compiler isn't allowed to just _go nuts_ and insert // should-never-happen branches to wipe your hard drive if two adjacent reads happen to give // different values. As far as I'm aware, there's no such thing as a read that's allowed if it's // volatile but prohibited if it's not (unlike atomics). As mentioned above, it's not ok to // construct a safe &i32 to the register if you're going to leak that reference to unknown callers. // But if you "know what you're doing," I don't think *const i32 and &i32 are fundamentally // different here. Feedback needed. #[cfg(feature = "mmap")] pub(crate) fn maybe_mmap_file(file: &std::fs::File) -> std::io::Result> { let metadata = file.metadata()?; let file_size = metadata.len(); if !metadata.is_file() { // Not a real file. Ok(None) } else if file_size < 16 * 1024 { // Mapping small files is not worth it, and some special files that can't be mapped report // a size of zero. Ok(None) } else { let map = unsafe { memmap2::Mmap::map(file)? }; Ok(Some(map)) } } blake3-1.8.1/src/join.rs000064400000000000000000000055561046102023000131030ustar 00000000000000//! The multi-threading abstractions used by `Hasher::update_with_join`. //! //! Different implementations of the `Join` trait determine whether //! `Hasher::update_with_join` performs multi-threading on sufficiently large //! inputs. 
The `SerialJoin` implementation is single-threaded, and the //! `RayonJoin` implementation (gated by the `rayon` feature) is multi-threaded. //! Interfaces other than `Hasher::update_with_join`, like [`hash`](crate::hash) //! and [`Hasher::update`](crate::Hasher::update), always use `SerialJoin` //! internally. //! //! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and //! `RayonJoin` is the only non-trivial implementation. Previously this trait //! was public, but currently it's been re-privatized, as it's both 1) of no //! value to most callers and 2) a pretty big implementation detail to commit //! to. //! //! [`rayon::join`]: https://docs.rs/rayon/1.3.0/rayon/fn.join.html /// The trait that abstracts over single-threaded and multi-threaded recursion. /// /// See the [`join` module docs](index.html) for more details. pub trait Join { fn join(oper_a: A, oper_b: B) -> (RA, RB) where A: FnOnce() -> RA + Send, B: FnOnce() -> RB + Send, RA: Send, RB: Send; } /// The trivial, serial implementation of `Join`. The left and right sides are /// executed one after the other, on the calling thread. The standalone hashing /// functions and the `Hasher::update` method use this implementation /// internally. /// /// See the [`join` module docs](index.html) for more details. pub enum SerialJoin {} impl Join for SerialJoin { #[inline] fn join(oper_a: A, oper_b: B) -> (RA, RB) where A: FnOnce() -> RA + Send, B: FnOnce() -> RB + Send, RA: Send, RB: Send, { (oper_a(), oper_b()) } } /// The Rayon-based implementation of `Join`. The left and right sides are /// executed on the Rayon thread pool, potentially in parallel. This /// implementation is gated by the `rayon` feature, which is off by default. /// /// See the [`join` module docs](index.html) for more details. 
#[cfg(feature = "rayon")] pub enum RayonJoin {} #[cfg(feature = "rayon")] impl Join for RayonJoin { #[inline] fn join(oper_a: A, oper_b: B) -> (RA, RB) where A: FnOnce() -> RA + Send, B: FnOnce() -> RB + Send, RA: Send, RB: Send, { rayon_core::join(oper_a, oper_b) } } #[cfg(test)] mod test { use super::*; #[test] fn test_serial_join() { let oper_a = || 1 + 1; let oper_b = || 2 + 2; assert_eq!((2, 4), SerialJoin::join(oper_a, oper_b)); } #[test] #[cfg(feature = "rayon")] fn test_rayon_join() { let oper_a = || 1 + 1; let oper_b = || 2 + 2; assert_eq!((2, 4), RayonJoin::join(oper_a, oper_b)); } } blake3-1.8.1/src/lib.rs000064400000000000000000002164631046102023000127130ustar 00000000000000//! The official Rust implementation of the [BLAKE3] cryptographic hash //! function. //! //! # Examples //! //! ``` //! # fn main() -> Result<(), Box> { //! // Hash an input all at once. //! let hash1 = blake3::hash(b"foobarbaz"); //! //! // Hash an input incrementally. //! let mut hasher = blake3::Hasher::new(); //! hasher.update(b"foo"); //! hasher.update(b"bar"); //! hasher.update(b"baz"); //! let hash2 = hasher.finalize(); //! assert_eq!(hash1, hash2); //! //! // Extended output. OutputReader also implements Read and Seek. //! # #[cfg(feature = "std")] { //! let mut output = [0; 1000]; //! let mut output_reader = hasher.finalize_xof(); //! output_reader.fill(&mut output); //! assert_eq!(hash1, output[..32]); //! # } //! //! // Print a hash as hex. //! println!("{}", hash1); //! # Ok(()) //! # } //! ``` //! //! # Cargo Features //! //! The `std` feature (the only feature enabled by default) is required for //! implementations of the [`Write`] and [`Seek`] traits, the //! [`update_reader`](Hasher::update_reader) helper method, and runtime CPU //! feature detection on x86. If this feature is disabled, the only way to use //! the x86 SIMD implementations is to enable the corresponding instruction sets //! globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. 
The resulting binary //! will not be portable to other machines. //! //! The `rayon` feature (disabled by default, but enabled for [docs.rs]) adds //! the [`update_rayon`](Hasher::update_rayon) and (in combination with `mmap` //! below) [`update_mmap_rayon`](Hasher::update_mmap_rayon) methods, for //! multithreaded hashing. However, even if this feature is enabled, all other //! APIs remain single-threaded. //! //! The `mmap` feature (disabled by default, but enabled for [docs.rs]) adds the //! [`update_mmap`](Hasher::update_mmap) and (in combination with `rayon` above) //! [`update_mmap_rayon`](Hasher::update_mmap_rayon) helper methods for //! memory-mapped IO. //! //! The `zeroize` feature (disabled by default, but enabled for [docs.rs]) //! implements //! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for //! this crate's types. //! //! The `serde` feature (disabled by default, but enabled for [docs.rs]) implements //! [`serde::Serialize`](https://docs.rs/serde/latest/serde/trait.Serialize.html) and //! [`serde::Deserialize`](https://docs.rs/serde/latest/serde/trait.Deserialize.html) //! for [`Hash`](struct@Hash). //! //! The NEON implementation is enabled by default for AArch64 but requires the //! `neon` feature for other ARM targets. Not all ARMv7 CPUs support NEON, and //! enabling this feature will produce a binary that's not portable to CPUs //! without NEON support. //! //! The `wasm32_simd` feature enables the WASM SIMD implementation for all `wasm32-` //! targets. Similar to the `neon` feature, if `wasm32_simd` is enabled, WASM SIMD //! support is assumed. This may become the default in the future. //! //! The `traits-preview` feature enables implementations of traits from the //! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`. //! However, the traits aren't stable, and they're expected to change in //! incompatible ways before that crate reaches 1.0. For that reason, this crate //! 
makes no SemVer guarantees for this feature, and callers who use it should //! expect breaking changes between patch versions. (The "-preview" feature name //! follows the conventions of the RustCrypto [`signature`] crate.) //! //! [`Hasher::update_rayon`]: struct.Hasher.html#method.update_rayon //! [BLAKE3]: https://blake3.io //! [Rayon]: https://github.com/rayon-rs/rayon //! [docs.rs]: https://docs.rs/ //! [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html //! [`Seek`]: https://doc.rust-lang.org/std/io/trait.Seek.html //! [`digest`]: https://crates.io/crates/digest //! [`signature`]: https://crates.io/crates/signature #![cfg_attr(not(feature = "std"), no_std)] #[cfg(test)] mod test; #[doc(hidden)] #[deprecated(since = "1.8.0", note = "use the hazmat module instead")] pub mod guts; pub mod hazmat; /// Undocumented and unstable, for benchmarks only. #[doc(hidden)] pub mod platform; // Platform-specific implementations of the compression function. These // BLAKE3-specific cfg flags are set in build.rs. #[cfg(blake3_avx2_rust)] #[path = "rust_avx2.rs"] mod avx2; #[cfg(blake3_avx2_ffi)] #[path = "ffi_avx2.rs"] mod avx2; #[cfg(blake3_avx512_ffi)] #[path = "ffi_avx512.rs"] mod avx512; #[cfg(blake3_neon)] #[path = "ffi_neon.rs"] mod neon; mod portable; #[cfg(blake3_sse2_rust)] #[path = "rust_sse2.rs"] mod sse2; #[cfg(blake3_sse2_ffi)] #[path = "ffi_sse2.rs"] mod sse2; #[cfg(blake3_sse41_rust)] #[path = "rust_sse41.rs"] mod sse41; #[cfg(blake3_sse41_ffi)] #[path = "ffi_sse41.rs"] mod sse41; #[cfg(blake3_wasm32_simd)] #[path = "wasm32_simd.rs"] mod wasm32_simd; #[cfg(feature = "traits-preview")] pub mod traits; mod io; mod join; use arrayref::{array_mut_ref, array_ref}; use arrayvec::{ArrayString, ArrayVec}; use core::cmp; use core::fmt; use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2}; #[cfg(feature = "zeroize")] use zeroize::Zeroize; /// The number of bytes in a [`Hash`](struct.Hash.html), 32. 
pub const OUT_LEN: usize = 32;

/// The number of bytes in a key, 32.
pub const KEY_LEN: usize = 32;

/// The number of bytes in a block, 64.
///
/// You don't usually need to think about this number. One case where it matters is calling
/// [`OutputReader::fill`] in a loop, where using a `buf` argument that's a multiple of `BLOCK_LEN`
/// avoids repeating work.
pub const BLOCK_LEN: usize = 64;

/// The number of bytes in a chunk, 1024.
///
/// You don't usually need to think about this number, but it often comes up in benchmarks, because
/// the maximum degree of parallelism used by the implementation equals the number of chunks.
pub const CHUNK_LEN: usize = 1024;

// The deepest possible tree: 2^54 chunks of 2^10 bytes each covers the entire
// 2^64-byte input space.
const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64

// While iterating the compression function within a chunk, the CV is
// represented as words, to avoid doing two extra endianness conversions for
// each compression in the portable implementation. But the hash_many interface
// needs to hash both input bytes and parent nodes, so its better for its
// output CVs to be represented as bytes.
type CVWords = [u32; 8];
type CVBytes = [u8; 32]; // little-endian

// The BLAKE3 initialization vector (the same eight words as the SHA-256 IV).
const IV: &CVWords = &[
    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
];

// The message word permutation for each of the 7 rounds, precomputed.
const MSG_SCHEDULE: [[usize; 16]; 7] = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8],
    [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1],
    [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6],
    [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4],
    [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7],
    [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13],
];

// These are the internal flags that we use to domain separate root/non-root,
// chunk/parent, and chunk beginning/middle/end. These get set at the high end
// of the block flags word in the compression function, so their values start
// high and go down.
const CHUNK_START: u8 = 1 << 0; const CHUNK_END: u8 = 1 << 1; const PARENT: u8 = 1 << 2; const ROOT: u8 = 1 << 3; const KEYED_HASH: u8 = 1 << 4; const DERIVE_KEY_CONTEXT: u8 = 1 << 5; const DERIVE_KEY_MATERIAL: u8 = 1 << 6; #[inline] fn counter_low(counter: u64) -> u32 { counter as u32 } #[inline] fn counter_high(counter: u64) -> u32 { (counter >> 32) as u32 } /// An output of the default size, 32 bytes, which provides constant-time /// equality checking. /// /// `Hash` implements [`From`] and [`Into`] for `[u8; 32]`, and it provides /// [`from_bytes`] and [`as_bytes`] for explicit conversions between itself and /// `[u8; 32]`. However, byte arrays and slices don't provide constant-time /// equality checking, which is often a security requirement in software that /// handles private data. `Hash` doesn't implement [`Deref`] or [`AsRef`], to /// avoid situations where a type conversion happens implicitly and the /// constant-time property is accidentally lost. /// /// `Hash` provides the [`to_hex`] and [`from_hex`] methods for converting to /// and from hexadecimal. It also implements [`Display`] and [`FromStr`]. /// /// [`From`]: https://doc.rust-lang.org/std/convert/trait.From.html /// [`Into`]: https://doc.rust-lang.org/std/convert/trait.Into.html /// [`as_bytes`]: #method.as_bytes /// [`from_bytes`]: #method.from_bytes /// [`Deref`]: https://doc.rust-lang.org/stable/std/ops/trait.Deref.html /// [`AsRef`]: https://doc.rust-lang.org/std/convert/trait.AsRef.html /// [`to_hex`]: #method.to_hex /// [`from_hex`]: #method.from_hex /// [`Display`]: https://doc.rust-lang.org/std/fmt/trait.Display.html /// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] #[derive(Clone, Copy, Hash, Eq)] pub struct Hash([u8; OUT_LEN]); impl Hash { /// The raw bytes of the `Hash`. 
Note that byte arrays don't provide /// constant-time equality checking, so if you need to compare hashes, /// prefer the `Hash` type. #[inline] pub const fn as_bytes(&self) -> &[u8; OUT_LEN] { &self.0 } /// Create a `Hash` from its raw bytes representation. pub const fn from_bytes(bytes: [u8; OUT_LEN]) -> Self { Self(bytes) } /// Create a `Hash` from its raw bytes representation as a slice. /// /// Returns an error if the slice is not exactly 32 bytes long. pub fn from_slice(bytes: &[u8]) -> Result { Ok(Self::from_bytes(bytes.try_into()?)) } /// Encode a `Hash` in lowercase hexadecimal. /// /// The returned [`ArrayString`] is a fixed size and doesn't allocate memory /// on the heap. Note that [`ArrayString`] doesn't provide constant-time /// equality checking, so if you need to compare hashes, prefer the `Hash` /// type. /// /// [`ArrayString`]: https://docs.rs/arrayvec/0.5.1/arrayvec/struct.ArrayString.html pub fn to_hex(&self) -> ArrayString<{ 2 * OUT_LEN }> { let mut s = ArrayString::new(); let table = b"0123456789abcdef"; for &b in self.0.iter() { s.push(table[(b >> 4) as usize] as char); s.push(table[(b & 0xf) as usize] as char); } s } /// Decode a `Hash` from hexadecimal. Both uppercase and lowercase ASCII /// bytes are supported. /// /// Any byte outside the ranges `'0'...'9'`, `'a'...'f'`, and `'A'...'F'` /// results in an error. An input length other than 64 also results in an /// error. /// /// Note that `Hash` also implements `FromStr`, so `Hash::from_hex("...")` /// is equivalent to `"...".parse()`. 
pub fn from_hex(hex: impl AsRef<[u8]>) -> Result { fn hex_val(byte: u8) -> Result { match byte { b'A'..=b'F' => Ok(byte - b'A' + 10), b'a'..=b'f' => Ok(byte - b'a' + 10), b'0'..=b'9' => Ok(byte - b'0'), _ => Err(HexError(HexErrorInner::InvalidByte(byte))), } } let hex_bytes: &[u8] = hex.as_ref(); if hex_bytes.len() != OUT_LEN * 2 { return Err(HexError(HexErrorInner::InvalidLen(hex_bytes.len()))); } let mut hash_bytes: [u8; OUT_LEN] = [0; OUT_LEN]; for i in 0..OUT_LEN { hash_bytes[i] = 16 * hex_val(hex_bytes[2 * i])? + hex_val(hex_bytes[2 * i + 1])?; } Ok(Hash::from(hash_bytes)) } } impl From<[u8; OUT_LEN]> for Hash { #[inline] fn from(bytes: [u8; OUT_LEN]) -> Self { Self::from_bytes(bytes) } } impl From for [u8; OUT_LEN] { #[inline] fn from(hash: Hash) -> Self { hash.0 } } impl core::str::FromStr for Hash { type Err = HexError; fn from_str(s: &str) -> Result { Hash::from_hex(s) } } #[cfg(feature = "zeroize")] impl Zeroize for Hash { fn zeroize(&mut self) { // Destructuring to trigger compile error as a reminder to update this impl. let Self(bytes) = self; bytes.zeroize(); } } /// This implementation is constant-time. impl PartialEq for Hash { #[inline] fn eq(&self, other: &Hash) -> bool { constant_time_eq::constant_time_eq_32(&self.0, &other.0) } } /// This implementation is constant-time. impl PartialEq<[u8; OUT_LEN]> for Hash { #[inline] fn eq(&self, other: &[u8; OUT_LEN]) -> bool { constant_time_eq::constant_time_eq_32(&self.0, other) } } /// This implementation is constant-time if the target is 32 bytes long. impl PartialEq<[u8]> for Hash { #[inline] fn eq(&self, other: &[u8]) -> bool { constant_time_eq::constant_time_eq(&self.0, other) } } impl fmt::Display for Hash { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Formatting field as `&str` to reduce code size since the `Debug` // dynamic dispatch table for `&str` is likely needed elsewhere already, // but that for `ArrayString<[u8; 64]>` is not. 
let hex = self.to_hex(); let hex: &str = hex.as_str(); f.write_str(hex) } } impl fmt::Debug for Hash { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Formatting field as `&str` to reduce code size since the `Debug` // dynamic dispatch table for `&str` is likely needed elsewhere already, // but that for `ArrayString<[u8; 64]>` is not. let hex = self.to_hex(); let hex: &str = hex.as_str(); f.debug_tuple("Hash").field(&hex).finish() } } /// The error type for [`Hash::from_hex`]. /// /// The `.to_string()` representation of this error currently distinguishes between bad length /// errors and bad character errors. This is to help with logging and debugging, but it isn't a /// stable API detail, and it may change at any time. #[derive(Clone, Debug)] pub struct HexError(HexErrorInner); #[derive(Clone, Debug)] enum HexErrorInner { InvalidByte(u8), InvalidLen(usize), } impl fmt::Display for HexError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self.0 { HexErrorInner::InvalidByte(byte) => { if byte < 128 { write!(f, "invalid hex character: {:?}", byte as char) } else { write!(f, "invalid hex character: 0x{:x}", byte) } } HexErrorInner::InvalidLen(len) => { write!(f, "expected 64 hex bytes, received {}", len) } } } } #[cfg(feature = "std")] impl std::error::Error for HexError {} // Each chunk or parent node can produce either a 32-byte chaining value or, by // setting the ROOT flag, any number of final output bytes. The Output struct // captures the state just prior to choosing between those two possibilities. 
// Captures the state just before the root/non-root decision (see the comment
// above): one input chaining value plus one unfinished block.
#[derive(Clone)]
struct Output {
    input_chaining_value: CVWords,
    block: [u8; 64],
    block_len: u8,
    counter: u64,
    flags: u8,
    platform: Platform,
}

impl Output {
    // Compress one more time *without* the ROOT flag, producing the 32-byte
    // chaining value of a non-root node.
    fn chaining_value(&self) -> CVBytes {
        let mut cv = self.input_chaining_value;
        self.platform.compress_in_place(
            &mut cv,
            &self.block,
            self.block_len,
            self.counter,
            self.flags,
        );
        platform::le_bytes_from_words_32(&cv)
    }

    // Compress with the ROOT flag set, producing the 32-byte root hash. Only
    // valid when counter is 0.
    fn root_hash(&self) -> Hash {
        debug_assert_eq!(self.counter, 0);
        let mut cv = self.input_chaining_value;
        self.platform
            .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT);
        Hash(platform::le_bytes_from_words_32(&cv))
    }

    // Produce one 64-byte block of extended output, with self.counter acting
    // as the output block index.
    fn root_output_block(&self) -> [u8; 2 * OUT_LEN] {
        self.platform.compress_xof(
            &self.input_chaining_value,
            &self.block,
            self.block_len,
            self.counter,
            self.flags | ROOT,
        )
    }
}

#[cfg(feature = "zeroize")]
impl Zeroize for Output {
    fn zeroize(&mut self) {
        // Destructuring to trigger compile error as a reminder to update this impl.
        let Self {
            input_chaining_value,
            block,
            block_len,
            counter,
            flags,
            platform: _,
        } = self;
        input_chaining_value.zeroize();
        block.zeroize();
        block_len.zeroize();
        counter.zeroize();
        flags.zeroize();
    }
}

// Incremental state for hashing one 1024-byte chunk: the current CV plus at
// most one buffered block.
#[derive(Clone)]
struct ChunkState {
    cv: CVWords,
    chunk_counter: u64,
    buf: [u8; BLOCK_LEN],
    buf_len: u8,
    blocks_compressed: u8,
    flags: u8,
    platform: Platform,
}

impl ChunkState {
    fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self {
        Self {
            cv: *key,
            chunk_counter,
            buf: [0; BLOCK_LEN],
            buf_len: 0,
            blocks_compressed: 0,
            flags,
            platform,
        }
    }

    // Total bytes absorbed so far in this chunk (compressed plus buffered).
    fn count(&self) -> usize {
        BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize
    }

    // Move bytes from the front of `input` into the block buffer, advancing
    // the caller's slice past the bytes taken.
    fn fill_buf(&mut self, input: &mut &[u8]) {
        let want = BLOCK_LEN - self.buf_len as usize;
        let take = cmp::min(want, input.len());
        self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]);
        self.buf_len += take as u8;
        *input = &input[take..];
    }

    // CHUNK_START applies only to a chunk's first block.
    fn start_flag(&self) -> u8 {
        if self.blocks_compressed == 0 {
            CHUNK_START
        } else {
            0
        }
    }

    // Try to avoid buffering as much as possible, by compressing directly from
    // the input slice when full blocks are available.
    fn update(&mut self, mut input: &[u8]) -> &mut Self {
        if self.buf_len > 0 {
            self.fill_buf(&mut input);
            if !input.is_empty() {
                debug_assert_eq!(self.buf_len as usize, BLOCK_LEN);
                let block_flags = self.flags | self.start_flag(); // borrowck
                self.platform.compress_in_place(
                    &mut self.cv,
                    &self.buf,
                    BLOCK_LEN as u8,
                    self.chunk_counter,
                    block_flags,
                );
                self.buf_len = 0;
                self.buf = [0; BLOCK_LEN];
                self.blocks_compressed += 1;
            }
        }
        // Strict `>`: the final block (even a full one) stays buffered, since
        // it may need the CHUNK_END (and possibly ROOT) flags in output().
        while input.len() > BLOCK_LEN {
            debug_assert_eq!(self.buf_len, 0);
            let block_flags = self.flags | self.start_flag(); // borrowck
            self.platform.compress_in_place(
                &mut self.cv,
                array_ref!(input, 0, BLOCK_LEN),
                BLOCK_LEN as u8,
                self.chunk_counter,
                block_flags,
            );
            self.blocks_compressed += 1;
            input = &input[BLOCK_LEN..];
        }
        self.fill_buf(&mut input);
        debug_assert!(input.is_empty());
        debug_assert!(self.count() <= CHUNK_LEN);
        self
    }

    fn output(&self) -> Output {
        let block_flags = self.flags | self.start_flag() | CHUNK_END;
        Output {
            input_chaining_value: self.cv,
            block: self.buf,
            block_len: self.buf_len,
            counter: self.chunk_counter,
            flags: block_flags,
            platform: self.platform,
        }
    }
}

// Don't derive(Debug), because the state may be secret.
impl fmt::Debug for ChunkState {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("ChunkState")
            .field("count", &self.count())
            .field("chunk_counter", &self.chunk_counter)
            .field("flags", &self.flags)
            .field("platform", &self.platform)
            .finish()
    }
}

#[cfg(feature = "zeroize")]
impl Zeroize for ChunkState {
    fn zeroize(&mut self) {
        // Destructuring to trigger compile error as a reminder to update this impl.
        let Self {
            cv,
            chunk_counter,
            buf,
            buf_len,
            blocks_compressed,
            flags,
            platform: _,
        } = self;
        cv.zeroize();
        chunk_counter.zeroize();
        buf.zeroize();
        buf_len.zeroize();
        blocks_compressed.zeroize();
        flags.zeroize();
    }
}

// IMPLEMENTATION NOTE
// ===================
// The recursive function compress_subtree_wide(), implemented below, is the
// basis of high-performance BLAKE3. We use it both for all-at-once hashing,
// and for the incremental input with Hasher (though we have to be careful with
// subtree boundaries in the incremental case). compress_subtree_wide() applies
// several optimizations at the same time:
// - Multithreading with Rayon.
// - Parallel chunk hashing with SIMD.
// - Parallel parent hashing with SIMD. Note that while SIMD chunk hashing
//   maxes out at MAX_SIMD_DEGREE*CHUNK_LEN, parallel parent hashing continues
//   to benefit from larger inputs, because more levels of the tree benefit can
//   use full-width SIMD vectors for parent hashing. Without parallel parent
//   hashing, we lose about 10% of overall throughput on AVX2 and AVX-512.

/// Undocumented and unstable, for benchmarks only.
#[doc(hidden)]
#[derive(Clone, Copy)]
pub enum IncrementCounter {
    Yes,
    No,
}

impl IncrementCounter {
    #[inline]
    fn yes(&self) -> bool {
        match self {
            IncrementCounter::Yes => true,
            IncrementCounter::No => false,
        }
    }
}

// The largest power of two less than or equal to `n`, used in Hasher::update(). This is similar to
// left_subtree_len(n), but note that left_subtree_len(n) is strictly less than `n`.
fn largest_power_of_two_leq(n: usize) -> usize {
    ((n / 2) + 1).next_power_of_two()
}

// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
// on a single thread. Write out the chunk chaining values and return the
// number of chunks hashed. These chunks are never the root and never empty;
// those cases use a different codepath.
fn compress_chunks_parallel(
    input: &[u8],
    key: &CVWords,
    chunk_counter: u64,
    flags: u8,
    platform: Platform,
    out: &mut [u8],
) -> usize {
    debug_assert!(!input.is_empty(), "empty chunks below the root");
    debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN);

    // Gather references to every complete chunk, then hash them all in one
    // hash_many call so the backend can use full SIMD width.
    let mut chunks_exact = input.chunks_exact(CHUNK_LEN);
    let mut chunks_array = ArrayVec::<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE>::new();
    for chunk in &mut chunks_exact {
        chunks_array.push(array_ref!(chunk, 0, CHUNK_LEN));
    }
    platform.hash_many(
        &chunks_array,
        key,
        chunk_counter,
        IncrementCounter::Yes,
        flags,
        CHUNK_START,
        CHUNK_END,
        out,
    );

    // Hash the remaining partial chunk, if there is one. Note that the empty
    // chunk (meaning the empty message) is a different codepath.
    let chunks_so_far = chunks_array.len();
    if !chunks_exact.remainder().is_empty() {
        let counter = chunk_counter + chunks_so_far as u64;
        let mut chunk_state = ChunkState::new(key, counter, flags, platform);
        chunk_state.update(chunks_exact.remainder());
        *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) =
            chunk_state.output().chaining_value();
        chunks_so_far + 1
    } else {
        chunks_so_far
    }
}
fn compress_parents_parallel( child_chaining_values: &[u8], key: &CVWords, flags: u8, platform: Platform, out: &mut [u8], ) -> usize { debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes"); let num_children = child_chaining_values.len() / OUT_LEN; debug_assert!(num_children >= 2, "not enough children"); debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many"); let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN); // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of // the requirements of compress_subtree_wide(). let mut parents_array = ArrayVec::<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE_OR_2>::new(); for parent in &mut parents_exact { parents_array.push(array_ref!(parent, 0, BLOCK_LEN)); } platform.hash_many( &parents_array, key, 0, // Parents always use counter 0. IncrementCounter::No, flags | PARENT, 0, // Parents have no start flags. 0, // Parents have no end flags. out, ); // If there's an odd child left over, it becomes an output. let parents_so_far = parents_array.len(); if !parents_exact.remainder().is_empty() { out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder()); parents_so_far + 1 } else { parents_so_far } } // The wide helper function returns (writes out) an array of chaining values // and returns the length of that array. The number of chaining values returned // is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, // if the input is shorter than that many chunks. The reason for maintaining a // wide array of chaining values going back up the tree, is to allow the // implementation to hash as many parents in parallel as possible. // // As a special case when the SIMD degree is 1, this function will still return // at least 2 outputs. This guarantees that this function doesn't perform the // root compression. (If it did, it would use the wrong flags, and also we // wouldn't be able to implement extendable output.) 
Note that this function is // not used when the whole input is only 1 chunk long; that's a different // codepath. // // Why not just have the caller split the input on the first update(), instead // of implementing this special rule? Because we don't want to limit SIMD or // multithreading parallelism for that update(). fn compress_subtree_wide( input: &[u8], key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform, out: &mut [u8], ) -> usize { // Note that the single chunk case does *not* bump the SIMD degree up to 2 // when it is 1. This allows Rayon the option of multithreading even the // 2-chunk case, which can help performance on smaller platforms. if input.len() <= platform.simd_degree() * CHUNK_LEN { return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out); } // With more than simd_degree chunks, we need to recurse. Start by dividing // the input into left and right subtrees. (Note that this is only optimal // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree // of 3 or something, we'll need a more complicated strategy.) debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2"); let (left, right) = input.split_at(hazmat::left_subtree_len(input.len() as u64) as usize); let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64; // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to // account for the special case of returning 2 outputs when the SIMD degree // is 1. let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; let degree = if left.len() == CHUNK_LEN { // The "simd_degree=1 and we're at the leaf nodes" case. debug_assert_eq!(platform.simd_degree(), 1); 1 } else { cmp::max(platform.simd_degree(), 2) }; let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN); // Recurse! For update_rayon(), this is where we take advantage of RayonJoin and use multiple // threads. 
let (left_n, right_n) = J::join( || compress_subtree_wide::(left, key, chunk_counter, flags, platform, left_out), || compress_subtree_wide::(right, key, right_chunk_counter, flags, platform, right_out), ); // The special case again. If simd_degree=1, then we'll have left_n=1 and // right_n=1. Rather than compressing them into a single output, return // them directly, to make sure we always have at least two outputs. debug_assert_eq!(left_n, degree); debug_assert!(right_n >= 1 && right_n <= left_n); if left_n == 1 { out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]); return 2; } // Otherwise, do one layer of parent node compression. let num_children = left_n + right_n; compress_parents_parallel( &cv_array[..num_children * OUT_LEN], key, flags, platform, out, ) } // Hash a subtree with compress_subtree_wide(), and then condense the resulting // list of chaining values down to a single parent node. Don't compress that // last parent node, however. Instead, return its message bytes (the // concatenated chaining values of its children). This is necessary when the // first call to update() supplies a complete subtree, because the topmost // parent node of that subtree could end up being the root. It's also necessary // for extended output in the general case. // // As with compress_subtree_wide(), this function is not used on inputs of 1 // chunk or less. That's a different codepath. fn compress_subtree_to_parent_node( input: &[u8], key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform, ) -> [u8; BLOCK_LEN] { debug_assert!(input.len() > CHUNK_LEN); let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; let mut num_cvs = compress_subtree_wide::(input, &key, chunk_counter, flags, platform, &mut cv_array); debug_assert!(num_cvs >= 2); // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, // compress_subtree_wide() returns more than 2 chaining values. Condense // them into 2 by forming parent nodes repeatedly. 
let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2]; while num_cvs > 2 { let cv_slice = &cv_array[..num_cvs * OUT_LEN]; num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array); cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]); } *array_ref!(cv_array, 0, 2 * OUT_LEN) } // Hash a complete input all at once. Unlike compress_subtree_wide() and // compress_subtree_to_parent_node(), this function handles the 1 chunk case. fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output { let platform = Platform::detect(); // If the whole subtree is one chunk, hash it directly with a ChunkState. if input.len() <= CHUNK_LEN { return ChunkState::new(key, 0, flags, platform) .update(input) .output(); } // Otherwise construct an Output object from the parent node returned by // compress_subtree_to_parent_node(). Output { input_chaining_value: *key, block: compress_subtree_to_parent_node::(input, key, 0, flags, platform), block_len: BLOCK_LEN as u8, counter: 0, flags: flags | PARENT, platform, } } /// The default hash function. /// /// For an incremental version that accepts multiple writes, see [`Hasher::new`], /// [`Hasher::update`], and [`Hasher::finalize`]. These two lines are equivalent: /// /// ``` /// let hash = blake3::hash(b"foo"); /// # let hash1 = hash; /// /// let hash = blake3::Hasher::new().update(b"foo").finalize(); /// # let hash2 = hash; /// # assert_eq!(hash1, hash2); /// ``` /// /// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`] and /// [`OutputReader`]. /// /// This function is always single-threaded. For multithreading support, see /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon). pub fn hash(input: &[u8]) -> Hash { hash_all_at_once::(input, IV, 0).root_hash() } /// The keyed hash function. /// /// This is suitable for use as a message authentication code, for example to /// replace an HMAC instance. 
In that use case, the constant-time equality /// checking provided by [`Hash`](struct.Hash.html) is almost always a security /// requirement, and callers need to be careful not to compare MACs as raw /// bytes. /// /// For an incremental version that accepts multiple writes, see [`Hasher::new_keyed`], /// [`Hasher::update`], and [`Hasher::finalize`]. These two lines are equivalent: /// /// ``` /// # const KEY: &[u8; 32] = &[0; 32]; /// let mac = blake3::keyed_hash(KEY, b"foo"); /// # let mac1 = mac; /// /// let mac = blake3::Hasher::new_keyed(KEY).update(b"foo").finalize(); /// # let mac2 = mac; /// # assert_eq!(mac1, mac2); /// ``` /// /// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`], and [`OutputReader`]. /// /// This function is always single-threaded. For multithreading support, see /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon). pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { let key_words = platform::words_from_le_bytes_32(key); hash_all_at_once::(input, &key_words, KEYED_HASH).root_hash() } /// The key derivation function. /// /// Given cryptographic key material of any length and a context string of any /// length, this function outputs a 32-byte derived subkey. **The context string /// should be hardcoded, globally unique, and application-specific.** A good /// default format for such strings is `"[application] [commit timestamp] /// [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`. /// /// Key derivation is important when you want to use the same key in multiple /// algorithms or use cases. Using the same key with different cryptographic /// algorithms is generally forbidden, and deriving a separate subkey for each /// use case protects you from bad interactions. Derived keys also mitigate the /// damage from one part of your application accidentally leaking its key. 
/// /// As a rare exception to that general rule, however, it is possible to use /// `derive_key` itself with key material that you are already using with /// another algorithm. You might need to do this if you're adding features to /// an existing application, which does not yet use key derivation internally. /// However, you still must not share key material with algorithms that forbid /// key reuse entirely, like a one-time pad. For more on this, see sections 6.2 /// and 7.8 of the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). /// /// Note that BLAKE3 is not a password hash, and **`derive_key` should never be /// used with passwords.** Instead, use a dedicated password hash like /// [Argon2]. Password hashes are entirely different from generic hash /// functions, with opposite design requirements. /// /// For an incremental version that accepts multiple writes, see [`Hasher::new_derive_key`], /// [`Hasher::update`], and [`Hasher::finalize`]. These two statements are equivalent: /// /// ``` /// # const CONTEXT: &str = "example.com 2019-12-25 16:18:03 session tokens v1"; /// let key = blake3::derive_key(CONTEXT, b"key material, not a password"); /// # let key1 = key; /// /// let key: [u8; 32] = blake3::Hasher::new_derive_key(CONTEXT) /// .update(b"key material, not a password") /// .finalize() /// .into(); /// # let key2 = key; /// # assert_eq!(key1, key2); /// ``` /// /// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`], and [`OutputReader`]. /// /// This function is always single-threaded. For multithreading support, see /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon). 
///
/// [Argon2]: https://en.wikipedia.org/wiki/Argon2
pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
    // First hash the context string into a context key (DERIVE_KEY_CONTEXT is
    // applied inside hazmat::hash_derive_key_context), then use that key to
    // hash the key material in DERIVE_KEY_MATERIAL mode. The stripped
    // `::<join::SerialJoin>` turbofish is restored; this is single-threaded.
    let context_key = hazmat::hash_derive_key_context(context);
    let context_key_words = platform::words_from_le_bytes_32(&context_key);
    hash_all_at_once::<join::SerialJoin>(key_material, &context_key_words, DERIVE_KEY_MATERIAL)
        .root_hash()
        .0
}

// Form the 64-byte parent block from two child CVs and wrap it in an Output
// with the PARENT flag set. Parents always use counter 0.
fn parent_node_output(
    left_child: &CVBytes,
    right_child: &CVBytes,
    key: &CVWords,
    flags: u8,
    platform: Platform,
) -> Output {
    let mut block = [0; BLOCK_LEN];
    block[..32].copy_from_slice(left_child);
    block[32..].copy_from_slice(right_child);
    Output {
        input_chaining_value: *key,
        block,
        block_len: BLOCK_LEN as u8,
        counter: 0,
        flags: flags | PARENT,
        platform,
    }
}

/// An incremental hash state that can accept any number of writes.
///
/// The `rayon` and `mmap` Cargo features enable additional methods on this
/// type related to multithreading and memory-mapped IO.
///
/// When the `traits-preview` Cargo feature is enabled, this type implements
/// several commonly used traits from the
/// [`digest`](https://crates.io/crates/digest) crate. However, those
/// traits aren't stable, and they're expected to change in incompatible ways
/// before that crate reaches 1.0. For that reason, this crate makes no SemVer
/// guarantees for this feature, and callers who use it should expect breaking
/// changes between patch versions.
///
/// # Examples
///
/// ```
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Hash an input incrementally.
/// let mut hasher = blake3::Hasher::new();
/// hasher.update(b"foo");
/// hasher.update(b"bar");
/// hasher.update(b"baz");
/// assert_eq!(hasher.finalize(), blake3::hash(b"foobarbaz"));
///
/// // Extended output. OutputReader also implements Read and Seek.
/// # #[cfg(feature = "std")] { /// let mut output = [0; 1000]; /// let mut output_reader = hasher.finalize_xof(); /// output_reader.fill(&mut output); /// assert_eq!(&output[..32], blake3::hash(b"foobarbaz").as_bytes()); /// # } /// # Ok(()) /// # } /// ``` #[derive(Clone)] pub struct Hasher { key: CVWords, chunk_state: ChunkState, initial_chunk_counter: u64, // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk // requires a 4th entry, rather than merging everything down to 1, because // we don't know whether more input is coming. This is different from how // the reference implementation does things. cv_stack: ArrayVec, } impl Hasher { fn new_internal(key: &CVWords, flags: u8) -> Self { Self { key: *key, chunk_state: ChunkState::new(key, 0, flags, Platform::detect()), initial_chunk_counter: 0, cv_stack: ArrayVec::new(), } } /// Construct a new `Hasher` for the regular hash function. pub fn new() -> Self { Self::new_internal(IV, 0) } /// Construct a new `Hasher` for the keyed hash function. See /// [`keyed_hash`]. /// /// [`keyed_hash`]: fn.keyed_hash.html pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { let key_words = platform::words_from_le_bytes_32(key); Self::new_internal(&key_words, KEYED_HASH) } /// Construct a new `Hasher` for the key derivation function. See /// [`derive_key`]. The context string should be hardcoded, globally /// unique, and application-specific. /// /// [`derive_key`]: fn.derive_key.html pub fn new_derive_key(context: &str) -> Self { let context_key = hazmat::hash_derive_key_context(context); let context_key_words = platform::words_from_le_bytes_32(&context_key); Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL) } /// Reset the `Hasher` to its initial state. /// /// This is functionally the same as overwriting the `Hasher` with a new /// one, using the same key or context string if any. 
    pub fn reset(&mut self) -> &mut Self {
        // Keep the key, flags, and detected platform; discard all input state.
        self.chunk_state = ChunkState::new(
            &self.key,
            0,
            self.chunk_state.flags,
            self.chunk_state.platform,
        );
        self.cv_stack.clear();
        self
    }

    // As described in push_cv() below, we do "lazy merging", delaying merges
    // until right before the next CV is about to be added. This is different
    // from the reference implementation. Another difference is that we aren't
    // always merging 1 chunk at a time. Instead, each CV might represent any
    // power-of-two number of chunks, as long as the smaller-above-larger stack
    // order is maintained. Instead of the "count the trailing 0-bits"
    // algorithm described in the spec, we use a "count the total number of
    // 1-bits" variant that doesn't require us to retain the subtree size of
    // the CV on top of the stack. The principle is the same: each CV that
    // should remain in the stack is represented by a 1-bit in the total number
    // of chunks (or bytes) so far.
    fn merge_cv_stack(&mut self, chunk_counter: u64) {
        // Account for non-zero cases of Hasher::set_input_offset, where there are no prior
        // subtrees in the stack. Note that initial_chunk_counter is always 0 for callers who don't
        // use the hazmat module.
        let post_merge_stack_len =
            (chunk_counter - self.initial_chunk_counter).count_ones() as usize;
        // Pop pairs and replace them with their parent CV until the stack
        // depth matches the popcount of the chunk total.
        while self.cv_stack.len() > post_merge_stack_len {
            let right_child = self.cv_stack.pop().unwrap();
            let left_child = self.cv_stack.pop().unwrap();
            let parent_output = parent_node_output(
                &left_child,
                &right_child,
                &self.key,
                self.chunk_state.flags,
                self.chunk_state.platform,
            );
            self.cv_stack.push(parent_output.chaining_value());
        }
    }

    // In reference_impl.rs, we merge the new CV with existing CVs from the
    // stack before pushing it. We can do that because we know more input is
    // coming, so we know none of the merges are root.
    //
    // This setting is different. We want to feed as much input as possible to
    // compress_subtree_wide(), without setting aside anything for the
    // chunk_state. If the user gives us 64 KiB, we want to parallelize over
    // all 64 KiB at once as a single subtree, if at all possible.
    //
    // This leads to two problems:
    // 1) This 64 KiB input might be the only call that ever gets made to
    //    update. In this case, the root node of the 64 KiB subtree would be
    //    the root node of the whole tree, and it would need to be ROOT
    //    finalized. We can't compress it until we know.
    // 2) This 64 KiB input might complete a larger tree, whose root node is
    //    similarly going to be the root of the whole tree. For example,
    //    maybe we have 196 KiB (that is, 128 + 64) hashed so far. We can't
    //    compress the node at the root of the 256 KiB subtree until we know
    //    how to finalize it.
    //
    // The second problem is solved with "lazy merging". That is, when we're
    // about to add a CV to the stack, we don't merge it with anything first,
    // as the reference impl does. Instead we do merges using the *previous* CV
    // that was added, which is sitting on top of the stack, and we put the new
    // CV (unmerged) on top of the stack afterwards. This guarantees that we
    // never merge the root node until finalize().
    //
    // Solving the first problem requires an additional tool,
    // compress_subtree_to_parent_node(). That function always returns the top
    // *two* chaining values of the subtree it's compressing. We then do lazy
    // merging with each of them separately, so that the second CV will always
    // remain unmerged. (That also helps us support extendable output when
    // we're hashing an input all-at-once.)
    fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) {
        self.merge_cv_stack(chunk_counter);
        self.cv_stack.push(*new_cv);
    }

    /// Add input bytes to the hash state. You can call this any number of times.
    ///
    /// This method is always single-threaded. For multithreading support, see
    /// [`update_rayon`](#method.update_rayon) (enabled with the `rayon` Cargo feature).
/// /// Note that the degree of SIMD parallelism that `update` can use is limited by the size of /// this input buffer. See [`update_reader`](#method.update_reader). pub fn update(&mut self, input: &[u8]) -> &mut Self { self.update_with_join::(input) } fn update_with_join(&mut self, mut input: &[u8]) -> &mut Self { let input_offset = self.initial_chunk_counter * CHUNK_LEN as u64; if let Some(max) = hazmat::max_subtree_len(input_offset) { let remaining = max - self.count(); assert!( input.len() as u64 <= remaining, "the subtree starting at {} contains at most {} bytes (found {})", CHUNK_LEN as u64 * self.initial_chunk_counter, max, input.len(), ); } // If we have some partial chunk bytes in the internal chunk_state, we // need to finish that chunk first. if self.chunk_state.count() > 0 { let want = CHUNK_LEN - self.chunk_state.count(); let take = cmp::min(want, input.len()); self.chunk_state.update(&input[..take]); input = &input[take..]; if !input.is_empty() { // We've filled the current chunk, and there's more input // coming, so we know it's not the root and we can finalize it. // Then we'll proceed to hashing whole chunks below. debug_assert_eq!(self.chunk_state.count(), CHUNK_LEN); let chunk_cv = self.chunk_state.output().chaining_value(); self.push_cv(&chunk_cv, self.chunk_state.chunk_counter); self.chunk_state = ChunkState::new( &self.key, self.chunk_state.chunk_counter + 1, self.chunk_state.flags, self.chunk_state.platform, ); } else { return self; } } // Now the chunk_state is clear, and we have more input. If there's // more than a single chunk (so, definitely not the root chunk), hash // the largest whole subtree we can, with the full benefits of SIMD and // multithreading parallelism. Two restrictions: // - The subtree has to be a power-of-2 number of chunks. Only subtrees // along the right edge can be incomplete, and we don't know where // the right edge is going to be until we get to finalize(). 
// - The subtree must evenly divide the total number of chunks up until // this point (if total is not 0). If the current incomplete subtree // is only waiting for 1 more chunk, we can't hash a subtree of 4 // chunks. We have to complete the current subtree first. // Because we might need to break up the input to form powers of 2, or // to evenly divide what we already have, this part runs in a loop. while input.len() > CHUNK_LEN { debug_assert_eq!(self.chunk_state.count(), 0, "no partial chunk data"); debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len"); let mut subtree_len = largest_power_of_two_leq(input.len()); let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64; // Shrink the subtree_len until it evenly divides the count so far. // We know that subtree_len itself is a power of 2, so we can use a // bitmasking trick instead of an actual remainder operation. (Note // that if the caller consistently passes power-of-2 inputs of the // same size, as is hopefully typical, this loop condition will // always fail, and subtree_len will always be the full length of // the input.) // // An aside: We don't have to shrink subtree_len quite this much. // For example, if count_so_far is 1, we could pass 2 chunks to // compress_subtree_to_parent_node. Since we'll get 2 CVs back, // we'll still get the right answer in the end, and we might get to // use 2-way SIMD parallelism. The problem with this optimization, // is that it gets us stuck always hashing 2 chunks. The total // number of chunks will remain odd, and we'll never graduate to // higher degrees of parallelism. See // https://github.com/BLAKE3-team/BLAKE3/issues/69. while (subtree_len - 1) as u64 & count_so_far != 0 { subtree_len /= 2; } // The shrunken subtree_len might now be 1 chunk long. If so, hash // that one chunk by itself. Otherwise, compress the subtree into a // pair of CVs. 
let subtree_chunks = (subtree_len / CHUNK_LEN) as u64; if subtree_len <= CHUNK_LEN { debug_assert_eq!(subtree_len, CHUNK_LEN); self.push_cv( &ChunkState::new( &self.key, self.chunk_state.chunk_counter, self.chunk_state.flags, self.chunk_state.platform, ) .update(&input[..subtree_len]) .output() .chaining_value(), self.chunk_state.chunk_counter, ); } else { // This is the high-performance happy path, though getting here // depends on the caller giving us a long enough input. let cv_pair = compress_subtree_to_parent_node::( &input[..subtree_len], &self.key, self.chunk_state.chunk_counter, self.chunk_state.flags, self.chunk_state.platform, ); let left_cv = array_ref!(cv_pair, 0, 32); let right_cv = array_ref!(cv_pair, 32, 32); // Push the two CVs we received into the CV stack in order. Because // the stack merges lazily, this guarantees we aren't merging the // root. self.push_cv(left_cv, self.chunk_state.chunk_counter); self.push_cv( right_cv, self.chunk_state.chunk_counter + (subtree_chunks / 2), ); } self.chunk_state.chunk_counter += subtree_chunks; input = &input[subtree_len..]; } // What remains is 1 chunk or less. Add it to the chunk state. debug_assert!(input.len() <= CHUNK_LEN); if !input.is_empty() { self.chunk_state.update(input); // Having added some input to the chunk_state, we know what's in // the CV stack won't become the root node, and we can do an extra // merge. This simplifies finalize(). self.merge_cv_stack(self.chunk_state.chunk_counter); } self } fn final_output(&self) -> Output { // If the current chunk is the only chunk, that makes it the root node // also. Convert it directly into an Output. Otherwise, we need to // merge subtrees below. if self.cv_stack.is_empty() { debug_assert_eq!(self.chunk_state.chunk_counter, self.initial_chunk_counter); return self.chunk_state.output(); } // If there are any bytes in the ChunkState, finalize that chunk and // merge its CV with everything in the CV stack. 
In that case, the work // we did at the end of update() above guarantees that the stack // doesn't contain any unmerged subtrees that need to be merged first. // (This is important, because if there were two chunk hashes sitting // on top of the stack, they would need to merge with each other, and // merging a new chunk hash into them would be incorrect.) // // If there are no bytes in the ChunkState, we'll merge what's already // in the stack. In this case it's fine if there are unmerged chunks on // top, because we'll merge them with each other. Note that the case of // the empty chunk is taken care of above. let mut output: Output; let mut num_cvs_remaining = self.cv_stack.len(); if self.chunk_state.count() > 0 { debug_assert_eq!( self.cv_stack.len(), (self.chunk_state.chunk_counter - self.initial_chunk_counter).count_ones() as usize, "cv stack does not need a merge", ); output = self.chunk_state.output(); } else { debug_assert!(self.cv_stack.len() >= 2); output = parent_node_output( &self.cv_stack[num_cvs_remaining - 2], &self.cv_stack[num_cvs_remaining - 1], &self.key, self.chunk_state.flags, self.chunk_state.platform, ); num_cvs_remaining -= 2; } while num_cvs_remaining > 0 { output = parent_node_output( &self.cv_stack[num_cvs_remaining - 1], &output.chaining_value(), &self.key, self.chunk_state.flags, self.chunk_state.platform, ); num_cvs_remaining -= 1; } output } /// Finalize the hash state and return the [`Hash`](struct.Hash.html) of /// the input. /// /// This method is idempotent. Calling it twice will give the same result. /// You can also add more input and finalize again. pub fn finalize(&self) -> Hash { assert_eq!( self.initial_chunk_counter, 0, "set_input_offset must be used with finalize_non_root", ); self.final_output().root_hash() } /// Finalize the hash state and return an [`OutputReader`], which can /// supply any number of output bytes. /// /// This method is idempotent. Calling it twice will give the same result. 
    /// You can also add more input and finalize again.
    ///
    /// [`OutputReader`]: struct.OutputReader.html
    pub fn finalize_xof(&self) -> OutputReader {
        assert_eq!(
            self.initial_chunk_counter, 0,
            "set_input_offset must be used with finalize_non_root",
        );
        OutputReader::new(self.final_output())
    }

    /// Return the total number of bytes hashed so far.
    ///
    /// [`hazmat::HasherExt::set_input_offset`] does not affect this value. This only counts bytes
    /// passed to [`update`](Hasher::update).
    pub fn count(&self) -> u64 {
        // Account for non-zero cases of Hasher::set_input_offset. Note that initial_chunk_counter
        // is always 0 for callers who don't use the hazmat module.
        // Whole chunks consumed so far, plus the partial chunk in chunk_state.
        (self.chunk_state.chunk_counter - self.initial_chunk_counter) * CHUNK_LEN as u64
            + self.chunk_state.count() as u64
    }

    /// As [`update`](Hasher::update), but reading from a
    /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) implementation.
    ///
    /// [`Hasher`] implements
    /// [`std::io::Write`](https://doc.rust-lang.org/std/io/trait.Write.html), so it's possible to
    /// use [`std::io::copy`](https://doc.rust-lang.org/std/io/fn.copy.html) to update a [`Hasher`]
    /// from any reader. Unfortunately, this standard approach can limit performance, because
    /// `copy` currently uses an internal 8 KiB buffer that isn't big enough to take advantage of
    /// all SIMD instruction sets. (In particular, [AVX-512](https://en.wikipedia.org/wiki/AVX-512)
    /// needs a 16 KiB buffer.) `update_reader` avoids this performance problem and is slightly
    /// more convenient.
    ///
    /// The internal buffer size this method uses may change at any time, and it may be different
    /// for different targets. The only guarantee is that it will be large enough for all of this
    /// crate's SIMD implementations on the current platform.
    ///
    /// The most common implementer of
    /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) might be
    /// [`std::fs::File`](https://doc.rust-lang.org/std/fs/struct.File.html), but note that memory
    /// mapping can be faster than this method for hashing large files. See
    /// [`update_mmap`](Hasher::update_mmap) and [`update_mmap_rayon`](Hasher::update_mmap_rayon),
    /// which require the `mmap` and (for the latter) `rayon` Cargo features.
    ///
    /// This method requires the `std` Cargo feature, which is enabled by default.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use std::fs::File;
    /// # use std::io;
    /// # fn main() -> io::Result<()> {
    /// // Hash standard input.
    /// let mut hasher = blake3::Hasher::new();
    /// hasher.update_reader(std::io::stdin().lock())?;
    /// println!("{}", hasher.finalize());
    /// # Ok(())
    /// # }
    /// ```
    #[cfg(feature = "std")]
    pub fn update_reader(&mut self, reader: impl std::io::Read) -> std::io::Result<&mut Self> {
        // copy_wide uses a buffer large enough for the widest SIMD backend,
        // feeding this Hasher through its Write impl.
        io::copy_wide(reader, self)?;
        Ok(self)
    }

    /// As [`update`](Hasher::update), but using Rayon-based multithreading
    /// internally.
    ///
    /// This method is gated by the `rayon` Cargo feature, which is disabled by
    /// default but enabled on [docs.rs](https://docs.rs).
    ///
    /// To get any performance benefit from multithreading, the input buffer
    /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is
    /// _slower_ than `update` for inputs under 128 KiB. That threshold varies
    /// quite a lot across different processors, and it's important to benchmark
    /// your specific use case. See also the performance warning associated with
    /// [`update_mmap_rayon`](Hasher::update_mmap_rayon).
    ///
    /// If you already have a large buffer in memory, and you want to hash it
    /// with multiple threads, this method is a good option. However, reading a
    /// file into memory just to call this method can be a performance mistake,
    /// both because it requires lots of memory and because single-threaded
    /// reads can be slow.
For hashing whole files, see /// [`update_mmap_rayon`](Hasher::update_mmap_rayon), which is gated by both /// the `rayon` and `mmap` Cargo features. #[cfg(feature = "rayon")] pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self { self.update_with_join::(input) } /// As [`update`](Hasher::update), but reading the contents of a file using memory mapping. /// /// Not all files can be memory mapped, and memory mapping small files can be slower than /// reading them the usual way. In those cases, this method will fall back to standard file IO. /// The heuristic for whether to use memory mapping is currently very simple (file size >= /// 16 KiB), and it might change at any time. /// /// Like [`update`](Hasher::update), this method is single-threaded. In this author's /// experience, memory mapping improves single-threaded performance by ~10% for large files /// that are already in cache. This probably varies between platforms, and as always it's a /// good idea to benchmark your own use case. In comparison, the multithreaded /// [`update_mmap_rayon`](Hasher::update_mmap_rayon) method can have a much larger impact on /// performance. /// /// There's a correctness reason that this method takes /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) instead of /// [`File`](https://doc.rust-lang.org/std/fs/struct.File.html): reading from a memory-mapped /// file ignores the seek position of the original file handle (it neither respects the current /// position nor updates the position). This difference in behavior would've caused /// `update_mmap` and [`update_reader`](Hasher::update_reader) to give different answers and /// have different side effects in some cases. Taking a /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) avoids this problem by /// making it clear that a new [`File`](https://doc.rust-lang.org/std/fs/struct.File.html) is /// opened internally. 
/// /// This method requires the `mmap` Cargo feature, which is disabled by default but enabled on /// [docs.rs](https://docs.rs). /// /// # Example /// /// ```no_run /// # use std::io; /// # use std::path::Path; /// # fn main() -> io::Result<()> { /// let path = Path::new("file.dat"); /// let mut hasher = blake3::Hasher::new(); /// hasher.update_mmap(path)?; /// println!("{}", hasher.finalize()); /// # Ok(()) /// # } /// ``` #[cfg(feature = "mmap")] pub fn update_mmap(&mut self, path: impl AsRef) -> std::io::Result<&mut Self> { let file = std::fs::File::open(path.as_ref())?; if let Some(mmap) = io::maybe_mmap_file(&file)? { self.update(&mmap); } else { io::copy_wide(&file, self)?; } Ok(self) } /// As [`update_rayon`](Hasher::update_rayon), but reading the contents of a file using /// memory mapping. This is the default behavior of `b3sum`. /// /// For large files that are likely to be in cache, this can be much faster than /// single-threaded hashing. When benchmarks report that BLAKE3 is 10x or 20x faster than other /// cryptographic hashes, this is usually what they're measuring. However... /// /// **Performance Warning:** There are cases where multithreading hurts performance. The worst /// case is [a large file on a spinning disk](https://github.com/BLAKE3-team/BLAKE3/issues/31), /// where simultaneous reads from multiple threads can cause "thrashing" (i.e. the disk spends /// more time seeking around than reading data). Windows tends to be somewhat worse about this, /// in part because it's less likely than Linux to keep very large files in cache. More /// generally, if your CPU cores are already busy, then multithreading will add overhead /// without improving performance. If your code runs in different environments that you don't /// control and can't measure, then unfortunately there's no one-size-fits-all answer for /// whether multithreading is a good idea. 
///
/// The memory mapping behavior of this function is the same as
/// [`update_mmap`](Hasher::update_mmap), and the heuristic for when to fall back to standard
/// file IO might change at any time.
///
/// This method requires both the `mmap` and `rayon` Cargo features, which are disabled by
/// default but enabled on [docs.rs](https://docs.rs).
///
/// # Example
///
/// ```no_run
/// # use std::io;
/// # use std::path::Path;
/// # fn main() -> io::Result<()> {
/// # #[cfg(feature = "rayon")]
/// # {
/// let path = Path::new("big_file.dat");
/// let mut hasher = blake3::Hasher::new();
/// hasher.update_mmap_rayon(path)?;
/// println!("{}", hasher.finalize());
/// # }
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "mmap")]
#[cfg(feature = "rayon")]
pub fn update_mmap_rayon(
    &mut self,
    // NOTE(review): the generic parameter `<std::path::Path>` was stripped from this
    // signature by a tag-eating text filter; restored here so the method compiles.
    path: impl AsRef<std::path::Path>,
) -> std::io::Result<&mut Self> {
    let file = std::fs::File::open(path.as_ref())?;
    if let Some(mmap) = io::maybe_mmap_file(&file)? {
        self.update_rayon(&mmap);
    } else {
        // Fall back to buffered, single-threaded reads when mmap is declined.
        io::copy_wide(&file, self)?;
    }
    Ok(self)
}
}

// Don't derive(Debug), because the state may be secret.
impl fmt::Debug for Hasher {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Only non-secret configuration fields are printed.
        f.debug_struct("Hasher")
            .field("flags", &self.chunk_state.flags)
            .field("platform", &self.chunk_state.platform)
            .finish()
    }
}

impl Default for Hasher {
    #[inline]
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(feature = "std")]
impl std::io::Write for Hasher {
    /// This is equivalent to [`update`](#method.update).
    #[inline]
    fn write(&mut self, input: &[u8]) -> std::io::Result<usize> {
        // `update` consumes the whole buffer, so the full length is always reported.
        self.update(input);
        Ok(input.len())
    }

    #[inline]
    fn flush(&mut self) -> std::io::Result<()> {
        // Hashing buffers nothing that needs flushing.
        Ok(())
    }
}

#[cfg(feature = "zeroize")]
impl Zeroize for Hasher {
    fn zeroize(&mut self) {
        // Destructuring to trigger compile error as a reminder to update this impl.
        let Self {
            key,
            chunk_state,
            initial_chunk_counter,
            cv_stack,
        } = self;
        key.zeroize();
        chunk_state.zeroize();
        initial_chunk_counter.zeroize();
        cv_stack.zeroize();
    }
}

/// An incremental reader for extended output, returned by
/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof).
///
/// Shorter BLAKE3 outputs are prefixes of longer ones, and explicitly requesting a short output is
/// equivalent to truncating the default-length output. Note that this is a difference between
/// BLAKE2 and BLAKE3.
///
/// # Security notes
///
/// Outputs shorter than the default length of 32 bytes (256 bits) provide less security. An N-bit
/// BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2
/// bits of collision resistance, for any N up to 256. Longer outputs don't provide any additional
/// security.
///
/// Avoid relying on the secrecy of the output offset, that is, the number of output bytes read or
/// the arguments to [`seek`](struct.OutputReader.html#method.seek) or
/// [`set_position`](struct.OutputReader.html#method.set_position). [_Block-Cipher-Based Tree
/// Hashing_ by Aldo Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows
/// both the message and the key (if any) can easily determine the offset of an extended output.
/// For comparison, AES-CTR has a similar property: if you know the key, you can decrypt a block
/// from an unknown position in the output stream to recover its block index. Callers with strong
/// secret keys aren't affected in practice, but secret offsets are a [design
/// smell](https://en.wikipedia.org/wiki/Design_smell) in any case.
#[derive(Clone)]
pub struct OutputReader {
    // The finalized root node; `inner.counter` tracks which 64-byte output block comes next.
    inner: Output,
    // Byte offset within the current output block, always < BLOCK_LEN (64).
    position_within_block: u8,
}

impl OutputReader {
    // Construct a reader positioned at output offset 0.
    fn new(inner: Output) -> Self {
        Self {
            inner,
            position_within_block: 0,
        }
    }

    // This helper function handles both the case where the output buffer is
    // shorter than one block, and the case where our position_within_block is
    // non-zero.
    fn fill_one_block(&mut self, buf: &mut &mut [u8]) {
        // Recompute the current block; partial reads repeat this compression.
        let output_block: [u8; BLOCK_LEN] = self.inner.root_output_block();
        let output_bytes = &output_block[self.position_within_block as usize..];
        let take = cmp::min(buf.len(), output_bytes.len());
        buf[..take].copy_from_slice(&output_bytes[..take]);
        self.position_within_block += take as u8;
        // On reaching a block boundary, move to the next block counter.
        if self.position_within_block == BLOCK_LEN as u8 {
            self.inner.counter += 1;
            self.position_within_block = 0;
        }
        // Advance the dest buffer. mem::take() is a borrowck workaround.
        *buf = &mut core::mem::take(buf)[take..];
    }

    /// Fill a buffer with output bytes and advance the position of the
    /// `OutputReader`. This is equivalent to [`Read::read`], except that it
    /// doesn't return a `Result`. Both methods always fill the entire buffer.
    ///
    /// Note that `OutputReader` doesn't buffer output bytes internally, so
    /// calling `fill` repeatedly with a short-length or odd-length slice will
    /// end up performing the same compression multiple times. If you're
    /// reading output in a loop, prefer a slice length that's a multiple of
    /// [`BLOCK_LEN`] (64 bytes).
    ///
    /// The maximum output size of BLAKE3 is 2<sup>64</sup>-1 bytes. If you try
    /// to extract more than that, for example by seeking near the end and
    /// reading further, the behavior is unspecified.
    ///
    /// [`Read::read`]: #method.read
    pub fn fill(&mut self, mut buf: &mut [u8]) {
        if buf.is_empty() {
            return;
        }

        // If we're partway through a block, try to get to a block boundary.
        if self.position_within_block != 0 {
            self.fill_one_block(&mut buf);
        }

        // Bulk-produce all whole blocks with the platform's wide XOF kernel.
        let full_blocks = buf.len() / BLOCK_LEN;
        let full_blocks_len = full_blocks * BLOCK_LEN;
        if full_blocks > 0 {
            debug_assert_eq!(0, self.position_within_block);
            self.inner.platform.xof_many(
                &self.inner.input_chaining_value,
                &self.inner.block,
                self.inner.block_len,
                self.inner.counter,
                self.inner.flags | ROOT,
                &mut buf[..full_blocks_len],
            );
            self.inner.counter += full_blocks as u64;
            buf = &mut buf[full_blocks * BLOCK_LEN..];
        }

        // Handle the trailing partial block, if any.
        if !buf.is_empty() {
            debug_assert!(buf.len() < BLOCK_LEN);
            self.fill_one_block(&mut buf);
            debug_assert!(buf.is_empty());
        }
    }

    /// Return the current read position in the output stream. This is
    /// equivalent to [`Seek::stream_position`], except that it doesn't return
    /// a `Result`. The position of a new `OutputReader` starts at 0, and each
    /// call to [`fill`] or [`Read::read`] moves the position forward by the
    /// number of bytes read.
    ///
    /// [`Seek::stream_position`]: #method.stream_position
    /// [`fill`]: #method.fill
    /// [`Read::read`]: #method.read
    pub fn position(&self) -> u64 {
        self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64
    }

    /// Seek to a new read position in the output stream. This is equivalent to
    /// calling [`Seek::seek`] with [`SeekFrom::Start`], except that it doesn't
    /// return a `Result`.
    ///
    /// [`Seek::seek`]: #method.seek
    /// [`SeekFrom::Start`]: https://doc.rust-lang.org/std/io/enum.SeekFrom.html
    pub fn set_position(&mut self, position: u64) {
        // Decompose the absolute byte offset into (block counter, in-block offset).
        self.position_within_block = (position % BLOCK_LEN as u64) as u8;
        self.inner.counter = position / BLOCK_LEN as u64;
    }
}

// Don't derive(Debug), because the state may be secret.
impl fmt::Debug for OutputReader {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Print only the read position, never the secret internal state.
        f.debug_struct("OutputReader")
            .field("position", &self.position())
            .finish()
    }
}

#[cfg(feature = "std")]
impl std::io::Read for OutputReader {
    #[inline]
    // NOTE(review): the `<usize>` in the return type was stripped by a tag-eating
    // text filter; restored to match the `std::io::Read` trait signature.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        // `fill` always fills the whole buffer, so this never short-reads.
        self.fill(buf);
        Ok(buf.len())
    }
}

#[cfg(feature = "std")]
impl std::io::Seek for OutputReader {
    // NOTE(review): the `<u64>` in the return type was stripped; restored to match
    // the `std::io::Seek` trait signature.
    fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
        // Compute the target in i128 so that Current(negative) offsets can be
        // range-checked without overflow.
        let max_position = u64::MAX as i128;
        let target_position: i128 = match pos {
            std::io::SeekFrom::Start(x) => x as i128,
            std::io::SeekFrom::Current(x) => self.position() as i128 + x as i128,
            std::io::SeekFrom::End(_) => {
                // The output stream is effectively unbounded, so there is no "end".
                return Err(std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "seek from end not supported",
                ));
            }
        };
        if target_position < 0 {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "seek before start",
            ));
        }
        // Clamp to u64::MAX; reading past the maximum output size is unspecified anyway.
        self.set_position(cmp::min(target_position, max_position) as u64);
        Ok(self.position())
    }
}

#[cfg(feature = "zeroize")]
impl Zeroize for OutputReader {
    fn zeroize(&mut self) {
        // Destructuring to trigger compile error as a reminder to update this impl.
        let Self {
            inner,
            position_within_block,
        } = self;
        inner.zeroize();
        position_within_block.zeroize();
    }
}
blake3-1.8.1/src/platform.rs000064400000000000000000000463541046102023000137670ustar 00000000000000use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
use arrayref::{array_mut_ref, array_ref};

cfg_if::cfg_if! {
    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
        cfg_if::cfg_if!
{ if #[cfg(blake3_avx512_ffi)] { pub const MAX_SIMD_DEGREE: usize = 16; } else { pub const MAX_SIMD_DEGREE: usize = 8; } } } else if #[cfg(blake3_neon)] { pub const MAX_SIMD_DEGREE: usize = 4; } else if #[cfg(blake3_wasm32_simd)] { pub const MAX_SIMD_DEGREE: usize = 4; } else { pub const MAX_SIMD_DEGREE: usize = 1; } } // There are some places where we want a static size that's equal to the // MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently // allowed to use cmp::max, so we have to hardcode this additional constant // value. Get rid of this once cmp::max is a const fn. cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if::cfg_if! { if #[cfg(blake3_avx512_ffi)] { pub const MAX_SIMD_DEGREE_OR_2: usize = 16; } else { pub const MAX_SIMD_DEGREE_OR_2: usize = 8; } } } else if #[cfg(blake3_neon)] { pub const MAX_SIMD_DEGREE_OR_2: usize = 4; } else if #[cfg(blake3_wasm32_simd)] { pub const MAX_SIMD_DEGREE_OR_2: usize = 4; } else { pub const MAX_SIMD_DEGREE_OR_2: usize = 2; } } #[derive(Clone, Copy, Debug)] pub enum Platform { Portable, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] SSE2, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] SSE41, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX2, #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX512, #[cfg(blake3_neon)] NEON, #[cfg(blake3_wasm32_simd)] #[allow(non_camel_case_types)] WASM32_SIMD, } impl Platform { #[allow(unreachable_code)] pub fn detect() -> Self { #[cfg(miri)] { return Platform::Portable; } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[cfg(blake3_avx512_ffi)] { if avx512_detected() { return Platform::AVX512; } } if avx2_detected() { return Platform::AVX2; } if sse41_detected() { return Platform::SSE41; } if sse2_detected() { return Platform::SSE2; } } // We don't use dynamic feature detection for NEON. If the "neon" // feature is on, NEON is assumed to be supported. 
#[cfg(blake3_neon)] { return Platform::NEON; } #[cfg(blake3_wasm32_simd)] { return Platform::WASM32_SIMD; } Platform::Portable } pub fn simd_degree(&self) -> usize { let degree = match self { Platform::Portable => 1, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => 4, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 => 4, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => 8, #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => 16, #[cfg(blake3_neon)] Platform::NEON => 4, #[cfg(blake3_wasm32_simd)] Platform::WASM32_SIMD => 4, }; debug_assert!(degree <= MAX_SIMD_DEGREE); degree } pub fn compress_in_place( &self, cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { match self { Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => unsafe { crate::sse2::compress_in_place(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { crate::sse41::compress_in_place(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::compress_in_place(cv, block, block_len, counter, flags) }, // No NEON compress_in_place() implementation yet. 
#[cfg(blake3_neon)] Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags), #[cfg(blake3_wasm32_simd)] Platform::WASM32_SIMD => { crate::wasm32_simd::compress_in_place(cv, block, block_len, counter, flags) } } } pub fn compress_xof( &self, cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { match self { Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => unsafe { crate::sse2::compress_xof(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { crate::sse41::compress_xof(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::compress_xof(cv, block, block_len, counter, flags) }, // No NEON compress_xof() implementation yet. #[cfg(blake3_neon)] Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags), #[cfg(blake3_wasm32_simd)] Platform::WASM32_SIMD => { crate::wasm32_simd::compress_xof(cv, block, block_len, counter, flags) } } } // IMPLEMENTATION NOTE // =================== // hash_many() applies two optimizations. The critically important // optimization is the high-performance parallel SIMD hashing mode, // described in detail in the spec. This more than doubles throughput per // thread. Another optimization is keeping the state vectors transposed // from block to block within a chunk. When state vectors are transposed // after every block, there's a small but measurable performance loss. // Compressing chunks with a dedicated loop avoids this. 
pub fn hash_many( &self, inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { match self { Platform::Portable => portable::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => unsafe { crate::sse2::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 => unsafe { crate::sse41::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => unsafe { crate::avx2::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Assumed to be safe if the "neon" feature is on. #[cfg(blake3_neon)] Platform::NEON => unsafe { crate::neon::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Assumed to be safe if the "wasm32_simd" feature is on. 
#[cfg(blake3_wasm32_simd)] Platform::WASM32_SIMD => unsafe { crate::wasm32_simd::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, } } pub fn xof_many( &self, cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, mut counter: u64, flags: u8, out: &mut [u8], ) { debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only"); if out.is_empty() { // The current assembly implementation always outputs at least 1 block. return; } match self { // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(unix)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::xof_many(cv, block, block_len, counter, flags, out) }, _ => { // For platforms without an optimized xof_many, fall back to a loop over // compress_xof. This is still faster than portable code. for out_block in out.chunks_exact_mut(BLOCK_LEN) { // TODO: Use array_chunks_mut here once that's stable. let out_array: &mut [u8; BLOCK_LEN] = out_block.try_into().unwrap(); *out_array = self.compress_xof(cv, block, block_len, counter, flags); counter += 1; } } } } // Explicit platform constructors, for benchmarks. pub fn portable() -> Self { Self::Portable } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn sse2() -> Option { if sse2_detected() { Some(Self::SSE2) } else { None } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn sse41() -> Option { if sse41_detected() { Some(Self::SSE41) } else { None } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn avx2() -> Option { if avx2_detected() { Some(Self::AVX2) } else { None } } #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn avx512() -> Option { if avx512_detected() { Some(Self::AVX512) } else { None } } #[cfg(blake3_neon)] pub fn neon() -> Option { // Assumed to be safe if the "neon" feature is on. 
Some(Self::NEON) } #[cfg(blake3_wasm32_simd)] pub fn wasm32_simd() -> Option { // Assumed to be safe if the "wasm32_simd" feature is on. Some(Self::WASM32_SIMD) } } // Note that AVX-512 is divided into multiple featuresets, and we use two of // them, F and VL. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] #[allow(unreachable_code)] pub fn avx512_detected() -> bool { if cfg!(miri) { return false; } // A testing-only short-circuit. if cfg!(feature = "no_avx512") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))] { return true; } // Dynamic check, if std is enabled. #[cfg(feature = "std")] { if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { return true; } } false } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] #[allow(unreachable_code)] pub fn avx2_detected() -> bool { if cfg!(miri) { return false; } // A testing-only short-circuit. if cfg!(feature = "no_avx2") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(target_feature = "avx2")] { return true; } // Dynamic check, if std is enabled. #[cfg(feature = "std")] { if is_x86_feature_detected!("avx2") { return true; } } false } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] #[allow(unreachable_code)] pub fn sse41_detected() -> bool { if cfg!(miri) { return false; } // A testing-only short-circuit. if cfg!(feature = "no_sse41") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(target_feature = "sse4.1")] { return true; } // Dynamic check, if std is enabled. 
#[cfg(feature = "std")] { if is_x86_feature_detected!("sse4.1") { return true; } } false } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] #[allow(unreachable_code)] pub fn sse2_detected() -> bool { if cfg!(miri) { return false; } // A testing-only short-circuit. if cfg!(feature = "no_sse2") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(target_feature = "sse2")] { return true; } // Dynamic check, if std is enabled. #[cfg(feature = "std")] { if is_x86_feature_detected!("sse2") { return true; } } false } #[inline(always)] pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] { let mut out = [0; 8]; out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); out } #[inline(always)] pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] { let mut out = [0; 16]; out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4)); out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4)); out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4)); out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4)); out[12] = 
u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4)); out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4)); out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4)); out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4)); out } #[inline(always)] pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { let mut out = [0; 32]; *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); out } #[inline(always)] pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] { let mut out = [0; 64]; *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes(); *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes(); *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes(); *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes(); *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes(); *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes(); *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes(); *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes(); out } blake3-1.8.1/src/portable.rs000064400000000000000000000124511046102023000137440ustar 00000000000000use crate::{ counter_high, counter_low, CVBytes, CVWords, 
IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, }; use arrayref::{array_mut_ref, array_ref}; #[inline(always)] fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); state[d] = (state[d] ^ state[a]).rotate_right(16); state[c] = state[c].wrapping_add(state[d]); state[b] = (state[b] ^ state[c]).rotate_right(12); state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); state[d] = (state[d] ^ state[a]).rotate_right(8); state[c] = state[c].wrapping_add(state[d]); state[b] = (state[b] ^ state[c]).rotate_right(7); } #[inline(always)] fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) { // Select the message schedule based on the round. let schedule = MSG_SCHEDULE[round]; // Mix the columns. g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); // Mix the diagonals. 
g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); } #[inline(always)] fn compress_pre( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u32; 16] { let block_words = crate::platform::words_from_le_bytes_64(block); let mut state = [ cv[0], cv[1], cv[2], cv[3], cv[4], cv[5], cv[6], cv[7], IV[0], IV[1], IV[2], IV[3], counter_low(counter), counter_high(counter), block_len as u32, flags as u32, ]; round(&mut state, &block_words, 0); round(&mut state, &block_words, 1); round(&mut state, &block_words, 2); round(&mut state, &block_words, 3); round(&mut state, &block_words, 4); round(&mut state, &block_words, 5); round(&mut state, &block_words, 6); state } pub fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { let state = compress_pre(cv, block, block_len, counter, flags); cv[0] = state[0] ^ state[8]; cv[1] = state[1] ^ state[9]; cv[2] = state[2] ^ state[10]; cv[3] = state[3] ^ state[11]; cv[4] = state[4] ^ state[12]; cv[5] = state[5] ^ state[13]; cv[6] = state[6] ^ state[14]; cv[7] = state[7] ^ state[15]; } pub fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { let mut state = compress_pre(cv, block, block_len, counter, flags); state[0] ^= state[8]; state[1] ^= state[9]; state[2] ^= state[10]; state[3] ^= state[11]; state[4] ^= state[12]; state[5] ^= state[13]; state[6] ^= state[14]; state[7] ^= state[15]; state[8] ^= cv[0]; state[9] ^= cv[1]; state[10] ^= cv[2]; state[11] ^= cv[3]; state[12] ^= cv[4]; state[13] ^= cv[5]; state[14] ^= cv[6]; state[15] ^= cv[7]; crate::platform::le_bytes_from_words_64(&state) } pub fn hash1( input: &[u8; N], key: &CVWords, counter: u64, flags: u8, flags_start: u8, flags_end: u8, out: &mut CVBytes, ) 
{ debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); let mut cv = *key; let mut block_flags = flags | flags_start; let mut slice = &input[..]; while slice.len() >= BLOCK_LEN { if slice.len() == BLOCK_LEN { block_flags |= flags_end; } compress_in_place( &mut cv, array_ref!(slice, 0, BLOCK_LEN), BLOCK_LEN as u8, counter, block_flags, ); block_flags = flags; slice = &slice[BLOCK_LEN..]; } *out = crate::platform::le_bytes_from_words_32(&cv); } pub fn hash_many( inputs: &[&[u8; N]], key: &CVWords, mut counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { hash1( input, key, counter, flags, flags_start, flags_end, array_mut_ref!(output, 0, OUT_LEN), ); if increment_counter.yes() { counter += 1; } } } #[cfg(test)] pub mod test { use super::*; // This is basically testing the portable implementation against itself, // but it also checks that compress_in_place and compress_xof are // consistent. And there are tests against the reference implementation and // against hardcoded test vectors elsewhere. #[test] fn test_compress() { crate::test::test_compress_fn(compress_in_place, compress_xof); } // Ditto. #[test] fn test_hash_many() { crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.8.1/src/rust_avx2.rs000064400000000000000000000366631046102023000141040ustar 00000000000000#[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; use crate::{ counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, }; use arrayref::{array_mut_ref, mut_array_refs}; pub const DEGREE: usize = 8; #[inline(always)] unsafe fn loadu(src: *const u8) -> __m256i { // This is an unaligned load, so the pointer cast is allowed. 
_mm256_loadu_si256(src as *const __m256i) } #[inline(always)] unsafe fn storeu(src: __m256i, dest: *mut u8) { // This is an unaligned store, so the pointer cast is allowed. _mm256_storeu_si256(dest as *mut __m256i, src) } #[inline(always)] unsafe fn add(a: __m256i, b: __m256i) -> __m256i { _mm256_add_epi32(a, b) } #[inline(always)] unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { _mm256_xor_si256(a, b) } #[inline(always)] unsafe fn set1(x: u32) -> __m256i { _mm256_set1_epi32(x as i32) } #[inline(always)] unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i { _mm256_setr_epi32( a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32, ) } // These rotations are the "simple/shifts version". For the // "complicated/shuffles version", see // https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. // For a discussion of the tradeoffs, see // https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug // (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better // on recent x86 chips. 
#[inline(always)] unsafe fn rot16(x: __m256i) -> __m256i { _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16)) } #[inline(always)] unsafe fn rot12(x: __m256i) -> __m256i { _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)) } #[inline(always)] unsafe fn rot8(x: __m256i) -> __m256i { _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8)) } #[inline(always)] unsafe fn rot7(x: __m256i) -> __m256i { _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)) } #[inline(always)] unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) { v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[15] = rot8(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); 
v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot7(v[4]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot16(v[15]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[10] = add(v[10], v[15]); v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[4] = rot12(v[4]); v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot8(v[15]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[10] = add(v[10], v[15]); v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[4] = rot7(v[4]); } #[inline(always)] unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) { ( _mm256_permute2x128_si256(a, b, 0x20), _mm256_permute2x128_si256(a, b, 0x31), ) } // There are several ways to do a transposition. 
We could do it naively, with 8 separate // _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy // the vecs into contiguous storage and then use gather instructions. This third approach is to use // a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the // fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the // https://github.com/oconnor663/bao_experiments repo. #[inline(always)] unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77. let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is 11/33. let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); // Interleave 128-bit lanes. 
let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04); let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15); let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26); let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37); vecs[0] = abcdefgh_0; vecs[1] = abcdefgh_1; vecs[2] = abcdefgh_2; vecs[3] = abcdefgh_3; vecs[4] = abcdefgh_4; vecs[5] = abcdefgh_5; vecs[6] = abcdefgh_6; vecs[7] = abcdefgh_7; } #[inline(always)] unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] { let mut vecs = [ loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), ]; for i in 0..DEGREE { _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); } let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); vecs } #[inline(always)] unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { let mask = if increment_counter.yes() { !0 } else { 0 }; ( set8( counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)), counter_low(counter + (mask & 4)), 
counter_low(counter + (mask & 5)), counter_low(counter + (mask & 6)), counter_low(counter + (mask & 7)), ), set8( counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)), counter_high(counter + (mask & 4)), counter_high(counter + (mask & 5)), counter_high(counter + (mask & 6)), counter_high(counter + (mask & 7)), ), ) } #[target_feature(enable = "avx2")] pub unsafe fn hash8( inputs: &[*const u8; DEGREE], blocks: usize, key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8; DEGREE * OUT_LEN], ) { let mut h_vecs = [ set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), ]; let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); let mut block_flags = flags | flags_start; for block in 0..blocks { if block + 1 == blocks { block_flags |= flags_end; } let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only let block_flags_vec = set1(block_flags as u32); let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); // The transposed compression function. Note that inlining this // manually here improves compile times by a lot, compared to factoring // it out into its own function and making it #[inline(always)]. Just // guessing, it might have something to do with loop unrolling. 
let mut v = [ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, ]; round(&mut v, &msg_vecs, 0); round(&mut v, &msg_vecs, 1); round(&mut v, &msg_vecs, 2); round(&mut v, &msg_vecs, 3); round(&mut v, &msg_vecs, 4); round(&mut v, &msg_vecs, 5); round(&mut v, &msg_vecs, 6); h_vecs[0] = xor(v[0], v[8]); h_vecs[1] = xor(v[1], v[9]); h_vecs[2] = xor(v[2], v[10]); h_vecs[3] = xor(v[3], v[11]); h_vecs[4] = xor(v[4], v[12]); h_vecs[5] = xor(v[5], v[13]); h_vecs[6] = xor(v[6], v[14]); h_vecs[7] = xor(v[7], v[15]); block_flags = flags; } transpose_vecs(&mut h_vecs); storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE)); storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE)); storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE)); storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE)); storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE)); storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE)); storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); } #[target_feature(enable = "avx2")] pub unsafe fn hash_many( mut inputs: &[&[u8; N]], key: &CVWords, mut counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, mut out: &mut [u8], ) { debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { // Safe because the layout of arrays is guaranteed, and because the // `blocks` count is determined statically from the argument type. 
let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); let blocks = N / BLOCK_LEN; hash8( input_ptrs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, array_mut_ref!(out, 0, DEGREE * OUT_LEN), ); if increment_counter.yes() { counter += DEGREE as u64; } inputs = &inputs[DEGREE..]; out = &mut out[DEGREE * OUT_LEN..]; } crate::sse41::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ); } #[cfg(test)] mod test { use super::*; #[test] fn test_transpose() { if !crate::platform::avx2_detected() { return; } #[target_feature(enable = "avx2")] unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) { transpose_vecs(vecs); } let mut matrix = [[0 as u32; DEGREE]; DEGREE]; for i in 0..DEGREE { for j in 0..DEGREE { matrix[i][j] = (i * DEGREE + j) as u32; } } unsafe { let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix); transpose_wrapper(&mut vecs); matrix = core::mem::transmute(vecs); } for i in 0..DEGREE { for j in 0..DEGREE { // Reversed indexes from above. assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); } } } #[test] fn test_hash_many() { if !crate::platform::avx2_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.8.1/src/rust_sse2.rs000064400000000000000000000604631046102023000140730ustar 00000000000000#[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; use crate::{ counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, }; use arrayref::{array_mut_ref, array_ref, mut_array_refs}; pub const DEGREE: usize = 4; #[inline(always)] unsafe fn loadu(src: *const u8) -> __m128i { // This is an unaligned load, so the pointer cast is allowed. _mm_loadu_si128(src as *const __m128i) } #[inline(always)] unsafe fn storeu(src: __m128i, dest: *mut u8) { // This is an unaligned store, so the pointer cast is allowed. 
_mm_storeu_si128(dest as *mut __m128i, src) } #[inline(always)] unsafe fn add(a: __m128i, b: __m128i) -> __m128i { _mm_add_epi32(a, b) } #[inline(always)] unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { _mm_xor_si128(a, b) } #[inline(always)] unsafe fn set1(x: u32) -> __m128i { _mm_set1_epi32(x as i32) } #[inline(always)] unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) } // These rotations are the "simple/shifts version". For the // "complicated/shuffles version", see // https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. // For a discussion of the tradeoffs, see // https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug // (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better // on recent x86 chips. #[inline(always)] unsafe fn rot16(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) } #[inline(always)] unsafe fn rot12(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) } #[inline(always)] unsafe fn rot8(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) } #[inline(always)] unsafe fn rot7(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) } #[inline(always)] unsafe fn g1( row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, m: __m128i, ) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot16(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot12(*row1); } #[inline(always)] unsafe fn g2( row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, m: __m128i, ) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot8(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot7(*row1); } // Adapted from 
https://github.com/rust-lang-nursery/stdsimd/pull/479. macro_rules! _MM_SHUFFLE { ($z:expr, $y:expr, $x:expr, $w:expr) => { ($z << 6) | ($y << 4) | ($x << 2) | $w }; } macro_rules! shuffle2 { ($a:expr, $b:expr, $c:expr) => { _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps($a), _mm_castsi128_ps($b), $c, )) }; } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. All the message loads below are adjusted to compensate for this. See // discussion at https://github.com/sneves/blake2-avx2/pull/4 #[inline(always)] unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); } #[inline(always)] unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); } #[inline(always)] unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); let mut mask = _mm_set1_epi16(imm8 as i16); mask = _mm_and_si128(mask, bits); mask = _mm_cmpeq_epi16(mask, bits); _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)) } #[inline(always)] unsafe fn compress_pre( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [__m128i; 4] { let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); let row3 = &mut set4( counter_low(counter), counter_high(counter), block_len as u32, flags as u32, ); let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); let mut m2 = 
loadu(block.as_ptr().add(2 * 4 * DEGREE)); let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); let mut t0; let mut t1; let mut t2; let mut t3; let mut tt; // Round 1. The first round permutes the message words from the original // input order, into the groups that get mixed in parallel. t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 g1(row0, row1, row2, row3, t2); t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 2. This round and all following rounds apply a fixed permutation // to the message words from the round before. 
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 3 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 4 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = 
_mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 5 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 6 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 7 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 
= _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); [*row0, *row1, *row2, *row3] } #[target_feature(enable = "sse2")] pub unsafe fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); } #[target_feature(enable = "sse2")] pub unsafe fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { let [mut row0, mut row1, mut row2, mut row3] = compress_pre(cv, block, block_len, counter, flags); row0 = xor(row0, row2); row1 = xor(row1, row3); row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); core::mem::transmute([row0, row1, row2, row3]) } #[inline(always)] unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = 
add(v[0], m[MSG_SCHEDULE[r][1] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[15] = rot8(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot7(v[4]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot16(v[15]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[10] = add(v[10], v[15]); v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[4] = rot12(v[4]); v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot8(v[15]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] 
= rot8(v[14]); v[10] = add(v[10], v[15]); v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[4] = rot7(v[4]); } #[inline(always)] unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is // 22/33. Note that this doesn't split the vector into two lanes, as the // AVX2 counterparts do. let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); // Interleave 64-bit lanes. let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); vecs[0] = abcd_0; vecs[1] = abcd_1; vecs[2] = abcd_2; vecs[3] = abcd_3; } #[inline(always)] unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { let mut vecs = [ loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 3 * 
4 * DEGREE)), ]; for i in 0..DEGREE { _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); } let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); transpose_vecs(squares.2); transpose_vecs(squares.3); vecs } #[inline(always)] unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { let mask = if increment_counter.yes() { !0 } else { 0 }; ( set4( counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)), ), set4( counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)), ), ) } #[target_feature(enable = "sse2")] pub unsafe fn hash4( inputs: &[*const u8; DEGREE], blocks: usize, key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8; DEGREE * OUT_LEN], ) { let mut h_vecs = [ set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), ]; let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); let mut block_flags = flags | flags_start; for block in 0..blocks { if block + 1 == blocks { block_flags |= flags_end; } let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only let block_flags_vec = set1(block_flags as u32); let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); // The transposed compression function. Note that inlining this // manually here improves compile times by a lot, compared to factoring // it out into its own function and making it #[inline(always)]. Just // guessing, it might have something to do with loop unrolling. 
let mut v = [ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, ]; round(&mut v, &msg_vecs, 0); round(&mut v, &msg_vecs, 1); round(&mut v, &msg_vecs, 2); round(&mut v, &msg_vecs, 3); round(&mut v, &msg_vecs, 4); round(&mut v, &msg_vecs, 5); round(&mut v, &msg_vecs, 6); h_vecs[0] = xor(v[0], v[8]); h_vecs[1] = xor(v[1], v[9]); h_vecs[2] = xor(v[2], v[10]); h_vecs[3] = xor(v[3], v[11]); h_vecs[4] = xor(v[4], v[12]); h_vecs[5] = xor(v[5], v[13]); h_vecs[6] = xor(v[6], v[14]); h_vecs[7] = xor(v[7], v[15]); block_flags = flags; } let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); // The first four vecs now contain the first half of each output, and the // second four vecs contain the second half of each output. storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); } #[target_feature(enable = "sse2")] unsafe fn hash1( input: &[u8; N], key: &CVWords, counter: u64, flags: u8, flags_start: u8, flags_end: u8, out: &mut CVBytes, ) { debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); let mut cv = *key; let mut block_flags = flags | flags_start; let mut slice = &input[..]; while slice.len() >= BLOCK_LEN { if slice.len() == BLOCK_LEN { block_flags |= flags_end; } compress_in_place( &mut cv, array_ref!(slice, 0, BLOCK_LEN), BLOCK_LEN as u8, counter, block_flags, ); block_flags = flags; slice = &slice[BLOCK_LEN..]; } *out = core::mem::transmute(cv); // x86 is 
little-endian } #[target_feature(enable = "sse2")] pub unsafe fn hash_many( mut inputs: &[&[u8; N]], key: &CVWords, mut counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, mut out: &mut [u8], ) { debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { // Safe because the layout of arrays is guaranteed, and because the // `blocks` count is determined statically from the argument type. let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); let blocks = N / BLOCK_LEN; hash4( input_ptrs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, array_mut_ref!(out, 0, DEGREE * OUT_LEN), ); if increment_counter.yes() { counter += DEGREE as u64; } inputs = &inputs[DEGREE..]; out = &mut out[DEGREE * OUT_LEN..]; } for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { hash1( input, key, counter, flags, flags_start, flags_end, array_mut_ref!(output, 0, OUT_LEN), ); if increment_counter.yes() { counter += 1; } } } #[cfg(test)] mod test { use super::*; #[test] fn test_transpose() { if !crate::platform::sse2_detected() { return; } #[target_feature(enable = "sse2")] unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { transpose_vecs(vecs); } let mut matrix = [[0 as u32; DEGREE]; DEGREE]; for i in 0..DEGREE { for j in 0..DEGREE { matrix[i][j] = (i * DEGREE + j) as u32; } } unsafe { let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); transpose_wrapper(&mut vecs); matrix = core::mem::transmute(vecs); } for i in 0..DEGREE { for j in 0..DEGREE { // Reversed indexes from above. 
assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); } } } #[test] fn test_compress() { if !crate::platform::sse2_detected() { return; } crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] fn test_hash_many() { if !crate::platform::sse2_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.8.1/src/rust_sse41.rs000064400000000000000000000600071046102023000141500ustar 00000000000000#[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; use crate::{ counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, }; use arrayref::{array_mut_ref, array_ref, mut_array_refs}; pub const DEGREE: usize = 4; #[inline(always)] unsafe fn loadu(src: *const u8) -> __m128i { // This is an unaligned load, so the pointer cast is allowed. _mm_loadu_si128(src as *const __m128i) } #[inline(always)] unsafe fn storeu(src: __m128i, dest: *mut u8) { // This is an unaligned store, so the pointer cast is allowed. _mm_storeu_si128(dest as *mut __m128i, src) } #[inline(always)] unsafe fn add(a: __m128i, b: __m128i) -> __m128i { _mm_add_epi32(a, b) } #[inline(always)] unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { _mm_xor_si128(a, b) } #[inline(always)] unsafe fn set1(x: u32) -> __m128i { _mm_set1_epi32(x as i32) } #[inline(always)] unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) } // These rotations are the "simple/shifts version". For the // "complicated/shuffles version", see // https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. // For a discussion of the tradeoffs, see // https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug // (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better // on recent x86 chips. 
#[inline(always)] unsafe fn rot16(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) } #[inline(always)] unsafe fn rot12(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) } #[inline(always)] unsafe fn rot8(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) } #[inline(always)] unsafe fn rot7(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) } #[inline(always)] unsafe fn g1( row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, m: __m128i, ) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot16(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot12(*row1); } #[inline(always)] unsafe fn g2( row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, m: __m128i, ) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot8(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot7(*row1); } // Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. macro_rules! _MM_SHUFFLE { ($z:expr, $y:expr, $x:expr, $w:expr) => { ($z << 6) | ($y << 4) | ($x << 2) | $w }; } macro_rules! shuffle2 { ($a:expr, $b:expr, $c:expr) => { _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps($a), _mm_castsi128_ps($b), $c, )) }; } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. All the message loads below are adjusted to compensate for this. 
See // discussion at https://github.com/sneves/blake2-avx2/pull/4 #[inline(always)] unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); } #[inline(always)] unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); } #[inline(always)] unsafe fn compress_pre( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [__m128i; 4] { let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); let row3 = &mut set4( counter_low(counter), counter_high(counter), block_len as u32, flags as u32, ); let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); let mut t0; let mut t1; let mut t2; let mut t3; let mut tt; // Round 1. The first round permutes the message words from the original // input order, into the groups that get mixed in parallel. 
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 g1(row0, row1, row2, row3, t2); t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 2. This round and all following rounds apply a fixed permutation // to the message words from the round before. t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 3 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = 
_mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 4 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 5 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 6 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = 
_mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 7 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); [*row0, *row1, *row2, *row3] } #[target_feature(enable = "sse4.1")] pub unsafe fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); } #[target_feature(enable = "sse4.1")] pub unsafe fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { let [mut row0, mut row1, mut row2, mut row3] = compress_pre(cv, block, block_len, counter, flags); row0 = xor(row0, row2); row1 = xor(row1, row3); row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); core::mem::transmute([row0, row1, row2, row3]) } #[inline(always)] unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 
16], r: usize) { v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[15] = rot8(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot7(v[4]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot16(v[15]); v[12] = rot16(v[12]); 
v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[10] = add(v[10], v[15]); v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[4] = rot12(v[4]); v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot8(v[15]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[10] = add(v[10], v[15]); v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[4] = rot7(v[4]); } #[inline(always)] unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is // 22/33. Note that this doesn't split the vector into two lanes, as the // AVX2 counterparts do. let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); // Interleave 64-bit lanes. 
let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); vecs[0] = abcd_0; vecs[1] = abcd_1; vecs[2] = abcd_2; vecs[3] = abcd_3; } #[inline(always)] unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { let mut vecs = [ loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), ]; for i in 0..DEGREE { _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); } let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); transpose_vecs(squares.2); transpose_vecs(squares.3); vecs } #[inline(always)] unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { let mask = if increment_counter.yes() { !0 } else { 0 }; ( set4( counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)), ), set4( counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), counter_high(counter + (mask & 2)), counter_high(counter + 
(mask & 3)), ), ) } #[target_feature(enable = "sse4.1")] pub unsafe fn hash4( inputs: &[*const u8; DEGREE], blocks: usize, key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8; DEGREE * OUT_LEN], ) { let mut h_vecs = [ set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), ]; let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); let mut block_flags = flags | flags_start; for block in 0..blocks { if block + 1 == blocks { block_flags |= flags_end; } let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only let block_flags_vec = set1(block_flags as u32); let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); // The transposed compression function. Note that inlining this // manually here improves compile times by a lot, compared to factoring // it out into its own function and making it #[inline(always)]. Just // guessing, it might have something to do with loop unrolling. let mut v = [ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, ]; round(&mut v, &msg_vecs, 0); round(&mut v, &msg_vecs, 1); round(&mut v, &msg_vecs, 2); round(&mut v, &msg_vecs, 3); round(&mut v, &msg_vecs, 4); round(&mut v, &msg_vecs, 5); round(&mut v, &msg_vecs, 6); h_vecs[0] = xor(v[0], v[8]); h_vecs[1] = xor(v[1], v[9]); h_vecs[2] = xor(v[2], v[10]); h_vecs[3] = xor(v[3], v[11]); h_vecs[4] = xor(v[4], v[12]); h_vecs[5] = xor(v[5], v[13]); h_vecs[6] = xor(v[6], v[14]); h_vecs[7] = xor(v[7], v[15]); block_flags = flags; } let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); // The first four vecs now contain the first half of each output, and the // second four vecs contain the second half of each output. 
storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); } #[target_feature(enable = "sse4.1")] unsafe fn hash1( input: &[u8; N], key: &CVWords, counter: u64, flags: u8, flags_start: u8, flags_end: u8, out: &mut CVBytes, ) { debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); let mut cv = *key; let mut block_flags = flags | flags_start; let mut slice = &input[..]; while slice.len() >= BLOCK_LEN { if slice.len() == BLOCK_LEN { block_flags |= flags_end; } compress_in_place( &mut cv, array_ref!(slice, 0, BLOCK_LEN), BLOCK_LEN as u8, counter, block_flags, ); block_flags = flags; slice = &slice[BLOCK_LEN..]; } *out = core::mem::transmute(cv); // x86 is little-endian } #[target_feature(enable = "sse4.1")] pub unsafe fn hash_many( mut inputs: &[&[u8; N]], key: &CVWords, mut counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, mut out: &mut [u8], ) { debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { // Safe because the layout of arrays is guaranteed, and because the // `blocks` count is determined statically from the argument type. 
let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); let blocks = N / BLOCK_LEN; hash4( input_ptrs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, array_mut_ref!(out, 0, DEGREE * OUT_LEN), ); if increment_counter.yes() { counter += DEGREE as u64; } inputs = &inputs[DEGREE..]; out = &mut out[DEGREE * OUT_LEN..]; } for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { hash1( input, key, counter, flags, flags_start, flags_end, array_mut_ref!(output, 0, OUT_LEN), ); if increment_counter.yes() { counter += 1; } } } #[cfg(test)] mod test { use super::*; #[test] fn test_transpose() { if !crate::platform::sse41_detected() { return; } #[target_feature(enable = "sse4.1")] unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { transpose_vecs(vecs); } let mut matrix = [[0 as u32; DEGREE]; DEGREE]; for i in 0..DEGREE { for j in 0..DEGREE { matrix[i][j] = (i * DEGREE + j) as u32; } } unsafe { let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); transpose_wrapper(&mut vecs); matrix = core::mem::transmute(vecs); } for i in 0..DEGREE { for j in 0..DEGREE { // Reversed indexes from above. assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); } } } #[test] fn test_compress() { if !crate::platform::sse41_detected() { return; } crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] fn test_hash_many() { if !crate::platform::sse41_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } blake3-1.8.1/src/test.rs000064400000000000000000001073511046102023000131170ustar 00000000000000use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN}; use arrayref::array_ref; use arrayvec::ArrayVec; use core::usize; use rand::prelude::*; // Interesting input lengths to run tests on. 
pub const TEST_CASES: &[usize] = &[ 0, 1, 2, 3, 4, 5, 6, 7, 8, BLOCK_LEN - 1, BLOCK_LEN, BLOCK_LEN + 1, 2 * BLOCK_LEN - 1, 2 * BLOCK_LEN, 2 * BLOCK_LEN + 1, CHUNK_LEN - 1, CHUNK_LEN, CHUNK_LEN + 1, 2 * CHUNK_LEN, 2 * CHUNK_LEN + 1, 3 * CHUNK_LEN, 3 * CHUNK_LEN + 1, 4 * CHUNK_LEN, 4 * CHUNK_LEN + 1, 5 * CHUNK_LEN, 5 * CHUNK_LEN + 1, 6 * CHUNK_LEN, 6 * CHUNK_LEN + 1, 7 * CHUNK_LEN, 7 * CHUNK_LEN + 1, 8 * CHUNK_LEN, 8 * CHUNK_LEN + 1, 16 * CHUNK_LEN - 1, 16 * CHUNK_LEN, // AVX512's bandwidth 16 * CHUNK_LEN + 1, 31 * CHUNK_LEN - 1, 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 31 * CHUNK_LEN + 1, 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks ]; pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; // There's a test to make sure these two are equal below. pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; pub const TEST_KEY_WORDS: CVWords = [ 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, ]; // Paint the input with a repeating byte pattern. We use a cycle length of 251, // because that's the largest prime number less than 256. This makes it // unlikely to swapping any two adjacent input blocks or chunks will give the // same answer. pub fn paint_test_input(buf: &mut [u8]) { for (i, b) in buf.iter_mut().enumerate() { *b = (i % 251) as u8; } } type CompressInPlaceFn = unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); type CompressXofFn = unsafe fn( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64]; // A shared helper function for platform-specific tests. pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { let initial_state = TEST_KEY_WORDS; let block_len: u8 = 61; let mut block = [0; BLOCK_LEN]; paint_test_input(&mut block[..block_len as usize]); // Use a counter with set bits in both 32-bit words. 
let counter = (5u64 << 32) + 6; let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH; let portable_out = crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags); let mut test_state = initial_state; unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) }; let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state); let test_xof = unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) }; assert_eq!(&portable_out[..32], &test_state_bytes[..]); assert_eq!(&portable_out[..], &test_xof[..]); } type HashManyFn = unsafe fn( inputs: &[&A], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ); // A shared helper function for platform-specific tests. pub fn test_hash_many_fn( hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>, hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>, ) { // Test a few different initial counter values. // - 0: The base case. // - u32::MAX: The low word of the counter overflows for all inputs except the first. // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR // when you're supposed to ANDNOT... let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; for counter in initial_counters { #[cfg(feature = "std")] dbg!(counter); // 31 (16 + 8 + 4 + 2 + 1) inputs const NUM_INPUTS: usize = 31; let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; crate::test::paint_test_input(&mut input_buf); // First hash chunks. 
let mut chunks = ArrayVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new(); for i in 0..NUM_INPUTS { chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN)); } let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; crate::portable::hash_many( &chunks, &TEST_KEY_WORDS, counter, IncrementCounter::Yes, crate::KEYED_HASH, crate::CHUNK_START, crate::CHUNK_END, &mut portable_chunks_out, ); let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; unsafe { hash_many_chunks_fn( &chunks[..], &TEST_KEY_WORDS, counter, IncrementCounter::Yes, crate::KEYED_HASH, crate::CHUNK_START, crate::CHUNK_END, &mut test_chunks_out, ); } for n in 0..NUM_INPUTS { #[cfg(feature = "std")] dbg!(n); assert_eq!( &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], &test_chunks_out[n * OUT_LEN..][..OUT_LEN] ); } // Then hash parents. let mut parents = ArrayVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new(); for i in 0..NUM_INPUTS { parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN)); } let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; crate::portable::hash_many( &parents, &TEST_KEY_WORDS, counter, IncrementCounter::No, crate::KEYED_HASH | crate::PARENT, 0, 0, &mut portable_parents_out, ); let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; unsafe { hash_many_parents_fn( &parents[..], &TEST_KEY_WORDS, counter, IncrementCounter::No, crate::KEYED_HASH | crate::PARENT, 0, 0, &mut test_parents_out, ); } for n in 0..NUM_INPUTS { #[cfg(feature = "std")] dbg!(n); assert_eq!( &portable_parents_out[n * OUT_LEN..][..OUT_LEN], &test_parents_out[n * OUT_LEN..][..OUT_LEN] ); } } } #[allow(unused)] type XofManyFunction = unsafe fn( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, out: &mut [u8], ); // A shared helper function for platform-specific tests. 
#[allow(unused)] pub fn test_xof_many_fn(xof_many_function: XofManyFunction) { let mut block = [0; BLOCK_LEN]; let block_len = 42; crate::test::paint_test_input(&mut block[..block_len]); let cv = [40, 41, 42, 43, 44, 45, 46, 47]; let flags = crate::KEYED_HASH; // Test a few different initial counter values. // - 0: The base case. // - u32::MAX: The low word of the counter overflows for all inputs except the first. // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR // when you're supposed to ANDNOT... let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; for counter in initial_counters { #[cfg(feature = "std")] dbg!(counter); // 31 (16 + 8 + 4 + 2 + 1) outputs const OUTPUT_SIZE: usize = 31 * BLOCK_LEN; let mut portable_out = [0u8; OUTPUT_SIZE]; for (i, out_block) in portable_out.chunks_exact_mut(64).enumerate() { out_block.copy_from_slice(&crate::portable::compress_xof( &cv, &block, block_len as u8, counter + i as u64, flags, )); } let mut test_out = [0u8; OUTPUT_SIZE]; unsafe { xof_many_function(&cv, &block, block_len as u8, counter, flags, &mut test_out); } assert_eq!(portable_out, test_out); } // Test that xof_many doesn't write more blocks than requested. Note that the current assembly // implementation always outputs at least one block, so we don't test the zero case. 
for block_count in 1..=32 { let mut array = [0; BLOCK_LEN * 33]; let output_start = 17; let output_len = block_count * BLOCK_LEN; let output_end = output_start + output_len; let output = &mut array[output_start..output_end]; unsafe { xof_many_function(&cv, &block, block_len as u8, 0, flags, output); } for i in 0..array.len() { if i < output_start || output_end <= i { assert_eq!(0, array[i], "index {i}"); } } } } #[test] fn test_key_bytes_equal_key_words() { assert_eq!( TEST_KEY_WORDS, crate::platform::words_from_le_bytes_32(&TEST_KEY), ); } #[test] fn test_reference_impl_size() { // Because the Rust compiler optimizes struct layout, it's possible that // some future version of the compiler will produce a different size. If // that happens, we can either disable this test, or test for multiple // expected values. For now, the purpose of this test is to make sure we // notice if that happens. assert_eq!(1880, core::mem::size_of::()); } #[test] fn test_counter_words() { let counter: u64 = (1 << 32) + 2; assert_eq!(crate::counter_low(counter), 2); assert_eq!(crate::counter_high(counter), 1); } #[test] fn test_largest_power_of_two_leq() { let input_output = &[ // The zero case is nonsensical, but it does work. 
(0, 1), (1, 1), (2, 2), (3, 2), (4, 4), (5, 4), (6, 4), (7, 4), (8, 8), // the largest possible usize (usize::MAX, (usize::MAX >> 1) + 1), ]; for &(input, output) in input_output { assert_eq!( output, crate::largest_power_of_two_leq(input), "wrong output for n={}", input ); } } #[test] fn test_compare_reference_impl() { const OUT: usize = 303; // more than 64, not a multiple of 4 let mut input_buf = [0; TEST_CASES_MAX]; paint_test_input(&mut input_buf); for &case in TEST_CASES { let input = &input_buf[..case]; #[cfg(feature = "std")] dbg!(case); // regular { let mut reference_hasher = reference_impl::Hasher::new(); reference_hasher.update(input); let mut expected_out = [0; OUT]; reference_hasher.finalize(&mut expected_out); // all at once let test_out = crate::hash(input); assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); // incremental let mut hasher = crate::Hasher::new(); hasher.update(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), test_out); // incremental (rayon) #[cfg(feature = "rayon")] { let mut hasher = crate::Hasher::new(); hasher.update_rayon(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), test_out); } // xof let mut extended = [0; OUT]; hasher.finalize_xof().fill(&mut extended); assert_eq!(extended, expected_out); } // keyed { let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); reference_hasher.update(input); let mut expected_out = [0; OUT]; reference_hasher.finalize(&mut expected_out); // all at once let test_out = crate::keyed_hash(&TEST_KEY, input); assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); // incremental let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); hasher.update(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), test_out); // incremental (rayon) #[cfg(feature = "rayon")] { let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); 
hasher.update_rayon(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), test_out); } // xof let mut extended = [0; OUT]; hasher.finalize_xof().fill(&mut extended); assert_eq!(extended, expected_out); } // derive_key { let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); reference_hasher.update(input); let mut expected_out = [0; OUT]; reference_hasher.finalize(&mut expected_out); // all at once let test_out = crate::derive_key(context, input); assert_eq!(test_out, expected_out[..32]); // incremental let mut hasher = crate::Hasher::new_derive_key(context); hasher.update(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); // incremental (rayon) #[cfg(feature = "rayon")] { let mut hasher = crate::Hasher::new_derive_key(context); hasher.update_rayon(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); } // xof let mut extended = [0; OUT]; hasher.finalize_xof().fill(&mut extended); assert_eq!(extended, expected_out); } } } #[test] fn test_compare_reference_impl_long_xof() { let mut reference_output = [0u8; 32 * BLOCK_LEN - 1]; let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); reference_hasher.update(b"hello world"); reference_hasher.finalize(&mut reference_output); let mut test_output = [0u8; 32 * BLOCK_LEN - 1]; let mut test_hasher = crate::Hasher::new_keyed(&TEST_KEY); test_hasher.update(b"hello world"); test_hasher.finalize_xof().fill(&mut test_output); assert_eq!(reference_output, test_output); } #[test] fn test_xof_partial_blocks() { const OUT_LEN: usize = 6 * BLOCK_LEN; let mut reference_out = [0u8; OUT_LEN]; reference_impl::Hasher::new().finalize(&mut reference_out); let mut all_at_once_out = [0u8; OUT_LEN]; 
crate::Hasher::new() .finalize_xof() .fill(&mut all_at_once_out); assert_eq!(reference_out, all_at_once_out); let mut partial_out = [0u8; OUT_LEN]; let partial_start = 32; let partial_end = OUT_LEN - 32; let mut xof = crate::Hasher::new().finalize_xof(); xof.fill(&mut partial_out[..partial_start]); xof.fill(&mut partial_out[partial_start..partial_end]); xof.fill(&mut partial_out[partial_end..]); assert_eq!(reference_out, partial_out); } fn reference_hash(input: &[u8]) -> crate::Hash { let mut hasher = reference_impl::Hasher::new(); hasher.update(input); let mut bytes = [0; 32]; hasher.finalize(&mut bytes); bytes.into() } #[test] fn test_compare_update_multiple() { // Don't use all the long test cases here, since that's unnecessarily slow // in debug mode. let mut short_test_cases = TEST_CASES; while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; } assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); let mut input_buf = [0; 2 * TEST_CASES_MAX]; paint_test_input(&mut input_buf); for &first_update in short_test_cases { #[cfg(feature = "std")] dbg!(first_update); let first_input = &input_buf[..first_update]; let mut test_hasher = crate::Hasher::new(); test_hasher.update(first_input); for &second_update in short_test_cases { #[cfg(feature = "std")] dbg!(second_update); let second_input = &input_buf[first_update..][..second_update]; let total_input = &input_buf[..first_update + second_update]; // Clone the hasher with first_update bytes already written, so // that the next iteration can reuse it. 
let mut test_hasher = test_hasher.clone(); test_hasher.update(second_input); let expected = reference_hash(total_input); assert_eq!(expected, test_hasher.finalize()); } } } #[test] fn test_fuzz_hasher() { const INPUT_MAX: usize = 4 * CHUNK_LEN; let mut input_buf = [0; 3 * INPUT_MAX]; paint_test_input(&mut input_buf); // Don't do too many iterations in debug mode, to keep the tests under a // second or so. CI should run tests in release mode also. Provide an // environment variable for specifying a larger number of fuzz iterations. let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; // Use a fixed RNG seed for reproducibility. let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); for _num_test in 0..num_tests { #[cfg(feature = "std")] dbg!(_num_test); let mut hasher = crate::Hasher::new(); let mut total_input = 0; // For each test, write 3 inputs of random length. for _ in 0..3 { let input_len = rng.random_range(0..(INPUT_MAX + 1)); #[cfg(feature = "std")] dbg!(input_len); let input = &input_buf[total_input..][..input_len]; hasher.update(input); total_input += input_len; } let expected = reference_hash(&input_buf[..total_input]); assert_eq!(expected, hasher.finalize()); } } #[test] fn test_fuzz_xof() { let mut input_buf = [0u8; 3 * BLOCK_LEN]; paint_test_input(&mut input_buf); // Don't do too many iterations in debug mode, to keep the tests under a // second or so. CI should run tests in release mode also. Provide an // environment variable for specifying a larger number of fuzz iterations. let num_tests = if cfg!(debug_assertions) { 100 } else { 2500 }; // Use a fixed RNG seed for reproducibility. 
let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); for _num_test in 0..num_tests { #[cfg(feature = "std")] dbg!(_num_test); // 31 (16 + 8 + 4 + 2 + 1) outputs let mut output_buf = [0; 31 * CHUNK_LEN]; let input_len = rng.random_range(0..input_buf.len()); let mut xof = crate::Hasher::new() .update(&input_buf[..input_len]) .finalize_xof(); let partial_start = rng.random_range(0..output_buf.len()); let partial_end = rng.random_range(partial_start..output_buf.len()); xof.fill(&mut output_buf[..partial_start]); xof.fill(&mut output_buf[partial_start..partial_end]); xof.fill(&mut output_buf[partial_end..]); let mut reference_buf = [0; 31 * CHUNK_LEN]; let mut reference_hasher = reference_impl::Hasher::new(); reference_hasher.update(&input_buf[..input_len]); reference_hasher.finalize(&mut reference_buf); assert_eq!(reference_buf, output_buf); } } #[test] fn test_xof_seek() { let mut out = [0; 533]; let mut hasher = crate::Hasher::new(); hasher.update(b"foo"); hasher.finalize_xof().fill(&mut out); assert_eq!(hasher.finalize().as_bytes(), &out[0..32]); let mut reader = hasher.finalize_xof(); reader.set_position(303); let mut out2 = [0; 102]; reader.fill(&mut out2); assert_eq!(&out[303..][..102], &out2[..]); #[cfg(feature = "std")] { use std::io::prelude::*; let mut reader = hasher.finalize_xof(); reader.seek(std::io::SeekFrom::Start(303)).unwrap(); let mut out3 = Vec::new(); reader.by_ref().take(102).read_to_end(&mut out3).unwrap(); assert_eq!(&out[303..][..102], &out3[..]); assert_eq!( reader.seek(std::io::SeekFrom::Current(0)).unwrap(), 303 + 102 ); reader.seek(std::io::SeekFrom::Current(-5)).unwrap(); assert_eq!( reader.seek(std::io::SeekFrom::Current(0)).unwrap(), 303 + 102 - 5 ); let mut out4 = [0; 17]; assert_eq!(reader.read(&mut out4).unwrap(), 17); assert_eq!(&out[303 + 102 - 5..][..17], &out4[..]); assert_eq!( reader.seek(std::io::SeekFrom::Current(0)).unwrap(), 303 + 102 - 5 + 17 ); assert!(reader.seek(std::io::SeekFrom::End(0)).is_err()); 
assert!(reader.seek(std::io::SeekFrom::Current(-1000)).is_err()); } } #[test] fn test_msg_schedule_permutation() { let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; let mut generated = [[0; 16]; 7]; generated[0] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; for round in 1..7 { for i in 0..16 { generated[round][i] = generated[round - 1][permutation[i]]; } } assert_eq!(generated, crate::MSG_SCHEDULE); } #[test] fn test_reset() { let mut hasher = crate::Hasher::new(); hasher.update(&[42; 3 * CHUNK_LEN + 7]); hasher.reset(); hasher.update(&[42; CHUNK_LEN + 3]); assert_eq!(hasher.finalize(), crate::hash(&[42; CHUNK_LEN + 3])); let key = &[99; crate::KEY_LEN]; let mut keyed_hasher = crate::Hasher::new_keyed(key); keyed_hasher.update(&[42; 3 * CHUNK_LEN + 7]); keyed_hasher.reset(); keyed_hasher.update(&[42; CHUNK_LEN + 3]); assert_eq!( keyed_hasher.finalize(), crate::keyed_hash(key, &[42; CHUNK_LEN + 3]), ); let context = "BLAKE3 2020-02-12 10:20:58 reset test"; let mut kdf = crate::Hasher::new_derive_key(context); kdf.update(&[42; 3 * CHUNK_LEN + 7]); kdf.reset(); kdf.update(&[42; CHUNK_LEN + 3]); let expected = crate::derive_key(context, &[42; CHUNK_LEN + 3]); assert_eq!(kdf.finalize(), expected); } #[test] fn test_hex_encoding_decoding() { let digest_str = "04e0bb39f30b1a3feb89f536c93be15055482df748674b00d26e5a75777702e9"; let mut hasher = crate::Hasher::new(); hasher.update(b"foo"); let digest = hasher.finalize(); assert_eq!(digest.to_hex().as_str(), digest_str); #[cfg(feature = "std")] assert_eq!(digest.to_string(), digest_str); // Test round trip let digest = crate::Hash::from_hex(digest_str).unwrap(); assert_eq!(digest.to_hex().as_str(), digest_str); // Test uppercase let digest = crate::Hash::from_hex(digest_str.to_uppercase()).unwrap(); assert_eq!(digest.to_hex().as_str(), digest_str); // Test string parsing via FromStr let digest: crate::Hash = digest_str.parse().unwrap(); assert_eq!(digest.to_hex().as_str(), digest_str); // 
Test errors let bad_len = "04e0bb39f30b1"; let _result = crate::Hash::from_hex(bad_len).unwrap_err(); #[cfg(feature = "std")] assert_eq!(_result.to_string(), "expected 64 hex bytes, received 13"); let bad_char = "Z4e0bb39f30b1a3feb89f536c93be15055482df748674b00d26e5a75777702e9"; let _result = crate::Hash::from_hex(bad_char).unwrap_err(); #[cfg(feature = "std")] assert_eq!(_result.to_string(), "invalid hex character: 'Z'"); let _result = crate::Hash::from_hex([128; 64]).unwrap_err(); #[cfg(feature = "std")] assert_eq!(_result.to_string(), "invalid hex character: 0x80"); } // This test is a mimized failure case for the Windows SSE2 bug described in // https://github.com/BLAKE3-team/BLAKE3/issues/206. // // Before that issue was fixed, this test would fail on Windows in the following configuration: // // cargo test --features=no_avx512,no_avx2,no_sse41 --release // // Bugs like this one (stomping on a caller's register) are very sensitive to the details of // surrounding code, so it's not especially likely that this test will catch another bug (or even // the same bug) in the future. Still, there's no harm in keeping it. #[test] fn test_issue_206_windows_sse2() { // This stupid loop has to be here to trigger the bug. I don't know why. for _ in &[0] { // The length 65 (two blocks) is significant. It doesn't repro with 64 (one block). It also // doesn't repro with an all-zero input. let input = &[0xff; 65]; let expected_hash = [ 183, 235, 50, 217, 156, 24, 190, 219, 2, 216, 176, 255, 224, 53, 28, 95, 57, 148, 179, 245, 162, 90, 37, 121, 0, 142, 219, 62, 234, 204, 225, 161, ]; // This throwaway call has to be here to trigger the bug. crate::Hasher::new().update(input); // This assert fails when the bug is triggered. 
assert_eq!(crate::Hasher::new().update(input).finalize(), expected_hash); } } #[test] fn test_hash_conversions() { let bytes1 = [42; 32]; let hash1: crate::Hash = bytes1.into(); let bytes2: [u8; 32] = hash1.into(); assert_eq!(bytes1, bytes2); let bytes3 = *hash1.as_bytes(); assert_eq!(bytes1, bytes3); let hash2 = crate::Hash::from_bytes(bytes1); assert_eq!(hash1, hash2); let hex = hash1.to_hex(); let hash3 = crate::Hash::from_hex(hex.as_bytes()).unwrap(); assert_eq!(hash1, hash3); let slice1: &[u8] = bytes1.as_slice(); let hash4 = crate::Hash::from_slice(slice1).expect("correct length"); assert_eq!(hash1, hash4); assert!(crate::Hash::from_slice(&[]).is_err()); assert!(crate::Hash::from_slice(&[42]).is_err()); assert!(crate::Hash::from_slice([42; 31].as_slice()).is_err()); assert!(crate::Hash::from_slice([42; 33].as_slice()).is_err()); assert!(crate::Hash::from_slice([42; 100].as_slice()).is_err()); } #[test] const fn test_hash_const_conversions() { let bytes = [42; 32]; let hash = crate::Hash::from_bytes(bytes); _ = hash.as_bytes(); } #[cfg(feature = "zeroize")] #[test] fn test_zeroize() { use zeroize::Zeroize; let mut hash = crate::Hash([42; 32]); hash.zeroize(); assert_eq!(hash.0, [0u8; 32]); let mut hasher = crate::Hasher { chunk_state: crate::ChunkState { cv: [42; 8], chunk_counter: 42, buf: [42; 64], buf_len: 42, blocks_compressed: 42, flags: 42, platform: crate::Platform::Portable, }, initial_chunk_counter: 42, key: [42; 8], cv_stack: [[42; 32]; { crate::MAX_DEPTH + 1 }].into(), }; hasher.zeroize(); assert_eq!(hasher.chunk_state.cv, [0; 8]); assert_eq!(hasher.chunk_state.chunk_counter, 0); assert_eq!(hasher.chunk_state.buf, [0; 64]); assert_eq!(hasher.chunk_state.buf_len, 0); assert_eq!(hasher.chunk_state.blocks_compressed, 0); assert_eq!(hasher.chunk_state.flags, 0); assert!(matches!( hasher.chunk_state.platform, crate::Platform::Portable )); assert_eq!(hasher.initial_chunk_counter, 0); assert_eq!(hasher.key, [0; 8]); assert_eq!(&*hasher.cv_stack, &[[0u8; 
32]; 0]); let mut output_reader = crate::OutputReader { inner: crate::Output { input_chaining_value: [42; 8], block: [42; 64], counter: 42, block_len: 42, flags: 42, platform: crate::Platform::Portable, }, position_within_block: 42, }; output_reader.zeroize(); assert_eq!(output_reader.inner.input_chaining_value, [0; 8]); assert_eq!(output_reader.inner.block, [0; 64]); assert_eq!(output_reader.inner.counter, 0); assert_eq!(output_reader.inner.block_len, 0); assert_eq!(output_reader.inner.flags, 0); assert!(matches!( output_reader.inner.platform, crate::Platform::Portable )); assert_eq!(output_reader.position_within_block, 0); } #[test] #[cfg(feature = "std")] fn test_update_reader() -> Result<(), std::io::Error> { // This is a brief test, since update_reader() is mostly a wrapper around update(), which already // has substantial testing. let mut input = vec![0; 1_000_000]; paint_test_input(&mut input); assert_eq!( crate::Hasher::new().update_reader(&input[..])?.finalize(), crate::hash(&input), ); Ok(()) } #[test] #[cfg(feature = "std")] fn test_update_reader_interrupted() -> std::io::Result<()> { use std::io; struct InterruptingReader<'a> { already_interrupted: bool, slice: &'a [u8], } impl<'a> InterruptingReader<'a> { fn new(slice: &'a [u8]) -> Self { Self { already_interrupted: false, slice, } } } impl<'a> io::Read for InterruptingReader<'a> { fn read(&mut self, buf: &mut [u8]) -> io::Result { if !self.already_interrupted { self.already_interrupted = true; return Err(io::Error::from(io::ErrorKind::Interrupted)); } let take = std::cmp::min(self.slice.len(), buf.len()); buf[..take].copy_from_slice(&self.slice[..take]); self.slice = &self.slice[take..]; Ok(take) } } let input = b"hello world"; let mut reader = InterruptingReader::new(input); let mut hasher = crate::Hasher::new(); hasher.update_reader(&mut reader)?; assert_eq!(hasher.finalize(), crate::hash(input)); Ok(()) } #[test] #[cfg(feature = "mmap")] // NamedTempFile isn't Miri-compatible #[cfg(not(miri))] fn 
test_mmap() -> Result<(), std::io::Error> { // This is a brief test, since update_mmap() is mostly a wrapper around update(), which already // has substantial testing. use std::io::prelude::*; let mut input = vec![0; 1_000_000]; paint_test_input(&mut input); let mut tempfile = tempfile::NamedTempFile::new()?; tempfile.write_all(&input)?; tempfile.flush()?; assert_eq!( crate::Hasher::new() .update_mmap(tempfile.path())? .finalize(), crate::hash(&input), ); Ok(()) } #[test] #[cfg(feature = "mmap")] #[cfg(target_os = "linux")] fn test_mmap_virtual_file() -> Result<(), std::io::Error> { // Virtual files like /proc/version can't be mmapped, because their contents don't actually // exist anywhere in memory. Make sure we fall back to regular file IO in these cases. // Currently this is handled with a length check, where the assumption is that virtual files // will always report length 0. If that assumption ever breaks, hopefully this test will catch // it. let virtual_filepath = "/proc/version"; let mut mmap_hasher = crate::Hasher::new(); // We'll fail right here if the fallback doesn't work. mmap_hasher.update_mmap(virtual_filepath)?; let mut read_hasher = crate::Hasher::new(); read_hasher.update_reader(std::fs::File::open(virtual_filepath)?)?; assert_eq!(mmap_hasher.finalize(), read_hasher.finalize()); Ok(()) } #[test] #[cfg(feature = "mmap")] #[cfg(feature = "rayon")] // NamedTempFile isn't Miri-compatible #[cfg(not(miri))] fn test_mmap_rayon() -> Result<(), std::io::Error> { // This is a brief test, since update_mmap_rayon() is mostly a wrapper around update_rayon(), // which already has substantial testing. use std::io::prelude::*; let mut input = vec![0; 1_000_000]; paint_test_input(&mut input); let mut tempfile = tempfile::NamedTempFile::new()?; tempfile.write_all(&input)?; tempfile.flush()?; assert_eq!( crate::Hasher::new() .update_mmap_rayon(tempfile.path())? 
.finalize(), crate::hash(&input), ); Ok(()) } #[test] #[cfg(feature = "std")] #[cfg(feature = "serde")] fn test_serde() { // Henrik suggested that we use 0xfe / 254 for byte test data instead of 0xff / 255, due to the // fact that 0xfe is not a well formed CBOR item. let hash: crate::Hash = [0xfe; 32].into(); let json = serde_json::to_string(&hash).unwrap(); assert_eq!( json, "[254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]", ); let hash2: crate::Hash = serde_json::from_str(&json).unwrap(); assert_eq!(hash, hash2); let mut cbor = Vec::::new(); ciborium::into_writer(&hash, &mut cbor).unwrap(); assert_eq!( cbor, [ 0x98, 0x20, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, ] ); let hash_from_cbor: crate::Hash = ciborium::from_reader(&cbor[..]).unwrap(); assert_eq!(hash_from_cbor, hash); // Version 1.5.2 of this crate changed the default serialization format to a bytestring // (instead of an array/list) to save bytes on the wire. That was a backwards compatibility // mistake for non-self-describing formats, and it's been reverted. Since some small number of // serialized bytestrings will probably exist forever in the wild, we shold test that we can // still deserialize these from self-describing formats. 
let bytestring_cbor: &[u8] = &[ 0x58, 0x20, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, ]; let hash_from_bytestring_cbor: crate::Hash = ciborium::from_reader(bytestring_cbor).unwrap(); assert_eq!(hash_from_bytestring_cbor, hash); } // `cargo +nightly miri test` currently works, but it takes forever, because some of our test // inputs are quite large. Most of our unsafe code is platform specific and incompatible with Miri // anyway, but we'd like it to be possible for callers to run their own tests under Miri, assuming // they don't use incompatible features like Rayon or mmap. This test should get reasonable // coverage of our public API without using any large inputs, so we can run it in CI and catch // obvious breaks. (For example, constant_time_eq is not compatible with Miri.) #[test] fn test_miri_smoketest() { let mut hasher = crate::Hasher::new_derive_key("Miri smoketest"); hasher.update(b"foo"); #[cfg(feature = "std")] hasher.update_reader(&b"bar"[..]).unwrap(); assert_eq!(hasher.finalize(), hasher.finalize()); let mut reader = hasher.finalize_xof(); reader.set_position(999999); reader.fill(&mut [0]); } // I had to move these tests out of the deprecated guts module, because leaving them there causes // an un-silenceable warning: https://github.com/rust-lang/rust/issues/47238 #[cfg(test)] #[allow(deprecated)] mod guts_tests { use crate::guts::*; #[test] fn test_chunk() { assert_eq!( crate::hash(b"foo"), ChunkState::new(0).update(b"foo").finalize(true) ); } #[test] fn test_parents() { let mut hasher = crate::Hasher::new(); let mut buf = [0; crate::CHUNK_LEN]; buf[0] = 'a' as u8; hasher.update(&buf); let chunk0_cv = ChunkState::new(0).update(&buf).finalize(false); buf[0] = 'b' as u8; hasher.update(&buf); let chunk1_cv = ChunkState::new(1).update(&buf).finalize(false); hasher.update(b"c"); let chunk2_cv = 
ChunkState::new(2).update(b"c").finalize(false); let parent = parent_cv(&chunk0_cv, &chunk1_cv, false); let root = parent_cv(&parent, &chunk2_cv, true); assert_eq!(hasher.finalize(), root); } } blake3-1.8.1/src/traits.rs000064400000000000000000000157751046102023000134560ustar 00000000000000//! Implementations of commonly used traits like `Digest` and `Mac` from the //! [`digest`](https://crates.io/crates/digest) crate. pub use digest; use crate::{Hasher, OutputReader}; use digest::crypto_common; use digest::generic_array::{typenum::U32, typenum::U64, GenericArray}; impl digest::HashMarker for Hasher {} impl digest::Update for Hasher { #[inline] fn update(&mut self, data: &[u8]) { self.update(data); } } impl digest::Reset for Hasher { #[inline] fn reset(&mut self) { self.reset(); // the inherent method } } impl digest::OutputSizeUser for Hasher { type OutputSize = U32; } impl digest::FixedOutput for Hasher { #[inline] fn finalize_into(self, out: &mut GenericArray) { out.copy_from_slice(self.finalize().as_bytes()); } } impl digest::FixedOutputReset for Hasher { #[inline] fn finalize_into_reset(&mut self, out: &mut GenericArray) { out.copy_from_slice(self.finalize().as_bytes()); self.reset(); } } impl digest::ExtendableOutput for Hasher { type Reader = OutputReader; #[inline] fn finalize_xof(self) -> Self::Reader { Hasher::finalize_xof(&self) } } impl digest::ExtendableOutputReset for Hasher { #[inline] fn finalize_xof_reset(&mut self) -> Self::Reader { let reader = Hasher::finalize_xof(self); self.reset(); reader } } impl digest::XofReader for OutputReader { #[inline] fn read(&mut self, buffer: &mut [u8]) { self.fill(buffer); } } impl crypto_common::KeySizeUser for Hasher { type KeySize = U32; } impl crypto_common::BlockSizeUser for Hasher { type BlockSize = U64; } impl digest::MacMarker for Hasher {} impl digest::KeyInit for Hasher { #[inline] fn new(key: &digest::Key) -> Self { let key_bytes: [u8; 32] = (*key).into(); Hasher::new_keyed(&key_bytes) } } #[cfg(test)] 
mod test { use super::*; #[test] fn test_digest_traits() { // Inherent methods. let mut hasher1 = crate::Hasher::new(); hasher1.update(b"foo"); hasher1.update(b"bar"); hasher1.update(b"baz"); let out1 = hasher1.finalize(); let mut xof1 = [0; 301]; hasher1.finalize_xof().fill(&mut xof1); assert_eq!(out1.as_bytes(), &xof1[..32]); // Trait implementations. let mut hasher2: crate::Hasher = digest::Digest::new(); digest::Digest::update(&mut hasher2, b"xxx"); digest::Digest::reset(&mut hasher2); digest::Digest::update(&mut hasher2, b"foo"); digest::Digest::update(&mut hasher2, b"bar"); digest::Digest::update(&mut hasher2, b"baz"); let out2 = digest::Digest::finalize(hasher2.clone()); let mut xof2 = [0; 301]; digest::XofReader::read( &mut digest::ExtendableOutput::finalize_xof(hasher2.clone()), &mut xof2, ); assert_eq!(out1.as_bytes(), &out2[..]); assert_eq!(xof1[..], xof2[..]); // Again with the resetting variants. let mut hasher3: crate::Hasher = digest::Digest::new(); digest::Digest::update(&mut hasher3, b"foobarbaz"); let mut out3 = [0; 32]; digest::FixedOutputReset::finalize_into_reset( &mut hasher3, GenericArray::from_mut_slice(&mut out3), ); digest::Digest::update(&mut hasher3, b"foobarbaz"); let mut out4 = [0; 32]; digest::FixedOutputReset::finalize_into_reset( &mut hasher3, GenericArray::from_mut_slice(&mut out4), ); digest::Digest::update(&mut hasher3, b"foobarbaz"); let mut xof3 = [0; 301]; digest::XofReader::read( &mut digest::ExtendableOutputReset::finalize_xof_reset(&mut hasher3), &mut xof3, ); digest::Digest::update(&mut hasher3, b"foobarbaz"); let mut xof4 = [0; 301]; digest::XofReader::read( &mut digest::ExtendableOutputReset::finalize_xof_reset(&mut hasher3), &mut xof4, ); assert_eq!(out1.as_bytes(), &out3[..]); assert_eq!(out1.as_bytes(), &out4[..]); assert_eq!(xof1[..], xof3[..]); assert_eq!(xof1[..], xof4[..]); } #[test] fn test_mac_trait() { // Inherent methods. 
let key = b"some super secret key bytes fooo"; let mut hasher1 = crate::Hasher::new_keyed(key); hasher1.update(b"foo"); hasher1.update(b"bar"); hasher1.update(b"baz"); let out1 = hasher1.finalize(); // Trait implementation. let generic_key = (*key).into(); let mut hasher2: crate::Hasher = digest::Mac::new(&generic_key); digest::Mac::update(&mut hasher2, b"xxx"); digest::Mac::reset(&mut hasher2); digest::Mac::update(&mut hasher2, b"foo"); digest::Mac::update(&mut hasher2, b"bar"); digest::Mac::update(&mut hasher2, b"baz"); let out2 = digest::Mac::finalize(hasher2); assert_eq!(out1.as_bytes(), out2.into_bytes().as_slice()); } fn expected_hmac_blake3(key: &[u8], input: &[u8]) -> [u8; 32] { // See https://en.wikipedia.org/wiki/HMAC. let key_hash; let key_prime = if key.len() <= 64 { key } else { key_hash = *crate::hash(key).as_bytes(); &key_hash }; let mut ipad = [0x36; 64]; let mut opad = [0x5c; 64]; for i in 0..key_prime.len() { ipad[i] ^= key_prime[i]; opad[i] ^= key_prime[i]; } let mut inner_state = crate::Hasher::new(); inner_state.update(&ipad); inner_state.update(input); let mut outer_state = crate::Hasher::new(); outer_state.update(&opad); outer_state.update(inner_state.finalize().as_bytes()); outer_state.finalize().into() } #[test] fn test_hmac_compatibility() { use hmac::{Mac, SimpleHmac}; // Test a short key. let mut x = SimpleHmac::::new_from_slice(b"key").unwrap(); hmac::digest::Update::update(&mut x, b"data"); let output = x.finalize().into_bytes(); assert_ne!(output.len(), 0); let expected = expected_hmac_blake3(b"key", b"data"); assert_eq!(expected, output.as_ref()); // Test a range of key and data lengths, particularly to exercise the long-key logic. 
let mut input_bytes = [0; crate::test::TEST_CASES_MAX]; crate::test::paint_test_input(&mut input_bytes); for &input_len in crate::test::TEST_CASES { #[cfg(feature = "std")] dbg!(input_len); let input = &input_bytes[..input_len]; let mut x = SimpleHmac::::new_from_slice(input).unwrap(); hmac::digest::Update::update(&mut x, input); let output = x.finalize().into_bytes(); assert_ne!(output.len(), 0); let expected = expected_hmac_blake3(input, input); assert_eq!(expected, output.as_ref()); } } } blake3-1.8.1/src/wasm32_simd.rs000064400000000000000000000604151046102023000142670ustar 00000000000000/* * This code is based on rust_sse2.rs of the same distribution, and is subject to further improvements. * Some comments are left intact even if their applicability is questioned. * * Performance measurements with a primitive benchmark with ~16Kb of data: * * | M1 native | 11,610 ns | * | M1 Wasm SIMD | 13,355 ns | * | M1 Wasm | 22,037 ns | * | x64 native | 6,713 ns | * | x64 Wasm SIMD | 11,985 ns | * | x64 Wasm | 25,978 ns | * * wasmtime v12.0.1 was used on both platforms. */ use core::arch::wasm32::*; use crate::{ counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, }; use arrayref::{array_mut_ref, array_ref, mut_array_refs}; pub const DEGREE: usize = 4; #[inline(always)] unsafe fn loadu(src: *const u8) -> v128 { // This is an unaligned load, so the pointer cast is allowed. v128_load(src as *const v128) } #[inline(always)] unsafe fn storeu(src: v128, dest: *mut u8) { // This is an unaligned store, so the pointer cast is allowed. 
v128_store(dest as *mut v128, src) } #[inline(always)] fn add(a: v128, b: v128) -> v128 { i32x4_add(a, b) } #[inline(always)] fn xor(a: v128, b: v128) -> v128 { v128_xor(a, b) } #[inline(always)] fn set1(x: u32) -> v128 { i32x4_splat(x as i32) } #[inline(always)] fn set4(a: u32, b: u32, c: u32, d: u32) -> v128 { i32x4(a as i32, b as i32, c as i32, d as i32) } // These rotations are the "simple/shifts version". For the // "complicated/shuffles version", see // https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. // For a discussion of the tradeoffs, see // https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug // (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better // on recent x86 chips. #[inline(always)] fn rot16(a: v128) -> v128 { v128_or(u32x4_shr(a, 16), u32x4_shl(a, 32 - 16)) } #[inline(always)] fn rot12(a: v128) -> v128 { v128_or(u32x4_shr(a, 12), u32x4_shl(a, 32 - 12)) } #[inline(always)] fn rot8(a: v128) -> v128 { v128_or(u32x4_shr(a, 8), u32x4_shl(a, 32 - 8)) } #[inline(always)] fn rot7(a: v128) -> v128 { v128_or(u32x4_shr(a, 7), u32x4_shl(a, 32 - 7)) } #[inline(always)] fn g1(row0: &mut v128, row1: &mut v128, row2: &mut v128, row3: &mut v128, m: v128) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot16(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot12(*row1); } #[inline(always)] fn g2(row0: &mut v128, row1: &mut v128, row2: &mut v128, row3: &mut v128, m: v128) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot8(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot7(*row1); } // It could be a function, but artimetics in const generics is too limited yet. macro_rules! 
shuffle { ($a: expr, $b: expr, $z:expr, $y:expr, $x:expr, $w:expr) => { i32x4_shuffle::<{ $w }, { $x }, { $y + 4 }, { $z + 4 }>($a, $b) }; } #[inline(always)] fn unpacklo_epi64(a: v128, b: v128) -> v128 { i64x2_shuffle::<0, 2>(a, b) } #[inline(always)] fn unpackhi_epi64(a: v128, b: v128) -> v128 { i64x2_shuffle::<1, 3>(a, b) } #[inline(always)] fn unpacklo_epi32(a: v128, b: v128) -> v128 { i32x4_shuffle::<0, 4, 1, 5>(a, b) } #[inline(always)] fn unpackhi_epi32(a: v128, b: v128) -> v128 { i32x4_shuffle::<2, 6, 3, 7>(a, b) } #[inline(always)] fn shuffle_epi32( a: v128, ) -> v128 { // Please note that generic arguments in delcaration and imlementation are in // different order. // second arg is actually ignored. i32x4_shuffle::(a, a) } #[inline(always)] fn blend_epi16(a: v128, b: v128, imm8: i32) -> v128 { // imm8 is always constant; it allows to implement this function with // i16x8_shuffle. However, it is marginally slower on x64. let bits = i16x8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80); let mut mask = i16x8_splat(imm8 as i16); mask = v128_and(mask, bits); mask = i16x8_eq(mask, bits); // The swapped argument order is equivalent to mask negation. v128_bitselect(b, a, mask) } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. All the message loads below are adjusted to compensate for this. 
See // discussion at https://github.com/sneves/blake2-avx2/pull/4 #[inline(always)] fn diagonalize(row0: &mut v128, row2: &mut v128, row3: &mut v128) { *row0 = shuffle_epi32::<2, 1, 0, 3>(*row0); *row3 = shuffle_epi32::<1, 0, 3, 2>(*row3); *row2 = shuffle_epi32::<0, 3, 2, 1>(*row2); } #[inline(always)] fn undiagonalize(row0: &mut v128, row2: &mut v128, row3: &mut v128) { *row0 = shuffle_epi32::<0, 3, 2, 1>(*row0); *row3 = shuffle_epi32::<1, 0, 3, 2>(*row3); *row2 = shuffle_epi32::<2, 1, 0, 3>(*row2); } #[inline(always)] fn compress_pre( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [v128; 4] { // safe because CVWords is [u32; 8] let row0 = &mut unsafe { loadu(cv.as_ptr().add(0) as *const u8) }; let row1 = &mut unsafe { loadu(cv.as_ptr().add(4) as *const u8) }; let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); let row3 = &mut set4( counter_low(counter), counter_high(counter), block_len as u32, flags as u32, ); // safe because block is &[u8; 64] let mut m0 = unsafe { loadu(block.as_ptr().add(0 * 4 * DEGREE)) }; let mut m1 = unsafe { loadu(block.as_ptr().add(1 * 4 * DEGREE)) }; let mut m2 = unsafe { loadu(block.as_ptr().add(2 * 4 * DEGREE)) }; let mut m3 = unsafe { loadu(block.as_ptr().add(3 * 4 * DEGREE)) }; let mut t0; let mut t1; let mut t2; let mut t3; let mut tt; // Round 1. The first round permutes the message words from the original // input order, into the groups that get mixed in parallel. t0 = shuffle!(m0, m1, 2, 0, 2, 0); // 6 4 2 0 g1(row0, row1, row2, row3, t0); t1 = shuffle!(m0, m1, 3, 1, 3, 1); // 7 5 3 1 g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = shuffle!(m2, m3, 2, 0, 2, 0); // 14 12 10 8 t2 = shuffle_epi32::<2, 1, 0, 3>(t2); // 12 10 8 14 g1(row0, row1, row2, row3, t2); t3 = shuffle!(m2, m3, 3, 1, 3, 1); // 15 13 11 9 t3 = shuffle_epi32::<2, 1, 0, 3>(t3); // 13 11 9 15 g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 2. 
This round and all following rounds apply a fixed permutation // to the message words from the round before. t0 = shuffle!(m0, m1, 3, 1, 1, 2); t0 = shuffle_epi32::<0, 3, 2, 1>(t0); g1(row0, row1, row2, row3, t0); t1 = shuffle!(m2, m3, 3, 3, 2, 2); tt = shuffle_epi32::<0, 0, 3, 3>(m0); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = shuffle_epi32::<1, 3, 2, 0>(tt); g1(row0, row1, row2, row3, t2); t3 = unpackhi_epi32(m1, m3); tt = unpacklo_epi32(m2, t3); t3 = shuffle_epi32::<0, 1, 3, 2>(tt); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 3 t0 = shuffle!(m0, m1, 3, 1, 1, 2); t0 = shuffle_epi32::<0, 3, 2, 1>(t0); g1(row0, row1, row2, row3, t0); t1 = shuffle!(m2, m3, 3, 3, 2, 2); tt = shuffle_epi32::<0, 0, 3, 3>(m0); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = shuffle_epi32::<1, 3, 2, 0>(tt); g1(row0, row1, row2, row3, t2); t3 = unpackhi_epi32(m1, m3); tt = unpacklo_epi32(m2, t3); t3 = shuffle_epi32::<0, 1, 3, 2>(tt); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 4 t0 = shuffle!(m0, m1, 3, 1, 1, 2); t0 = shuffle_epi32::<0, 3, 2, 1>(t0); g1(row0, row1, row2, row3, t0); t1 = shuffle!(m2, m3, 3, 3, 2, 2); tt = shuffle_epi32::<0, 0, 3, 3>(m0); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = shuffle_epi32::<1, 3, 2, 0>(tt); g1(row0, row1, row2, row3, t2); t3 = unpackhi_epi32(m1, m3); tt = unpacklo_epi32(m2, t3); t3 = shuffle_epi32::<0, 1, 3, 2>(tt); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 5 t0 = shuffle!(m0, m1, 3, 1, 1, 2); t0 = 
shuffle_epi32::<0, 3, 2, 1>(t0); g1(row0, row1, row2, row3, t0); t1 = shuffle!(m2, m3, 3, 3, 2, 2); tt = shuffle_epi32::<0, 0, 3, 3>(m0); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = shuffle_epi32::<1, 3, 2, 0>(tt); g1(row0, row1, row2, row3, t2); t3 = unpackhi_epi32(m1, m3); tt = unpacklo_epi32(m2, t3); t3 = shuffle_epi32::<0, 1, 3, 2>(tt); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 6 t0 = shuffle!(m0, m1, 3, 1, 1, 2); t0 = shuffle_epi32::<0, 3, 2, 1>(t0); g1(row0, row1, row2, row3, t0); t1 = shuffle!(m2, m3, 3, 3, 2, 2); tt = shuffle_epi32::<0, 0, 3, 3>(m0); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = shuffle_epi32::<1, 3, 2, 0>(tt); g1(row0, row1, row2, row3, t2); t3 = unpackhi_epi32(m1, m3); tt = unpacklo_epi32(m2, t3); t3 = shuffle_epi32::<0, 1, 3, 2>(tt); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 7 t0 = shuffle!(m0, m1, 3, 1, 1, 2); t0 = shuffle_epi32::<0, 3, 2, 1>(t0); g1(row0, row1, row2, row3, t0); t1 = shuffle!(m2, m3, 3, 3, 2, 2); tt = shuffle_epi32::<0, 0, 3, 3>(m0); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = shuffle_epi32::<1, 3, 2, 0>(tt); g1(row0, row1, row2, row3, t2); t3 = unpackhi_epi32(m1, m3); tt = unpacklo_epi32(m2, t3); t3 = shuffle_epi32::<0, 1, 3, 2>(tt); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); [*row0, *row1, *row2, *row3] } #[target_feature(enable = "simd128")] pub fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, 
counter, flags); // it stores in reversed order... // safe because CVWords is [u32; 8] unsafe { storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); } } #[target_feature(enable = "simd128")] pub fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { let [mut row0, mut row1, mut row2, mut row3] = compress_pre(cv, block, block_len, counter, flags); row0 = xor(row0, row2); row1 = xor(row1, row3); // safe because CVWords is [u32; 8] row2 = xor(row2, unsafe { loadu(cv.as_ptr().add(0) as *const u8) }); row3 = xor(row3, unsafe { loadu(cv.as_ptr().add(4) as *const u8) }); // It seems to be architecture dependent, but works. // safe because sizes match, and every state of u8 is valid. unsafe { core::mem::transmute([row0, row1, row2, row3]) } } #[inline(always)] fn round(v: &mut [v128; 16], m: &[v128; 16], r: usize) { v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = 
// NOTE(review): continuation of `round` — the assignment to v[3] spans the
// chunk boundary above.
add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[15] = rot8(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot7(v[4]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);

    // Second (diagonal) half of the round: note the rotated index patterns
    // (e.g. v[0] mixes with v[5], v[15]) that address the state diagonals.
    v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot16(v[15]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[4] = rot12(v[4]);
    // Second message injection of the diagonal half.
    v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot8(v[15]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] =
// NOTE(review): tail of `round` — the assignment to v[7] spans the chunk
// boundary above.
rot7(v[7]);
    v[4] = rot7(v[4]);
}

/// Transpose a 4x4 matrix of u32 lanes held in four v128 vectors, in place.
#[inline(always)]
fn transpose_vecs(vecs: &mut [v128; DEGREE]) {
    // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
    // 22/33. Note that this doesn't split the vector into two lanes, as the
    // AVX2 counterparts do.
    let ab_01 = unpacklo_epi32(vecs[0], vecs[1]);
    let ab_23 = unpackhi_epi32(vecs[0], vecs[1]);
    let cd_01 = unpacklo_epi32(vecs[2], vecs[3]);
    let cd_23 = unpackhi_epi32(vecs[2], vecs[3]);
    // Interleave 64-bit lanes.
    let abcd_0 = unpacklo_epi64(ab_01, cd_01);
    let abcd_1 = unpackhi_epi64(ab_01, cd_01);
    let abcd_2 = unpacklo_epi64(ab_23, cd_23);
    let abcd_3 = unpackhi_epi64(ab_23, cd_23);
    vecs[0] = abcd_0;
    vecs[1] = abcd_1;
    vecs[2] = abcd_2;
    vecs[3] = abcd_3;
}

/// Load one 64-byte message block from each of the four inputs (at
/// `block_offset`) and transpose them into 16 vectors, each holding the same
/// message word from all four inputs.
///
/// Safety: each pointer in `inputs` must be readable for
/// `block_offset + BLOCK_LEN` bytes.
#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [v128; 16] {
    let mut vecs = [
        loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
    ];
    // Transpose each 4x4 quarter of the 16-vector block independently.
    let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    transpose_vecs(squares.2);
    transpose_vecs(squares.3);
    vecs
}

/// Build the per-lane low/high counter word vectors for the four lanes.
/// When `increment_counter` is off, the all-zero mask makes every lane use
/// the same counter value.
#[inline(always)]
fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (v128,
// NOTE(review): continuation of `load_counters` — the return type spans the
// chunk boundary above.
v128) {
    // All-ones mask when per-lane increments are requested, else zero;
    // `mask & i` then yields the lane increment (0..=3) or always 0.
    let mask = if increment_counter.yes() { !0 } else { 0 };
    (
        set4(
            counter_low(counter + (mask & 0)),
            counter_low(counter + (mask & 1)),
            counter_low(counter + (mask & 2)),
            counter_low(counter + (mask & 3)),
        ),
        set4(
            counter_high(counter + (mask & 0)),
            counter_high(counter + (mask & 1)),
            counter_high(counter + (mask & 2)),
            counter_high(counter + (mask & 3)),
        ),
    )
}

/// Hash `blocks` full 64-byte blocks from each of four inputs in parallel,
/// writing the four 32-byte chaining values to `out`.
///
/// Safety: each pointer in `inputs` must be readable for
/// `blocks * BLOCK_LEN` bytes.
#[target_feature(enable = "simd128")]
pub unsafe fn hash4(
    inputs: &[*const u8; DEGREE],
    blocks: usize,
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8; DEGREE * OUT_LEN],
) {
    // Broadcast each key word across all four lanes.
    let mut h_vecs = [
        set1(key[0]),
        set1(key[1]),
        set1(key[2]),
        set1(key[3]),
        set1(key[4]),
        set1(key[5]),
        set1(key[6]),
        set1(key[7]),
    ];
    let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
    let mut block_flags = flags | flags_start;

    for block in 0..blocks {
        if block + 1 == blocks {
            block_flags |= flags_end;
        }
        let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
        let block_flags_vec = set1(block_flags as u32);
        let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);

        // The transposed compression function. Note that inlining this
        // manually here improves compile times by a lot, compared to factoring
        // it out into its own function and making it #[inline(always)]. Just
        // guessing, it might have something to do with loop unrolling.
// NOTE(review): continuation of the per-block loop in `hash4`.
let mut v = [
            h_vecs[0],
            h_vecs[1],
            h_vecs[2],
            h_vecs[3],
            h_vecs[4],
            h_vecs[5],
            h_vecs[6],
            h_vecs[7],
            set1(IV[0]),
            set1(IV[1]),
            set1(IV[2]),
            set1(IV[3]),
            counter_low_vec,
            counter_high_vec,
            block_len_vec,
            block_flags_vec,
        ];
        round(&mut v, &msg_vecs, 0);
        round(&mut v, &msg_vecs, 1);
        round(&mut v, &msg_vecs, 2);
        round(&mut v, &msg_vecs, 3);
        round(&mut v, &msg_vecs, 4);
        round(&mut v, &msg_vecs, 5);
        round(&mut v, &msg_vecs, 6);
        // Fold the two state halves into the next chaining value.
        h_vecs[0] = xor(v[0], v[8]);
        h_vecs[1] = xor(v[1], v[9]);
        h_vecs[2] = xor(v[2], v[10]);
        h_vecs[3] = xor(v[3], v[11]);
        h_vecs[4] = xor(v[4], v[12]);
        h_vecs[5] = xor(v[5], v[13]);
        h_vecs[6] = xor(v[6], v[14]);
        h_vecs[7] = xor(v[7], v[15]);
        // Only the first block carries flags_start.
        block_flags = flags;
    }

    // Un-transpose back to one contiguous 32-byte output per input.
    let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    // The first four vecs now contain the first half of each output, and the
    // second four vecs contain the second half of each output.
    storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
    storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE));
    storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE));
    storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE));
    storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE));
    storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE));
    storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE));
    storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
}

/// Serial fallback: hash all the blocks of a single input into one chaining
/// value. `N` must be a whole number of 64-byte blocks.
#[target_feature(enable = "simd128")]
unsafe fn hash1(
    input: &[u8; N],
    key: &CVWords,
    counter: u64,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut CVBytes,
) {
    debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks");
    let mut cv = *key;
    let mut block_flags = flags | flags_start;
    let mut slice = &input[..];
    while slice.len() >= BLOCK_LEN {
        if slice.len() == BLOCK_LEN {
            // Last block of this chunk carries flags_end.
            block_flags |= flags_end;
        }
        compress_in_place(
            &mut cv,
            array_ref!(slice, 0, BLOCK_LEN),
            BLOCK_LEN as u8,
            counter,
            block_flags,
        );
        block_flags = flags;
        slice = &slice[BLOCK_LEN..];
    }
    *out = core::mem::transmute(cv);
}
/// Hash many equal-length inputs: process them four at a time with `hash4`,
/// then fall back to `hash1` for the remainder, writing OUT_LEN bytes per
/// input into `out`.
///
/// Safety: `out` must hold at least `inputs.len() * OUT_LEN` bytes (also
/// checked by the debug_assert below).
#[target_feature(enable = "simd128")]
pub unsafe fn hash_many(
    mut inputs: &[&[u8; N]],
    key: &CVWords,
    mut counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    mut out: &mut [u8],
) {
    debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
    while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
        // Safe because the layout of arrays is guaranteed, and because the
        // `blocks` count is determined statically from the argument type.
        let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
        let blocks = N / BLOCK_LEN;
        hash4(
            input_ptrs,
            blocks,
            key,
            counter,
            increment_counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(out, 0, DEGREE * OUT_LEN),
        );
        if increment_counter.yes() {
            counter += DEGREE as u64;
        }
        inputs = &inputs[DEGREE..];
        out = &mut out[DEGREE * OUT_LEN..];
    }
    // Serial tail for the final 1..DEGREE inputs.
    for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
        hash1(
            input,
            key,
            counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(output, 0, OUT_LEN),
        );
        if increment_counter.yes() {
            counter += 1;
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    // Round-trips a known matrix through transpose_vecs and checks that rows
    // and columns swapped.
    #[test]
    fn test_transpose() {
        #[target_feature(enable = "simd128")]
        fn transpose_wrapper(vecs: &mut [v128; DEGREE]) {
            transpose_vecs(vecs);
        }

        let mut matrix = [[0 as u32; DEGREE]; DEGREE];
        for i in 0..DEGREE {
            for j in 0..DEGREE {
                matrix[i][j] = (i * DEGREE + j) as u32;
            }
        }

        unsafe {
            let mut vecs: [v128; DEGREE] = core::mem::transmute(matrix);
            transpose_wrapper(&mut vecs);
            matrix = core::mem::transmute(vecs);
        }

        for i in 0..DEGREE {
            for j in 0..DEGREE {
                // Reversed indexes from above.
assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
            }
        }
    }

    // Cross-checks this backend's compression functions against the portable
    // reference implementation in crate::test.
    #[test]
    fn test_compress() {
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }

    // Cross-checks hash_many (used here for both parallel and serial roles)
    // against the portable reference implementation in crate::test.
    #[test]
    fn test_hash_many() {
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}
blake3-1.8.1/tools/release.md000064400000000000000000000012601046102023000140750ustar 00000000000000# Release checklist
- Make sure `cargo outdated -R` is clean in the root and in b3sum/.
- Bump the version in the root Cargo.toml.
- Bump the version in b3sum/Cargo.toml.
  - Bump the dependency version too, if new features are used.
- Delete b3sum/Cargo.lock and recreate it with `cargo build` or similar.
- Update the `-h` output in b3sum/README.md if it's changed.
- Bump `BLAKE3_VERSION_STRING` in c/blake3.h.
- Bump `VERSION` in c/CMakeLists.txt.
- Make a version bump commit with change notes.
- `git push` and make sure CI is green.
- `git tag` the version bump commit with the new version number.
- `git push --tags`
- `cargo publish` in the root.
- `cargo publish` in b3sum/.