matrixmultiply-0.3.9/.cargo_vcs_info.json0000644000000001360000000000100141510ustar { "git": { "sha1": "bb3dd0be9d8a9f592e567e4402950546bf710eba" }, "path_in_vcs": "" }matrixmultiply-0.3.9/.github/workflows/ci.yml000064400000000000000000000127631046102023000174650ustar 00000000000000on: push: branches: [ master ] pull_request: branches: [ master ] name: Continuous integration env: CARGO_TERM_COLOR: always CARGO_INCREMENTAL: 0 MATMUL_NUM_THREADS: 4 RUST_BACKTRACE: full jobs: tests: runs-on: ${{ matrix.os }} continue-on-error: ${{ matrix.experimental }} strategy: matrix: include: - rust: 1.41.1 # MSRV experimental: false os: ubuntu-latest target: x86_64-unknown-linux-gnu features: cgemm - rust: stable experimental: false os: ubuntu-latest target: x86_64-unknown-linux-gnu features: threading cgemm test_examples: yes_examples test_benchmark: yes_bench - rust: nightly experimental: false os: ubuntu-latest target: x86_64-unknown-linux-gnu mmtest_feature: avx - rust: nightly os: ubuntu-latest target: x86_64-unknown-linux-gnu features: threading cgemm mmtest_feature: fma experimental: false - rust: nightly os: ubuntu-latest target: i686-unknown-linux-gnu features: cgemm install_deps: | sudo apt-get update sudo apt-get install -y gcc-multilib experimental: false - rust: stable experimental: false os: macos-latest target: x86_64-apple-darwin features: threading cgemm test_examples: yes_examples name: tests/${{ matrix.target }}/${{ matrix.rust }} steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: toolchain: ${{ matrix.rust }} targets: ${{ matrix.target }} - name: Install dependencies if: matrix.install_deps run: ${{ matrix.install_deps }} - name: Tests run: | rustc -C target-cpu=native --print cfg cargo build -v --features "${{ matrix.features }}" --target "${{ matrix.target }}" cargo test -v --tests --lib --no-fail-fast --features "${{ matrix.features }}" --target "${{ matrix.target }}" cargo test -v --tests --lib --release --no-fail-fast --features "${{ matrix.features }}" --target "${{ matrix.target }}" - name: Test examples if: matrix.test_examples run: | cargo test -v --examples --features "${{ matrix.features }}" --target "${{ matrix.target }}" - name: Test benchmark if: matrix.test_benchmark run: | cargo bench --no-run -v --features "${{ matrix.features }}" --target "${{ matrix.target }}" python3 ./benches/benchloop.py -t f32 f64 c32 c64 --mc 32 -s 32 64 | tee bench.csv cat bench.csv - name: Test specific feature if: matrix.mmtest_feature env: MMTEST_FEATURE: ${{ matrix.mmtest_feature }} MMTEST_ENSUREFEATURE: 1 run: | cargo test -v --no-fail-fast nostd-build: runs-on: ubuntu-latest continue-on-error: ${{ matrix.experimental }} strategy: matrix: include: - rust: 1.41.1 # MSRV experimental: false target: thumbv6m-none-eabi - rust: stable experimental: false target: thumbv6m-none-eabi name: nostd-build/${{ matrix.target }}/${{ matrix.rust }} steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: toolchain: ${{ matrix.rust }} targets: ${{ matrix.target }} - name: Tests run: | cargo rustc "--target=${{ matrix.target }}" --manifest-path=ensure_no_std/Cargo.toml cross_test: runs-on: ubuntu-latest strategy: matrix: include: - rust: stable target: s390x-unknown-linux-gnu features: constconf cgemm threading - rust: stable target: aarch64-unknown-linux-gnu features: constconf cgemm threading - rust: 1.65.0 target: aarch64-unknown-linux-gnu features: cgemm name: cross_test/${{ matrix.target }}/${{ matrix.rust }} steps: - uses: actions/checkout@v4 - uses: 
dtolnay/rust-toolchain@stable with: profile: minimal targets: ${{ matrix.target }} - name: Cache cargo plugins id: cache uses: actions/cache@v1 with: path: ~/.cargo/bin/ key: ${{ runner.os }}-cargo-plugins - name: Install cross if: steps.cache.outputs.cache-hit != 'true' run: cargo install cross - name: Tests run: cross test --target "${{ matrix.target }}" --features "${{ matrix.features }}" env: MMTEST_FAST_TEST: 1 - name: Tests (Release) run: cross test --release --target "${{ matrix.target }}" --features "${{ matrix.features }}" env: MMTEST_FAST_TEST: 1 cargo-careful: runs-on: ubuntu-latest name: cargo-careful steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: toolchain: nightly - uses: Swatinem/rust-cache@v2 - name: Install cargo-careful run: cargo install cargo-careful - run: cargo careful test -Zcareful-sanitizer=thread --features=threading,cgemm miri: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Miri run: ci/miri.sh --features cgemm matrixmultiply-0.3.9/.gitignore000064400000000000000000000000231046102023000147240ustar 00000000000000Cargo.lock target/ matrixmultiply-0.3.9/Cargo.lock0000644000000057210000000000100121310ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bencher" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dfdb4953a096c551ce9ace855a604d702e6e62d77fac690575ae347571717f5" [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "crossbeam-channel" version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c02a4d71819009c192cf4872265391563fd6a84c81ff2c0f2a7026ca4c1d85c" dependencies = [ "cfg-if", "crossbeam-utils", ] [[package]] name = "crossbeam-utils" version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ff1f980957787286a554052d03c7aee98d99cc32e09f6d45f0a814133c87978" dependencies = [ "cfg-if", "once_cell", ] [[package]] name = "either" version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" [[package]] name = "hermit-abi" version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" dependencies = [ "libc", ] [[package]] name = "itertools" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484" dependencies = [ "either", ] [[package]] name = "libc" version = "0.2.126" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" [[package]] name = "matrixmultiply" version = "0.3.9" dependencies = [ "autocfg", "bencher", "itertools", "num_cpus", "once_cell", "rawpointer", "thread-tree", ] [[package]] name = "num_cpus" version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ 
"hermit-abi", "libc", ] [[package]] name = "once_cell" version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7709cef83f0c1f58f666e746a08b21e0085f7440fa6a29cc194d68aac97a4225" [[package]] name = "rawpointer" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" [[package]] name = "thread-tree" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffbd370cb847953a25954d9f63e14824a36113f8c72eecf6eccef5dc4b45d630" dependencies = [ "crossbeam-channel", ] matrixmultiply-0.3.9/Cargo.toml0000644000000044510000000000100121530ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2018" name = "matrixmultiply" version = "0.3.9" authors = [ "bluss", "R. Janis Goldschmidt", ] build = "build.rs" exclude = ["docs/*"] autobins = false autoexamples = false autotests = false autobenches = false description = """ General matrix multiplication for f32 and f64 matrices. Operates on matrices with general layout (they can use arbitrary row and column stride). Detects and uses AVX or SSE2 on x86 platforms transparently for higher performance. Uses a microkernel strategy, so that the implementation is easy to parallelize and optimize. Supports multithreading.""" documentation = "https://docs.rs/matrixmultiply/" readme = false keywords = [ "matrix", "sgemm", "dgemm", ] categories = ["science"] license = "MIT/Apache-2.0" repository = "https://github.com/bluss/matrixmultiply/" [package.metadata.docs.rs] features = ["cgemm"] rustdoc-args = [ "--cfg", "docsrs", ] [package.metadata.release] no-dev-version = true tag-name = "{{version}}" [profile.bench] debug = 2 [profile.release] debug = 2 [lib] name = "matrixmultiply" path = "src/lib.rs" bench = false [[example]] name = "benchmark" path = "examples/benchmark.rs" [[example]] name = "usegemm" path = "examples/usegemm.rs" [[test]] name = "sgemm" path = "tests/sgemm.rs" [[bench]] name = "benchmarks" path = "benches/benchmarks.rs" harness = false [dependencies.num_cpus] version = "1.13" optional = true [dependencies.once_cell] version = "1.7" optional = true [dependencies.rawpointer] version = "0.2" [dependencies.thread-tree] version = "0.3.2" optional = true [dev-dependencies.bencher] version = "0.1.2" [dev-dependencies.itertools] version = "0.8" [build-dependencies.autocfg] version = "1" [features] cgemm = [] constconf = [] default = ["std"] std = [] threading = [ "thread-tree", "std", "once_cell", "num_cpus", ] matrixmultiply-0.3.9/Cargo.toml.orig000064400000000000000000000030261046102023000156310ustar 00000000000000[package] name = "matrixmultiply" edition = "2018" version = "0.3.9" authors = [ "bluss", "R. Janis Goldschmidt" ] license = "MIT/Apache-2.0" repository = "https://github.com/bluss/matrixmultiply/" documentation = "https://docs.rs/matrixmultiply/" description = """ General matrix multiplication for f32 and f64 matrices. Operates on matrices with general layout (they can use arbitrary row and column stride). 
Detects and uses AVX or SSE2 on x86 platforms transparently for higher performance. Uses a microkernel strategy, so that the implementation is easy to parallelize and optimize. Supports multithreading.""" keywords = ["matrix", "sgemm", "dgemm"] categories = ["science"] exclude = ["docs/*"] build = "build.rs" [lib] bench = false [[bench]] name = "benchmarks" harness = false [dependencies] rawpointer = "0.2" thread-tree = { version = "0.3.2", optional = true } once_cell = { version = "1.7", optional = true } num_cpus = { version = "1.13", optional = true } [dev-dependencies] bencher = "0.1.2" itertools = "0.8" [features] default = ["std"] # support for complex f32, complex f64 cgemm = [] threading = ["thread-tree", "std", "once_cell", "num_cpus"] std = [] # support for compile-time configuration constconf = [] [build-dependencies] autocfg = "1" [profile.release] debug = true [profile.bench] debug = true [package.metadata.release] no-dev-version = true tag-name = "{{version}}" [package.metadata.docs.rs] features = ["cgemm"] # defines the configuration attribute `docsrs` rustdoc-args = ["--cfg", "docsrs"] matrixmultiply-0.3.9/LICENSE-APACHE000064400000000000000000000251371046102023000146750ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. matrixmultiply-0.3.9/LICENSE-MIT000064400000000000000000000022071046102023000143760ustar 00000000000000Copyright (c) 2016 - 2023 Ulrik Sverdrup "bluss" Copyirhgt (c) 2018 R. Janis Goldschmidt Copyright (c) 2021 DutchGhost [constparse.rs] Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
matrixmultiply-0.3.9/README.rst000064400000000000000000000216301046102023000144320ustar 00000000000000
matrixmultiply
==============

General matrix multiplication for f32, f64, and complex matrices. Operates on
matrices with general layout (they can use arbitrary row and column stride).

Please read the `API documentation here`__

__ https://docs.rs/matrixmultiply/

We presently provide a few good microkernels, portable and for x86-64 and
AArch64 NEON, and only one operation: the general matrix-matrix multiplication
(“gemm”).

This crate was inspired by the macro/microkernel approach to matrix
multiplication that is used by the BLIS_ project.

.. _BLIS: https://github.com/flame/blis

|crates|_

.. |crates| image:: https://img.shields.io/crates/v/matrixmultiply.svg
.. _crates: https://crates.io/crates/matrixmultiply

Development Goals
-----------------

- Code clarity and maintainability
- Portability and stable Rust
- Performance: provide target-specific microkernels when it is beneficial
- Testing: Test diverse inputs and test and benchmark all microkernels
- Small code footprint and fast compilation
- We are not reimplementing BLAS.

Benchmarks
----------

- ``cargo bench`` is useful for special cases and small matrices
- The best gemm and threading benchmark is ``examples/benchmark.rs``, which
  supports custom sizes, some configuration, and csv output. Use the script
  ``benches/benchloop.py`` to run benchmarks over parameter ranges.

Blog Posts About This Crate
---------------------------

+ `gemm: a rabbit hole`__

__ https://bluss.github.io/rust/2016/03/28/a-gemmed-rabbit-hole/

Recent Changes
--------------

- 0.3.9

  - Fix a debug assertion for alignment on s390x, involving the alignment of
    the mask buffer; the change lowered the static alignment request to 16 in
    thread local storage on all platforms (except macos, which was unchanged).
  - Test with cargo-careful

- 0.3.8

  - Lower the alignment requirement for the thread local storage value on
    macos, since it was not respected and caused a debug assertion.
    (Previous issue #55)

- 0.3.7

  - Rename a directory, avoiding spaces in filenames, to be compatible with
    Bazel. By @xander-zitara

- 0.3.6

  - Fix the build for the combination of cgemm and no_std (#76)

- 0.3.5

  - Significant improvements to complex matrix packing and kernels (#75)
  - Use a specialized AVX2 matrix packing function for sgemm, dgemm when this
    feature is detected on x86-64

- 0.3.4

  - Sgemm, dgemm microkernel implementations for AArch64 NEON (ARM).
    Matrixmultiply now uses autocfg to detect the Rust version so that these
    kernels are enabled when AArch64 intrinsics are available (Rust 1.61 and
    later).
  - Small change to matrix packing functions so that they in some cases
    optimize better due to improvements to pointer alias information.

- 0.3.3

  - Attempt to fix macos bug #55 again (manifesting as a debug assertion, only
    in debug builds)
  - Updated comments for x86 kernels by @Tastaturtaste
  - Updates to MIRI/CI by @jturner314
  - Silenced Send/Sync future compatibility warnings for a raw pointer wrapper

- 0.3.2

  - Add optional feature ``cgemm`` for the complex matmult functions ``cgemm``
    and ``zgemm``
  - Add optional feature ``constconf`` for compile-time configuration of matrix
    kernel parameters for chunking. Improved scripts for benchmarking over
    ranges of different settings.
    With thanks to @DutchGhost for the const-time parsing functions.
  - Improved benchmarking and testing.
  - Threading is now slightly more eager to use threads (depending on matrix
    element count).

- 0.3.1

  - Attempt to fix bug #55 where the mask buffer in TLS did not seem to get its
    requested alignment on macos. The mask buffer pointer is now aligned
    manually (again, like it was in 0.2.x).
  - Fix a minor issue where we were passing a buffer pointer as ``&T`` when it
    should have been ``&[T]``.

- 0.3.0

  - Implement initial support for threading using a bespoke thread pool with
    little contention. To use, enable feature ``threading`` (and configure the
    number of threads with the variable ``MATMUL_NUM_THREADS``). Initial
    support is for up to 4 threads - will be updated with more experience in
    coming versions.
  - Added a better benchmarking program for arbitrary size and layout, see
    ``examples/benchmark.rs`` for this; it supports csv output for better
    recording of measurements.
  - Minimum supported Rust version is 1.41.1 and the version update policy has
    been updated.
  - Updated to Rust 2018 edition
  - Moved CI to github actions (so long travis and thanks for all the fish).

- 0.2.4

  - Support no-std mode by @vadixidav and @jturner314.
    New (default) feature flag "std"; use default-features = false to disable
    and use no-std. Note that runtime CPU feature detection requires std.
  - Fix tests so that they build correctly on non-x86 platforms (#49); release
    managed by @bluss

- 0.2.3

  - Update rawpointer dependency to 0.2
  - Minor changes to inlining for ``-Ctarget-cpu=native`` use (not
    recommended - use automatic runtime feature detection).
  - Minor improvements to kernel masking (#42, #41) by @bluss and @SuperFluffy

- 0.2.2

  - New dgemm avx and fma kernels implemented by R. Janis Goldschmidt
    (@SuperFluffy). With fast cases for both row and column major output.

    Benchmark improvements: Using fma instructions reduces execution time on
    dgemm benchmarks by 25-35% compared with the avx kernel, see issue `#35`_

    Using the avx dgemm kernel reduces execution time on dgemm benchmarks by
    5-7% compared with the previous version's autovectorized kernel.
  - New fma adaptation of the sgemm avx kernel by R. Janis Goldschmidt
    (@SuperFluffy).

    Benchmark improvement: Using fma instructions reduces execution time on
    sgemm benchmarks by 10-15% compared with the avx kernel, see issue `#35`_
  - More flexible kernel selection allows kernels to individually set all their
    parameters, ensures the fallback (plain Rust) kernels can be tuned for
    performance as well, and moves feature detection out of the gemm loop.

    Benchmark improvement: Reduces execution time on various benchmarks by 1-2%
    in the avx kernels, see `#37`_.
  - Improved testing to cover input/output strides of more diversity.

  .. _#35: https://github.com/bluss/matrixmultiply/issues/35
  .. _#37: https://github.com/bluss/matrixmultiply/issues/37

- 0.2.1

  - Improve matrix packing by taking better advantage of contiguous inputs.

    Benchmark improvement: execution time for a 64×64 problem where inputs are
    either both row major or both column major changed by -5% for sgemm and
    -1% for dgemm. (#26)
  - In the sgemm avx kernel, handle column major output arrays just like it
    does row major arrays.

    Benchmark improvement: execution time for a 32×32 problem where the output
    is column major changed by -11%. (#27)

- 0.2.0

  - Use runtime feature detection on x86 and x86-64 platforms, to enable
    AVX-specific microkernels at runtime if available on the currently
    executing configuration. This means no special compiler flags are needed
    to enable native instruction performance!
  - Implement a specialized 8×8 sgemm (f32) AVX microkernel, this speeds up
    matrix multiplication by another 25%.
  - Use ``std::alloc`` for allocation of aligned packing buffers
  - We now require Rust 1.28 as the minimal version

- 0.1.15

  - Fix bug where the result matrix C was not updated in the case of an
    M × K by K × N matrix multiplication where K was zero. (This resulted in
    the output C potentially being left uninitialized or with incorrect values
    in this specific scenario.) By @jturner314 (PR #21)

- 0.1.14

  - Avoid an unused code warning

- 0.1.13

  - Pick the 8x8 sgemm (f32) kernel when the AVX target feature is enabled
    (with Rust 1.14 or later, no effect otherwise).
  - Use ``rawpointer``, a µcrate with raw pointer methods taken from this
    project.

- 0.1.12

  - Internal cleanup with retained performance

- 0.1.11

  - Adjust the sgemm (f32) kernel to optimize better on recent Rust.

- 0.1.10

  - Update doc links to docs.rs

- 0.1.9

  - Work around an optimization regression in rust nightly (1.12-ish) (#9)

- 0.1.8

  - Improved docs

- 0.1.7

  - Reduce overhead slightly for small matrix multiplication problems by using
    only one allocation call for both packing buffers.

- 0.1.6

  - Disable manual loop unrolling in debug mode (quicker debug builds)

- 0.1.5

  - Update sgemm to use a 4x8 microkernel (“still in simplistic rust”), which
    improves throughput by 10%.

- 0.1.4

  - Prepare support for aligned packed buffers
  - Update dgemm to use an 8x4 microkernel, still in simplistic rust, which
    improves throughput by 10-20% when using AVX.

- 0.1.3

  - Silence some debug prints

- 0.1.2

  - Major performance improvement for sgemm and dgemm (20-30% when using AVX).
    Since it all depends on what the optimizer does, I'd love to get issue
    reports that report good or bad performance.
  - Made the kernel masking generic, which is a cleaner design

- 0.1.1

  - Minor improvement in the kernel

matrixmultiply-0.3.9/benches/benchloop.py000075500000000000000000000101241046102023000166740ustar 00000000000000
#!/usr/bin/python3
"""
Benchmarking script. See --help for details.

Compiles benchmark, runs and outputs csv.
""" import argparse import os import subprocess import sys import time _POST_COMPILE_SLEEP = 0.5 _GEMMTYPE = { 'f32': 'SGEMM', 'f64': 'DGEMM', 'c32': 'CGEMM', 'c64': 'ZGEMM', } _COMPILE = "cargo rustc --example benchmark --release".split() _DEFAULT_FEATURES = "constconf cgemm".split() _EXEC = "./target/release/examples/benchmark" def bench_loop(args, *, file): extra_header = ",nc,kc,mc,threads" print("m,k,n,layout,type,average_ns,minimum_ns,median_ns,samples,gflops", extra_header, sep="", file=file) ncs = [None] if args.nc is None else args.nc kcs = [None] if args.kc is None else args.kc mcs = [None] if args.mc is None else args.mc for threads in args.threads: for ty in args.type: for nc in ncs: for kc in kcs: for mc in mcs: bench_iteration(args.size, ty, nc, kc, mc, threads=threads, file=file, sleep=args.sleep) def bench_iteration(sizes, ty, nc, kc, mc, *, threads, file, sleep): features = list(_DEFAULT_FEATURES) if threads > 0: features.append("threading") compile_argv = list(_COMPILE) compile_argv.append("--features=" + ",".join(features)) file.flush() env = os.environ.copy() for value, name in zip([nc, kc, mc], ["nc", "kc", "mc"]): if value is not None: env["MATMUL_" + _GEMMTYPE[ty] + "_" + name.upper()] = str(value) print("Running", " ".join(compile_argv), file=sys.stderr) subprocess.run(compile_argv, env=env) time.sleep(_POST_COMPILE_SLEEP) exec_env = os.environ.copy() exec_env["MATMUL_NUM_THREADS"] = str(threads) extra_column = ",".join((str(value) if value is not None else "") for value in [nc, kc, mc, threads]) for size in sizes: argv = [_EXEC] argv.extend(["--type", ty]) argv.extend(["--csv", "--layout", "fcc", "--extra-column", extra_column]) argv.extend([str(size)] * 3) print("Running", " ".join(argv), file=sys.stderr) subprocess.run(argv, env=exec_env, stdout=file) time.sleep(sleep) def main(): parser = argparse.ArgumentParser() parser.add_argument("--type", "-t", type=str, default=["f64"], choices=["f64", "f32", "c64", "c32"], nargs="+") parser.add_argument("--size", "-s", type=str, nargs="+", required=True, help="Sizes or size ranges like 16 or 16:64:8") parser.add_argument("--nc", type=str, nargs="+", help="Sizes or size ranges like 16 or 16:64:8") parser.add_argument("--kc", type=str, nargs="+", help="Sizes or size ranges like 16 or 16:64:8") parser.add_argument("--mc", type=str, nargs="+", help="Sizes or size ranges like 16 or 16:64:8") parser.add_argument("--threads", type=int, default=[0], nargs="+", help="Thread use. 
0: not enabled; 1: enabled but one thread; n: enabled with n threads.") parser.add_argument("--sleep", type=int, default=1, help="Time to wait between every run") parser.add_argument("--output", type=str, default=None, help="Output file (csv format)") args = parser.parse_args() if len(args.type) > 1 and any([args.nc, args.kc, args.mc]): print("Warning: Combining type loop with nc, mc, kc might not make sense", file=sys.stderr) # postprocess nc, kc, mc to parse x:y:z into ranges for var in ['size', 'nc', 'kc', 'mc']: arg_values = getattr(args, var) if arg_values is not None: new_arg_values = [] for arg_value in arg_values: if ":" in arg_value: parts = list(int(v) for v in arg_value.split(":")) values = list(range(*parts)) new_arg_values.extend(values) else: new_arg_values.append(arg_value) setattr(args, var, new_arg_values) if args.output is not None: with open(args.output, "w") as f: bench_loop(args, file=f) else: bench_loop(args, file=sys.stdout) if __name__ == "__main__": main() matrixmultiply-0.3.9/benches/benchmarks.rs000064400000000000000000000216641046102023000170440ustar 00000000000000extern crate matrixmultiply; pub use matrixmultiply::dgemm; pub use matrixmultiply::sgemm; #[macro_use] extern crate bencher; // Compute GFlop/s // by flop / s = 2 M N K / time benchmark_main!(mat_mul_f32, mat_mul_f64, layout_f32_032, layout_f64_032); macro_rules! mat_mul { ($modname:ident, $gemm:ident, $(($name:ident, $m:expr, $n:expr, $k:expr))+) => { mod $modname { use bencher::{Bencher}; use crate::$gemm; $( pub fn $name(bench: &mut Bencher) { let a = vec![0.; $m * $n]; let b = vec![0.; $n * $k]; let mut c = vec![0.; $m * $k]; bench.iter(|| { unsafe { $gemm( $m, $n, $k, 1., a.as_ptr(), $n, 1, b.as_ptr(), $k, 1, 0., c.as_mut_ptr(), $k, 1, ) } }); } )+ } benchmark_group!{ $modname, $($modname::$name),+ } }; } mat_mul! {mat_mul_f32, sgemm, (m004, 4, 4, 4) (m006, 6, 6, 6) (m008, 8, 8, 8) (m012, 12, 12, 12) (m016, 16, 16, 16) (m032, 32, 32, 32) (m064, 64, 64, 64) (m127, 127, 127, 127) /* (m256, 256, 256, 256) (m512, 512, 512, 512) (mix16x4, 32, 4, 32) (mix32x2, 32, 2, 32) (mix97, 97, 97, 125) (mix128x10000x128, 128, 10000, 128) */ } mat_mul! {mat_mul_f64, dgemm, (m004, 4, 4, 4) (m006, 6, 6, 6) (m008, 8, 8, 8) (m012, 12, 12, 12) (m016, 16, 16, 16) (m032, 32, 32, 32) (m064, 64, 64, 64) (m127, 127, 127, 127) /* (m256, 256, 256, 256) (m512, 512, 512, 512) (mix16x4, 32, 4, 32) (mix32x2, 32, 2, 32) (mix97, 97, 97, 125) (mix128x10000x128, 128, 10000, 128) */ } /// benchmarks combinations of inputs using various layouts /// row-major ("c") vs column-major ("f") experiments /// /// These benchmarks give information about /// /// 1. Matrix packing sensitivity to input layouts (A, B layouts) /// 2. Microkernel sensitivity to output layouts (C layout) /// and performance for beta != 0. vs == 0. /// /// Clike: all elements spaced at least 2 apart /// Flike: all elements spaced at least 2 apart enum Layout { C, F, Clike, Flike, } use self::Layout::*; impl Layout { fn spread(&self) -> usize { match *self { C | F => 1, Clike | Flike => 2, } } fn strides(&self, rs: isize, cs: isize) -> (isize, isize) { let spread = self.spread() as isize; match *self { C | Clike => (rs * spread, cs * spread), F | Flike => (cs * spread, rs * spread), } } } macro_rules! 
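// gemm_layout (defined here): for a square size $m, generate one benchmark per
// combination of A/B/C layout (C = row major, F = column major; Clike/Flike are
// the same but with every element spaced 2 apart) and per beta == 0 vs
// beta != 0, so that packing and microkernel sensitivity to layout can be
// compared in a single run.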
gemm_layout { ($modname:ident, $gemm:ident, $(($name:ident, $m:expr))+) => { mod $modname { use bencher::{Bencher}; use super::Layout::{self, *}; use crate::$gemm; $( fn base(bench: &mut Bencher, al: Layout, bl: Layout, cl: Layout, use_beta: bool) { let a = vec![0.; $m * $m * al.spread()]; let b = vec![0.; $m * $m * bl.spread()]; let mut c = vec![0.; $m * $m * cl.spread()]; let beta = if use_beta { 0.1 } else { 0. }; let (rsa, csa) = al.strides($m, 1); let (rsb, csb) = bl.strides($m, 1); let (rsc, csc) = cl.strides($m, 1); let max_stride_a = (rsa as usize) * ($m - 1) + (csa as usize) * ($m - 1); let max_stride_b = (rsb as usize) * ($m - 1) + (csb as usize) * ($m - 1); let max_stride_c = (rsc as usize) * ($m - 1) + (csc as usize) * ($m - 1); debug_assert!(max_stride_a < a.len()); debug_assert!(max_stride_b < b.len()); debug_assert!(max_stride_c < c.len()); bench.iter(|| { unsafe { $gemm( $m, $m, $m, 1., a.as_ptr(), rsa, csa, b.as_ptr(), rsb, csb, beta, c.as_mut_ptr(), rsc, csc, ) } }); } pub fn nobeta_ccc(bench: &mut Bencher) { base(bench, C, C, C, false); } pub fn nobeta_ccf(bench: &mut Bencher) { base(bench, C, C, F, false); } pub fn nobeta_fcc(bench: &mut Bencher) { base(bench, F, C, C, false); } pub fn nobeta_cfc(bench: &mut Bencher) { base(bench, C, F, C, false); } pub fn nobeta_ffc(bench: &mut Bencher) { base(bench, F, F, C, false); } pub fn nobeta_cff(bench: &mut Bencher) { base(bench, C, F, F, false); } pub fn nobeta_fcf(bench: &mut Bencher) { base(bench, F, C, F, false); } pub fn nobeta_fff(bench: &mut Bencher) { base(bench, F, F, F, false); } pub fn nobeta_cfc_spread_yyn(bench: &mut Bencher) { base(bench, Clike, Flike, C, false); } pub fn nobeta_fcc_spread_yyn(bench: &mut Bencher) { base(bench, Flike, Clike, C, false); } pub fn nobeta_fcc_spread_nny(bench: &mut Bencher) { base(bench, C, F, Clike, false); } pub fn nobeta_fcf_spread_nny(bench: &mut Bencher) { base(bench, C, F, Flike, false); } pub fn beta_ccc(bench: &mut Bencher) { base(bench, C, C, C, true); } pub fn beta_ccf(bench: &mut Bencher) { base(bench, C, C, F, true); } pub fn beta_fcc(bench: &mut Bencher) { base(bench, F, C, C, true); } pub fn beta_cfc(bench: &mut Bencher) { base(bench, C, F, C, true); } pub fn beta_ffc(bench: &mut Bencher) { base(bench, F, F, C, true); } pub fn beta_cff(bench: &mut Bencher) { base(bench, C, F, F, true); } pub fn beta_fcf(bench: &mut Bencher) { base(bench, F, C, F, true); } pub fn beta_fff(bench: &mut Bencher) { base(bench, F, F, F, true); } pub fn beta_fcc_spread_nny(bench: &mut Bencher) { base(bench, C, F, Clike, true); } pub fn beta_fcf_spread_nny(bench: &mut Bencher) { base(bench, C, F, Flike, true); } )+ } benchmark_group!{ $modname, $modname::nobeta_ccc, $modname::nobeta_ccf, $modname::nobeta_fcc, $modname::nobeta_cfc, $modname::nobeta_ffc, $modname::nobeta_cff, $modname::nobeta_fcf, $modname::nobeta_fff, $modname::nobeta_cfc_spread_yyn, $modname::nobeta_fcc_spread_yyn, $modname::nobeta_fcc_spread_nny, $modname::nobeta_fcf_spread_nny, $modname::beta_ccc, $modname::beta_ccf, $modname::beta_fcc, $modname::beta_cfc, $modname::beta_ffc, $modname::beta_cff, $modname::beta_fcf, $modname::beta_fff, $modname::beta_fcc_spread_nny, $modname::beta_fcf_spread_nny } }; } gemm_layout! {layout_f32_032, sgemm, (m032, 32) } gemm_layout! {layout_f64_032, dgemm, (m032, 32) } use std::ops::{Add, Mul}; trait Z { fn zero() -> Self; } impl Z for f32 { fn zero() -> Self { 0. } } impl Z for f64 { fn zero() -> Self { 0. 
} } // simple, slow, correct (hopefully) mat mul (Row Major) #[inline(never)] fn reference_mat_mul(m: usize, k: usize, n: usize, a: &[A], b: &[A], c: &mut [A]) where A: Z + Add + Mul + Copy, { assert!(a.len() >= m * k); assert!(b.len() >= k * n); assert!(c.len() >= m * n); for i in 0..m { for j in 0..n { unsafe { let celt = c.get_unchecked_mut(i * m + j); *celt = (0..k).fold(A::zero(), move |s, x| { s + *a.get_unchecked(i * k + x) * *b.get_unchecked(x * n + j) }); } } } } macro_rules! ref_mat_mul { ($modname:ident, $ty:ty, $(($name:ident, $m:expr, $n:expr, $k:expr))+) => { mod $modname { use bencher::{Bencher}; use super::reference_mat_mul; $( pub fn $name(bench: &mut Bencher) { let a = vec![0. as $ty; $m * $n]; let b = vec![0.; $n * $k]; let mut c = vec![0.; $m * $k]; bench.iter(|| { reference_mat_mul($m, $n, $k, &a, &b, &mut c); c[0] }); } )+ } benchmark_group!{ $modname, $($modname::$name),+ } }; } ref_mat_mul! {ref_mat_mul_f32, f32, (m004, 4, 4, 4) (m005, 5, 5, 5) (m006, 6, 6, 6) (m007, 7, 7, 7) (m008, 8, 8, 8) (m009, 9, 9, 9) (m012, 12, 12, 12) (m016, 16, 16, 16) (m032, 32, 32, 32) (m064, 64, 64, 64) } matrixmultiply-0.3.9/build.rs000064400000000000000000000007171046102023000144130ustar 00000000000000fn main() { println!("cargo:rerun-if-changed=build.rs"); if std::env::var("CARGO_CFG_TARGET_ARCH").unwrap_or(String::new()) == "aarch64" { match autocfg::AutoCfg::new() { // From 1.61 aarch64 intrinsics and #[target_feature] Ok(ac) => if ac.probe_rustc_version(1, 61) { println!("cargo:rustc-cfg=has_aarch64_simd"); } Err(err) => println!("cargo:warning={}", err), } } } matrixmultiply-0.3.9/ci/miri.sh000075500000000000000000000007111046102023000146320ustar 00000000000000#!/bin/sh set -ex export CARGO_NET_RETRY=5 export CARGO_NET_TIMEOUT=10 MIRI_NIGHTLY=nightly-$(curl -s https://rust-lang.github.io/rustup-components-history/x86_64-unknown-linux-gnu/miri) echo "Installing latest nightly with Miri: $MIRI_NIGHTLY" rustup default "$MIRI_NIGHTLY" rustup component add miri cargo miri setup # Disable isolation for num_cpus::get_physical. MIRIFLAGS="-Zmiri-disable-isolation" \ MMTEST_FAST_TEST=1 \ cargo miri test "$@" matrixmultiply-0.3.9/examples/benchmark.rs000064400000000000000000000312231046102023000170600ustar 00000000000000//! Run this executable to benchmark sgemm and dgemm for arbitrary size matrices //! See --help for usage examples. Remember to run in release mode. extern crate itertools; extern crate matrixmultiply; use std::cell::Cell; use std::fmt::Debug; use std::time::Instant; use itertools::zip; use itertools::Itertools; include!("../testdefs/testdefs.rs"); enum Arg { Flag { long: &'static str }, Value { long: &'static str }, } impl Arg { fn is_flag(&self) -> bool { match *self { Arg::Flag { .. } => true, Arg::Value { .. } => false, } } fn long(&self) -> &str { use Arg::*; match *self { Flag { long, .. } | Value { long, .. 
} => long, } } } struct Argparse<'a> { spec: &'a [&'a Arg], // true: this arg has already been parsed, false: unused used: Vec>, args: Vec, } // Simple argument parser impl<'a> Argparse<'a> { pub fn new(spec: &'a [&'a Arg], args: impl IntoIterator) -> Self { let strings: Vec<_> = args.into_iter().collect(); Argparse { spec, used: vec![Cell::new(false); strings.len()], args: strings, } } fn get_arg(&self, long: &str) -> Option<(bool, &str)> { self.used[0].set(true); let arg_spec = self.spec.iter().find(|arg| arg.long() == long).expect("No such argument"); for (i, arg) in self.args.iter().enumerate() { if self.used[i].get() { continue; } let arg_long = arg_spec.long(); if arg.starts_with("--") { if arg[2..].starts_with(arg_long) { /* has arg */ self.used[i].set(true); if arg_spec.is_flag() { return Some((false, "")); } if arg[2 + arg_long.len()..].is_empty() && self.args.len() > i + 1 { self.used[i + 1].set(true); return Some((true, &self.args[i + 1])); } else { return Some((true, &arg[3 + arg_long.len()..])) } } } } None } pub fn get_flag(&self, long: &str) -> Option { self.get_arg(long).map(|_| true) } pub fn get_string(&self, long: &str) -> Option<&str> { self.get_arg(long).map(|(_, arg)| arg) } pub fn check_usage(&self) -> Result<(), String> { for (i, arg) in self.args.iter().enumerate() { if !self.used[i].get() && arg.starts_with("-") { return Err(format!("Unknown argument {:?}", arg)); } } Ok(()) } pub fn next_positional_int(&self) -> Option { for (i, arg) in self.args.iter().enumerate() { if !self.used[i].get() { self.used[i].set(true); return Some(arg.parse::().unwrap()) } } None } } fn main() -> Result<(), String> { run_main(std::env::args()) } fn run_main(args: impl IntoIterator) -> Result<(), String> { #[cfg(debug_assertions)] eprintln!("Warning: running benchmark with debug assertions"); let opts = match parse_args(args) { Ok(o) => o, Err(e) => { eprintln!("Usage: [--type ] [--layout ] [--csv] m-size k-size n-size"); eprintln!(); eprintln!("Where is one of: f32, f64, c32, c64"); eprintln!("Where is 3 letters from c, f like: ccc fcc fff"); eprintln!(); eprintln!("Example: --type f64 --layout fcf 1000 1000 1000"); eprintln!(); eprintln!("csv headers: m,k,n,layout,type,average_ns,minimum_ns,median_ns,samples,gflops"); eprintln!(); return Err(format!("Error parsing arguments: {}", e)); } }; match opts.use_type { UseType::F32 => test_matrix::(opts.m, opts.k, opts.n, opts.layout, opts.use_csv, opts.use_type, &opts.extra_column), UseType::F64 => test_matrix::(opts.m, opts.k, opts.n, opts.layout, opts.use_csv, opts.use_type, &opts.extra_column), #[cfg(feature="cgemm")] UseType::C32 => test_matrix::(opts.m, opts.k, opts.n, opts.layout, opts.use_csv, opts.use_type, &opts.extra_column), #[cfg(feature="cgemm")] UseType::C64 => test_matrix::(opts.m, opts.k, opts.n, opts.layout, opts.use_csv, opts.use_type, &opts.extra_column), #[cfg(not(feature="cgemm"))] _otherwise => unimplemented!("cgemm feature missing"), } Ok(()) } #[derive(Debug, Copy, Clone)] enum UseType { F32, F64, C32, C64, } impl UseType { fn type_name(self) -> &'static str { use UseType::*; match self { F32 => "f32", F64 => "f64", C32 => "c32", C64 => "c64", } } fn flop_factor(self) -> f64 { match self { // estimate one multiply and one addition UseType::F32 | UseType::F64 => 2., // (P + Qi)(R + Si) = .. // estimate 8 flop (4 float multiplies and 4 additions). 
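            // Spelled out: (P + Qi)(R + Si) = (PR - QS) + (PS + QR)i, i.e.
            // 4 real multiplies and 2 real additions, plus 2 more additions to
            // accumulate into C: 8 flops per complex multiply-add.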
UseType::C32 | UseType::C64 => 8., } } } impl Default for UseType { fn default() -> Self { Self::F64 } } #[derive(Debug, Clone, Default)] struct Options { m: usize, k: usize, n: usize, layout: [Layout; 3], use_type: UseType, use_csv: bool, extra_column: Option, } fn parse_args(args: impl IntoIterator) -> Result { let mut opts = Options::default(); //./target/release/examples/benchmark 1280 1280 1280 c64 fcf let parse = Argparse::new(&[ &Arg::Flag { long: "csv" }, &Arg::Value { long: "layout" }, &Arg::Value { long: "type" }, &Arg::Value { long: "extra-column" }, ], args); opts.use_type = match parse.get_string("type") { Some("f32") => UseType::F32, Some("f64") => UseType::F64, Some("c32") => UseType::C32, Some("c64") => UseType::C64, Some(_otherwise) => return Err("Unknown type".to_string()), None => UseType::F64, }; if let Some(layout) = parse.get_string("layout") { if layout.len() != 3 || !layout.chars().all(|c| c == 'c' || c == 'f') { Err(format!("Unknown argument {}", layout))?; } for (elt, layout_arg) in zip(&mut opts.layout[..], layout.chars()) { *elt = if layout_arg == 'c' { Layout::C } else { Layout::F }; } } opts.use_csv = parse.get_flag("csv").is_some(); opts.extra_column = parse.get_string("extra-column").map(|s| s.to_string()); parse.check_usage()?; opts.m = parse.next_positional_int().ok_or("Expected argument".to_string())? as usize; opts.k = parse.next_positional_int().ok_or("Expected argument".to_string())? as usize; opts.n = parse.next_positional_int().ok_or("Expected argument".to_string())? as usize; Ok(opts) } // // Custom stride tests // #[derive(Copy, Clone, Debug)] enum Layout { C, F } use self::Layout::*; impl Layout { fn strides_scaled(self, m: usize, n: usize, scale: [usize; 2]) -> (isize, isize) { match self { C => ((n * scale[0] * scale[1]) as isize, scale[1] as isize), F => (scale[0] as isize, (m * scale[1] * scale[0]) as isize), } } } impl Default for Layout { fn default() -> Self { C } } fn test_matrix(m: usize, k: usize, n: usize, layouts: [Layout; 3], use_csv: bool, use_type: UseType, extra: &Option) where F: Gemm + Float { let (m, k, n) = (m, k, n); // stride multipliers let stride_multipliers = vec![[1, 1], [1, 1], [1, 1]]; let mstridea = stride_multipliers[0]; let mstrideb = stride_multipliers[1]; let mstridec = stride_multipliers[2]; let mut a = vec![F::zero(); m * k * mstridea[0] * mstridea[1]]; let mut b = vec![F::zero(); k * n * mstrideb[0] * mstrideb[1]]; let mut c1 = vec![F::zero(); m * n * mstridec[0] * mstridec[1]]; for (i, elt) in a.iter_mut().enumerate() { *elt = F::from(i as i64); } for (i, elt) in b.iter_mut().enumerate() { *elt = F::from(i as i64); } let la = layouts[0]; let lb = layouts[1]; let lc1 = layouts[2]; let (rs_a, cs_a) = la.strides_scaled(m, k, mstridea); let (rs_b, cs_b) = lb.strides_scaled(k, n, mstrideb); let (rs_c1, cs_c1) = lc1.strides_scaled(m, n, mstridec); if !use_csv { println!("Test matrix a : {} × {} layout: {:?} strides {}, {}", m, k, la, rs_a, cs_a); println!("Test matrix b : {} × {} layout: {:?} strides {}, {}", k, n, lb, rs_b, cs_b); println!("Test matrix c : {} × {} layout: {:?} strides {}, {}", m, n, lc1, rs_c1, cs_c1); } let statistics = measure(10, use_csv, || { unsafe { // C1 = A B F::gemm( m, k, n, F::from(1), a.as_ptr(), rs_a, cs_a, b.as_ptr(), rs_b, cs_b, F::zero(), c1.as_mut_ptr(), rs_c1, cs_c1, ); } }); let gflop = use_type.flop_factor() * (m as f64 * n as f64 * k as f64) / statistics.average as f64; if !use_csv { print!("{}×{}×{} {:?} {} .. 
{} ns", m, k, n, layouts, use_type.type_name(), fmt_thousands_sep(statistics.average, " ")); print!(" [minimum: {} ns .. median {} ns .. sample count {}]", fmt_thousands_sep(statistics.minimum, " "), fmt_thousands_sep(statistics.median, " "), statistics.samples.len()); // by flop / s = 2 M N K / time print!(" {:.2} Gflop/s", gflop); println!(); } else { print!("{},{},{},", m, k, n); print!("{:?},", layouts.iter().format("")); print!("{},", use_type.type_name()); print!("{},{},{},{},", statistics.average, statistics.minimum, statistics.median, statistics.samples.len()); print!("{}", gflop); if let Some(extra) = extra { print!(",{}", extra); } println!(); } } #[derive(Default, Debug)] struct Statistics { samples: Vec, samples_sorted: Vec, average: u64, median: u64, minimum: u64, } const OUTLIER_HIGH_PCT: usize = 25; //const OUTLIER_LOW_PCT: usize = 10; fn measure(max_samples: usize, quiet: bool, mut function: impl FnMut()) -> Statistics { let mut statistics = Statistics::default(); statistics.samples.reserve(max_samples); let mut goal_samples = max_samples; let start_batch = Instant::now(); let mut print_each = false; while statistics.samples.len() < goal_samples { for _ in 0..goal_samples { let start = Instant::now(); function(); let dur = start.elapsed(); let elapsed_ns = dur.as_secs() * 1_000_000_000 + dur.subsec_nanos() as u64; statistics.samples.push(elapsed_ns); print_each |= dur.as_secs() >= 1; if !quiet && print_each { println!(" {}", fmt_thousands_sep(elapsed_ns, " ")); } } let batch_dur = start_batch.elapsed(); if batch_dur.as_millis() < 1000 { goal_samples *= 5; } } let nsamples = statistics.samples.len(); let nsamples_winnow = nsamples - (nsamples * OUTLIER_HIGH_PCT) / 100; statistics.samples_sorted = statistics.samples.clone(); // sort low to high statistics.samples_sorted.sort_unstable(); statistics.samples_sorted.truncate(nsamples_winnow); statistics.average = (statistics.samples_sorted.iter().sum::() as f64 / (nsamples_winnow as f64)) as u64; statistics.minimum = statistics.samples_sorted[0]; statistics.median = statistics.samples_sorted[nsamples_winnow / 2]; statistics } // Format a number with thousands separators fn fmt_thousands_sep(mut n: u64, sep: &str) -> String { use std::fmt::Write; let mut output = String::new(); let mut trailing = false; for &pow in &[12, 9, 6, 3, 0] { let base = 10_u64.pow(pow); if pow == 0 || trailing || n / base != 0 { if !trailing { output.write_fmt(format_args!("{}", n / base)).unwrap(); } else { output.write_fmt(format_args!("{:03}", n / base)).unwrap(); } if pow != 0 { output.push_str(sep); } trailing = true; } n %= base; } output } #[test] fn test_benchmark() { run_main("ignored 128 128 128 f64 fcc".split_whitespace().map(str::to_string)).unwrap(); } matrixmultiply-0.3.9/examples/usegemm.rs000064400000000000000000000160461046102023000165760ustar 00000000000000 // Example of using sgemm/dgemm from matrixmultiply, // we show that we can multiply matrices of differing strides. // // Jump down to the next place where it says EXAMPLE. extern crate core; extern crate itertools; extern crate matrixmultiply; use matrixmultiply::{sgemm, dgemm}; use itertools::Itertools; use itertools::{ cloned, enumerate, repeat_n, }; use core::fmt::{Display, Debug}; trait Float : Copy + Display + Debug + PartialEq { fn zero() -> Self; fn one() -> Self; fn from(x: i64) -> Self; fn nan() -> Self; fn is_nan(self) -> bool; } impl Float for f32 { fn zero() -> Self { 0. } fn one() -> Self { 1. } fn from(x: i64) -> Self { x as Self } fn nan() -> Self { 0./0. 
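    // 0./0. evaluates to NaN; this example fills the output buffers with NaN so
    // that the final check can detect writes landing outside the strided C matrix.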
} fn is_nan(self) -> bool { self.is_nan() } } impl Float for f64 { fn zero() -> Self { 0. } fn one() -> Self { 1. } fn from(x: i64) -> Self { x as Self } fn nan() -> Self { 0./0. } fn is_nan(self) -> bool { self.is_nan() } } trait Gemm : Sized { unsafe fn gemm( m: usize, k: usize, n: usize, alpha: Self, a: *const Self, rsa: isize, csa: isize, b: *const Self, rsb: isize, csb: isize, beta: Self, c: *mut Self, rsc: isize, csc: isize); } impl Gemm for f32 { unsafe fn gemm( m: usize, k: usize, n: usize, alpha: Self, a: *const Self, rsa: isize, csa: isize, b: *const Self, rsb: isize, csb: isize, beta: Self, c: *mut Self, rsc: isize, csc: isize) { sgemm( m, k, n, alpha, a, rsa, csa, b, rsb, csb, beta, c, rsc, csc) } } impl Gemm for f64 { unsafe fn gemm( m: usize, k: usize, n: usize, alpha: Self, a: *const Self, rsa: isize, csa: isize, b: *const Self, rsb: isize, csb: isize, beta: Self, c: *mut Self, rsc: isize, csc: isize) { dgemm( m, k, n, alpha, a, rsa, csa, b, rsb, csb, beta, c, rsc, csc) } } fn main() { test_gemm_strides::(); test_gemm_strides::(); } fn test_gemm_strides() where F: Gemm + Float { let test_sizes = [77]; for &n in &test_sizes { test_strides::(n, n, n); } } // // Custom stride tests // #[derive(Copy, Clone, Debug)] enum Layout { C, F } use self::Layout::*; impl Layout { fn strides_scaled(self, m: usize, n: usize, scale: [usize; 2]) -> (isize, isize) { match self { C => ((n * scale[0] * scale[1]) as isize, scale[1] as isize), F => (scale[0] as isize, (m * scale[1] * scale[0]) as isize), } } } impl Default for Layout { fn default() -> Self { C } } fn test_strides(m: usize, k: usize, n: usize) where F: Gemm + Float { let (m, k, n) = (m, k, n); let stride_multipliers = vec![[1, 1], [1, 1], [1, 1], [1, 1], [2, 2]]; let mut multipliers_iter = cloned(&stride_multipliers).cycle(); let layout_species = [C, F]; let layouts_iter = repeat_n(cloned(&layout_species), 4).multi_cartesian_product(); for elt in layouts_iter { let layouts = [elt[0], elt[1], elt[2], elt[3]]; let (m0, m1, m2, m3) = multipliers_iter.next_tuple().unwrap(); test_strides_inner::(m, k, n, [m0, m1, m2, m3], layouts); } } fn test_strides_inner(m: usize, k: usize, n: usize, stride_multipliers: [[usize; 2]; 4], layouts: [Layout; 4]) where F: Gemm + Float { let (m, k, n) = (m, k, n); // stride multipliers let mstridea = stride_multipliers[0]; let mstrideb = stride_multipliers[1]; let mstridec = stride_multipliers[2]; let mstridec2 = stride_multipliers[3]; let mut a = vec![F::zero(); m * k * mstridea[0] * mstridea[1]]; let mut b = vec![F::zero(); k * n * mstrideb[0] * mstrideb[1]]; let mut c1 = vec![F::nan(); m * n * mstridec[0] * mstridec[1]]; let mut c2 = vec![F::nan(); m * n * mstridec2[0] * mstridec2[1]]; for (i, elt) in a.iter_mut().enumerate() { *elt = F::from(i as i64); } for (i, elt) in b.iter_mut().enumerate() { *elt = F::from(i as i64); } let la = layouts[0]; let lb = layouts[1]; let lc1 = layouts[2]; let lc2 = layouts[3]; let (rs_a, cs_a) = la.strides_scaled(m, k, mstridea); let (rs_b, cs_b) = lb.strides_scaled(k, n, mstrideb); let (rs_c1, cs_c1) = lc1.strides_scaled(m, n, mstridec); let (rs_c2, cs_c2) = lc2.strides_scaled(m, n, mstridec2); println!("Test matrix a : {} × {} layout: {:?} strides {}, {}", m, k, la, rs_a, cs_a); println!("Test matrix b : {} × {} layout: {:?} strides {}, {}", k, n, lb, rs_b, cs_b); println!("Test matrix c1: {} × {} layout: {:?} strides {}, {}", m, n, lc1, rs_c1, cs_c1); println!("Test matrix c2: {} × {} layout: {:?} strides {}, {}", m, n, lc2, rs_c2, cs_c2); macro_rules! 
c1 { ($i:expr, $j:expr) => (c1[(rs_c1 * $i as isize + cs_c1 * $j as isize) as usize]); } macro_rules! c2 { ($i:expr, $j:expr) => (c2[(rs_c2 * $i as isize + cs_c2 * $j as isize) as usize]); } unsafe { // EXAMPLE: Compute the same result in C1 and C2 in two different ways. // We only use whole integer values in the low range of floats here, // so we have no loss of precision. // C1 = A B F::gemm( m, k, n, F::from(1), a.as_ptr(), rs_a, cs_a, b.as_ptr(), rs_b, cs_b, F::zero(), c1.as_mut_ptr(), rs_c1, cs_c1, ); // C1 += 2 A B F::gemm( m, k, n, F::from(2), a.as_ptr(), rs_a, cs_a, b.as_ptr(), rs_b, cs_b, F::from(1), c1.as_mut_ptr(), rs_c1, cs_c1, ); // C2 = 3 A B F::gemm( m, k, n, F::from(3), a.as_ptr(), rs_a, cs_a, b.as_ptr(), rs_b, cs_b, F::zero(), c2.as_mut_ptr(), rs_c2, cs_c2, ); } for i in 0..m { for j in 0..n { let c1_elt = c1![i, j]; let c2_elt = c2![i, j]; assert_eq!(c1_elt, c2_elt, "assertion failed for matrices, mismatch at {},{} \n\ a:: {:?}\n\ b:: {:?}\n\ c1: {:?}\n\ c2: {:?}\n", i, j, a, b, c1, c2); } } // check we haven't overwritten the NaN values outside the passed output for (index, elt) in enumerate(&c1) { let i = index / rs_c1 as usize; let j = index / cs_c1 as usize; let irem = index % rs_c1 as usize; let jrem = index % cs_c1 as usize; if irem != 0 && jrem != 0 { assert!(elt.is_nan(), "Element at index={} ({}, {}) should be NaN, but was {}\n\ c1: {:?}\n", index, i, j, elt, c1); } } println!("{}×{}×{} {:?} .. passed.", m, k, n, layouts); } matrixmultiply-0.3.9/spare_kernels/aarch64_neon_4x4.rs000064400000000000000000000107151046102023000211160ustar 00000000000000#[cfg(target_arch="aarch64")] struct KernelArmNeon; #[cfg(target_arch="aarch64")] impl GemmKernel for KernelArmNeon { type Elem = T; type MRTy = U4; type NRTy = U4; #[inline(always)] fn align_to() -> usize { 16 } #[inline(always)] fn always_masked() -> bool { false } #[inline(always)] fn nc() -> usize { archparam::S_NC } #[inline(always)] fn kc() -> usize { archparam::S_KC } #[inline(always)] fn mc() -> usize { archparam::S_MC } #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_arm_neon(k, alpha, a, b, beta, c, rsc, csc) } } // 4x4 neon kernel unrolled developed for apple silicon M1 #[cfg(target_arch="aarch64")] #[target_feature(enable="neon")] unsafe fn kernel_target_arm_neon(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { use core::arch::aarch64::*; const MR: usize = KernelArmNeon::MR; const NR: usize = KernelArmNeon::NR; let (mut a, mut b, rsc, csc) = if rsc == 1 { (b, a, csc, rsc) } else { (a, b, rsc, csc) }; let mut ab = [vmovq_n_f32(0.); MR]; let mut ab2 = [vmovq_n_f32(0.); MR]; let mut ab3 = [vmovq_n_f32(0.); MR]; let mut ab4 = [vmovq_n_f32(0.); MR]; let use_fma = true; // Compute // ab_ij = a_i * b_j for all i, j macro_rules! 
ab_ij_equals_ai_bj { ($dest:ident, $av:expr, $bv:expr) => { if use_fma { $dest[0] = vfmaq_laneq_f32($dest[0], $bv, $av, 0); $dest[1] = vfmaq_laneq_f32($dest[1], $bv, $av, 1); $dest[2] = vfmaq_laneq_f32($dest[2], $bv, $av, 2); $dest[3] = vfmaq_laneq_f32($dest[3], $bv, $av, 3); } else { $dest[0] = vaddq_f32($dest[0], vmulq_laneq_f32($bv, $av, 0)); $dest[1] = vaddq_f32($dest[1], vmulq_laneq_f32($bv, $av, 1)); $dest[2] = vaddq_f32($dest[2], vmulq_laneq_f32($bv, $av, 2)); $dest[3] = vaddq_f32($dest[3], vmulq_laneq_f32($bv, $av, 3)); } } } const UNROLL_BY: usize = 4; for _ in 0..k / UNROLL_BY { let av = vld1q_f32(a); let bv = vld1q_f32(b); // eprintln!("a: {av:?}"); // eprintln!("b: {bv:?}"); // FMLA instruction // Cortex 7A: FMA has 7 cycles latency or 3 cycles when the dependency is on the accumulator // M1: Latency 3, throughput 0.25 ab_ij_equals_ai_bj!(ab, av, bv); let av = vld1q_f32(a.add(4)); let bv = vld1q_f32(b.add(4)); ab_ij_equals_ai_bj!(ab2, av, bv); if UNROLL_BY > 2 { let av = vld1q_f32(a.add(8)); let bv = vld1q_f32(b.add(8)); ab_ij_equals_ai_bj!(ab3, av, bv); let av = vld1q_f32(a.add(12)); let bv = vld1q_f32(b.add(12)); ab_ij_equals_ai_bj!(ab4, av, bv); } a = a.offset(UNROLL_BY as isize * MR as isize); b = b.offset(UNROLL_BY as isize * NR as isize); } for _ in 0..k % UNROLL_BY { let av = vld1q_f32(a); let bv = vld1q_f32(b); ab_ij_equals_ai_bj!(ab, av, bv); a = a.offset(MR as isize); b = b.offset(NR as isize); } macro_rules! c { ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize)); } macro_rules! extract { ($v:expr, $imm:expr) => ( f32::from_bits(vgetq_lane_u32(core::mem::transmute::<_, uint32x4_t>($v), $imm)) ) } // Combine accumulators and multiply by alpha loop4!(i, ab[i] = vaddq_f32(vaddq_f32(ab[i], ab2[i]), vaddq_f32(ab3[i], ab4[i]))); loop4!(i, ab[i] = vmulq_n_f32(ab[i], alpha)); if beta == 0. { // set C = α A B if csc == 1 { loop4!(i, vst1q_f32(c![i, 0], ab[i])); } else { loop4!(i, vst1q_lane_f32(c![i, 0], ab[i], 0)); loop4!(i, vst1q_lane_f32(c![i, 1], ab[i], 1)); loop4!(i, vst1q_lane_f32(c![i, 2], ab[i], 2)); loop4!(i, vst1q_lane_f32(c![i, 3], ab[i], 3)); } } else { // set C = α A B + beta C loop4!(i, *c![i, 0] = *c![i, 0] * beta + extract!(ab[i], 0)); loop4!(i, *c![i, 1] = *c![i, 1] * beta + extract!(ab[i], 1)); loop4!(i, *c![i, 2] = *c![i, 2] * beta + extract!(ab[i], 2)); loop4!(i, *c![i, 3] = *c![i, 3] * beta + extract!(ab[i], 3)); } } matrixmultiply-0.3.9/spare_kernels/x86_sse_sgemm.rs000064400000000000000000000053061046102023000206370ustar 00000000000000 // 4x4 sse sgemm macro_rules! 
mm_transpose4 { ($c0:expr, $c1:expr, $c2:expr, $c3:expr) => {{ // This is _MM_TRANSPOSE4_PS except we take variables, not references let tmp0 = _mm_unpacklo_ps($c0, $c1); let tmp2 = _mm_unpacklo_ps($c2, $c3); let tmp1 = _mm_unpackhi_ps($c0, $c1); let tmp3 = _mm_unpackhi_ps($c2, $c3); $c0 = _mm_movelh_ps(tmp0, tmp2); $c1 = _mm_movehl_ps(tmp2, tmp0); $c2 = _mm_movelh_ps(tmp1, tmp3); $c3 = _mm_movehl_ps(tmp3, tmp1); }} } #[inline(always)] #[cfg(any(target_arch="x86", target_arch="x86_64"))] unsafe fn kernel_x86_sse(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { let mut ab = [_mm_setzero_ps(); MR]; let mut bv; let (mut a, mut b) = (a, b); // Compute A B for _ in 0..k { bv = _mm_load_ps(b as _); // aligned due to GemmKernel::align_to loop_m!(i, { // Compute ab_i += [ai b_j+0, ai b_j+1, ai b_j+2, ai b_j+3] let aiv = _mm_set1_ps(at(a, i)); ab[i] = _mm_add_ps(ab[i], _mm_mul_ps(aiv, bv)); }); a = a.add(MR); b = b.add(NR); } // Compute α (A B) let alphav = _mm_set1_ps(alpha); loop_m!(i, ab[i] = _mm_mul_ps(alphav, ab[i])); macro_rules! c { ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize)); } // C ← α A B + β C let mut c = [_mm_setzero_ps(); MR]; let betav = _mm_set1_ps(beta); if beta != 0. { // Read C if csc == 1 { loop_m!(i, c[i] = _mm_loadu_ps(c![i, 0])); } else if rsc == 1 { loop_m!(i, c[i] = _mm_loadu_ps(c![0, i])); mm_transpose4!(c[0], c[1], c[2], c[3]); } else { loop_m!(i, c[i] = _mm_set_ps(*c![i, 3], *c![i, 2], *c![i, 1], *c![i, 0])); } // Compute β C loop_m!(i, c[i] = _mm_mul_ps(c[i], betav)); } // Compute (α A B) + (β C) loop_m!(i, c[i] = _mm_add_ps(c[i], ab[i])); // Store C back to memory if csc == 1 { loop_m!(i, _mm_storeu_ps(c![i, 0], c[i])); } else if rsc == 1 { mm_transpose4!(c[0], c[1], c[2], c[3]); loop_m!(i, _mm_storeu_ps(c![0, i], c[i])); } else { // extract the nth value of a vector using _mm_cvtss_f32 (extract lowest) // in combination with shuffle (move nth value to first position) loop_m!(i, *c![i, 0] = _mm_cvtss_f32(c[i])); loop_m!(i, *c![i, 1] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 1))); loop_m!(i, *c![i, 2] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 2))); loop_m!(i, *c![i, 3] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 3))); } } matrixmultiply-0.3.9/src/aarch64/macros.rs000064400000000000000000000020711046102023000166120ustar 00000000000000macro_rules! is_aarch64_feature_detected_ { ($name:tt) => {{ #[cfg(feature="std")] { // For testing purposes, we can make sure only one specific feature // is enabled by setting MMTEST_FEATURE=featurename (all others // disabled). This does not force it to be detected, it must also be. compile_env_matches_or_is_empty!("MMTEST_FEATURE", $name) && std::arch::is_aarch64_feature_detected!($name) } #[cfg(not(feature="std"))] { // For testing purposes, we can make sure only one specific feature // is enabled by setting MMTEST_FEATURE=featurename (all others // disabled). This does not force it to be detected, it must also // be. In the `no_std` case, the `is_86_feature_detected` macro is // not available, so we have to fall back to checking whether the // feature is enabled at compile-time. 
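            // Editorial addition (not in the upstream file): in this `no_std`
            // branch the choice is made entirely at compile time. For example,
            // building with `-C target-feature=+neon` (or `-C target-cpu=native`
            // on a NEON-capable host) makes `cfg!(target_feature = "neon")`
            // evaluate to true, so the corresponding kernel can still be chosen
            // without runtime feature probing.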
compile_env_matches_or_is_empty!("MMTEST_FEATURE", $name) && cfg!(target_feature=$name) } }}; } matrixmultiply-0.3.9/src/aarch64/mod.rs000064400000000000000000000000311046102023000160770ustar 00000000000000#[macro_use] mod macros; matrixmultiply-0.3.9/src/aligned_alloc.rs000064400000000000000000000035571046102023000166650ustar 00000000000000#[cfg(not(feature = "std"))] use ::alloc::alloc::{self, handle_alloc_error, Layout}; use core::{cmp, mem}; #[cfg(feature = "std")] use std::alloc::{self, handle_alloc_error, Layout}; #[cfg(test)] use core::ops::{Deref, DerefMut}; #[cfg(test)] use core::slice; pub(crate) struct Alloc { ptr: *mut T, len: usize, align: usize, } impl Alloc { #[inline] pub unsafe fn new(nelem: usize, align: usize) -> Self { let align = cmp::max(align, mem::align_of::()); #[cfg(debug_assertions)] let layout = Layout::from_size_align(mem::size_of::() * nelem, align).unwrap(); #[cfg(not(debug_assertions))] let layout = Layout::from_size_align_unchecked(mem::size_of::() * nelem, align); dprint!("Allocating nelem={}, layout={:?}", nelem, layout); let ptr = alloc::alloc(layout); if ptr.is_null() { handle_alloc_error(layout); } Alloc { ptr: ptr as *mut T, len: nelem, align, } } #[cfg(test)] pub fn init_with(mut self, elt: T) -> Alloc where T: Copy, { for elt1 in &mut self[..] { *elt1 = elt; } self } #[inline] pub fn ptr_mut(&mut self) -> *mut T { self.ptr } } impl Drop for Alloc { fn drop(&mut self) { unsafe { let layout = Layout::from_size_align_unchecked(mem::size_of::() * self.len, self.align); alloc::dealloc(self.ptr as _, layout); } } } #[cfg(test)] impl Deref for Alloc { type Target = [T]; fn deref(&self) -> &[T] { unsafe { slice::from_raw_parts(self.ptr, self.len) } } } #[cfg(test)] impl DerefMut for Alloc { fn deref_mut(&mut self) -> &mut [T] { unsafe { slice::from_raw_parts_mut(self.ptr, self.len) } } } matrixmultiply-0.3.9/src/archmacros.rs000064400000000000000000000004461046102023000162240ustar 00000000000000 #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch="aarch64"))] macro_rules! compile_env_matches_or_is_empty { ($envvar:tt, $feature_name:tt) => { (match option_env!($envvar) { None => true, Some(v) => v == $feature_name }) } } matrixmultiply-0.3.9/src/archparam.rs000064400000000000000000000041361046102023000160400ustar 00000000000000// Copyright 2016 - 2021 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! architechture specific parameters //! //! NC: Columns in C, B that we handle at a time. (5th loop) //! KC: Rows of Bj at a time (4th loop) //! MC: Rows of Ap at a time. (3rd loop) use crate::archparam_defaults; use crate::constparse::parse_unwarp; macro_rules! 
conf_env_or_default { ($env_name:tt, $default:expr) => { match option_env!($env_name) { Some(x) => parse_unwarp(x), None => $default, } } } pub(crate) const S_NC: usize = conf_env_or_default!("MATMUL_SGEMM_NC", archparam_defaults::S_NC); pub(crate) const S_KC: usize = conf_env_or_default!("MATMUL_SGEMM_KC", archparam_defaults::S_KC); pub(crate) const S_MC: usize = conf_env_or_default!("MATMUL_SGEMM_MC", archparam_defaults::S_MC); pub(crate) const D_NC: usize = conf_env_or_default!("MATMUL_DGEMM_NC", archparam_defaults::D_NC); pub(crate) const D_KC: usize = conf_env_or_default!("MATMUL_DGEMM_KC", archparam_defaults::D_KC); pub(crate) const D_MC: usize = conf_env_or_default!("MATMUL_DGEMM_MC", archparam_defaults::D_MC); #[cfg(feature = "cgemm")] pub(crate) const C_NC: usize = conf_env_or_default!("MATMUL_CGEMM_NC", archparam_defaults::C_NC); #[cfg(feature = "cgemm")] pub(crate) const C_KC: usize = conf_env_or_default!("MATMUL_CGEMM_KC", archparam_defaults::C_KC); #[cfg(feature = "cgemm")] pub(crate) const C_MC: usize = conf_env_or_default!("MATMUL_CGEMM_MC", archparam_defaults::C_MC); #[cfg(feature = "cgemm")] pub(crate) const Z_NC: usize = conf_env_or_default!("MATMUL_ZGEMM_NC", archparam_defaults::Z_NC); #[cfg(feature = "cgemm")] pub(crate) const Z_KC: usize = conf_env_or_default!("MATMUL_ZGEMM_KC", archparam_defaults::Z_KC); #[cfg(feature = "cgemm")] pub(crate) const Z_MC: usize = conf_env_or_default!("MATMUL_ZGEMM_MC", archparam_defaults::Z_MC); matrixmultiply-0.3.9/src/archparam_defaults.rs000064400000000000000000000051571046102023000177330ustar 00000000000000// Copyright 2016 - 2018 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! architechture specific parameters /// Columns in C, B that we handle at a time. (5th loop) /// /// Cuts B into B0, B1, .. Bj, .. B_NC pub const S_NC: usize = 1024; //pub const S_NC: usize = option_env!("MATMUL_SGEMM_NC").map(parse_unwrap).unwrap_or(S_NC); /// Rows of Bj at a time (4th loop) /// /// Columns of A at a time. /// /// Cuts A into Ap /// /// Cuts Bj into Bp, which is packed into B~. /// /// Size of B~ is NC x KC pub const S_KC: usize = 256; /// Rows of Ap at a time. (3rd loop) /// /// Cuts Ap into A0, A1, .., Ai, .. A_MC /// /// Ai is packed into A~. /// /// Size of A~ is KC x MC pub const S_MC: usize = 64; /// Columns in C, B that we handle at a time. (5th loop) /// /// Cuts B into B0, B1, .. Bj, .. B_NC pub const D_NC: usize = 1024; /// Rows of Bj at a time (4th loop) /// /// Columns of A at a time. /// /// Cuts A into Ap /// /// Cuts Bj into Bp, which is packed into B~. /// /// Size of B~ is NC x KC pub const D_KC: usize = 256; /// Rows of Ap at a time. (3rd loop) /// /// Cuts Ap into A0, A1, .., Ai, .. A_MC /// /// Ai is packed into A~. /// /// Size of A~ is KC x MC pub const D_MC: usize = 64; #[cfg(feature = "cgemm")] /// Columns in C, B that we handle at a time. (5th loop) /// /// Cuts B into B0, B1, .. Bj, .. B_NC pub const C_NC: usize = S_NC / 2; #[cfg(feature = "cgemm")] /// Rows of Bj at a time (4th loop) /// /// Columns of A at a time. /// /// Cuts A into Ap /// /// Cuts Bj into Bp, which is packed into B~. /// /// Size of B~ is NC x KC pub const C_KC: usize = S_KC; #[cfg(feature = "cgemm")] /// Rows of Ap at a time. (3rd loop) /// /// Cuts Ap into A0, A1, .., Ai, .. A_MC /// /// Ai is packed into A~. 
/// /// Size of A~ is KC x MC pub const C_MC: usize = S_MC / 2; #[cfg(feature = "cgemm")] /// Columns in C, B that we handle at a time. (5th loop) /// /// Cuts B into B0, B1, .. Bj, .. B_NC pub const Z_NC: usize = D_NC / 2; #[cfg(feature = "cgemm")] /// Rows of Bj at a time (4th loop) /// /// Columns of A at a time. /// /// Cuts A into Ap /// /// Cuts Bj into Bp, which is packed into B~. /// /// Size of B~ is NC x KC pub const Z_KC: usize = D_KC; #[cfg(feature = "cgemm")] /// Rows of Ap at a time. (3rd loop) /// /// Cuts Ap into A0, A1, .., Ai, .. A_MC /// /// Ai is packed into A~. /// /// Size of A~ is KC x MC pub const Z_MC: usize = D_MC / 2; matrixmultiply-0.3.9/src/cgemm_common.rs000064400000000000000000000147361046102023000165510ustar 00000000000000// Copyright 2021-2023 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use core::mem; use core::ptr::copy_nonoverlapping; use rawpointer::PointerExt; use crate::kernel::Element; use crate::kernel::ConstNum; #[cfg(feature = "std")] macro_rules! fmuladd { // conceptually $dst += $a * $b, optionally use fused multiply-add (fma_yes, $dst:expr, $a:expr, $b:expr) => { { $dst = $a.mul_add($b, $dst); } }; (fma_no, $dst:expr, $a:expr, $b:expr) => { { $dst += $a * $b; } }; } #[cfg(not(feature = "std"))] macro_rules! fmuladd { ($any:tt, $dst:expr, $a:expr, $b:expr) => { { $dst += $a * $b; } }; } // kernel fallback impl macro // Depends on a couple of macro and function defitions to be in scope - loop_m/_n, at, etc. // $fma_opt: fma_yes or fma_no to use f32::mul_add etc or not macro_rules! kernel_fallback_impl_complex { ([$($attr:meta)*] [$fma_opt:tt] $name:ident, $elem_ty:ty, $real_ty:ty, $mr:expr, $nr:expr, $unroll:tt) => { $(#[$attr])* unsafe fn $name(k: usize, alpha: $elem_ty, a: *const $elem_ty, b: *const $elem_ty, beta: $elem_ty, c: *mut $elem_ty, rsc: isize, csc: isize) { const MR: usize = $mr; const NR: usize = $nr; debug_assert_eq!(beta, <$elem_ty>::zero(), "Beta must be 0 or is not masked"); let mut pp = [<$real_ty>::zero(); MR]; let mut qq = [<$real_ty>::zero(); MR]; let mut rr = [<$real_ty>::zero(); NR]; let mut ss = [<$real_ty>::zero(); NR]; let mut ab: [[$elem_ty; NR]; MR] = [[<$elem_ty>::zero(); NR]; MR]; let mut areal = a as *const $real_ty; let mut breal = b as *const $real_ty; unroll_by!($unroll => k, { // We set: // P + Q i = A // R + S i = B // // see pack_complex for how data is packed let aimag = areal.add(MR); let bimag = breal.add(NR); // AB = PR - QS + i (QR + PS) loop_m!(i, { pp[i] = at(areal, i); qq[i] = at(aimag, i); }); loop_n!(j, { rr[j] = at(breal, j); ss[j] = at(bimag, j); }); loop_m!(i, { loop_n!(j, { // optionally use fma fmuladd!($fma_opt, ab[i][j][0], pp[i], rr[j]); fmuladd!($fma_opt, ab[i][j][1], pp[i], ss[j]); fmuladd!($fma_opt, ab[i][j][0], -qq[i], ss[j]); fmuladd!($fma_opt, ab[i][j][1], qq[i], rr[j]); }) }); areal = aimag.add(MR); breal = bimag.add(NR); }); macro_rules! c { ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize)); } // set C = α A B loop_n!(j, loop_m!(i, *c![i, j] = mul(alpha, ab[i][j]))); } }; } /// GemmKernel packing trait methods macro_rules! 
pack_methods { () => { #[inline] unsafe fn pack_mr(kc: usize, mc: usize, pack: &mut [Self::Elem], a: *const Self::Elem, rsa: isize, csa: isize) { pack_complex::(kc, mc, pack, a, rsa, csa) } #[inline] unsafe fn pack_nr(kc: usize, mc: usize, pack: &mut [Self::Elem], a: *const Self::Elem, rsa: isize, csa: isize) { pack_complex::(kc, mc, pack, a, rsa, csa) } } } /// Pack complex: similar to general packing but separate rows for real and imag parts. /// /// Source matrix contains [p0 + q0i, p1 + q1i, p2 + q2i, ..] and it's packed into /// alternate rows of real and imaginary parts. /// /// [ p0 p1 p2 p3 .. (MR repeats) /// q0 q1 q2 q3 .. (MR repeats) /// px p_ p_ p_ .. (x = MR) /// qx q_ q_ q_ .. (x = MR) /// py p_ p_ p_ .. (y = 2 * MR) /// qy q_ q_ q_ .. (y = 2 * MR) /// ... /// ] pub(crate) unsafe fn pack_complex(kc: usize, mc: usize, pack: &mut [T], a: *const T, rsa: isize, csa: isize) where MR: ConstNum, T: Element, TReal: Element, { // use pointers as pointer to TReal let pack = pack.as_mut_ptr() as *mut TReal; let areal = a as *const TReal; let aimag = areal.add(1); assert_eq!(mem::size_of::(), 2 * mem::size_of::()); let mr = MR::VALUE; let mut p = 0; // offset into pack // general layout case (no contig case when stride != 1) for ir in 0..mc/mr { let row_offset = ir * mr; for j in 0..kc { // real row for i in 0..mr { let a_elt = areal.stride_offset(2 * rsa, i + row_offset) .stride_offset(2 * csa, j); copy_nonoverlapping(a_elt, pack.add(p), 1); p += 1; } // imag row for i in 0..mr { let a_elt = aimag.stride_offset(2 * rsa, i + row_offset) .stride_offset(2 * csa, j); copy_nonoverlapping(a_elt, pack.add(p), 1); p += 1; } } } let zero = TReal::zero(); // Pad with zeros to multiple of kernel size (uneven mc) let rest = mc % mr; if rest > 0 { let row_offset = (mc/mr) * mr; for j in 0..kc { // real row for i in 0..mr { if i < rest { let a_elt = areal.stride_offset(2 * rsa, i + row_offset) .stride_offset(2 * csa, j); copy_nonoverlapping(a_elt, pack.add(p), 1); } else { *pack.add(p) = zero; } p += 1; } // imag row for i in 0..mr { if i < rest { let a_elt = aimag.stride_offset(2 * rsa, i + row_offset) .stride_offset(2 * csa, j); copy_nonoverlapping(a_elt, pack.add(p), 1); } else { *pack.add(p) = zero; } p += 1; } } } } matrixmultiply-0.3.9/src/cgemm_kernel.rs000064400000000000000000000207101046102023000165260ustar 00000000000000// Copyright 2016 - 2021 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use crate::kernel::GemmKernel; use crate::kernel::GemmSelect; use crate::kernel::{U2, U4, c32, Element, c32_mul as mul}; use crate::archparam; use crate::cgemm_common::pack_complex; #[cfg(any(target_arch="x86", target_arch="x86_64"))] struct KernelAvx2; #[cfg(any(target_arch="x86", target_arch="x86_64"))] struct KernelFma; #[cfg(target_arch = "aarch64")] #[cfg(has_aarch64_simd)] struct KernelNeon; struct KernelFallback; type T = c32; type TReal = f32; /// Detect which implementation to use and select it using the selector's /// .select(Kernel) method. /// /// This function is called one or more times during a whole program's /// execution, it may be called for each gemm kernel invocation or fewer times. 
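///
/// (Editorial summary of the function body below) The order of preference is:
/// AVX2+FMA, then plain FMA, on x86/x86_64; NEON on AArch64 (when compiled in
/// and detected); and finally the portable fallback kernel.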
#[inline] pub(crate) fn detect(selector: G) where G: GemmSelect { // dispatch to specific compiled versions #[cfg(any(target_arch="x86", target_arch="x86_64"))] { if is_x86_feature_detected_!("fma") { if is_x86_feature_detected_!("avx2") { return selector.select(KernelAvx2); } return selector.select(KernelFma); } } #[cfg(target_arch = "aarch64")] #[cfg(has_aarch64_simd)] { if is_aarch64_feature_detected_!("neon") { return selector.select(KernelNeon); } } return selector.select(KernelFallback); } #[cfg(any(target_arch="x86", target_arch="x86_64"))] impl GemmKernel for KernelAvx2 { type Elem = T; type MRTy = U4; type NRTy = U4; #[inline(always)] fn align_to() -> usize { 32 } #[inline(always)] fn always_masked() -> bool { KernelFallback::always_masked() } #[inline(always)] fn nc() -> usize { archparam::C_NC } #[inline(always)] fn kc() -> usize { archparam::C_KC } #[inline(always)] fn mc() -> usize { archparam::C_MC } pack_methods!{} #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_avx2(k, alpha, a, b, beta, c, rsc, csc) } } #[cfg(any(target_arch="x86", target_arch="x86_64"))] impl GemmKernel for KernelFma { type Elem = T; type MRTy = U4; type NRTy = U4; #[inline(always)] fn align_to() -> usize { 16 } #[inline(always)] fn always_masked() -> bool { KernelFallback::always_masked() } #[inline(always)] fn nc() -> usize { archparam::C_NC } #[inline(always)] fn kc() -> usize { archparam::C_KC } #[inline(always)] fn mc() -> usize { archparam::C_MC } pack_methods!{} #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_fma(k, alpha, a, b, beta, c, rsc, csc) } } #[cfg(target_arch = "aarch64")] #[cfg(has_aarch64_simd)] impl GemmKernel for KernelNeon { type Elem = T; type MRTy = U4; type NRTy = U2; #[inline(always)] fn align_to() -> usize { 16 } #[inline(always)] fn always_masked() -> bool { KernelFallback::always_masked() } #[inline(always)] fn nc() -> usize { archparam::C_NC } #[inline(always)] fn kc() -> usize { archparam::C_KC } #[inline(always)] fn mc() -> usize { archparam::C_MC } pack_methods!{} #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_neon(k, alpha, a, b, beta, c, rsc, csc) } } impl GemmKernel for KernelFallback { type Elem = T; type MRTy = U4; type NRTy = U2; #[inline(always)] fn align_to() -> usize { 0 } #[inline(always)] fn always_masked() -> bool { true } #[inline(always)] fn nc() -> usize { archparam::C_NC } #[inline(always)] fn kc() -> usize { archparam::C_KC } #[inline(always)] fn mc() -> usize { archparam::C_MC } pack_methods!{} #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc) } } // Kernel AVX2 #[cfg(any(target_arch="x86", target_arch="x86_64"))] macro_rules! loop_m { ($i:ident, $e:expr) => { loop4!($i, $e) }; } #[cfg(any(target_arch="x86", target_arch="x86_64"))] macro_rules! loop_n { ($j:ident, $e:expr) => { loop4!($j, $e) }; } #[cfg(any(target_arch="x86", target_arch="x86_64"))] kernel_fallback_impl_complex! { // instantiate separately [inline target_feature(enable="avx2") target_feature(enable="fma")] [fma_yes] kernel_target_avx2, T, TReal, KernelAvx2::MR, KernelAvx2::NR, 4 } // Kernel Fma #[cfg(any(target_arch="x86", target_arch="x86_64"))] macro_rules! 
loop_m { ($i:ident, $e:expr) => { loop4!($i, $e) }; } #[cfg(any(target_arch="x86", target_arch="x86_64"))] macro_rules! loop_n { ($j:ident, $e:expr) => { loop4!($j, $e) }; } #[cfg(any(target_arch="x86", target_arch="x86_64"))] kernel_fallback_impl_complex! { // instantiate separately [inline target_feature(enable="fma")] [fma_no] kernel_target_fma, T, TReal, KernelFma::MR, KernelFma::NR, 2 } // Kernel neon #[cfg(target_arch = "aarch64")] #[cfg(has_aarch64_simd)] macro_rules! loop_m { ($i:ident, $e:expr) => { loop4!($i, $e) }; } #[cfg(target_arch = "aarch64")] #[cfg(has_aarch64_simd)] macro_rules! loop_n { ($j:ident, $e:expr) => { loop2!($j, $e) }; } #[cfg(target_arch = "aarch64")] #[cfg(has_aarch64_simd)] kernel_fallback_impl_complex! { [inline target_feature(enable="neon")] [fma_yes] kernel_target_neon, T, TReal, KernelNeon::MR, KernelNeon::NR, 1 } // Kernel fallback macro_rules! loop_m { ($i:ident, $e:expr) => { loop4!($i, $e) }; } macro_rules! loop_n { ($j:ident, $e:expr) => { loop2!($j, $e) }; } kernel_fallback_impl_complex! { [inline(always)] [fma_no] kernel_fallback_impl, T, TReal, KernelFallback::MR, KernelFallback::NR, 1 } #[inline(always)] unsafe fn at(ptr: *const TReal, i: usize) -> TReal { *ptr.add(i) } #[cfg(test)] mod tests { use super::*; use crate::kernel::test::test_complex_packed_kernel; #[test] fn test_kernel_fallback_impl() { test_complex_packed_kernel::("kernel"); } #[cfg(target_arch = "aarch64")] #[cfg(has_aarch64_simd)] mod test_kernel_aarch64 { use super::test_complex_packed_kernel; use super::super::*; #[cfg(feature = "std")] use std::println; macro_rules! test_arch_kernels { ($($feature_name:tt, $name:ident, $kernel_ty:ty),*) => { $( #[test] fn $name() { if is_aarch64_feature_detected_!($feature_name) { test_complex_packed_kernel::<$kernel_ty, _, TReal>(stringify!($name)); } else { #[cfg(feature = "std")] println!("Skipping, host does not have feature: {:?}", $feature_name); } } )* } } test_arch_kernels! { "neon", neon, KernelNeon } } #[cfg(any(target_arch="x86", target_arch="x86_64"))] mod test_arch_kernels { use super::test_complex_packed_kernel; use super::super::*; #[cfg(feature = "std")] use std::println; macro_rules! test_arch_kernels_x86 { ($($feature_name:tt, $name:ident, $kernel_ty:ty),*) => { $( #[test] fn $name() { if is_x86_feature_detected_!($feature_name) { test_complex_packed_kernel::<$kernel_ty, _, TReal>(stringify!($name)); } else { #[cfg(feature = "std")] println!("Skipping, host does not have feature: {:?}", $feature_name); } } )* } } test_arch_kernels_x86! { "fma", fma, KernelFma, "avx2", avx2, KernelAvx2 } } } matrixmultiply-0.3.9/src/constparse.rs000064400000000000000000000045561046102023000162710ustar 00000000000000// Copyright (c) 2021 DutchGhost // Copyright (c) 2021 matrixmultiply authors // // Incorporpated in matrixmultiply under these terms, see main license files. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. 
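// Editorial overview (not part of the original file): this module provides a
// `const fn` decimal parser for `usize`, used by the `conf_env_or_default!`
// macro in archparam.rs so that blocking parameters can be overridden through
// environment variables at compile time. Illustrative use, mirroring the test
// at the end of this file (the constant name `KC` here is hypothetical):
//
//     const KC: usize = parse_unwarp("256"); // evaluates to 256 in const context
//
// A non-numeric input makes `parse_unwarp` index out of bounds during const
// evaluation, which turns the bad environment value into a compile error.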
#[derive(Clone, Copy)] pub(crate) enum ParseIntError { InvalidDigit, } const fn parse_byte(b: u8, pow10: usize) -> Result { let r = b.wrapping_sub(48); if r > 9 { Err(ParseIntError::InvalidDigit) } else { Ok((r as usize) * pow10) } } pub(crate) const POW10: [usize; 20] = { let mut array = [0; 20]; let mut pow10 = 1usize; let mut index = 20; loop { index -= 1; array[index] = pow10; if index == 0 { break } let (new_power, overflow) = pow10.overflowing_mul(10); pow10 = new_power; if overflow { break; } } array }; /// Parse the input to integer; or otherwise cause /// a const error, an "unwarp" in space and time. pub(crate) const fn parse_unwarp(b: &str) -> usize { match parse(b) { Ok(t) => t, res @ Err(_) => { [0, /* const error: failed to parse environment variable */][res.is_err() as usize] } } } /// Parse the input to usize pub(crate) const fn parse(b: &str) -> Result { let bytes = b.as_bytes(); let mut result: usize = 0; let len = bytes.len(); // Start at the correct index of the table, // (skip the power's that are too large) let mut index_const_table = POW10.len().wrapping_sub(len); let mut index = 0; while index < b.len() { let a = bytes[index]; let p = POW10[index_const_table]; let r = match parse_byte(a, p) { Err(e) => return Err(e), Ok(d) => d, }; result = result.wrapping_add(r); index += 1; index_const_table += 1; } Ok(result) } #[test] fn test_parse() { for i in 0..500 { assert_eq!(parse_unwarp(&i.to_string()), i); } for bits in 0..std::mem::size_of::() * 8 { let i = (1 << bits) - 1; assert_eq!(parse_unwarp(&i.to_string()), i); } } matrixmultiply-0.3.9/src/debugmacros.rs000064400000000000000000000012261046102023000163720ustar 00000000000000// Copyright 2016 - 2018 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. // for debugging -- like println during debugging macro_rules! dprint { ($($t:tt)*) => { debug!(println!($($t)*)) } } #[cfg(feature = "use_debug")] macro_rules! debug { ($e:expr) => { $e; } } #[cfg(not(feature = "use_debug"))] macro_rules! debug { ($e:expr) => { } } matrixmultiply-0.3.9/src/dgemm_kernel.rs000064400000000000000000001066531046102023000165420ustar 00000000000000// Copyright 2016 - 2023 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use crate::kernel::GemmKernel; use crate::kernel::GemmSelect; #[allow(unused)] use crate::kernel::{U4, U8}; use crate::archparam; #[cfg(target_arch="x86")] use core::arch::x86::*; #[cfg(target_arch="x86_64")] use core::arch::x86_64::*; #[cfg(any(target_arch="x86", target_arch="x86_64"))] use crate::x86::{FusedMulAdd, AvxMulAdd, DMultiplyAdd}; #[cfg(any(target_arch="x86", target_arch="x86_64"))] struct KernelAvx; #[cfg(any(target_arch="x86", target_arch="x86_64"))] struct KernelFmaAvx2; #[cfg(any(target_arch="x86", target_arch="x86_64"))] struct KernelFma; #[cfg(any(target_arch="x86", target_arch="x86_64"))] struct KernelSse2; #[cfg(target_arch="aarch64")] #[cfg(has_aarch64_simd)] struct KernelNeon; struct KernelFallback; type T = f64; /// Detect which implementation to use and select it using the selector's /// .select(Kernel) method. /// /// This function is called one or more times during a whole program's /// execution, it may be called for each gemm kernel invocation or fewer times. 
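///
/// (Editorial summary of the body below) Preference order on x86/x86_64 is
/// FMA with the AVX2 packing specialization, then FMA alone, then AVX, then
/// SSE2; on AArch64 the NEON kernel is used when available; otherwise the
/// portable fallback kernel is selected.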
#[inline] pub(crate) fn detect(selector: G) where G: GemmSelect { // dispatch to specific compiled versions #[cfg(any(target_arch="x86", target_arch="x86_64"))] { if is_x86_feature_detected_!("fma") { if is_x86_feature_detected_!("avx2") { return selector.select(KernelFmaAvx2); } return selector.select(KernelFma); } else if is_x86_feature_detected_!("avx") { return selector.select(KernelAvx); } else if is_x86_feature_detected_!("sse2") { return selector.select(KernelSse2); } } #[cfg(target_arch="aarch64")] #[cfg(has_aarch64_simd)] { if is_aarch64_feature_detected_!("neon") { return selector.select(KernelNeon); } } return selector.select(KernelFallback); } #[cfg(any(target_arch="x86", target_arch="x86_64"))] macro_rules! loop_m { ($i:ident, $e:expr) => { loop8!($i, $e) }; } #[cfg(any(target_arch="x86", target_arch="x86_64"))] impl GemmKernel for KernelAvx { type Elem = T; type MRTy = U8; type NRTy = U4; #[inline(always)] fn align_to() -> usize { 32 } #[inline(always)] fn always_masked() -> bool { false } #[inline(always)] fn nc() -> usize { archparam::D_NC } #[inline(always)] fn kc() -> usize { archparam::D_KC } #[inline(always)] fn mc() -> usize { archparam::D_MC } #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_avx(k, alpha, a, b, beta, c, rsc, csc) } } #[cfg(any(target_arch="x86", target_arch="x86_64"))] impl GemmKernel for KernelFma { type Elem = T; type MRTy = ::MRTy; type NRTy = ::NRTy; #[inline(always)] fn align_to() -> usize { KernelAvx::align_to() } #[inline(always)] fn always_masked() -> bool { KernelAvx::always_masked() } #[inline(always)] fn nc() -> usize { archparam::D_NC } #[inline(always)] fn kc() -> usize { archparam::D_KC } #[inline(always)] fn mc() -> usize { archparam::D_MC } #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_fma(k, alpha, a, b, beta, c, rsc, csc) } } #[cfg(any(target_arch="x86", target_arch="x86_64"))] impl GemmKernel for KernelFmaAvx2 { type Elem = T; type MRTy = ::MRTy; type NRTy = ::NRTy; #[inline(always)] fn align_to() -> usize { KernelAvx::align_to() } #[inline(always)] fn always_masked() -> bool { KernelAvx::always_masked() } #[inline(always)] fn nc() -> usize { archparam::D_NC } #[inline(always)] fn kc() -> usize { archparam::D_KC } #[inline(always)] fn mc() -> usize { archparam::D_MC } #[inline] unsafe fn pack_mr(kc: usize, mc: usize, pack: &mut [Self::Elem], a: *const Self::Elem, rsa: isize, csa: isize) { // safety: Avx2 is enabled crate::packing::pack_avx2::(kc, mc, pack, a, rsa, csa) } #[inline] unsafe fn pack_nr(kc: usize, mc: usize, pack: &mut [Self::Elem], a: *const Self::Elem, rsa: isize, csa: isize) { // safety: Avx2 is enabled crate::packing::pack_avx2::(kc, mc, pack, a, rsa, csa) } #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_fma(k, alpha, a, b, beta, c, rsc, csc) } } #[cfg(any(target_arch="x86", target_arch="x86_64"))] impl GemmKernel for KernelSse2 { type Elem = T; type MRTy = U4; type NRTy = U4; #[inline(always)] fn align_to() -> usize { 16 } #[inline(always)] fn always_masked() -> bool { true } #[inline(always)] fn nc() -> usize { archparam::D_NC } #[inline(always)] fn kc() -> usize { archparam::D_KC } #[inline(always)] fn mc() -> usize { archparam::D_MC } #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, 
rsc: isize, csc: isize) { kernel_target_sse2(k, alpha, a, b, beta, c, rsc, csc) } } #[cfg(target_arch="aarch64")] #[cfg(has_aarch64_simd)] impl GemmKernel for KernelNeon { type Elem = T; type MRTy = U8; type NRTy = U4; #[inline(always)] fn align_to() -> usize { 32 } #[inline(always)] fn always_masked() -> bool { false } #[inline(always)] fn nc() -> usize { archparam::S_NC } #[inline(always)] fn kc() -> usize { archparam::S_KC } #[inline(always)] fn mc() -> usize { archparam::S_MC } #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_neon(k, alpha, a, b, beta, c, rsc, csc) } } impl GemmKernel for KernelFallback { type Elem = T; type MRTy = U4; type NRTy = U4; #[inline(always)] fn align_to() -> usize { 0 } #[inline(always)] fn always_masked() -> bool { true } #[inline(always)] fn nc() -> usize { archparam::D_NC } #[inline(always)] fn kc() -> usize { archparam::D_KC } #[inline(always)] fn mc() -> usize { archparam::D_MC } #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc) } } // no inline for unmasked kernels #[cfg(any(target_arch="x86", target_arch="x86_64"))] #[target_feature(enable="fma")] unsafe fn kernel_target_fma(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_x86_avx::(k, alpha, a, b, beta, c, rsc, csc) } // no inline for unmasked kernels #[cfg(any(target_arch="x86", target_arch="x86_64"))] #[target_feature(enable="avx")] unsafe fn kernel_target_avx(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_x86_avx::(k, alpha, a, b, beta, c, rsc, csc) } #[inline] #[target_feature(enable="sse2")] #[cfg(any(target_arch="x86", target_arch="x86_64"))] unsafe fn kernel_target_sse2(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc) } #[inline(always)] #[cfg(any(target_arch="x86", target_arch="x86_64"))] unsafe fn kernel_x86_avx(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) where MA: DMultiplyAdd { const MR: usize = KernelAvx::MR; const NR: usize = KernelAvx::NR; debug_assert_ne!(k, 0); let mut ab = [_mm256_setzero_pd(); MR]; let (mut a, mut b) = (a, b); // With MR=8, we load sets of 4 doubles from a let mut a_0123 = _mm256_load_pd(a); let mut a_4567 = _mm256_load_pd(a.add(4)); // With NR=4, we load 4 doubles from b let mut b_0123 = _mm256_load_pd(b); unroll_by_with_last!(4 => k, is_last, { // We need to multiply each element of b with each element of a_0 // and a_1. To do so, we need to generate all possible permutations // for the doubles in b, but without two permutations having the // same double at the same spot. // // So, if we are given the permutations (indices of the doubles // in the packed 4-vector): // // 0 1 2 3 // // Then another valid permutation has to shuffle all elements // around without a single element remaining at the same index // it was before. 
// // A possible set of valid combination then are: // // 0 1 2 3 (the original) // 1 0 3 2 (chosen because of _mm256_shuffle_pd) // 3 2 1 0 (chosen because of _mm256_permute2f128_pd) // 2 3 0 1 (chosen because of _mm256_shuffle_pd) let b_1032 = _mm256_shuffle_pd(b_0123, b_0123, 0b0101); // Both packed 4-vectors are the same, so one could also perform // the selection 0b0000_0001 or 0b0010_0001 or 0b0010_0011. // The confusing part is that of the lower 4 bits and upper 4 bits // only 2 bits are used in each. The same choice could have been // encoded in a nibble (4 bits) total, i.e. 0b1100, had the intrinsics // been defined differently. The highest bit in each nibble controls // zero-ing behaviour though. let b_3210 = _mm256_permute2f128_pd(b_1032, b_1032, 0b0011); let b_2301 = _mm256_shuffle_pd(b_3210, b_3210, 0b0101); // The ideal distribution of a_i b_j pairs in the resulting panel of // c in order to have the matching products / sums of products in the // right places would look like this after the first iteration: // // ab_0 || a0 b0 | a0 b1 | a0 b2 | a0 b3 // ab_1 || a1 b0 | a1 b1 | a1 b2 | a1 b3 // ab_2 || a2 b0 | a2 b1 | a2 b2 | a2 b3 // ab_3 || a3 b0 | a3 b1 | a3 b2 | a3 b3 // || ----------------------------- // ab_4 || a4 b0 | a4 b1 | a4 b2 | a4 b3 // ab_5 || a5 b0 | a5 b1 | a5 b2 | a5 b3 // ab_6 || a6 b0 | a6 b1 | a6 b2 | a6 b3 // ab_7 || a7 b0 | a7 b1 | a7 b2 | a7 b3 // // As this is not possible / would require too many extra variables // and thus operations, we get the following configuration, and thus // have to be smart about putting the correct values into their // respective places at the end. // // ab_0 || a0 b0 | a1 b1 | a2 b2 | a3 b3 // ab_1 || a0 b1 | a1 b0 | a2 b3 | a3 b2 // ab_2 || a0 b2 | a1 b3 | a2 b0 | a3 b1 // ab_3 || a0 b3 | a1 b2 | a2 b1 | a3 b0 // || ----------------------------- // ab_4 || a4 b0 | a5 b1 | a6 b2 | a7 b3 // ab_5 || a4 b1 | a5 b0 | a6 b3 | a7 b2 // ab_6 || a4 b2 | a5 b3 | a6 b0 | a7 b1 // ab_7 || a4 b3 | a5 b2 | a6 b1 | a7 b0 // Add and multiply in one go ab[0] = MA::multiply_add(a_0123, b_0123, ab[0]); ab[1] = MA::multiply_add(a_0123, b_1032, ab[1]); ab[2] = MA::multiply_add(a_0123, b_2301, ab[2]); ab[3] = MA::multiply_add(a_0123, b_3210, ab[3]); ab[4] = MA::multiply_add(a_4567, b_0123, ab[4]); ab[5] = MA::multiply_add(a_4567, b_1032, ab[5]); ab[6] = MA::multiply_add(a_4567, b_2301, ab[6]); ab[7] = MA::multiply_add(a_4567, b_3210, ab[7]); if !is_last { a = a.add(MR); b = b.add(NR); a_0123 = _mm256_load_pd(a); a_4567 = _mm256_load_pd(a.add(4)); b_0123 = _mm256_load_pd(b); } }); // Our products/sums are currently stored according to the // table below. Each row corresponds to one packed simd // 4-vector. // // ab_0 || a0 b0 | a1 b1 | a2 b2 | a3 b3 // ab_1 || a0 b1 | a1 b0 | a2 b3 | a3 b2 // ab_2 || a0 b2 | a1 b3 | a2 b0 | a3 b1 // ab_3 || a0 b3 | a1 b2 | a2 b1 | a3 b0 // || ----------------------------- // ab_4 || a4 b0 | a5 b1 | a6 b2 | a7 b3 // ab_5 || a4 b1 | a5 b0 | a6 b3 | a7 b2 // ab_6 || a4 b2 | a5 b3 | a6 b0 | a7 b1 // ab_7 || a4 b3 | a5 b2 | a6 b1 | a7 b0 // // This is the final results, where indices are stored // in their proper location. 
// // || a0 b0 | a0 b1 | a0 b2 | a0 b3 // || a1 b0 | a1 b1 | a1 b2 | a1 b3 // || a2 b0 | a2 b1 | a2 b2 | a2 b3 // || a3 b0 | a3 b1 | a3 b2 | a3 b3 // || ----------------------------- // || a4 b0 | a4 b1 | a4 b2 | a4 b3 // || a5 b0 | a5 b1 | a5 b2 | a5 b3 // || a6 b0 | a6 b1 | a6 b2 | a6 b3 // || a7 b0 | a7 b1 | a7 b2 | a7 b3 // // Given the simd intrinsics available through avx, we have two // ways of achieving this format. By either: // // a) Creating packed 4-vectors of rows, or // b) creating packed 4-vectors of columns. // // ** We will use option a) because it has slightly cheaper throughput // characteristics (see below). // // # a) Creating packed 4-vectors of columns // // To create packed 4-vectors of columns, we make us of // _mm256_blend_pd operations, followed by _mm256_permute2f128_pd. // // The first operation has latency 1 (all architectures), and 0.33 // throughput (Skylake, Broadwell, Haswell), or 0.5 (Ivy Bridge). // // The second operation has latency 3 (on Skylake, Broadwell, Haswell), // or latency 2 (on Ivy Brdige), and throughput 1 (all architectures). // // We start by applying _mm256_blend_pd on adjacent rows: // // Step 0.0 // a0 b0 | a1 b1 | a2 b2 | a3 b3 // a0 b1 | a1 b0 | a2 b3 | a3 b2 // => _mm256_blend_pd with 0b1010 // a0 b0 | a1 b0 | a2 b2 | a3 b2 (only columns 0 and 2) // // Step 0.1 // a0 b1 | a1 b0 | a2 b3 | a3 b2 (flipped the order) // a0 b0 | a1 b1 | a2 b2 | a3 b3 // => _mm256_blend_pd with 0b1010 // a0 b1 | a1 b1 | a2 b3 | a3 b3 (only columns 1 and 3) // // Step 0.2 // a0 b2 | a1 b3 | a2 b0 | a3 b1 // a0 b3 | a1 b2 | a2 b1 | a3 b0 // => _mm256_blend_pd with 0b1010 // a0 b2 | a1 b2 | a2 b0 | a3 b0 (only columns 0 and 2) // // Step 0.3 // a0 b3 | a1 b2 | a2 b1 | a3 b0 (flipped the order) // a0 b2 | a1 b3 | a2 b0 | a3 b1 // => _mm256_blend_pd with 0b1010 // a0 b3 | a1 b3 | a2 b1 | a3 b1 (only columns 1 and 3) // // Step 1.0 (combining steps 0.0 and 0.2) // // a0 b0 | a1 b0 | a2 b2 | a3 b2 // a0 b2 | a1 b2 | a2 b0 | a3 b0 // => _mm256_permute2f128_pd with 0x30 = 0b0011_0000 // a0 b0 | a1 b0 | a2 b0 | a3 b0 // // Step 1.1 (combining steps 0.0 and 0.2) // // a0 b0 | a1 b0 | a2 b2 | a3 b2 // a0 b2 | a1 b2 | a2 b0 | a3 b0 // => _mm256_permute2f128_pd with 0x12 = 0b0001_0010 // a0 b2 | a1 b2 | a2 b2 | a3 b2 // // Step 1.2 (combining steps 0.1 and 0.3) // a0 b1 | a1 b1 | a2 b3 | a3 b3 // a0 b3 | a1 b3 | a2 b1 | a3 b1 // => _mm256_permute2f128_pd with 0x30 = 0b0011_0000 // a0 b1 | a1 b1 | a2 b1 | a3 b1 // // Step 1.3 (combining steps 0.1 and 0.3) // a0 b1 | a1 b1 | a2 b3 | a3 b3 // a0 b3 | a1 b3 | a2 b1 | a3 b1 // => _mm256_permute2f128_pd with 0x12 = 0b0001_0010 // a0 b3 | a1 b3 | a2 b3 | a3 b3 // // # b) Creating packed 4-vectors of rows // // To create packed 4-vectors of rows, we make use of // _mm256_shuffle_pd operations followed by _mm256_permute2f128_pd. // // The first operation has a latency 1, throughput 1 (on architectures // Skylake, Broadwell, Haswell, and Ivy Bridge). // // The second operation has latency 3 (on Skylake, Broadwell, Haswell), // or latency 2 (on Ivy Brdige), and throughput 1 (all architectures). 
// // To achieve this, we can execute a _mm256_shuffle_pd on // rows 0 and 1 stored in ab_0 and ab_1: // // Step 0.0 // a0 b0 | a1 b1 | a2 b2 | a3 b3 // a0 b1 | a1 b0 | a2 b3 | a3 b2 // => _mm256_shuffle_pd with 0000 // a0 b0 | a0 b1 | a2 b2 | a2 b3 (only rows 0 and 2) // // Step 0.1 // a0 b1 | a1 b0 | a2 b3 | a3 b2 (flipped the order) // a0 b0 | a1 b1 | a2 b2 | a3 b3 // => _mm256_shuffle_pd with 1111 // a1 b0 | a1 b1 | a3 b2 | a3 b3 (only rows 1 and 3) // // Next, we perform the same operation on the other two rows: // // Step 0.2 // a0 b2 | a1 b3 | a2 b0 | a3 b1 // a0 b3 | a1 b2 | a2 b1 | a3 b0 // => _mm256_shuffle_pd with 0000 // a0 b2 | a0 b3 | a2 b0 | a2 b1 (only rows 0 and 2) // // Step 0.3 // a0 b3 | a1 b2 | a2 b1 | a3 b0 // a0 b2 | a1 b3 | a2 b0 | a3 b1 // => _mm256_shuffle_pd with 1111 // a1 b2 | a1 b3 | a3 b0 | a3 b1 (only rows 1 and 3) // // Next, we can apply _mm256_permute2f128_pd to select the // correct columns on the matching rows: // // Step 1.0 (combining Steps 0.0 and 0.2): // a0 b0 | a0 b1 | a2 b2 | a2 b3 // a0 b2 | a0 b3 | a2 b0 | a2 b1 // => _mm256_permute_2f128_pd with 0x20 = 0b0010_0000 // a0 b0 | a0 b1 | a0 b2 | a0 b3 // // Step 1.1 (combining Steps 0.0 and 0.2): // a0 b0 | a0 b1 | a2 b2 | a2 b3 // a0 b2 | a0 b3 | a2 b0 | a2 b1 // => _mm256_permute_2f128_pd with 0x03 = 0b0001_0011 // a2 b0 | a2 b1 | a2 b2 | a2 b3 // // Step 1.2 (combining Steps 0.1 and 0.3): // a1 b0 | a1 b1 | a3 b2 | a3 b3 // a1 b2 | a1 b3 | a3 b0 | a3 b1 // => _mm256_permute_2f128_pd with 0x20 = 0b0010_0000 // a1 b0 | a1 b1 | a1 b2 | a1 b3 // // Step 1.3 (combining Steps 0.1 and 0.3): // a1 b0 | a1 b1 | a3 b2 | a3 b3 // a1 b2 | a1 b3 | a3 b0 | a3 b1 // => _mm256_permute_2f128_pd with 0x03 = 0b0001_0011 // a3 b0 | a3 b1 | a3 b2 | a3 b3 // We use scheme a) as the default case, i.e. if c is column-major, rsc==1, or if // c is of general form. Row-major c matrices, csc==1, are treated using schema b). 
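    // Editorial note on the selector bytes used below (this is the general
    // semantics of the intrinsic, not something specific to this file): for
    // _mm256_permute2f128_pd(x, y, imm8), bits 1:0 choose the low 128-bit lane
    // of the result and bits 5:4 choose the high lane, with 0 = low(x),
    // 1 = high(x), 2 = low(y), 3 = high(y). So, writing results as
    // [low lane, high lane]: 0x20 -> [low(x), low(y)], 0x30 -> [low(x), high(y)],
    // 0x12 -> [low(y), high(x)], 0x13 -> [high(y), high(x)], which is exactly
    // the recombination both schemes rely on.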
if csc == 1 { // Scheme b), step 0.0 // a0 b0 | a1 b1 | a2 b2 | a3 b3 // a0 b1 | a1 b0 | a2 b3 | a3 b2 let a0b0_a0b1_a2b2_a2b3 = _mm256_shuffle_pd(ab[0], ab[1], 0b0000); // Scheme b), step 0.1 // a0 b1 | a1 b0 | a2 b3 | a3 b2 (flipped the order) // a0 b0 | a1 b1 | a2 b2 | a3 b3 let a1b0_a1b1_a3b2_a3b3 = _mm256_shuffle_pd(ab[1], ab[0], 0b1111); // Scheme b), step 0.2 // a0 b2 | a1 b3 | a2 b0 | a3 b1 // a0 b3 | a1 b2 | a2 b1 | a3 b0 let a0b2_a0b3_a2b0_a2b1 = _mm256_shuffle_pd(ab[2], ab[3], 0b0000); // Scheme b), step 0.3 // a0 b3 | a1 b2 | a2 b1 | a3 b0 (flipped the order) // a0 b2 | a1 b3 | a2 b0 | a3 b1 let a1b2_a1b3_a3b0_a3b1 = _mm256_shuffle_pd(ab[3], ab[2], 0b1111); let a4b0_a4b1_a6b2_a6b3 = _mm256_shuffle_pd(ab[4], ab[5], 0b0000); let a5b0_a5b1_a7b2_a7b3 = _mm256_shuffle_pd(ab[5], ab[4], 0b1111); let a4b2_a4b3_a6b0_a6b1 = _mm256_shuffle_pd(ab[6], ab[7], 0b0000); let a5b2_a5b3_a7b0_a7b1 = _mm256_shuffle_pd(ab[7], ab[6], 0b1111); // Next, we can apply _mm256_permute2f128_pd to select the // correct columns on the matching rows: // // Step 1.0 (combining Steps 0.0 and 0.2): // a0 b0 | a0 b1 | a2 b2 | a2 b3 // a0 b2 | a0 b3 | a2 b0 | a2 b1 // => _mm256_permute_2f128_pd with 0x20 = 0b0010_0000 // a0 b0 | a0 b1 | a0 b2 | a0 b3 // // Step 1.1 (combining Steps 0.0 and 0.2): // a0 b0 | a0 b1 | a2 b2 | a2 b3 // a0 b2 | a0 b3 | a2 b0 | a2 b1 // => _mm256_permute_2f128_pd with 0x03 = 0b0001_0011 // a2 b0 | a2 b1 | a2 b2 | a2 b3 // // Step 1.2 (combining Steps 0.1 and 0.3): // a1 b0 | a1 b1 | a3 b2 | a3 b3 // a1 b2 | a1 b3 | a3 b0 | a3 b1 // => _mm256_permute_2f128_pd with 0x20 = 0b0010_0000 // a1 b0 | a1 b1 | a1 b2 | a1 b3 // // Step 1.3 (combining Steps 0.1 and 0.3): // a1 b0 | a1 b1 | a3 b2 | a3 b3 // a1 b2 | a1 b3 | a3 b0 | a3 b1 // => _mm256_permute_2f128_pd with 0x03 = 0b0001_0011 // a3 b0 | a3 b1 | a3 b2 | a3 b3 // Scheme b), step 1.0 let a0b0_a0b1_a0b2_a0b3 = _mm256_permute2f128_pd( a0b0_a0b1_a2b2_a2b3, a0b2_a0b3_a2b0_a2b1, 0x20 ); // Scheme b), step 1.1 let a2b0_a2b1_a2b2_a2b3 = _mm256_permute2f128_pd( a0b0_a0b1_a2b2_a2b3, a0b2_a0b3_a2b0_a2b1, 0x13 ); // Scheme b) step 1.2 let a1b0_a1b1_a1b2_a1b3 = _mm256_permute2f128_pd( a1b0_a1b1_a3b2_a3b3, a1b2_a1b3_a3b0_a3b1, 0x20 ); // Scheme b) step 1.3 let a3b0_a3b1_a3b2_a3b3 = _mm256_permute2f128_pd( a1b0_a1b1_a3b2_a3b3, a1b2_a1b3_a3b0_a3b1, 0x13 ); // As above, but for ab[4..7] let a4b0_a4b1_a4b2_a4b3 = _mm256_permute2f128_pd( a4b0_a4b1_a6b2_a6b3, a4b2_a4b3_a6b0_a6b1, 0x20 ); let a6b0_a6b1_a6b2_a6b3 = _mm256_permute2f128_pd( a4b0_a4b1_a6b2_a6b3, a4b2_a4b3_a6b0_a6b1, 0x13 ); let a5b0_a5b1_a5b2_a5b3 = _mm256_permute2f128_pd( a5b0_a5b1_a7b2_a7b3, a5b2_a5b3_a7b0_a7b1, 0x20 ); let a7b0_a7b1_a7b2_a7b3 = _mm256_permute2f128_pd( a5b0_a5b1_a7b2_a7b3, a5b2_a5b3_a7b0_a7b1, 0x13 ); ab[0] = a0b0_a0b1_a0b2_a0b3; ab[1] = a1b0_a1b1_a1b2_a1b3; ab[2] = a2b0_a2b1_a2b2_a2b3; ab[3] = a3b0_a3b1_a3b2_a3b3; ab[4] = a4b0_a4b1_a4b2_a4b3; ab[5] = a5b0_a5b1_a5b2_a5b3; ab[6] = a6b0_a6b1_a6b2_a6b3; ab[7] = a7b0_a7b1_a7b2_a7b3; // rsc == 1 and general matrix orders } else { // Scheme a), step 0.0 // ab[0] = a0 b0 | a1 b1 | a2 b2 | a3 b3 // ab[1] = a0 b1 | a1 b0 | a2 b3 | a3 b2 let a0b0_a1b0_a2b2_a3b2 = _mm256_blend_pd(ab[0], ab[1], 0b1010); // Scheme a), step 0.1 let a0b1_a1b1_a2b3_a3b3 = _mm256_blend_pd(ab[1], ab[0], 0b1010); // Scheme a), steps 0.2 // ab[2] = a0 b2 | a1 b3 | a2 b0 | a3 b1 // ab[3] = a0 b3 | a1 b2 | a2 b1 | a3 b0 let a0b2_a1b2_a2b0_a3b0 = _mm256_blend_pd(ab[2], ab[3], 0b1010); // Scheme a), steps 0.3 let a0b3_a1b3_a2b1_a3b1 = _mm256_blend_pd(ab[3], ab[2], 
0b1010); // ab[4] = a4 b0 | a5 b1 | a6 b2 | a7 b3 // ab[5] = a4 b1 | a5 b0 | a6 b3 | a7 b2 let a4b0_a5b0_a6b2_a7b2 = _mm256_blend_pd(ab[4], ab[5], 0b1010); let a4b1_a5b1_a6b3_a7b3 = _mm256_blend_pd(ab[5], ab[4], 0b1010); // ab[6] = a0 b2 | a1 b3 | a2 b0 | a3 b1 // ab[7] = a0 b3 | a1 b2 | a2 b1 | a3 b0 let a4b2_a5b2_a6b0_a7b0 = _mm256_blend_pd(ab[6], ab[7], 0b1010); let a4b3_a5b3_a6b1_a7b1 = _mm256_blend_pd(ab[7], ab[6], 0b1010); // Scheme a), step 1.0 let a0b0_a1b0_a2b0_a3b0 = _mm256_permute2f128_pd( a0b0_a1b0_a2b2_a3b2, a0b2_a1b2_a2b0_a3b0, 0x30 ); // Scheme a), step 1.1 let a0b2_a1b2_a2b2_a3b2 = _mm256_permute2f128_pd( a0b0_a1b0_a2b2_a3b2, a0b2_a1b2_a2b0_a3b0, 0x12, ); // Scheme a) step 1.2 let a0b1_a1b1_a2b1_a3b1 = _mm256_permute2f128_pd( a0b1_a1b1_a2b3_a3b3, a0b3_a1b3_a2b1_a3b1, 0x30 ); // Scheme a) step 1.3 let a0b3_a1b3_a2b3_a3b3 = _mm256_permute2f128_pd( a0b1_a1b1_a2b3_a3b3, a0b3_a1b3_a2b1_a3b1, 0x12 ); // As above, but for ab[4..7] let a4b0_a5b0_a6b0_a7b0 = _mm256_permute2f128_pd( a4b0_a5b0_a6b2_a7b2, a4b2_a5b2_a6b0_a7b0, 0x30 ); let a4b2_a5b2_a6b2_a7b2 = _mm256_permute2f128_pd( a4b0_a5b0_a6b2_a7b2, a4b2_a5b2_a6b0_a7b0, 0x12, ); let a4b1_a5b1_a6b1_a7b1 = _mm256_permute2f128_pd( a4b1_a5b1_a6b3_a7b3, a4b3_a5b3_a6b1_a7b1, 0x30 ); let a4b3_a5b3_a6b3_a7b3 = _mm256_permute2f128_pd( a4b1_a5b1_a6b3_a7b3, a4b3_a5b3_a6b1_a7b1, 0x12 ); ab[0] = a0b0_a1b0_a2b0_a3b0; ab[1] = a0b1_a1b1_a2b1_a3b1; ab[2] = a0b2_a1b2_a2b2_a3b2; ab[3] = a0b3_a1b3_a2b3_a3b3; ab[4] = a4b0_a5b0_a6b0_a7b0; ab[5] = a4b1_a5b1_a6b1_a7b1; ab[6] = a4b2_a5b2_a6b2_a7b2; ab[7] = a4b3_a5b3_a6b3_a7b3; } // Compute α (A B) // Compute here if we don't have fma, else pick up α further down let alphav = _mm256_broadcast_sd(&alpha); if !MA::IS_FUSED { loop_m!(i, ab[i] = _mm256_mul_pd(alphav, ab[i])); } macro_rules! c { ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize)); } // C ← α A B + β C let mut cv = [_mm256_setzero_pd(); MR]; if beta != 0. { // Read C if rsc == 1 { loop4!(i, cv[i] = _mm256_loadu_pd(c![0, i])); loop4!(i, cv[i + 4] = _mm256_loadu_pd(c![4, i])); } else if csc == 1 { loop4!(i, cv[i] = _mm256_loadu_pd(c![i, 0])); loop4!(i, cv[i+4] = _mm256_loadu_pd(c![i+4, 0])); } else { loop4!(i, cv[i] = _mm256_setr_pd( *c![0, i], *c![1, i], *c![2, i], *c![3, i] )); loop4!(i, cv[i + 4] = _mm256_setr_pd( *c![4, i], *c![5, i], *c![6, i], *c![7, i] )); } // Compute β C // _mm256_set1_pd and _mm256_broadcast_sd seem to achieve the same thing. let beta_v = _mm256_broadcast_sd(&beta); loop_m!(i, cv[i] = _mm256_mul_pd(cv[i], beta_v)); } // Compute (α A B) + (β C) if !MA::IS_FUSED { loop_m!(i, cv[i] = _mm256_add_pd(cv[i], ab[i])); } else { loop_m!(i, cv[i] = MA::multiply_add(alphav, ab[i], cv[i])); } if rsc == 1 { loop4!(i, _mm256_storeu_pd(c![0, i], cv[i])); loop4!(i, _mm256_storeu_pd(c![4, i], cv[i + 4])); } else if csc == 1 { loop4!(i, _mm256_storeu_pd(c![i, 0], cv[i])); loop4!(i, _mm256_storeu_pd(c![i+4, 0], cv[i + 4])); } else { // Permute to bring each element in the vector to the front and store loop4!(i, { // E.g. c_0_lo = a0b0 | a1b0 let c_lo: __m128d = _mm256_extractf128_pd(cv[i], 0); // E.g. c_0_hi = a2b0 | a3b0 let c_hi: __m128d = _mm256_extractf128_pd(cv[i], 1); _mm_storel_pd(c![0, i], c_lo); _mm_storeh_pd(c![1, i], c_lo); _mm_storel_pd(c![2, i], c_hi); _mm_storeh_pd(c![3, i], c_hi); // E.g. c_0_lo = a0b0 | a1b0 let c_lo: __m128d = _mm256_extractf128_pd(cv[i+4], 0); // E.g. 
c_0_hi = a2b0 | a3b0 let c_hi: __m128d = _mm256_extractf128_pd(cv[i+4], 1); _mm_storel_pd(c![4, i], c_lo); _mm_storeh_pd(c![5, i], c_lo); _mm_storel_pd(c![6, i], c_hi); _mm_storeh_pd(c![7, i], c_hi); }); } } #[cfg(target_arch="aarch64")] #[cfg(has_aarch64_simd)] #[target_feature(enable="neon")] unsafe fn kernel_target_neon(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { use core::arch::aarch64::*; const MR: usize = KernelNeon::MR; const NR: usize = KernelNeon::NR; let (mut a, mut b) = (a, b); // Kernel 8 x 4 (a x b) // Four quadrants of 4 x 2 let mut ab11 = [vmovq_n_f64(0.); 4]; let mut ab12 = [vmovq_n_f64(0.); 4]; let mut ab21 = [vmovq_n_f64(0.); 4]; let mut ab22 = [vmovq_n_f64(0.); 4]; // Compute // ab_ij = a_i * b_j for all i, j macro_rules! ab_ij_equals_ai_bj_12 { ($dest:ident, $av:expr, $bv:expr) => { $dest[0] = vfmaq_laneq_f64($dest[0], $bv, $av, 0); $dest[1] = vfmaq_laneq_f64($dest[1], $bv, $av, 1); } } macro_rules! ab_ij_equals_ai_bj_23 { ($dest:ident, $av:expr, $bv:expr) => { $dest[2] = vfmaq_laneq_f64($dest[2], $bv, $av, 0); $dest[3] = vfmaq_laneq_f64($dest[3], $bv, $av, 1); } } for _ in 0..k { let b1 = vld1q_f64(b); let b2 = vld1q_f64(b.add(2)); let a1 = vld1q_f64(a); let a2 = vld1q_f64(a.add(2)); ab_ij_equals_ai_bj_12!(ab11, a1, b1); ab_ij_equals_ai_bj_23!(ab11, a2, b1); ab_ij_equals_ai_bj_12!(ab12, a1, b2); ab_ij_equals_ai_bj_23!(ab12, a2, b2); let a3 = vld1q_f64(a.add(4)); let a4 = vld1q_f64(a.add(6)); ab_ij_equals_ai_bj_12!(ab21, a3, b1); ab_ij_equals_ai_bj_23!(ab21, a4, b1); ab_ij_equals_ai_bj_12!(ab22, a3, b2); ab_ij_equals_ai_bj_23!(ab22, a4, b2); a = a.add(MR); b = b.add(NR); } macro_rules! c { ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize)); } // ab *= alpha loop4!(i, ab11[i] = vmulq_n_f64(ab11[i], alpha)); loop4!(i, ab12[i] = vmulq_n_f64(ab12[i], alpha)); loop4!(i, ab21[i] = vmulq_n_f64(ab21[i], alpha)); loop4!(i, ab22[i] = vmulq_n_f64(ab22[i], alpha)); // load one float64x2_t from two pointers macro_rules! loadq_from_pointers { ($p0:expr, $p1:expr) => ( { let v = vld1q_dup_f64($p0); let v = vld1q_lane_f64($p1, v, 1); v } ); } if beta != 0. 
{ // load existing value in C let mut c11 = [vmovq_n_f64(0.); 4]; let mut c12 = [vmovq_n_f64(0.); 4]; let mut c21 = [vmovq_n_f64(0.); 4]; let mut c22 = [vmovq_n_f64(0.); 4]; if csc == 1 { loop4!(i, c11[i] = vld1q_f64(c![i + 0, 0])); loop4!(i, c12[i] = vld1q_f64(c![i + 0, 2])); loop4!(i, c21[i] = vld1q_f64(c![i + 4, 0])); loop4!(i, c22[i] = vld1q_f64(c![i + 4, 2])); } else { loop4!(i, c11[i] = loadq_from_pointers!(c![i + 0, 0], c![i + 0, 1])); loop4!(i, c12[i] = loadq_from_pointers!(c![i + 0, 2], c![i + 0, 3])); loop4!(i, c21[i] = loadq_from_pointers!(c![i + 4, 0], c![i + 4, 1])); loop4!(i, c22[i] = loadq_from_pointers!(c![i + 4, 2], c![i + 4, 3])); } let betav = vmovq_n_f64(beta); // ab += β C loop4!(i, ab11[i] = vfmaq_f64(ab11[i], c11[i], betav)); loop4!(i, ab12[i] = vfmaq_f64(ab12[i], c12[i], betav)); loop4!(i, ab21[i] = vfmaq_f64(ab21[i], c21[i], betav)); loop4!(i, ab22[i] = vfmaq_f64(ab22[i], c22[i], betav)); } // c <- ab // which is in full // C <- α A B (+ β C) if csc == 1 { loop4!(i, vst1q_f64(c![i + 0, 0], ab11[i])); loop4!(i, vst1q_f64(c![i + 0, 2], ab12[i])); loop4!(i, vst1q_f64(c![i + 4, 0], ab21[i])); loop4!(i, vst1q_f64(c![i + 4, 2], ab22[i])); } else { loop4!(i, vst1q_lane_f64(c![i + 0, 0], ab11[i], 0)); loop4!(i, vst1q_lane_f64(c![i + 0, 1], ab11[i], 1)); loop4!(i, vst1q_lane_f64(c![i + 0, 2], ab12[i], 0)); loop4!(i, vst1q_lane_f64(c![i + 0, 3], ab12[i], 1)); loop4!(i, vst1q_lane_f64(c![i + 4, 0], ab21[i], 0)); loop4!(i, vst1q_lane_f64(c![i + 4, 1], ab21[i], 1)); loop4!(i, vst1q_lane_f64(c![i + 4, 2], ab22[i], 0)); loop4!(i, vst1q_lane_f64(c![i + 4, 3], ab22[i], 1)); } } #[inline] unsafe fn kernel_fallback_impl(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { const MR: usize = KernelFallback::MR; const NR: usize = KernelFallback::NR; let mut ab: [[T; NR]; MR] = [[0.; NR]; MR]; let mut a = a; let mut b = b; debug_assert_eq!(beta, 0., "Beta must be 0 or is not masked"); // Compute matrix multiplication into ab[i][j] unroll_by!(4 => k, { loop4!(i, loop4!(j, ab[i][j] += at(a, i) * at(b, j))); a = a.offset(MR as isize); b = b.offset(NR as isize); }); macro_rules! c { ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize)); } // set C = α A B loop4!(j, loop4!(i, *c![i, j] = alpha * ab[i][j])); } #[inline(always)] unsafe fn at(ptr: *const T, i: usize) -> T { *ptr.offset(i as isize) } #[cfg(test)] mod tests { use super::*; use crate::kernel::test::test_a_kernel; #[test] fn test_kernel_fallback_impl() { test_a_kernel::("kernel"); } #[cfg(any(target_arch="x86", target_arch="x86_64"))] #[test] fn test_loop_m_n() { let mut m = [[0; 4]; KernelAvx::MR]; loop_m!(i, loop4!(j, m[i][j] += 1)); for arr in &m[..] { for elt in &arr[..] { assert_eq!(*elt, 1); } } } #[cfg(any(target_arch="aarch64"))] #[cfg(has_aarch64_simd)] mod test_kernel_aarch64 { use super::test_a_kernel; use super::super::*; #[cfg(feature = "std")] use std::println; macro_rules! test_arch_kernels_aarch64 { ($($feature_name:tt, $name:ident, $kernel_ty:ty),*) => { $( #[test] fn $name() { if is_aarch64_feature_detected_!($feature_name) { test_a_kernel::<$kernel_ty, _>(stringify!($name)); } else { #[cfg(feature = "std")] println!("Skipping, host does not have feature: {:?}", $feature_name); } } )* } } test_arch_kernels_aarch64! { "neon", neon, KernelNeon } } #[cfg(any(target_arch="x86", target_arch="x86_64"))] mod test_kernel_x86 { use super::test_a_kernel; use super::super::*; #[cfg(feature = "std")] use std::println; macro_rules! 
test_arch_kernels_x86 { ($($feature_name:tt, $name:ident, $kernel_ty:ty),*) => { $( #[test] fn $name() { if is_x86_feature_detected_!($feature_name) { test_a_kernel::<$kernel_ty, _>(stringify!($name)); } else { #[cfg(feature = "std")] println!("Skipping, host does not have feature: {:?}", $feature_name); } } )* } } test_arch_kernels_x86! { "fma", fma, KernelFma, "avx", avx, KernelAvx, "sse2", sse2, KernelSse2 } } } matrixmultiply-0.3.9/src/gemm.rs000064400000000000000000000433401046102023000150270ustar 00000000000000// Copyright 2016 - 2018 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. #[cfg(feature="std")] use core::cell::UnsafeCell; use core::cmp::min; use core::mem::size_of; use core::slice; use crate::aligned_alloc::Alloc; use crate::ptr::Ptr; use crate::util::range_chunk; use crate::util::round_up_to; use crate::kernel::Element; use crate::kernel::GemmKernel; use crate::kernel::GemmSelect; #[cfg(feature = "cgemm")] use crate::kernel::{c32, c64}; use crate::threading::{get_thread_pool, ThreadPoolCtx, LoopThreadConfig}; use crate::sgemm_kernel; use crate::dgemm_kernel; #[cfg(feature = "cgemm")] use crate::cgemm_kernel; #[cfg(feature = "cgemm")] use crate::zgemm_kernel; use rawpointer::PointerExt; /// General matrix multiplication (f32) /// /// C ← α A B + β C /// /// + m, k, n: dimensions /// + a, b, c: pointer to the first element in the matrix /// + A: m by k matrix /// + B: k by n matrix /// + C: m by n matrix /// + rsx: row stride of *x* /// + csx: col stride of *x* /// /// Strides for A and B may be arbitrary. Strides for C must not result in /// elements that alias each other, for example they can not be zero. /// /// If β is zero, then C does not need to be initialized. pub unsafe fn sgemm( m: usize, k: usize, n: usize, alpha: f32, a: *const f32, rsa: isize, csa: isize, b: *const f32, rsb: isize, csb: isize, beta: f32, c: *mut f32, rsc: isize, csc: isize) { sgemm_kernel::detect(GemmParameters { m, k, n, alpha, a, rsa, csa, b, rsb, csb, beta, c, rsc, csc}) } /// General matrix multiplication (f64) /// /// C ← α A B + β C /// /// + m, k, n: dimensions /// + a, b, c: pointer to the first element in the matrix /// + A: m by k matrix /// + B: k by n matrix /// + C: m by n matrix /// + rsx: row stride of *x* /// + csx: col stride of *x* /// /// Strides for A and B may be arbitrary. Strides for C must not result in /// elements that alias each other, for example they can not be zero. /// /// If β is zero, then C does not need to be initialized. pub unsafe fn dgemm( m: usize, k: usize, n: usize, alpha: f64, a: *const f64, rsa: isize, csa: isize, b: *const f64, rsb: isize, csb: isize, beta: f64, c: *mut f64, rsc: isize, csc: isize) { dgemm_kernel::detect(GemmParameters { m, k, n, alpha, a, rsa, csa, b, rsb, csb, beta, c, rsc, csc}) } /// cgemm/zgemm per-operand options /// /// TBD. #[cfg(feature = "cgemm")] #[non_exhaustive] #[derive(Copy, Clone, Debug)] pub enum CGemmOption { /// Standard Standard, } #[cfg(feature = "cgemm")] /// General matrix multiplication (complex f32) /// /// C ← α A B + β C /// /// + m, k, n: dimensions /// + a, b, c: pointer to the first element in the matrix /// + A: m by k matrix /// + B: k by n matrix /// + C: m by n matrix /// + rsx: row stride of *x* /// + csx: col stride of *x* /// /// Strides for A and B may be arbitrary. 
Strides for C must not result in /// elements that alias each other, for example they can not be zero. /// /// If β is zero, then C does not need to be initialized. /// /// Requires crate feature `"cgemm"` pub unsafe fn cgemm( flaga: CGemmOption, flagb: CGemmOption, m: usize, k: usize, n: usize, alpha: c32, a: *const c32, rsa: isize, csa: isize, b: *const c32, rsb: isize, csb: isize, beta: c32, c: *mut c32, rsc: isize, csc: isize) { let _ = (flaga, flagb); cgemm_kernel::detect(GemmParameters { m, k, n, alpha, a, rsa, csa, b, rsb, csb, beta, c, rsc, csc}) } #[cfg(feature = "cgemm")] /// General matrix multiplication (complex f64) /// /// C ← α A B + β C /// /// + m, k, n: dimensions /// + a, b, c: pointer to the first element in the matrix /// + A: m by k matrix /// + B: k by n matrix /// + C: m by n matrix /// + rsx: row stride of *x* /// + csx: col stride of *x* /// /// Strides for A and B may be arbitrary. Strides for C must not result in /// elements that alias each other, for example they can not be zero. /// /// If β is zero, then C does not need to be initialized. /// /// Requires crate feature `"cgemm"` pub unsafe fn zgemm( flaga: CGemmOption, flagb: CGemmOption, m: usize, k: usize, n: usize, alpha: c64, a: *const c64, rsa: isize, csa: isize, b: *const c64, rsb: isize, csb: isize, beta: c64, c: *mut c64, rsc: isize, csc: isize) { let _ = (flaga, flagb); zgemm_kernel::detect(GemmParameters { m, k, n, alpha, a, rsa, csa, b, rsb, csb, beta, c, rsc, csc}) } struct GemmParameters { // Parameters grouped logically in rows m: usize, k: usize, n: usize, alpha: T, a: *const T, rsa: isize, csa: isize, beta: T, b: *const T, rsb: isize, csb: isize, c: *mut T, rsc: isize, csc: isize, } impl GemmSelect for GemmParameters { fn select(self, _kernel: K) where K: GemmKernel, T: Element, { // This is where we enter with the configuration specific kernel // We could cache kernel specific function pointers here, if we // needed to support more constly configuration detection. let GemmParameters { m, k, n, alpha, a, rsa, csa, b, rsb, csb, beta, c, rsc, csc} = self; unsafe { gemm_loop::( m, k, n, alpha, a, rsa, csa, b, rsb, csb, beta, c, rsc, csc) } } } /// Ensure that GemmKernel parameters are supported /// (alignment, microkernel size). /// /// This function is optimized out for a supported configuration. #[inline(always)] fn ensure_kernel_params() where K: GemmKernel { let mr = K::MR; let nr = K::NR; // These are current limitations, // can change if corresponding code in gemm_loop is updated. assert!(mr > 0 && mr <= 8); assert!(nr > 0 && nr <= 8); assert!(mr * nr * size_of::() <= 8 * 4 * 8); assert!(K::align_to() <= 32); // one row/col of the kernel is limiting the max align we can provide let max_align = size_of::() * min(mr, nr); assert!(K::align_to() <= max_align); assert!(K::MR <= K::mc()); assert!(K::mc() <= K::kc()); assert!(K::kc() <= K::nc()); assert!(K::nc() <= 65536); } /// Implement matrix multiply using packed buffers and a microkernel /// strategy, the type parameter `K` is the gemm microkernel. // no inline is best for the default case, where we support many K per // gemm entry point. 
FIXME: make this conditional on feature detection #[inline(never)] unsafe fn gemm_loop( m: usize, k: usize, n: usize, alpha: K::Elem, a: *const K::Elem, rsa: isize, csa: isize, b: *const K::Elem, rsb: isize, csb: isize, beta: K::Elem, c: *mut K::Elem, rsc: isize, csc: isize) where K: GemmKernel { debug_assert!(m <= 1 || n == 0 || rsc != 0); debug_assert!(m == 0 || n <= 1 || csc != 0); // if A or B have no elements, compute C ← βC and return if m == 0 || k == 0 || n == 0 { return c_to_beta_c(m, n, beta, c, rsc, csc); } let knc = K::nc(); let kkc = K::kc(); let kmc = K::mc(); ensure_kernel_params::(); let a = Ptr(a); let b = Ptr(b); let c = Ptr(c); let (nthreads, tp) = get_thread_pool(); let thread_config = LoopThreadConfig::new::(m, k, n, nthreads); let nap = thread_config.num_pack_a(); let (mut packing_buffer, ap_size, bp_size) = make_packing_buffer::(m, k, n, nap); let app = Ptr(packing_buffer.ptr_mut()); let bpp = app.add(ap_size * nap); // LOOP 5: split n into nc parts (B, C) for (l5, nc) in range_chunk(n, knc) { dprint!("LOOP 5, {}, nc={}", l5, nc); let b = b.stride_offset(csb, knc * l5); let c = c.stride_offset(csc, knc * l5); // LOOP 4: split k in kc parts (A, B) // This particular loop can't be parallelized because the // C chunk (writable) is shared between iterations. for (l4, kc) in range_chunk(k, kkc) { dprint!("LOOP 4, {}, kc={}", l4, kc); let b = b.stride_offset(rsb, kkc * l4); let a = a.stride_offset(csa, kkc * l4); // Pack B -> B~ K::pack_nr(kc, nc, slice::from_raw_parts_mut(bpp.ptr(), bp_size), b.ptr(), csb, rsb); // First time writing to C, use user's `beta`, else accumulate let betap = if l4 == 0 { beta } else { <_>::one() }; // LOOP 3: split m into mc parts (A, C) range_chunk(m, kmc) .parallel(thread_config.loop3, tp) .thread_local(move |i, _nt| { // a packing buffer A~ per thread debug_assert!(i < nap); app.add(ap_size * i) }) .for_each(move |tp, &mut app, l3, mc| { dprint!("LOOP 3, {}, mc={}", l3, mc); let a = a.stride_offset(rsa, kmc * l3); let c = c.stride_offset(rsc, kmc * l3); // Pack A -> A~ K::pack_mr(kc, mc, slice::from_raw_parts_mut(app.ptr(), ap_size), a.ptr(), rsa, csa); // LOOP 2 and 1 gemm_packed::(nc, kc, mc, alpha, app.to_const(), bpp.to_const(), betap, c, rsc, csc, tp, thread_config); }); } } } // set up buffer for masked (redirected output of) kernel const KERNEL_MAX_SIZE: usize = 8 * 8 * 4; const KERNEL_MAX_ALIGN: usize = 32; const MASK_BUF_SIZE: usize = KERNEL_MAX_SIZE + KERNEL_MAX_ALIGN - 1; // Pointers into buffer will be manually aligned anyway, due to // bugs we have seen on certain platforms (macos) that look like // we don't get aligned allocations out of TLS - 16- and 8-byte // allocations have been seen, make the minimal align request we can. // Align(32) would not work with TLS for s390x. #[cfg_attr(not(target_os = "macos"), repr(align(16)))] struct MaskBuffer { buffer: [u8; MASK_BUF_SIZE], } // Use thread local if we can; this is faster even in the single threaded case because // it is possible to skip zeroing out the array. #[cfg(feature = "std")] thread_local! 
{ static MASK_BUF: UnsafeCell = UnsafeCell::new(MaskBuffer { buffer: [0; MASK_BUF_SIZE] }); } /// Loops 1 and 2 around the µ-kernel /// /// + app: packed A (A~) /// + bpp: packed B (B~) /// + nc: columns of packed B /// + kc: columns of packed A / rows of packed B /// + mc: rows of packed A unsafe fn gemm_packed(nc: usize, kc: usize, mc: usize, alpha: K::Elem, app: Ptr<*const K::Elem>, bpp: Ptr<*const K::Elem>, beta: K::Elem, c: Ptr<*mut K::Elem>, rsc: isize, csc: isize, tp: ThreadPoolCtx, thread_config: LoopThreadConfig) where K: GemmKernel, { let mr = K::MR; let nr = K::NR; // check for the mask buffer that fits 8 x 8 f32 and 8 x 4 f64 kernels and alignment assert!(mr * nr * size_of::() <= KERNEL_MAX_SIZE && K::align_to() <= KERNEL_MAX_ALIGN); #[cfg(not(feature = "std"))] let mut mask_buf = MaskBuffer { buffer: [0; MASK_BUF_SIZE] }; // LOOP 2: through micropanels in packed `b` (B~, C) range_chunk(nc, nr) .parallel(thread_config.loop2, tp) .thread_local(|_i, _nt| { let mut ptr; #[cfg(not(feature = "std"))] { debug_assert_eq!(_nt, 1); ptr = mask_buf.buffer.as_mut_ptr(); } #[cfg(feature = "std")] { ptr = MASK_BUF.with(|buf| (*buf.get()).buffer.as_mut_ptr()); } ptr = align_ptr(K::align_to(), ptr); slice::from_raw_parts_mut(ptr as *mut K::Elem, KERNEL_MAX_SIZE / size_of::()) }) .for_each(move |_tp, mask_buf, l2, nr_| { let bpp = bpp.stride_offset(1, kc * nr * l2); let c = c.stride_offset(csc, nr * l2); // LOOP 1: through micropanels in packed `a` while `b` is constant (A~, C) for (l1, mr_) in range_chunk(mc, mr) { let app = app.stride_offset(1, kc * mr * l1); let c = c.stride_offset(rsc, mr * l1); // GEMM KERNEL // NOTE: For the rust kernels, it performs better to simply // always use the masked kernel function! if K::always_masked() || nr_ < nr || mr_ < mr { masked_kernel::<_, K>(kc, alpha, app.ptr(), bpp.ptr(), beta, c.ptr(), rsc, csc, mr_, nr_, mask_buf); continue; } else { K::kernel(kc, alpha, app.ptr(), bpp.ptr(), beta, c.ptr(), rsc, csc); } } }); } /// Allocate a vector of uninitialized data to be used for both packing buffers. /// /// + A~ needs be KC x MC /// + B~ needs be KC x NC /// but we can make them smaller if the matrix is smaller than this (just ensure /// we have rounded up to a multiple of the kernel size). /// /// na: Number of buffers to alloc for A /// /// Return packing buffer and size of A~ (The offset to B~ is A~ size times `na`), size of B~. 
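// Illustrative sizing only (hypothetical kernel with MR = 8, NR = 4, and a call
// that stays within the mc/kc/nc limits): for m = 10, k = 100, n = 6 and na = 2,
//
//   apack_size = k * round_up_to(m, MR) = 100 * 16 = 1600 elements
//   bpack_size = k * round_up_to(n, NR) = 100 *  8 =  800 elements
//   nelem      = apack_size * na + bpack_size      = 4000 elements
//
// These numbers are just an example of the formulas used in the function below.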
unsafe fn make_packing_buffer(m: usize, k: usize, n: usize, na: usize) -> (Alloc, usize, usize) where K: GemmKernel, { // max alignment requirement is a multiple of min(MR, NR) * sizeof // because apack_size is a multiple of MR, start of b aligns fine let m = min(m, K::mc()); let k = min(k, K::kc()); let n = min(n, K::nc()); // round up k, n to multiples of mr, nr // round up to multiple of kc debug_assert_ne!(na, 0); debug_assert!(na <= 128); let apack_size = k * round_up_to(m, K::MR); let bpack_size = k * round_up_to(n, K::NR); let nelem = apack_size * na + bpack_size; dprint!("packed nelem={}, apack={}, bpack={}, m={} k={} n={}, na={}", nelem, apack_size, bpack_size, m,k,n, na); (Alloc::new(nelem, K::align_to()), apack_size, bpack_size) } /// offset the ptr forwards to align to a specific byte count /// Safety: align_to must be a power of two and ptr valid for the pointer arithmetic #[inline] unsafe fn align_ptr(mut align_to: usize, mut ptr: *mut T) -> *mut T { // always ensure minimal alignment on macos if cfg!(target_os = "macos") { align_to = Ord::max(align_to, 8); } if align_to != 0 { let cur_align = ptr as usize % align_to; if cur_align != 0 { ptr = ptr.offset(((align_to - cur_align) / size_of::()) as isize); } } ptr } /// Call the GEMM kernel with a "masked" output C. /// /// Simply redirect the MR by NR kernel output to the passed /// in `mask_buf`, and copy the non masked region to the real /// C. /// /// + rows: rows of kernel unmasked /// + cols: cols of kernel unmasked #[inline(never)] unsafe fn masked_kernel(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize, rows: usize, cols: usize, mask_buf: &mut [T]) where K: GemmKernel, T: Element, { // use column major order for `mask_buf` K::kernel(k, alpha, a, b, T::zero(), mask_buf.as_mut_ptr(), 1, K::MR as isize); c_to_masked_ab_beta_c::<_, K>(beta, c, rsc, csc, rows, cols, &*mask_buf); } /// Copy output in `mask_buf` to the actual c matrix /// /// C ← M + βC where M is the `mask_buf` #[inline] unsafe fn c_to_masked_ab_beta_c(beta: T, c: *mut T, rsc: isize, csc: isize, rows: usize, cols: usize, mask_buf: &[T]) where K: GemmKernel, T: Element, { // note: use separate function here with `&T` argument for mask buf, // so that the compiler sees that `c` and `mask_buf` never alias. let mr = K::MR; let nr = K::NR; let mut ab = mask_buf.as_ptr(); for j in 0..nr { for i in 0..mr { if i < rows && j < cols { let cptr = c.stride_offset(rsc, i) .stride_offset(csc, j); if beta.is_zero() { *cptr = *ab; // initialize } else { (*cptr).mul_assign(beta); (*cptr).add_assign(*ab); } } ab.inc(); } } } // Compute just C ← βC #[inline(never)] unsafe fn c_to_beta_c(m: usize, n: usize, beta: T, c: *mut T, rsc: isize, csc: isize) where T: Element { for i in 0..m { for j in 0..n { let cptr = c.stride_offset(rsc, i) .stride_offset(csc, j); if beta.is_zero() { *cptr = T::zero(); // initialize C } else { (*cptr).mul_assign(beta); } } } } matrixmultiply-0.3.9/src/kernel.rs000064400000000000000000000236161046102023000153660ustar 00000000000000// Copyright 2016 - 2021 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. 
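// A scalar illustration (not used by the code) of the update that the gemm
// entry points and kernels compute, C ← α A B + β C: for single elements
// a = 4, b = 5, an existing c = 1, α = 2 and β = 3, the result stored is
// 2 * (4 * 5) + 3 * 1 = 43. When β = 0 the previous contents of C are never read.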
use crate::archparam; use crate::packing::pack; /// General matrix multiply kernel pub(crate) trait GemmKernel { type Elem: Element; /// Kernel rows const MR: usize = Self::MRTy::VALUE; /// Kernel cols const NR: usize = Self::NRTy::VALUE; /// Kernel rows as const num type type MRTy: ConstNum; /// Kernel cols as const num type type NRTy: ConstNum; /// align inputs to this fn align_to() -> usize; /// Whether to always use the masked wrapper around the kernel. fn always_masked() -> bool; // These should ideally be tuned per kernel and per microarch #[inline(always)] fn nc() -> usize { archparam::S_NC } #[inline(always)] fn kc() -> usize { archparam::S_KC } #[inline(always)] fn mc() -> usize { archparam::S_MC } /// Pack matrix A into its packing buffer. /// /// See pack for more documentation. /// /// Override only if the default packing function does not /// use the right layout. #[inline] unsafe fn pack_mr(kc: usize, mc: usize, pack_buf: &mut [Self::Elem], a: *const Self::Elem, rsa: isize, csa: isize) { pack::(kc, mc, pack_buf, a, rsa, csa) } /// Pack matrix B into its packing buffer /// /// See pack for more documentation. /// /// Override only if the default packing function does not /// use the right layout. #[inline] unsafe fn pack_nr(kc: usize, mc: usize, pack_buf: &mut [Self::Elem], a: *const Self::Elem, rsa: isize, csa: isize) { pack::(kc, mc, pack_buf, a, rsa, csa) } /// Matrix multiplication kernel /// /// This does the matrix multiplication: /// /// C ← α A B + β C /// /// + `k`: length of data in a, b /// + a, b are packed /// + c has general strides /// + rsc: row stride of c /// + csc: col stride of c /// + `alpha`: scaling factor for A B product /// + `beta`: scaling factor for c. /// Note: if `beta` is `0.`, the kernel should not (and must not) /// read from c, its value is to be treated as if it was zero. /// /// When masked, the kernel is always called with β=0 but α is passed /// as usual. (This is only useful information if you return `true` from /// `always_masked`.) unsafe fn kernel( k: usize, alpha: Self::Elem, a: *const Self::Elem, b: *const Self::Elem, beta: Self::Elem, c: *mut Self::Elem, rsc: isize, csc: isize); } pub(crate) trait Element : Copy + Send + Sync { fn zero() -> Self; fn one() -> Self; fn test_value() -> Self; fn is_zero(&self) -> bool; fn add_assign(&mut self, rhs: Self); fn mul_assign(&mut self, rhs: Self); } impl Element for f32 { fn zero() -> Self { 0. } fn one() -> Self { 1. } fn test_value() -> Self { 1. } fn is_zero(&self) -> bool { *self == 0. } fn add_assign(&mut self, rhs: Self) { *self += rhs; } fn mul_assign(&mut self, rhs: Self) { *self *= rhs; } } impl Element for f64 { fn zero() -> Self { 0. } fn one() -> Self { 1. } fn test_value() -> Self { 1. } fn is_zero(&self) -> bool { *self == 0. } fn add_assign(&mut self, rhs: Self) { *self += rhs; } fn mul_assign(&mut self, rhs: Self) { *self *= rhs; } } /// Kernel selector pub(crate) trait GemmSelect { /// Call `select` with the selected kernel for this configuration fn select(self, kernel: K) where K: GemmKernel, T: Element; } #[cfg(feature = "cgemm")] #[allow(non_camel_case_types)] pub(crate) type c32 = [f32; 2]; #[cfg(feature = "cgemm")] #[allow(non_camel_case_types)] pub(crate) type c64 = [f64; 2]; #[cfg(feature = "cgemm")] impl Element for c32 { fn zero() -> Self { [0., 0.] } fn one() -> Self { [1., 0.] } fn test_value() -> Self { [2., 1.] } fn is_zero(&self) -> bool { *self == [0., 0.] 
} #[inline(always)] fn add_assign(&mut self, y: Self) { self[0] += y[0]; self[1] += y[1]; } #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = c32_mul(*self, rhs); } } #[cfg(feature = "cgemm")] impl Element for c64 { fn zero() -> Self { [0., 0.] } fn one() -> Self { [1., 0.] } fn test_value() -> Self { [2., 1.] } fn is_zero(&self) -> bool { *self == [0., 0.] } #[inline(always)] fn add_assign(&mut self, y: Self) { self[0] += y[0]; self[1] += y[1]; } #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = c64_mul(*self, rhs); } } #[cfg(feature = "cgemm")] #[inline(always)] pub(crate) fn c32_mul(x: c32, y: c32) -> c32 { let [a, b] = x; let [c, d] = y; [a * c - b * d, b * c + a * d] } #[cfg(feature = "cgemm")] #[inline(always)] pub(crate) fn c64_mul(x: c64, y: c64) -> c64 { let [a, b] = x; let [c, d] = y; [a * c - b * d, b * c + a * d] } pub(crate) trait ConstNum { const VALUE: usize; } #[cfg(feature = "cgemm")] pub(crate) struct U2; pub(crate) struct U4; pub(crate) struct U8; #[cfg(feature = "cgemm")] impl ConstNum for U2 { const VALUE: usize = 2; } impl ConstNum for U4 { const VALUE: usize = 4; } impl ConstNum for U8 { const VALUE: usize = 8; } #[cfg(test)] pub(crate) mod test { use std::fmt; use super::GemmKernel; use super::Element; use crate::aligned_alloc::Alloc; pub(crate) fn aligned_alloc(elt: K::Elem, n: usize) -> Alloc where K: GemmKernel, K::Elem: Copy, { unsafe { Alloc::new(n, K::align_to()).init_with(elt) } } /// Assert that we can compute A I == A and I B == B for the kernel (truncated, if needed) /// /// Tests C col major and row major /// Tests beta == 0 (and no other option) pub(crate) fn test_a_kernel(_name: &str) where K: GemmKernel, T: Element + fmt::Debug + PartialEq, { const K: usize = 16; let mr = K::MR; let nr = K::NR; // 1. Test A I == A (variables a, b, c) // b looks like an identity matrix (truncated, depending on MR/NR) let mut a = aligned_alloc::(T::zero(), mr * K); let mut b = aligned_alloc::(T::zero(), nr * K); let mut count = 1; for i in 0..mr { for j in 0..K { for _ in 0..count { a[i * K + j].add_assign(T::test_value()); } count += 1; } } for i in 0..Ord::min(K, nr) { b[i + i * nr] = T::one(); } let mut c = vec![T::zero(); mr * nr]; unsafe { // col major C K::kernel(K, T::one(), a.as_ptr(), b.as_ptr(), T::zero(), c.as_mut_ptr(), 1, mr as isize); } let common_len = Ord::min(a.len(), c.len()); assert_eq!(&a[..common_len], &c[..common_len]); // 2. Test I B == B (variables a, b, c) // a looks like an identity matrix (truncated, depending on MR/NR) let mut a = aligned_alloc::(T::zero(), mr * K); let mut b = aligned_alloc::(T::zero(), nr * K); for i in 0..Ord::min(K, mr) { a[i + i * mr] = T::one(); } let mut count = 1; for i in 0..K { for j in 0..nr { for _ in 0..count { b[i * nr + j].add_assign(T::test_value()); } count += 1; } } let mut c = vec![T::zero(); mr * nr]; unsafe { // row major C K::kernel(K, T::one(), a.as_ptr(), b.as_ptr(), T::zero(), c.as_mut_ptr(), nr as isize, 1); } let common_len = Ord::min(b.len(), c.len()); assert_eq!(&b[..common_len], &c[..common_len]); } #[cfg(feature="cgemm")] /// Assert that we can compute A I == A for the kernel (truncated, if needed) /// /// Tests C col major and row major /// Tests beta == 0 (and no other option) pub(crate) fn test_complex_packed_kernel(_name: &str) where K: GemmKernel, T: Element + fmt::Debug + PartialEq, TReal: Element + fmt::Debug + PartialEq, { use crate::cgemm_common::pack_complex; const K: usize = 16; let mr = K::MR; let nr = K::NR; // 1. 
Test A I == A (variables a, b, c) // b looks like an identity matrix (truncated, depending on MR/NR) let mut a = aligned_alloc::(T::zero(), mr * K); let mut apack = aligned_alloc::(T::zero(), mr * K); let mut b = aligned_alloc::(T::zero(), nr * K); let mut bpack = aligned_alloc::(T::zero(), nr * K); let mut count = 1; for i in 0..mr { for j in 0..K { for _ in 0..count { a[i * K + j].add_assign(T::test_value()); } count += 1; } } for i in 0..Ord::min(K, nr) { b[i + i * nr] = T::one(); } // unlike test_a_kernel, we need custom packing for these kernels unsafe { pack_complex::(K, mr, &mut apack[..], a.ptr_mut(), 1, mr as isize); pack_complex::(nr, K, &mut bpack[..], b.ptr_mut(), nr as isize, 1); } let mut c = vec![T::zero(); mr * nr]; unsafe { // col major C K::kernel(K, T::one(), apack.as_ptr(), bpack.as_ptr(), T::zero(), c.as_mut_ptr(), 1, mr as isize); } let common_len = Ord::min(a.len(), c.len()); assert_eq!(&a[..common_len], &c[..common_len]); } } matrixmultiply-0.3.9/src/lib.rs000064400000000000000000000132421046102023000146460ustar 00000000000000// Copyright 2016 - 2023 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! //! General matrix multiplication for f32, f64, and complex matrices. Operates on //! matrices with general layout (they can use arbitrary row and column stride). //! //! This crate uses the same macro/microkernel approach to matrix multiplication as //! the [BLIS][bl] project. //! //! We presently provide a few good microkernels, portable and for x86-64 and //! AArch64 NEON, and only one operation: the general matrix-matrix //! multiplication (“gemm”). //! //! [bl]: https://github.com/flame/blis //! //! ## Matrix Representation //! //! **matrixmultiply** supports matrices with general stride, so a matrix //! is passed using a pointer and four integers: //! //! - `a: *const f32`, pointer to the first element in the matrix //! - `m: usize`, number of rows //! - `k: usize`, number of columns //! - `rsa: isize`, row stride //! - `csa: isize`, column stride //! //! In this example, A is a m by k matrix. `a` is a pointer to the element at //! index *0, 0*. //! //! The *row stride* is the pointer offset (in number of elements) to the //! element on the next row. It’s the distance from element *i, j* to *i + 1, //! j*. //! //! The *column stride* is the pointer offset (in number of elements) to the //! element in the next column. It’s the distance from element *i, j* to *i, //! j + 1*. //! //! For example for a contiguous matrix, row major strides are *rsa=k, //! csa=1* and column major strides are *rsa=1, csa=m*. //! //! Strides can be negative or even zero, but for a mutable matrix elements //! may not alias each other. //! //! ## Portability and Performance //! //! - The default kernels are written in portable Rust and available //! on all targets. These may depend on autovectorization to perform well. //! //! - *x86* and *x86-64* features can be detected at runtime by default or //! compile time (if enabled), and the following kernel variants are //! implemented: //! //! - `fma` //! - `avx` //! - `sse2` //! //! - *aarch64* features can be detected at runtime by default or compile time //! (if enabled), and the following kernel variants are implemented: //! //! - `neon` //! //! ## Features //! //! ### `std` //! //! `std` is enabled by default. //! //! 
This crate can be used without the standard library (`#![no_std]`) by //! disabling the default `std` feature. To do so, use this in your //! `Cargo.toml`: //! //! ```toml //! matrixmultiply = { version = "0.3", default-features = false } //! ``` //! //! Runtime CPU feature detection is available **only** when `std` is enabled. //! Without the `std` feature, the crate uses special CPU features only if they //! are enabled at compile time. (To enable CPU features at compile time, pass //! the relevant //! [`target-cpu`](https://doc.rust-lang.org/rustc/codegen-options/index.html#target-cpu) //! or //! [`target-feature`](https://doc.rust-lang.org/rustc/codegen-options/index.html#target-feature) //! option to `rustc`.) //! //! ### `threading` //! //! `threading` is an optional crate feature //! //! Threading enables multithreading for the operations. The environment variable //! `MATMUL_NUM_THREADS` decides how many threads are used at maximum. At the moment 1-4 are //! supported and the default is the number of physical cpus (as detected by `num_cpus`). //! //! ### `cgemm` //! //! `cgemm` is an optional crate feature. //! //! It enables the `cgemm` and `zgemm` methods for complex matrix multiplication. //! This is an **experimental feature** and not yet as performant as the float kernels on x86. //! //! The complex representation we use is `[f64; 2]`. //! //! ### `constconf` //! //! `constconf` is an optional feature. When enabled, cache-sensitive parameters of //! the gemm implementations can be tweaked *at compile time* by defining the following variables: //! //! - `MATMUL_SGEMM_MC` //! (And so on, for S, D, C, ZGEMM and with NC, KC or MC). //! //! ## Other Notes //! //! The functions in this crate are thread safe, as long as the destination //! matrix is distinct. //! //! ## Rust Version //! //! This version requires Rust 1.41.1 or later; the crate follows a carefully //! considered upgrade policy, where updating the minimum Rust version is not a breaking //! change. //! //! Some features are enabled with later versions: from Rust 1.61 AArch64 NEON support. #![doc(html_root_url = "https://docs.rs/matrixmultiply/0.3/")] #![cfg_attr(not(feature = "std"), no_std)] #[cfg(not(feature = "std"))] extern crate alloc; #[cfg(feature = "std")] extern crate core; #[macro_use] mod debugmacros; #[macro_use] mod loopmacros; mod archparam_defaults; #[cfg(feature = "constconf")] mod archparam; #[cfg(feature = "constconf")] mod constparse; #[cfg(not(feature = "constconf"))] pub(crate) use archparam_defaults as archparam; mod gemm; mod kernel; mod packing; mod ptr; mod threading; mod aligned_alloc; mod util; #[macro_use] mod archmacros; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[macro_use] mod x86; #[cfg(any(target_arch = "aarch64"))] #[macro_use] mod aarch64; mod dgemm_kernel; mod sgemm_kernel; pub use crate::gemm::dgemm; pub use crate::gemm::sgemm; #[cfg(feature = "cgemm")] #[macro_use] mod cgemm_common; #[cfg(feature = "cgemm")] mod cgemm_kernel; #[cfg(feature = "cgemm")] mod zgemm_kernel; #[cfg(feature = "cgemm")] pub use crate::gemm::cgemm; #[cfg(feature = "cgemm")] pub use crate::gemm::zgemm; #[cfg(feature = "cgemm")] pub use crate::gemm::CGemmOption; matrixmultiply-0.3.9/src/loopmacros.rs000064400000000000000000000057061046102023000162640ustar 00000000000000// Copyright 2016 - 2018 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. 
This file may not be copied, modified, or distributed // except according to those terms. // Unroll only in non-debug builds #[cfg(not(debug_assertions))] macro_rules! repeat { (1 $e:expr) => { $e; }; (2 $e:expr) => { $e;$e; }; (3 $e:expr) => { $e;$e; $e; }; (4 $e:expr) => { $e;$e; $e;$e; }; (5 $e:expr) => { $e;$e; $e;$e; $e; }; (6 $e:expr) => { $e;$e; $e;$e; $e;$e; }; (7 $e:expr) => { $e;$e; $e;$e; $e;$e; $e; }; (8 $e:expr) => { $e;$e; $e;$e; $e;$e; $e;$e; }; } #[cfg(debug_assertions)] macro_rules! loop4 { ($i:ident, $e:expr) => { for $i in 0..4 { $e } } } #[cfg(feature = "cgemm")] macro_rules! loop2 { ($i:ident, $e:expr) => {{ let $i = 0; $e; let $i = 1; $e; }} } #[cfg(not(debug_assertions))] macro_rules! loop4 { ($i:ident, $e:expr) => {{ let $i = 0; $e; let $i = 1; $e; let $i = 2; $e; let $i = 3; $e; }} } #[cfg(debug_assertions)] macro_rules! loop8 { ($i:ident, $e:expr) => { for $i in 0..8 { $e } } } #[cfg(not(debug_assertions))] macro_rules! loop8 { ($i:ident, $e:expr) => {{ let $i = 0; $e; let $i = 1; $e; let $i = 2; $e; let $i = 3; $e; let $i = 4; $e; let $i = 5; $e; let $i = 6; $e; let $i = 7; $e; }} } #[cfg(debug_assertions)] macro_rules! unroll_by { ($by:tt => $ntimes:expr, $e:expr) => { for _ in 0..$ntimes { $e } } } #[cfg(not(debug_assertions))] macro_rules! unroll_by { ($by:tt => $ntimes:expr, $e:expr) => {{ // using while loop to avoid problems // with requiring inlining of foor loop parts let k = $ntimes; let mut _index = 0; let _target = k / $by; while _index < _target { repeat!($by $e); _index += 1; } let mut _index = 0; let _target = k % $by; while _index < _target { $e; _index += 1; } }} } #[allow(unused)] #[cfg(debug_assertions)] macro_rules! unroll_by_with_last { ($by:tt => $ntimes:expr, $is_last:ident, $e:expr) => {{ let k = $ntimes - 1; let $is_last = false; for _ in 0..k { $e; } let $is_last = true; #[allow(unused_assignments)] $e; }} } #[allow(unused)] #[cfg(not(debug_assertions))] macro_rules! unroll_by_with_last { ($by:tt => $ntimes:expr, $is_last:ident, $e:expr) => {{ let k = $ntimes - 1; let $is_last = false; for _ in 0..k / $by { repeat!($by $e); } for _ in 0..k % $by { $e; } let $is_last = true; #[allow(unused_assignments)] $e; }} } matrixmultiply-0.3.9/src/packing.rs000064400000000000000000000066331046102023000155220ustar 00000000000000// Copyright 2016 - 2023 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use rawpointer::PointerExt; use core::ptr::copy_nonoverlapping; use crate::kernel::ConstNum; use crate::kernel::Element; /// Pack matrix into `pack` /// /// + kc: length of the micropanel /// + mc: number of rows/columns in the matrix to be packed /// + pack: packing buffer /// + a: matrix, /// + rsa: row stride /// + csa: column stride /// /// + MR: kernel rows/columns that we round up to // If one of pack and a is of a reference type, it gets a noalias annotation which // gives benefits to optimization. The packing buffer is contiguous so it can be passed as a slice // here. 
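// Illustrative layout (hypothetical MR = 4, mc = 6, kc = 3; a(i, j) denotes the
// element at row i, column j of the source operand): the packed buffer stores
// one MR-row panel at a time, column by column within the panel,
//
//   a(0,0) a(1,0) a(2,0) a(3,0)   a(0,1) a(1,1) a(2,1) a(3,1)   a(0,2) a(1,2) a(2,2) a(3,2)
//
// followed by the uneven tail panel, zero-padded up to MR rows:
//
//   a(4,0) a(5,0) 0 0   a(4,1) a(5,1) 0 0   a(4,2) a(5,2) 0 0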
pub(crate) unsafe fn pack(kc: usize, mc: usize, pack: &mut [T], a: *const T, rsa: isize, csa: isize) where T: Element, MR: ConstNum, { pack_impl::(kc, mc, pack, a, rsa, csa) } /// Specialized for AVX2 /// Safety: Requires AVX2 #[cfg(any(target_arch="x86", target_arch="x86_64"))] #[target_feature(enable="avx2")] pub(crate) unsafe fn pack_avx2(kc: usize, mc: usize, pack: &mut [T], a: *const T, rsa: isize, csa: isize) where T: Element, MR: ConstNum, { pack_impl::(kc, mc, pack, a, rsa, csa) } /// Pack implementation, see pack above for docs. /// /// Uses inline(always) so that it can be instantiated for different target features. #[inline(always)] unsafe fn pack_impl(kc: usize, mc: usize, pack: &mut [T], a: *const T, rsa: isize, csa: isize) where T: Element, MR: ConstNum, { let pack = pack.as_mut_ptr(); let mr = MR::VALUE; let mut p = 0; // offset into pack if rsa == 1 { // if the matrix is contiguous in the same direction we are packing, // copy a kernel row at a time. for ir in 0..mc/mr { let row_offset = ir * mr; for j in 0..kc { let a_row = a.stride_offset(rsa, row_offset) .stride_offset(csa, j); copy_nonoverlapping(a_row, pack.add(p), mr); p += mr; } } } else { // general layout case for ir in 0..mc/mr { let row_offset = ir * mr; for j in 0..kc { for i in 0..mr { let a_elt = a.stride_offset(rsa, i + row_offset) .stride_offset(csa, j); copy_nonoverlapping(a_elt, pack.add(p), 1); p += 1; } } } } let zero = <_>::zero(); // Pad with zeros to multiple of kernel size (uneven mc) let rest = mc % mr; if rest > 0 { let row_offset = (mc/mr) * mr; for j in 0..kc { for i in 0..mr { if i < rest { let a_elt = a.stride_offset(rsa, i + row_offset) .stride_offset(csa, j); copy_nonoverlapping(a_elt, pack.add(p), 1); } else { *pack.add(p) = zero; } p += 1; } } } } matrixmultiply-0.3.9/src/ptr.rs000064400000000000000000000026251046102023000147100ustar 00000000000000// Copyright 2020, 2022 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use rawpointer::PointerExt; /// A Send + Sync raw pointer wrapper #[derive(Copy, Clone)] #[repr(transparent)] pub(crate) struct Ptr { ptr: T } unsafe impl Sync for Ptr<*const T> { } unsafe impl Sync for Ptr<*mut T> { } unsafe impl Send for Ptr<*const T> { } unsafe impl Send for Ptr<*mut T> { } /// Create a Ptr /// /// # Safety /// /// Unsafe since it is thread safety critical to use the raw pointer correctly. #[allow(non_snake_case)] pub(crate) unsafe fn Ptr(ptr: T) -> Ptr { Ptr { ptr } } impl Ptr { /// Get the pointer pub(crate) fn ptr(self) -> T where T: Copy { self.ptr } } impl Ptr<*mut T> { /// Get as *const T pub(crate) fn to_const(self) -> Ptr<*const T> { Ptr { ptr: self.ptr } } } impl PointerExt for Ptr<*const T> { #[inline(always)] unsafe fn offset(self, i: isize) -> Self { Ptr(self.ptr.offset(i)) } } impl PointerExt for Ptr<*mut T> { #[inline(always)] unsafe fn offset(self, i: isize) -> Self { Ptr(self.ptr.offset(i)) } } matrixmultiply-0.3.9/src/sgemm_kernel.rs000064400000000000000000000641141046102023000165540ustar 00000000000000// Copyright 2016 - 2023 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. 
use crate::kernel::GemmKernel; use crate::kernel::GemmSelect; use crate::kernel::{U4, U8}; use crate::archparam; #[cfg(target_arch="x86")] use core::arch::x86::*; #[cfg(target_arch="x86_64")] use core::arch::x86_64::*; #[cfg(any(target_arch="x86", target_arch="x86_64"))] use crate::x86::{FusedMulAdd, AvxMulAdd, SMultiplyAdd}; #[cfg(any(target_arch="x86", target_arch="x86_64"))] struct KernelAvx; #[cfg(any(target_arch="x86", target_arch="x86_64"))] struct KernelFmaAvx2; #[cfg(any(target_arch="x86", target_arch="x86_64"))] struct KernelFma; #[cfg(any(target_arch="x86", target_arch="x86_64"))] struct KernelSse2; #[cfg(target_arch="aarch64")] #[cfg(has_aarch64_simd)] struct KernelNeon; struct KernelFallback; type T = f32; /// Detect which implementation to use and select it using the selector's /// .select(Kernel) method. /// /// This function is called one or more times during a whole program's /// execution, it may be called for each gemm kernel invocation or fewer times. #[inline] pub(crate) fn detect(selector: G) where G: GemmSelect { // dispatch to specific compiled versions #[cfg(any(target_arch="x86", target_arch="x86_64"))] { if is_x86_feature_detected_!("fma") { if is_x86_feature_detected_!("avx2") { return selector.select(KernelFmaAvx2); } return selector.select(KernelFma); } else if is_x86_feature_detected_!("avx") { return selector.select(KernelAvx); } else if is_x86_feature_detected_!("sse2") { return selector.select(KernelSse2); } } #[cfg(target_arch="aarch64")] #[cfg(has_aarch64_simd)] { if is_aarch64_feature_detected_!("neon") { return selector.select(KernelNeon); } } return selector.select(KernelFallback); } #[cfg(any(target_arch="x86", target_arch="x86_64"))] macro_rules! loop_m { ($i:ident, $e:expr) => { loop8!($i, $e) }; } #[cfg(all(test, any(target_arch="x86", target_arch="x86_64")))] macro_rules! 
loop_n { ($j:ident, $e:expr) => { loop8!($j, $e) }; } #[cfg(any(target_arch="x86", target_arch="x86_64"))] impl GemmKernel for KernelAvx { type Elem = T; type MRTy = U8; type NRTy = U8; #[inline(always)] fn align_to() -> usize { 32 } #[inline(always)] fn always_masked() -> bool { false } #[inline(always)] fn nc() -> usize { archparam::S_NC } #[inline(always)] fn kc() -> usize { archparam::S_KC } #[inline(always)] fn mc() -> usize { archparam::S_MC } #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_avx(k, alpha, a, b, beta, c, rsc, csc) } } #[cfg(any(target_arch="x86", target_arch="x86_64"))] impl GemmKernel for KernelFma { type Elem = T; type MRTy = ::MRTy; type NRTy = ::NRTy; #[inline(always)] fn align_to() -> usize { KernelAvx::align_to() } #[inline(always)] fn always_masked() -> bool { KernelAvx::always_masked() } #[inline(always)] fn nc() -> usize { archparam::S_NC } #[inline(always)] fn kc() -> usize { archparam::S_KC } #[inline(always)] fn mc() -> usize { archparam::S_MC } #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_fma(k, alpha, a, b, beta, c, rsc, csc) } } #[cfg(any(target_arch="x86", target_arch="x86_64"))] impl GemmKernel for KernelFmaAvx2 { type Elem = T; type MRTy = ::MRTy; type NRTy = ::NRTy; #[inline(always)] fn align_to() -> usize { KernelAvx::align_to() } #[inline(always)] fn always_masked() -> bool { KernelAvx::always_masked() } #[inline(always)] fn nc() -> usize { archparam::S_NC } #[inline(always)] fn kc() -> usize { archparam::S_KC } #[inline(always)] fn mc() -> usize { archparam::S_MC } #[inline] unsafe fn pack_mr(kc: usize, mc: usize, pack: &mut [Self::Elem], a: *const Self::Elem, rsa: isize, csa: isize) { // safety: Avx2 is enabled crate::packing::pack_avx2::(kc, mc, pack, a, rsa, csa) } #[inline] unsafe fn pack_nr(kc: usize, mc: usize, pack: &mut [Self::Elem], a: *const Self::Elem, rsa: isize, csa: isize) { // safety: Avx2 is enabled crate::packing::pack_avx2::(kc, mc, pack, a, rsa, csa) } #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_fma(k, alpha, a, b, beta, c, rsc, csc) } } #[cfg(any(target_arch="x86", target_arch="x86_64"))] impl GemmKernel for KernelSse2 { type Elem = T; type MRTy = ::MRTy; type NRTy = ::NRTy; #[inline(always)] fn align_to() -> usize { 16 } #[inline(always)] fn always_masked() -> bool { KernelFallback::always_masked() } #[inline(always)] fn nc() -> usize { archparam::S_NC } #[inline(always)] fn kc() -> usize { archparam::S_KC } #[inline(always)] fn mc() -> usize { archparam::S_MC } #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_sse2(k, alpha, a, b, beta, c, rsc, csc) } } #[cfg(target_arch="aarch64")] #[cfg(has_aarch64_simd)] impl GemmKernel for KernelNeon { type Elem = T; type MRTy = U8; type NRTy = U8; #[inline(always)] fn align_to() -> usize { 32 } #[inline(always)] fn always_masked() -> bool { false } #[inline(always)] fn nc() -> usize { archparam::S_NC } #[inline(always)] fn kc() -> usize { archparam::S_KC } #[inline(always)] fn mc() -> usize { archparam::S_MC } #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_neon(k, alpha, a, b, beta, c, rsc, csc) } } impl GemmKernel for 
KernelFallback { type Elem = T; type MRTy = U8; type NRTy = U4; #[inline(always)] fn align_to() -> usize { 0 } #[inline(always)] fn always_masked() -> bool { true } #[inline(always)] fn nc() -> usize { archparam::S_NC } #[inline(always)] fn kc() -> usize { archparam::S_KC } #[inline(always)] fn mc() -> usize { archparam::S_MC } #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc) } } // no inline for unmasked kernels #[cfg(any(target_arch="x86", target_arch="x86_64"))] #[target_feature(enable="fma")] unsafe fn kernel_target_fma(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_x86_avx::(k, alpha, a, b, beta, c, rsc, csc) } // no inline for unmasked kernels #[cfg(any(target_arch="x86", target_arch="x86_64"))] #[target_feature(enable="avx")] unsafe fn kernel_target_avx(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_x86_avx::(k, alpha, a, b, beta, c, rsc, csc) } #[inline] #[cfg(any(target_arch="x86", target_arch="x86_64"))] #[target_feature(enable="sse2")] unsafe fn kernel_target_sse2(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc) } #[inline(always)] #[cfg(any(target_arch="x86", target_arch="x86_64"))] unsafe fn kernel_x86_avx(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) where MA: SMultiplyAdd, { const MR: usize = KernelAvx::MR; const NR: usize = KernelAvx::NR; debug_assert_ne!(k, 0); let mut ab = [_mm256_setzero_ps(); MR]; // this kernel can operate in either transposition (C = A B or C^T = B^T A^T) let prefer_row_major_c = rsc != 1; let (mut a, mut b) = if prefer_row_major_c { (a, b) } else { (b, a) }; let (rsc, csc) = if prefer_row_major_c { (rsc, csc) } else { (csc, rsc) }; macro_rules! shuffle_mask { ($z:expr, $y:expr, $x:expr, $w:expr) => { ($z << 6) | ($y << 4) | ($x << 2) | $w } } macro_rules! permute_mask { ($z:expr, $y:expr, $x:expr, $w:expr) => { ($z << 6) | ($y << 4) | ($x << 2) | $w } } macro_rules! permute2f128_mask { ($y:expr, $x:expr) => { (($y << 4) | $x) } } // Start data load before each iteration let mut av = _mm256_load_ps(a); let mut bv = _mm256_load_ps(b); // Compute A B unroll_by_with_last!(4 => k, is_last, { // We compute abij = ai bj // // Load b as one contiguous vector // Load a as striped vectors // // Shuffle the abij elements in order after the loop. // // Note this scheme copied and transposed from the BLIS 8x8 sgemm // microkernel. // // Our a indices are striped and our b indices are linear. In // the variable names below, we always have doubled indices so // for example a0246 corresponds to a vector of a0 a0 a2 a2 a4 a4 a6 a6. 
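        // (As a concrete instance of the scheme, assuming bv holds the linear
        // b0..b7: multiplying a0246 = [a0 a0 a2 a2 a4 a4 a6 a6] element-wise
        // with bv accumulates ab00 ab01 ab22 ab23 ab44 ab45 ab66 ab67, which is
        // exactly the ab0246 column in the table below.)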
// // ab0246: ab2064: ab4602: ab6420: // ( ab00 ( ab20 ( ab40 ( ab60 // ab01 ab21 ab41 ab61 // ab22 ab02 ab62 ab42 // ab23 ab03 ab63 ab43 // ab44 ab64 ab04 ab24 // ab45 ab65 ab05 ab25 // ab66 ab46 ab26 ab06 // ab67 ) ab47 ) ab27 ) ab07 ) // // ab1357: ab3175: ab5713: ab7531: // ( ab10 ( ab30 ( ab50 ( ab70 // ab11 ab31 ab51 ab71 // ab32 ab12 ab72 ab52 // ab33 ab13 ab73 ab53 // ab54 ab74 ab14 ab34 // ab55 ab75 ab15 ab35 // ab76 ab56 ab36 ab16 // ab77 ) ab57 ) ab37 ) ab17 ) const PERM32_2301: i32 = permute_mask!(1, 0, 3, 2); const PERM128_30: i32 = permute2f128_mask!(0, 3); // _mm256_moveldup_ps(av): // vmovsldup ymm2, ymmword ptr [rax] // // Load and duplicate each even word: // ymm2 ← [a0 a0 a2 a2 a4 a4 a6 a6] // // _mm256_movehdup_ps(av): // vmovshdup ymm2, ymmword ptr [rax] // // Load and duplicate each odd word: // ymm2 ← [a1 a1 a3 a3 a5 a5 a7 a7] // let a0246 = _mm256_moveldup_ps(av); // Load: a0 a0 a2 a2 a4 a4 a6 a6 let a2064 = _mm256_permute_ps(a0246, PERM32_2301); let a1357 = _mm256_movehdup_ps(av); // Load: a1 a1 a3 a3 a5 a5 a7 a7 let a3175 = _mm256_permute_ps(a1357, PERM32_2301); let a4602 = _mm256_permute2f128_ps(a0246, a0246, PERM128_30); let a6420 = _mm256_permute2f128_ps(a2064, a2064, PERM128_30); let a5713 = _mm256_permute2f128_ps(a1357, a1357, PERM128_30); let a7531 = _mm256_permute2f128_ps(a3175, a3175, PERM128_30); ab[0] = MA::multiply_add(a0246, bv, ab[0]); ab[1] = MA::multiply_add(a2064, bv, ab[1]); ab[2] = MA::multiply_add(a4602, bv, ab[2]); ab[3] = MA::multiply_add(a6420, bv, ab[3]); ab[4] = MA::multiply_add(a1357, bv, ab[4]); ab[5] = MA::multiply_add(a3175, bv, ab[5]); ab[6] = MA::multiply_add(a5713, bv, ab[6]); ab[7] = MA::multiply_add(a7531, bv, ab[7]); if !is_last { a = a.add(MR); b = b.add(NR); bv = _mm256_load_ps(b); av = _mm256_load_ps(a); } }); let alphav = _mm256_set1_ps(alpha); // Permute to put the abij elements in order // // shufps 0xe4: 22006644 00224466 -> 22226666 // // vperm2 0x30: 00004444 44440000 -> 00000000 // vperm2 0x12: 00004444 44440000 -> 44444444 // let ab0246 = ab[0]; let ab2064 = ab[1]; let ab4602 = ab[2]; let ab6420 = ab[3]; let ab1357 = ab[4]; let ab3175 = ab[5]; let ab5713 = ab[6]; let ab7531 = ab[7]; const SHUF_0123: i32 = shuffle_mask!(3, 2, 1, 0); debug_assert_eq!(SHUF_0123, 0xE4); const PERM128_03: i32 = permute2f128_mask!(3, 0); const PERM128_21: i32 = permute2f128_mask!(1, 2); // No elements are "shuffled" in truth, they all stay at their index // but we combine vectors to de-stripe them. 
// // For example, the first shuffle below uses 0 1 2 3 which // corresponds to the X0 X1 Y2 Y3 sequence etc: // // variable // X ab00 ab01 ab22 ab23 ab44 ab45 ab66 ab67 ab0246 // Y ab20 ab21 ab02 ab03 ab64 ab65 ab46 ab47 ab2064 // // X0 X1 Y2 Y3 X4 X5 Y6 Y7 // = ab00 ab01 ab02 ab03 ab44 ab45 ab46 ab47 ab0044 let ab0044 = _mm256_shuffle_ps(ab0246, ab2064, SHUF_0123); let ab2266 = _mm256_shuffle_ps(ab2064, ab0246, SHUF_0123); let ab4400 = _mm256_shuffle_ps(ab4602, ab6420, SHUF_0123); let ab6622 = _mm256_shuffle_ps(ab6420, ab4602, SHUF_0123); let ab1155 = _mm256_shuffle_ps(ab1357, ab3175, SHUF_0123); let ab3377 = _mm256_shuffle_ps(ab3175, ab1357, SHUF_0123); let ab5511 = _mm256_shuffle_ps(ab5713, ab7531, SHUF_0123); let ab7733 = _mm256_shuffle_ps(ab7531, ab5713, SHUF_0123); let ab0000 = _mm256_permute2f128_ps(ab0044, ab4400, PERM128_03); let ab4444 = _mm256_permute2f128_ps(ab0044, ab4400, PERM128_21); let ab2222 = _mm256_permute2f128_ps(ab2266, ab6622, PERM128_03); let ab6666 = _mm256_permute2f128_ps(ab2266, ab6622, PERM128_21); let ab1111 = _mm256_permute2f128_ps(ab1155, ab5511, PERM128_03); let ab5555 = _mm256_permute2f128_ps(ab1155, ab5511, PERM128_21); let ab3333 = _mm256_permute2f128_ps(ab3377, ab7733, PERM128_03); let ab7777 = _mm256_permute2f128_ps(ab3377, ab7733, PERM128_21); ab[0] = ab0000; ab[1] = ab1111; ab[2] = ab2222; ab[3] = ab3333; ab[4] = ab4444; ab[5] = ab5555; ab[6] = ab6666; ab[7] = ab7777; // Compute α (A B) // Compute here if we don't have fma, else pick up α further down if !MA::IS_FUSED { loop_m!(i, ab[i] = _mm256_mul_ps(alphav, ab[i])); } macro_rules! c { ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize)); } // C ← α A B + β C let mut cv = [_mm256_setzero_ps(); MR]; if beta != 0. { let betav = _mm256_set1_ps(beta); // Read C if csc == 1 { loop_m!(i, cv[i] = _mm256_loadu_ps(c![i, 0])); } else { loop_m!(i, cv[i] = _mm256_setr_ps(*c![i, 0], *c![i, 1], *c![i, 2], *c![i, 3], *c![i, 4], *c![i, 5], *c![i, 6], *c![i, 7])); } // Compute β C loop_m!(i, cv[i] = _mm256_mul_ps(cv[i], betav)); } // Compute (α A B) + (β C) if !MA::IS_FUSED { loop_m!(i, cv[i] = _mm256_add_ps(cv[i], ab[i])); } else { loop_m!(i, cv[i] = MA::multiply_add(alphav, ab[i], cv[i])); } // Store C back to memory if csc == 1 { loop_m!(i, _mm256_storeu_ps(c![i, 0], cv[i])); } else { // Permute to bring each element in the vector to the front and store loop_m!(i, { let cvlo = _mm256_extractf128_ps(cv[i], 0); let cvhi = _mm256_extractf128_ps(cv[i], 1); _mm_store_ss(c![i, 0], cvlo); let cperm = _mm_permute_ps(cvlo, permute_mask!(0, 3, 2, 1)); _mm_store_ss(c![i, 1], cperm); let cperm = _mm_permute_ps(cperm, permute_mask!(0, 3, 2, 1)); _mm_store_ss(c![i, 2], cperm); let cperm = _mm_permute_ps(cperm, permute_mask!(0, 3, 2, 1)); _mm_store_ss(c![i, 3], cperm); _mm_store_ss(c![i, 4], cvhi); let cperm = _mm_permute_ps(cvhi, permute_mask!(0, 3, 2, 1)); _mm_store_ss(c![i, 5], cperm); let cperm = _mm_permute_ps(cperm, permute_mask!(0, 3, 2, 1)); _mm_store_ss(c![i, 6], cperm); let cperm = _mm_permute_ps(cperm, permute_mask!(0, 3, 2, 1)); _mm_store_ss(c![i, 7], cperm); }); } } #[cfg(target_arch="aarch64")] #[cfg(has_aarch64_simd)] #[target_feature(enable="neon")] unsafe fn kernel_target_neon(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { use core::arch::aarch64::*; const MR: usize = KernelNeon::MR; const NR: usize = KernelNeon::NR; let (mut a, mut b, rsc, csc) = if rsc == 1 { (b, a, csc, rsc) } else { (a, b, rsc, csc) }; // Kernel 8 x 8 (a x b) // Four quadrants 
of 4 x 4 let mut ab11 = [vmovq_n_f32(0.); 4]; let mut ab12 = [vmovq_n_f32(0.); 4]; let mut ab21 = [vmovq_n_f32(0.); 4]; let mut ab22 = [vmovq_n_f32(0.); 4]; // Compute // ab_ij = a_i * b_j for all i, j macro_rules! ab_ij_equals_ai_bj { ($dest:ident, $av:expr, $bv:expr) => { $dest[0] = vfmaq_laneq_f32($dest[0], $bv, $av, 0); $dest[1] = vfmaq_laneq_f32($dest[1], $bv, $av, 1); $dest[2] = vfmaq_laneq_f32($dest[2], $bv, $av, 2); $dest[3] = vfmaq_laneq_f32($dest[3], $bv, $av, 3); } } for _ in 0..k { let a1 = vld1q_f32(a); let b1 = vld1q_f32(b); let a2 = vld1q_f32(a.add(4)); let b2 = vld1q_f32(b.add(4)); // compute an outer product ab = a (*) b in four quadrants ab11, ab12, ab21, ab22 // ab11: [a1 a2 a3 a4] (*) [b1 b2 b3 b4] // ab11: a1b1 a1b2 a1b3 a1b4 // a2b1 a2b2 a2b3 a2b4 // a3b1 a3b2 a3b3 a3b4 // a4b1 a4b2 a4b3 a4b4 // etc ab_ij_equals_ai_bj!(ab11, a1, b1); ab_ij_equals_ai_bj!(ab12, a1, b2); ab_ij_equals_ai_bj!(ab21, a2, b1); ab_ij_equals_ai_bj!(ab22, a2, b2); a = a.add(MR); b = b.add(NR); } macro_rules! c { ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize)); } // ab *= alpha loop4!(i, ab11[i] = vmulq_n_f32(ab11[i], alpha)); loop4!(i, ab12[i] = vmulq_n_f32(ab12[i], alpha)); loop4!(i, ab21[i] = vmulq_n_f32(ab21[i], alpha)); loop4!(i, ab22[i] = vmulq_n_f32(ab22[i], alpha)); // load one float32x4_t from four pointers macro_rules! loadq_from_pointers { ($p0:expr, $p1:expr, $p2:expr, $p3:expr) => ( { let v = vld1q_dup_f32($p0); let v = vld1q_lane_f32($p1, v, 1); let v = vld1q_lane_f32($p2, v, 2); let v = vld1q_lane_f32($p3, v, 3); v } ); } if beta != 0. { // load existing value in C let mut c11 = [vmovq_n_f32(0.); 4]; let mut c12 = [vmovq_n_f32(0.); 4]; let mut c21 = [vmovq_n_f32(0.); 4]; let mut c22 = [vmovq_n_f32(0.); 4]; if csc == 1 { loop4!(i, c11[i] = vld1q_f32(c![i + 0, 0])); loop4!(i, c12[i] = vld1q_f32(c![i + 0, 4])); loop4!(i, c21[i] = vld1q_f32(c![i + 4, 0])); loop4!(i, c22[i] = vld1q_f32(c![i + 4, 4])); } else { loop4!(i, c11[i] = loadq_from_pointers!(c![i + 0, 0], c![i + 0, 1], c![i + 0, 2], c![i + 0, 3])); loop4!(i, c12[i] = loadq_from_pointers!(c![i + 0, 4], c![i + 0, 5], c![i + 0, 6], c![i + 0, 7])); loop4!(i, c21[i] = loadq_from_pointers!(c![i + 4, 0], c![i + 4, 1], c![i + 4, 2], c![i + 4, 3])); loop4!(i, c22[i] = loadq_from_pointers!(c![i + 4, 4], c![i + 4, 5], c![i + 4, 6], c![i + 4, 7])); } let betav = vmovq_n_f32(beta); // ab += β C loop4!(i, ab11[i] = vfmaq_f32(ab11[i], c11[i], betav)); loop4!(i, ab12[i] = vfmaq_f32(ab12[i], c12[i], betav)); loop4!(i, ab21[i] = vfmaq_f32(ab21[i], c21[i], betav)); loop4!(i, ab22[i] = vfmaq_f32(ab22[i], c22[i], betav)); } // c <- ab // which is in full // C <- α A B (+ β C) if csc == 1 { loop4!(i, vst1q_f32(c![i + 0, 0], ab11[i])); loop4!(i, vst1q_f32(c![i + 0, 4], ab12[i])); loop4!(i, vst1q_f32(c![i + 4, 0], ab21[i])); loop4!(i, vst1q_f32(c![i + 4, 4], ab22[i])); } else { loop4!(i, vst1q_lane_f32(c![i + 0, 0], ab11[i], 0)); loop4!(i, vst1q_lane_f32(c![i + 0, 1], ab11[i], 1)); loop4!(i, vst1q_lane_f32(c![i + 0, 2], ab11[i], 2)); loop4!(i, vst1q_lane_f32(c![i + 0, 3], ab11[i], 3)); loop4!(i, vst1q_lane_f32(c![i + 0, 4], ab12[i], 0)); loop4!(i, vst1q_lane_f32(c![i + 0, 5], ab12[i], 1)); loop4!(i, vst1q_lane_f32(c![i + 0, 6], ab12[i], 2)); loop4!(i, vst1q_lane_f32(c![i + 0, 7], ab12[i], 3)); loop4!(i, vst1q_lane_f32(c![i + 4, 0], ab21[i], 0)); loop4!(i, vst1q_lane_f32(c![i + 4, 1], ab21[i], 1)); loop4!(i, vst1q_lane_f32(c![i + 4, 2], ab21[i], 2)); loop4!(i, vst1q_lane_f32(c![i + 4, 3], ab21[i], 3)); loop4!(i, 
vst1q_lane_f32(c![i + 4, 4], ab22[i], 0)); loop4!(i, vst1q_lane_f32(c![i + 4, 5], ab22[i], 1)); loop4!(i, vst1q_lane_f32(c![i + 4, 6], ab22[i], 2)); loop4!(i, vst1q_lane_f32(c![i + 4, 7], ab22[i], 3)); } } #[inline] unsafe fn kernel_fallback_impl(k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { const MR: usize = KernelFallback::MR; const NR: usize = KernelFallback::NR; let mut ab: [[T; NR]; MR] = [[0.; NR]; MR]; let mut a = a; let mut b = b; debug_assert_eq!(beta, 0., "Beta must be 0 or is not masked"); // Compute A B into ab[i][j] unroll_by!(4 => k, { loop8!(i, loop4!(j, ab[i][j] += at(a, i) * at(b, j))); a = a.offset(MR as isize); b = b.offset(NR as isize); }); macro_rules! c { ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize)); } // set C = α A B loop4!(j, loop8!(i, *c![i, j] = alpha * ab[i][j])); } #[inline(always)] unsafe fn at(ptr: *const T, i: usize) -> T { *ptr.offset(i as isize) } #[cfg(test)] mod tests { use super::*; use crate::kernel::test::test_a_kernel; #[test] fn test_kernel_fallback_impl() { test_a_kernel::("kernel"); } #[cfg(any(target_arch="x86", target_arch="x86_64"))] #[test] fn test_loop_m_n() { let mut m = [[0; KernelAvx::NR]; KernelAvx::MR]; loop_m!(i, loop_n!(j, m[i][j] += 1)); for arr in &m[..] { for elt in &arr[..] { assert_eq!(*elt, 1); } } } #[cfg(any(target_arch="aarch64"))] #[cfg(has_aarch64_simd)] mod test_kernel_aarch64 { use super::test_a_kernel; use super::super::*; #[cfg(feature = "std")] use std::println; macro_rules! test_arch_kernels_aarch64 { ($($feature_name:tt, $name:ident, $kernel_ty:ty),*) => { $( #[test] fn $name() { if is_aarch64_feature_detected_!($feature_name) { test_a_kernel::<$kernel_ty, _>(stringify!($name)); } else { #[cfg(feature = "std")] println!("Skipping, host does not have feature: {:?}", $feature_name); } } )* } } test_arch_kernels_aarch64! { "neon", neon8x8, KernelNeon } } #[cfg(any(target_arch="x86", target_arch="x86_64"))] mod test_kernel_x86 { use super::test_a_kernel; use super::super::*; #[cfg(feature = "std")] use std::println; macro_rules! test_arch_kernels_x86 { ($($feature_name:tt, $name:ident, $kernel_ty:ty),*) => { $( #[test] fn $name() { if is_x86_feature_detected_!($feature_name) { test_a_kernel::<$kernel_ty, _>(stringify!($name)); } else { #[cfg(feature = "std")] println!("Skipping, host does not have feature: {:?}", $feature_name); } } )* } } test_arch_kernels_x86! { "fma", fma, KernelFma, "avx", avx, KernelAvx, "sse2", sse2, KernelSse2 } #[test] fn ensure_target_features_tested() { // If enabled, this test ensures that the requested feature actually // was enabled on this configuration, so that it was tested. 
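            // Illustrative invocation (hypothetical shell command, matching the
            // environment variables read below):
            //
            //   MMTEST_FEATURE=avx MMTEST_ENSUREFEATURE=1 cargo test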
let should_ensure_feature = !option_env!("MMTEST_ENSUREFEATURE") .unwrap_or("").is_empty(); if !should_ensure_feature { // skip return; } let feature_name = option_env!("MMTEST_FEATURE") .expect("No MMTEST_FEATURE configured!"); let detected = match feature_name { "avx" => is_x86_feature_detected_!("avx"), "fma" => is_x86_feature_detected_!("fma"), "sse2" => is_x86_feature_detected_!("sse2"), _ => false, }; assert!(detected, "Feature {:?} was not detected, so it could not be tested", feature_name); } } } matrixmultiply-0.3.9/src/threading.rs000064400000000000000000000175171046102023000160560ustar 00000000000000/// /// Threading support functions and statics #[cfg(feature="threading")] use std::cmp::min; #[cfg(feature="threading")] use std::str::FromStr; #[cfg(feature="threading")] use once_cell::sync::Lazy; #[cfg(feature="threading")] pub use thread_tree::ThreadTree as ThreadPool; #[cfg(feature="threading")] pub use thread_tree::ThreadTreeCtx as ThreadPoolCtx; use crate::kernel::GemmKernel; use crate::util::RangeChunk; /// Dummy threadpool #[cfg(not(feature="threading"))] pub(crate) struct ThreadPool; #[cfg(not(feature="threading"))] pub(crate) type ThreadPoolCtx<'a> = &'a (); #[cfg(not(feature="threading"))] impl ThreadPool { /// Get top dummy thread pool context pub(crate) fn top(&self) -> ThreadPoolCtx<'_> { &() } } pub(crate) fn get_thread_pool<'a>() -> (usize, ThreadPoolCtx<'a>) { let reg = &*REGISTRY; (reg.nthreads, reg.thread_pool().top()) } struct Registry { nthreads: usize, #[cfg(feature="threading")] thread_pool: Box, } impl Registry { fn thread_pool(&self) -> &ThreadPool { #[cfg(feature="threading")] return &*REGISTRY.thread_pool; #[cfg(not(feature="threading"))] return &ThreadPool; } } #[cfg(not(feature="threading"))] const REGISTRY: &'static Registry = &Registry { nthreads: 1 }; #[cfg(feature="threading")] /// Maximum (usefully) supported threads at the moment const MAX_THREADS: usize = 4; #[cfg(feature="threading")] static REGISTRY: Lazy = Lazy::new(|| { let var = ::std::env::var("MATMUL_NUM_THREADS").ok(); let threads = match var { Some(s) if !s.is_empty() => { if let Ok(nt) = usize::from_str(&s) { nt } else { eprintln!("Failed to parse MATMUL_NUM_THREADS"); 1 } } _otherwise => num_cpus::get_physical(), }; // Ensure threads in 1 <= threads <= MAX_THREADS let threads = 1.max(threads).min(MAX_THREADS); let tp = if threads <= 1 { Box::new(ThreadPool::new_level0()) } else if threads <= 3 { ThreadPool::new_with_level(1) } else { ThreadPool::new_with_level(2) }; Registry { nthreads: threads, thread_pool: tp, } }); /// Describe how many threads we use in each loop #[derive(Copy, Clone)] pub(crate) struct LoopThreadConfig { /// Loop 3 threads pub(crate) loop3: u8, /// Loop 2 threads pub(crate) loop2: u8, } impl LoopThreadConfig { /// Decide how many threads to use in each loop pub(crate) fn new(m: usize, k: usize, n: usize, max_threads: usize) -> Self where K: GemmKernel { let default_config = LoopThreadConfig { loop3: 1, loop2: 1 }; #[cfg(not(feature="threading"))] { let _ = (m, k, n, max_threads); // used return default_config; } #[cfg(feature="threading")] { if max_threads == 1 { return default_config; } Self::new_impl(m, k, n, max_threads, K::mc()) } } #[cfg(feature="threading")] fn new_impl(m: usize, k: usize, n: usize, max_threads: usize, kmc: usize) -> Self { // use a heuristic to try not to use too many threads for smaller matrices let size_factor = m * k + k * n; let thread_factor = 1 << 14; // pure guesswork in terms of what the default should be let arch_factor = if 
cfg!(target_arch="arm") { 20 } else { 1 }; // At the moment only a configuration of 1, 2, or 4 threads is supported. // // Prefer to split Loop 3 if only 2 threads are available, (because it was better in a // square matrix benchmark). let matrix_max_threads = size_factor / (thread_factor / arch_factor); let mut max_threads = max_threads.min(matrix_max_threads); let loop3 = if max_threads >= 2 && m >= 3 * (kmc / 2) { max_threads /= 2; 2 } else { 1 }; let loop2 = if max_threads >= 2 { 2 } else { 1 }; LoopThreadConfig { loop3, loop2, } } /// Number of packing buffers for A #[inline(always)] pub(crate) fn num_pack_a(&self) -> usize { self.loop3 as usize } } impl RangeChunk { /// "Builder" method to create a RangeChunkParallel pub(crate) fn parallel(self, nthreads: u8, pool: ThreadPoolCtx) -> RangeChunkParallel { fn nop() {} RangeChunkParallel { nthreads, pool, range: self, thread_local: nop, } } } /// Intermediate struct for building the parallel execution of a range chunk. pub(crate) struct RangeChunkParallel<'a, G> { range: RangeChunk, nthreads: u8, pool: ThreadPoolCtx<'a>, thread_local: G, } impl<'a, G> RangeChunkParallel<'a, G> { #[cfg(feature="threading")] /// Set thread local setup function - called once per thread to setup thread local data. pub(crate) fn thread_local(self, func: G2) -> RangeChunkParallel<'a, G2> where G2: Fn(usize, usize) -> R + Sync { RangeChunkParallel { nthreads: self.nthreads, pool: self.pool, thread_local: func, range: self.range, } } #[cfg(not(feature="threading"))] /// Set thread local setup function - called once per thread to setup thread local data. pub(crate) fn thread_local(self, func: G2) -> RangeChunkParallel<'a, G2> where G2: FnOnce(usize, usize) -> R + Sync { RangeChunkParallel { nthreads: self.nthreads, pool: self.pool, thread_local: func, range: self.range, } } } #[cfg(not(feature="threading"))] impl RangeChunkParallel<'_, G> where G: FnOnce(usize, usize) -> R + Sync, { pub(crate) fn for_each(self, for_each: F) where F: Fn(ThreadPoolCtx<'_>, &mut R, usize, usize) + Sync, { let mut local = (self.thread_local)(0, 1); for (ln, chunk_size) in self.range { for_each(self.pool, &mut local, ln, chunk_size) } } } #[cfg(feature="threading")] impl RangeChunkParallel<'_, G> where G: Fn(usize, usize) -> R + Sync, { /// Execute loop iterations (parallel if enabled) using the given closure. 
/// /// The closure gets the following arguments for each iteration: /// /// - Thread pool context (used for child threads) /// - Mutable reference to thread local data /// - index of chunk (like RangeChunk) /// - size of chunk (like RangeChunk) pub(crate) fn for_each(self, for_each: F) where F: Fn(ThreadPoolCtx<'_>, &mut R, usize, usize) + Sync, { fn inner(range: RangeChunk, index: usize, nthreads: usize, pool: ThreadPoolCtx<'_>, thread_local: G, for_each: F) where G: Fn(usize, usize) -> R + Sync, F: Fn(ThreadPoolCtx<'_>, &mut R, usize, usize) + Sync { let mut local = thread_local(index, nthreads); for (ln, chunk_size) in range.part(index, nthreads) { for_each(pool, &mut local, ln, chunk_size) } } debug_assert!(self.nthreads <= 4, "this method does not support nthreads > 4, got {}", self.nthreads); let pool = self.pool; let range = self.range; let for_each = &for_each; let local = &self.thread_local; let nthreads = min(self.nthreads as usize, 4); let f = move |ctx: ThreadPoolCtx<'_>, i| inner(range, i, nthreads, ctx, local, for_each); if nthreads >= 4 { pool.join4(&f); } else if nthreads >= 3 { pool.join3l(&f); } else if nthreads >= 2 { pool.join(|ctx| f(ctx, 0), |ctx| f(ctx, 1)); } else { f(pool, 0) } } } matrixmultiply-0.3.9/src/util.rs000064400000000000000000000040201046102023000150470ustar 00000000000000// Copyright 2016 - 2018 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use core::cmp::min; #[derive(Copy, Clone)] pub struct RangeChunk { i: usize, n: usize, chunk: usize } /// Create an iterator that splits `n` in chunks of size `chunk`; /// the last item can be an uneven chunk. pub fn range_chunk(n: usize, chunk: usize) -> RangeChunk { RangeChunk { i: 0, n: n, chunk: chunk, } } impl Iterator for RangeChunk { type Item = (usize, usize); #[inline] fn next(&mut self) -> Option { if self.n == 0 { None } else { let i = self.i; let rem = min(self.n, self.chunk); self.i += 1; self.n -= rem; Some((i, rem)) } } } #[inline] pub fn round_up_to(x: usize, multiple_of: usize) -> usize { let (mut d, r) = (x / multiple_of, x % multiple_of); if r > 0 { d += 1; } d * multiple_of } impl RangeChunk { #[cfg(feature="threading")] /// Split the iterator in `total` parts and only iterate the `index`th part of it. /// The iterator must not have started when this is called. pub(crate) fn part(self, index: usize, total: usize) -> Self { debug_assert_eq!(self.i, 0, "range must be uniterated"); debug_assert_ne!(total, 0); let (n, chunk) = (self.n, self.chunk); // round up let mut nchunks = n / chunk; nchunks += (n % chunk != 0) as usize; // chunks per thread // round up let mut chunks_per = nchunks / total; chunks_per += (nchunks % total != 0) as usize; let i = chunks_per * index; let nn = min(n, (i + chunks_per) * chunk).saturating_sub(i * chunk); RangeChunk { i, n: nn, chunk } } } matrixmultiply-0.3.9/src/x86/macros.rs000064400000000000000000000020461046102023000160110ustar 00000000000000macro_rules! is_x86_feature_detected_ { ($name:tt) => {{ #[cfg(feature="std")] { // For testing purposes, we can make sure only one specific feature // is enabled by setting MMTEST_FEATURE=featurename (all others // disabled). This does not force it to be detected, it must also be. 
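// In other words, the whole expression is true only when MMTEST_FEATURE is
// unset/empty or names this feature, *and* runtime detection actually
// reports the feature on the host CPU.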
compile_env_matches_or_is_empty!("MMTEST_FEATURE", $name) && is_x86_feature_detected!($name) } #[cfg(not(feature="std"))] { // For testing purposes, we can make sure only one specific feature // is enabled by setting MMTEST_FEATURE=featurename (all others // disabled). This does not force it to be detected, it must also // be. In the `no_std` case, the `is_86_feature_detected` macro is // not available, so we have to fall back to checking whether the // feature is enabled at compile-time. compile_env_matches_or_is_empty!("MMTEST_FEATURE", $name) && cfg!(target_feature=$name) } }}; } matrixmultiply-0.3.9/src/x86/mod.rs000064400000000000000000000024431046102023000153050ustar 00000000000000 #[cfg(target_arch="x86")] use core::arch::x86::*; #[cfg(target_arch="x86_64")] use core::arch::x86_64::*; #[macro_use] mod macros; pub(crate) struct FusedMulAdd; pub(crate) struct AvxMulAdd; pub(crate) trait SMultiplyAdd { const IS_FUSED: bool; unsafe fn multiply_add(a: __m256, b: __m256, c: __m256) -> __m256; } impl SMultiplyAdd for AvxMulAdd { const IS_FUSED: bool = false; #[inline(always)] unsafe fn multiply_add(a: __m256, b: __m256, c: __m256) -> __m256 { _mm256_add_ps(_mm256_mul_ps(a, b), c) } } impl SMultiplyAdd for FusedMulAdd { const IS_FUSED: bool = true; #[inline(always)] unsafe fn multiply_add(a: __m256, b: __m256, c: __m256) -> __m256 { _mm256_fmadd_ps(a, b, c) } } pub(crate) trait DMultiplyAdd { const IS_FUSED: bool; unsafe fn multiply_add(a: __m256d, b: __m256d, c: __m256d) -> __m256d; } impl DMultiplyAdd for AvxMulAdd { const IS_FUSED: bool = false; #[inline(always)] unsafe fn multiply_add(a: __m256d, b: __m256d, c: __m256d) -> __m256d { _mm256_add_pd(_mm256_mul_pd(a, b), c) } } impl DMultiplyAdd for FusedMulAdd { const IS_FUSED: bool = true; #[inline(always)] unsafe fn multiply_add(a: __m256d, b: __m256d, c: __m256d) -> __m256d { _mm256_fmadd_pd(a, b, c) } } matrixmultiply-0.3.9/src/zgemm_kernel.rs000064400000000000000000000174231046102023000165640ustar 00000000000000// Copyright 2016 - 2021 Ulrik Sverdrup "bluss" // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use crate::kernel::GemmKernel; use crate::kernel::GemmSelect; use crate::kernel::{U2, U4, c64, Element, c64_mul as mul}; use crate::archparam; use crate::cgemm_common::pack_complex; #[cfg(any(target_arch="x86", target_arch="x86_64"))] struct KernelAvx2; #[cfg(any(target_arch="x86", target_arch="x86_64"))] struct KernelFma; #[cfg(target_arch = "aarch64")] #[cfg(has_aarch64_simd)] struct KernelNeon; struct KernelFallback; type T = c64; type TReal = f64; /// Detect which implementation to use and select it using the selector's /// .select(Kernel) method. /// /// This function is called one or more times during a whole program's /// execution, it may be called for each gemm kernel invocation or fewer times. #[inline] pub(crate) fn detect(selector: G) where G: GemmSelect { // dispatch to specific compiled versions #[cfg(any(target_arch="x86", target_arch="x86_64"))] { if is_x86_feature_detected_!("fma") { if is_x86_feature_detected_!("avx2") { return selector.select(KernelAvx2); } return selector.select(KernelFma); } } #[cfg(target_arch = "aarch64")] #[cfg(has_aarch64_simd)] { if is_aarch64_feature_detected_!("neon") { return selector.select(KernelNeon); } } return selector.select(KernelFallback); } macro_rules! loop_m { ($i:ident, $e:expr) => { loop4!($i, $e) }; } macro_rules! 
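// loop_m/loop_n fix the unrolled loop bounds to this file's micro-tile:
// MR = 4 rows and NR = 2 columns of c64 per kernel invocation.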
loop_n { ($j:ident, $e:expr) => { loop2!($j, $e) }; } #[cfg(any(target_arch="x86", target_arch="x86_64"))] impl GemmKernel for KernelAvx2 { type Elem = T; type MRTy = U4; type NRTy = U2; #[inline(always)] fn align_to() -> usize { 32 } #[inline(always)] fn always_masked() -> bool { KernelFallback::always_masked() } #[inline(always)] fn nc() -> usize { archparam::Z_NC } #[inline(always)] fn kc() -> usize { archparam::Z_KC } #[inline(always)] fn mc() -> usize { archparam::Z_MC } pack_methods!{} #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_avx2(k, alpha, a, b, beta, c, rsc, csc) } } #[cfg(any(target_arch="x86", target_arch="x86_64"))] impl GemmKernel for KernelFma { type Elem = T; type MRTy = ::MRTy; type NRTy = ::NRTy; #[inline(always)] fn align_to() -> usize { 16 } #[inline(always)] fn always_masked() -> bool { KernelFallback::always_masked() } #[inline(always)] fn nc() -> usize { archparam::Z_NC } #[inline(always)] fn kc() -> usize { archparam::Z_KC } #[inline(always)] fn mc() -> usize { archparam::Z_MC } pack_methods!{} #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_fma(k, alpha, a, b, beta, c, rsc, csc) } } #[cfg(target_arch = "aarch64")] #[cfg(has_aarch64_simd)] impl GemmKernel for KernelNeon { type Elem = T; type MRTy = U4; type NRTy = U2; #[inline(always)] fn align_to() -> usize { 16 } #[inline(always)] fn always_masked() -> bool { KernelFallback::always_masked() } #[inline(always)] fn nc() -> usize { archparam::Z_NC } #[inline(always)] fn kc() -> usize { archparam::Z_KC } #[inline(always)] fn mc() -> usize { archparam::Z_MC } pack_methods!{} #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_target_neon(k, alpha, a, b, beta, c, rsc, csc) } } impl GemmKernel for KernelFallback { type Elem = T; type MRTy = U4; type NRTy = U2; #[inline(always)] fn align_to() -> usize { 0 } #[inline(always)] fn always_masked() -> bool { true } #[inline(always)] fn nc() -> usize { archparam::Z_NC } #[inline(always)] fn kc() -> usize { archparam::Z_KC } #[inline(always)] fn mc() -> usize { archparam::Z_MC } pack_methods!{} #[inline(always)] unsafe fn kernel( k: usize, alpha: T, a: *const T, b: *const T, beta: T, c: *mut T, rsc: isize, csc: isize) { kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc) } } #[cfg(any(target_arch="x86", target_arch="x86_64"))] kernel_fallback_impl_complex! { // instantiate fma separately [inline target_feature(enable="fma") target_feature(enable="avx2")] [fma_yes] kernel_target_avx2, T, TReal, KernelAvx2::MR, KernelAvx2::NR, 4 } #[cfg(any(target_arch="x86", target_arch="x86_64"))] kernel_fallback_impl_complex! { // instantiate fma separately [inline target_feature(enable="fma")] [fma_no] kernel_target_fma, T, TReal, KernelFma::MR, KernelFma::NR, 2 } // Kernel neon #[cfg(target_arch = "aarch64")] #[cfg(has_aarch64_simd)] kernel_fallback_impl_complex! { [inline target_feature(enable="neon")] [fma_yes] kernel_target_neon, T, TReal, KernelNeon::MR, KernelNeon::NR, 1 } // kernel fallback kernel_fallback_impl_complex! 
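// Portable instantiation: this copy of the shared complex kernel body
// carries no target_feature attributes, so it builds on every architecture
// and backs the KernelFallback implementation that detect() falls back to.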
{ [inline] [fma_no] kernel_fallback_impl, T, TReal, KernelFallback::MR, KernelFallback::NR, 1 } #[inline(always)] unsafe fn at(ptr: *const TReal, i: usize) -> TReal { *ptr.add(i) } #[cfg(test)] mod tests { use super::*; use crate::kernel::test::test_complex_packed_kernel; #[test] fn test_kernel_fallback_impl() { test_complex_packed_kernel::("kernel"); } #[cfg(target_arch = "aarch64")] #[cfg(has_aarch64_simd)] mod test_kernel_aarch64 { use super::test_complex_packed_kernel; use super::super::*; #[cfg(feature = "std")] use std::println; macro_rules! test_arch_kernels { ($($feature_name:tt, $name:ident, $kernel_ty:ty),*) => { $( #[test] fn $name() { if is_aarch64_feature_detected_!($feature_name) { test_complex_packed_kernel::<$kernel_ty, _, TReal>(stringify!($name)); } else { #[cfg(feature = "std")] println!("Skipping, host does not have feature: {:?}", $feature_name); } } )* } } test_arch_kernels! { "neon", neon, KernelNeon } } #[cfg(any(target_arch="x86", target_arch="x86_64"))] mod test_arch_kernels { use super::test_complex_packed_kernel; use super::super::*; #[cfg(feature = "std")] use std::println; macro_rules! test_arch_kernels_x86 { ($($feature_name:tt, $name:ident, $kernel_ty:ty),*) => { $( #[test] fn $name() { if is_x86_feature_detected_!($feature_name) { test_complex_packed_kernel::<$kernel_ty, _, TReal>(stringify!($name)); } else { #[cfg(feature = "std")] println!("Skipping, host does not have feature: {:?}", $feature_name); } } )* } } test_arch_kernels_x86! { "fma", fma, KernelFma, "avx2", avx2, KernelAvx2 } } } matrixmultiply-0.3.9/testdefs/testdefs.rs000064400000000000000000000135761046102023000167650ustar 00000000000000 use matrixmultiply::{sgemm, dgemm}; #[cfg(feature="cgemm")] use matrixmultiply::{cgemm, zgemm, CGemmOption}; // Common code for tests - generic treatment of f32, f64, c32, c64 and Gemm pub trait Float : Copy + std::fmt::Debug + PartialEq { fn zero() -> Self; fn one() -> Self; // construct as number x fn from(x: i64) -> Self; // construct as number x + yi, but ignore y if not complex fn from2(x: i64, _y: i64) -> Self { Self::from(x) } fn nan() -> Self; fn real(self) -> Self { self } fn imag(self) -> Self; fn is_nan(self) -> bool; fn is_complex() -> bool { false } fn diff(self, rhs: Self) -> Self; // absolute value as f64 fn abs_f64(self) -> f64; fn relative_error_scale() -> f64; fn mul_add_assign(&mut self, x: Self, y: Self); } impl Float for f32 { fn zero() -> Self { 0. } fn one() -> Self { 1. } fn from(x: i64) -> Self { x as Self } fn nan() -> Self { 0./0. } fn imag(self) -> Self { 0. } fn is_nan(self) -> bool { self.is_nan() } fn diff(self, rhs: Self) -> Self { self - rhs } fn abs_f64(self) -> f64 { self.abs() as f64 } fn relative_error_scale() -> f64 { 1e-6 } fn mul_add_assign(&mut self, x: Self, y: Self) { *self += x * y; } } impl Float for f64 { fn zero() -> Self { 0. } fn one() -> Self { 1. } fn from(x: i64) -> Self { x as Self } fn nan() -> Self { 0./0. } fn imag(self) -> Self { 0. } fn is_nan(self) -> bool { self.is_nan() } fn diff(self, rhs: Self) -> Self { self - rhs } fn abs_f64(self) -> f64 { self.abs() } fn relative_error_scale() -> f64 { 1e-12 } fn mul_add_assign(&mut self, x: Self, y: Self) { *self += x * y; } } #[allow(non_camel_case_types)] #[cfg(feature="cgemm")] pub type c32 = [f32; 2]; #[allow(non_camel_case_types)] #[cfg(feature="cgemm")] pub type c64 = [f64; 2]; #[cfg(feature="cgemm")] impl Float for c32 { fn zero() -> Self { [0., 0.] } fn one() -> Self { [1., 0.] } fn from(x: i64) -> Self { [x as _, 0.] 
} fn from2(x: i64, y: i64) -> Self { [x as _, y as _] } fn nan() -> Self { [0./0., 0./0.] } fn real(self) -> Self { [self[0], 0.] } fn imag(self) -> Self { [self[1], 0.] } fn is_nan(self) -> bool { self[0].is_nan() || self[1].is_nan() } fn is_complex() -> bool { true } fn diff(self, rhs: Self) -> Self { [self[0] - rhs[0], self[1] - rhs[1]] } fn abs_f64(self) -> f64 { (self[0].powi(2) + self[1].powi(2)).sqrt() as f64 } fn relative_error_scale() -> f64 { 1e-6 } fn mul_add_assign(&mut self, x: Self, y: Self) { let [re, im] = c32_mul(x, y); self[0] += re; self[1] += im; } } #[cfg(feature="cgemm")] impl Float for c64 { fn zero() -> Self { [0., 0.] } fn one() -> Self { [1., 0.] } fn from(x: i64) -> Self { [x as _, 0.] } fn from2(x: i64, y: i64) -> Self { [x as _, y as _] } fn nan() -> Self { [0./0., 0./0.] } fn real(self) -> Self { [self[0], 0.] } fn imag(self) -> Self { [self[1], 0.] } fn is_nan(self) -> bool { self[0].is_nan() || self[1].is_nan() } fn is_complex() -> bool { true } fn diff(self, rhs: Self) -> Self { [self[0] - rhs[0], self[1] - rhs[1]] } fn abs_f64(self) -> f64 { (self[0].powi(2) + self[1].powi(2)).sqrt() } fn relative_error_scale() -> f64 { 1e-12 } fn mul_add_assign(&mut self, x: Self, y: Self) { let [re, im] = c64_mul(x, y); self[0] += re; self[1] += im; } } #[cfg(feature = "cgemm")] #[inline(always)] pub(crate) fn c32_mul(x: c32, y: c32) -> c32 { let [a, b] = x; let [c, d] = y; [a * c - b * d, b * c + a * d] } #[cfg(feature = "cgemm")] #[inline(always)] pub(crate) fn c64_mul(x: c64, y: c64) -> c64 { let [a, b] = x; let [c, d] = y; [a * c - b * d, b * c + a * d] } pub trait Gemm : Sized { unsafe fn gemm( m: usize, k: usize, n: usize, alpha: Self, a: *const Self, rsa: isize, csa: isize, b: *const Self, rsb: isize, csb: isize, beta: Self, c: *mut Self, rsc: isize, csc: isize); } impl Gemm for f32 { unsafe fn gemm( m: usize, k: usize, n: usize, alpha: Self, a: *const Self, rsa: isize, csa: isize, b: *const Self, rsb: isize, csb: isize, beta: Self, c: *mut Self, rsc: isize, csc: isize) { sgemm( m, k, n, alpha, a, rsa, csa, b, rsb, csb, beta, c, rsc, csc) } } impl Gemm for f64 { unsafe fn gemm( m: usize, k: usize, n: usize, alpha: Self, a: *const Self, rsa: isize, csa: isize, b: *const Self, rsb: isize, csb: isize, beta: Self, c: *mut Self, rsc: isize, csc: isize) { dgemm( m, k, n, alpha, a, rsa, csa, b, rsb, csb, beta, c, rsc, csc) } } #[cfg(feature="cgemm")] impl Gemm for c32 { unsafe fn gemm( m: usize, k: usize, n: usize, alpha: Self, a: *const Self, rsa: isize, csa: isize, b: *const Self, rsb: isize, csb: isize, beta: Self, c: *mut Self, rsc: isize, csc: isize) { cgemm( CGemmOption::Standard, CGemmOption::Standard, m, k, n, alpha, a, rsa, csa, b, rsb, csb, beta, c, rsc, csc) } } #[cfg(feature="cgemm")] impl Gemm for c64 { unsafe fn gemm( m: usize, k: usize, n: usize, alpha: Self, a: *const Self, rsa: isize, csa: isize, b: *const Self, rsb: isize, csb: isize, beta: Self, c: *mut Self, rsc: isize, csc: isize) { zgemm( CGemmOption::Standard, CGemmOption::Standard, m, k, n, alpha, a, rsa, csa, b, rsb, csb, beta, c, rsc, csc) } } matrixmultiply-0.3.9/tests/sgemm.rs000064400000000000000000000347131046102023000155710ustar 00000000000000extern crate core; extern crate itertools; extern crate matrixmultiply; include!("../testdefs/testdefs.rs"); use itertools::Itertools; use itertools::{ cloned, enumerate, repeat_n, }; use core::fmt::Debug; const FAST_TEST: Option<&'static str> = option_env!("MMTEST_FAST_TEST"); #[test] fn test_sgemm() { test_gemm::(); } #[test] fn test_dgemm() { 
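// Each of these per-type tests simply forwards to the generic test_gemm
// driver; the concrete element type comes in through the Gemm + Float
// traits pulled in from the included testdefs.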
test_gemm::(); } #[cfg(feature="cgemm")] #[test] fn test_cgemm() { test_gemm::(); } #[cfg(feature="cgemm")] #[test] fn test_cgemm_complex() { test_complex::(4, 4, 4, true); test_complex::(16, 32, 8, false); test_complex::(63, 65, 67, false); } #[cfg(feature="cgemm")] #[test] fn test_zgemm() { test_gemm::(); } #[cfg(feature="cgemm")] #[test] fn test_zgemm_complex() { test_complex::(4, 4, 4, true); test_complex::(16, 32, 8, false); test_complex::(63, 65, 67, false); } #[test] fn test_sgemm_strides() { test_gemm_strides::(); } #[test] fn test_dgemm_strides() { test_gemm_strides::(); } #[cfg(feature="cgemm")] #[test] fn test_cgemm_strides() { test_gemm_strides::(); } #[cfg(feature="cgemm")] #[test] fn test_zgemm_strides() { test_gemm_strides::(); } fn test_gemm_strides() where F: Gemm + Float { if FAST_TEST.is_some() { return; } for n in 0..20 { test_strides::(n, n, n); } for n in (3..12).map(|x| x * 7) { test_strides::(n, n, n); } test_strides::(8, 12, 16); test_strides::(8, 0, 10); } fn test_gemm() where F: Gemm + Float { test_mul_with_id::(4, 4, true); test_mul_with_id::(8, 8, true); test_mul_with_id::(32, 32, true); if FAST_TEST.is_some() { return; } test_mul_with_id::(128, 128, false); test_mul_with_id::(17, 128, false); for i in 0..12 { for j in 0..12 { test_mul_with_id::(i, j, true); } } test_mul_with_id::(17, 257, false); test_mul_with_id::(24, 512, false); for i in 0..10 { for j in 0..10 { test_mul_with_id::(i * 4, j * 4, true); } } test_mul_with_id::(266, 265, false); test_mul_id_with::(4, 4, true); for i in 0..12 { for j in 0..12 { test_mul_id_with::(i, j, true); } } test_mul_id_with::(266, 265, false); test_scale::(0, 4, 4, true); test_scale::(4, 0, 4, true); test_scale::(4, 4, 0, true); test_scale::(4, 4, 4, true); test_scale::(19, 20, 16, true); test_scale::(150, 140, 128, false); } /// multiply a M x N matrix with an N x N id matrix #[cfg(test)] fn test_mul_with_id(m: usize, n: usize, small: bool) where F: Gemm + Float { if !small && FAST_TEST.is_some() { return; } let (m, k, n) = (m, n, n); let mut a = vec![F::zero(); m * k]; let mut b = vec![F::zero(); k * n]; let mut c = vec![F::zero(); m * n]; println!("test matrix with id input M={}, N={}", m, n); for (i, elt) in a.iter_mut().enumerate() { *elt = F::from(i as i64); } for i in 0..k { b[i + i * k] = F::one(); } unsafe { F::gemm( m, k, n, F::one(), a.as_ptr(), k as isize, 1, b.as_ptr(), n as isize, 1, F::zero(), c.as_mut_ptr(), n as isize, 1, ) } assert_matrix_equal(m, n, &a, k as isize, 1, &c, n as isize, 1, small); println!("passed matrix with id input M={}, N={}", m, n); } /// multiply a K x K id matrix with an K x N matrix #[cfg(test)] fn test_mul_id_with(k: usize, n: usize, small: bool) where F: Gemm + Float { if !small && FAST_TEST.is_some() { return; } let (m, k, n) = (k, k, n); let mut a = vec![F::zero(); m * k]; let mut b = vec![F::zero(); k * n]; let mut c = vec![F::zero(); m * n]; for i in 0..k { a[i + i * k] = F::one(); } for (i, elt) in b.iter_mut().enumerate() { *elt = F::from(i as i64); } unsafe { F::gemm( m, k, n, F::one(), a.as_ptr(), k as isize, 1, b.as_ptr(), n as isize, 1, F::zero(), c.as_mut_ptr(), n as isize, 1, ) } assert_matrix_equal(m, n, &b, n as isize, 1, &c, n as isize, 1, small); println!("passed id with matrix input K={}, N={}", k, n); } #[cfg(test)] fn test_scale(m: usize, k: usize, n: usize, small: bool) where F: Gemm + Float { if !small && FAST_TEST.is_some() { return; } let (m, k, n) = (m, k, n); let mut a = vec![F::zero(); m * k]; let mut b = vec![F::zero(); k * n]; let mut c1 = 
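// c1 is initialized with ones on purpose: the first gemm call below uses
// beta = 0 and must completely overwrite the existing contents of the output.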
vec![F::one(); m * n]; let mut c2 = vec![F::nan(); m * n]; // init c2 with NaN to test the overwriting behavior when beta = 0. for (i, elt) in a.iter_mut().enumerate() { *elt = F::from2(i as i64, i as i64); } for (i, elt) in b.iter_mut().enumerate() { *elt = F::from2(i as i64, i as i64); } let alpha1; let beta1 = F::zero(); let alpha21; let beta21; let alpha22; let beta22; if !F::is_complex() { // 3 A B == C in this way: // C <- A B // C <- A B + 2 C alpha1 = F::from(3); alpha21 = F::one(); beta21 = F::zero(); alpha22 = F::one(); beta22 = F::from(2); } else { // Select constants in a way that makes the complex values // significant for the complex case. Using i² = -1 to make sure. // // (2 + 3i) A B == C in this way: // C <- (1 + i) A B // C <- A B + (2 + i) C == (3 + 3i - 1) A B alpha1 = F::from2(2, 3); alpha21 = F::from2(1, 1); beta21 = F::zero(); alpha22 = F::one(); beta22 = F::from2(2, 1); } unsafe { // C1 = alpha1 A B F::gemm( m, k, n, alpha1, a.as_ptr(), k as isize, 1, b.as_ptr(), n as isize, 1, beta1, c1.as_mut_ptr(), n as isize, 1, ); // C2 = alpha21 A B F::gemm( m, k, n, alpha21, a.as_ptr(), k as isize, 1, b.as_ptr(), n as isize, 1, beta21, c2.as_mut_ptr(), n as isize, 1, ); // C2 = A B + beta22 C2 F::gemm( m, k, n, alpha22, a.as_ptr(), k as isize, 1, b.as_ptr(), n as isize, 1, beta22, c2.as_mut_ptr(), n as isize, 1, ); } assert_matrix_equal(m, n, &c1, n as isize, 1, &c2, n as isize, 1, small); println!("passed matrix with id input M={}, N={}", m, n); } #[cfg(feature="cgemm")] #[cfg(test)] fn test_complex(m: usize, k: usize, n: usize, small: bool) where F: Gemm + Float { if !small && FAST_TEST.is_some() { return; } let (m, k, n) = (m, k, n); let mut a = vec![F::zero(); m * k]; let mut b = vec![F::zero(); k * n]; let mut c1 = vec![F::zero(); m * n]; let mut c2 = vec![F::zero(); m * n]; for (i, elt) in a.iter_mut().enumerate() { *elt = F::from2(i as i64, -(i as i64)); } for (i, elt) in b.iter_mut().enumerate() { *elt = F::from2(-(i as i64), i as i64); } let alpha1 = F::from2(3, 2); let beta1 = F::zero(); unsafe { // C1 = alpha1 A B F::gemm( m, k, n, alpha1, a.as_ptr(), k as isize, 1, b.as_ptr(), n as isize, 1, beta1, c1.as_mut_ptr(), n as isize, 1, ); } // reference computation // matmul for i in 0..m { for j in 0..n { for ki in 0..k { c2[i * n + j].mul_add_assign(a[i * k + ki], b[ki * n + j]); } } } // multiply by alpha for i in 0..m { for j in 0..n { let elt = &mut c2[i * n + j]; let mut scaled = F::zero(); scaled.mul_add_assign(alpha1, *elt); *elt = scaled; } } assert_matrix_equal(m, n, &c1, n as isize, 1, &c2, n as isize, 1, small); println!("passed matrix with id input M={}, N={}", m, n); } // // Custom stride tests // #[derive(Copy, Clone, Debug)] enum Layout { C, F } use self::Layout::*; impl Layout { fn strides_scaled(self, m: usize, n: usize, scale: [usize; 2]) -> (isize, isize) { match self { C => ((n * scale[0] * scale[1]) as isize, scale[1] as isize), F => (scale[0] as isize, (m * scale[1] * scale[0]) as isize), } } } impl Default for Layout { fn default() -> Self { C } } #[cfg(test)] fn test_strides(m: usize, k: usize, n: usize) where F: Gemm + Float { let (m, k, n) = (m, k, n); let stride_multipliers = vec![[1, 2], [2, 2], [2, 3], [1, 1], [2, 2], [4, 1], [3, 4]]; let mut multipliers_iter = cloned(&stride_multipliers).cycle(); let layout_species = [C, F]; let layouts_iter = repeat_n(cloned(&layout_species), 4).multi_cartesian_product(); for elt in layouts_iter { let layouts = [elt[0], elt[1], elt[2], elt[3]]; let (m0, m1, m2, m3) = 
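// Take one stride-multiplier pair for each of the four matrices (A, B, C1
// and C2) from the cycling list above, so each layout combination is also
// exercised with varying stride padding.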
multipliers_iter.next_tuple().unwrap(); test_strides_inner::(m, k, n, [m0, m1, m2, m3], layouts); } } fn test_strides_inner(m: usize, k: usize, n: usize, stride_multipliers: [[usize; 2]; 4], layouts: [Layout; 4]) where F: Gemm + Float { let (m, k, n) = (m, k, n); let small = m < 8 && k < 8 && n < 8; // stride multipliers let mstridea = stride_multipliers[0]; let mstrideb = stride_multipliers[1]; let mstridec = stride_multipliers[2]; let mstridec2 = stride_multipliers[3]; let mut a = vec![F::zero(); m * k * mstridea[0] * mstridea[1]]; let mut b = vec![F::zero(); k * n * mstrideb[0] * mstrideb[1]]; let mut c1 = vec![F::nan(); m * n * mstridec[0] * mstridec[1]]; let mut c2 = vec![F::nan(); m * n * mstridec2[0] * mstridec2[1]]; for (i, elt) in a.iter_mut().enumerate() { *elt = F::from(i as i64); } for (i, elt) in b.iter_mut().enumerate() { *elt = F::from(i as i64); } let la = layouts[0]; let lb = layouts[1]; let lc1 = layouts[2]; let lc2 = layouts[3]; let (rs_a, cs_a) = la.strides_scaled(m, k, mstridea); let (rs_b, cs_b) = lb.strides_scaled(k, n, mstrideb); let (rs_c1, cs_c1) = lc1.strides_scaled(m, n, mstridec); let (rs_c2, cs_c2) = lc2.strides_scaled(m, n, mstridec2); println!("Test matrix a : {} × {} layout: {:?} strides {}, {}", m, k, la, rs_a, cs_a); println!("Test matrix b : {} × {} layout: {:?} strides {}, {}", k, n, lb, rs_b, cs_b); println!("Test matrix c1: {} × {} layout: {:?} strides {}, {}", m, n, lc1, rs_c1, cs_c1); println!("Test matrix c2: {} × {} layout: {:?} strides {}, {}", m, n, lc2, rs_c2, cs_c2); unsafe { // Compute the same result in C1 and C2 in two different ways. // We only use whole integer values in the low range of floats here, // so we have no loss of precision. // // C1 = A B F::gemm( m, k, n, F::from(1), a.as_ptr(), rs_a, cs_a, b.as_ptr(), rs_b, cs_b, F::zero(), c1.as_mut_ptr(), rs_c1, cs_c1, ); // C1 += 2 A B F::gemm( m, k, n, F::from(2), a.as_ptr(), rs_a, cs_a, b.as_ptr(), rs_b, cs_b, F::from(1), c1.as_mut_ptr(), rs_c1, cs_c1, ); // C2 = 3 A B F::gemm( m, k, n, F::from(3), a.as_ptr(), rs_a, cs_a, b.as_ptr(), rs_b, cs_b, F::zero(), c2.as_mut_ptr(), rs_c2, cs_c2, ); } assert_matrix_equal(m, n, &c1, rs_c1, cs_c1, &c2, rs_c2, cs_c2, small); // check we haven't overwritten the NaN values outside the passed output for (index, elt) in enumerate(&c1) { let i = index / rs_c1 as usize; let j = index / cs_c1 as usize; let irem = index % rs_c1 as usize; let jrem = index % cs_c1 as usize; if irem != 0 && jrem != 0 { assert!(elt.is_nan(), "Element at index={} ({}, {}) should be NaN, but was {:?}\n\ c1: {:?}\n", index, i, j, elt, c1); } } println!("{}×{}×{} {:?} .. passed.", m, k, n, layouts); } /// Assert that matrix C1 == matrix C2 /// /// m, n: size of matrix C1 and C2 /// /// exact: if true, require == equality /// if false, use relative difference from zero fn assert_matrix_equal(m: usize, n: usize, c1: &[F], rs_c1: isize, cs_c1: isize, c2: &[F], rs_c2: isize, cs_c2: isize, exact: bool) where F: Gemm + Float { macro_rules! c1 { ($i:expr, $j:expr) => (c1[(rs_c1 * $i as isize + cs_c1 * $j as isize) as usize]); } macro_rules! 
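// The c2! helper defined next mirrors c1!, indexing the second result
// matrix through its own row and column strides.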
c2 {
        ($i:expr, $j:expr) => (c2[(rs_c2 * $i as isize + cs_c2 * $j as isize) as usize]);
    }
    let rel_tolerance = F::relative_error_scale();
    let mut maximal = 0.;
    let mut rel_diff_max = 0.;
    let mut n_diffs = 0;
    let mut first_diff_index = None;
    for i in 0..m {
        for j in 0..n {
            let c1_elt = c1![i, j];
            let c2_elt = c2![i, j];
            let c1norm = c1_elt.abs_f64();
            if c1norm > maximal { maximal = c1norm }
            let c2norm = c2_elt.abs_f64();
            if c2norm > maximal { maximal = c2norm }
            let c_diff = c1_elt.diff(c2_elt);
            if c_diff != F::zero() {
                n_diffs += 1;
                if first_diff_index.is_none() {
                    first_diff_index = Some((i, j));
                }
            }
            let largest_elt = f64::max(c1norm, c2norm);
            let point_diff_rel = c_diff.abs_f64() / largest_elt;
            rel_diff_max = f64::max(point_diff_rel, rel_diff_max);
        }
    }
    if n_diffs > 0 {
        println!("Matrix equality stats: maximal elt: {}, largest relative error={:.4e}, ndiffs={}",
                 maximal, rel_diff_max, n_diffs);
    }
    eprintln!("c1: {:?}, {}, {}", c1, rs_c1, cs_c1);
    eprintln!("c2: {:?}, {}, {}", c2, rs_c2, cs_c2);
    if exact {
        assert_eq!(0, n_diffs,
                   "C1 == C2 assertion failed for matrix of size {}x{} with first failing element at index={:?}",
                   m, n, first_diff_index);
    } else {
        assert!(rel_diff_max < rel_tolerance,
                "Assertion failed: largest relative diff < {:.2e}, was={:e}",
                rel_tolerance, rel_diff_max);
    }
}
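// Illustrative sketch only, not part of the crate's test suite: a minimal,
// concrete use of the public `sgemm` entry point with the same row-major
// stride convention the tests above exercise. The function name and the
// numeric values are invented for the example; `sgemm` itself is already in
// scope here through the testdefs include at the top of this file.
#[allow(dead_code)]
fn example_row_major_sgemm() {
    // A and B are 2x2, stored row-major: row stride = number of columns,
    // column stride = 1.
    let a = [1.0_f32, 2.0, 3.0, 4.0];
    let b = [5.0_f32, 6.0, 7.0, 8.0];
    let mut c = [0.0_f32; 4];
    unsafe {
        // C = 1.0 * A B + 0.0 * C
        sgemm(2, 2, 2,
              1.0, a.as_ptr(), 2, 1,
              b.as_ptr(), 2, 1,
              0.0, c.as_mut_ptr(), 2, 1);
    }
    // [[1, 2], [3, 4]] * [[5, 6], [7, 8]] = [[19, 22], [43, 50]]
    assert_eq!(c, [19.0, 22.0, 43.0, 50.0]);
}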