sc-membench-1.2.1/LICENSE

Mozilla Public License Version 2.0
==================================

1. Definitions
--------------

1.1. "Contributor" means each individual or legal entity that creates, contributes to the creation of, or owns Covered Software.

1.2. "Contributor Version" means the combination of the Contributions of others (if any) used by a Contributor and that particular Contributor's Contribution.

1.3. "Contribution" means Covered Software of a particular Contributor.

1.4. "Covered Software" means Source Code Form to which the initial Contributor has attached the notice in Exhibit A, the Executable Form of such Source Code Form, and Modifications of such Source Code Form, in each case including portions thereof.

1.5. "Incompatible With Secondary Licenses" means (a) that the initial Contributor has attached the notice described in Exhibit B to the Covered Software; or (b) that the Covered Software was made available under the terms of version 1.1 or earlier of the License, but not also under the terms of a Secondary License.

1.6. "Executable Form" means any form of the work other than Source Code Form.

1.7. "Larger Work" means a work that combines Covered Software with other material, in a separate file or files, that is not Covered Software.

1.8. "License" means this document.

1.9. "Licensable" means having the right to grant, to the maximum extent possible, whether at the time of the initial grant or subsequently, any and all of the rights conveyed by this License.

1.10. "Modifications" means any of the following: (a) any file in Source Code Form that results from an addition to, deletion from, or modification of the contents of Covered Software; or (b) any new file in Source Code Form that contains any Covered Software.

1.11. "Patent Claims" of a Contributor means any patent claim(s), including without limitation, method, process, and apparatus claims, in any patent Licensable by such Contributor that would be infringed, but for the grant of the License, by the making, using, selling, offering for sale, having made, import, or transfer of either its Contributions or its Contributor Version.

1.12. "Secondary License" means either the GNU General Public License, Version 2.0, the GNU Lesser General Public License, Version 2.1, the GNU Affero General Public License, Version 3.0, or any later versions of those licenses.

1.13. "Source Code Form" means the form of the work preferred for making modifications.

1.14. "You" (or "Your") means an individual or a legal entity exercising rights under this License. For legal entities, "You" includes any entity that controls, is controlled by, or is under common control with You. For purposes of this definition, "control" means (a) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (b) ownership of more than fifty percent (50%) of the outstanding shares or beneficial ownership of such entity.

2. License Grants and Conditions
--------------------------------

2.1. Grants

Each Contributor hereby grants You a world-wide, royalty-free, non-exclusive license: (a) under intellectual property rights (other than patent or trademark) Licensable by such Contributor to use, reproduce, make available, modify, display, perform, distribute, and otherwise exploit its Contributions, either on an unmodified basis, with Modifications, or as part of a Larger Work; and (b) under Patent Claims of such Contributor to make, use, sell, offer for sale, have made, import, and otherwise transfer either its Contributions or its Contributor Version.

2.2. Effective Date

The licenses granted in Section 2.1 with respect to any Contribution become effective for each Contribution on the date the Contributor first distributes such Contribution.

2.3. Limitations on Grant Scope

The licenses granted in this Section 2 are the only rights granted under this License. No additional rights or licenses will be implied from the distribution or licensing of Covered Software under this License. Notwithstanding Section 2.1(b) above, no patent license is granted by a Contributor: (a) for any code that a Contributor has removed from Covered Software; or (b) for infringements caused by: (i) Your and any other third party's modifications of Covered Software, or (ii) the combination of its Contributions with other software (except as part of its Contributor Version); or (c) under Patent Claims infringed by Covered Software in the absence of its Contributions. This License does not grant any rights in the trademarks, service marks, or logos of any Contributor (except as may be necessary to comply with the notice requirements in Section 3.4).

2.4. Subsequent Licenses

No Contributor makes additional grants as a result of Your choice to distribute the Covered Software under a subsequent version of this License (see Section 10.2) or under the terms of a Secondary License (if permitted under the terms of Section 3.3).

2.5. Representation

Each Contributor represents that the Contributor believes its Contributions are its original creation(s) or it has sufficient rights to grant the rights to its Contributions conveyed by this License.

2.6. Fair Use

This License is not intended to limit any rights You have under applicable copyright doctrines of fair use, fair dealing, or other equivalents.

2.7. Conditions

Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in Section 2.1.

3. Responsibilities
-------------------

3.1. Distribution of Source Form

All distribution of Covered Software in Source Code Form, including any Modifications that You create or to which You contribute, must be under the terms of this License. You must inform recipients that the Source Code Form of the Covered Software is governed by the terms of this License, and how they can obtain a copy of this License. You may not attempt to alter or restrict the recipients' rights in the Source Code Form.

3.2. Distribution of Executable Form

If You distribute Covered Software in Executable Form then: (a) such Covered Software must also be made available in Source Code Form, as described in Section 3.1, and You must inform recipients of the Executable Form how they can obtain a copy of such Source Code Form by reasonable means in a timely manner, at a charge no more than the cost of distribution to the recipient; and (b) You may distribute such Executable Form under the terms of this License, or sublicense it under different terms, provided that the license for the Executable Form does not attempt to limit or alter the recipients' rights in the Source Code Form under this License.

3.3. Distribution of a Larger Work

You may create and distribute a Larger Work under terms of Your choice, provided that You also comply with the requirements of this License for the Covered Software. If the Larger Work is a combination of Covered Software with a work governed by one or more Secondary Licenses, and the Covered Software is not Incompatible With Secondary Licenses, this License permits You to additionally distribute such Covered Software under the terms of such Secondary License(s), so that the recipient of the Larger Work may, at their option, further distribute the Covered Software under the terms of either this License or such Secondary License(s).

3.4. Notices

You may not remove or alter the substance of any license notices (including copyright notices, patent notices, disclaimers of warranty, or limitations of liability) contained within the Source Code Form of the Covered Software, except that You may alter any license notices to the extent required to remedy known factual inaccuracies.

3.5. Application of Additional Terms

You may choose to offer, and to charge a fee for, warranty, support, indemnity or liability obligations to one or more recipients of Covered Software. However, You may do so only on Your own behalf, and not on behalf of any Contributor. You must make it absolutely clear that any such warranty, support, indemnity, or liability obligation is offered by You alone, and You hereby agree to indemnify every Contributor for any liability incurred by such Contributor as a result of warranty, support, indemnity or liability terms You offer. You may include additional disclaimers of warranty and limitations of liability specific to any jurisdiction.

4. Inability to Comply Due to Statute or Regulation
---------------------------------------------------

If it is impossible for You to comply with any of the terms of this License with respect to some or all of the Covered Software due to statute, judicial order, or regulation then You must: (a) comply with the terms of this License to the maximum extent possible; and (b) describe the limitations and the code they affect. Such description must be placed in a text file included with all distributions of the Covered Software under this License. Except to the extent prohibited by statute or regulation, such description must be sufficiently detailed for a recipient of ordinary skill to be able to understand it.

5. Termination
--------------

5.1. The rights granted under this License will terminate automatically if You fail to comply with any of its terms. However, if You become compliant, then the rights granted under this License from a particular Contributor are reinstated (a) provisionally, unless and until such Contributor explicitly and finally terminates Your grants, and (b) on an ongoing basis, if such Contributor fails to notify You of the non-compliance by some reasonable means prior to 60 days after You have come back into compliance. Moreover, Your grants from a particular Contributor are reinstated on an ongoing basis if such Contributor notifies You of the non-compliance by some reasonable means, this is the first time You have received notice of non-compliance with this License from such Contributor, and You become compliant prior to 30 days after Your receipt of the notice.

5.2. If You initiate litigation against any entity by asserting a patent infringement claim (excluding declaratory judgment actions, counter-claims, and cross-claims) alleging that a Contributor Version directly or indirectly infringes any patent, then the rights granted to You by any and all Contributors for the Covered Software under Section 2.1 of this License shall terminate.

5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user license agreements (excluding distributors and resellers) which have been validly granted by You or Your distributors under this License prior to termination shall survive termination.

6. Disclaimer of Warranty
-------------------------

Covered Software is provided under this License on an "as is" basis, without warranty of any kind, either expressed, implied, or statutory, including, without limitation, warranties that the Covered Software is free of defects, merchantable, fit for a particular purpose or non-infringing. The entire risk as to the quality and performance of the Covered Software is with You. Should any Covered Software prove defective in any respect, You (not any Contributor) assume the cost of any necessary servicing, repair, or correction. This disclaimer of warranty constitutes an essential part of this License. No use of any Covered Software is authorized under this License except under this disclaimer.

7. Limitation of Liability
--------------------------

Under no circumstances and under no legal theory, whether tort (including negligence), contract, or otherwise, shall any Contributor, or anyone who distributes Covered Software as permitted above, be liable to You for any direct, indirect, special, incidental, or consequential damages of any character including, without limitation, damages for lost profits, loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses, even if such party shall have been informed of the possibility of such damages. This limitation of liability shall not apply to liability for death or personal injury resulting from such party's negligence to the extent applicable law prohibits such limitation. Some jurisdictions do not allow the exclusion or limitation of incidental or consequential damages, so this exclusion and limitation may not apply to You.

8. Litigation
-------------

Any litigation relating to this License may be brought only in the courts of a jurisdiction where the defendant maintains its principal place of business and such litigation shall be governed by laws of that jurisdiction, without reference to its conflict-of-law provisions. Nothing in this Section shall prevent a party's ability to bring cross-claims or counter-claims.

9. Miscellaneous
----------------

This License represents the complete agreement concerning the subject matter hereof. If any provision of this License is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable. Any law or regulation which provides that the language of a contract shall be construed against the drafter shall not be used to construe this License against a Contributor.

10. Versions of the License
---------------------------

10.1. New Versions

Mozilla Foundation is the license steward. Except as provided in Section 10.3, no one other than the license steward has the right to modify or publish new versions of this License. Each version will be given a distinguishing version number.

10.2. Effect of New Versions

You may distribute the Covered Software under the terms of the version of the License under which You originally received the Covered Software, or under the terms of any subsequent version published by the license steward.

10.3. Modified Versions

If you create software not governed by this License, and you want to create a new license for such software, you may create and use a modified version of this License if you rename the license and remove any references to the name of the license steward (except to note that such modified license differs from this License).

10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses

If You choose to distribute Source Code Form that is Incompatible With Secondary Licenses under the terms of this version of the License, the notice described in Exhibit B of this License must be attached.

Exhibit A - Source Code Form License Notice
-------------------------------------------

This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.

If it is not possible or desirable to put the notice in a particular file, then You may include the notice in a location (such as a LICENSE file in a relevant directory) where a recipient would be likely to look for such a notice.

You may add additional accurate notices of copyright ownership.

Exhibit B - "Incompatible With Secondary Licenses" Notice
---------------------------------------------------------

This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
sc-membench-1.2.1/Makefile

# sc-membench Makefile
#
# Portable build system for Linux, macOS, and BSD
# Uses OpenMP for parallel bandwidth measurement
#
# Build options:
#   make        - Auto-detect features and build universal binary
#   make basic  - Build minimal version (no optional dependencies)
#   make hwloc  - Build with hwloc (portable cache detection)
#   make numa   - Build with NUMA support (Linux only)
#   make full   - Build with all available features
#   make clean  - Remove built files
#   make test   - Quick test run

# =============================================================================
# Platform and Compiler Detection
# =============================================================================

# Detect OS
UNAME_S := $(shell uname -s)

# Auto-detect compiler: prefer gcc, fall back to clang, then cc
CC ?= $(shell command -v gcc 2>/dev/null || command -v clang 2>/dev/null || echo cc)

# Base flags (portable across gcc/clang)
CFLAGS_BASE = -O3 -Wall -Wextra -std=c11

# OpenMP flag (same for gcc and clang)
OPENMP_FLAG = -fopenmp

# =============================================================================
# Source Files and Targets
# =============================================================================

SRC = membench.c
TARGET = membench
TARGET_BASIC = membench-basic
TARGET_HWLOC = membench-hwloc
TARGET_NUMA = membench-numa
TARGET_FULL = membench-full

# =============================================================================
# Platform-Specific Universal Optimization Flags
# =============================================================================

# Platform-specific adjustments with UNIVERSAL compatibility
ifeq ($(UNAME_S),Darwin)
    # macOS: packages typically in /opt/homebrew (ARM) or /usr/local (Intel)
    ARCH := $(shell uname -m)
    ifeq ($(ARCH),arm64)
        # ARM64 macOS: Use generic ARMv8-A (works on all Apple Silicon)
        CFLAGS_ARCH = -mcpu=generic
        CFLAGS_PATHS = -I/opt/homebrew/include
        LDFLAGS_PATHS = -L/opt/homebrew/lib
    else
        # x86_64 macOS: Use baseline x86-64 (works on all Intel Macs)
        CFLAGS_ARCH = -march=x86-64 -mtune=generic
        CFLAGS_PATHS = -I/usr/local/include
        LDFLAGS_PATHS = -L/usr/local/lib
    endif
    # macOS with clang needs libomp
    LDFLAGS_BASE = -lm
    # Check if using clang (needs -lomp for OpenMP)
    IS_CLANG := $(shell $(CC) --version 2>/dev/null | grep -q clang && echo yes)
    ifeq ($(IS_CLANG),yes)
        OPENMP_LIBS = -lomp
    else
        OPENMP_LIBS =
    endif
else ifeq ($(UNAME_S),FreeBSD)
    # FreeBSD: packages in /usr/local, use baseline x86-64
    CFLAGS_ARCH = -march=x86-64 -mtune=generic
    CFLAGS_PATHS = -I/usr/local/include
    LDFLAGS_PATHS = -L/usr/local/lib
    LDFLAGS_BASE = -lm
    OPENMP_LIBS =
else ifeq ($(UNAME_S),OpenBSD)
    CFLAGS_ARCH = -march=x86-64 -mtune=generic
    CFLAGS_PATHS = -I/usr/local/include
    LDFLAGS_PATHS = -L/usr/local/lib
    LDFLAGS_BASE = -lm
    OPENMP_LIBS =
else ifeq ($(UNAME_S),NetBSD)
    CFLAGS_ARCH = -march=x86-64 -mtune=generic
    CFLAGS_PATHS = -I/usr/local/include -I/usr/pkg/include
    LDFLAGS_PATHS = -L/usr/local/lib -L/usr/pkg/lib
    LDFLAGS_BASE = -lm
    OPENMP_LIBS =
else
    # Linux (default) - Use conservative, universally compatible flags
    ARCH := $(shell uname -m)
    ifeq ($(ARCH),aarch64)
        # ARM64: Use generic ARMv8-A with CRC (universally supported)
        # This works on all ARM64 CPUs from Cortex-A53 to Neoverse-V2
        CFLAGS_ARCH = -mcpu=generic+crc
    else ifeq ($(ARCH),x86_64)
        # x86_64: Use baseline x86-64 with SSE2 (universally supported since 2003)
        # This works on all x86_64 CPUs from Opteron/Pentium 4 to latest Xeon/EPYC
        CFLAGS_ARCH = -march=x86-64 -mtune=generic
    else
        # Other architectures: use generic optimization
        CFLAGS_ARCH = -mtune=generic
    endif
    CFLAGS_PATHS =
    LDFLAGS_PATHS =
    LDFLAGS_BASE = -lm
    OPENMP_LIBS =
endif

CFLAGS = $(CFLAGS_BASE) $(CFLAGS_ARCH) $(CFLAGS_PATHS) $(OPENMP_FLAG)
LDFLAGS = $(OPENMP_FLAG) $(LDFLAGS_BASE) $(LDFLAGS_PATHS) $(OPENMP_LIBS)

# =============================================================================
# Library Detection
# =============================================================================

# Check for hwloc (cross-platform)
HAVE_HWLOC := $(shell pkg-config --exists hwloc 2>/dev/null && echo yes || \
              (test -f /usr/include/hwloc.h && echo yes) || \
              (test -f /usr/local/include/hwloc.h && echo yes) || \
              (test -f /opt/homebrew/include/hwloc.h && echo yes))

# Check for libnuma (Linux only)
ifeq ($(UNAME_S),Linux)
    HAVE_NUMA := $(shell pkg-config --exists numa 2>/dev/null && echo yes || \
                 test -f /usr/include/numa.h && echo yes)
else
    HAVE_NUMA := no
endif

# Check for libhugetlbfs (Linux only)
ifeq ($(UNAME_S),Linux)
    HAVE_HUGETLBFS := $(shell pkg-config --exists hugetlbfs 2>/dev/null && echo yes || \
                      test -f /usr/include/hugetlbfs.h && echo yes)
else
    HAVE_HUGETLBFS := no
endif

# Auto-detect features and compile with all available
DETECTED_DEFS =
DETECTED_LIBS =
ifeq ($(HAVE_HUGETLBFS),yes)
    DETECTED_DEFS += -DHAVE_HUGETLBFS
    DETECTED_LIBS += -lhugetlbfs
endif
ifeq ($(HAVE_HWLOC),yes)
    DETECTED_DEFS += -DUSE_HWLOC
    DETECTED_LIBS += -lhwloc
endif
ifeq ($(HAVE_NUMA),yes)
    DETECTED_DEFS += -DUSE_NUMA
    DETECTED_LIBS += -lnuma
endif

# =============================================================================
# Build Targets
# =============================================================================

.PHONY: default all clean test basic hwloc numa full help info

# Default: auto-detect and build universal binary
default: $(TARGET)

$(TARGET): $(SRC)
	@echo "Building universal binary for $(UNAME_S) $(ARCH)..."
	@echo "  Compiler:     $(CC)"
	@echo "  Optimization: $(CFLAGS_ARCH) (universal compatibility)"
	@echo "  OpenMP:       enabled"
	@echo "  hwloc:        $(HAVE_HWLOC)"
	@echo "  numa:         $(HAVE_NUMA)"
	@echo "  hugetlbfs:    $(HAVE_HUGETLBFS)"
	$(CC) $(CFLAGS) $(DETECTED_DEFS) -o $@ $< $(LDFLAGS) $(DETECTED_LIBS)

# Basic: minimal build, no optional dependencies
basic: $(TARGET_BASIC)

$(TARGET_BASIC): $(SRC)
	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)

# Build with hwloc support (portable cache/topology detection)
hwloc: $(TARGET_HWLOC)

$(TARGET_HWLOC): $(SRC)
ifeq ($(HAVE_HWLOC),yes)
	$(CC) $(CFLAGS) -DUSE_HWLOC -o $@ $< $(LDFLAGS) -lhwloc
else
	@echo "Error: hwloc 2.x not found. Install with:"
	@echo "  Linux: apt install libhwloc-dev (or: yum install hwloc-devel)"
	@echo "  macOS: brew install hwloc"
	@echo "  BSD:   pkg install hwloc2"
	@exit 1
endif

# Build with NUMA support (Linux only)
numa: $(TARGET_NUMA)

$(TARGET_NUMA): $(SRC)
ifeq ($(UNAME_S),Linux)
ifeq ($(HAVE_NUMA),yes)
	$(CC) $(CFLAGS) -DUSE_NUMA -o $@ $< $(LDFLAGS) -lnuma
else
	@echo "Error: libnuma not found. Install with:"
Install with:" @echo " apt install libnuma-dev (or: yum install numactl-devel)" @exit 1 endif else @echo "Error: NUMA support is only available on Linux" @exit 1 endif # Build with all features (recommended for production Linux servers) full: $(TARGET_FULL) $(TARGET_FULL): $(SRC) ifeq ($(UNAME_S),Linux) $(CC) $(CFLAGS) -DUSE_HWLOC -DUSE_NUMA $(if $(filter yes,$(HAVE_HUGETLBFS)),-DHAVE_HUGETLBFS) \ -o $@ $< $(LDFLAGS) -lhwloc -lnuma $(if $(filter yes,$(HAVE_HUGETLBFS)),-lhugetlbfs) else @echo "Note: Building without NUMA (not available on $(UNAME_S))" $(CC) $(CFLAGS) -DUSE_HWLOC -o $@ $< $(LDFLAGS) -lhwloc endif # Build all versions that can be built on this platform all: $(TARGET) $(TARGET_BASIC) ifeq ($(HAVE_HWLOC),yes) $(MAKE) hwloc endif ifeq ($(HAVE_NUMA),yes) $(MAKE) numa endif ifeq ($(UNAME_S),Linux) ifeq ($(HAVE_HWLOC),yes) ifeq ($(HAVE_NUMA),yes) $(MAKE) full endif endif endif # Quick test (30 seconds) test: $(TARGET) ./$(TARGET) -v -t 30 clean: rm -f $(TARGET) $(TARGET_BASIC) $(TARGET_HWLOC) $(TARGET_NUMA) $(TARGET_FULL) # Install to /usr/local/bin install: $(TARGET) install -m 755 $(TARGET) /usr/local/bin/membench # Show detected configuration info: @echo "Platform Detection:" @echo " OS: $(UNAME_S)" @echo " Arch: $(ARCH)" @echo " Compiler: $(CC)" @echo " CFLAGS: $(CFLAGS)" @echo " LDFLAGS: $(LDFLAGS)" @echo "" @echo "Library Detection:" @echo " hwloc: $(HAVE_HWLOC)" @echo " numa: $(HAVE_NUMA)" @echo " hugetlbfs: $(HAVE_HUGETLBFS)" @echo "" @echo "Universal Optimization:" ifeq ($(ARCH),aarch64) @echo " ARM64: -mcpu=generic+crc (works on all ARM64 CPUs)" else ifeq ($(ARCH),x86_64) @echo " x86_64: -march=x86-64 (works on all x86_64 CPUs since 2003)" else @echo " Other: -mtune=generic" endif # Help target help: @echo "sc-membench - Universal Memory Benchmark (OpenMP)" @echo "" @echo "Build targets:" @echo " make - Auto-detect features and build universal binary" @echo " make basic - Minimal build (no optional dependencies)" @echo " make hwloc - With hwloc (portable cache detection)" @echo " make numa - With NUMA support (Linux only)" @echo " make full - With all features (hwloc + numa, Linux recommended)" @echo " make all - Build all available versions" @echo " make info - Show detected platform and libraries" @echo "" @echo "Universal Compatibility:" @echo " This build system uses conservative optimization flags that work" @echo " on ALL CPUs of the target architecture:" @echo " - ARM64: -mcpu=generic+crc (Cortex-A53 to Neoverse-V2)" @echo " - x86_64: -march=x86-64 (Opteron/P4 to latest Xeon/EPYC)" @echo " No illegal instruction errors, works in any Docker container." @echo "" @echo "OpenMP thread control (environment variables):" @echo " OMP_PROC_BIND=spread Distribute threads across NUMA nodes" @echo " OMP_PLACES=cores One thread per physical core" @echo " OMP_NUM_THREADS=N Override thread count" @echo "" @echo "Optional dependencies:" @echo " hwloc 2: Portable cache/topology detection (requires hwloc 2.x)" @echo " Linux: apt install libhwloc-dev" @echo " macOS: brew install hwloc libomp" @echo " BSD: pkg install hwloc2" @echo "" @echo " numa: NUMA-aware memory allocation (Linux only)" @echo " apt install libnuma-dev" @echo "" @echo " hugetlbfs: Better huge page detection (Linux only)" @echo " apt install libhugetlbfs-dev"sc-membench-1.2.1/README.md000066400000000000000000000744001513642006100151560ustar00rootroot00000000000000# sc-membench - Memory Bandwidth Benchmark A portable, multi-platform memory bandwidth benchmark designed for comprehensive system analysis. 
## Features

- **Multi-platform**: Works on x86, arm64, and other architectures
- **Multiple operations**: Measures read, write, copy bandwidth + memory latency
- **OpenMP parallelization**: Uses OpenMP for efficient multi-threaded bandwidth measurement
- **NUMA-aware**: Automatically handles NUMA systems with `proc_bind(spread)` thread placement
- **Cache-aware sizing**: Adaptive test sizes based on detected L1, L2, L3 cache hierarchy
- **Per-thread buffer model**: Like bw_mem, each thread gets its own buffer
- **Thread control**: Default uses all CPUs; optional auto-scaling to find optimal thread count
- **Latency measurement**: True memory latency using pointer chasing with statistical sampling
- **Statistically valid**: Latency reports median, stddev, and sample count (CV < 5%)
- **Best-of-N runs**: Bandwidth tests run multiple times, reports best result (like lmbench)
- **CSV output**: Machine-readable output for analysis

## Quick Start

```bash
# Compile
make

# Run with default settings (uses all CPUs, cache-aware sizes)
./membench

# Run with verbose output and 5 minute time limit
./membench -v -t 300

# Test specific buffer size (1MB per thread)
./membench -s 1024

# Compile with NUMA support (requires libnuma-dev)
make numa
./membench-numa -v
```

## Docker Usage

The easiest way to run sc-membench without building is using the pre-built Docker image:

```bash
# Run with default settings
docker run --rm ghcr.io/sparecores/membench:main

# Run with verbose output and time limit
docker run --rm ghcr.io/sparecores/membench:main -v -t 300

# Test specific buffer size
docker run --rm ghcr.io/sparecores/membench:main -s 1024

# Recommended: use --privileged and huge pages for best accuracy
docker run --rm --privileged ghcr.io/sparecores/membench:main -H -v

# Save output to file
docker run --rm --privileged ghcr.io/sparecores/membench:main -H > results.csv
```

**Notes:**

- The `--privileged` flag is recommended for optimal CPU pinning and NUMA support
- The `-H` flag enables huge pages automatically for large buffers (≥ 2× huge page size), no setup required

## Build Options

```bash
make        # Auto-detect features and build universal binary
make basic  # Minimal build (no optional dependencies)
make hwloc  # With hwloc 2 (recommended - portable cache detection)
make numa   # With NUMA support
make full   # With hwloc + NUMA (recommended for servers)
make all    # Build all versions
make clean  # Remove built files
make test   # Quick 30-second test run
```

### Recommended Build

For production use on servers, build with all features:

```bash
# Install dependencies first
sudo apt-get install libhugetlbfs-dev libhwloc-dev libnuma-dev  # Debian/Ubuntu
# or: sudo yum install libhugetlbfs-devel hwloc-devel numactl-devel  # RHEL/CentOS

# Build with full features
make full
./membench-full -v
```

## Usage

```
sc-membench - Memory Bandwidth Benchmark

Usage: ./membench [options]

Options:
  -h          Show help
  -v          Verbose output (use -vv for more detail)
  -s SIZE_KB  Test only this buffer size (in KB), e.g. -s 1024 for 1MB
  -r TRIES    Repeat each test N times, report best (default: 3)
  -f          Full sweep (test larger sizes up to memory limit)
  -p THREADS  Use exactly this many threads (default: num_cpus)
  -a          Auto-scaling: try different thread counts to find best
              (slower but finds optimal thread count per buffer size)
  -t SECONDS  Maximum runtime, 0 = unlimited (default: unlimited)
  -o OP       Run only this operation: read, write, copy, or latency
              Can be specified multiple times (default: all)
  -H          Enable huge pages for large buffers (>= 2x huge page size)
              Uses THP automatically, no setup required
  -R          Human-readable output with summary and benchmark scores
              (default: CSV output)
```
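Internally, repeated `-o` flags are folded into a small operation bitmask (bit 0 = read, bit 1 = write, bit 2 = copy, bit 3 = latency, mirroring `g_ops_mask` and `OP_MASK_ALL` in `membench.c`). A minimal sketch of that folding, with a hypothetical `parse_op()` helper that is not part of the real source:

```c
#include <stdio.h>
#include <string.h>

/* Hypothetical helper: map an operation name to its bitmask bit, 0 if unknown */
static int parse_op(const char *name) {
    const char *names[] = {"read", "write", "copy", "latency"};
    for (int i = 0; i < 4; i++)
        if (strcmp(name, names[i]) == 0)
            return 1 << i;
    return 0;
}

int main(void) {
    int ops_mask = 0;                              /* empty; 0x0F would mean "all" */
    const char *cli_ops[] = {"read", "latency"};   /* as if: -o read -o latency */
    for (int i = 0; i < 2; i++)
        ops_mask |= parse_op(cli_ops[i]);          /* OR each requested operation in */
    printf("ops_mask = 0x%02X\n", ops_mask);       /* prints 0x09: read + latency */
    return 0;
}
```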
## Output Format

CSV output to stdout with columns:

| Column | Description |
|--------|-------------|
| `size_kb` | **Per-thread** buffer size (KB) |
| `operation` | Operation type: `read`, `write`, `copy`, or `latency` |
| `bandwidth_mb_s` | Aggregate bandwidth across all threads (MB/s), 0 for latency |
| `latency_ns` | Median memory latency (nanoseconds), 0 for bandwidth tests |
| `latency_stddev_ns` | Standard deviation of latency samples (nanoseconds), 0 for bandwidth |
| `latency_samples` | Number of samples collected for latency measurement, 0 for bandwidth |
| `threads` | Thread count used |
| `iterations` | Number of iterations performed |
| `elapsed_s` | Elapsed time for the test (seconds) |

**Total memory used** = `size_kb × threads` (or `× 2` for copy which needs src + dst).

### Example Output

```csv
size_kb,operation,bandwidth_mb_s,latency_ns,latency_stddev_ns,latency_samples,threads,iterations,elapsed_s
32,read,9309701.64,0,0,0,96,292056,0.094113
32,write,9868845.93,0,0,0,96,578703,0.175918
32,latency,0,1.77,0.00,7,1,7,0.254053
128,read,6410473.70,0,0,0,96,83556,0.156412
128,write,9883443.78,0,0,0,96,177556,0.215580
128,latency,0,3.93,0.00,7,1,7,0.689736
512,latency,0,5.66,0.01,7,1,7,0.654846
1024,latency,0,7.38,0.04,7,1,7,0.671615
32768,latency,0,44.90,0.03,7,1,7,1.050579
131072,latency,0,96.78,3.00,7,1,7,8.520152
262144,latency,0,122.22,0.90,7,1,7,21.756578
```

In this example ([Azure D96pls_v6](https://sparecores.com/server/azure/Standard_D96pls_v6) with 96 ARM cores, 64KB L1, 1MB L2, 128MB L3):

- **32KB**: Fits in L1 → very high bandwidth (~9.3 TB/s read), low latency (~1.8ns, stddev 0.00)
- **512KB**: Fits in L2 → good latency (~5.7ns, stddev 0.01)
- **32MB**: In L3 → moderate latency (~45ns, stddev 0.03)
- **128MB**: At L3 boundary → RAM latency visible (~97ns, stddev 3.0)
- **256MB**: Past L3 → pure RAM latency (~122ns, stddev 0.9)

## Human-Readable Output (`-R`)

Use `-R` for a formatted table with summary statistics and benchmark scores instead of CSV:

```bash
./membench -R
```

### Example Output

```
Size     Op       Bandwidth     Latency   Threads
----     --       ---------     -------   -------
32 KB    read     2.6 TB/s      -         32
32 KB    write    1.6 TB/s      -         32
32 KB    copy     464.4 GB/s    -         32
32 KB    latency  -             0.9 ns    1
128 KB   read     1.7 TB/s      -         32
128 KB   write    691.7 GB/s    -         32
128 KB   copy     495.5 GB/s    -         32
128 KB   latency  -             2.4 ns    1
...

================================================================================
BENCHMARK SUMMARY
================================================================================

BANDWIDTH (MB/s):
  Operation    Peak         Weighted Avg
  ---------    ----         ------------
  Read         2612561      1680432
  Write        1605601      850445
  Copy         495476       372027

LATENCY:
  Best latency: 97.2 ns (RAM) at 131072 KB buffer

--------------------------------------------------------------------------------
BENCHMARK SCORE (higher is better):

  Bandwidth Score:   1571.2   (avg peak bandwidth in GB/s)
  Latency Score:     10.3     (1000 / latency_ns)

  >> COMBINED SCORE: 4024     (sqrt(bw_score × latency_score) × 100)
--------------------------------------------------------------------------------
```

### Summary Statistics

| Metric | Description |
|--------|-------------|
| **Peak** | Highest bandwidth achieved across all buffer sizes |
| **Weighted Avg** | Average weighted by log₂(size) — larger buffers count more |
| **Best latency** | Latency at the largest buffer size tested (closest to true RAM latency) |
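The weighted average can be reproduced from the CSV output. A small sketch under the assumption that the weight is `log2(size_kb)` as the table describes (the bandwidth numbers below are made up; compile with `-lm`):

```c
#include <math.h>
#include <stdio.h>

/* Each measurement is weighted by log2 of its buffer size, so larger
 * buffers (closer to RAM behavior) count more toward the average. */
int main(void) {
    double size_kb[]   = {32, 1024, 262144};                 /* example sizes */
    double read_mb_s[] = {9300000.0, 2600000.0, 98000.0};    /* hypothetical */
    double weight_sum = 0.0, weighted_sum = 0.0, peak = 0.0;

    for (int i = 0; i < 3; i++) {
        double w = log2(size_kb[i]);       /* weight grows slowly with size */
        weight_sum   += w;
        weighted_sum += w * read_mb_s[i];
        if (read_mb_s[i] > peak)
            peak = read_mb_s[i];           /* peak = best across all sizes */
    }
    printf("peak = %.0f MB/s, weighted avg = %.0f MB/s\n",
           peak, weighted_sum / weight_sum);
    return 0;
}
```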
### Benchmark Scores

The summary includes scores for easy comparison between systems:

| Score | Formula | Description |
|-------|---------|-------------|
| **Bandwidth Score** | `avg(peak_read, peak_write, peak_copy) / 1000` | Average peak bandwidth in GB/s |
| **Latency Score** | `1000 / latency_ns` | Inverse of RAM latency (higher = faster) |
| **Combined Score** | `sqrt(bw_score × latency_score) × 100` | Geometric mean of both (balanced) |

The **Combined Score** uses a geometric mean so that neither bandwidth nor latency dominates — both contribute equally to the final score.
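The same formulas in code form, using illustrative inputs rather than real measurements (compile with `-lm`):

```c
#include <math.h>
#include <stdio.h>

/* The three scores from the table above, computed from peak bandwidths
 * (MB/s) and RAM latency (ns). The input values here are invented. */
int main(void) {
    double peak_read = 400000, peak_write = 300000, peak_copy = 200000;
    double latency_ns = 100.0;

    double bw_score  = (peak_read + peak_write + peak_copy) / 3.0 / 1000.0; /* GB/s */
    double lat_score = 1000.0 / latency_ns;
    double combined  = sqrt(bw_score * lat_score) * 100.0;  /* geometric mean */

    printf("bandwidth %.1f, latency %.1f, combined %.0f\n",
           bw_score, lat_score, combined);
    return 0;
}
```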
### Score Comparability Warning

When using options that affect test coverage, a warning is displayed:

```
WARNING: Scores may not be comparable due to non-default options:
  - Time limit (-t 60) may have prevented testing larger buffer sizes
  - Fixed thread count (-p 4) instead of using all CPUs (32)
  For comparable scores, run without -t, -p, or -s options.
```

**For comparable benchmark scores**, run without `-t`, `-p`, or `-s` options to ensure:

- All buffer sizes are tested (including large RAM-sized buffers)
- All CPUs are utilized (maximum bandwidth)
- Full cache hierarchy is exercised

## Operations Explained

### Read (`read`)

Reads all 64-bit words from the buffer using XOR (faster than addition, no carry chains). This measures pure read bandwidth.

```c
checksum ^= buffer[i];  // For all elements, using 8 independent accumulators
```

### Write (`write`)

Writes a pattern to all 64-bit words in the buffer. This measures pure write bandwidth.

```c
buffer[i] = pattern;  // For all elements
```

### Copy (`copy`)

Copies data from source to destination buffer. Reports bandwidth as `buffer_size / time` (matching lmbench's approach), not `2 × buffer_size / time`.

```c
dst[i] = src[i];  // For all elements
```

**Note:** Copy bandwidth is typically lower than read or write alone because it performs both operations. The reported bandwidth represents the buffer size traversed, not total bytes moved (read + write).

### Latency (`latency`)

Measures true memory access latency using **pointer chasing** with a linked list traversal approach inspired by [ram_bench](https://github.com/emilk/ram_bench) by Emil Ernerfeldt. Each memory access depends on the previous one, preventing CPU pipelining and prefetching.

```c
// Node structure (16 bytes) - realistic for linked list traversal
struct Node {
    uint64_t payload;       // Dummy data for realistic cache behavior
    struct Node *next;      // Pointer to next node
};

// Each load depends on previous (can't be optimized away)
node = node->next;  // Address comes from previous load
```

The buffer is initialized as a contiguous array of nodes linked in **randomized order** to defeat hardware prefetchers. This measures:

- L1/L2/L3 cache hit latency at small sizes
- DRAM access latency at large sizes
- True memory latency without pipelining effects

**Statistical validity**: The latency measurement collects **multiple independent samples** (7-21) and reports the **median** (robust to outliers) along with standard deviation. Sampling continues until coefficient of variation < 5% or maximum samples reached.

**CPU and NUMA pinning**: The latency test pins to CPU 0 and allocates memory on the local NUMA node (when compiled with NUMA support) for consistent, reproducible results.

Results are reported in **nanoseconds per access** with statistical measures:

- `latency_ns`: Median latency (robust central tendency)
- `latency_stddev_ns`: Standard deviation (measurement precision indicator)
- `latency_samples`: Number of samples collected (statistical effort)

**Large L3 cache support**: The latency test uses buffers up to 2GB (or 25% of RAM) to correctly measure DRAM latency even on processors with huge L3 caches like the AMD EPYC 9684X (1.1GB L3 with 3D V-Cache).
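A self-contained sketch of this technique: build a randomized chain with a Fisher-Yates shuffle, then time the dependent traversal. This is a simplification; the real test adds warmup runs, multiple samples, CPU pinning, and huge page support.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

typedef struct Node { uint64_t payload; struct Node *next; } Node;

int main(void) {
    size_t n = 1 << 20;                          /* 1M nodes x 16 B = 16 MB */
    Node *nodes = malloc(n * sizeof(Node));
    size_t *perm = malloc(n * sizeof(size_t));
    if (!nodes || !perm) return 1;

    for (size_t i = 0; i < n; i++) perm[i] = i;
    for (size_t i = n - 1; i > 0; i--) {         /* Fisher-Yates shuffle */
        size_t j = (size_t)rand() % (i + 1);
        size_t t = perm[i]; perm[i] = perm[j]; perm[j] = t;
    }
    for (size_t i = 0; i < n; i++)               /* link nodes in shuffled order */
        nodes[perm[i]].next = &nodes[perm[(i + 1) % n]];

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    Node *p = &nodes[perm[0]];
    for (size_t i = 0; i < n; i++) p = p->next;  /* dependent loads, no overlap */
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
    printf("end=%p, avg %.2f ns/access\n", (void *)p, ns / n);
    free(perm);
    free(nodes);
    return 0;
}
```

Printing the final pointer keeps the compiler from eliding the traversal, and linking through a random permutation guarantees every node is visited exactly once per pass.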
## Memory Sizes Tested

The benchmark tests **per-thread buffer sizes** at cache transition points, automatically adapting to the detected cache hierarchy:

### Adaptive Cache-Aware Sizes

Based on detected L1, L2, L3 cache sizes (typically 10 sizes):

| Size | Purpose |
|------|---------|
| L1/2 | Pure L1 cache performance (e.g., 32KB for 64KB L1) |
| 2×L1 | L1→L2 transition |
| L2/2 | Mid L2 cache performance |
| L2 | L2 cache boundary |
| 2×L2 | L2→L3 transition |
| L3/4 | Mid L3 cache (for large L3 caches) |
| L3/2 | Late L3 cache |
| L3 | L3→RAM boundary |
| 2×L3 | Past L3, hitting RAM |
| 4×L3 | Deep into RAM |

With `-f` (full sweep), additional larger sizes are tested up to the memory limit.

### Cache Detection

With hwloc 2 (recommended), cache sizes are detected automatically on any platform. Without hwloc, the benchmark uses sysctl (macOS/BSD) or parses `/sys/devices/system/cpu/*/cache/` (Linux). If cache detection fails, sensible defaults are used (32KB L1, 256KB L2, 8MB L3).
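A minimal sketch of the hwloc 2.x path (compile and link with `-lhwloc`), assuming the standard hwloc 2 object API; the detection code in `membench.c` differs in detail but uses the same family of calls:

```c
#include <hwloc.h>
#include <stdio.h>

/* Query the first L1d/L2/L3 cache object and print its size. */
int main(void) {
    hwloc_topology_t topo;
    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    hwloc_obj_type_t levels[] = {HWLOC_OBJ_L1CACHE, HWLOC_OBJ_L2CACHE,
                                 HWLOC_OBJ_L3CACHE};
    for (int i = 0; i < 3; i++) {
        hwloc_obj_t obj = hwloc_get_obj_by_type(topo, levels[i], 0);
        if (obj && obj->attr)   /* missing levels simply stay unreported */
            printf("L%d cache: %llu KB\n", i + 1,
                   (unsigned long long)(obj->attr->cache.size / 1024));
    }
    hwloc_topology_destroy(topo);
    return 0;
}
```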
## Thread Model (Per-Thread Buffers)

Like bw_mem, each thread gets its **own private buffer**:

```
Example for 1MB buffer size with 4 threads (read/write):
  Thread 0: 1MB buffer
  Thread 1: 1MB buffer
  Thread 2: 1MB buffer
  Thread 3: 1MB buffer
  Total memory: 4MB

Example for 1MB buffer size with 4 threads (copy):
  Thread 0: 1MB src + 1MB dst = 2MB
  Thread 1: 1MB src + 1MB dst = 2MB
  ...
  Total memory: 8MB
```

### Thread Modes

| Mode | Flag | Behavior |
|------|------|----------|
| **Default** | (none) | Use `num_cpus` threads |
| **Explicit** | `-p N` | Use exactly N threads |
| **Auto-scaling** | `-a` | Try 1, 2, 4, ..., num_cpus threads, report best |

### OpenMP Thread Affinity

You can fine-tune thread placement using OpenMP environment variables:

```bash
# Spread threads across NUMA nodes (default behavior)
OMP_PROC_BIND=spread OMP_PLACES=cores ./membench

# Bind threads close together (may reduce bandwidth on multi-socket)
OMP_PROC_BIND=close OMP_PLACES=cores ./membench

# Override thread count via environment
OMP_NUM_THREADS=8 ./membench
```

| Variable | Values | Effect |
|----------|--------|--------|
| `OMP_PROC_BIND` | `spread`, `close`, `master` | Thread distribution strategy |
| `OMP_PLACES` | `cores`, `threads`, `sockets` | Placement units |
| `OMP_NUM_THREADS` | Integer | Override thread count |

The default `proc_bind(spread)` in the code distributes threads evenly across NUMA nodes for maximum memory bandwidth.

### What the Benchmark Measures

- **Aggregate bandwidth**: Sum of all threads' bandwidth
- **Per-thread buffer**: Each thread works on its own memory region
- **No sharing**: Threads don't contend for the same cache lines

### Interpreting Results

- `size_kb` = buffer size per thread
- `threads` = number of threads used
- `bandwidth_mb_s` = total system bandwidth (all threads combined)
- Total memory = `size_kb × threads` (×2 for copy)
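Putting the pieces together, a hypothetical simplification of the per-thread-buffer read measurement; the real kernel uses 8 accumulators, 32-way unrolling, NUMA-local allocation, and best-of-N timing, none of which are shown here:

```c
#include <omp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    size_t words = (64UL * 1024 * 1024) / sizeof(uint64_t); /* 64 MB per thread */
    uint64_t sink = 0;
    long nthreads = 0;
    double t0 = omp_get_wtime();

    /* Every thread owns its buffer; spread placement balances NUMA nodes. */
    #pragma omp parallel proc_bind(spread) reduction(^:sink) reduction(+:nthreads)
    {
        nthreads = 1;                                  /* each private copy = 1 */
        uint64_t *buf = malloc(words * sizeof(uint64_t));
        for (size_t i = 0; i < words; i++) buf[i] = i; /* touch pages */
        uint64_t a = 0, b = 0;                         /* independent accumulators */
        for (size_t i = 0; i < words; i += 2) {
            a ^= buf[i];
            b ^= buf[i + 1];
        }
        sink ^= a ^ b;                                 /* keep reads observable */
        free(buf);
    }

    double secs = omp_get_wtime() - t0;
    double mb = (double)nthreads * words * sizeof(uint64_t) / 1e6;
    printf("sink=%llu, ~%.0f MB/s aggregate (timing includes init)\n",
           (unsigned long long)sink, mb / secs);
    return 0;
}
```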
## NUMA Support

When compiled with `-DUSE_NUMA` and linked with `-lnuma`:

- Detects NUMA topology automatically
- Maps CPUs to their NUMA nodes
- Load-balances threads across NUMA nodes
- Binds each thread's memory to its local node
- Works transparently on UMA (single-node) systems

### NUMA Load Balancing

On multi-socket systems, OpenMP's `proc_bind(spread)` distributes threads **evenly across NUMA nodes** to ensure balanced utilization of all memory controllers.

**Example: 128 threads on a 2-node system (96 CPUs per node):**

```
Without spread (may cluster):           With proc_bind(spread):
  Thread 0-95   → Node 0 (96 threads)     Threads spread evenly across nodes
  Thread 96-127 → Node 1 (32 threads)     ~64 threads per node
  Result: Node 0 overloaded!              Result: Balanced utilization!
```

**Impact:**

- Higher bandwidth with balanced distribution
- More accurate measurement of total system memory bandwidth
- Exercises all memory controllers evenly

### NUMA-Local Memory

Each thread allocates its buffer directly on its local NUMA node using `numa_alloc_onnode()`:

```c
// Inside OpenMP parallel region with proc_bind(spread)
int cpu = sched_getcpu();
int node = numa_node_of_cpu(cpu);
buffer = numa_alloc_onnode(size, node);
```

This ensures:

- Memory is allocated on the same node as the accessing CPU
- No cross-node memory access penalties
- No memory migrations during the benchmark

### Verbose Output

Use `-v` to see the detected NUMA topology:

```
NUMA: 2 nodes detected (libnuma enabled)
NUMA topology:
  Node 0: 96 CPUs (first: 0, last: 95)
  Node 1: 96 CPUs (first: 96, last: 191)
```

## Huge Pages Support

Use `-H` to enable huge pages (2MB instead of 4KB). This reduces TLB (Translation Lookaside Buffer) pressure, which is especially beneficial for:

- **Large buffer tests**: A 2GB buffer needs 512K page table entries with 4KB pages, but only 1024 with 2MB huge pages
- **Latency tests**: Random pointer-chasing access patterns cause many TLB misses with small pages
- **Accurate measurements**: TLB overhead can distort results, making memory appear slower than it is

### Automatic and smart

The `-H` option is designed to "just work":

1. **Automatic threshold**: Huge pages are only used for buffers ≥ 2× huge page size (typically 4MB on systems with 2MB huge pages). The huge page size is detected dynamically via `libhugetlbfs`. Smaller buffers use regular pages automatically (no wasted memory, no user intervention needed).
2. **No setup required**: The benchmark uses **Transparent Huge Pages (THP)** via `madvise(MADV_HUGEPAGE)`, which is handled automatically by the Linux kernel. No root access or pre-allocation needed.
3. **Graceful fallback**: If THP isn't available, the benchmark falls back to regular pages transparently.

### How it works

When `-H` is enabled and buffer size ≥ threshold (2× huge page size):

1. **First tries explicit huge pages** (`MAP_HUGETLB`) for deterministic huge pages
2. **Falls back to THP** (`madvise(MADV_HUGEPAGE)`) which works without pre-configuration
3. **Falls back to regular pages** if neither is available
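A sketch of that fallback chain on Linux, using the standard `mmap`/`madvise` interfaces; the real allocator also handles size rounding and records which method succeeded, which is not shown here:

```c
#include <stdio.h>
#include <sys/mman.h>

/* Three-step allocation: explicit huge pages, then THP advice, then plain pages. */
static void *alloc_buffer(size_t size) {
    void *p = MAP_FAILED;
#ifdef MAP_HUGETLB
    /* 1) Explicit huge pages (deterministic, needs a pre-allocated pool) */
    p = mmap(NULL, size, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
#endif
    if (p != MAP_FAILED) return p;

    /* 2) Regular anonymous mapping... */
    p = mmap(NULL, size, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) return NULL;
#ifdef MADV_HUGEPAGE
    /* ...with a THP hint; 3) if THP is off, the kernel just ignores it */
    madvise(p, size, MADV_HUGEPAGE);
#endif
    return p;
}

int main(void) {
    size_t size = 64UL * 1024 * 1024;   /* well above the 2x-huge-page threshold */
    void *buf = alloc_buffer(size);
    printf("allocated %zu MB at %p\n", size >> 20, buf);
    if (buf) munmap(buf, size);
    return 0;
}
```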
### Optional: Pre-allocating explicit huge pages

For the most deterministic results, you can pre-allocate explicit huge pages:

```bash
# Check current huge page status
grep Huge /proc/meminfo

# Calculate huge pages needed for BANDWIDTH tests (read/write/copy):
#   threads × buffer_size × 2 (for copy: src+dst) / 2MB
#
# Examples:
#   8 CPUs,   256 MiB buffer:   8 × 256 × 2 / 2 =  2,048 pages (4 GB)
#   64 CPUs,  256 MiB buffer:  64 × 256 × 2 / 2 = 16,384 pages (32 GB)
#   192 CPUs, 256 MiB buffer: 192 × 256 × 2 / 2 = 49,152 pages (96 GB)
#
# LATENCY tests run single-threaded, so need much less:
#   256 MiB buffer: 256 / 2 = 128 pages (256 MB)

# Allocate huge pages (requires root) - adjust for your system
echo 49152 | sudo tee /proc/sys/vm/nr_hugepages

# Run with huge pages (will use explicit huge pages if available)
./membench -H -v
```

However, this is **optional** - THP works well for most use cases without any setup, and doesn't require pre-allocation. If explicit huge pages run out, the benchmark automatically falls back to THP.

### Usage recommendation

Just add `-H` to your command line - the benchmark handles everything automatically:

```bash
# Recommended for production benchmarking
./membench -H

# With verbose output to see what's happening
./membench -H -v
```

The benchmark will use huge pages only where they help (large buffers) and regular pages where they don't (small buffers).

### Why latency improves more than bandwidth

You may notice that `-H` dramatically improves latency measurements (often 20-40% lower) while bandwidth stays roughly the same. This is expected:

**Latency tests** use pointer chasing - random jumps through memory. Each access requires address translation via the TLB (Translation Lookaside Buffer):

| Buffer Size | 4KB pages | 2MB huge pages |
|-------------|-----------|----------------|
| 128 MB | 32,768 pages | 64 pages |
| TLB fit? | No (TLB ~1000-2000 entries) | Yes |
| TLB misses | Frequent | Rare |

With 4KB pages on a 128MB buffer:

- 32,768 pages can't fit in the TLB
- Random pointer chasing causes frequent TLB misses
- Each TLB miss adds **10-20+ CPU cycles** (page table walk)
- Measured latency = true memory latency + TLB overhead

With 2MB huge pages:

- Only 64 pages easily fit in the TLB
- Almost no TLB misses
- Measured latency ≈ **true memory latency**

### Real-world benchmark results

#### Azure D96pls_v6 (ARM)

Measured on [**Azure D96pls_v6**](https://sparecores.com/server/azure/Standard_D96pls_v6) (96 ARM Neoverse-N2 cores, 2 NUMA nodes, L1d=64KB/core, L2=1MB/core, L3=128MB shared):

| Buffer | No Huge Pages | With THP (-H) | Improvement |
|--------|---------------|---------------|-------------|
| 32 KB | 1.77 ns | 1.77 ns | HP not used (< 4MB) |
| 128 KB | 3.95 ns | 3.95 ns | HP not used (< 4MB) |
| 512 KB | 5.99 ns | 5.98 ns | HP not used (< 4MB) |
| 1 MB | 11.52 ns | 10.92 ns | HP not used (< 4MB) |
| 2 MB | 24.27 ns | 24.65 ns | HP not used (< 4MB) |
| **32 MB** | 44.90 ns | **36.23 ns** | **-19%** |
| **64 MB** | 49.40 ns | **40.77 ns** | **-17%** |
| **128 MB** | 92.50 ns | **78.32 ns** | **-15%** |
| **256 MB** | 121.92 ns | **107.65 ns** | **-12%** |
| **512 MB** | 140.97 ns | **118.74 ns** | **-16%** |

#### AWS c8a.metal-48xl (AMD)

Measured on [**AWS c8a.metal-48xl**](https://sparecores.com/server/aws/c8a.metal-48xl) (192 AMD EPYC 9R45 cores, 2 NUMA nodes, L1d=48KB/core, L2=1MB/core, L3=32MB/die):

| Buffer | No Huge Pages | With THP (-H) | Improvement |
|--------|---------------|---------------|-------------|
| 32 KB | 0.89 ns | 0.89 ns | HP not used (< 4MB) |
| 128 KB | 2.43 ns | 2.45 ns | HP not used (< 4MB) |
| 512 KB | 3.32 ns | 3.35 ns | HP not used (< 4MB) |
| 1 MB | 5.47 ns | 4.09 ns | HP not used (< 4MB) |
| 2 MB | 8.85 ns | 8.85 ns | HP not used (< 4MB) |
| **8 MB** | 11.72 ns | **10.32 ns** | **-12%** |
| **16 MB** | 12.58 ns | **10.74 ns** | **-15%** |
| **32 MB** | **30.83 ns** | **11.29 ns** | **-63%** |
| **64 MB** | 84.81 ns | **75.25 ns** | **-11%** |
| **128 MB** | 117.75 ns | **105.45 ns** | **-10%** |

**Key observations:**

- **Small buffers (≤ 2MB)**: No significant difference — TLB can handle the page count
- **L3 boundary effect**: AMD shows **63% improvement at 32MB** (exactly at L3 size) — without huge pages, TLB misses make L3 appear like RAM!
- **L3 region**: 12-19% improvement with huge pages
- **RAM region**: 10-16% lower latency with huge pages
- **THP works automatically**: No pre-allocation needed, just use `-H`

**Bottom line**: Use `-H` for accurate latency measurements on large buffers. Without huge pages, TLB overhead can severely distort results, especially at cache boundaries.

**Bandwidth tests** don't improve as much because:

- Sequential access has better TLB locality (same pages accessed repeatedly)
- Hardware prefetchers hide TLB miss latency
- The memory bus is already saturated

## Consistent Results

Achieving consistent benchmark results on modern multi-core systems requires careful handling of:

### Thread Pinning

Threads are distributed across CPUs using OpenMP's `proc_bind(spread)` clause, which spreads threads evenly across NUMA nodes and physical cores. This prevents the OS scheduler from migrating threads between cores, which causes huge variability.

### NUMA-Aware Memory

On NUMA systems, each thread allocates memory directly on its local NUMA node using `numa_alloc_onnode()`. OpenMP's `proc_bind(spread)` ensures threads are distributed across NUMA nodes, then each thread allocates locally. This ensures:

- Memory is close to where it will be accessed
- No cross-node memory access penalties
- No memory migrations during the benchmark

### Bandwidth: Best-of-N Runs

Like lmbench (TRIES=11), each bandwidth test configuration runs multiple times and reports the best result:

1. First run is a warmup (discarded) to stabilize CPU frequency
2. Each configuration is then tested 3 times (configurable with `-r`)
3. Highest bandwidth is reported (best shows true hardware capability)

### Latency: Statistical Sampling

Latency measurements use a different approach optimized for statistical validity:

1. Thread is pinned to CPU 0 with NUMA-local memory
2. Multiple independent samples (7-21) are collected per measurement
3. Sampling continues until coefficient of variation < 5% or max samples reached
4. **Median** latency is reported (robust to outliers)
5. Standard deviation and sample count are included for validation
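A sketch of that sampling loop, with a placeholder `measure_once()` standing in for one full pointer-chase run (compile with `-lm`):

```c
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

/* Placeholder: one latency sample in ns; the real version times a chase. */
static double measure_once(void) { return 100.0 + rand() % 5; }

static int cmp(const void *a, const void *b) {
    double d = *(const double *)a - *(const double *)b;
    return (d > 0) - (d < 0);
}

int main(void) {
    enum { MIN_SAMPLES = 7, MAX_SAMPLES = 21 };
    double s[MAX_SAMPLES], mean = 0, stddev = 0;
    int n = 0;

    while (n < MAX_SAMPLES) {
        s[n++] = measure_once();
        if (n < MIN_SAMPLES) continue;          /* always take at least 7 */
        double sum = 0, sq = 0;
        for (int i = 0; i < n; i++) { sum += s[i]; sq += s[i] * s[i]; }
        mean = sum / n;
        double var = sq / n - mean * mean;
        stddev = sqrt(var > 0 ? var : 0);
        if (stddev / mean < 0.05) break;        /* CV < 5%: converged */
    }
    qsort(s, n, sizeof(double), cmp);           /* median is robust to outliers */
    printf("median %.2f ns, stddev %.2f ns, %d samples\n", s[n / 2], stddev, n);
    return 0;
}
```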
### Result

With these optimizations, benchmark variability is typically **<1%** (compared to 30-60% without them).

### Configuration

```bash
./membench -r 5   # Run each test 5 times instead of 3
./membench -r 1   # Single run (fastest, still consistent due to pinning)
./membench -p 16  # Use exactly 16 threads
./membench -a     # Auto-scale to find optimal thread count
```

## Comparison with lmbench

### Bandwidth (bw_mem)

| Aspect | sc-membench | lmbench bw_mem |
|--------|-------------|----------------|
| **Parallelism model** | OpenMP threads | Processes (fork) |
| **Buffer allocation** | Each thread has own buffer | Each process has own buffer |
| **Size reporting** | Per-thread buffer size | Per-process buffer size |
| **Read operation** | Reads 100% of data | `rd` reads 25% (strided) |
| **Copy reporting** | Buffer size / time | Buffer size / time |
| **Huge pages** | Built-in (`-H` flag) | Not supported (uses `valloc`) |
| **Operation selection** | `-o read/write/copy/latency` | Separate invocations per operation |
| **Output format** | CSV (stdout) | Text to stderr |
| **Full vs strided read** | Always 100% (`read`) | `rd` (25% strided) or `frd` (100%) |

**Key differences:**

1. **Size meaning**: Both report per-worker buffer size (comparable)
2. **Read operation**: bw_mem `rd` uses a 16-byte stride (reads 25% of data at indices 0,4,8...124 per 512-byte chunk), reporting ~4x higher apparent bandwidth. Use `frd` for full read. sc-membench always reads 100%.
3. **Thread control**: sc-membench defaults to num_cpus threads; use `-a` for auto-scaling or `-p N` for explicit count
4. **Huge pages**: sc-membench has built-in support (`-H`) with automatic THP fallback; lmbench has no huge page support
5. **Workflow**: sc-membench runs all tests in one invocation; bw_mem requires separate runs per operation (`bw_mem 64m rd`, `bw_mem 64m wr`, etc.)

### Latency (lat_mem_rd)

sc-membench's `latency` operation is comparable to lmbench's `lat_mem_rd`:

| Aspect | sc-membench latency | lmbench lat_mem_rd |
|--------|---------------------|-------------------|
| **Method** | Pointer chasing (linked list) | Pointer chasing (array) |
| **Node structure** | 16 bytes (payload + pointer) | 8 bytes (pointer only) |
| **Pointer order** | Randomized (defeats prefetching) | Fixed backward stride (may be prefetched) |
| **Stride** | Random (visits all elements) | Configurable (default 64 bytes on 64-bit) |
| **Statistical validity** | Multiple samples, reports median + stddev | Single measurement |
| **CPU/NUMA pinning** | Pins to CPU 0, NUMA-local memory | No pinning |
| **Output** | Median nanoseconds + stddev + sample count | Nanoseconds |
| **Huge pages** | Built-in (`-H` flag) | Not supported |

Both measure memory latency using dependent loads that prevent pipelining. **Key differences**:

1. **Prefetching vulnerability**: lat_mem_rd uses a fixed backward stride, which modern CPUs may prefetch (the man page acknowledges: "vulnerable to smart, stride-sensitive cache prefetching policies"). sc-membench's randomized pointer chain defeats all prefetching, measuring true random-access latency.
2. **Statistical validity**: sc-membench collects 7-21 samples per measurement, reports median (robust to outliers) and standard deviation, and continues until coefficient of variation < 5%. This provides confidence in the results.
3. **Reproducibility**: CPU pinning and NUMA-local memory allocation eliminate variability from thread migration and remote memory access.

**Huge pages advantage**: With `-H`, sc-membench automatically uses huge pages for large buffers, eliminating TLB overhead that can inflate latency by 20-40% (see [benchmark results](#real-world-benchmark-results)).
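For contrast, the lat_mem_rd-style access pattern looks roughly like the sketch below (an approximation, not lmbench's actual source): pointers laid out at a fixed backward stride, traversed with a dependent `p = (char **)*p` load. This is exactly the regularity a stride-sensitive prefetcher can learn, which is why sc-membench randomizes its chain instead.

```c
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    size_t range = 8 * 1024 * 1024, stride = 64;   /* 8 MB arena, 64 B stride */
    char **arena = malloc(range);
    if (!arena) return 1;

    /* Each stride-sized slot points to the slot before it (wrap at the top),
     * so the traversal walks backward through memory at a fixed stride. */
    for (size_t i = 0; i < range; i += stride) {
        size_t prev = (i == 0) ? range - stride : i - stride;
        *(char **)((char *)arena + i) = (char *)arena + prev;
    }

    char **p = (char **)arena;
    for (long i = 0; i < 10000000; i++)
        p = (char **)*p;         /* dependent load: next address from memory */

    printf("%p\n", (void *)p);   /* keep the chain from being optimized out */
    free(arena);
    return 0;
}
```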
## Interpreting Results

### Cache Effects

Look for bandwidth drops and latency increases as buffer sizes exceed cache levels:

- Dramatic change at L1 boundary (32-64KB per thread typically)
- Another change at L2 boundary (256KB-1MB per thread typically)
- Final change when total memory exceeds L3 (depends on thread count)

### Thread Configuration

- By default, all CPUs are used for maximum aggregate bandwidth
- Use `-p N` to test with a specific thread count
- Use `-a` to find optimal thread count (slower but thorough)
- Latency test: Always uses 1 thread (measures true access latency)

### Bandwidth Values

Typical modern systems:

- L1 cache: 200-500 GB/s (varies with frequency)
- L2 cache: 100-200 GB/s
- L3 cache: 50-100 GB/s
- Main memory: 20-100 GB/s (DDR4/DDR5, depends on channels)

### Latency Values

Typical modern systems:

- L1 cache: 1-2 ns
- L2 cache: 3-10 ns
- L3 cache: 10-40 ns (larger/3D V-Cache may be higher)
- Main memory: 25-50 ns (fast DDR5) to 60-120 ns (DDR4)

## Dependencies

### Build Requirements

- **Required**: C11 compiler with OpenMP support (gcc or clang)
- **Recommended**: hwloc 2.x for portable cache topology detection
- **Optional**: libnuma for NUMA support (Linux only)
- **Optional**: libhugetlbfs for huge page size detection (Linux only)

### Runtime Requirements

- **Required**: OpenMP runtime library (`libgomp1` on Debian/Ubuntu, `libgomp` on RHEL)
- **Optional**: libhwloc, libnuma, libhugetlbfs (same as build dependencies)

### Installing Dependencies

```bash
# Debian/Ubuntu - Build
apt-get install build-essential libhwloc-dev libnuma-dev libhugetlbfs-dev

# Debian/Ubuntu - Runtime only (e.g., Docker images)
apt-get install libgomp1 libhwloc15 libnuma1 libhugetlbfs-dev

# RHEL/CentOS/Fedora - Build
yum install gcc make hwloc-devel numactl-devel libhugetlbfs-devel

# RHEL/CentOS/Fedora - Runtime only
yum install libgomp hwloc-libs numactl-libs libhugetlbfs

# macOS (hwloc only, no NUMA)
brew install hwloc libomp
xcode-select --install

# FreeBSD (hwloc 2 required, not hwloc 1)
pkg install gmake hwloc2
```

### What Each Dependency Provides

| Library | Purpose | Platforms | Build/Runtime |
|---------|---------|-----------|---------------|
| **libgomp** | OpenMP runtime (parallel execution) | All | Both |
| **hwloc 2** | Cache topology detection (L1/L2/L3 sizes) | Linux, macOS, BSD | Both |
| **libnuma** | NUMA-aware memory allocation | Linux only | Both |
| **libhugetlbfs** | Huge page size detection | Linux only | Both |

**Note**: hwloc 2.x is required. hwloc 1.x uses a different API and is not supported. Without hwloc, the benchmark falls back to sysctl (macOS/BSD) or `/sys/devices/system/cpu/*/cache/` (Linux). Without libnuma, memory is allocated without NUMA awareness (may underperform on multi-socket systems).
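On Linux without hwloc, the sysfs fallback boils down to reading small text files; a sketch (the actual parsing in `membench.c` may differ in detail):

```c
#include <stdio.h>
#include <string.h>

/* Read the level and size of each cache index reported for cpu0.
 * Sizes come back as strings like "32K" or "1024K". */
int main(void) {
    for (int idx = 0; idx < 8; idx++) {
        char path[128], level[16] = "", size[16] = "";
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu0/cache/index%d/level", idx);
        if (!(f = fopen(path, "r"))) break;   /* no more cache indices */
        if (fgets(level, sizeof(level), f)) level[strcspn(level, "\n")] = 0;
        fclose(f);

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu0/cache/index%d/size", idx);
        if ((f = fopen(path, "r"))) {
            if (fgets(size, sizeof(size), f)) size[strcspn(size, "\n")] = 0;
            fclose(f);
        }
        printf("index%d: L%s, %s\n", idx, level, size);
    }
    return 0;
}
```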
## License

Mozilla Public License 2.0

## See Also

- [STREAM benchmark](https://www.cs.virginia.edu/stream/)
- [lmbench](https://sourceforge.net/projects/lmbench/)
- [ram_bench](https://github.com/emilk/ram_bench)

sc-membench-1.2.1/membench.c

/*
 * sc-membench - Portable Memory Bandwidth and Latency Benchmark
 *
 * A multi-platform memory benchmark that:
 * - Works on Linux, macOS, FreeBSD, and other Unix-like systems
 * - Works on x86, arm64, and other architectures
 * - Measures read, write, and copy bandwidth using OpenMP
 * - Measures memory latency using pointer chasing
 * - Handles NUMA automatically (works on non-NUMA too)
 * - Sweeps through cache and memory sizes
 * - Finds optimal thread count for peak bandwidth
 * - Outputs CSV format for analysis
 *
 * Compile (recommended - use make for auto-detection):
 *   make          # Auto-detect available features
 *   make basic    # Minimal build, no optional dependencies
 *   make full     # All features (Linux: hwloc + numa + hugetlbfs)
 *
 * Manual compilation:
 *   gcc -O3 -fopenmp -o membench membench.c -lm
 *   # With optional libraries:
 *   gcc -O3 -fopenmp -DUSE_HWLOC -DUSE_NUMA -DHAVE_HUGETLBFS \
 *       -o membench membench.c -lm -lhwloc -lnuma -lhugetlbfs
 *
 * Usage:
 *   ./membench [options]
 *   ./membench -h    # Show help
 *
 * Copyright 2026 Spare Cores
 * Licensed under Mozilla Public License 2.0
 */

/* Platform detection (may be overridden by compiler flags) */
#if !defined(PLATFORM_LINUX) && !defined(PLATFORM_MACOS) && !defined(PLATFORM_BSD)
#if defined(__linux__)
#define PLATFORM_LINUX
#elif defined(__APPLE__) && defined(__MACH__)
#define PLATFORM_MACOS
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__)
#define PLATFORM_BSD
#endif
#endif

/* Enable GNU extensions on Linux for CPU affinity (must be before includes) */
#ifdef PLATFORM_LINUX
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <sys/mman.h>
#include <omp.h>

/* Platform-specific includes */
#ifdef PLATFORM_LINUX
#include <sched.h>
#endif
#ifdef PLATFORM_BSD
#include <sys/param.h>
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#ifdef PLATFORM_MACOS
#include <sys/types.h>
#include <sys/sysctl.h>
#include <mach/mach.h>
#endif

/* Optional library: libhugetlbfs (Linux only, for huge page size detection) */
#if defined(HAVE_HUGETLBFS) && defined(PLATFORM_LINUX)
#include <hugetlbfs.h>
#endif

/* Optional library: NUMA support (Linux only) */
#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>
#endif

/* ============================================================================
 * Configuration
 * ============================================================================ */

#define VERSION "1.2.1"

/* Target time per individual measurement (seconds) */
#define TARGET_TIME_PER_TEST 0.25

/* Minimum iterations per test (keep low for large buffers that take
 * seconds per iteration) */
#define MIN_ITERATIONS 3

/* Maximum iterations per test */
#define MAX_ITERATIONS 10000000

/* Default total runtime target (seconds). 0 = unlimited */
#define DEFAULT_MAX_RUNTIME 0

/* Fixed RAM sizes for when we need to measure pure memory bandwidth */
#define RAM_SIZE_1 (64UL * 1024 * 1024)   /* 64 MB - definitely past any L3 */
#define RAM_SIZE_2 (256UL * 1024 * 1024)  /* 256 MB - more RAM data points */

/* Get huge page size dynamically from the system.
 * Tries multiple methods in order of reliability:
 *   1. libhugetlbfs (if available, most reliable)
 *   2. /proc/meminfo (Linux)
 *   3. sysctl (macOS/BSD)
Default fallback (2MB for x86, common size) * Returns the default huge page size (typically 2MB on x86, varies on ARM). */ static size_t get_huge_page_size(void) { static size_t cached_size = 0; if (cached_size != 0) return cached_size; #if defined(HAVE_HUGETLBFS) && defined(PLATFORM_LINUX) /* Method 1: libhugetlbfs (most reliable on Linux) */ long size = gethugepagesize(); if (size > 0) { cached_size = (size_t)size; return cached_size; } #endif #ifdef PLATFORM_LINUX /* Method 2: Parse /proc/meminfo */ FILE *file = fopen("/proc/meminfo", "r"); if (file) { char line[256]; unsigned long size_kb = 0; while (fgets(line, sizeof(line), file)) { if (sscanf(line, "Hugepagesize: %lu kB", &size_kb) == 1) { cached_size = size_kb * 1024; fclose(file); return cached_size; } } fclose(file); } #endif #if defined(PLATFORM_MACOS) || defined(PLATFORM_BSD) /* Method 3: sysctl for macOS/BSD (get VM page size, huge pages vary) */ /* Note: macOS doesn't have traditional huge pages like Linux, * but we can use vm.pagesize as a reference. Superpage support varies. */ int mib[2] = { CTL_HW, HW_PAGESIZE }; int pagesize = 0; size_t len = sizeof(pagesize); if (sysctl(mib, 2, &pagesize, &len, NULL, 0) == 0 && pagesize > 0) { /* On macOS, superpage size is typically 2MB on Intel, 16KB on ARM * but there's no standard API to query it. Use 2MB as common default. */ cached_size = 2UL * 1024 * 1024; return cached_size; } #endif /* Method 4: Default fallback (2MB, most common huge page size) */ cached_size = 2UL * 1024 * 1024; return cached_size; } /* Minimum buffer size to use huge pages (2 huge pages). * Below this threshold, TLB pressure isn't significant and huge pages * would waste memory (each allocation rounds up to huge page boundary). */ static size_t get_huge_page_threshold(void) { return 2 * get_huge_page_size(); } /* ============================================================================ * Types * ============================================================================ */ typedef enum { OP_READ, OP_WRITE, OP_COPY, OP_LATENCY /* Memory latency test using pointer chasing */ } operation_t; static const char* OP_NAMES[] = {"read", "write", "copy", "latency"}; typedef struct { size_t size; operation_t op; int threads; double bandwidth_mb_s; /* For read/write/copy */ double latency_ns; /* For latency test (median) */ double latency_mean_ns; /* For latency test (mean) */ double latency_stddev_ns; /* For latency test (standard deviation) */ double latency_cv; /* Coefficient of variation (stddev/mean) */ int latency_samples; /* Number of samples for latency measurement */ double elapsed_s; int iterations; } result_t; /* Summary statistics structure */ typedef struct { /* Peak bandwidth for large buffer sizes (RAM speed) */ double peak_read_mb_s; double peak_write_mb_s; double peak_copy_mb_s; /* Best latency for large buffer sizes (RAM latency) */ double best_latency_ns; /* Weighted average bandwidth (larger sizes weighted more) */ double weighted_avg_read_mb_s; double weighted_avg_write_mb_s; double weighted_avg_copy_mb_s; /* Counts and weights for weighted average */ double read_weight_sum; double write_weight_sum; double copy_weight_sum; double read_bw_weighted_sum; double write_bw_weighted_sum; double copy_bw_weighted_sum; /* Track the largest size tested for "RAM" results */ size_t largest_size_tested; /* Count of measurements */ int read_count; int write_count; int copy_count; int latency_count; } summary_t; static summary_t g_summary = {0}; /* 
============================================================================ * Global state * ============================================================================ */ static volatile int g_running = 1; static int g_verbose = 0; /* 0=quiet, 1=summary, 2=detailed */ static int g_full_sweep = 0; /* If 1, test all sizes up to max; if 0, stop early when converged */ static size_t g_single_size = 0; /* If > 0, test only this size (in bytes) */ static int g_human_readable = 0; /* If 1, output human-readable format instead of CSV */ static int g_num_cpus = 0; static int g_numa_nodes = 0; static size_t g_total_memory = 0; /* NUMA topology - CPUs per node for balanced thread distribution */ #define MAX_NUMA_NODES 64 #define MAX_CPUS_PER_NODE 512 static int g_cpus_per_node[MAX_NUMA_NODES]; /* Count of CPUs on each node */ static int g_node_cpus[MAX_NUMA_NODES][MAX_CPUS_PER_NODE]; /* CPU IDs for each node */ /* Number of times to run each benchmark, taking best result (like lmbench TRIES=11) */ #define DEFAULT_BENCHMARK_TRIES 3 static int g_benchmark_tries = DEFAULT_BENCHMARK_TRIES; /* Thread count options: * g_explicit_threads > 0: use exactly that many threads * g_explicit_threads == 0: use num_cpus (default) * g_auto_scaling: try multiple thread counts to find best */ static int g_explicit_threads = 0; static int g_auto_scaling = 0; static double g_max_runtime = DEFAULT_MAX_RUNTIME; /* Huge pages support */ static int g_use_hugepages = 0; /* Operation selection bitmask (bit 0=read, 1=write, 2=copy, 3=latency) */ #define OP_MASK_ALL 0x0F /* All operations enabled */ static int g_ops_mask = OP_MASK_ALL; /* Detected cache sizes (per core) */ static size_t g_l1_cache_size = 0; static size_t g_l2_cache_size = 0; static size_t g_l3_cache_size = 0; /* Minimum total buffer size - adaptive based on cache topology */ static size_t g_min_total_size = 4096; /* Default 4KB, updated after cache detection */ /* ============================================================================ * Timing * ============================================================================ */ static inline double get_time(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec + ts.tv_nsec * 1e-9; } /* ============================================================================ * Memory operations * ============================================================================ */ /* Prevent compiler from optimizing away operations */ static volatile uint64_t g_sink = 0; /* * Memory operations - heavily optimized for bandwidth measurement * Key techniques: * 1. Multiple independent accumulators to break dependency chains * 2. Large unrolling (32 elements = 256 bytes per iteration) * 3. 
Force inlining to eliminate call overhead */ /* Read operation: XOR all 64-bit words with independent accumulators * XOR is faster than ADD and has no carry dependency chains */ static inline __attribute__((always_inline)) uint64_t mem_read(const void *buf, size_t size) { const uint64_t *p = (const uint64_t *)buf; const uint64_t *end = p + (size / sizeof(uint64_t)); /* Use 8 independent accumulators - each one handles every 8th element */ uint64_t x0 = 0, x1 = 0, x2 = 0, x3 = 0; uint64_t x4 = 0, x5 = 0, x6 = 0, x7 = 0; /* Process 32 elements (256 bytes) per iteration */ while (p + 32 <= end) { x0 ^= p[0]; x1 ^= p[1]; x2 ^= p[2]; x3 ^= p[3]; x4 ^= p[4]; x5 ^= p[5]; x6 ^= p[6]; x7 ^= p[7]; x0 ^= p[8]; x1 ^= p[9]; x2 ^= p[10]; x3 ^= p[11]; x4 ^= p[12]; x5 ^= p[13]; x6 ^= p[14]; x7 ^= p[15]; x0 ^= p[16]; x1 ^= p[17]; x2 ^= p[18]; x3 ^= p[19]; x4 ^= p[20]; x5 ^= p[21]; x6 ^= p[22]; x7 ^= p[23]; x0 ^= p[24]; x1 ^= p[25]; x2 ^= p[26]; x3 ^= p[27]; x4 ^= p[28]; x5 ^= p[29]; x6 ^= p[30]; x7 ^= p[31]; p += 32; } /* Handle remaining elements */ while (p + 8 <= end) { x0 ^= p[0]; x1 ^= p[1]; x2 ^= p[2]; x3 ^= p[3]; x4 ^= p[4]; x5 ^= p[5]; x6 ^= p[6]; x7 ^= p[7]; p += 8; } while (p < end) { x0 ^= *p++; } return x0 ^ x1 ^ x2 ^ x3 ^ x4 ^ x5 ^ x6 ^ x7; } /* Write operation: fill with pattern, heavily unrolled */ static inline __attribute__((always_inline)) void mem_write(void *buf, size_t size, uint64_t pattern) { uint64_t *p = (uint64_t *)buf; uint64_t *end = p + (size / sizeof(uint64_t)); /* Process 32 elements (256 bytes) per iteration */ while (p + 32 <= end) { p[0] = pattern; p[1] = pattern; p[2] = pattern; p[3] = pattern; p[4] = pattern; p[5] = pattern; p[6] = pattern; p[7] = pattern; p[8] = pattern; p[9] = pattern; p[10] = pattern; p[11] = pattern; p[12] = pattern; p[13] = pattern; p[14] = pattern; p[15] = pattern; p[16] = pattern; p[17] = pattern; p[18] = pattern; p[19] = pattern; p[20] = pattern; p[21] = pattern; p[22] = pattern; p[23] = pattern; p[24] = pattern; p[25] = pattern; p[26] = pattern; p[27] = pattern; p[28] = pattern; p[29] = pattern; p[30] = pattern; p[31] = pattern; p += 32; } /* Handle remaining */ while (p < end) { *p++ = pattern; } } /* Copy operation: copy from src to dst, heavily unrolled */ static inline __attribute__((always_inline)) void mem_copy(void *dst, const void *src, size_t size) { const uint64_t *s = (const uint64_t *)src; uint64_t *d = (uint64_t *)dst; const uint64_t *end = s + (size / sizeof(uint64_t)); /* Process 32 elements (256 bytes) per iteration */ while (s + 32 <= end) { d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3]; d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7]; d[8] = s[8]; d[9] = s[9]; d[10] = s[10]; d[11] = s[11]; d[12] = s[12]; d[13] = s[13]; d[14] = s[14]; d[15] = s[15]; d[16] = s[16]; d[17] = s[17]; d[18] = s[18]; d[19] = s[19]; d[20] = s[20]; d[21] = s[21]; d[22] = s[22]; d[23] = s[23]; d[24] = s[24]; d[25] = s[25]; d[26] = s[26]; d[27] = s[27]; d[28] = s[28]; d[29] = s[29]; d[30] = s[30]; d[31] = s[31]; s += 32; d += 32; } /* Handle remaining */ while (s < end) { *d++ = *s++; } } /* * Memory latency test using pointer chasing * * This implementation is based on ram_bench by Emil Ernerfeldt: * https://github.com/emilk/ram_bench * * Recommended by Alex Miller. * * Uses a linked list traversal approach where each node contains a payload * and a pointer to the next node. Nodes are allocated contiguously but * linked in random order to defeat hardware prefetchers. 
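 *
 * Illustrative contrast (a sketch, not part of the build): only dependent
 * loads measure latency, because each address is unknown until the previous
 * load completes, while independent loads can overlap many misses:
 *
 *   while (n--) p = p->next;            // latency-bound: serialized loads
 *   for (i = 0; i < n; i++) s += a[i];  // bandwidth-bound: prefetchable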
* * Key insight from ram_bench: random memory access cost is O(√N) due to * cache hierarchy (L1, L2, L3, RAM) and the fundamental limit that memory * within distance r from CPU is bounded by r² (Bekenstein bound). */ /* Node structure for linked list traversal (16 bytes like ram_bench) * The payload prevents compiler from optimizing away the traversal * and makes the structure cache-line realistic */ typedef struct LatencyNode LatencyNode; struct LatencyNode { uint64_t payload; /* Dummy data for realistic cache behavior */ LatencyNode *next; /* Pointer to next node in chain */ }; /* Statistical parameters for latency measurement */ #define LATENCY_MIN_SAMPLES 7 /* Minimum samples for statistical validity */ #define LATENCY_MAX_SAMPLES 21 /* Maximum samples (enough for robust statistics) */ #define LATENCY_TARGET_CV 0.05 /* Target coefficient of variation (5%) */ /* Comparison function for qsort (double ascending) */ static int compare_double(const void *a, const void *b) { double da = *(const double *)a; double db = *(const double *)b; if (da < db) return -1; if (da > db) return 1; return 0; } /* Calculate median of sorted array */ static double calculate_median(double *sorted, int n) { if (n == 0) return 0; if (n % 2 == 0) { return (sorted[n/2 - 1] + sorted[n/2]) / 2.0; } return sorted[n/2]; } /* Calculate mean of array */ static double calculate_mean(double *arr, int n) { if (n == 0) return 0; double sum = 0; for (int i = 0; i < n; i++) { sum += arr[i]; } return sum / n; } /* Calculate standard deviation of array */ static double calculate_stddev(double *arr, int n, double mean) { if (n < 2) return 0; double sum_sq = 0; for (int i = 0; i < n; i++) { double diff = arr[i] - mean; sum_sq += diff * diff; } return sqrt(sum_sq / (n - 1)); /* Sample standard deviation */ } /* Fisher-Yates shuffle for node pointer array */ static void shuffle_nodes(LatencyNode **nodes, size_t n) { for (size_t i = n - 1; i > 0; i--) { size_t j = (size_t)rand() % (i + 1); LatencyNode *tmp = nodes[i]; nodes[i] = nodes[j]; nodes[j] = tmp; } } /* Allocate memory for latency chain with NUMA awareness and huge page support * Uses mmap with optional huge pages to reduce TLB overhead for large buffers */ static LatencyNode* alloc_latency_memory(size_t num_nodes, size_t *alloc_size) { size_t size = num_nodes * sizeof(LatencyNode); *alloc_size = size; LatencyNode *memory = MAP_FAILED; int try_hugepages = g_use_hugepages && (size >= get_huge_page_threshold()); if (try_hugepages) { /* Round up size to huge page boundary */ size_t hp_size = get_huge_page_size(); size_t aligned_size = (size + hp_size - 1) & ~(hp_size - 1); *alloc_size = aligned_size; #ifdef MAP_HUGETLB /* Try explicit huge pages first */ memory = (LatencyNode *)mmap(NULL, aligned_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (memory != MAP_FAILED) { if (g_verbose >= 2) { fprintf(stderr, " Latency: allocated %zu bytes using explicit 2MB huge pages\n", aligned_size); } } #endif /* Fall back to THP (Transparent Huge Pages) */ if (memory == MAP_FAILED) { memory = (LatencyNode *)mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (memory != MAP_FAILED) { #ifdef MADV_HUGEPAGE if (madvise(memory, size, MADV_HUGEPAGE) == 0) { if (g_verbose >= 2) { fprintf(stderr, " Latency: allocated %zu bytes with THP (transparent huge pages)\n", size); } } else if (g_verbose >= 2) { fprintf(stderr, " Latency: allocated %zu bytes (THP hint failed)\n", size); } #endif *alloc_size = size; /* Reset to actual size for THP 
*/ } } } /* Regular allocation if huge pages disabled or failed */ if (memory == MAP_FAILED) { memory = (LatencyNode *)mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); *alloc_size = size; } if (memory == MAP_FAILED) return NULL; #ifdef USE_NUMA /* Bind memory to NUMA node 0 (where CPU 0 is) for consistent latency measurement */ if (numa_available() >= 0 && g_numa_nodes > 1) { int node = numa_node_of_cpu(0); if (node >= 0) { unsigned long nodemask = 1UL << node; mbind(memory, *alloc_size, MPOL_BIND, &nodemask, g_numa_nodes + 1, MPOL_MF_MOVE); if (g_verbose >= 2) { fprintf(stderr, " Latency memory bound to NUMA node %d\n", node); } } } #endif return memory; } /* Free latency chain memory allocated via mmap */ static void free_latency_memory(LatencyNode *memory, size_t alloc_size) { if (memory && alloc_size > 0) { munmap(memory, alloc_size); } } /* Initialize linked list with random traversal order * Memory is contiguous (good for allocation) but traversal is random * (defeats prefetcher, measures true memory latency) * Returns: start node pointer; caller must track alloc_size for freeing */ static LatencyNode* init_latency_chain(size_t num_nodes, size_t *alloc_size) { if (num_nodes < 2) return NULL; /* Allocate contiguous memory for all nodes using NUMA-aware allocation */ LatencyNode *memory = alloc_latency_memory(num_nodes, alloc_size); if (!memory) return NULL; /* Initialize payloads (also touches pages for NUMA first-touch policy) */ for (size_t i = 0; i < num_nodes; i++) { memory[i].payload = i; /* Unique payload for each node */ } /* Create array of pointers for shuffling */ LatencyNode **nodes = (LatencyNode **)malloc(num_nodes * sizeof(LatencyNode *)); if (!nodes) { free_latency_memory(memory, *alloc_size); return NULL; } for (size_t i = 0; i < num_nodes; i++) { nodes[i] = &memory[i]; } /* Shuffle to create random traversal order */ shuffle_nodes(nodes, num_nodes); /* Link nodes in shuffled order (circular) */ for (size_t i = 0; i < num_nodes - 1; i++) { nodes[i]->next = nodes[i + 1]; } nodes[num_nodes - 1]->next = nodes[0]; /* Close the loop */ LatencyNode *start = nodes[0]; free(nodes); return start; } /* Free latency chain - need base address and size */ static void free_latency_chain(LatencyNode *start, size_t num_nodes, size_t alloc_size) { if (!start || num_nodes == 0) return; /* Find the lowest address in the chain (that's where mmap'd block starts) */ LatencyNode *min_addr = start; LatencyNode *node = start->next; size_t visited = 1; while (node != start && visited < num_nodes) { if (node < min_addr) min_addr = node; node = node->next; visited++; } free_latency_memory(min_addr, alloc_size); } /* Pin current thread to CPU 0 for consistent latency measurement. * Platform-specific implementations for Linux, macOS, and BSD. */ static void pin_thread_to_cpu0(void) { int success = 0; #ifdef PLATFORM_LINUX cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(0, &cpuset); success = (sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) == 0); #endif #ifdef PLATFORM_BSD cpuset_t cpuset; CPU_ZERO(&cpuset); CPU_SET(0, &cpuset); success = (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(cpuset), &cpuset) == 0); #endif #ifdef PLATFORM_MACOS /* macOS doesn't have true CPU affinity, but we can suggest affinity * via thread_policy_set with THREAD_AFFINITY_POLICY. * This is a hint, not a guarantee. 
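 *
 * Where pinning is only advisory, a hypothetical sanity check (illustrative,
 * not compiled here) could verify placement before measuring; on Linux:
 *
 *   int cpu = sched_getcpu();   // assumption: sched_getcpu() is available
 *   if (cpu != 0)
 *       fprintf(stderr, "warning: latency thread on CPU %d, not 0\n", cpu);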
*/ thread_affinity_policy_data_t policy = { 0 }; /* Affinity tag 0 */ success = (thread_policy_set(mach_thread_self(), THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT) == KERN_SUCCESS); #endif if (g_verbose >= 2) { if (success) { fprintf(stderr, " Latency thread pinned to CPU 0\n"); } else { fprintf(stderr, " Warning: Could not pin thread to CPU 0\n"); } } (void)success; /* Suppress unused warning if no platform matched */ } /* Chase through linked list - each load depends on previous * Returns final node pointer to prevent optimization */ static inline __attribute__((always_inline)) LatencyNode* chase_latency_chain(LatencyNode *start, size_t count) { LatencyNode *node = start; volatile uint64_t sink = 0; /* Prevent optimization */ /* Unroll 8x to reduce loop overhead while maintaining dependency chain */ size_t i = count; while (i >= 8) { sink += node->payload; node = node->next; sink += node->payload; node = node->next; sink += node->payload; node = node->next; sink += node->payload; node = node->next; sink += node->payload; node = node->next; sink += node->payload; node = node->next; sink += node->payload; node = node->next; sink += node->payload; node = node->next; i -= 8; } while (i > 0) { sink += node->payload; node = node->next; i--; } g_sink += sink; /* Store to global to prevent optimization */ return node; } /* Result structure for latency measurement with statistics */ typedef struct { double median_ns; /* Median latency (robust to outliers) */ double mean_ns; /* Mean latency */ double stddev_ns; /* Standard deviation */ double cv; /* Coefficient of variation (stddev/mean) */ int num_samples; /* Number of samples collected */ size_t total_accesses; /* Total node accesses performed */ } latency_stats_t; /* Target time per sample in seconds - long enough for timer precision, * short enough for reasonable total measurement time */ #define LATENCY_TARGET_SAMPLE_TIME 0.1 /* 100ms per sample */ #define LATENCY_MIN_SAMPLE_TIME 0.01 /* 10ms minimum for timer precision */ /* Measure latency with statistical validity * * Strategy: * 1. Create random linked list covering the buffer size * 2. Warmup by traversing the list once * 3. Calibration run to estimate latency and calculate traversals needed * 4. Collect multiple independent time samples * 5. Continue until CV < target or max samples reached * 6. Report median (robust to outliers) and statistics * * Returns statistically valid latency measurement */ static latency_stats_t measure_latency_stats(size_t buffer_size) { latency_stats_t stats = {0}; /* Pin thread to CPU 0 for consistent latency measurement. * This prevents OS scheduler from migrating the thread during measurement, * which would cause inconsistent results due to cache effects and NUMA. 
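 *
 * Worked example for the calibration below (illustrative numbers): a 64 MB
 * buffer holds ~4M 16-byte nodes; at ~100 ns per access one traversal takes
 * ~0.4 s, so a single traversal already exceeds the 100 ms sample target.
 * A 32 KB buffer holds 2048 nodes; at ~1.5 ns per access one traversal is
 * ~3 us, so ~33,000 traversals would be needed and the 10,000 cap applies.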
*/ pin_thread_to_cpu0(); /* Calculate number of nodes that fit in buffer */ size_t num_nodes = buffer_size / sizeof(LatencyNode); if (num_nodes < 64) num_nodes = 64; /* Minimum for meaningful measurement */ /* Initialize chain with NUMA-aware allocation */ size_t alloc_size = 0; LatencyNode *start = init_latency_chain(num_nodes, &alloc_size); if (!start) { fprintf(stderr, "Failed to allocate %zu bytes for latency test\n", num_nodes * sizeof(LatencyNode)); return stats; } /* Warmup: single traversal to prime caches and stabilize CPU */ chase_latency_chain(start, num_nodes); /* Calibration: time a single traversal to estimate latency */ double cal_start = get_time(); chase_latency_chain(start, num_nodes); double cal_elapsed = get_time() - cal_start; /* Calculate traversals needed to achieve target sample time */ double estimated_latency_s = cal_elapsed / num_nodes; size_t traversals_per_sample; if (estimated_latency_s > 0) { /* Calculate traversals to reach target sample time */ double target_accesses = LATENCY_TARGET_SAMPLE_TIME / estimated_latency_s; traversals_per_sample = (size_t)(target_accesses / num_nodes); /* Ensure at least 1 full traversal per sample */ if (traversals_per_sample < 1) traversals_per_sample = 1; /* Cap at reasonable maximum for very fast (L1) accesses */ if (traversals_per_sample > 10000) traversals_per_sample = 10000; } else { /* Fallback: at least 1 traversal */ traversals_per_sample = 1; } /* Sample collection */ double samples[LATENCY_MAX_SAMPLES]; int num_samples = 0; size_t total_accesses = 0; /* Collect samples until statistically valid or max reached */ while (num_samples < LATENCY_MAX_SAMPLES) { size_t accesses_this_sample = num_nodes * traversals_per_sample; /* Time this sample */ double start_time = get_time(); chase_latency_chain(start, accesses_this_sample); double end_time = get_time(); double elapsed = end_time - start_time; double latency_ns = (elapsed * 1e9) / accesses_this_sample; samples[num_samples++] = latency_ns; total_accesses += accesses_this_sample; /* Check if we have enough samples and they're stable */ if (num_samples >= LATENCY_MIN_SAMPLES) { double mean = calculate_mean(samples, num_samples); double stddev = calculate_stddev(samples, num_samples, mean); double cv = (mean > 0) ? (stddev / mean) : 1.0; /* Stop if coefficient of variation is acceptable */ if (cv < LATENCY_TARGET_CV) { break; } } } /* Calculate final statistics */ double mean = calculate_mean(samples, num_samples); double stddev = calculate_stddev(samples, num_samples, mean); /* Sort for median calculation */ qsort(samples, num_samples, sizeof(double), compare_double); double median = calculate_median(samples, num_samples); /* Populate result */ stats.median_ns = median; stats.mean_ns = mean; stats.stddev_ns = stddev; stats.cv = (mean > 0) ? (stddev / mean) : 0; stats.num_samples = num_samples; stats.total_accesses = total_accesses; /* Cleanup */ free_latency_chain(start, num_nodes, alloc_size); return stats; } /* ============================================================================ * Memory allocation * ============================================================================ */ static void* alloc_buffer(size_t size) { void *buf = MAP_FAILED; int try_hugepages = g_use_hugepages && (size >= get_huge_page_threshold()); if (try_hugepages) { /* * Strategy: prefer THP over explicit huge pages because: * 1. THP doesn't require pre-allocation by root * 2. THP is managed automatically by the kernel * 3. 
Explicit huge pages may fail if pool isn't configured * * We try explicit huge pages first only because they're more * deterministic (guaranteed 2MB pages vs THP's best-effort). */ #ifdef MAP_HUGETLB /* Round up size to huge page boundary for explicit huge pages */ size_t hp_size = get_huge_page_size(); size_t aligned_size = (size + hp_size - 1) & ~(hp_size - 1); /* Try explicit huge pages (uses pre-allocated pool if available) */ buf = mmap(NULL, aligned_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (buf != MAP_FAILED) { if (g_verbose >= 2) { fprintf(stderr, " Allocated %zu bytes using explicit %zu KB huge pages\n", aligned_size, hp_size / 1024); } /* Touch all pages to ensure they're allocated */ memset(buf, 0, size); return buf; } /* Explicit huge pages failed - likely no pool configured, try THP */ #endif /* Use mmap + madvise for Transparent Huge Pages (no pre-allocation needed) */ buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (buf != MAP_FAILED) { #ifdef MADV_HUGEPAGE /* Hint to kernel: please use huge pages for this region. * The kernel will use THP if available and beneficial. * This doesn't require root or pre-allocation. */ if (madvise(buf, size, MADV_HUGEPAGE) == 0) { if (g_verbose >= 2) { fprintf(stderr, " Allocated %zu bytes with THP (transparent huge pages)\n", size); } } else if (g_verbose >= 2) { fprintf(stderr, " Allocated %zu bytes (THP hint failed, using regular pages)\n", size); } #else if (g_verbose >= 2) { fprintf(stderr, " Allocated %zu bytes (THP not available on this system)\n", size); } #endif /* Touch all pages to ensure they're allocated */ memset(buf, 0, size); return buf; } } /* Regular allocation: small buffers, huge pages disabled, or fallback */ buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (buf == MAP_FAILED) { return NULL; } /* Touch all pages to ensure they're allocated */ memset(buf, 0, size); return buf; } static void free_buffer(void *buf, size_t size) { if (buf) { munmap(buf, size); } } /* ============================================================================ * Cache topology detection using hwloc (portable: x86, arm64, etc.) 
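 *
 * The code below assumes the hwloc 2.x object model (HWLOC_OBJ_L1CACHE and
 * friends); a hypothetical compile-time guard (illustrative, not in the
 * build) would fail fast on hwloc 1.x:
 *
 *   #if defined(USE_HWLOC) && HWLOC_API_VERSION < 0x00020000
 *   #error "hwloc 2.x or newer is required"
 *   #endif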
* * Install hwloc: * Debian/Ubuntu: apt-get install libhwloc-dev * RHEL/CentOS: yum install hwloc-devel * macOS: brew install hwloc * ============================================================================ */ #ifdef USE_HWLOC #include static hwloc_topology_t g_topology = NULL; /* Detect cache sizes using hwloc */ static void init_cache_info(void) { if (hwloc_topology_init(&g_topology) < 0) { goto use_defaults; } if (hwloc_topology_load(g_topology) < 0) { hwloc_topology_destroy(g_topology); g_topology = NULL; goto use_defaults; } /* Find cache sizes by iterating through cache objects */ int depth; /* L1 Data Cache */ depth = hwloc_get_type_depth(g_topology, HWLOC_OBJ_L1CACHE); if (depth != HWLOC_TYPE_DEPTH_UNKNOWN) { hwloc_obj_t obj = hwloc_get_obj_by_depth(g_topology, depth, 0); if (obj && obj->attr && obj->attr->cache.type != HWLOC_OBJ_CACHE_INSTRUCTION) { g_l1_cache_size = obj->attr->cache.size; } } /* L2 Cache */ depth = hwloc_get_type_depth(g_topology, HWLOC_OBJ_L2CACHE); if (depth != HWLOC_TYPE_DEPTH_UNKNOWN) { hwloc_obj_t obj = hwloc_get_obj_by_depth(g_topology, depth, 0); if (obj && obj->attr) { g_l2_cache_size = obj->attr->cache.size; } } /* L3 Cache */ depth = hwloc_get_type_depth(g_topology, HWLOC_OBJ_L3CACHE); if (depth != HWLOC_TYPE_DEPTH_UNKNOWN) { hwloc_obj_t obj = hwloc_get_obj_by_depth(g_topology, depth, 0); if (obj && obj->attr) { g_l3_cache_size = obj->attr->cache.size; } } /* Count total L3 cache (sum across all L3 objects for distributed caches) */ if (g_l3_cache_size > 0) { depth = hwloc_get_type_depth(g_topology, HWLOC_OBJ_L3CACHE); int num_l3 = hwloc_get_nbobjs_by_depth(g_topology, depth); if (g_verbose && num_l3 > 1) { fprintf(stderr, "Note: %d L3 caches detected (distributed across dies)\n", num_l3); } } use_defaults: /* Set defaults if detection failed */ if (g_l1_cache_size == 0) g_l1_cache_size = 32 * 1024; /* 32 KB */ if (g_l2_cache_size == 0) g_l2_cache_size = 256 * 1024; /* 256 KB */ if (g_l3_cache_size == 0) g_l3_cache_size = 8 * 1024 * 1024; /* 8 MB */ /* Calculate adaptive minimum total size: * Use 16KB per thread × num_cpus so each thread has a reliable buffer size. * This ensures all CPUs can participate with meaningful measurements. 
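 *
 * For example, a 4-CPU machine gets a 64 KB floor and a 64-CPU machine a
 * 1 MB floor, so per-thread slices never shrink below 16 KB.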
*/ g_min_total_size = 16384 * g_num_cpus; /* 16KB per thread minimum */ if (g_verbose) { fprintf(stderr, "Cache (hwloc): L1d=%zuKB, L2=%zuKB, L3=%zuKB (per core)\n", g_l1_cache_size / 1024, g_l2_cache_size / 1024, g_l3_cache_size / 1024); fprintf(stderr, "Minimum total test size: %zu KB (16KB × %d CPUs)\n", g_min_total_size / 1024, g_num_cpus); } } static void cleanup_hwloc(void) { if (g_topology) { hwloc_topology_destroy(g_topology); g_topology = NULL; } } #else /* !USE_HWLOC - fallback to platform-specific methods */ #ifdef PLATFORM_LINUX /* Parse cache size from sysfs (handles "48K", "1024K", "32768K" format) */ static size_t parse_cache_size_sysfs(const char *str) { size_t size = 0; char unit = 0; if (sscanf(str, "%zu%c", &size, &unit) >= 1) { if (unit == 'K' || unit == 'k') size *= 1024; else if (unit == 'M' || unit == 'm') size *= 1024 * 1024; } return size; } /* Read cache info from sysfs (Linux-specific) */ static void init_cache_info_linux(void) { char path[256]; char buf[64]; FILE *f; for (int index = 0; index < 10; index++) { /* Read level */ snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu0/cache/index%d/level", index); f = fopen(path, "r"); if (!f) continue; int level = -1; if (fgets(buf, sizeof(buf), f)) level = atoi(buf); fclose(f); if (level < 0) continue; /* Read type */ snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu0/cache/index%d/type", index); f = fopen(path, "r"); if (!f) continue; char type[32] = ""; if (fgets(type, sizeof(type), f)) type[strcspn(type, "\n")] = 0; fclose(f); /* Skip instruction caches */ if (strcmp(type, "Instruction") == 0) continue; /* Read size */ snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu0/cache/index%d/size", index); f = fopen(path, "r"); if (!f) continue; size_t size = 0; if (fgets(buf, sizeof(buf), f)) size = parse_cache_size_sysfs(buf); fclose(f); if (size == 0) continue; switch (level) { case 1: if (g_l1_cache_size == 0) g_l1_cache_size = size; break; case 2: if (g_l2_cache_size == 0) g_l2_cache_size = size; break; case 3: if (g_l3_cache_size == 0) g_l3_cache_size = size; break; } } } #endif /* PLATFORM_LINUX */ #ifdef PLATFORM_MACOS /* Read cache info from sysctl (macOS-specific) */ static void init_cache_info_macos(void) { size_t size; size_t len = sizeof(size); /* L1 data cache */ if (sysctlbyname("hw.l1dcachesize", &size, &len, NULL, 0) == 0 && size > 0) { g_l1_cache_size = size; } /* L2 cache */ len = sizeof(size); if (sysctlbyname("hw.l2cachesize", &size, &len, NULL, 0) == 0 && size > 0) { g_l2_cache_size = size; } /* L3 cache (may not exist on all Macs) */ len = sizeof(size); if (sysctlbyname("hw.l3cachesize", &size, &len, NULL, 0) == 0 && size > 0) { g_l3_cache_size = size; } } #endif /* PLATFORM_MACOS */ #ifdef PLATFORM_BSD /* Read cache info from sysctl (BSD-specific) */ static void init_cache_info_bsd(void) { /* FreeBSD and other BSDs have limited sysctl cache info. * Try standard hw.cacheXXX values, fall back to defaults. 
*/ size_t size; size_t len = sizeof(size); /* Try various BSD sysctl names */ if (sysctlbyname("hw.l1dcachesize", &size, &len, NULL, 0) == 0 && size > 0) { g_l1_cache_size = size; } len = sizeof(size); if (sysctlbyname("hw.l2cachesize", &size, &len, NULL, 0) == 0 && size > 0) { g_l2_cache_size = size; } len = sizeof(size); if (sysctlbyname("hw.l3cachesize", &size, &len, NULL, 0) == 0 && size > 0) { g_l3_cache_size = size; } } #endif /* PLATFORM_BSD */ /* Platform-agnostic cache info initialization */ static void init_cache_info(void) { const char *method = "defaults"; #ifdef PLATFORM_LINUX init_cache_info_linux(); method = "sysfs"; #endif #ifdef PLATFORM_MACOS init_cache_info_macos(); method = "sysctl"; #endif #ifdef PLATFORM_BSD init_cache_info_bsd(); method = "sysctl"; #endif /* Set defaults if detection failed */ if (g_l1_cache_size == 0) g_l1_cache_size = 32 * 1024; /* 32 KB */ if (g_l2_cache_size == 0) g_l2_cache_size = 256 * 1024; /* 256 KB */ if (g_l3_cache_size == 0) g_l3_cache_size = 8 * 1024 * 1024; /* 8 MB */ /* Calculate adaptive minimum total size: * Use 16KB per thread × num_cpus so each thread has a reliable buffer size. */ g_min_total_size = 16384 * g_num_cpus; /* 16KB per thread minimum */ if (g_verbose) { fprintf(stderr, "Cache (%s): L1d=%zuKB, L2=%zuKB, L3=%zuKB (per core)\n", method, g_l1_cache_size / 1024, g_l2_cache_size / 1024, g_l3_cache_size / 1024); fprintf(stderr, "Minimum total test size: %zu KB (16KB × %d CPUs)\n", g_min_total_size / 1024, g_num_cpus); } } static void cleanup_hwloc(void) { /* No-op when hwloc is not used */ } #endif /* USE_HWLOC */ /* ============================================================================ * NUMA support * ============================================================================ */ static void init_numa_topology(void) { /* Initialize topology arrays */ memset(g_cpus_per_node, 0, sizeof(g_cpus_per_node)); memset(g_node_cpus, 0, sizeof(g_node_cpus)); #ifdef USE_NUMA if (numa_available() >= 0 && g_numa_nodes > 1) { /* Build CPU-to-node mapping using libnuma */ for (int cpu = 0; cpu < g_num_cpus && cpu < MAX_NUMA_NODES * MAX_CPUS_PER_NODE; cpu++) { int node = numa_node_of_cpu(cpu); if (node >= 0 && node < MAX_NUMA_NODES) { int idx = g_cpus_per_node[node]; if (idx < MAX_CPUS_PER_NODE) { g_node_cpus[node][idx] = cpu; g_cpus_per_node[node]++; } } } if (g_verbose) { fprintf(stderr, "NUMA topology:\n"); for (int node = 0; node < g_numa_nodes; node++) { fprintf(stderr, " Node %d: %d CPUs (first: %d, last: %d)\n", node, g_cpus_per_node[node], g_cpus_per_node[node] > 0 ? g_node_cpus[node][0] : -1, g_cpus_per_node[node] > 0 ? g_node_cpus[node][g_cpus_per_node[node]-1] : -1); } } } else #endif { /* UMA or NUMA not enabled: all CPUs on "node 0" */ for (int cpu = 0; cpu < g_num_cpus && cpu < MAX_CPUS_PER_NODE; cpu++) { g_node_cpus[0][cpu] = cpu; } g_cpus_per_node[0] = g_num_cpus < MAX_CPUS_PER_NODE ? 
g_num_cpus : MAX_CPUS_PER_NODE; } } static void init_numa(void) { #ifdef USE_NUMA if (numa_available() >= 0) { g_numa_nodes = numa_max_node() + 1; if (g_verbose) { fprintf(stderr, "NUMA: %d nodes detected (libnuma enabled)\n", g_numa_nodes); } } else { g_numa_nodes = 1; if (g_verbose) { fprintf(stderr, "NUMA: not available (libnuma enabled but no NUMA support)\n"); } } #else g_numa_nodes = 1; if (g_verbose) { fprintf(stderr, "NUMA: disabled (compile with -DUSE_NUMA -lnuma to enable)\n"); } #endif /* Build NUMA topology after detecting nodes */ init_numa_topology(); } /* ============================================================================ * System info * ============================================================================ */ static void init_system_info(void) { /* Get number of CPUs (POSIX, works on all platforms) */ g_num_cpus = sysconf(_SC_NPROCESSORS_ONLN); if (g_num_cpus < 1) g_num_cpus = 1; /* Get total memory (platform-specific methods) */ g_total_memory = 0; #ifdef PLATFORM_LINUX /* Linux: sysconf is reliable */ long pages = sysconf(_SC_PHYS_PAGES); long page_size = sysconf(_SC_PAGESIZE); if (pages > 0 && page_size > 0) { g_total_memory = (size_t)pages * (size_t)page_size; } #endif #ifdef PLATFORM_MACOS /* macOS: use sysctl hw.memsize */ int64_t memsize = 0; size_t len = sizeof(memsize); if (sysctlbyname("hw.memsize", &memsize, &len, NULL, 0) == 0 && memsize > 0) { g_total_memory = (size_t)memsize; } #endif #ifdef PLATFORM_BSD /* BSD: try hw.physmem or hw.realmem */ unsigned long physmem = 0; size_t len = sizeof(physmem); if (sysctlbyname("hw.physmem", &physmem, &len, NULL, 0) == 0 && physmem > 0) { g_total_memory = (size_t)physmem; } else { /* Fallback to sysconf */ long pages = sysconf(_SC_PHYS_PAGES); long page_size = sysconf(_SC_PAGESIZE); if (pages > 0 && page_size > 0) { g_total_memory = (size_t)pages * (size_t)page_size; } } #endif /* Fallback if detection failed */ if (g_total_memory == 0) { long pages = sysconf(_SC_PHYS_PAGES); long page_size = sysconf(_SC_PAGESIZE); if (pages > 0 && page_size > 0) { g_total_memory = (size_t)pages * (size_t)page_size; } else { g_total_memory = 1024UL * 1024 * 1024; /* Default 1GB */ } } if (g_verbose) { fprintf(stderr, "System: %d CPUs, %.2f GB memory\n", g_num_cpus, g_total_memory / (1024.0 * 1024 * 1024)); } /* Detect cache topology (must be called after g_num_cpus is set) */ init_cache_info(); } /* ============================================================================ * OpenMP Bandwidth Benchmark * ============================================================================ */ /* * Run bandwidth benchmark using OpenMP. 
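 *
 * Reported bandwidth is aggregate: size × threads × iterations / max_elapsed
 * (computed after Phase 3 below). Worked example with illustrative numbers:
 * 8 threads × 1 MB × 1000 iterations completing in 0.25 s reports 32,000 MB/s.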
* * Key features: * - proc_bind(spread) distributes threads across NUMA nodes * - Per-thread NUMA-local buffer allocation * - Implicit barrier synchronization (more efficient than pthread_barrier) * - 8-accumulator read for optimal bandwidth measurement */ static result_t run_benchmark_omp(size_t size, operation_t op, int nthreads) { result_t result = {0}; result.size = size; result.op = op; result.threads = nthreads; /* Allocate arrays for per-thread buffers and results */ void **src_bufs = calloc(nthreads, sizeof(void*)); void **dst_bufs = calloc(nthreads, sizeof(void*)); double *thread_elapsed = calloc(nthreads, sizeof(double)); uint64_t *thread_checksums = calloc(nthreads, sizeof(uint64_t)); int alloc_failed = 0; if (!src_bufs || !dst_bufs || !thread_elapsed || !thread_checksums) { free(src_bufs); free(dst_bufs); free(thread_elapsed); free(thread_checksums); return result; } /* Set OpenMP thread count */ omp_set_num_threads(nthreads); /* Phase 1: Parallel allocation with NUMA awareness * proc_bind(spread) distributes threads across NUMA nodes, * then each thread allocates memory locally */ #pragma omp parallel proc_bind(spread) { int tid = omp_get_thread_num(); #ifdef USE_NUMA /* Get current CPU and its NUMA node (OpenMP has placed us optimally) */ if (numa_available() >= 0) { int cpu = sched_getcpu(); int node = numa_node_of_cpu(cpu); if (node >= 0) { /* Allocate on local NUMA node */ src_bufs[tid] = numa_alloc_onnode(size, node); if (op == OP_COPY) { dst_bufs[tid] = numa_alloc_onnode(size, node); } } } #endif /* Fallback: regular allocation if NUMA not available or failed */ if (!src_bufs[tid]) { src_bufs[tid] = alloc_buffer(size); } if (op == OP_COPY && !dst_bufs[tid]) { dst_bufs[tid] = alloc_buffer(size); } /* Check allocation success */ if (!src_bufs[tid] || (op == OP_COPY && !dst_bufs[tid])) { #pragma omp atomic write alloc_failed = 1; } /* Initialize buffer (first-touch for NUMA) */ if (src_bufs[tid]) { memset(src_bufs[tid], 0xAA, size); } if (dst_bufs[tid]) { memset(dst_bufs[tid], 0, size); } } if (alloc_failed) { /* Cleanup on allocation failure */ for (int i = 0; i < nthreads; i++) { #ifdef USE_NUMA if (numa_available() >= 0) { if (src_bufs[i]) numa_free(src_bufs[i], size); if (dst_bufs[i]) numa_free(dst_bufs[i], size); } else #endif { free_buffer(src_bufs[i], size); free_buffer(dst_bufs[i], size); } } free(src_bufs); free(dst_bufs); free(thread_elapsed); free(thread_checksums); if (g_verbose) { fprintf(stderr, "Failed to allocate %zu bytes × %d threads\n", size, nthreads); } return result; } /* Phase 2: Calibration - estimate iterations needed */ int iterations = MIN_ITERATIONS; { /* Warmup */ g_sink += mem_read(src_bufs[0], size); /* Time single iteration */ double t_start = get_time(); switch (op) { case OP_READ: g_sink += mem_read(src_bufs[0], size); break; case OP_WRITE: mem_write(src_bufs[0], size, 0x1234567890ABCDEFULL); break; case OP_COPY: mem_copy(dst_bufs[0], src_bufs[0], size); break; default: break; } double time_per_iter = get_time() - t_start; if (time_per_iter > 1e-9) { iterations = (int)(TARGET_TIME_PER_TEST / time_per_iter); if (iterations < MIN_ITERATIONS) iterations = MIN_ITERATIONS; if (iterations > MAX_ITERATIONS) iterations = MAX_ITERATIONS; } } result.iterations = iterations; /* Phase 3: Timed measurement with all threads * OpenMP implicit barrier ensures all threads start together */ #pragma omp parallel proc_bind(spread) { int tid = omp_get_thread_num(); void *src = src_bufs[tid]; void *dst = dst_bufs[tid]; uint64_t checksum = 0; /* Implicit barrier 
here - all threads synchronized */ double t_start = get_time(); switch (op) { case OP_READ: for (int i = 0; i < iterations; i++) { checksum ^= mem_read(src, size); } break; case OP_WRITE: for (int i = 0; i < iterations; i++) { mem_write(src, size, (uint64_t)i); } break; case OP_COPY: for (int i = 0; i < iterations; i++) { mem_copy(dst, src, size); } break; default: break; } double t_end = get_time(); thread_elapsed[tid] = t_end - t_start; thread_checksums[tid] = checksum; } /* Find max elapsed time (determines overall bandwidth) */ double max_elapsed = 0; uint64_t total_checksum = 0; for (int i = 0; i < nthreads; i++) { if (thread_elapsed[i] > max_elapsed) { max_elapsed = thread_elapsed[i]; } total_checksum ^= thread_checksums[i]; } g_sink += total_checksum; result.elapsed_s = max_elapsed; /* Calculate bandwidth = (size per thread * threads * iterations) / time * This gives aggregate bandwidth across all threads. * Note: for copy, we report buffer size (not 2x) to match bw_mem convention */ if (max_elapsed > 0) { size_t bytes_transferred = (size_t)size * nthreads * iterations; result.bandwidth_mb_s = (bytes_transferred / (1024.0 * 1024.0)) / max_elapsed; } /* Cleanup */ for (int i = 0; i < nthreads; i++) { #ifdef USE_NUMA if (numa_available() >= 0) { if (src_bufs[i]) numa_free(src_bufs[i], size); if (dst_bufs[i]) numa_free(dst_bufs[i], size); } else #endif { free_buffer(src_bufs[i], size); free_buffer(dst_bufs[i], size); } } free(src_bufs); free(dst_bufs); free(thread_elapsed); free(thread_checksums); return result; } /* Run single-threaded benchmark (for small buffers or latency) */ static result_t run_benchmark_single(size_t size, operation_t op) { result_t result = {0}; result.size = size; result.op = op; result.threads = 1; void *src = alloc_buffer(size); void *dst = (op == OP_COPY) ? 
alloc_buffer(size) : NULL; if (!src || (op == OP_COPY && !dst)) { free_buffer(src, size); free_buffer(dst, size); return result; } memset(src, 0xAA, size); if (dst) memset(dst, 0, size); /* Warmup */ g_sink += mem_read(src, size); /* Calibrate */ double t_start = get_time(); switch (op) { case OP_READ: g_sink += mem_read(src, size); break; case OP_WRITE: mem_write(src, size, 0x1234567890ABCDEFULL); break; case OP_COPY: mem_copy(dst, src, size); break; default: break; } double time_per_iter = get_time() - t_start; int iterations = MIN_ITERATIONS; if (time_per_iter > 1e-9) { iterations = (int)(TARGET_TIME_PER_TEST / time_per_iter); if (iterations < MIN_ITERATIONS) iterations = MIN_ITERATIONS; if (iterations > MAX_ITERATIONS) iterations = MAX_ITERATIONS; } result.iterations = iterations; /* Timed run */ uint64_t checksum = 0; t_start = get_time(); switch (op) { case OP_READ: for (int i = 0; i < iterations; i++) { checksum ^= mem_read(src, size); } break; case OP_WRITE: for (int i = 0; i < iterations; i++) { mem_write(src, size, (uint64_t)i); } break; case OP_COPY: for (int i = 0; i < iterations; i++) { mem_copy(dst, src, size); } break; default: break; } double elapsed = get_time() - t_start; g_sink += checksum; result.elapsed_s = elapsed; if (elapsed > 0) { size_t bytes_transferred = size * iterations; result.bandwidth_mb_s = (bytes_transferred / (1024.0 * 1024.0)) / elapsed; } free_buffer(src, size); free_buffer(dst, size); return result; } /* Main benchmark runner - dispatches to OpenMP or single-threaded */ static result_t run_benchmark(size_t size, operation_t op, int nthreads) { if (nthreads == 1) { return run_benchmark_single(size, op); } return run_benchmark_omp(size, op, nthreads); } /* Run benchmark multiple times and return best result (like lmbench TRIES) * For bandwidth: best = highest bandwidth * For latency: best = lowest latency * * First run is a warmup (discarded) to allow CPU frequency to ramp up * and caches to warm. This dramatically reduces result variability. */ static result_t run_benchmark_best(size_t size, operation_t op, int nthreads) { result_t best = {0}; /* Warmup run - discarded. * This allows: CPU to reach turbo frequency, caches to warm, * thread scheduling to stabilize. Critical for consistent results. 
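 *
 * Net effect: each (size, op, threads) point runs g_benchmark_tries + 1
 * times in total; with the default of 3 tries that is one discarded warmup
 * plus 3 timed runs, keeping the best (highest bandwidth, lowest latency).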
*/ (void)run_benchmark(size, op, nthreads); for (int try = 0; try < g_benchmark_tries; try++) { result_t r = run_benchmark(size, op, nthreads); if (try == 0) { best = r; } else { if (op == OP_LATENCY) { /* For latency: lower is better */ if (r.latency_ns > 0 && r.latency_ns < best.latency_ns) { best = r; } } else { /* For bandwidth: higher is better */ if (r.bandwidth_mb_s > best.bandwidth_mb_s) { best = r; } } } } return best; } /* ============================================================================ * Main benchmark loop * ============================================================================ */ /* Generate thread counts dynamically based on CPU count (for auto-scaling mode) * * Strategy: * - Powers of 2 from 1 up to nproc * - Always include nproc itself (if not already a power of 2) * - No oversubscription (causes unreliable results) * * Examples: * 4 cores: 1, 2, 4 (3 values) * 32 cores: 1, 2, 4, 8, 16, 32 (6 values) * 48 cores: 1, 2, 4, 8, 16, 32, 48 (7 values) */ static int* get_thread_counts(int *count) { int nproc = g_num_cpus; if (nproc < 1) nproc = 1; /* Cap at nproc - oversubscription causes unreliable benchmark results * due to context switching, cache thrashing, and scheduler interference */ int max_threads = nproc; /* Allocate more than enough space */ int *tc = malloc(32 * sizeof(int)); int n = 0; /* Add powers of 2 up to nproc */ for (int t = 1; t <= max_threads; t *= 2) { tc[n++] = t; } /* Add nproc if not already in list (i.e., not a power of 2) */ if (tc[n-1] != nproc) { tc[n++] = nproc; } tc[n] = 0; /* Sentinel */ *count = n; return tc; } /* Round size to nearest power of 2 for cleaner output */ static size_t round_to_power_of_2(size_t size) { if (size == 0) return 4096; size_t power = 1; while (power < size) power <<= 1; /* Return closer of power and power/2 */ if (power - size > size - power/2 && power/2 >= 4096) { return power / 2; } return power; } /* Get sizes to test (per-thread buffer sizes) - adaptive based on cache hierarchy * * Generates sizes at critical cache transition points to show: * 1. Pure L1 performance * 2. L1→L2 transition * 3. Pure L2 performance * 4. L2→L3 transition * 5. L3 region * 6. Pure RAM bandwidth * * All sizes are strictly increasing with no overlaps. */ static size_t* get_sizes(int *count) { int nthreads = g_explicit_threads > 0 ? g_explicit_threads : g_num_cpus; if (nthreads < 1) nthreads = 1; /* Use detected cache sizes, with sensible defaults */ size_t l1 = g_l1_cache_size > 0 ? g_l1_cache_size : 32768; /* 32 KB */ size_t l2 = g_l2_cache_size > 0 ? g_l2_cache_size : 262144; /* 256 KB */ size_t l3 = g_l3_cache_size > 0 ? 
g_l3_cache_size : 8388608; /* 8 MB */ /* Memory limit per thread */ size_t max_size = g_total_memory / 2 / nthreads; /* Build strictly increasing size sequence */ size_t sizes_list[20]; int n = 0; size_t prev = 0; /* Helper macro to add size if > prev and <= max_size */ #define ADD_SIZE(sz) do { \ size_t _s = round_to_power_of_2(sz); \ if (_s > prev && _s <= max_size) { sizes_list[n++] = _s; prev = _s; } \ } while(0) /* L1 region */ ADD_SIZE(l1 / 2); /* L1→L2 transition */ ADD_SIZE(l1 * 2); /* L2 region */ ADD_SIZE(l2 / 2); ADD_SIZE(l2); /* L2→L3 transition */ ADD_SIZE(l2 * 2); /* L3 region */ if (l3 > l2 * 4) { ADD_SIZE(l3 / 4); } ADD_SIZE(l3 / 2); /* L3→RAM transition */ ADD_SIZE(l3); /* RAM region */ ADD_SIZE(l3 * 2); ADD_SIZE(l3 * 4); /* Full sweep: add larger sizes up to memory limit */ if (g_full_sweep) { size_t ram_size = RAM_SIZE_2 * 2; while (ram_size <= max_size && n < 18) { ADD_SIZE(ram_size); ram_size *= 2; } } #undef ADD_SIZE /* Ensure at least one size */ if (n == 0) { sizes_list[n++] = 4096; } /* Copy to result array */ size_t *sizes = malloc((n + 1) * sizeof(size_t)); for (int i = 0; i < n; i++) { sizes[i] = sizes_list[i]; } sizes[n] = 0; *count = n; return sizes; } /* Format size for human readable output (e.g., 1024 KB -> "1 MB") */ static const char* format_size(size_t size_kb, char *buf, size_t buf_size) { if (size_kb >= 1024 * 1024) { snprintf(buf, buf_size, "%zu GB", size_kb / (1024 * 1024)); } else if (size_kb >= 1024) { snprintf(buf, buf_size, "%zu MB", size_kb / 1024); } else { snprintf(buf, buf_size, "%zu KB", size_kb); } return buf; } /* Format bandwidth for human readable output */ static const char* format_bandwidth(double mb_s, char *buf, size_t buf_size) { if (mb_s >= 1000000) { snprintf(buf, buf_size, "%.1f TB/s", mb_s / 1000000); } else if (mb_s >= 1000) { snprintf(buf, buf_size, "%.1f GB/s", mb_s / 1000); } else { snprintf(buf, buf_size, "%.1f MB/s", mb_s); } return buf; } static void print_csv_header(void) { if (g_human_readable) { printf("\n%-10s %-8s %12s %12s %8s\n", "Size", "Op", "Bandwidth", "Latency", "Threads"); printf("%-10s %-8s %12s %12s %8s\n", "----", "--", "---------", "-------", "-------"); } else { printf("size_kb,operation,bandwidth_mb_s,latency_ns,latency_stddev_ns,latency_samples,threads,iterations,elapsed_s\n"); } } static void print_result(const result_t *r) { size_t size_kb = r->size / 1024; if (g_human_readable) { char size_buf[32], bw_buf[32]; format_size(size_kb, size_buf, sizeof(size_buf)); if (r->op == OP_LATENCY) { printf("%-10s %-8s %12s %9.1f ns %8d\n", size_buf, OP_NAMES[r->op], "-", r->latency_ns, r->threads); } else { format_bandwidth(r->bandwidth_mb_s, bw_buf, sizeof(bw_buf)); printf("%-10s %-8s %12s %12s %8d\n", size_buf, OP_NAMES[r->op], bw_buf, "-", r->threads); } } else { if (r->op == OP_LATENCY) { /* For latency test: report median, stddev, and sample count for statistical validity * Median is robust to outliers and provides reliable central tendency * StdDev indicates measurement precision * Sample count shows measurement effort */ printf("%zu,%s,0,%.2f,%.2f,%d,%d,%d,%.6f\n", size_kb, OP_NAMES[r->op], r->latency_ns, r->latency_stddev_ns, r->latency_samples, r->threads, r->iterations, r->elapsed_s); } else { /* For bandwidth tests, latency fields are 0 */ printf("%zu,%s,%.2f,0,0,0,%d,%d,%.6f\n", size_kb, OP_NAMES[r->op], r->bandwidth_mb_s, r->threads, r->iterations, r->elapsed_s); } } } /* Update summary statistics with a new result */ static void update_summary(const result_t *r) { /* Weight by log2 of size - larger 
sizes get more weight */ double weight = log2((double)r->size / 1024.0 + 1.0); /* Track largest size tested */ if (r->size > g_summary.largest_size_tested) { g_summary.largest_size_tested = r->size; } switch (r->op) { case OP_READ: g_summary.read_count++; if (r->bandwidth_mb_s > g_summary.peak_read_mb_s) { g_summary.peak_read_mb_s = r->bandwidth_mb_s; } g_summary.read_bw_weighted_sum += r->bandwidth_mb_s * weight; g_summary.read_weight_sum += weight; break; case OP_WRITE: g_summary.write_count++; if (r->bandwidth_mb_s > g_summary.peak_write_mb_s) { g_summary.peak_write_mb_s = r->bandwidth_mb_s; } g_summary.write_bw_weighted_sum += r->bandwidth_mb_s * weight; g_summary.write_weight_sum += weight; break; case OP_COPY: g_summary.copy_count++; if (r->bandwidth_mb_s > g_summary.peak_copy_mb_s) { g_summary.peak_copy_mb_s = r->bandwidth_mb_s; } g_summary.copy_bw_weighted_sum += r->bandwidth_mb_s * weight; g_summary.copy_weight_sum += weight; break; case OP_LATENCY: g_summary.latency_count++; /* For latency, track the largest buffer size tested for the most RAM-like result */ if (r->latency_ns > 0 && r->size >= g_summary.largest_size_tested) { /* Always update with the largest size measurement */ g_summary.best_latency_ns = r->latency_ns; } break; } } /* Print summary statistics */ static void print_summary(void) { fprintf(stderr, "\n"); fprintf(stderr, "================================================================================\n"); fprintf(stderr, " BENCHMARK SUMMARY\n"); fprintf(stderr, "================================================================================\n\n"); /* Calculate weighted averages */ if (g_summary.read_weight_sum > 0) { g_summary.weighted_avg_read_mb_s = g_summary.read_bw_weighted_sum / g_summary.read_weight_sum; } if (g_summary.write_weight_sum > 0) { g_summary.weighted_avg_write_mb_s = g_summary.write_bw_weighted_sum / g_summary.write_weight_sum; } if (g_summary.copy_weight_sum > 0) { g_summary.weighted_avg_copy_mb_s = g_summary.copy_bw_weighted_sum / g_summary.copy_weight_sum; } /* Print bandwidth results */ fprintf(stderr, "BANDWIDTH (MB/s):\n"); fprintf(stderr, " %-10s %12s %12s\n", "Operation", "Peak", "Weighted Avg"); fprintf(stderr, " %-10s %12s %12s\n", "---------", "----", "------------"); if (g_summary.read_count > 0) { fprintf(stderr, " %-10s %12.0f %12.0f\n", "Read", g_summary.peak_read_mb_s, g_summary.weighted_avg_read_mb_s); } if (g_summary.write_count > 0) { fprintf(stderr, " %-10s %12.0f %12.0f\n", "Write", g_summary.peak_write_mb_s, g_summary.weighted_avg_write_mb_s); } if (g_summary.copy_count > 0) { fprintf(stderr, " %-10s %12.0f %12.0f\n", "Copy", g_summary.peak_copy_mb_s, g_summary.weighted_avg_copy_mb_s); } /* Print latency results */ if (g_summary.latency_count > 0 && g_summary.best_latency_ns > 0) { fprintf(stderr, "\nLATENCY:\n"); const char *cache_note = ""; if (g_summary.largest_size_tested < 1024 * 1024) { cache_note = " (L2/L3 cache)"; } else if (g_summary.largest_size_tested < 64 * 1024 * 1024) { cache_note = " (L3 cache/RAM)"; } else { cache_note = " (RAM)"; } fprintf(stderr, " Best latency: %.1f ns%s at %zu KB buffer\n", g_summary.best_latency_ns, cache_note, g_summary.largest_size_tested / 1024); } /* Calculate and print composite benchmark score * Score formula: geometric mean of bandwidth scores, divided by latency factor * Higher is better for all components */ fprintf(stderr, "\n"); fprintf(stderr, "--------------------------------------------------------------------------------\n"); fprintf(stderr, "BENCHMARK SCORE (higher is 
better):\n\n"); /* Individual scores */ double bw_total = 0; int bw_count = 0; if (g_summary.peak_read_mb_s > 0) { bw_total += g_summary.peak_read_mb_s; bw_count++; } if (g_summary.peak_write_mb_s > 0) { bw_total += g_summary.peak_write_mb_s; bw_count++; } if (g_summary.peak_copy_mb_s > 0) { bw_total += g_summary.peak_copy_mb_s; bw_count++; } /* Bandwidth score: average of peak bandwidths (in GB/s for nicer numbers) */ double bw_score = 0; if (bw_count > 0) { bw_score = (bw_total / bw_count) / 1000.0; /* Convert MB/s to GB/s */ fprintf(stderr, " Bandwidth Score: %8.1f (avg peak bandwidth in GB/s)\n", bw_score); } /* Latency score: inverse of latency (higher = faster memory) */ double latency_score = 0; if (g_summary.best_latency_ns > 0) { latency_score = 1000.0 / g_summary.best_latency_ns; /* 1000/ns gives reasonable scale */ fprintf(stderr, " Latency Score: %8.1f (1000 / latency_ns)\n", latency_score); } /* Combined score: geometric mean if both available, otherwise just bandwidth */ double combined_score = 0; if (bw_score > 0 && latency_score > 0) { combined_score = sqrt(bw_score * latency_score) * 100; /* Scale for nice numbers */ fprintf(stderr, "\n >> COMBINED SCORE: %8.0f (sqrt(bw_score × latency_score) × 100)\n", combined_score); } else if (bw_score > 0) { combined_score = bw_score * 100; fprintf(stderr, "\n >> COMBINED SCORE: %8.0f (bandwidth only, no latency data)\n", combined_score); } fprintf(stderr, "--------------------------------------------------------------------------------\n"); /* Warn if options that affect score comparability were used */ int has_warnings = 0; if (g_max_runtime > 0 || g_explicit_threads > 0 || g_single_size > 0) { fprintf(stderr, "\n"); fprintf(stderr, "WARNING: Scores may not be comparable due to non-default options:\n"); if (g_max_runtime > 0) { fprintf(stderr, " - Time limit (-t %.0f) may have prevented testing larger buffer sizes\n", g_max_runtime); has_warnings = 1; } if (g_explicit_threads > 0) { fprintf(stderr, " - Fixed thread count (-p %d) instead of using all CPUs (%d)\n", g_explicit_threads, g_num_cpus); has_warnings = 1; } if (g_single_size > 0) { fprintf(stderr, " - Single buffer size (-s %zu KB) instead of full sweep\n", g_single_size / 1024); has_warnings = 1; } if (has_warnings) { fprintf(stderr, "For comparable scores, run without -t, -p, or -s options.\n"); } } fprintf(stderr, "\n"); } /* Maximum buffer size for latency test. * Must exceed largest L3 caches to measure true DRAM latency. * AMD EPYC 9754 (Genoa-X) has 1.1GB L3 cache, so we need > 1.1GB. * 2GB should cover any current processor. */ #define MAX_LATENCY_SIZE (2UL * 1024 * 1024 * 1024) /* 2 GB */ /* Find best configuration for a given buffer size and operation. * * This follows bw_mem's approach: * - buffer_size is the per-thread buffer size * - Total memory = buffer_size * threads (or buffer_size * threads * 2 for copy) * * Three modes: * 1. Auto-scaling (g_auto_scaling=1): Try multiple thread counts, find best * 2. Explicit threads (g_explicit_threads>0): Use exactly that many threads * 3. Default (neither): Use num_cpus threads */ static result_t find_best_config(size_t buffer_size, operation_t op, int *thread_counts, int tc_count) { result_t best = {0}; best.size = buffer_size; best.op = op; /* For latency test: single-thread, statistically valid measurement */ if (op == OP_LATENCY) { size_t max_latency = MAX_LATENCY_SIZE; if (g_total_memory / 4 < max_latency) { max_latency = g_total_memory / 4; } size_t latency_size = (buffer_size > max_latency) ? 
max_latency : buffer_size; double start = get_time(); latency_stats_t stats = measure_latency_stats(latency_size); double elapsed = get_time() - start; best.size = buffer_size; best.op = op; best.threads = 1; best.latency_ns = stats.median_ns; best.latency_mean_ns = stats.mean_ns; best.latency_stddev_ns = stats.stddev_ns; best.latency_cv = stats.cv; best.latency_samples = stats.num_samples; best.elapsed_s = elapsed; best.iterations = stats.num_samples; return best; } /* Bandwidth tests */ int nthreads; if (g_auto_scaling) { /* Auto-scaling mode: try all thread counts, find best */ for (int i = 0; i < tc_count; i++) { nthreads = thread_counts[i]; if (nthreads < 1) continue; int bufs_per_op = (op == OP_COPY) ? 2 : 1; size_t memory_needed = buffer_size * nthreads * bufs_per_op; if (memory_needed > g_total_memory / 4) { continue; } result_t r = run_benchmark_best(buffer_size, op, nthreads); r.size = buffer_size; if (r.bandwidth_mb_s > best.bandwidth_mb_s) { best = r; } } if (best.bandwidth_mb_s == 0) { best = run_benchmark_best(buffer_size, op, 1); best.size = buffer_size; } return best; } /* Fixed thread count mode */ if (g_explicit_threads > 0) { nthreads = g_explicit_threads; } else { nthreads = g_num_cpus; } /* Check memory limit and reduce threads if needed */ int bufs_per_op = (op == OP_COPY) ? 2 : 1; size_t memory_needed = buffer_size * nthreads * bufs_per_op; while (nthreads > 1 && memory_needed > g_total_memory / 4) { nthreads /= 2; memory_needed = buffer_size * nthreads * bufs_per_op; } best = run_benchmark_best(buffer_size, op, nthreads); best.size = buffer_size; return best; } static void run_all_benchmarks(void) { double start_time = get_time(); int tc_count; int *thread_counts = get_thread_counts(&tc_count); /* Single size mode */ if (g_single_size > 0) { if (g_verbose) { fprintf(stderr, "Testing buffer size: %zu KB per thread\n", g_single_size / 1024); } print_csv_header(); for (int op = 0; op < 4 && g_running; op++) { if (!(g_ops_mask & (1 << op))) continue; result_t best = find_best_config(g_single_size, (operation_t)op, thread_counts, tc_count); if (best.bandwidth_mb_s > 0 || best.latency_ns > 0) { print_result(&best); if (g_human_readable) update_summary(&best); fflush(stdout); } } free(thread_counts); if (g_verbose) { double total = get_time() - start_time; fprintf(stderr, "Total runtime: %.1f seconds\n", total); } /* Print summary in human-readable mode */ if (g_human_readable) print_summary(); return; } /* Normal mode: test all sizes */ int size_count; size_t *sizes = get_sizes(&size_count); if (g_verbose) { fprintf(stderr, "Testing %d buffer sizes (per thread, adaptive to cache hierarchy)\n", size_count); if (g_auto_scaling) { fprintf(stderr, "Thread mode: auto-scaling (trying 1-%d threads)\n", g_num_cpus); } else if (g_explicit_threads > 0) { fprintf(stderr, "Thread mode: fixed %d threads\n", g_explicit_threads); } else { fprintf(stderr, "Thread mode: num_cpus (%d threads)\n", g_num_cpus); } fprintf(stderr, "OpenMP: proc_bind(spread) for NUMA-aware thread placement\n"); } print_csv_header(); for (int s = 0; s < size_count && g_running; s++) { size_t size = sizes[s]; for (int op = 0; op < 4 && g_running; op++) { if (!(g_ops_mask & (1 << op))) continue; result_t best = find_best_config(size, (operation_t)op, thread_counts, tc_count); if (best.bandwidth_mb_s > 0 || best.latency_ns > 0) { print_result(&best); if (g_human_readable) update_summary(&best); fflush(stdout); } if (g_max_runtime > 0) { double elapsed = get_time() - start_time; if (elapsed > g_max_runtime) { if 
static void run_all_benchmarks(void)
{
    double start_time = get_time();

    int tc_count;
    int *thread_counts = get_thread_counts(&tc_count);

    /* Single size mode */
    if (g_single_size > 0) {
        if (g_verbose) {
            fprintf(stderr, "Testing buffer size: %zu KB per thread\n", g_single_size / 1024);
        }
        print_csv_header();
        for (int op = 0; op < 4 && g_running; op++) {
            if (!(g_ops_mask & (1 << op))) continue;
            result_t best = find_best_config(g_single_size, (operation_t)op,
                                             thread_counts, tc_count);
            if (best.bandwidth_mb_s > 0 || best.latency_ns > 0) {
                print_result(&best);
                if (g_human_readable) update_summary(&best);
                fflush(stdout);
            }
        }
        free(thread_counts);
        if (g_verbose) {
            double total = get_time() - start_time;
            fprintf(stderr, "Total runtime: %.1f seconds\n", total);
        }
        /* Print summary in human-readable mode */
        if (g_human_readable) print_summary();
        return;
    }

    /* Normal mode: test all sizes */
    int size_count;
    size_t *sizes = get_sizes(&size_count);

    if (g_verbose) {
        fprintf(stderr, "Testing %d buffer sizes (per thread, adaptive to cache hierarchy)\n",
                size_count);
        if (g_auto_scaling) {
            fprintf(stderr, "Thread mode: auto-scaling (trying 1-%d threads)\n", g_num_cpus);
        } else if (g_explicit_threads > 0) {
            fprintf(stderr, "Thread mode: fixed %d threads\n", g_explicit_threads);
        } else {
            fprintf(stderr, "Thread mode: num_cpus (%d threads)\n", g_num_cpus);
        }
        fprintf(stderr, "OpenMP: proc_bind(spread) for NUMA-aware thread placement\n");
    }

    print_csv_header();

    for (int s = 0; s < size_count && g_running; s++) {
        size_t size = sizes[s];
        for (int op = 0; op < 4 && g_running; op++) {
            if (!(g_ops_mask & (1 << op))) continue;
            result_t best = find_best_config(size, (operation_t)op, thread_counts, tc_count);
            if (best.bandwidth_mb_s > 0 || best.latency_ns > 0) {
                print_result(&best);
                if (g_human_readable) update_summary(&best);
                fflush(stdout);
            }
            if (g_max_runtime > 0) {
                double elapsed = get_time() - start_time;
                if (elapsed > g_max_runtime) {
                    if (g_verbose) {
                        fprintf(stderr, "Time limit reached (%.1f s)\n", elapsed);
                    }
                    g_running = 0;
                    break;
                }
            }
        }
    }

    free(sizes);
    free(thread_counts);

    if (g_verbose) {
        double total = get_time() - start_time;
        fprintf(stderr, "Total runtime: %.1f seconds\n", total);
    }

    /* Print summary in human-readable mode */
    if (g_human_readable) print_summary();
}

/* ============================================================================
 * Main
 * ============================================================================ */

static void usage(const char *prog)
{
    fprintf(stderr, "sc-membench %s - Memory Bandwidth Benchmark (OpenMP)\n\n", VERSION);
    fprintf(stderr, "Usage: %s [options]\n\n", prog);
    fprintf(stderr, "Options:\n");
    fprintf(stderr, "  -h          Show this help\n");
    fprintf(stderr, "  -V          Print version and exit\n");
    fprintf(stderr, "  -v          Verbose output (use -vv for more detail)\n");
    fprintf(stderr, "  -s SIZE_KB  Test only this buffer size (in KB), e.g. -s 1024 for 1MB\n");
    fprintf(stderr, "  -f          Full sweep (test all sizes up to memory limit)\n");
    fprintf(stderr, "              Default: test up to 512 MB per thread\n");
    fprintf(stderr, "  -p THREADS  Use exactly this many threads (default: num_cpus)\n");
    fprintf(stderr, "  -a          Auto-scaling: try different thread counts to find best\n");
    fprintf(stderr, "              (slower but finds optimal thread count per buffer size)\n");
    fprintf(stderr, "  -t SECONDS  Maximum runtime, 0 = unlimited (default: unlimited)\n");
    fprintf(stderr, "  -r TRIES    Repeat each test N times, report best (default: %d)\n",
            DEFAULT_BENCHMARK_TRIES);
    fprintf(stderr, "  -o OP       Run only this operation: read, write, copy, or latency\n");
    fprintf(stderr, "              Can be specified multiple times (default: all)\n");
    fprintf(stderr, "  -H          Enable huge pages for large buffers (>= 4MB)\n");
    fprintf(stderr, "              Uses THP (no setup needed) or explicit 2MB pages\n");
    fprintf(stderr, "              Automatically skipped for small buffers\n");
    fprintf(stderr, "  -R          Human-readable output with summary (default: CSV)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "OpenMP Thread Affinity (environment variables):\n");
    fprintf(stderr, "  OMP_PROC_BIND=spread  Spread threads across NUMA nodes (default)\n");
    fprintf(stderr, "  OMP_PLACES=cores      One thread per physical core\n");
    fprintf(stderr, "  OMP_NUM_THREADS=N     Override thread count\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Output: CSV to stdout with columns:\n");
    fprintf(stderr, "  size_kb           - Per-thread buffer size (KB)\n");
    fprintf(stderr, "  operation         - read, write, copy, or latency\n");
    fprintf(stderr, "  bandwidth_mb_s    - Aggregate bandwidth in MB/s (0 for latency)\n");
    fprintf(stderr, "  latency_ns        - Median memory latency in ns (0 for bandwidth)\n");
    fprintf(stderr, "  latency_stddev_ns - Latency standard deviation in ns (0 for bandwidth)\n");
    fprintf(stderr, "  latency_samples   - Number of samples for latency measurement\n");
    fprintf(stderr, "  threads           - Thread count used\n");
    fprintf(stderr, "  iterations        - Iterations performed\n");
    fprintf(stderr, "  elapsed_s         - Elapsed time in seconds\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Latency measurement uses linked list traversal with random node order\n");
    fprintf(stderr, "to defeat prefetchers. Statistical validity ensured via multiple samples\n");
    fprintf(stderr, "until coefficient of variation < 5%% or max samples reached.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Memory model: each thread gets its own buffer.\n");
    fprintf(stderr, "Total memory = size_kb × threads (×2 for copy: src + dst).\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Compile with -DUSE_NUMA -lnuma for explicit NUMA allocation.\n");
}
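/* Example invocations (illustrative; every flag used here is parsed by the
 * getopt loop in main() below):
 *
 *   ./sc-membench -R                  full sweep, human-readable summary
 *   ./sc-membench -o read -o latency  read bandwidth and latency only
 *   ./sc-membench -s 1024 -p 4 -t 60  1 MB buffers, 4 threads, 60 s cap
 */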
int main(int argc, char *argv[])
{
    int opt;
    int ops_specified = 0; /* Track if -o was used */

    while ((opt = getopt(argc, argv, "hvfas:t:r:p:o:VHR")) != -1) {
        switch (opt) {
        case 'h':
            usage(argv[0]);
            return 0;
        case 'V':
            printf("%s\n", VERSION);
            return 0;
        case 'v':
            g_verbose++;
            break;
        case 'f':
            g_full_sweep = 1;
            break;
        case 'a':
            g_auto_scaling = 1;
            break;
        case 'r':
            g_benchmark_tries = atoi(optarg);
            if (g_benchmark_tries < 1) g_benchmark_tries = 1;
            break;
        case 'p':
            g_explicit_threads = atoi(optarg);
            if (g_explicit_threads < 1) {
                fprintf(stderr, "Invalid thread count: %s\n", optarg);
                return 1;
            }
            break;
        case 's': {
            long size_kb = atol(optarg);
            if (size_kb <= 0) {
                fprintf(stderr, "Invalid size: %s\n", optarg);
                return 1;
            }
            g_single_size = (size_t)size_kb * 1024; /* Convert KB to bytes */
            break;
        }
        case 't':
            g_max_runtime = atof(optarg);
            if (g_max_runtime < 0) {
                fprintf(stderr, "Invalid runtime: %s (use 0 for unlimited)\n", optarg);
                return 1;
            }
            break;
        case 'o': {
            /* First -o clears the default "all" mask */
            if (!ops_specified) {
                g_ops_mask = 0;
                ops_specified = 1;
            }
            /* Parse operation name */
            if (strcmp(optarg, "read") == 0) {
                g_ops_mask |= (1 << OP_READ);
            } else if (strcmp(optarg, "write") == 0) {
                g_ops_mask |= (1 << OP_WRITE);
            } else if (strcmp(optarg, "copy") == 0) {
                g_ops_mask |= (1 << OP_COPY);
            } else if (strcmp(optarg, "latency") == 0) {
                g_ops_mask |= (1 << OP_LATENCY);
            } else {
                fprintf(stderr, "Invalid operation: %s (use: read, write, copy, latency)\n",
                        optarg);
                return 1;
            }
            break;
        }
        case 'H':
            g_use_hugepages = 1;
            break;
        case 'R':
            g_human_readable = 1;
            break;
        default:
            usage(argv[0]);
            return 1;
        }
    }

    /* Initialize */
    srand((unsigned int)time(NULL)); /* Seed RNG for pointer chain randomization */
    init_system_info();
    init_numa();

    /* Run benchmarks */
    run_all_benchmarks();

    /* Cleanup */
    cleanup_hwloc();
    return 0;
}
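/* A plausible build line (assumed, not taken from the project's build files;
 * the source file name is a guess based on the tarball name):
 *
 *   gcc -O2 -fopenmp -o sc-membench sc-membench.c -lm
 *
 * -lm is needed for sqrt() in the score calculation; add -DUSE_NUMA -lnuma
 * for explicit NUMA allocation (see usage() above), and link -lhwloc if the
 * hwloc-based topology code referenced by cleanup_hwloc() is enabled.
 */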