pax_global_header00006660000000000000000000000064146345165540014527gustar00rootroot0000000000000052 comment=178b6b759074597777ce13438efb0e0ba625e429 nccl-2.22.3-1/000077500000000000000000000000001463451655400126725ustar00rootroot00000000000000nccl-2.22.3-1/.gitignore000066400000000000000000000001351463451655400146610ustar00rootroot00000000000000# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. /build *.gcov /coverage/ nccl-2.22.3-1/LICENSE.txt000066400000000000000000000035471463451655400145260ustar00rootroot00000000000000 Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National Laboratory, the U.S. Department of Energy, nor the names of their contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. The U.S. Department of Energy funded the development of this software under subcontract 7078610 with Lawrence Berkeley National Laboratory. This code also includes files from the NVIDIA Tools Extension SDK project. See: https://github.com/NVIDIA/NVTX for more information and license details. nccl-2.22.3-1/Makefile000066400000000000000000000012261463451655400143330ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # .PHONY : all clean default : src.build install : src.install BUILDDIR ?= $(abspath ./build) ABSBUILDDIR := $(abspath $(BUILDDIR)) TARGETS := src pkg clean: ${TARGETS:%=%.clean} test.build: src.build LICENSE_FILES := LICENSE.txt LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) lic: $(LICENSE_TARGETS) ${BUILDDIR}/%.txt: %.txt @printf "Copying %-35s > %s\n" $< $@ mkdir -p ${BUILDDIR} cp $< $@ src.%: ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} pkg.%: ${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR} pkg.debian.prep: lic pkg.txz.prep: lic nccl-2.22.3-1/README.md000066400000000000000000000047561463451655400141650ustar00rootroot00000000000000# NCCL Optimized primitives for inter-GPU communication. ## Introduction NCCL (pronounced "Nickel") is a stand-alone library of standard communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, as well as any send/receive based communication pattern. 
It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications. For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html). ## Build Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds. To build the library : ```shell $ cd nccl $ make -j src.build ``` If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with : ```shell $ make src.build CUDA_HOME= ``` NCCL will be compiled and installed in `build/` unless `BUILDDIR` is set. By default, NCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform : ```shell $ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70" ``` ## Install To install NCCL on the system, create a package then install it as root. Debian/Ubuntu : ```shell $ # Install tools to create debian packages $ sudo apt install build-essential devscripts debhelper fakeroot $ # Build NCCL deb package $ make pkg.debian.build $ ls build/pkg/deb/ ``` RedHat/CentOS : ```shell $ # Install tools to create rpm packages $ sudo yum install rpm-build rpmdevtools $ # Build NCCL rpm package $ make pkg.redhat.build $ ls build/pkg/rpm/ ``` OS-agnostic tarball : ```shell $ make pkg.txz.build $ ls build/pkg/txz/ ``` ## Tests Tests for NCCL are maintained separately at https://github.com/nvidia/nccl-tests. ```shell $ git clone https://github.com/NVIDIA/nccl-tests.git $ cd nccl-tests $ make $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g ``` ## Copyright All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. nccl-2.22.3-1/ext-net/000077500000000000000000000000001463451655400142565ustar00rootroot00000000000000nccl-2.22.3-1/ext-net/README.md000066400000000000000000000442621463451655400155450ustar00rootroot00000000000000# NCCL Net Plugin Documentation This page describes the NCCL Net plugin API and how to implement a network plugin for NCCL. # Overview To allow NCCL to work on any network type, NCCL provides a way to use external plugins. Plugins implement the NCCL network API, and decouple NCCL binary builds which are built against a particular version of the GPU stack (i.e. CUDA) from the network code which is built against a particular version of the networking stack. That way, we can easily integrate any CUDA version with any network stack version. NCCL network plugins come as a shared library called `libnccl-net.so`. That shared library contains one or more implementations of the NCCL NET API, in the form of versioned structs, filled with pointers to all required functions. # Plugin architecture ## Plugin name and supporting multiple network plugins When NCCL is initialized, it will look for a `libnccl-net.so` library and dynamically load it, then look for symbols inside the library. The `NCCL_NET_PLUGIN` environment variable allows multiple plugins to coexist. 
If set, NCCL will look for a library with a name of `libnccl-net-${NCCL_NET_PLUGIN}.so`. It is therefore advised to name the library following that pattern, with a symlink pointing `libnccl-net.so` to `libnccl-net-${NCCL_NET_PLUGIN}.so`. That way, if there are multiple plugins in the path, setting `NCCL_NET_PLUGIN` will allow users to select the right plugin. ## Struct versioning Once a library is found, NCCL will look for a symbol named `ncclNet_vX`, with `X` increasing over time. The versioning ensures that the plugin and the NCCL core are compatible. Plugins are encouraged to provide multiple of those symbols, implementing multiple versions of the NCCL NET API, so that the same plugin can be compiled and support a wide range of NCCL versions. Conversely, and to ease transition, NCCL can choose to support different plugin versions, looking for the latest ncclNet struct version, but also looking for older ones so that older plugins would still work. ## In-network collective operations, a.k.a. collNet Additionally to the ncclNet structure, network plugins can provide a collNet structure which implements in-network collective operations, if supported. That can be used by the NCCL collNet algorithm to accelerate inter-node reductions in allReduce. The collNet struct is a different, optional struct provided by the network plugin, but its versioning is tied to the ncclNet struct and many functions are common between the two to ease the implementation. ## Headers management To help users build plugins effortlessly, plugins should copy the `ncclNet_vX` definitions they support to their internal includes. An example is shown in `ext-net/example/` where we keep all headers in the `nccl/` directory and provide thin layers to implement old versions on top of newer ones. The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. # API (v6) Below is the main `ncclNet_v6` struct. Each function is explained in later sections. ``` typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
  /* DMA-BUF support */
  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* sizes);
  // Close and free send/recv comm objects
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;
```

## Error codes

All plugin functions use NCCL error codes as their return value. `ncclSuccess` should be returned upon success. Otherwise, plugins can return one of the following:

- `ncclSystemError` is the most common error for network plugins, when a call to the Linux kernel or a system library fails. This typically includes all network/hardware errors.
- `ncclInternalError` is returned when the NCCL core code is using the network plugin in an incorrect way, for example allocating more requests than it should, or passing an invalid argument to calls.
- `ncclInvalidUsage` should be returned when the error is most likely a user error. This can include misconfiguration, but also size mismatches.
- `ncclInvalidArgument` should usually not be used by plugins since arguments should be checked by the NCCL core layer.
- `ncclUnhandledCudaError` is returned when an error comes from CUDA. Since network plugins should not need to rely on CUDA, this should not be common.

## Operation overview

NCCL will call the `init` function first, then query the number of network devices with the `devices` function, getting each network device's properties with `getProperties`.

To establish a connection between two network devices, NCCL will first call `listen` on the receiving side, pass the returned handle to the sender side of the connection, and call `connect` with that handle. Finally, `accept` will be called on the receiving side to finalize the connection establishment.

Once the connection is established, communication will be done using the functions `isend`, `irecv` and `test`. Prior to calling `isend` or `irecv`, NCCL will call the `regMr` function on all buffers to allow RDMA NICs to prepare buffers. `deregMr` will be used to unregister buffers.

In certain conditions, `iflush` will be called after a receive call completes to allow the network plugin to flush data and ensure the GPU will observe the newly written data.

To close the connections, NCCL will call `closeListen` to close the object returned by `listen`, `closeSend` to close the object returned by `connect`, and `closeRecv` to close the object returned by `accept`.

## API Functions

### Initialization

`name`

The `name` field should point to a character string with the name of the network plugin.
This will be used for all logging, especially when `NCCL_DEBUG=INFO` is set.

Note: setting `NCCL_NET=` will ensure a specific network implementation is used, with a matching `name`. This is not to be confused with `NCCL_NET_PLUGIN`, which defines a suffix to the `libnccl-net.so` library name to load.

`init`

As soon as NCCL finds the plugin and the correct ncclNet symbol, it will call the `init` function. This will allow the plugin to discover network devices and make sure they are usable. If the `init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on internal ones.

To allow the plugin logs to integrate into the NCCL logs seamlessly, NCCL provides a logging function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within the plugin code by adding the following definitions:

```
#define WARN(...) logFunction(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
```

`devices`

Once the plugin is initialized, NCCL will query the number of devices available. It should not be zero, otherwise NCCL initialization will fail. If no device is present or usable, the `init` function should not return `ncclSuccess`.

`getProperties`

Right after getting the number of devices, NCCL will query properties for each available network device. These properties are critical when multiple adapters are present to ensure NCCL uses each adapter in the most optimized way.

The `name` is only used for logging.

The `pciPath` is the base for all topology detection and should point to the PCI device directory in /sys. This is typically the directory pointed to by `/sys/class/net/eth0/device` or `/sys/class/infiniband/mlx5_0/device`. If the network interface is virtual, then `pciPath` should be `NULL`.

The `guid` field is used to determine when network adapters are connected to multiple PCI endpoints. For normal cases, it can be set to the device number. If multiple network devices have the same guid, then NCCL will consider they are sharing the same network port to the fabric, hence it will not use the port multiple times.

The `ptrSupport` field indicates whether or not CUDA pointers are supported. If so, it should be set to `NCCL_PTR_HOST|NCCL_PTR_CUDA`, otherwise it should be set to `NCCL_PTR_HOST`. If the plugin supports `dmabuf`, it should set `ptrSupport` to `NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF` and provide a `regMrDmaBuf` function.

The `regIsGlobal` field allows NCCL to register buffers in advance (using e.g. a loopback connection) and later expect that another registration of a buffer contained within a previous registration will be nearly immediate, as the buffer is already known by the network adapter. A typical implementation would maintain a registration cache; the call to ncclCommRegister will create the initial entry in the cache using regMr() on a loopback connection. Any later call to NCCL operations will call regMr() again on the real connection, with the real buffer (which could be at a different offset within the original buffer, with a smaller size, etc.), then deregMr() right after. The call to ncclCommDeregister should call the final deregMr() and effectively remove the mapping on the network adapter.

The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). This is important to ensure proper optimization of flows within the node.

The `port` field indicates the port number.
This is important again for topology detection and flow optimization within the node when a NIC with a single PCI connection is connected to the fabric with multiple ports.

The `latency` field indicates the network latency in microseconds. This can be useful to improve the NCCL tuning and make sure NCCL switches from tree to ring at the right size.

The `maxComms` field indicates the maximum number of connections we can create.

The `maxRecvs` field indicates the maximum number of grouped receive operations (see grouped receive).

### Connection establishment

Connections are used in a unidirectional manner. There is therefore a sender side and a receiver side.

`listen`

To create a connection, NCCL will start by calling `listen` on the receiver side. This function takes a device number as input argument, and should return a local `listenComm` object, and a `handle` to pass to the other side, so that the sender side can connect to the receiver. The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL. This call should never block, but contrary to `connect` and `accept`, `listenComm` should never be `NULL` if the call succeeds.

`connect`

NCCL will use its bootstrap infrastructure to provide the `handle` to the sender side, then call `connect` on the sender side on a given device index `dev`, providing the `handle`. `connect` should not block either; instead it should set `sendComm` to `NULL` and return `ncclSuccess`. In that case, NCCL will call `connect` again until it succeeds.

`accept`

To finalize the connection, the receiver side will call `accept` on the `listenComm` returned by the previous `listen` call. If the sender did not connect yet, `accept` should not block. It should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it succeeds.

`closeListen`/`closeSend`/`closeRecv`

Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call `closeListen`/`closeSend`/`closeRecv` to free the associated resources.

### Communication

Communication is done using asynchronous send and receive operations: `isend`, `irecv` and `test`. To support RDMA capabilities, buffer registration and flush functions are provided.

To keep track of asynchronous send, receive and flush operations, requests are returned to NCCL, then queried with `test`. Each `sendComm` or `recvComm` must be able to handle `NCCL_NET_MAX_REQUESTS` requests in parallel.

Note: that value should be multiplied by the multi-receive capability of the plugin for the sender side, so that we can effectively have `NCCL_NET_MAX_REQUESTS` multi-receive operations happening in parallel. So, if we have a `maxRecvs` value of 8 and `NCCL_NET_MAX_REQUESTS` is 8, then each `sendComm` must be able to handle up to 8x8=64 concurrent `isend` operations.

`regMr`

Prior to sending or receiving data, NCCL will call `regMr` with any buffers later used for communication. It will provide a `sendComm` or `recvComm` as the `comm` argument, then the buffer pointer `data`, `size`, and `type` being either `NCCL_PTR_HOST`, or `NCCL_PTR_CUDA` if the network supports CUDA pointers. The network plugin can use the output argument `mhandle` to keep any reference to that memory registration, as this `mhandle` will be passed back for all `isend`, `irecv`, `iflush` and `deregMr` calls.

`regMrDmaBuf`

If the plugin has set the `NCCL_PTR_DMABUF` property in `ptrSupport`, NCCL will use `regMrDmaBuf` instead of `regMr`. If the property was not set, `regMrDmaBuf` can be set to `NULL`.
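As an illustration of the registration flow just described, below is a minimal sketch of what `regMr` and `deregMr` could look like for an RDMA plugin built on `libibverbs`. The `pluginComm` structure and its `pd` field are hypothetical plugin-internal names (not part of the NCCL API), only host memory is handled, and error handling is reduced to the NCCL error codes described earlier; the sketch assumes the `ncclResult_t` and `NCCL_PTR_*` definitions from the `nccl/net.h` header of the example plugin shown later in this repository.

```c
#include <infiniband/verbs.h>
#include "net.h"   // ncclResult_t, NCCL_PTR_HOST, ... (see ext-net/example/nccl/)

// Hypothetical plugin-internal comm layout; only the field used here is shown.
struct pluginComm {
  struct ibv_pd* pd;   // protection domain created when the comm was set up
};

// Sketch of regMr: pin the buffer and return the ibv_mr as the opaque mhandle
// that NCCL will pass back to isend/irecv/iflush/deregMr.
static ncclResult_t examplePluginRegMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  struct pluginComm* c = (struct pluginComm*)comm;
  if (type != NCCL_PTR_HOST) return ncclInternalError; // CUDA/dmabuf paths omitted in this sketch
  struct ibv_mr* mr = ibv_reg_mr(c->pd, data, size,
      IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
  if (mr == NULL) return ncclSystemError; // kernel/driver failure
  *mhandle = mr;
  return ncclSuccess;
}

// Sketch of deregMr: release the registration created above.
static ncclResult_t examplePluginDeregMr(void* comm, void* mhandle) {
  (void)comm;
  if (ibv_dereg_mr((struct ibv_mr*)mhandle) != 0) return ncclSystemError;
  return ncclSuccess;
}
```

A production plugin would typically also maintain a registration cache (see `regIsGlobal` above) and add `NCCL_PTR_CUDA`/dmabuf support where the hardware allows it.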
`deregMr` When buffers will no longer be used for communication, NCCL will call `deregMr` to let the plugin free resources. This function is used to deregister handles returned by both `regMr` and `regMrDmaBuf`. `isend` Data will be sent through the connection using `isend`, passing the `sendComm` previously created by `connect`, and the buffer described by `data`, `size`, and `mhandle`. A `tag` must be used if the network supports multi-receive operations (see `irecv`) to distinguish between different sends matching the same multi-receive. Otherwise it can be set to 0. The `isend` operation returns a handle in the `request` argument for further calls to `test`. If the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call `isend` again later. `irecv` To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument `n` will allow NCCL to perform a multi-receive, to allow grouping of multiple sends through a single network connection. Each buffer will be described by the `data`, `sizes`, and `mhandles` arrays. `tags` will specify a tag for each receive so that each of the `n` independent `isend` operations is received into the right buffer. If all receive operations can be initiated, `irecv` will return a handle in the `request` pointer, otherwise it will set it to `NULL`. In the case of multi-receive, all `n` receive operations are handled by a single request handle. The sizes provided to `irecv` can (and will) be larger than the size of the `isend` operation. The contrary (receive size being lower than the send size) is an error, however. Note: for a given connection, send/receive operations should always match in the order they were posted. Tags provided for receive operations are only used to assign a given send operation to one of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag matching on any receive operation posted. `test` After an `isend` or `irecv` operation is initiated, NCCL will call `test` on the request handles until they complete. When that happens, `done` will be set to 1 and `sizes` will be set to the real size sent or received, the latter being potentially lower than the size passed to `irecv`. In the case of a multi-receive, all receives will be considered as done as a single operation (the goal being to allow aggregation), hence they share a single request and a single `done` status. However, they can have different sizes, so when `done` is non-zero, the `sizes` array should contain the `n` sizes corresponding to the buffers passed to `irecv`. Once `test` returns 1 in `done`, the request handle can be freed, meaning that NCCL will never call `test` again on that request (until it is reallocated by another call to `isend` or `irecv`). `iflush` After a receive operation completes, if the operation was targeting GPU memory and received a non-zero number of bytes, NCCL will call `iflush` to let the network flush any buffer and ensure the GPU can read it right after without seeing stale data. This flush operation is decoupled from the `test` code to improve latency of `LL*` protocols, as those are capable of determining when data is valid or not. `iflush` returns a request which needs to be queried with `test` until it completes. 
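To make the request life cycle above more concrete, here is a minimal, hypothetical sketch of how a plugin might back its `test` function with request objects. The `pluginRequest` structure, the `PLUGIN_MAX_RECVS` constant and the `pluginPollCompletions` stub are invented for this example and are not part of the NCCL API; a real plugin would poll its completion queue (for example with `ibv_poll_cq`) in place of the stub and record the actual number of bytes transferred.

```c
#include "net.h"   // ncclResult_t, NCCL_NET_MAX_REQUESTS (see ext-net/example/nccl/)

#define PLUGIN_MAX_RECVS 8   // must match the maxRecvs value advertised in getProperties

// Hypothetical request object: one isend/iflush, or one grouped irecv of up to n buffers.
struct pluginRequest {
  int used;                     // slot is allocated to an outstanding operation
  int nsubs;                    // 1 for isend/iflush, n for a multi-receive
  int done;                     // set once the hardware reports completion
  int sizes[PLUGIN_MAX_RECVS];  // actual bytes transferred for each sub-operation
};

// Stub for hardware progress: a real plugin would poll its completion queue here
// and update r->done and r->sizes from the completions it drains.
static void pluginPollCompletions(struct pluginRequest* r) {
  r->done = 1;
}

// Sketch of test(): non-blocking, reports completion and the real sizes.
static ncclResult_t examplePluginTest(void* request, int* done, int* sizes) {
  struct pluginRequest* r = (struct pluginRequest*)request;
  pluginPollCompletions(r);
  *done = r->done;
  if (r->done) {
    if (sizes) {
      for (int i = 0; i < r->nsubs; i++) sizes[i] = r->sizes[i]; // may be smaller than what irecv posted
    }
    r->used = 0; // NCCL will not call test() again on this request, so the slot can be recycled
  }
  return ncclSuccess;
}
```

Each `sendComm`/`recvComm` would keep `NCCL_NET_MAX_REQUESTS` such slots (multiplied by the multi-receive factor on the send side, as noted above), handing one out from `isend`/`irecv`/`iflush` and recycling it once `test` reports completion.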
nccl-2.22.3-1/ext-net/example/000077500000000000000000000000001463451655400157115ustar00rootroot00000000000000nccl-2.22.3-1/ext-net/example/Makefile000066400000000000000000000006051463451655400173520ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # NCCL_HOME:=../../build/ CUDA_HOME:=/usr/local/cuda INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl PLUGIN_SO:=libnccl-net.so default: $(PLUGIN_SO) $(PLUGIN_SO): plugin.c $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ clean: rm -f $(PLUGIN_SO) nccl-2.22.3-1/ext-net/example/nccl/000077500000000000000000000000001463451655400166305ustar00rootroot00000000000000nccl-2.22.3-1/ext-net/example/nccl/common.h000066400000000000000000000015011463451655400202660ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef COMMON_H_ #define COMMON_H_ typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); #endif nccl-2.22.3-1/ext-net/example/nccl/err.h000066400000000000000000000010011463451655400175610ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_ERR_H_ #define NCCL_ERR_H_ /* Error type for plugins */ typedef enum { ncclSuccess = 0, ncclUnhandledCudaError = 1, ncclSystemError = 2, ncclInternalError = 3, ncclInvalidArgument = 4, ncclInvalidUsage = 5, ncclRemoteError = 6 } ncclResult_t; #endif nccl-2.22.3-1/ext-net/example/nccl/net.h000066400000000000000000000010641463451655400175700ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_H_ #define NCCL_NET_H_ #include #include #include "common.h" #include "err.h" #define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 #define NCCL_PTR_DMABUF 0x4 // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 #include "net_v8.h" #include "net_v7.h" #include "net_v6.h" #include "net_v5.h" #include "net_v4.h" #include "net_v3.h" #include "net_v2.h" #endif // end include guard nccl-2.22.3-1/ext-net/example/nccl/net_device.h000066400000000000000000000021521463451655400211060ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NET_DEVICE_H_ #define NET_DEVICE_H_ #define NCCL_NET_DEVICE_INVALID_VERSION 0x0 #define NCCL_NET_MTU_SIZE 4096 // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. 
#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; typedef struct { ncclNetDeviceType netDeviceType; // Network offload type int netDeviceVersion; // Version number for network offload void* handle; size_t size; int needsProxyProgress; } ncclNetDeviceHandle_v7_t; typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t; #endif nccl-2.22.3-1/ext-net/example/nccl/net_v2.h000066400000000000000000000050361463451655400202020ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_V2_H_ #define NCCL_NET_V2_H_ typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Return the device path in /sys. NCCL will call free on this path. ncclResult_t (*pciPath)(int dev, char** path); // Return whether this device supports host pointers and/or CUDA pointers // as data from the current GPU. Supported types should be composed with // NCCL_PTR_HOST and NCCL_PTR_CUDA. ncclResult_t (*ptrSupport)(int dev, int* supportedTypes); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connectHandle ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* size); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v2_t; #endif // end include guard nccl-2.22.3-1/ext-net/example/nccl/net_v3.h000066400000000000000000000045141463451655400202030ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_V3_H_ #define NCCL_NET_V3_H_ #define NCCL_NET_MAX_REQUESTS_V3 16 typedef ncclNetProperties_v4_t ncclNetProperties_v3_t; typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. 
ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connectHandle ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* size); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v3_t; #endif // end include guard nccl-2.22.3-1/ext-net/example/nccl/net_v4.h000066400000000000000000000054751463451655400202130ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_V4_H_ #define NCCL_NET_V4_H_ #define NCCL_NET_HANDLE_MAXSIZE_V4 64 typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA int speed; // Port speed in Mbps. int port; // Port number. int maxComms; // Maximum number of comms we can create } ncclNetProperties_v4_t; // v4 struct for backwards compatibility typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. 
ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connectHandle ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* size); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v4_t; #endif // end include guard nccl-2.22.3-1/ext-net/example/nccl/net_v5.h000066400000000000000000000054121463451655400202030ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_V5_H_ #define NCCL_NET_V5_H_ typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. 
// May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* sizes); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v5_t; #endif // end include guard nccl-2.22.3-1/ext-net/example/nccl/net_v6.h000066400000000000000000000067201463451655400202070ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_V6_H_ #define NCCL_NET_V6_H_ #define NCCL_NET_MAX_REQUESTS_V6 8 typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int speed; // Port speed in Mbps. int port; // Port number. float latency; // Network latency int maxComms; // Maximum number of comms we can create int maxRecvs; // Maximum number of grouped receives. }ncclNetProperties_v6_t; typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. 
// May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* sizes); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v6_t; #endif // end include guard nccl-2.22.3-1/ext-net/example/nccl/net_v7.h000066400000000000000000000101601463451655400202010ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_V7_H_ #define NCCL_NET_V7_H_ #include "net_device.h" typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int speed; // Port speed in Mbps. int port; // Port number. float latency; // Network latency int maxComms; // Maximum number of comms we can create int maxRecvs; // Maximum number of grouped receives. ncclNetDeviceType netDeviceType; // Network offload type int netDeviceVersion; // Version number for network offload } ncclNetProperties_v7_t; typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. 
// May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* sizes); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); // Copy the given mhandle to a dptr in a format usable by this plugin's device code ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); // Notify the plugin that a recv has completed by the device ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); } ncclNet_v7_t; #endif // end include guard nccl-2.22.3-1/ext-net/example/nccl/net_v8.h000066400000000000000000000107121463451655400202050ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_V8_H_ #define NCCL_NET_V8_H_ #include "net_device.h" typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int regIsGlobal; // regMr is not tied to a particular comm int speed; // Port speed in Mbps. int port; // Port number. float latency; // Network latency int maxComms; // Maximum number of comms we can create int maxRecvs; // Maximum number of grouped receives. ncclNetDeviceType netDeviceType; // Network offload type int netDeviceVersion; // Version number for network offload } ncclNetProperties_v8_t; typedef ncclNetProperties_v8_t ncclNetProperties_t; typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); // Finalize connection establishment after remote peer has called connect. 
// This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* sizes); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); // Copy the given mhandle to a dptr in a format usable by this plugin's device code ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); // Notify the plugin that a recv has completed by the device ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); } ncclNet_v8_t; #endif // end include guard nccl-2.22.3-1/ext-net/example/nccl/types.h000066400000000000000000000011471463451655400201500ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_TYPES_H_ #define NCCL_TYPES_H_ /* Data types */ typedef enum { ncclInt8 = 0, ncclChar = 0, ncclUint8 = 1, ncclInt32 = 2, ncclInt = 2, ncclUint32 = 3, ncclInt64 = 4, ncclUint64 = 5, ncclFloat16 = 6, ncclHalf = 6, ncclFloat32 = 7, ncclFloat = 7, ncclFloat64 = 8, ncclDouble = 8, ncclBfloat16 = 9, } ncclDataType_t; #endif nccl-2.22.3-1/ext-net/example/plugin.c000066400000000000000000000266711463451655400173670ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "net.h" #define __hidden __attribute__ ((visibility("hidden"))) int max_requests = NCCL_NET_MAX_REQUESTS; __hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props) { // Below are default values, if unsure don't change. props->name = "Example"; // Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0 props->pciPath = NULL; // Only used to detect NICs with multiple PCI attachments. props->guid = 0; // Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr can take CUDA pointers. props->ptrSupport = NCCL_PTR_HOST; // If you regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled. props->regIsGlobal = 0; // Speed in *Mbps*. 100000 means 100G props->speed = 100000; // Port number, used in conjunction with guid props->port = 0; // Custom latency (used to help tuning if latency is high. If set to 0, use default NCCL values. props->latency = 0; // Maximum number of comm objects we can create. props->maxComms = 1024*1024; // Maximum number of receive operations taken by irecv(). props->maxRecvs = 1; // Coupling with NCCL network device-side code. props->netDeviceType = 0; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; return ncclInternalError; } __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } __hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} __hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } __hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; } __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; } __hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; } __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return 
ncclInternalError; } #define PLUGIN_NAME "Plugin" const ncclNet_v8_t ncclNetPlugin_v8 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, .getProperties = pluginGetProperties, .listen = pluginListen, .connect = pluginConnect, .accept = pluginAccept, .regMr = pluginRegMr, .regMrDmaBuf = pluginRegMrDmaBuf, .deregMr = pluginDeregMr, .isend = pluginIsend, .irecv = pluginIrecv, .iflush = pluginIflush, .test = pluginTest, .closeSend = pluginCloseSend, .closeRecv = pluginCloseRecv, .closeListen = pluginCloseListen, .getDeviceMr = pluginGetDeviceMr, .irecvConsumed = pluginIrecvConsumed, }; __hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_v7_t* props_v7) { ncclNetProperties_t props; ncclResult_t ret = pluginGetProperties(dev, &props); if (ret != ncclSuccess) return ret; props_v7->name = props.name; props_v7->pciPath = props.pciPath; props_v7->guid = props.guid; props_v7->ptrSupport = props.ptrSupport; props_v7->speed = props.speed; props_v7->port = props.port; props_v7->maxComms = props.maxComms; props_v7->maxRecvs = props.maxRecvs; props_v7->netDeviceType = props.netDeviceType; props_v7->netDeviceVersion = props.netDeviceVersion; return ncclSuccess; } __hidden ncclResult_t pluginRegMr_v7(void* collComm, void* data, int size, int type, void** mhandle) { return pluginRegMr(collComm, data, size, type, mhandle); } const ncclNet_v7_t ncclNetPlugin_v7 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, .getProperties = pluginGetProperties_v7, .listen = pluginListen, .connect = pluginConnect, .accept = pluginAccept, .regMr = pluginRegMr_v7, .regMrDmaBuf = pluginRegMrDmaBuf, .deregMr = pluginDeregMr, .isend = pluginIsend, .irecv = pluginIrecv, .iflush = pluginIflush, .test = pluginTest, .closeSend = pluginCloseSend, .closeRecv = pluginCloseRecv, .closeListen = pluginCloseListen, .getDeviceMr = pluginGetDeviceMr, .irecvConsumed = pluginIrecvConsumed, }; __hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props_v6) { ncclNetProperties_t props; ncclResult_t ret = pluginGetProperties(dev, &props); if (ret != ncclSuccess) return ret; props_v6->name = props.name; props_v6->pciPath = props.pciPath; props_v6->guid = props.guid; props_v6->ptrSupport = props.ptrSupport; props_v6->speed = props.speed; props_v6->port = props.port; props_v6->maxComms = props.maxComms; props_v6->maxRecvs = props.maxRecvs; return ncclSuccess; } __hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sendComm) { return ncclInternalError; } __hidden ncclResult_t pluginAccept_v6(void* listenComm, void** recvComm) { return ncclInternalError; } const ncclNet_v6_t ncclNetPlugin_v6 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, .getProperties = pluginGetProperties_v6, .listen = pluginListen, .connect = pluginConnect_v6, .accept = pluginAccept_v6, .regMr = pluginRegMr_v7, .regMrDmaBuf = pluginRegMrDmaBuf, .deregMr = pluginDeregMr, .isend = pluginIsend, .irecv = pluginIrecv, .iflush = pluginIflush, .test = pluginTest, .closeSend = pluginCloseSend, .closeRecv = pluginCloseRecv, .closeListen = pluginCloseListen }; /* v5 Compat */ const ncclNet_v5_t ncclNetPlugin_v5 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, .getProperties = pluginGetProperties_v6, .listen = pluginListen, .connect = pluginConnect_v6, .accept = pluginAccept_v6, .regMr = pluginRegMr_v7, .deregMr = pluginDeregMr, .isend = pluginIsend, .irecv = pluginIrecv, .iflush = pluginIflush, .test = pluginTest, .closeSend = pluginCloseSend, 
.closeRecv = pluginCloseRecv, .closeListen = pluginCloseListen, }; /* v4 Compat */ static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props_v4) { ncclNetProperties_t props; ncclResult_t ret = pluginGetProperties(dev, &props); if (ret != ncclSuccess) return ret; props_v4->name = props.name; props_v4->pciPath = props.pciPath; props_v4->guid = props.guid; props_v4->ptrSupport = props.ptrSupport; props_v4->speed = props.speed; props_v4->port = props.port; props_v4->maxComms = props.maxComms; return ncclSuccess; } static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) { return pluginIsend(sendComm, data, size, 0, mhandle, request); } static ncclResult_t pluginIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { int tag = 0; return pluginIrecv(recvComm, 1, &data, &size, &tag, &mhandle, request); } static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { return pluginIflush(recvComm, 1, &data, &size, &mhandle, request); } static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) { ncclResult_t ret; do { ncclNetDeviceHandle_v7_t* handle = NULL; ret = pluginConnect(dev, handle, sendComm, &handle); } while (ret == ncclSuccess && *sendComm == NULL); return ret; } static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) { ncclResult_t ret; do { ncclNetDeviceHandle_v7_t* handle = NULL; ret = pluginAccept(listenComm, recvComm, &handle); } while (ret == ncclSuccess && *recvComm == NULL); return ret; } const ncclNet_v4_t ncclNetPlugin_v4 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, .getProperties = pluginGetProperties_v4, .listen = pluginListen, .connect = pluginConnect_v4, .accept = pluginAccept_v4, .regMr = pluginRegMr_v7, .deregMr = pluginDeregMr, .isend = pluginIsend_v4, .irecv = pluginIrecv_v4, .iflush = pluginIflush_v4, .test = pluginTest, .closeSend = pluginCloseSend, .closeRecv = pluginCloseRecv, .closeListen = pluginCloseListen, }; /* v3 Compat */ static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhandle) { void* req; ncclResult_t ret = pluginIflush_v4(recvComm, data, size, mhandle, &req); int done = 0; while (ret == ncclSuccess && done == 0) { ret = pluginTest(req, &done, NULL); } return ret; } static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) { max_requests = NCCL_NET_MAX_REQUESTS_V3; return pluginInit(logFunction); } #include <string.h> static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) { char pluginHandle[NCCL_NET_HANDLE_MAXSIZE]; ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm); memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4); return ret; } static ncclResult_t pluginConnect_v3(int dev, void* handle, void** sendComm) { char pluginHandle[NCCL_NET_HANDLE_MAXSIZE]; memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V4); return pluginConnect_v4(dev, &pluginHandle, sendComm); } const ncclNet_v3_t ncclNetPlugin_v3 = { .name = PLUGIN_NAME, .init = pluginInit_v3, .devices = pluginDevices, .getProperties = pluginGetProperties_v4, .listen = pluginListen_v3, .connect = pluginConnect_v3, .accept = pluginAccept_v4, .regMr = pluginRegMr_v7, .deregMr = pluginDeregMr, .isend = pluginIsend_v4, .irecv = pluginIrecv_v4, .flush = pluginFlush, .test = pluginTest, .closeSend = pluginCloseSend, .closeRecv = pluginCloseRecv, .closeListen = pluginCloseListen, }; /* v2 Compat */ const ncclNet_v2_t ncclNetPlugin_v2 = { .name =
PLUGIN_NAME, .init = pluginInit_v3, .devices = pluginDevices, .pciPath = pluginPciPath, .ptrSupport = pluginPtrSupport, .listen = pluginListen, .connect = pluginConnect_v4, .accept = pluginAccept_v4, .regMr = pluginRegMr_v7, .deregMr = pluginDeregMr, .isend = pluginIsend_v4, .irecv = pluginIrecv_v4, .flush = pluginFlush, .test = pluginTest, .closeSend = pluginCloseSend, .closeRecv = pluginCloseRecv, .closeListen = pluginCloseListen, }; nccl-2.22.3-1/ext-net/google-fastsocket/000077500000000000000000000000001463451655400176765ustar00rootroot00000000000000nccl-2.22.3-1/ext-net/google-fastsocket/Makefile000066400000000000000000000007701463451655400213420ustar00rootroot00000000000000CUDA_HOME?=/usr/local/cuda INC:=-I$(CUDA_HOME)/include PLUGIN_SO:=libnccl-net.so default: $(PLUGIN_SO) $(PLUGIN_SO): nccl-fastsocket/*.cc $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ nccl-fastsocket/*.cc: git clone https://github.com/google/nccl-fastsocket.git install: $(BUILDDIR)/lib/$(PLUGIN_SO) $(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO) @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(BUILDDIR)/lib install -m 644 $< $@ clean: rm -f $(PLUGIN_SO) rm -Rf nccl-fastsocket nccl-2.22.3-1/ext-tuner/000077500000000000000000000000001463451655400146255ustar00rootroot00000000000000nccl-2.22.3-1/ext-tuner/example/000077500000000000000000000000001463451655400162605ustar00rootroot00000000000000nccl-2.22.3-1/ext-tuner/example/Makefile000066400000000000000000000006071463451655400177230ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # NCCL_HOME:=../../build/ CUDA_HOME:=/usr/local/cuda INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl PLUGIN_SO:=libnccl-tuner.so default: $(PLUGIN_SO) $(PLUGIN_SO): plugin.c $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ clean: rm -f $(PLUGIN_SO) nccl-2.22.3-1/ext-tuner/example/nccl/000077500000000000000000000000001463451655400171775ustar00rootroot00000000000000nccl-2.22.3-1/ext-tuner/example/nccl/common.h000066400000000000000000000015011463451655400206350ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef COMMON_H_ #define COMMON_H_ typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); #endif nccl-2.22.3-1/ext-tuner/example/nccl/err.h000066400000000000000000000010011463451655400201300ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
 */ #ifndef NCCL_ERR_H_ #define NCCL_ERR_H_ /* Error type for plugins */ typedef enum { ncclSuccess = 0, ncclUnhandledCudaError = 1, ncclSystemError = 2, ncclInternalError = 3, ncclInvalidArgument = 4, ncclInvalidUsage = 5, ncclRemoteError = 6 } ncclResult_t; #endif nccl-2.22.3-1/ext-tuner/example/nccl/tuner.h000066400000000000000000000064011463451655400205060ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_TUNER_H_ #define NCCL_TUNER_H_ #include <stddef.h> #include <stdint.h> #include "common.h" #include "err.h" #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now typedef enum { ncclFuncBroadcast = 0, ncclFuncReduce = 1, ncclFuncAllGather = 2, ncclFuncReduceScatter = 3, ncclFuncAllReduce = 4, ncclFuncSendRecv = 5, ncclFuncSend = 6, ncclFuncRecv = 7, ncclNumFuncs = 8 } ncclFunc_t; #define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* #define NCCL_ALGO_UNDEF -1 #define NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 #define NCCL_ALGO_COLLNET_DIRECT 2 #define NCCL_ALGO_COLLNET_CHAIN 3 #define NCCL_ALGO_NVLS 4 #define NCCL_ALGO_NVLS_TREE 5 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 #define NCCL_PROTO_UNDEF -1 #define NCCL_PROTO_LL 0 #define NCCL_PROTO_LL128 1 #define NCCL_PROTO_SIMPLE 2 #define NCCL_ALGO_PROTO_IGNORE -1.0 // API to be implemented by external tuner typedef struct { // Name of the tuner const char* name; // Initializes tuner states. // Inputs: // - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner. // - nNodes: number of nodes in current communicator. // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. // Outputs: // - context: tuner context object ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); // Gets info (algo, protocol, number of CTAs and threads) for a given collective. // Inputs: // - context: tuner context object // - collType: collective type, e.g., allreduce, allgather… // - nBytes: collective size in bytes // - numPipeOps: number of operations in the group // - numAlgo: number of algorithms in collCostTable // - numProto: number of protocols in collCostTable // // Outputs: // - nChannels: number of channels (hence SMs) to be used. // // InOut: // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). // // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the // default tuning for the given collective. // Also, the plugin is allowed to leave all outputs unset, or to set both the // algorithm and the protocol, but it may not set only one of the two. // Unset fields will be set automatically by NCCL. ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int* nChannels); // Terminates the plugin and cleans up any resources that the plugin allocated.
// context: tuner context object ncclResult_t (*destroy)(void* context); } ncclTuner_v3_t; typedef ncclTuner_v3_t ncclTuner_t; #define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3" #endif nccl-2.22.3-1/ext-tuner/example/plugin.c000066400000000000000000000024041463451655400177220ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "tuner.h" #define __hidden __attribute__ ((visibility("hidden"))) __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int* nChannels) { // Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; } *nChannels = 1; return ncclSuccess; } __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } #define PLUGIN_NAME "Example" const ncclTuner_v3_t ncclTunerPlugin_v3 = { .name = PLUGIN_NAME, .init = pluginInit, .getCollInfo = pluginGetCollInfo, .destroy = pluginDestroy }; nccl-2.22.3-1/makefiles/000077500000000000000000000000001463451655400146325ustar00rootroot00000000000000nccl-2.22.3-1/makefiles/common.mk000066400000000000000000000100431463451655400164510ustar00rootroot00000000000000# # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 KEEP ?= 0 DEBUG ?= 0 ASAN ?= 0 TRACE ?= 0 PROFAPI ?= 1 NVTX ?= 1 RDMA_CORE ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev) CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) #$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR}) # You should define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. 
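# For example (an illustrative value, not a project default), building only for an sm_80 target could use
#   export NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80"
# before invoking make; when NVCC_GENCODE is not set, the defaults chosen below apply.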
CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \ -gencode=arch=compute_60,code=sm_60 \ -gencode=arch=compute_61,code=sm_61 ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0) # SM35 is deprecated from CUDA12.0 onwards CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35 endif CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90 CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 CUDA12_PTX = -gencode=arch=compute_90,code=compute_90 ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) # Include Hopper support if we're using CUDA11.8 or above NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX) # Include Volta support if we're using CUDA9 or above else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0) NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) else NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX) endif $(info NVCC_GENCODE is ${NVCC_GENCODE}) CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \ -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \ -I $(CUDA_INC) \ $(CXXFLAGS) # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) # 512 : 120, 640 : 96, 768 : 80, 1024 : 60 # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all # Use addprefix so that we can specify more than one path NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt ########## GCOV ########## GCOV ?= 0 # disable by default. GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # only gcov=1 and debug =1 CXXFLAGS += ${GCOV_FLAGS} NVCUFLAGS += ${GCOV_FLAGS:%=-Xcompiler %} LDFLAGS += ${GCOV_FLAGS} NVLDFLAGS += ${GCOV_FLAGS:%=-Xcompiler %} # $(warning GCOV_FLAGS=${GCOV_FLAGS}) ########## GCOV ########## ifeq ($(DEBUG), 0) NVCUFLAGS += -O3 CXXFLAGS += -O3 -g else NVCUFLAGS += -O0 -G -g CXXFLAGS += -O0 -g -ggdb3 endif # Make sure to run with ASAN_OPTIONS=protect_shadow_gap=0 otherwise CUDA will fail with OOM ifneq ($(ASAN), 0) CXXFLAGS += -fsanitize=address LDFLAGS += -fsanitize=address -static-libasan NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan endif ifneq ($(VERBOSE), 0) NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter CXXFLAGS += -Wall -Wextra else .SILENT: endif ifneq ($(TRACE), 0) CXXFLAGS += -DENABLE_TRACE endif ifeq ($(NVTX), 0) CXXFLAGS += -DNVTX_DISABLE endif ifneq ($(KEEP), 0) NVCUFLAGS += -keep endif ifneq ($(PROFAPI), 0) CXXFLAGS += -DPROFAPI endif ifneq ($(RDMA_CORE), 0) CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 endif nccl-2.22.3-1/makefiles/formatting.mk000066400000000000000000000022361463451655400173400ustar00rootroot00000000000000# # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # # Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting # As this file defines a new target (format), it should be included at least after the definition of the # default target. ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none ASTYLEDIR := $(BUILDDIR)/contrib ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/ ASTYLEVER := 3.1 ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz" $(ASTYLEDIR) : @mkdir -p $(ASTYLEDIR) $(ASTYLETAR) : $(ASTYLEDIR) @wget -q -O $(ASTYLETAR) $(ASTYLEURL) $(ASTYLEBLD) : $(ASTYLETAR) @cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR) $(ASTYLEBIN) : $(ASTYLEBLD) ${MAKE} -C $(ASTYLEBLD) .PHONY : format format : $(ASTYLEBIN) @$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT) nccl-2.22.3-1/makefiles/version.mk000066400000000000000000000001471463451655400166520ustar00rootroot00000000000000##### version NCCL_MAJOR := 2 NCCL_MINOR := 22 NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 nccl-2.22.3-1/pkg/000077500000000000000000000000001463451655400134535ustar00rootroot00000000000000nccl-2.22.3-1/pkg/Makefile000066400000000000000000000010161463451655400151110ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # .PHONY : all clean default : build build : debian.build txz.build BUILDDIR ?= $(abspath ../build) ABSBUILDDIR := $(abspath $(BUILDDIR)) TARGETS := debian txz all: ${TARGETS:%=%.build} prep: ${TARGETS:%=%.prep} build: ${TARGETS:%=%.build} clean: ${TARGETS:%=%.clean} %.prep: ${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR} %.build: ${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR} %.clean: ${MAKE} -C $* clean nccl-2.22.3-1/pkg/debian/000077500000000000000000000000001463451655400146755ustar00rootroot00000000000000nccl-2.22.3-1/pkg/debian/.gitignore000066400000000000000000000001211463451655400166570ustar00rootroot00000000000000/*.debhelper.log /*.debhelper /*.substvars /tmp/ /files /libnccl1/ /libnccl-dev/ nccl-2.22.3-1/pkg/debian/Makefile000066400000000000000000000031501463451655400163340ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # include ../../makefiles/common.mk include ../../makefiles/version.mk BUILDDIR ?= $(abspath ../../build) DEBPREPDIR := $(BUILDDIR)/debian PKGDIR := $(BUILDDIR)/pkg/deb/ DEBGEN_IN := $(wildcard *.in) DEBGEN := $(DEBGEN_IN:.in=) DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN) DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES)) PKG_TIMESTAMP := $(shell date -R) PKG_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH) PKG_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) prep : $(DEBTARGETS) $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) build : prep $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) @printf "Building Debian package\n" (cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b) mkdir -p $(PKGDIR) mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/ clean: rm -Rf $(DEBPREPDIR) $(PKGDIR) $(DEBPREPDIR)/% : %.in @printf "Generating %-35s > %s\n" $< $@ mkdir -p $(DEBPREPDIR) sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ $< > $@ $(DEBPREPDIR)/% : % @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(DEBPREPDIR) cp -f $< $@ nccl-2.22.3-1/pkg/debian/changelog.in000066400000000000000000000003471463451655400171600ustar00rootroot00000000000000nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium * Automatic Debian package from build -- cudatools ${pkg:Timestamp} nccl-2.22.3-1/pkg/debian/compat000066400000000000000000000000021463451655400160730ustar00rootroot000000000000009 nccl-2.22.3-1/pkg/debian/control.in000066400000000000000000000023511463451655400167060ustar00rootroot00000000000000Source: nccl Section: libs Maintainer: cudatools Priority: optional Build-depends: debhelper(>=9) Standards-Version: 3.9.5 Package: libnccl${nccl:Major} Section: libs Architecture: ${pkg:Arch} Depends: ${misc:Depends}, ${shlibs:Depends} Description: NVIDIA Collective Communication Library (NCCL) Runtime NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on any platform using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. Package: libnccl-dev Section: libdevel Architecture: ${pkg:Arch} Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version}) Description: NVIDIA Collective Communication Library (NCCL) Development Files NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on any platform using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. 
nccl-2.22.3-1/pkg/debian/copyright000077700000000000000000000000001463451655400210742../../LICENSE.txtustar00rootroot00000000000000nccl-2.22.3-1/pkg/debian/gbp.conf000066400000000000000000000001641463451655400163150ustar00rootroot00000000000000[DEFAULT] debian-branch = master upstream-branch = master ignore-new = True [git-buildpackage] no-purge = True nccl-2.22.3-1/pkg/debian/libnccl-dev.install.in000066400000000000000000000002241463451655400210520ustar00rootroot00000000000000include/nccl.h /usr/include include/nccl_net.h /usr/include lib/libnccl.so /usr/lib/${pkg:MultiArch} lib/libnccl_static.a /usr/lib/${pkg:MultiArch} nccl-2.22.3-1/pkg/debian/libnccl2.install.in000066400000000000000000000002121463451655400203550ustar00rootroot00000000000000lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch} lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch} nccl-2.22.3-1/pkg/debian/rules000077500000000000000000000003011463451655400157470ustar00rootroot00000000000000#!/usr/bin/make -f %: dh $@ --parallel override_dh_auto_install: PREFIX=debian/tmp dh_auto_install override_dh_auto_test: # Do not make test override_dh_auto_clean: # Do not make clean nccl-2.22.3-1/pkg/debian/source/000077500000000000000000000000001463451655400161755ustar00rootroot00000000000000nccl-2.22.3-1/pkg/debian/source/format000066400000000000000000000000151463451655400174040ustar00rootroot000000000000003.0 (native) nccl-2.22.3-1/pkg/redhat/000077500000000000000000000000001463451655400147225ustar00rootroot00000000000000nccl-2.22.3-1/pkg/redhat/Makefile000066400000000000000000000037041463451655400163660ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # include ../../makefiles/common.mk include ../../makefiles/version.mk BUILDDIR ?= $(abspath ../../build) RPMPREPDIR := $(BUILDDIR)/redhat PKGDIR := $(BUILDDIR)/pkg/rpm/ RPMGEN_IN := $(wildcard *.in) RPMGEN := $(RPMGEN_IN:.in=) RPMFILES := $(RPMGEN) RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES)) PKG_TIMESTAMP := $(shell date -R) ARCH := $(shell uname -m) PKG_ARCH ?= $(shell uname -m) PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch) ifeq ($(PKG_MULTIARCH),) # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it PKG_MULTIARCH := $(ARCH)-linux-gnu endif prep : $(RPMTARGETS) $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) build : prep $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) $(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR) @printf "Building Redhat package\n" mkdir -p $(PKGDIR) rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \ --define "_rpmdir $(PKGDIR)" \ --define "_builddir $(PKGDIR)/build/" \ --define "_buildrootdir $(PKGDIR)/buildroot/" \ -bb $(BUILDDIR)/redhat/nccl.spec clean: rm -Rf $(RPMPREPDIR) $(PKGDIR) $(RPMPREPDIR)/% : %.in @printf "Generating %-35s > %s\n" $< $@ mkdir -p $(RPMPREPDIR) sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ $< > $@ $(RPMPREPDIR)/% : % @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(RPMPREPDIR) cp -f $< $@ nccl-2.22.3-1/pkg/redhat/nccl.spec.in000066400000000000000000000047631463451655400171340ustar00rootroot00000000000000Name: libnccl Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix} Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor} Summary: NVIDIA Collective Communication Library (NCCL) Runtime Group: Development/Libraries License: BSD URL: http://developer.nvidia.com/nccl Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz Requires(pre,preun): /sbin/ldconfig %description NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on any platform using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. 
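# The devel and static subpackages defined below carry the headers, the shared-library symlink and the
# static archive, while the base package ships only the runtime shared library.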
%package devel Summary: NVIDIA Collective Communication Library (NCCL) Runtime Group: Development/Libraries %description devel NCCL development files %package static Summary: NVIDIA Collective Communication Library (NCCL) Runtime Group: Development/Libraries %description static NCCL static library %define debug_package %{nil} %prep %setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q %build %install rm -rf $RPM_BUILD_ROOT install -m 755 -d $RPM_BUILD_ROOT install -m 755 -d $RPM_BUILD_ROOT/%{_libdir} install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir} ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major} # devel install -m 755 -d $RPM_BUILD_ROOT/%{_includedir} install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir} install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir} ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so # static install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir} %post -p /sbin/ldconfig %postun -p /sbin/ldconfig %post devel -p /sbin/ldconfig %postun devel -p /sbin/ldconfig %clean rm -rf $RPM_BUILD_ROOT %files devel %doc LICENSE.txt %defattr(-,root,root,-) %{_includedir}/nccl.h %{_includedir}/nccl_net.h %{_libdir}/libnccl.so %files static %doc LICENSE.txt %defattr(-,root,root,-) %{_libdir}/libnccl_static.a %files %doc LICENSE.txt %defattr(-,root,root,-) %{_libdir}/libnccl.so.${nccl:Major} %{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} %changelog nccl-2.22.3-1/pkg/srctxz/000077500000000000000000000000001463451655400150105ustar00rootroot00000000000000nccl-2.22.3-1/pkg/srctxz/Makefile000066400000000000000000000020511463451655400164460ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # include ../../makefiles/common.mk include ../../makefiles/version.mk BUILDDIR ?= $(abspath ../../build) TXZPREPDIR := $(BUILDDIR)/srctxz PKGDIR := $(BUILDDIR)/pkg/srctxz/ TXZGEN_IN := $(wildcard *.in) TXZGEN := $(TXZGEN_IN:.in=) TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) PKG_REVISION ?= 3 PKG_ARCH := $(shell uname -m) prep: $(TXZTARGETS) build: prep $(MAKE) -C ../../src clean @printf "Building source tar.xz package\n" (cd $(BUILDDIR); bash srctxz/create_srctxz.sh) mkdir -p $(PKGDIR) mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR) clean: rm -Rf $(TXZPREPDIR) $(PKGDIR) $(TXZPREPDIR)/% : %.in @printf "Generating %-35s > %s\n" $< $@ mkdir -p $(TXZPREPDIR) sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ $< > $@ nccl-2.22.3-1/pkg/srctxz/create_srctxz.sh.in000066400000000000000000000015101463451655400206260ustar00rootroot00000000000000#!/bin/bash # # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # # To run from $BUILDDIR/ cd .. NCCLDIR=`basename $PWD` echo "Checking for unclean directory ..." git clean -x -i echo "Clean done." echo "Checking for uncommited files ..." if [ "`git status -s | wc -l`" != "0" ]; then git status -s echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)" read fi cd .. 
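# The ${nccl:*} and ${pkg:*} placeholders below are filled in by the sed rules in
# pkg/srctxz/Makefile, using the values defined in makefiles/version.mk.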
NCCL_MAJOR=${nccl:Major} NCCL_MINOR=${nccl:Minor} NCCL_PATCH=${nccl:Patch} NCCL_SUFFIX=${nccl:Suffix} NCCL_BUILD=${pkg:Revision} NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}" tar --exclude build \ --exclude ".git*" \ --exclude pkg/srctxz \ --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR nccl-2.22.3-1/pkg/txz/000077500000000000000000000000001463451655400143005ustar00rootroot00000000000000nccl-2.22.3-1/pkg/txz/Makefile000066400000000000000000000022751463451655400157460ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # include ../../makefiles/common.mk include ../../makefiles/version.mk BUILDDIR ?= $(abspath ../../build) TXZPREPDIR := $(BUILDDIR)/txz PKGDIR := $(BUILDDIR)/pkg/txz/ TXZGEN_IN := $(wildcard *.in) TXZGEN := $(TXZGEN_IN:.in=) TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) PKG_ARCH := $(shell uname -m) prep: $(TXZTARGETS) $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) build: prep $(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR) @printf "Building tar.xz package\n" (cd $(BUILDDIR); bash txz/create_txz.sh) mkdir -p $(PKGDIR) mv $(BUILDDIR)/../nccl*.txz $(PKGDIR) clean: rm -Rf $(TXZPREPDIR) $(PKGDIR) $(TXZPREPDIR)/% : %.in @printf "Generating %-35s > %s\n" $< $@ mkdir -p $(TXZPREPDIR) sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ $< > $@ nccl-2.22.3-1/pkg/txz/create_txz.sh.in000066400000000000000000000012141463451655400174070ustar00rootroot00000000000000#!/bin/bash # # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # # To run from $BUILDDIR/ BUILDDIR=`basename $PWD` cd .. NCCL_MAJOR=${nccl:Major} NCCL_MINOR=${nccl:Minor} NCCL_PATCH=${nccl:Patch} NCCL_SUFFIX=${nccl:Suffix} CUDA_MAJOR=${cuda:Major} CUDA_MINOR=${cuda:Minor} PKG_REVISION=${pkg:Revision} PKG_ARCH=${pkg:Arch} NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}" tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt nccl-2.22.3-1/src/000077500000000000000000000000001463451655400134615ustar00rootroot00000000000000nccl-2.22.3-1/src/Makefile000066400000000000000000000107241463451655400151250ustar00rootroot00000000000000# # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # include ../makefiles/common.mk include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ init.cc init_nvtx.cc net.cc proxy.cc transport.cc register.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) ##### lib files LIBNAME := libnccl.so STATICLIBNAME := libnccl_static.a ##### pkgconfig files PKGCONFIGFILE := nccl.pc ##### dirs BUILDDIR ?= $(abspath ../build) INCDIR := $(BUILDDIR)/include LIBDIR := $(BUILDDIR)/lib OBJDIR := $(BUILDDIR)/obj PKGDIR := $(BUILDDIR)/lib/pkgconfig ##### target files CUDARTLIB ?= cudart_static ifeq ($(CUDARTLIB), cudart_static) # Use compatibility shim only with static cudart; see https://github.com/NVIDIA/nccl/issues/658 LIBSRCFILES += enhcompat.cc endif INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%) LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR)) LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) STATICLIBTARGET := $(STATICLIBNAME) PKGTARGET := $(PKGCONFIGFILE) LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl DEVMANIFEST := $(BUILDDIR)/obj/device/manifest ##### rules build : lib staticlib lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET) staticlib : $(LIBDIR)/$(STATICLIBTARGET) $(DEVMANIFEST): ALWAYS_REBUILD $(INCTARGETS) $(MAKE) -C ./device # Empty target to force rebuild ALWAYS_REBUILD: -include $(DEPFILES) $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ) $(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk # NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z)) @$(eval NCCL_VERSION := $(shell printf "%d%02d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH))) mkdir -p $(INCDIR) @printf "Generating %-35s > %s\n" $< $@ sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \ $< > $@ $(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVMANIFEST) @printf "Linking %-35s > %s\n" $(LIBTARGET) $@ mkdir -p $(LIBDIR) $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS) ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVMANIFEST) @printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@ mkdir -p $(LIBDIR) ar cr $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(PKGDIR)/nccl.pc : nccl.pc.in mkdir -p $(PKGDIR) @printf "Generating %-35s > %s\n" $< $@ sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \ -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ $< > $@ $(INCDIR)/%.h : %.h @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(INCDIR) install -m 644 $< $@ $(INCDIR)/nccl_%.h : include/nccl_%.h @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(INCDIR) install -m 644 $< $@ $(PKGDIR)/%.pc : %.pc @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(PKGDIR) install -m 644 $< $@ $(OBJDIR)/%.o : %.cc $(INCTARGETS) @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@ @$(CXX) -I. 
-I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) @rm -f $(@:%.o=%.d.tmp) clean : $(MAKE) -C device clean rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} install : build mkdir -p $(PREFIX)/lib mkdir -p $(PREFIX)/lib/pkgconfig mkdir -p $(PREFIX)/include cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/ cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/ cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h') # Note that formatting.mk defines a new target so in order to not overwrite the default target, # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well # as the BUILDDIR variable. include ../makefiles/formatting.mk nccl-2.22.3-1/src/bootstrap.cc000066400000000000000000000622061463451655400160130ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #include "core.h" #include "utils.h" #include "bootstrap.h" #include "net.h" #include #include #include "proxy.h" #include "param.h" struct bootstrapRootArgs { struct ncclSocket* listenSock; uint64_t magic; }; /* Init functions */ static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1]; static union ncclSocketAddress bootstrapNetIfAddr; static int bootstrapNetInitDone = 0; pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; ncclResult_t bootstrapNetInit() { if (bootstrapNetInitDone == 0) { pthread_mutex_lock(&bootstrapNetLock); if (bootstrapNetInitDone == 0) { const char* env = ncclGetEnv("NCCL_COMM_ID"); if (env) { union ncclSocketAddress remoteAddr; if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) { WARN("Invalid NCCL_COMM_ID, please use format: : or []: or :"); pthread_mutex_unlock(&bootstrapNetLock); return ncclInvalidArgument; } if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { WARN("NET/Socket : No usable listening interface found"); pthread_mutex_unlock(&bootstrapNetLock); return ncclSystemError; } } else { int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); if (nIfs <= 0) { WARN("Bootstrap : no socket interface found"); pthread_mutex_unlock(&bootstrapNetLock); return ncclInternalError; } } char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2]; sprintf(line, " %s:", bootstrapNetIfName); ncclSocketToString(&bootstrapNetIfAddr, line+strlen(line)); INFO(NCCL_INIT, "Bootstrap : Using%s", line); bootstrapNetInitDone = 1; } pthread_mutex_unlock(&bootstrapNetLock); } return ncclSuccess; } /* Socket Interface Selection type */ enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 }; // Additional sync functions static ncclResult_t bootstrapNetSend(struct ncclSocket* sock, void* data, int size) { NCCLCHECK(ncclSocketSend(sock, &size, sizeof(int))); NCCLCHECK(ncclSocketSend(sock, data, size)); return ncclSuccess; } static ncclResult_t bootstrapNetRecv(struct ncclSocket* sock, void* data, int size) { int recvSize; NCCLCHECK(ncclSocketRecv(sock, &recvSize, sizeof(int))); if (recvSize > size) { 
WARN("Message truncated : received %d bytes instead of %d", recvSize, size); return ncclInternalError; } NCCLCHECK(ncclSocketRecv(sock, data, std::min(recvSize, size))); return ncclSuccess; } static ncclResult_t bootstrapNetSendRecv(struct ncclSocket* sendSock, void* sendData, int sendSize, struct ncclSocket* recvSock, void* recvData, int recvSize) { int senderRecvSize; NCCLCHECK(ncclSocketSendRecv(sendSock, &sendSize, sizeof(int), recvSock, &senderRecvSize, sizeof(int))); if (senderRecvSize > recvSize) { WARN("Message truncated : received %d bytes instead of %d", senderRecvSize, recvSize); return ncclInternalError; } NCCLCHECK(ncclSocketSendRecv(sendSock, sendData, sendSize, recvSock, recvData, recvSize)); return ncclSuccess; } struct extInfo { int rank; int nranks; union ncclSocketAddress extAddressListenRoot; union ncclSocketAddress extAddressListen; }; #include static ncclResult_t setFilesLimit() { struct rlimit filesLimit; SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit"); filesLimit.rlim_cur = filesLimit.rlim_max; SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit"); return ncclSuccess; } static void *bootstrapRoot(void* rargs) { struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs; struct ncclSocket* listenSock = args->listenSock; uint64_t magic = args->magic; ncclResult_t res = ncclSuccess; int nranks = 0, c = 0; struct extInfo info; union ncclSocketAddress *rankAddresses = NULL; union ncclSocketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange union ncclSocketAddress *zero = NULL; NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out); setFilesLimit(); TRACE(NCCL_INIT, "BEGIN"); /* Receive addresses from all ranks */ do { struct ncclSocket sock; NCCLCHECKGOTO(ncclSocketInit(&sock), res, out); NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out); NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out); NCCLCHECKGOTO(ncclSocketClose(&sock), res, out); if (c == 0) { nranks = info.nranks; NCCLCHECKGOTO(ncclCalloc(&rankAddresses, nranks), res, out); NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nranks), res, out); } if (nranks != info.nranks) { WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks); goto out; } if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union ncclSocketAddress)) != 0) { WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks); goto out; } // Save the connection handle for that rank memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union ncclSocketAddress)); memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union ncclSocketAddress)); ++c; TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks); } while (c < nranks); TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks); // Send the connect handle for the next rank in the AllGather ring for (int r=0; raddr, handle->magic, ncclSocketTypeBootstrap, NULL, 0)); NCCLCHECK(ncclSocketListen(listenSock)); NCCLCHECK(ncclSocketGetAddr(listenSock, &handle->addr)); NCCLCHECK(ncclCalloc(&args, 1)); args->listenSock = listenSock; args->magic = handle->magic; NEQCHECK(pthread_create(&thread, NULL, bootstrapRoot, (void*)args), 0); ncclSetThreadName(thread, "NCCL BootstrapR"); NEQCHECK(pthread_detach(thread), 0); // will not be pthread_join()'d return ncclSuccess; } ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) { memset(handle, 0, sizeof(ncclBootstrapHandle)); const char* env = 
ncclGetEnv("NCCL_COMM_ID"); if (env) { INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env); if (ncclSocketGetAddrFromString(&handle->addr, env) != ncclSuccess) { WARN("Invalid NCCL_COMM_ID, please use format: : or []: or :"); return ncclInvalidArgument; } handle->magic = NCCL_MAGIC; } else { NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic))); memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress)); NCCLCHECK(bootstrapCreateRoot(handle, false)); } return ncclSuccess; } struct unexConn { int peer; int tag; struct ncclSocket sock; struct unexConn* next; }; struct bootstrapState { struct ncclSocket listenSock; struct ncclSocket ringRecvSocket; struct ncclSocket ringSendSocket; union ncclSocketAddress* peerCommAddresses; union ncclSocketAddress* peerProxyAddresses; uint64_t* peerProxyAddressesUDS; struct unexConn* unexpectedConnections; int cudaDev; int rank; int nranks; uint64_t magic; volatile uint32_t *abortFlag; }; ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm) { int rank = comm->rank; int nranks = comm->nRanks; struct bootstrapState* state; struct ncclSocket* proxySocket; ncclSocketAddress nextAddr; struct ncclSocket sock, listenSockRoot; struct extInfo info = { 0 }; NCCLCHECK(ncclCalloc(&state, 1)); state->rank = rank; state->nranks = nranks; state->abortFlag = comm->abortFlag; comm->bootstrap = state; comm->magic = state->magic = handle->magic; TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); info.rank = rank; info.nranks = nranks; // Create socket for other ranks to contact me NCCLCHECK(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag)); NCCLCHECK(ncclSocketListen(&state->listenSock)); NCCLCHECK(ncclSocketGetAddr(&state->listenSock, &info.extAddressListen)); // Create socket for root to contact me NCCLCHECK(ncclSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag)); NCCLCHECK(ncclSocketListen(&listenSockRoot)); NCCLCHECK(ncclSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); // stagger connection times to avoid an overload of the root if (nranks > 128) { long msec = rank; struct timespec tv; tv.tv_sec = msec / 1000; tv.tv_nsec = 1000000 * (msec % 1000); TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec); (void) nanosleep(&tv, NULL); } // send info on my listening socket to root NCCLCHECK(ncclSocketInit(&sock, &handle->addr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag)); NCCLCHECK(ncclSocketConnect(&sock)); NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info))); NCCLCHECK(ncclSocketClose(&sock)); // get info on my "next" rank in the bootstrap ring from root NCCLCHECK(ncclSocketInit(&sock)); NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot)); NCCLCHECK(bootstrapNetRecv(&sock, &nextAddr, sizeof(union ncclSocketAddress))); NCCLCHECK(ncclSocketClose(&sock)); NCCLCHECK(ncclSocketClose(&listenSockRoot)); NCCLCHECK(ncclSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag)); NCCLCHECK(ncclSocketConnect(&state->ringSendSocket)); // Accept the connect request from the previous rank in the AllGather ring NCCLCHECK(ncclSocketInit(&state->ringRecvSocket)); NCCLCHECK(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock)); // AllGather all listen handlers NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks)); NCCLCHECK(ncclSocketGetAddr(&state->listenSock, state->peerCommAddresses+rank)); 
NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress))); // Create the service proxy NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks)); NCCLCHECK(ncclCalloc(&state->peerProxyAddressesUDS, nranks)); // proxy is aborted through a message; don't set abortFlag NCCLCHECK(ncclCalloc(&proxySocket, 1)); NCCLCHECK(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag)); NCCLCHECK(ncclSocketListen(proxySocket)); NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank)); NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress))); // cuMem UDS support // Make sure we create a unique UDS socket name uint64_t randId; NCCLCHECK(getRandomData(&randId, sizeof(randId))); state->peerProxyAddressesUDS[rank] = getPidHash()+randId; NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS))); NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS)); TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); return ncclSuccess; } ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) { ncclResult_t ret = ncclSuccess; int rank = comm->rank; int nranks = comm->nRanks; int prev, next; ncclSocketAddress listenAddr, tmpAddr; struct ncclSocket* proxySocket; struct bootstrapState* state; NCCLCHECKGOTO(ncclCalloc(&state, 1), ret, fail); state->rank = rank; state->nranks = nranks; state->abortFlag = comm->abortFlag; comm->bootstrap = state; comm->magic = state->magic = handle->magic; prev = parentRanks[(rank-1+nranks)%nranks]; next = parentRanks[(rank+1)%nranks]; // Setup my sockets for the allgather ring and other p2p connections NCCLCHECKGOTO(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail); NCCLCHECKGOTO(ncclSocketInit(&state->ringRecvSocket, NULL, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail); // Create socket for other ranks to contact me NCCLCHECKGOTO(ncclSocketListen(&state->listenSock), ret, fail); // Get addr from next rank NCCLCHECKGOTO(ncclSocketGetAddr(&state->listenSock, &listenAddr), ret, fail); NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, -2, &listenAddr, sizeof(union ncclSocketAddress)), ret, fail); NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, -2, &tmpAddr, sizeof(union ncclSocketAddress)), ret, fail); NCCLCHECKGOTO(ncclSocketInit(&state->ringSendSocket, &tmpAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail); NCCLCHECKGOTO(ncclSocketConnect(&state->ringSendSocket), ret, fail); // Accept the connect request from the previous rank in the AllGather ring NCCLCHECKGOTO(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock), ret, fail); // AllGather all listen handlers NCCLCHECKGOTO(ncclCalloc(&state->peerCommAddresses, nranks), ret, fail); memcpy(state->peerCommAddresses+rank, &listenAddr, sizeof(union ncclSocketAddress)); NCCLCHECKGOTO(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)), ret, fail); if (parent->config.splitShare) { /* map local rank to top parent local rank. 
*/ for (int i = 0; i < nranks; ++i) { comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]]; } } else { // Create the service proxy NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail); NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail); NCCLCHECKGOTO(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag, 0), ret, fail); NCCLCHECKGOTO(ncclSocketListen(proxySocket), ret, fail); NCCLCHECKGOTO(ncclSocketGetAddr(proxySocket, &tmpAddr), ret, fail); memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(union ncclSocketAddress)); NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail); // cuMem UDS support NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail); // Make sure we create a unique UDS socket name uint64_t randId; NCCLCHECKGOTO(getRandomData(&randId, sizeof(randId)), ret, fail); state->peerProxyAddressesUDS[rank] = getPidHash()+randId; NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)), ret, fail); NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail); } INFO(NCCL_INIT, "bootstrapSplit: comm %p parent %p rank %d nranks %d color %d key %d prev %d next %d - DONE", comm, parent, rank, nranks, color, key, prev, next); exit: return ret; fail: goto exit; } // Bootstrap send/receive functions // // We do not keep connections opened with all ranks at all times, and we have no guarantee // that connections to our unique listen socket will arrive in the same order as we need // them. Therefore, when establishing a connection, the sender sends a (peer, tag) tuple to // allow the receiver to identify the flow, and keep it in an unexpected queue if needed. 
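// A minimal usage sketch (hypothetical ranks and payload, for illustration only):
//   int value = 42;
//   if (rank == 0) bootstrapSend(commState, /*peer=*/1, /*tag=*/0, &value, sizeof(value));
//   if (rank == 1) bootstrapRecv(commState, /*peer=*/0, /*tag=*/0, &value, sizeof(value));
// If connections arrive out of order, bootstrapAccept() parks mismatched (peer, tag)
// flows on the unexpected-connection queue until the matching receive asks for them.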
ncclResult_t bootstrapConnect(void* commState, int peer, int tag, struct ncclSocket* sock) { ncclResult_t ret = ncclSuccess; struct bootstrapState* state = (struct bootstrapState*)commState; NCCLCHECKGOTO(ncclSocketInit(sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap), ret, fail); NCCLCHECKGOTO(ncclSocketConnect(sock), ret, fail); NCCLCHECKGOTO(bootstrapNetSend(sock, &state->rank, sizeof(int)), ret, fail); NCCLCHECKGOTO(bootstrapNetSend(sock, &tag, sizeof(int)), ret, fail); return ncclSuccess; fail: NCCLCHECK(ncclSocketClose(sock)); return ret; } ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) { ncclResult_t ret = ncclSuccess; struct ncclSocket sock; TRACE(NCCL_BOOTSTRAP, "Sending to peer=%d tag=%d size=%d", peer, tag, size); NCCLCHECK(bootstrapConnect(commState, peer, tag, &sock)); NCCLCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, exit); TRACE(NCCL_BOOTSTRAP, "Sent to peer=%d tag=%d size=%d", peer, tag, size); exit: NCCLCHECK(ncclSocketClose(&sock)); return ret; } ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) { // New unex struct unexConn* unex; NCCLCHECK(ncclCalloc(&unex, 1)); unex->peer = peer; unex->tag = tag; memcpy(&unex->sock, sock, sizeof(struct ncclSocket)); // Enqueue struct unexConn* list = state->unexpectedConnections; if (list == NULL) { state->unexpectedConnections = unex; return ncclSuccess; } while (list->next) list = list->next; list->next = unex; return ncclSuccess; } ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock, int* found) { struct unexConn* elem = state->unexpectedConnections; struct unexConn* prev = NULL; *found = 0; while (elem) { if (elem->peer == peer && elem->tag == tag) { if (prev == NULL) { state->unexpectedConnections = elem->next; } else { prev->next = elem->next; } memcpy(sock, &elem->sock, sizeof(struct ncclSocket)); free(elem); *found = 1; return ncclSuccess; } prev = elem; elem = elem->next; } return ncclSuccess; } static void unexpectedFree(struct bootstrapState* state) { struct unexConn* elem = state->unexpectedConnections; struct unexConn* prev = NULL; while (elem) { prev = elem; elem = elem->next; free(prev); } return; } // We can't know who we'll receive from, so we need to receive everything at once ncclResult_t bootstrapAccept(void* commState, int peer, int tag, struct ncclSocket* sock) { ncclResult_t ret = ncclSuccess; struct bootstrapState* state = (struct bootstrapState*)commState; int newPeer, newTag; // Search unexpected connections first int found; NCCLCHECK(unexpectedDequeue(state, peer, tag, sock, &found)); if (found) return ncclSuccess; // Then look for new connections while (1) { NCCLCHECKGOTO(ncclSocketInit(sock), ret, fail); NCCLCHECKGOTO(ncclSocketAccept(sock, &state->listenSock), ret, fail); NCCLCHECKGOTO(bootstrapNetRecv(sock, &newPeer, sizeof(int)), ret, fail); NCCLCHECKGOTO(bootstrapNetRecv(sock, &newTag, sizeof(int)), ret, fail); if (newPeer == peer && newTag == tag) return ncclSuccess; NCCLCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, sock), ret, fail); } return ncclSuccess; fail: NCCLCHECK(ncclSocketClose(sock)); return ret; } // We can't know who we'll receive from, so we need to receive everything at once ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) { ncclResult_t ret; struct ncclSocket sock; NCCLCHECK(bootstrapAccept(commState, peer, tag, &sock)); TRACE(NCCL_BOOTSTRAP, "Receiving tag=%d peer=%d 
size=%d", tag, peer, size); NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, exit); exit: NCCLCHECK(ncclSocketClose(&sock)); return ret; } // Collective algorithms, based on bootstrapSend/Recv, and sometimes bootstrapConnect/Accept ncclResult_t bootstrapRingAllGather(struct ncclSocket* prevSocket, struct ncclSocket* nextSocket, int rank, int nranks, char* data, int size) { /* Simple ring based AllGather * At each step i receive data from (rank-i-1) from prev * and send previous step's data from (rank-i) to next */ for (int i=0; irank; int nranks = state->nranks; TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size); NCCLCHECK(bootstrapRingAllGather(&state->ringRecvSocket, &state->ringSendSocket, rank, nranks, (char*)allData, size)); TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); return ncclSuccess; } ncclResult_t bootstrapIntraNodeBarrier(void* commState, int *ranks, int rank, int nranks, int tag) { if (nranks == 1) return ncclSuccess; TRACE(NCCL_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag); /* Simple [intra] process barrier * * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet, * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988" */ int data[1]; for (int mask=1; maskunexpectedConnections != NULL) { unexpectedFree(state); if (__atomic_load_n(state->abortFlag, __ATOMIC_ACQUIRE) == 0) { WARN("Unexpected connections are not empty"); return ncclInternalError; } } NCCLCHECK(ncclSocketClose(&state->listenSock)); NCCLCHECK(ncclSocketClose(&state->ringSendSocket)); NCCLCHECK(ncclSocketClose(&state->ringRecvSocket)); free(state->peerCommAddresses); free(state); return ncclSuccess; } ncclResult_t bootstrapAbort(void* commState) { struct bootstrapState* state = (struct bootstrapState*)commState; if (commState == NULL) return ncclSuccess; NCCLCHECK(ncclSocketClose(&state->listenSock)); NCCLCHECK(ncclSocketClose(&state->ringSendSocket)); NCCLCHECK(ncclSocketClose(&state->ringRecvSocket)); free(state->peerCommAddresses); free(state->peerProxyAddresses); free(state->peerProxyAddressesUDS); free(state); return ncclSuccess; } nccl-2.22.3-1/src/channel.cc000066400000000000000000000175521463451655400154120ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "channel.h" #include "param.h" #include "gdrwrap.h" #include "transport.h" ncclResult_t initChannel(struct ncclComm* comm, int channelId) { struct ncclChannel* channel = &comm->channels[channelId]; if (channel->id != -1) return ncclSuccess; int nRanks = comm->nRanks; int nvlsRanks = comm->localRanks; int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */; channel->id = channelId; channel->workFifoProduced = 0; struct ncclSharedResources* sharedRes = comm->sharedRes; NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); if (channel->peers == NULL) { // The extra on nRanks+1 is for collnet root (i.e. network) // Allocate everything related to sharedRes with ncclCalloc as this can be // shared between communicators hence should not be tied to comm. 
if (sharedRes->peers[channelId] == NULL) { NCCLCHECK(ncclCalloc(sharedRes->peers + channelId, sharedRes->tpNRanks)); } channel->peers = ncclMemoryStackAlloc(&comm->memPermanent, nPeers); for (int r = 0; r < nRanks; r++) { channel->peers[r] = comm->sharedRes->peers[channelId] + comm->topParentRanks[r]; ncclAtomicRefCountIncrement(&channel->peers[r]->refCount); } } if (channel->devPeers == NULL) { if (sharedRes->devPeers[channelId] == NULL) { NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream)); } /* channel->devPeers is not shared, so just free it when calling commFree() */ NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream)); ncclCommPushCudaFree(comm, channel->devPeers); NCCLCHECK(ncclCalloc(&channel->devPeersHostPtr, nPeers)); for (int r = 0; r < nRanks; r++) { uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]); NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); channel->devPeersHostPtr[r] = (struct ncclDevChannelPeer*)addr; } } channel->ring.userRanks = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream)); ncclCommPushCudaFree(comm, channel->devRingUserRanks); /* guarantee addr has been copied into channel->devPeers */ NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) { struct ncclChannel* channel = &comm->channels[channelId]; struct ncclSharedResources* sharedRes = comm->sharedRes; if (channel->nvlsPeers != NULL) return ncclSuccess; if (channel->id == -1) NCCLCHECK(initChannel(comm, channelId)); NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); int nvlsRanks = comm->localRanks; if (share) { channel->nvlsPeers = parent->channels[channelId].nvlsPeers; channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers; for (int r = 0; r < nvlsRanks; ++r) { int tr = comm->topParentLocalRanks[r]; uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr); channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr; NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount); } } else { NCCLCHECK(ncclCalloc(&channel->nvlsPeers, nvlsRanks)); NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, sharedRes->deviceStream.cudaStream)); for (int r = 0; r < nvlsRanks; ++r) { uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r); channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r; NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount); } } NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return 
ncclSuccess; } ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) { struct ncclChannel* channel = &comm->channels[channelId]; struct ncclSharedResources* sharedRes = comm->sharedRes; uintptr_t addr; if (channel->collnetPeers != NULL) return ncclSuccess; if (channel->id == -1) NCCLCHECK(initChannel(comm, channelId)); NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); if (share) { channel->collnetPeers = parent->channels[channelId].collnetPeers; channel->collnetDevPeers = parent->channels[channelId].collnetDevPeers; addr = (uintptr_t)parent->channels[channelId].collnetDevPeers; channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers; NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount); } else { NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1)); NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, sharedRes->deviceStream.cudaStream)); addr = (uintptr_t)channel->collnetDevPeers; channel->peers[comm->nRanks] = channel->collnetPeers; NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount); } NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks) { int nPeers = nRanks + collnetNRanks + nvlsNRanks; /* channel peers are only valid when async init thread completes commAlloc() and * the channel is intialized with initChannel(); if either is not done, this channel * should never be free. */ if (channel->id == -1 || channel->peers == NULL) return ncclSuccess; // Free transport proxy resources // Note: free all send resources first due to CollNet arrangement for (int r = 0; r < nPeers; r++) { struct ncclChannelPeer* peer = channel->peers[r]; if (peer) { if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) { for (int b=0; bsend[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b)); if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b)); } if (r == nRanks) { free(channel->collnetPeers); ncclCudaFree(channel->collnetDevPeers); } else if (r == nPeers - 1) { free(channel->nvlsPeers); ncclCudaFree(channel->nvlsDevPeers); } } } } free(channel->devPeersHostPtr); return ncclSuccess; } nccl-2.22.3-1/src/collectives.cc000066400000000000000000000225361463451655400163140ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2023, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "argcheck.h" // Need some checks here since we access comm #include "collectives.h" #include "enqueue.h" #include "nccl.h" const char* ncclFuncToString(ncclFunc_t fn) { switch (fn) { case ncclFuncAllGather: return "AllGather"; case ncclFuncAllReduce: return "AllReduce"; case ncclFuncBroadcast: return "Broadcast"; case ncclFuncRecv: return "Recv"; case ncclFuncReduce: return "Reduce"; case ncclFuncReduceScatter: return "ReduceScatter"; case ncclFuncSendRecv: return "SendRecv"; case ncclFuncSend: return "Send"; default: return "Invalid"; } } const char* ncclDevRedOpToString(ncclDevRedOp_t op) { switch (op) { case ncclDevSum: return "Sum"; case ncclDevProd: return "Prod"; case ncclDevMinMax: return "MinMax"; case ncclDevPreMulSum: return "PreMulSum"; case ncclDevSumPostDiv: return "SumPostDiv"; default: return "Unknown"; } } const char* ncclDatatypeToString(ncclDataType_t type) { switch (type) { case ncclInt8: return "ncclInt8"; case ncclInt32: return "ncclInt32"; case ncclUint32: return "ncclUint32"; case ncclInt64: return "ncclInt64"; case ncclUint64: return "ncclUint64"; case ncclFloat16: return "ncclFloat16"; case ncclFloat32: return "ncclFloat32"; case ncclFloat64: return "ncclFloat64"; #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: return "ncclBfloat16"; #endif default: return "Unknown"; } } const char* ncclAlgoToString(int algo) { switch (algo) { case NCCL_ALGO_TREE: return "TREE"; case NCCL_ALGO_RING: return "RING"; case NCCL_ALGO_COLLNET_DIRECT: return "COLLNET_DIRECT"; case NCCL_ALGO_COLLNET_CHAIN: return "COLLNET_CHAIN"; case NCCL_ALGO_NVLS: return "NVLS"; case NCCL_ALGO_NVLS_TREE: return "NVLS_TREE"; default: return "Unknown"; } } const char* ncclProtoToString(int proto) { switch (proto) { case NCCL_PROTO_LL: return "LL"; case NCCL_PROTO_LL128: return "LL128"; case NCCL_PROTO_SIMPLE: return "SIMPLE"; default: return "Unknown"; } } NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { // Just pass the size of one message and not the total bytes sent/received. constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"} }; size_t msgsize = sendcount * ncclTypeSize(datatype); NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize) struct ncclInfo info = { ncclFuncAllGather, "AllGather", sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; NCCLCHECK(ncclEnqueueCheck(&info)); return ncclSuccess; } NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { struct NvtxParamsAllReduce { size_t bytes; ncclRedOp_t op; }; // Just pass the size of one message and not the total bytes sent/received. 
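// A minimal sketch of how an application might call the public entry points defined in this
// file, using ncclAllGather as the example. It assumes one GPU per process, that rank equals
// the local device index, and that the ncclUniqueId has already been exchanged; all error
// checking is omitted for brevity.
#if 0  // Illustration only, excluded from compilation.
#include <cuda_runtime.h>
#include "nccl.h"

void exampleAllGather(ncclUniqueId id, int rank, int nRanks, size_t count) {
  ncclComm_t comm;
  cudaStream_t stream;
  float *sendbuf, *recvbuf;
  cudaSetDevice(rank);                                  // assumption: rank == local device index
  cudaStreamCreate(&stream);
  cudaMalloc(&sendbuf, count * sizeof(float));
  cudaMalloc(&recvbuf, nRanks * count * sizeof(float)); // each rank receives nRanks * count elements
  ncclCommInitRank(&comm, nRanks, id, rank);
  ncclAllGather(sendbuf, recvbuf, count, ncclFloat, comm, stream);
  cudaStreamSynchronize(stream);
  ncclCommDestroy(comm);
  cudaFree(sendbuf); cudaFree(recvbuf);
  cudaStreamDestroy(stream);
}
#endif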
static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, offsetof(NvtxParamsAllReduce, op)} }; NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op}; NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload) struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; NCCLCHECK(ncclEnqueueCheck(&info)); return ncclSuccess; } NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) { struct NvtxParamsBroadcast { size_t bytes; int root; }; constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)} }; NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root}; NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload) struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; NCCLCHECK(ncclEnqueueCheck(&info)); return ncclSuccess; } /* Deprecated original "in place" function, similar to MPI */ NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) { NCCLCHECK(ncclBroadcast(buff, buff, count, datatype, root, comm, stream)); return ncclSuccess; } NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { struct NvtxParamsReduce { size_t bytes; int root; ncclRedOp_t op; }; constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)}, {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, offsetof(NvtxParamsReduce, op)} }; NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op}; NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload) struct ncclInfo info = { ncclFuncReduce, "Reduce", sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; NCCLCHECK(ncclEnqueueCheck(&info)); return ncclSuccess; } NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { struct NvtxParamsReduceScatter { size_t bytes; ncclRedOp_t op; }; constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, {0, 
NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, offsetof(NvtxParamsReduceScatter, op)} }; NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op}; NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload) struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; NCCLCHECK(ncclEnqueueCheck(&info)); return ncclSuccess; } struct NvtxParamsSendRecv { size_t bytes; int peer; }; constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)} }; NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload) struct ncclInfo info = { ncclFuncSend, "Send", NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 1, 1 }; ncclResult_t ret; NCCLCHECK(ncclGroupStart()); NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit); exit: NCCLCHECK(ncclGroupEnd()); return ret; } NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload) struct ncclInfo info = { ncclFuncRecv, "Recv", NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 1, 1 }; ncclResult_t ret; NCCLCHECK(ncclGroupStart()); NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit); exit: NCCLCHECK(ncclGroupEnd()); return ret; } nccl-2.22.3-1/src/debug.cc000066400000000000000000000202501463451655400150550ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
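// A minimal sketch of how an application typically pairs the ncclSend/ncclRecv entry points
// defined above. Each of them opens its own group internally, but user code that posts both
// sides of an exchange should also wrap the pair in ncclGroupStart/ncclGroupEnd so the two
// operations are fused and cannot deadlock. Buffers, peer and communicator setup are assumed
// to exist already; return codes are ignored for brevity.
#if 0  // Illustration only, excluded from compilation.
#include <cuda_runtime.h>
#include "nccl.h"

void exchangeWithPeer(void* sendbuf, void* recvbuf, size_t count,
                      int peer, ncclComm_t comm, cudaStream_t stream) {
  ncclGroupStart();
  ncclSend(sendbuf, count, ncclFloat, peer, comm, stream);
  ncclRecv(recvbuf, count, ncclFloat, peer, comm, stream);
  ncclGroupEnd();   // both operations are launched together here
}
#endif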
* * See LICENSE.txt for license information ************************************************************************/ #include "core.h" #include "nccl_net.h" #include #include #include #include #include #include #include "param.h" int ncclDebugLevel = -1; static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; char ncclLastError[1024] = ""; // Global string for the last error in human readable form static uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV FILE *ncclDebugFile = stdout; static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; static std::chrono::steady_clock::time_point ncclEpoch; static bool ncclWarnSetDebugInfo = false; static __thread int tid = -1; static void ncclDebugInit() { pthread_mutex_lock(&ncclDebugLock); if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } const char* nccl_debug = ncclGetEnv("NCCL_DEBUG"); int tempNcclDebugLevel = -1; if (nccl_debug == NULL) { tempNcclDebugLevel = NCCL_LOG_NONE; } else if (strcasecmp(nccl_debug, "VERSION") == 0) { tempNcclDebugLevel = NCCL_LOG_VERSION; } else if (strcasecmp(nccl_debug, "WARN") == 0) { tempNcclDebugLevel = NCCL_LOG_WARN; } else if (strcasecmp(nccl_debug, "INFO") == 0) { tempNcclDebugLevel = NCCL_LOG_INFO; } else if (strcasecmp(nccl_debug, "ABORT") == 0) { tempNcclDebugLevel = NCCL_LOG_ABORT; } else if (strcasecmp(nccl_debug, "TRACE") == 0) { tempNcclDebugLevel = NCCL_LOG_TRACE; } /* Parse the NCCL_DEBUG_SUBSYS env var * This can be a comma separated list such as INIT,COLL * or ^INIT,COLL etc */ const char* ncclDebugSubsysEnv = ncclGetEnv("NCCL_DEBUG_SUBSYS"); if (ncclDebugSubsysEnv != NULL) { int invert = 0; if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; } ncclDebugMask = invert ? 
~0ULL : 0ULL; char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv); char *subsys = strtok(ncclDebugSubsys, ","); while (subsys != NULL) { uint64_t mask = 0; if (strcasecmp(subsys, "INIT") == 0) { mask = NCCL_INIT; } else if (strcasecmp(subsys, "COLL") == 0) { mask = NCCL_COLL; } else if (strcasecmp(subsys, "P2P") == 0) { mask = NCCL_P2P; } else if (strcasecmp(subsys, "SHM") == 0) { mask = NCCL_SHM; } else if (strcasecmp(subsys, "NET") == 0) { mask = NCCL_NET; } else if (strcasecmp(subsys, "GRAPH") == 0) { mask = NCCL_GRAPH; } else if (strcasecmp(subsys, "TUNING") == 0) { mask = NCCL_TUNING; } else if (strcasecmp(subsys, "ENV") == 0) { mask = NCCL_ENV; } else if (strcasecmp(subsys, "ALLOC") == 0) { mask = NCCL_ALLOC; } else if (strcasecmp(subsys, "CALL") == 0) { mask = NCCL_CALL; } else if (strcasecmp(subsys, "PROXY") == 0) { mask = NCCL_PROXY; } else if (strcasecmp(subsys, "NVLS") == 0) { mask = NCCL_NVLS; } else if (strcasecmp(subsys, "BOOTSTRAP") == 0) { mask = NCCL_BOOTSTRAP; } else if (strcasecmp(subsys, "REG") == 0) { mask = NCCL_REG; } else if (strcasecmp(subsys, "PROFILE") == 0) { mask = NCCL_PROFILE; } else if (strcasecmp(subsys, "ALL") == 0) { mask = NCCL_ALL; } if (mask) { if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask; } subsys = strtok(NULL, ","); } free(ncclDebugSubsys); } const char* ncclWarnSetDebugInfoEnv = ncclGetEnv("NCCL_WARN_ENABLE_DEBUG_INFO"); if (ncclWarnSetDebugInfoEnv != NULL && strlen(ncclWarnSetDebugInfoEnv) > 0) { int64_t value; errno = 0; value = strtoll(ncclWarnSetDebugInfoEnv, NULL, 0); if (!errno) ncclWarnSetDebugInfo = value; } // Cache pid and hostname getHostName(hostname, 1024, '.'); pid = getpid(); /* Parse and expand the NCCL_DEBUG_FILE path and * then create the debug file. But don't bother unless the * NCCL_DEBUG level is > VERSION */ const char* ncclDebugFileEnv = ncclGetEnv("NCCL_DEBUG_FILE"); if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) { int c = 0; char debugFn[PATH_MAX+1] = ""; char *dfn = debugFn; while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) { if (ncclDebugFileEnv[c++] != '%') { *dfn++ = ncclDebugFileEnv[c-1]; continue; } switch (ncclDebugFileEnv[c++]) { case '%': // Double % *dfn++ = '%'; break; case 'h': // %h = hostname dfn += snprintf(dfn, PATH_MAX, "%s", hostname); break; case 'p': // %p = pid dfn += snprintf(dfn, PATH_MAX, "%d", pid); break; default: // Echo everything we don't understand *dfn++ = '%'; *dfn++ = ncclDebugFileEnv[c-1]; break; } } *dfn = '\0'; if (debugFn[0] != '\0') { FILE *file = fopen(debugFn, "w"); if (file != nullptr) { setbuf(file, nullptr); // disable buffering ncclDebugFile = file; } } } ncclEpoch = std::chrono::steady_clock::now(); __atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE); pthread_mutex_unlock(&ncclDebugLock); } /* Common logging function used by the INFO, WARN and TRACE macros * Also exported to the dynamically loadable Net transport modules so * they can share the debugging mechanisms and output files */ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) 
{ if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit(); if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; } // Save the last error (WARN) as a human readable string if (level == NCCL_LOG_WARN) { pthread_mutex_lock(&ncclDebugLock); va_list vargs; va_start(vargs, fmt); (void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs); va_end(vargs); pthread_mutex_unlock(&ncclDebugLock); } if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return; if (tid == -1) { tid = syscall(SYS_gettid); } int cudaDev; if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) { cudaGetDevice(&cudaDev); } char buffer[1024]; size_t len = 0; if (level == NCCL_LOG_WARN) { len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line); if (ncclWarnSetDebugInfo) ncclDebugLevel = NCCL_LOG_INFO; } else if (level == NCCL_LOG_INFO) { len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev); } else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) { len = snprintf(buffer, sizeof(buffer), "%s:%d:%d NCCL CALL ", hostname, pid, tid); } else if (level == NCCL_LOG_TRACE) { auto delta = std::chrono::steady_clock::now() - ncclEpoch; double timestamp = std::chrono::duration_cast>(delta).count()*1000; len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, pid, tid, cudaDev, timestamp, filefunc, line); } va_list vargs; va_start(vargs, fmt); len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); va_end(vargs); // vsnprintf may return len > sizeof(buffer) in the case of a truncated output. // Rewind len so that we can replace the final \0 by \n if (len > sizeof(buffer)) len = sizeof(buffer)-1; buffer[len++] = '\n'; if (len) fwrite(buffer, 1, len, ncclDebugFile); } NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); void ncclSetThreadName(pthread_t thread, const char *fmt, ...) { // pthread_setname_np is nonstandard GNU extension // needs the following feature test macro #ifdef _GNU_SOURCE if (ncclParamSetThreadName() != 1) return; char threadName[NCCL_THREAD_NAMELEN]; va_list vargs; va_start(vargs, fmt); vsnprintf(threadName, NCCL_THREAD_NAMELEN, fmt, vargs); va_end(vargs); pthread_setname_np(thread, threadName); #endif } nccl-2.22.3-1/src/device/000077500000000000000000000000001463451655400147205ustar00rootroot00000000000000nccl-2.22.3-1/src/device/Makefile000066400000000000000000000054711463451655400163670ustar00rootroot00000000000000# # Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # SHELL := /usr/bin/env bash MAKEFALGS += -r .SUFFIXES: .SECONDARY: NCCLDIR := ../.. include $(NCCLDIR)/makefiles/common.mk include $(NCCLDIR)/makefiles/version.mk BUILDDIR ?= $(abspath ../../build) OBJDIR := $(BUILDDIR)/obj/device MANIFEST := $(OBJDIR)/manifest DEVGLUE_OBJ := $(OBJDIR)/device_glue.o INCFLAGS = -I. -I.. 
-I$(BUILDDIR)/include -I../include NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" CXXFLAGS += $(INCFLAGS) SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1 COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1 define COMPILE @$(SAY) "Compiling" $2;\ mkdir -p $(dir $1);\ $(call COMPILE$(suffix $2),$1,$2) endef DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1 DEPENDS.cc = $(CXX) $(CXXFLAGS) -M -c $1 define DEPENDS @$(SAY) "Dependencies" $2;\ mkdir -p $(dir $1);\ mk=$$($(call DEPENDS$(suffix $2),$2));\ [[ $$mk =~ ^[^:]*:(.*)$$ ]];\ files=$${BASH_REMATCH[1]};\ files=$$(for x in $$files; do case "$$x" in '\'|$$'\t') ;; *) echo "$$x"; esac; done);\ files=$$(for x in $$files; do [[ "$$(realpath "$$x")" == "$$(realpath "$(NCCLDIR)")"* ]] && echo "$$x"; done);\ echo "$(patsubst %.d,%.o,$1) $1: " $$files > $1 endef all: $(MANIFEST) ifeq (1,1) # Case if the directory is generated on-demand: $(OBJDIR)/gensrc: generate.py @mkdir -p $@ (which python3 >/dev/null || \ (bar='!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'; \ printf "\n$${bar}\nERROR: Building NCCL requires a Python 3 installation invokable as 'python3'.\n$${bar}\n\n" 1>&2; \ exit 1)) \ && ./generate.py $@ "$(ONLY_FUNCS)" else # Case if the directory is pre-generated and checked in the repo as ./gen: $(OBJDIR)/gensrc: @mkdir -p $(OBJDIR); ln -srfn ./gen $@ endif # The trailing ";" is necessary to make this an "empty recipe": # https://www.gnu.org/software/make/manual/html_node/Empty-Recipes.html $(OBJDIR)/gensrc/rules.mk: $(OBJDIR)/gensrc ; -include $(OBJDIR)/gensrc/rules.mk # "gensrc/rules.mk" populates $(LIB_OBJS_GEN) SRCS = common.cu onerank.cu LIB_OBJS = $(patsubst %, $(OBJDIR)/%.o, $(SRCS)) $(LIB_OBJS_GEN) $(OBJDIR)/%.o: % $(OBJDIR)/%.d $(call COMPILE,$@,$<) $(OBJDIR)/genobj/%.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/%.d $(call COMPILE,$@,$(OBJDIR)/gensrc/$*) $(OBJDIR)/%.d: % $(call DEPENDS,$@,$<) $(OBJDIR)/genobj/%.d: $(OBJDIR)/gensrc/% $(call DEPENDS,$@,$<) $(DEVGLUE_OBJ): $(LIB_OBJS) $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ $(MANIFEST): $(LIB_OBJS) $(DEVGLUE_OBJ) @echo $^ > $@ -include $(wildcard $(OBJDIR)/*.d) -include $(wildcard $(OBJDIR)/genobj/*.d) .PHONY: clean clean: rm -rf $(OBJDIR) nccl-2.22.3-1/src/device/all_gather.h000066400000000000000000000317571463451655400172100ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "device.h" #include "collectives.h" #include "primitives.h" namespace { template __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; const int *ringRanks = ring->userRanks; const int nranks = ncclShmem.comm.nRanks; size_t count, partOffset, partCount, chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount); size_t offset; size_t dataOffset; int nelem; int rankDest; T *inputBuf = (T*)work->sendbuff; T *outputBuf = (T*)work->recvbuff; Primitives, 1, Proto, 0> prims (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg); for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) { /////////////// begin AllGather steps /////////////// nelem = min(chunkCount, partCount - elemOffset); dataOffset = partOffset + elemOffset; // step 0: push data to next GPU rankDest = ringRanks[0]; offset = dataOffset + rankDest * count; if (inputBuf + dataOffset == outputBuf + offset) { // In place prims.directSend(dataOffset, offset, nelem); } else { prims.directCopySend(dataOffset, offset, nelem); } // k-2 steps: copy to next GPU for (int j=1; j struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; runRing(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { runRing(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { runRing(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; const ssize_t rank = ncclShmem.comm.rank; size_t count, gridOffset, channelCount; size_t chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; const int nThreadsBcast = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE; const int nThreadsGather = work->regUsed ? 
WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast; const int tidEndGather = nThreadsGather; const int tidEndBcast = tidEndGather + nThreadsBcast; if (!work->regUsed) { if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.gather(offset, nvls->nHeads * count, nelem, count, -1, 0); } } else if (tid < tidEndBcast) { // Bcast through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; Primitives, /*Direct=*/0, Proto, 0> prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, work->sendbuff, NULL, work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.send(offset, nelem); } } } else { /* direct allgather */ if (tid < tidEndGather) { using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL, work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); /* used as sync */ prims.scatter(0, 0, 0, 0, -1, 0); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { prims.gather(0, 0, 0, 0, -1, 0); } } else if (tid < tidEndBcast) { using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, work->sendbuff, NULL, work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, work); /* used as sync */ prims.recv(0, 0); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { ssize_t inpOffset = gridOffset + elemOffset; ssize_t outOffset = inpOffset + rank * count; nelem = min(chunkCount, channelCount - elemOffset); prims.directSend(inpOffset, outOffset, nelem); } } } } }; template struct RunWorkColl { template struct Scatterer { struct ncclDevWorkColl* work; ssize_t chunkSize; ssize_t railGridOffset; template __device__ __forceinline__ void operator()( int tid, int tn, int slice, int maxSliceSize, int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes ) { static_assert(SlicePerChunk==1, "require: SlicePerChunk==1"); static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1"); struct ncclDirect* direct = &ncclShmem.channel.collnetDirect; int nNodes = ncclShmem.comm.nNodes; int nRails = direct->nHeads; int part = ncclShmem.channelId - work->channelLo; char* inbuf = (char*)work->sendbuff; char* outbuf = (char*)work->recvbuff; ssize_t sizePerRank = work->collnet.count*sizeof(T); bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*sizePerRank); ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank); ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank); int railAllSize = railAllEnd - railAllBeg; if (tid < nDsts) dstSizes[tid] = railAllSize; int src = 0; int rail; if (BcastSendNotRecv) { rail = direct->headRank; } else { rail = direct->headRank+1; if (rail == nRails) rail = 0; } do { int node = railAllBeg/sizePerRank; int railAllOffset = 0; while (railAllOffset < railAllSize) { ssize_t railOneBeg = node*sizePerRank; ssize_t railOneEnd = railOneBeg + sizePerRank; ssize_t railOneOffset = (railAllBeg+railAllOffset) - 
railOneBeg; int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset); int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail]; ssize_t userOneBeg = rank*sizePerRank + railOneOffset; int outIsDst = (inPlace && rank == ncclShmem.comm.rank) ? 0 : 1; reduceCopy (tid, tn, 0, nullptr, false, /*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* { return (char*)srcPtrs[src] + railAllOffset; }, /*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* { return d < outIsDst ? outbuf + userOneBeg : (char*)dstPtrs[d-outIsDst] + railAllOffset; }, delta); railAllOffset += delta; node += 1; } src += 1; rail += 1; if (rail == nRails) rail = 0; } while (!BcastSendNotRecv && src < nRails-1); } }; __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { const int part = ncclShmem.channelId - work->channelLo; const int nChannels = work->channelHi - work->channelLo + 1; struct ncclDirect* direct = &ncclShmem.channel.collnetDirect; int const &nNodes = ncclShmem.comm.nNodes; ssize_t sizePerRank = work->collnet.count*sizeof(T); size_t chunkSize = work->collnet.chunkCount; bool isMultiRail = (direct->nHeads > 1); int nWarps1 = 1; int nWarps2 = (isMultiRail ? 2 : 1); int nWarps3 = (isMultiRail ? 2 : 0); float denom = float(work->nWarps)/float(nWarps1+nWarps2+nWarps3); nWarps3 = int(denom*nWarps3); nWarps2 = int(denom*nWarps2); nWarps1 = work->nWarps - (nWarps2+nWarps3); using Proto = ProtoSimple<1, 1>; int tn = nWarps1*WARP_SIZE; if (tid < tn) { if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == 0) { int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); } __syncwarp(); } else { // Phase 1: send to network Primitives, /*Direct=*/0, Proto, 0> prims(tid, tn, nullptr, &direct->out, work->sendbuff, nullptr, /*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { ssize_t railAllBeg = railGridOffset + part * chunkSize; ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank); ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank; ssize_t railOneEnd = railOneBeg + sizePerRank; ssize_t beg = max(railAllBeg, railOneBeg); ssize_t end = min(railAllEnd, railOneEnd); prims.send(beg - railOneBeg, max(ssize_t(0), end - beg)); } } return; } tid -= tn; tn = nWarps2*WARP_SIZE; if (tid < tn) { if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == 0) { int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); } __syncwarp(); } else { // Phase 2: Recv network -> deposit output + send to bcast Primitives, /*Direct=*/0, Proto, 0> prims(tid, tn, &direct->out, direct->heads + 1, nullptr, nullptr, /*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0); for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; prims.template process(scat); } } return; } tid -= tn; tn = nWarps3*WARP_SIZE; if (tid < tn) { // Phase 3: Recv bcast -> deposit output Primitives, /*Direct=*/0, Proto, 0> prims(tid, tn, direct->heads+1, nullptr, nullptr, nullptr, /*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0); for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) { Scatterer scat; 
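// A host-side sketch of the offset arithmetic performed by the Scatterer above, assuming
// contiguous per-rank segments of sizePerRank bytes laid out node-major along a rail. It
// reports which byte range of which node's segment a chunk [railAllBeg, railAllEnd) touches.
// Variable names mirror the device code, but the helper itself is illustrative, not NCCL API.
#if 0  // Illustration only, excluded from compilation.
#include <cstdio>
#include <algorithm>

static void walkRailChunk(long railAllBeg, long railAllEnd, long sizePerRank) {
  long railAllOffset = 0;
  int node = (int)(railAllBeg / sizePerRank);            // node owning the first byte of the chunk
  while (railAllBeg + railAllOffset < railAllEnd) {
    long railOneBeg = (long)node * sizePerRank;          // this node's segment on the rail
    long railOneEnd = railOneBeg + sizePerRank;
    long railOneOffset = (railAllBeg + railAllOffset) - railOneBeg;
    long delta = std::min(railAllEnd, railOneEnd) - (railAllBeg + railAllOffset);
    std::printf("node %d: bytes [%ld, %ld)\n", node, railOneOffset, railOneOffset + delta);
    railAllOffset += delta;
    node += 1;                                           // chunk spills over into the next node
  }
}
#endif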
scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; prims.template process(scat); } return; } } }; nccl-2.22.3-1/src/device/all_reduce.h000066400000000000000000001066451463451655400172040ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "device.h" #include "collectives.h" #include "primitives.h" namespace { template __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; int ringIx = ring->index; const int nranks = ncclShmem.comm.nRanks; ssize_t gridOffset; ssize_t channelCount; ssize_t chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount); const ssize_t loopCount = nranks * chunkCount; ssize_t offset; int nelem; int chunk; Primitives, 1, Proto, 0> prims (tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t remCount = channelCount - elemOffset; ssize_t chunkOffset; if (remCount < loopCount) chunkCount = alignUp(divUp(remCount, nranks), 16/sizeof(T)); auto modRanks = [&]__device__(int r)->int { return r - (r >= nranks ? nranks : 0); }; // step 0: push data to next GPU chunk = modRanks(ringIx + nranks - 1); chunkOffset = chunk * chunkCount; offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); prims.send(offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j = 2; j < nranks; ++j) { chunk = modRanks(ringIx + nranks - j); chunkOffset = chunk * chunkCount; offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); prims.recvReduceSend(offset, nelem); } // step k-1: reduce this buffer and data, which will produce the final // result that we store in this data and push to the next GPU chunk = ringIx + 0; chunkOffset = chunk * chunkCount; offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); prims.directRecvReduceCopySend(offset, offset, nelem, /*postOp=*/true); // k-2 steps: copy to next GPU for (int j = 1; j < nranks - 1; ++j) { chunk = modRanks(ringIx + nranks - j); chunkOffset = chunk * chunkCount; offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); prims.directRecvCopySend(offset, nelem); } // Make final copy from buffer to dest. 
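// A small host-side sketch of the schedule implemented by the loop structure above: nranks-1
// reduce-scatter style steps followed by nranks-1 all-gather style steps, each touching one
// chunk per step. ringIx plays the role of ring->index; the helper is illustrative only.
#if 0  // Illustration only, excluded from compilation.
#include <cstdio>

static void printRingAllReduceSchedule(int ringIx, int nranks) {
  auto modRanks = [&](int r) { return r - (r >= nranks ? nranks : 0); };
  std::printf("send               chunk %d\n", modRanks(ringIx + nranks - 1));
  for (int j = 2; j < nranks; ++j)
    std::printf("recvReduceSend     chunk %d\n", modRanks(ringIx + nranks - j));
  std::printf("recvReduceCopySend chunk %d\n", ringIx);              // result is final here
  for (int j = 1; j < nranks - 1; ++j)
    std::printf("recvCopySend       chunk %d\n", modRanks(ringIx + nranks - j));
  std::printf("recv               chunk %d\n", modRanks(ringIx + 1)); // final copy to output
}
#endif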
chunk = modRanks(ringIx + 1); chunkOffset = chunk * chunkCount; offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); prims.directRecv(offset, nelem); } } template __device__ __forceinline__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclTree *tree = &ncclShmem.channel.tree; size_t gridOffset; size_t channelCount; size_t chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) Primitives, /*Direct=*/0, Proto, 0> prims (tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg); if (tree->up == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true); } } else if (tree->down[0] == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.send(offset, nelem); } } else { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.recvReduceSend(offset, nelem); } } } { // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) Primitives, /*Direct=*/1, Proto, 0> prims (tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg); if (tree->up == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.directSendFromOutput(offset, nelem); } } else if (tree->down[0] == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.directRecv(offset, nelem); } } else { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.directRecvCopySend(offset, nelem); } } } } template __device__ __forceinline__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclTree *tree = &ncclShmem.channel.tree; size_t gridOffset; size_t channelCount; size_t chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; int nthreadsSplit; if (Proto::Id == NCCL_PROTO_SIMPLE) { nthreadsSplit = nthreads/2; if (nthreadsSplit >= 256) nthreadsSplit += 64; } else { // LL & LL128 // Receiving from up to 3 sources is more compute intensive than sending // to 3 dests. Use 70% for reduce and 30% for bcast. nthreadsSplit = (nthreads*7/(10*WARP_SIZE))*WARP_SIZE; } if (tree->up == -1) { // Reduce and broadcast. 
Max number of recv is 2, max number of send is 2 Primitives, /*Direct=*/1, Proto, 0> prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true); } } else if (tid < nthreadsSplit) { /* Reduce up. Max number of recv is 3, max number of send is 1 (binary tree + local). * Why Direct=1???? * Answer: Because despite not performing any direct operations, the ctor * must assume Direct so that it can exchange direct pointers with remote ctors * that are Direct, otherwise it hangs. A cleaner solution would be to seperate * into DirectRecv and DirectSend capabilities, this ctor would have both=0, * but the ctor above for tree roots would be DirectRecv=0 DirectSend=1. */ Primitives, /*Direct=*/1, Proto, 0> prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.send(offset, nelem); } } else { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.recvReduceSend(offset, nelem); } } } else { // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local) Primitives, /*Direct=*/1, Proto, 0> prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg, 1*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.directRecv(offset, nelem); } } else { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.directRecvCopySend(offset, nelem); } } } } } template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; runRing(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { #if CUDART_VERSION >= 11020 && CUDART_VERSION < 11040 && __CUDA_ARCH__ >= 800 runTreeUpDown>(tid, nthreads, work); #else runTreeSplit>(tid, nthreads, work); #endif } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { static constexpr int COLLNET_COPY_THREADS = 96; const int bid = ncclShmem.channelId - work->channelLo; const int nChannels = work->channelHi - work->channelLo + 1; struct ncclDirect* direct = &ncclShmem.channel.collnetDirect; const ssize_t chunkSize = work->collnet.chunkCount; const ssize_t size = work->collnet.count; const ssize_t loopSize = nChannels*direct->nHeads*chunkSize; const int hasUp = (direct->up[0] >= 0) ? 1 : 0; const int hasDn = (direct->down[0] >= 0) ? 1 : 0; const int nThreadsScatter = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 3*COLLNET_COPY_THREADS : 0); const int nThreadsGather = ((hasUp && hasDn) ? 
COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0); const int nThreadsBcast = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 2*COLLNET_COPY_THREADS); const int nThreadsReduce = work->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast; const int tidStartBcast = nThreadsGather; const int tidStartScatter = tidStartBcast + nThreadsBcast; const int tidStartReduce = tidStartScatter + nThreadsScatter; using Proto = ProtoSimple<1, 1>; if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) { // Scatter Primitives, /*Direct=*/1, Proto, 0> prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff, work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; int nelem = min(direct->nHeads*chunkSize, size-offset); if (work->regUsed) { prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); } else { prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); } } } else if (tid >= tidStartReduce && direct->out != -1) { if (hasDn) { // Reduce, send to network Primitives, /*Direct=*/1, Proto, 0> prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff, work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); if (work->regUsed) { prims.directRecvReduceSend(offset, nelem); } else { prims.recvReduceSend(offset, nelem); } } } else { // Directly send to network if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == tidStartReduce) { int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); } __syncwarp(); } else { Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, work->sendbuff, work->recvbuff, work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.send(offset, nelem); } } } } else if (tid < tidStartBcast && hasUp) { // Gather Primitives, /*Direct=*/1, Proto, 0> prims(tid, nThreadsGather, direct->up, NULL, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; int nelem = min(direct->nHeads*chunkSize, size-offset); prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); } } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) { if (hasDn) { // Recv from network, broadcast Primitives, /*Direct=*/1, Proto, 0> prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, work->sendbuff, work->recvbuff, work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true); } } else { if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == 
tidStartBcast) { int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); } __syncwarp(); } else { // Recv from network (no post thread needed) Primitives, /*Direct=*/0, Proto, 0> prims(tid - tidStartBcast, nThreadsBcast, &direct->out, nullptr, work->sendbuff, work->recvbuff, work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); prims.recv(offset, nelem, /*postOp=*/true); } } } } } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; const bool hasOut = nvls->out != -1; const int nranks = ncclShmem.comm.nRanks; const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE; const int bcastWarps = hasOut ? (work->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0; const int reduceWarps = work->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5); const int scatterWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1; const int gatherWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1; const int nThreadsScatter = scatterWarps*WARP_SIZE; const int nThreadsGather = gatherWarps*WARP_SIZE; const int nThreadsReduce = reduceWarps*WARP_SIZE; const int nThreadsBcast = (bcastWarps)*WARP_SIZE; const int tidEndScatter = nThreadsScatter; const int tidEndGather = tidEndScatter + nThreadsGather; const int tidEndReduce = tidEndGather + nThreadsReduce; const int tidEndBcast = tidEndReduce + nThreadsBcast; if (work->oneNode) { ssize_t gridOffset, channelCount, chunkSize; ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkSize); const ssize_t loopCount = nvls->nHeads * chunkSize; ssize_t offset; int nelem; int remCount = channelCount%(nvls->nHeads*chunkSize); int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T)); if (tid < tidEndScatter) { // Scatter using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; offset = gridOffset + elemOffset; nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; offset = gridOffset + elemOffset; nelem = work->regUsed ? 
0 : min(loopCount, channelCount - elemOffset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce) { // Reduce, broadcast through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL, work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t chunkOffset; if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; chunkOffset = elemOffset + nvls->headRank * chunkSize; offset = gridOffset + chunkOffset; nelem = min(chunkSize, channelCount - chunkOffset); prims.directRecvDirectSend(offset, offset, nelem); } } } else { const int bid = ncclShmem.channelId - work->channelLo; const int nChannels = work->channelHi - work->channelLo + 1; const ssize_t chunkSize = work->collnet.chunkCount; const ssize_t loopSize = nChannels * nvls->nHeads * chunkSize; const ssize_t size = work->collnet.count; if (tid < tidEndScatter) { // Scatter using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize; int nelem = work->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize; int nelem = work->regUsed ? 
0 :min(nvls->nHeads * chunkSize, size - offset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce && nvls->headRank != -1) { if (!hasOut) { // Reduce, broadcast through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL, work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); prims.directRecvDirectSend(offset, offset, nelem); } } else { // Reduce, send to network using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL, work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); prims.directRecvDirectSend(offset, offset, nelem); } } } else if (tid < tidEndBcast && nvls->headRank != -1) { // Recv from network, broadcast using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL, work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); prims.directRecvDirectSend(offset, offset, nelem); } } } } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; const int treeUp = nvls->treeUp; const int* treeDown = nvls->treeDown; ssize_t gridOffset, channelCount, chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount); const ssize_t loopCount = nvls->nHeads * chunkCount; const int nranks = ncclShmem.comm.nRanks; const bool hasUp = treeUp != -1; const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE; const int bcastWarps = hasUp ? (work->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0; const int reduceWarps = work->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5); const int scatterWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1; const int gatherWarps = work->regUsed ? 
1 : (totalWarps - reduceWarps - bcastWarps) >> 1; ssize_t offset; int nelem; int remCount = channelCount%(nvls->nHeads*chunkCount); int lastChunkCount = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T)); const int nThreadsScatter = scatterWarps*WARP_SIZE; const int nThreadsGather = gatherWarps*WARP_SIZE; const int nThreadsReduce = reduceWarps*WARP_SIZE; const int nThreadsBcast = (bcastWarps)*WARP_SIZE; const int tidEndScatter = nThreadsScatter; const int tidEndGather = tidEndScatter + nThreadsGather; const int tidEndReduce = tidEndGather + nThreadsReduce; const int tidEndBcast = tidEndReduce + nThreadsBcast; if (tid < tidEndScatter) { // Scatter using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; offset = gridOffset + elemOffset; nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.scatter(offset, nelem, chunkCount, chunkCount, -1, 0); } } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; offset = gridOffset + elemOffset; nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.gather(offset, nelem, chunkCount, chunkCount, -1, 0); } } else if (tid < tidEndReduce && nvls->headRank != -1) { if (!hasUp) { // Reduce and Broadcast using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL, work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t chunkOffset; if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; chunkOffset = elemOffset + nvls->headRank * chunkCount; offset = gridOffset + chunkOffset; nelem = min(chunkCount, channelCount - chunkOffset); prims.directRecvDirectSend(offset, offset, nelem); } } else { // Reduce, send to network using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL, work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t chunkOffset; if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; chunkOffset = elemOffset + nvls->headRank * chunkCount; offset = gridOffset + chunkOffset; nelem = min(chunkCount, channelCount - chunkOffset); prims.directRecvDirectSend(offset, offset, nelem); } } } else if (tid < tidEndBcast && nvls->headRank != -1) { // Recv from network, broadcast using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL, work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t chunkOffset; if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; 
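// A host-side sketch of how the tail iteration picks lastChunkCount above: when fewer than
// nHeads*chunkCount elements remain, the leftover is split evenly across the NVLS heads and
// rounded up to a 16-byte multiple. divUpSketch/alignUpSketch mirror NCCL's divUp/alignUp
// helpers and are illustrative only.
#if 0  // Illustration only, excluded from compilation.
#include <cstddef>

static size_t divUpSketch(size_t x, size_t y)   { return (x + y - 1) / y; }
static size_t alignUpSketch(size_t x, size_t a) { return divUpSketch(x, a) * a; }

static size_t lastChunkElems(size_t channelCount, int nHeads, size_t chunkCount, size_t elemSize) {
  size_t remCount = channelCount % ((size_t)nHeads * chunkCount);  // elements left for the tail loop
  if (remCount == 0) return chunkCount;                            // tail is a full iteration
  return alignUpSketch(divUpSketch(remCount, (size_t)nHeads), 16 / elemSize);
}
#endif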
chunkOffset = elemOffset + nvls->headRank * chunkCount; offset = gridOffset + chunkOffset; nelem = min(chunkCount, channelCount - chunkOffset); prims.directRecvDirectSend(offset, offset, nelem); } } } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { const int bid = ncclShmem.channelId - work->channelLo; const int nChannels = work->channelHi - work->channelLo + 1; ncclTree *tree = &ncclShmem.channel.collnetChain; ssize_t chunkSize = work->collnet.chunkCount; const ssize_t loopSize = int(nChannels*chunkSize); const int nranks = ncclShmem.comm.nRanks; const ssize_t size = work->collnet.count; int nthreadsSplit = nthreads/2; if (nthreadsSplit >= 256) nthreadsSplit += 64; int group, connIndex, send, recv, groupTid, groupNthreads; using Proto = ProtoSimple<1, 1>; if (tid < nthreadsSplit) { // Reduce up the chain group = 0; connIndex = 1; recv = tree->down[0]; send = tree->up; groupTid = tid; groupNthreads = nthreadsSplit; } else { // Broadcast down the chain group = 1; connIndex = 0; recv = tree->up; send = tree->down[0]; groupTid = tid - nthreadsSplit; groupNthreads = nthreads-nthreadsSplit; } if (tid < nthreadsSplit) { if (recv == -1) { if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (groupTid == 0) { int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, steps); } __syncwarp(); } else { Primitives, /*Direct=*/1, Proto, 0> prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); prims.send(offset, nelem); } } } else { Primitives, /*Direct=*/1, Proto, 0> prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); prims.recvReduceSend(offset, nelem); } } } else { if (recv == nranks) { // I'm the first in the broadcast chain, I need to perform the division (postOp) if (send == -1) { if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (groupTid == 0) { int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, steps); } __syncwarp(); } else { Primitives, /*Direct=*/1, Proto, 0> prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); prims.recv(offset, nelem, /*postOp*/true); } } } else { Primitives, /*Direct=*/1, Proto, 0> prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); prims.recvCopyDirectSend(offset, nelem, /*postOp*/true); } } } else { Primitives, /*Direct=*/1, Proto, 0> prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, 
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); if (send == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.directRecv(offset, nelem); } } else { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.directRecvCopySend(offset, nelem); } } } } } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { runRing(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { runTreeSplit(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { runRing(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { runTreeSplit(tid, nthreads, work); } }; nccl-2.22.3-1/src/device/broadcast.h000066400000000000000000000047741463451655400170470ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "device.h" #include "collectives.h" #include "primitives.h" namespace { template __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; const int rank = ring->userRanks[0]; const int nextRank = ring->userRanks[1]; const int root = work->root; size_t chunkCount; size_t channelCount; size_t gridOffset; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; T *inputBuf = (T*)work->sendbuff; T *outputBuf = (T*)work->recvbuff; Primitives, 0, Proto, 0> prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); if (rank == root) { if (inputBuf == outputBuf) { prims.send(offset, nelem); } else { prims.copySend(offset, offset, nelem); } } else if (nextRank == root) { prims.recv(offset, nelem); } else { prims.recvCopySend(offset, nelem); } } } } template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; runRing(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { runRing(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { runRing(tid, nthreads, work); } }; nccl-2.22.3-1/src/device/common.cu000066400000000000000000000013471463451655400165460ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "device.h" #include "collectives.h" #include "common.h" __shared__ ncclShmemData ncclShmem; #if __CUDA_ARCH__ < 700 __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)]; #endif struct RunWorkNop { __device__ void run() {} }; __global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { ncclKernelMain<-1, RunWorkNop>(&args4K.args); } __device__ void ncclDevFunc_Nop() {} nccl-2.22.3-1/src/device/common.h000066400000000000000000000353551463451655400163740ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_DEVICE_COMMON_H_ #define NCCL_DEVICE_COMMON_H_ #include "collectives.h" #include "device.h" #include "op128.h" #include "reduce_kernel.h" #include "network/unpack/unpack_defs.h" #define COLL_UNROLL (ncclCollUnroll()) #if __CUDA_ARCH__ >= 700 // __grid_constant__ appears to break cuda-gdb //#define NCCL_GRID_CONSTANT __grid_constant__ #define NCCL_GRID_CONSTANT #else #define NCCL_GRID_CONSTANT #endif typedef void(*ncclDevFuncPtr_t)(); extern __device__ ncclDevFuncPtr_t const ncclDevFuncTable[]; struct ncclShmemGroup { ncclConnInfo *recvConns[NCCL_MAX_ARITY]; ncclConnInfo *sendConns[NCCL_MAX_ARITY]; void* userInput; void* userOutput; void* srcs[NCCL_MAX_ARITY+1]; void* dsts[NCCL_MAX_ARITY+1]; union { unpackGroupShmem unpack; } devicePlugin; int32_t dstSizes[NCCL_MAX_ARITY+1]; }; struct ncclShmemData { struct ncclDevKernelArgs args; int channelId; int aborted; alignas(16) struct ncclDevComm comm; alignas(16) struct ncclDevChannel channel; int batchIx, nextBatchIx; enum ncclDevWorkType workType; uint8_t directMode; uint16_t funcId; int nWorks; int workSize; uint32_t workConsumed; struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1]; alignas(16) char workStorage[1024]; alignas(16) union { unpackShmem unpack; } devicePlugin; }; extern __shared__ ncclShmemData ncclShmem; #if __CUDA_ARCH__ >= 700 extern __shared__ ulong2 ncclShmemPerWarp[/*ncclShmemDynamicSize()/sizeof(ulong2)*/]; #else extern __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)]; #endif __device__ inline void* ncclScratchForWarp(int warp) { return (char*)ncclShmemPerWarp + warp*ncclShmemScratchWarpSize(); } __device__ inline void barrier_sync(int name) { #if 0 asm volatile("barrier.sync %0;" :: "r"(name) : "memory"); #else asm volatile("barrier.sync.aligned %0;" :: "r"(name) : "memory"); #endif } __device__ inline void barrier_sync(int name, int nThreads) { #if 0 asm volatile("barrier.sync %0, %1;" :: "r"(name), "r"(nThreads) : "memory"); #else asm volatile("barrier.sync.aligned %0, %1;" :: "r"(name), "r"(nThreads) : "memory"); #endif } __device__ inline void barrier_sync_aligned(int name) { asm volatile("barrier.sync.aligned %0;" :: "r"(name) : "memory"); } __device__ inline void barrier_sync_aligned(int name, int nThreads) { asm volatile("barrier.sync.aligned %0, %1;" :: "r"(name), "r"(nThreads) : "memory"); } __device__ inline bool barrier_red_or(bool vote, int name) { int ans; asm("{ .reg .pred p;" " setp.ne.s32 p, %1, 0;" " barrier.red.or.pred p, %2, p; " " selp.s32 %0, 1, 0, p; }" : 
"=r"(ans) : "r"((int)vote), "r"(name) : "memory"); return bool(ans); } __device__ inline bool barrier_red_or(bool vote, int name, int nThreads) { int ans; asm("{ .reg .pred p;" " setp.ne.s32 p, %1, 0;" " barrier.red.or.pred p, %2, %3, p; " " selp.s32 %0, 1, 0, p; }" : "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory"); return bool(ans); } __device__ inline bool barrier_red_or_aligned(bool vote, int name) { int ans; asm("{ .reg .pred p;" " setp.ne.s32 p, %1, 0;" " barrier.red.or.pred.aligned p, %2, p; " " selp.s32 %0, 1, 0, p; }" : "=r"(ans) : "r"((int)vote), "r"(name) : "memory"); return bool(ans); } __device__ inline bool barrier_red_or_aligned(bool vote, int name, int nThreads) { int ans; asm("{ .reg .pred p;" " setp.ne.s32 p, %1, 0;" " barrier.red.or.pred.aligned p, %2, %3, p; " " selp.s32 %0, 1, 0, p; }" : "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory"); return bool(ans); } // Copy 16-byte aligned data. You must call with at least `(bytes+15)/16` threads. inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int bytes) { int offset = 16*tid; if (offset < bytes) { uint64_t a=0, b=0; asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset)); uint32_t udst = (uint32_t)__cvta_generic_to_shared(dst); asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b)); } } // Must run with at least 64 threads __device__ __forceinline__ void loadWorkBatchToShmem( int tid, int tn, struct ncclDevKernelArgs const* args, int batchIx ) { int lane = tid%WARP_SIZE; int workCursor = 0; // num works written in previous loop iterations. while (true) { struct ncclDevWorkBatch batch = ((struct ncclDevWorkBatch*)(args+1))[batchIx]; // fnsOfBitset[n] = index of n'th set bit in batch.offsetBitset. // PTX has instruction "fns" (find n-th set) but it expands to a lot of SASS, // since we know all lanes will be querying the same bitmask we can compute // much faster using shared memory. uint8_t* fnsOfBitset = (uint8_t*)ncclScratchForWarp(threadIdx.x/WARP_SIZE); __syncwarp(); if (uint32_t(batch.offsetBitset) & (1u<>32) & (1u<>32) & ((1u<>32)); // add high 32 bits __syncwarp(); int workSize; int nPacks; // total number of packs loaded, each pack is 16 bytes int packInWork; // my pack index within work struct int dstWork; // my work index in contiguous destination shmem switch (batch.workType) { case (int)ncclDevWorkTypeP2p: workSize = sizeof(struct ncclDevWorkP2p); nPacks = nWorks*(workSize/16); packInWork = tid%(workSize/16); dstWork = tid/(workSize/16); break; case (int)ncclDevWorkTypeColl: workSize = sizeof(struct ncclDevWorkColl); nPacks = nWorks*(workSize/16); packInWork = tid%(workSize/16); dstWork = tid/(workSize/16); break; case (int)ncclDevWorkTypeCollReg: default: workSize = sizeof(struct ncclDevWorkCollReg); nPacks = nWorks*(workSize/16); packInWork = tid%(workSize/16); dstWork = tid/(workSize/16); break; } if (tid == 0) { ncclShmem.workSize = workSize; ncclShmem.workConsumed = batch.offsetBase + (64-__clzll(batch.offsetBitset))*workSize; } // We deliberately replicate these div and mod calculations into the case // blocks above so that they get constant divisor optimizations by the compiler. // packInWork = tid%(workSize/16); // dstWork = tid/(workSize/16); // We can only assume we have 64 threads, which means we can read at most 1024 bytes // here which is the per batch maximum. 
if (tid < nPacks) { int srcWork = fnsOfBitset[dstWork]; // find n'th set bit in batch.offsetBitset ulong2 tmp; // The loads done in these two cases must be kept separate since we are // relying on the compiler to use "ld.param" in the first one. The parameter // space is not generically addressable, so any attempt to load through // a pointer that *might* be parameter space backed will cause the // compiler to spill the parameter struct (4K!) to each thread's local space // before creating a pointer (to the spill) and decimate perf. // // An example of what not to do would be the following: // // if (condition) { // // The compiler could spill parameter_variable to local space and take // // the address of that, since when src is loaded below it could also // // be global space. // src = ¶meter_variable; // } else { // src = &global_variable; // } // memcpy(dst, src, n); if (ncclShmem.args.workStorageType == ncclDevWorkStorageTypeArgs) { char* src = (char*)args + (batch.offsetBase + srcWork*workSize + packInWork*16); tmp = *(ulong2*)src; // becomes ld.param.v2.u64 } else { char* src = (char*)ncclShmem.args.workBuf + ((batch.offsetBase + srcWork*workSize + packInWork*16) & ncclShmem.args.workMask); tmp = *(ulong2*)src; // becomes ld.v2.u64 } char* dst = ncclShmem.workStorage; dst += (workCursor + dstWork)*workSize + packInWork*16; *(ulong2*)dst = tmp; } workCursor += nWorks; if (batch.nextExtends) { batchIx += batch.nextJump; tid -= 64; // Rotate threads so we use the next two warps for next batch struct. if (tid < 0) tid += tn; } else { if (tid == 0) { ncclShmem.batchIx = batchIx; ncclShmem.nextBatchIx = (batch.nextJump == 0) ? -1 : batchIx + batch.nextJump; ncclShmem.workType = (enum ncclDevWorkType)batch.workType; ncclShmem.nWorks = workCursor; ncclShmem.funcId = batch.funcId; } break; } } } template struct RunWorkColl { __device__ void run(int tid, int tn, struct ncclDevWorkColl* work) { // Put NOT IMPLEMENTED behavior here. } }; template struct RunWorkBatch; // Specialized for P2p in sendrecv.h template struct RunWorkBatch; // Specialized here for non-P2p (Coll and CollReg) template struct RunWorkBatch { // This __forceinline__ is necessary. The compiler was inserting a function call // here from the LL ncclKernel. __device__ __forceinline__ void run() { int tid = threadIdx.x; int tn = blockDim.x; if (RedOpArg::ArgUsed) { int nWorks = ncclShmem.nWorks; for (int w=tid; w < nWorks; w += tn) { struct ncclDevWorkColl* work = (ncclDevWorkColl*)(ncclShmem.workStorage + w*ncclShmem.workSize); if (work->redOpArgIsPtr) { work->redOpArg = RedOpArg::loadArg(reinterpret_cast(work->redOpArg)); } } __syncthreads(); } #pragma unroll 1 for (int w=0; w < ncclShmem.nWorks; w++) { struct ncclDevWorkColl* work = (struct ncclDevWorkColl*)(ncclShmem.workStorage + w*ncclShmem.workSize); if (w != 0) { struct ncclDevWorkColl* workPrev = (struct ncclDevWorkColl*)(ncclShmem.workStorage + (w-1)*ncclShmem.workSize); if (work->nWarps != workPrev->nWarps) __syncthreads(); } int subtn = work->nWarps*WARP_SIZE; if (tid < subtn) RunWorkColl().run(tid, subtn, work); } } }; template __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* args) { int tid = threadIdx.x; int tn = blockDim.x; // Copy kernel args to shmem and then only read those. Otherwise the compiler // will end up putting the args into thread local stack which is very wasteful. 
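/* One 32-bit word per thread: thread i copies word i of the kernel argument
 * struct, so only the first sizeof(ncclDevKernelArgs)/4 threads participate
 * and the copy completes in a single coalesced pass. */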
if (tid < sizeof(ncclDevKernelArgs)/sizeof(uint32_t)) { ((uint32_t*)&ncclShmem.args)[tid] = ((uint32_t*)args)[tid]; } // To map blockId to channelId, we need the n'th set bit of channelMask which // is the inverse of counting the number of set bits among the the first n. // PTX has the fns instruction which does this but is extremely slow. We can // do better when we know all threads are querying the same bitmask. if (tid < MAXCHANNELS && (args->channelMask & (1ull<channelMask & ((1ull<channels[ncclShmem.channelId]; int bytes = sizeof(ncclDevChannel); static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn."); copyToShmem16(tid-WARP_SIZE, dst, src, bytes); } break; default: { int subtid = tid - 2*WARP_SIZE; int subtn = tn - 2*WARP_SIZE; loadWorkBatchToShmem(subtid, subtn, args, /*batchIx=*/blockIdx.x); } break; } __syncthreads(); // publish ncclShmem if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) { // ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads() ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed; } while (true) { if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) { SpecializedRunWorkBatch().run(); } else { ncclDevFuncTable[ncclShmem.funcId](); } if (ncclShmem.nextBatchIx == -1) break; int batchIx = ncclShmem.nextBatchIx; __syncthreads(); loadWorkBatchToShmem(tid, tn, args, batchIx); // Check whether the last operation was aborted and make sure all threads exit bool aborted = false; if (tid == 0) aborted = *ncclShmem.comm.abortFlag; aborted = barrier_red_or_aligned(aborted, 0); // publish ncclShmem.work if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) { // ncclShmem.workConsumed written by loadWorkBatchToShmem before barrier_red_or() ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed; } if (aborted) break; } } __global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); __device__ void ncclDevFunc_Nop(); #define DEFINE_ncclDevKernel(suffix, coll, redop, ty, algo, proto, specializedFnId) \ __global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { \ ncclKernelMain, algo, proto>>(&args4K.args); \ } #define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) \ __device__ void ncclDevFunc_##suffix() { \ RunWorkBatch, algo, proto>().run(); \ } #endif nccl-2.22.3-1/src/device/common_kernel.h000066400000000000000000000242351463451655400177270ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_COMMON_KERNEL_H_ #define NCCL_COMMON_KERNEL_H_ #include "device.h" #include "op128.h" #include "reduce_kernel.h" #include #include #include // Define min for ssize_t inline __device__ int min(int a, ssize_t b) { return (a < b) ? 
a : b; } inline __device__ int loadInt(int* ptr) { int v; asm volatile("ld.volatile.global.u32 %0, [%1];" : "=r"(v) : "l"(ptr)); return v; } template __device__ __forceinline__ void reduceCopyPacks( int nThreads, int &thread, uint64_t redArg, uint64_t *preOpArgs, bool postOp, int nSrcs, SrcPtrFn const &srcPtrFn, int nDsts, DstPtrFn const &dstPtrFn, IntBytes &nBytesBehind, IntBytes &nBytesAhead ) { static_assert(std::is_signed::value, "IntBytes must be a signed integral type."); if (BytePerPack == 0) __trap(); // A hunk is the amount of contiguous data a warp consumes per loop iteration // assuming all threads partake. constexpr int BytePerHunk = Unroll*WARP_SIZE*BytePerPack; int nWarps = nThreads/WARP_SIZE; int warp = thread/WARP_SIZE; int lane = thread%WARP_SIZE; // This thread's initial position. IntBytes threadBytesBehind = nBytesBehind + (warp*BytePerHunk + lane*BytePerPack); IntBytes threadBytesAhead = nBytesAhead - (warp*BytePerHunk + lane*BytePerPack); // Number of hunks to be consumed over all warps. IntBytes nHunksAhead = nBytesAhead/(BytePerHunk + !BytePerHunk); // Advance collective position. nBytesBehind += nHunksAhead*BytePerHunk; nBytesAhead -= nHunksAhead*BytePerHunk; if (Unroll==1 && BytePerPack <= nBytesAhead) { // Only Unroll=1 can do partial hunks (where not all threads partake). nHunksAhead += 1; nBytesBehind += nBytesAhead - (nBytesAhead%(BytePerPack + !BytePerPack)); nBytesAhead = nBytesAhead%(BytePerPack + !BytePerPack); } nHunksAhead -= warp; RedFn redFn(redArg); uintptr_t minSrcs[MinSrcs + !MinSrcs]; uintptr_t minDsts[MinDsts + !MinDsts]; #pragma unroll for (int s=0; s < MinSrcs; s++) minSrcs[s] = cvta_to_global(srcPtrFn(s)) + threadBytesBehind; #pragma unroll for (int d=0; d < MinDsts; d++) minDsts[d] = cvta_to_global(dstPtrFn(d)) + threadBytesBehind; // We dictate loop termination condition according to whether partial hunks // can be handled or not. while (Unroll==1 ? (BytePerPack <= threadBytesAhead) : (0 < nHunksAhead)) { BytePack acc[Unroll]; { RedFn preFn(0 < PreOpSrcs ? preOpArgs[0] : 0); #pragma unroll Unroll for (int u=0; u < Unroll; u++) { if (0 < MultimemSrcs) { // applyLoadMultimem uses relaxed semantics for same reason we use volatile below. acc[u] = applyLoadMultimem(redFn, minSrcs[0]); } else { // Use volatile loads in case credits are polled for with volatile (instead of acquire). acc[u] = ld_volatile_global(minSrcs[0]); if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]); } minSrcs[0] += WARP_SIZE*BytePerPack; } } #pragma unroll (MinSrcs-1 + !(MinSrcs-1)) for (int s=1; s < MinSrcs; s++) { BytePack tmp[Unroll]; RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0); #pragma unroll Unroll for (int u=0; u < Unroll; u++) { if (s < MultimemSrcs) { // applyLoadMultimem uses relaxed semantics for same reason we use volatile below. acc[u] = applyLoadMultimem(redFn, minSrcs[s]); } else { // Use volatile loads in case credits are polled for with volatile (instead of acquire). tmp[u] = ld_volatile_global(minSrcs[s]); } minSrcs[s] += WARP_SIZE*BytePerPack; } #pragma unroll Unroll for (int u=0; u < Unroll; u++) { if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]); acc[u] = applyReduce(redFn, acc[u], tmp[u]); } } for (int s=MinSrcs; (MinSrcs < MaxSrcs) && (s < MaxSrcs) && (s < nSrcs); s++) { uintptr_t src = cvta_to_global(srcPtrFn(s)) + threadBytesBehind; BytePack tmp[Unroll]; RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0); #pragma unroll Unroll for (int u=0; u < Unroll; u++) { // Use volatile loads in case credits are polled for with volatile (instead of acquire). 
tmp[u] = ld_volatile_global(src); src += WARP_SIZE*BytePerPack; } #pragma unroll Unroll for (int u=0; u < Unroll; u++) { if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]); acc[u] = applyReduce(redFn, acc[u], tmp[u]); } } if (postOp) { #pragma unroll Unroll for (int u=0; u < Unroll; u++) acc[u] = applyPostOp(redFn, acc[u]); } #pragma unroll (MinDsts + !MinDsts) for (int d=0; d < MinDsts; d++) { #pragma unroll Unroll for (int u=0; u < Unroll; u++) { if (d < MultimemDsts) { multimem_st_global(minDsts[d], acc[u]); } else { st_global(minDsts[d], acc[u]); } minDsts[d] += WARP_SIZE*BytePerPack; } } for (int d=MinDsts; (MinDsts < MaxDsts) && (d < MaxDsts) && (d < nDsts); d++) { uintptr_t dst = cvta_to_global(dstPtrFn(d)) + threadBytesBehind; #pragma unroll Unroll for (int u=0; u < Unroll; u++) { st_global(dst, acc[u]); dst += WARP_SIZE*BytePerPack; } } nWarps = nThreads/WARP_SIZE; #pragma unroll for (int s=0; s < MinSrcs; s++) minSrcs[s] += (nWarps-1)*BytePerHunk; #pragma unroll for (int d=0; d < MinDsts; d++) minDsts[d] += (nWarps-1)*BytePerHunk; threadBytesBehind += nWarps*BytePerHunk; threadBytesAhead -= nWarps*BytePerHunk; nHunksAhead -= nWarps; } nWarps = nThreads/WARP_SIZE; warp = thread/WARP_SIZE; lane = thread%WARP_SIZE; // The last loop iteration could have been partial, i.e. not taken by all // threads. The threads that weren't included need an extra subtraction to // make the value warp uniform. if (Unroll==1 && nHunksAhead > 0) nHunksAhead -= nWarps; // Rotate warps so the warp which got the least work here will be warp 0. // This effectively assigns: warp = (warp-nHunks+nWarps)%nWarps; warp = -nHunksAhead; thread = warp*WARP_SIZE + lane; } template __device__ __forceinline__ void reduceCopy( int thread, int nThreads, uint64_t redArg, uint64_t *preOpArgs, bool postOp, int nSrcs, SrcPtrFn const &srcPtrFn, int nDsts, DstPtrFn const &dstPtrFn, IntBytes nElts ) { static_assert(MultimemSrcs <= MinSrcs && MultimemDsts <= MinDsts, "Multimem pointers cannot exceed respective Min values."); //int nWarps = nThreads/WARP_SIZE; //int warp = thread/WARP_SIZE; int lane = thread%WARP_SIZE; // If a multimem src is present then our biggest pack size is limited to what // is supported for this redfn/type. constexpr int BigPackSize = (MultimemSrcs == 0) ? 16 : LoadMultimem_BigPackSize::BigPackSize; if (MaxDsts==0) return; if (MinDsts==0 && nDsts==0) return; IntBytes nBytesBehind = 0; IntBytes nBytesAhead = nElts*sizeof(T); #if __cpp_if_constexpr if constexpr (BigPackSize > sizeof(T)) { #else if (BigPackSize > sizeof(T)) { #endif // Check that all pointers are BigPackSize aligned. 
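/* Alignment vote: each of the first nSrcs/nDsts lanes checks one pointer, and
 * __all_sync() below reduces the per-lane results so the whole warp takes the
 * same (packed vs. element-wise) path.  The "+ !BigPackSize" only keeps the
 * modulus non-zero so this expression still compiles when BigPackSize
 * evaluates to 0 for an unsupported multimem type. */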
bool aligned = true; if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrFn(lane)) % (BigPackSize + !BigPackSize); if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrFn(lane)) % (BigPackSize + !BigPackSize); aligned = __all_sync(~0u, aligned); if (aligned) { reduceCopyPacks (nThreads, /*&*/thread, redArg, preOpArgs, postOp, nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead); if (nBytesAhead == 0) return; reduceCopyPacks (nThreads, /*&*/thread, redArg, preOpArgs, postOp, nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead); if (nBytesAhead == 0) return; } } reduceCopyPacks (nThreads, /*&*/thread, redArg, preOpArgs, postOp, nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead); if (nBytesAhead == 0) return; reduceCopyPacks (nThreads, /*&*/thread, redArg, preOpArgs, postOp, nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead); } template __device__ __forceinline__ void reduceCopy( int thread, int nThreads, uint64_t redArg, uint64_t *preOpArgs, bool postOp, int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, IntBytes nElts ) { reduceCopy (thread, nThreads, redArg, preOpArgs, postOp, nSrcs, [=]__device__(int i) { return srcPtrs[i]; }, nDsts, [=]__device__(int i) { return dstPtrs[i]; }, nElts); } #endif // COMMON_KERNEL_H_ nccl-2.22.3-1/src/device/generate.py000077500000000000000000000350001463451655400170650ustar00rootroot00000000000000#!/usr/bin/env python3 import os import sys # Order of redops, tys, protos, algos must match src/include/device.h all_colls = ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"] all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"] all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"] all_protos = ["LL","LL128","SIMPLE"] all_algos = ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"] ################################################################################ # The first command line argument is the path to the directory to generate and # populate. gensrc = sys.argv[1] if os.path.exists(gensrc): for name in os.listdir(gensrc): os.remove(os.path.join(gensrc, name)) #os.truncate(os.path.join(gensrc, name), 0) else: os.mkdir(gensrc) ################################################################################ # The second command line argument is used as a regex to filter the functions # which make it into libnccl. This is helpful for reducing the binary when # developing device code. The regex supports non-space containing globs '*', # parentheses '(x)', and union 'a|b'. The string representing the function has # one of the forms: # # SendRecv # (AllGather|Broadcast) # (AlLReduce|Reduce|ReduceScatter) # # The possible values for redop, type, algo, proto can be found in the all_ # lists at the top of this file. # # Since the Makefile forwards this from the ONLY_FUNCS variable, useful command # line examples are given: """ # Only send/recv: make ONLY_FUNCS="SendRecv" # Only non-reductions: make ONLY_FUNCS="AllGather * *|Broadcast * *|SendRecv" # Only AllReduce sum f32 (but all algos, protos) make ONLY_FUNCS="AllReduce Sum f32 * *" # Only AllReduce minmax i32 NVLS (but all protos) make ONLY_FUNCS="AllReduce MinMax i32 NVLS *" # AllReduce sum RING LL128 make ONLY_FUNCS="AllReduce Sum f32 RING LL128" """ # Paste all non-None arguments together with `sep`. 
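# e.g. paste("_", "ncclDevFunc", "AllReduce", "Sum", None, "RING", "LL")
#      -> "ncclDevFunc_AllReduce_Sum_RING_LL"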
def paste(sep, *args): return sep.join(x for x in args if x is not None) func_pattern = sys.argv[2:3] if func_pattern and func_pattern[0]: import re func_pattern = func_pattern[0] func_pattern = func_pattern.replace("*", "[^ ]*") func_pattern += "$" def func_filter(*fn): return None is not re.match(func_pattern, paste(" ", *fn), flags=re.IGNORECASE) else: def func_filter(coll, redop, ty, algo, proto): return True ################################################################################ algos_of_coll = { "AllGather": ["RING","COLLNET_DIRECT","NVLS"], "AllReduce": all_algos, "Broadcast": ["RING"], "Reduce": ["RING"], "ReduceScatter": ["RING","COLLNET_DIRECT","NVLS"], "SendRecv": [None] } coll_camel_to_lower = { "AllGather": "all_gather", "AllReduce": "all_reduce", "Broadcast": "broadcast", "Reduce": "reduce", "ReduceScatter": "reduce_scatter", "SendRecv": "sendrecv" } coll_lower_to_camel = {coll_camel_to_lower[x]: x for x in coll_camel_to_lower} ################################################################################ # Returns pair of minimum required values for (CUDART_VERSION, __CUDA_ARCH__) # or None if function is never supported. Note that (0, 0) encodes universal # support. def required_cuda(coll, redop, ty, algo, proto): cudart, arch = 0, 0 # kernels mapped to by coll="Nop" functions have coll="Generic" if coll in ("SendRecv", "Generic", "Nop"): return (cudart, arch) if proto!="SIMPLE" and algo not in ("RING","TREE"): return None if coll in ("AllReduce","Reduce","ReduceScatter"): if redop=="SumPostDiv" and ty[0] not in ("i","u"): return None if ty=="bf16": cudart = max(cudart, 11000) if "NVLS" in algo: if coll in ("AllReduce","Reduce","ReduceScatter"): # Must match ncclNvlsSupported() in src/include/device.h nvls_ok = ((ty in ("i32","u32","i64","u64") and redop in ("Sum","MinMax")) or (ty in ("f32","f64") and redop=="Sum") or (ty in ("f16","bf16") and redop in ("Sum","MinMax"))) if not nvls_ok: return None cudart = max(cudart, 12010) arch = max(arch, 900) return (cudart, arch) # Maps functions to the chosen representative for the equivalence class it # belongs to. For instance (sum, signed int) maps to (sum, unsigned int). def equivalent_primary(coll, redop, ty, algo, proto): if coll in ("AllReduce", "Reduce", "ReduceScatter"): # map signed integer sum/prod to unsigned if redop in ("Sum","Prod","PreMulSum") and ty[0]=="i": return (coll, redop, "u"+ty[1:], algo, proto) # map signed integer min/max to unsigned for non-NVLS if redop=="MinMax" and ty[0]=="i" and ("NVLS" not in algo): return (coll, redop, "u"+ty[1:], algo, proto) return (coll, redop, ty, algo, proto) # Map to another func representing the best kernel to use. Every distinct value # returned will instantiate a ncclDevKernel specialized to run this func # without function call overhead. def best_kernel(coll, redop, ty, algo, proto): def best(coll, redop, ty, algo, proto): # Modify this logic to control how many kernels are specialized. if coll=="Nop": return ("Generic", None, None, None, None) if coll=="SendRecv": return ("SendRecv", None, None, None, None) if coll in ("AllGather","Broadcast"): return (coll, None, None, "RING", "LL") return (coll, "Sum", ty, ("TREE" if algo=="TREE" else "RING"), "LL") # Need to ensure kernel is specialize for a primary function kfn = equivalent_primary(*best(coll, redop, ty, algo, proto)) # And isn't filtered out. 
if not func_filter(*kfn): return ("Generic", None, None, None, None) return kfn # Order rows are enumerated must match formula of `ncclDevFuncId()`: def enumerate_func_rows(): yield ("SendRecv", None, None, None, None) for coll in ("AllGather", "Broadcast"): algos = algos_of_coll[coll] for algo in algos: for proto in all_protos: yield (coll, None, None, algo, proto) for coll in ("AllReduce", "Reduce", "ReduceScatter"): algos = algos_of_coll[coll] for redop in all_redops: for ty in all_tys: for algo in algos: for proto in all_protos: yield (coll, redop, ty, algo, proto) ################################################################################ def is_built(coll, redop, ty, algo, proto): built = required_cuda(coll, redop, ty, algo, proto) built = built and func_filter(coll, redop, ty, algo, proto) return built # Returns None if required_cuda(...) is None. # Returns the coll="Nop" function if developer has filtered it out. # Otherwise just returns func it was given. def validate(coll, redop, ty, algo, proto): valid = required_cuda(coll, redop, ty, algo, proto) built = valid and func_filter(coll, redop, ty, algo, proto) if built: return (coll, redop, ty, algo, proto) if valid: return ("Nop", None, None, None, None) return None # Corresponds to ncclDevFuncRowToId[] func_rows = [validate(*fn) for fn in enumerate_func_rows()] # Corresponds to ncclDevFuncTable[] primary_funcs = sorted(set(equivalent_primary(*fn) for fn in func_rows if fn is not None)) # primary_to_index[primary_funcs[i]] == i primary_to_index = {fn: i for (i,fn) in zip(range(len(primary_funcs)), primary_funcs)} kernel_funcs = sorted(set(best_kernel(*fn) for fn in primary_funcs)) ################################################################################ # Generate /device_table.cu with open(os.path.join(gensrc, "device_table.cu"), "w") as f: out = f.write out('#include "common.h"\n') out("\n") for fn in primary_funcs: sym = paste("_", "ncclDevFunc", *fn) cudart, arch = required_cuda(*fn) if (cudart, arch) != (0, 0): out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch)) out("__device__ void %s();\n" % sym) if (cudart, arch) != (0, 0): out("#endif\n") out("\n") out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable[] = {\n"); index = 0 for fn in primary_funcs: sym = paste("_", "ncclDevFunc", *fn) cudart, arch = required_cuda(*fn) if (cudart, arch) != (0, 0): out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart ,arch)) out("/*%4d*/ %s,\n" % (index, sym)) if (cudart, arch) != (0, 0): out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index) index += 1 out("nullptr};\n") out("\n") out("// Workaround for https://reviews.llvm.org/D55580\n" "__device__ void ncclWorkaroundClangD55580() {}\n") # Generate /host_table.cc with open(os.path.join(gensrc, "host_table.cc"), "w") as f: out = f.write out('#include "device.h"\n') out("\n") out("extern int const ncclDevFuncIdCount = %d;\n" % len(primary_funcs)) # The mapping from function rows to valid primary function ids. out("extern int const ncclDevFuncRowToId[] = {\n") index = 0 for fn in func_rows: fn_id, comment = -1, "" if fn is not None: fn_id = primary_to_index[equivalent_primary(*fn)] comment = " // " + paste(" ", *fn) out("/*%4d*/ %d,%s\n" % (index, fn_id, comment)) index += 1 out("-1};\n") out("\n") # Forward declarations of kernels. 
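# With the default (unfiltered) build, a kernel func such as
# ("AllReduce", "Sum", "bf16", "RING", "LL") yields a guarded declaration
# along these lines (illustrative):
#   #if CUDART_VERSION >= 11000
#   __global__ void ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgs4K const);
#   #endif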
for kfn in kernel_funcs: cudart, _ = required_cuda(*kfn) sym = paste("_", "ncclDevKernel", *kfn) if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart) out("__global__ void %s(ncclDevKernelArgs4K const);\n" % sym) if cudart != 0: out("#endif\n") out("\n") # List of all kernel function pointers. out("extern int const ncclDevKernelCount = %d;\n" % len(kernel_funcs)) out("extern void* const ncclDevKernelList[] = {\n") index = 0 for kfn in kernel_funcs: cudart, _ = required_cuda(*kfn) sym = paste("_", "ncclDevKernel", *kfn) if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart) out("/*%4d*/ (void*)%s,\n" % (index, sym)); if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index) index += 1 out("nullptr};\n") out("\n") # Maps primary id to kernel function pointer. out("extern void* const ncclDevKernelForFunc[] = {\n") index = 0 for fn in primary_funcs: kfn = best_kernel(*fn) sym = paste("_", "ncclDevKernel", *kfn) cudart, _ = required_cuda(*kfn) if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart) out("/*%4d*/ (void*)%s,\n" % (index, sym)) if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index) index += 1 out("nullptr};\n") out("\n") # Does the prior map use an explicitly specialized kernel. out("extern bool const ncclDevKernelForFuncIsSpecialized[] = {\n") index = 0 for fn in primary_funcs: kfn = best_kernel(*fn) specialized = "1" if fn == kfn else "0" out("/*%4d*/ %s,\n" % (index, specialized)) index += 1 out("0};\n") # Maps to .cu filename which implements this func. The only constraint is that # "coll" is reflected in the name: formally that no two funcs having different # coll's map to the same filename. def impl_filename(coll, redop, ty, algo, proto): return "%s.cu" % paste("_", coll_camel_to_lower[coll], redop and redop.lower(), ty) # Partition the functions and kernels to the .cu filenames. The partition is # a dictionary mapping filename to (coll, func-tuple list) def partition_by_name(fns): ans = {} for fn in fns: name = impl_filename(*fn) coll = fn[0] if name not in ans: ans[name] = (coll, []) ans[name][1].append(fn) return ans name_to_funcs = partition_by_name(fn for fn in primary_funcs if fn[0]!="Nop") name_to_kernels = partition_by_name(kfn for kfn in kernel_funcs if kfn[0]!="Generic") # Generate /rules.mk with open(os.path.join(gensrc, "rules.mk"), "w") as f: out = f.write impl_names = sorted(name_to_funcs.keys()) names = impl_names + ["host_table.cc", "device_table.cu"] out("LIB_OBJS_GEN = $(patsubst %, $(OBJDIR)/genobj/%.o, {names})\n" .format(names=" ".join(names))) out("\n") # For each __.cu compile to a .cu.o file. Notice the dependencies # come from the suffix-erased file (e.g. 'gensrc/all_reduce.cu') for name in impl_names: coll = name_to_funcs[name][0] out( "$(OBJDIR)/genobj/{name}.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/{lower_coll}.cu.d\n" "\t" "$(call COMPILE,$@,$(OBJDIR)/gensrc/{name})\n" "\n" .format(name=name, lower_coll=coll_camel_to_lower[coll]) ) # Add the suffix-erased .cu's which are used only for dependency scraping. 
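# e.g. impl_filename("AllReduce", None, None, None, None) -> "all_reduce.cu";
# these bare per-collective files carry no functions of their own and exist
# only so the per-collective dependency (.cu.d) targets in rules.mk resolve.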
for coll in set(coll for (coll,_,_,_,_) in primary_funcs if coll!="Nop"): name = impl_filename(coll, None, None, None, None) if name not in name_to_funcs: name_to_funcs[name] = (coll, []) redop_to_cxx = { None: "FuncCopy", "Sum": "FuncSum", "Prod": "FuncProd", "MinMax": "FuncMinMax", "PreMulSum": "FuncPreMulSum", "SumPostDiv": "FuncSumPostDiv" } ty_to_cxx = { None: "int8_t", "i8": "int8_t", "u8": "uint8_t", "i32": "int32_t", "u32": "uint32_t", "i64": "int64_t", "u64": "uint64_t", "f16": "half", "f32": "float", "f64": "double", "bf16": "__nv_bfloat16" } # Generate each /.cu: for name in name_to_funcs.keys(): (coll, fns) = name_to_funcs[name] with open(os.path.join(gensrc, name), "w") as f: out = f.write out( '#include "common.h"\n' '#include "{lower_coll}.h"\n' .format(lower_coll=coll_camel_to_lower[coll]) ) (_, kfns) = name_to_kernels.get(name) or (None, []) for kfn in kfns: (coll, redop, ty, algo, proto) = kfn sym = paste("_", coll, redop, ty, algo, proto) fn_id = primary_to_index[kfn] cudart, arch = required_cuda(*kfn) if (cudart, arch) != (0, 0): out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch)) out( "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n" .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty], algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id) ) if (cudart, arch) != (0, 0): out("#endif\n") for fn in fns: (coll, redop, ty, algo, proto) = fn sym = paste("_", coll, redop, ty, algo, proto) cudart, arch = required_cuda(*fn) if (cudart, arch) != (0, 0): out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch)) out( "DEFINE_ncclDevFunc({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto})\n" .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty], algo=(algo or "RING"), proto=(proto or "SIMPLE")) ) if (cudart, arch) != (0, 0): out("#endif\n") nccl-2.22.3-1/src/device/network/000077500000000000000000000000001463451655400164115ustar00rootroot00000000000000nccl-2.22.3-1/src/device/network/unpack/000077500000000000000000000000001463451655400176725ustar00rootroot00000000000000nccl-2.22.3-1/src/device/network/unpack/unpack.h000066400000000000000000000256061463451655400213350ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2023, Google LLC. All rights reserved. * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NET_DEVICE_UNPACK_H #define NET_DEVICE_UNPACK_H #include "unpack_defs.h" #include "op128.h" #include "bitops.h" #include "device.h" #include "common.h" // #define ALIGNED_LOAD inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) { #if __CUDA_ARCH__ >= 700 asm volatile("ld.relaxed.gpu.u64 {%0}, [%1];" : "=l"(v) : "l"(ptr)); #else asm volatile("ld.volatile.global.u64 {%0}, [%1];" : "=l"(v) : "l"(ptr)); #endif } #define PAGE_META_SIZE 16 #define META_LOAD_SIZE 16 #define DATA_LOAD_SIZE 16 // Map internal association of handle with group and peer index (called once at init time) inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group, const int index) { struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle; ncclShmem.groups[group].devicePlugin.unpack.g_meta[index] = handle->meta; ncclShmem.devicePlugin.unpack.bounce_buf = handle->bounce_buf; ncclShmem.groups[group].devicePlugin.unpack.head[index] = handle->head; } inline __device__ void ncclNetDeviceIncrementHead(const int group, const int index) { ncclShmem.groups[group].devicePlugin.unpack.head[index]++; } inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group, const int index) { struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle; handle->head = ncclShmem.groups[group].devicePlugin.unpack.head[index]; } template inline __device__ void bulkLoad(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack *reg, const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){ bulkLoad<1>(t, len, cpy_src, cpy_dst, reg, w, g_meta, s_meta, src_off, dst_off); } template <> inline __device__ void bulkLoad<1>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<1> reg[16], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){ uint64_t data_s; for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) { #ifdef ALIGNED_LOAD load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]); #else #pragma unroll for (int i=0; i<16; i++) { reg[i] = ld_volatile_global<1>((uintptr_t)((uint8_t*)(cpy_src + data_s) + i)); } #endif #pragma unroll for (int i=0; i<16; i++) { st_global<1>((uintptr_t)((uint8_t*)(cpy_dst + data_s) + i), reg[i]); } } } template <> inline __device__ void bulkLoad<2>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<2> reg[8], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){ uint64_t data_s; for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) { #ifdef ALIGNED_LOAD load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]); #else #pragma unroll for (int i=0; i<8; i++) { reg[i] = ld_volatile_global<2>((uintptr_t)((uint16_t*)(cpy_src + data_s) + i)); } #endif #pragma unroll for (int i=0; i<8; i++) { st_global<2>((uintptr_t)((uint16_t*)(cpy_dst + data_s) + i), reg[i]); } } } template <> inline __device__ void bulkLoad<4>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<4> reg[4], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){ uint64_t data_s; for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) { #ifdef ALIGNED_LOAD load128 ((uint64_t*)(cpy_src + 
data_s), reg.u64[0], reg.u64[1]); #else #pragma unroll for (int i=0; i<4; i++) { reg[i] = ld_volatile_global<4>((uintptr_t)((uint32_t *)(cpy_src + data_s) + i)); } #endif #pragma unroll for (int i=0; i<4; i++) { st_global<4>((uintptr_t)((uint32_t*)(cpy_dst + data_s) + i), reg[i]); } } } template <> inline __device__ void bulkLoad<8>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<8> reg[2], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){ uint64_t data_s; for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) { #ifdef ALIGNED_LOAD load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]); #else #pragma unroll for (int i=0; i<2; i++) { reg[i] = ld_volatile_global<8>((uintptr_t)((uint64_t*)(cpy_src + data_s) + i)); } #endif #pragma unroll for (int i=0; i<2; i++) { st_global<8>((uintptr_t)((uint64_t*)(cpy_dst + data_s) + i), reg[i]); } } } template <> inline __device__ void bulkLoad<16>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<16> reg[1], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){ uint64_t data_s; for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) { reg[0] = ld_volatile_global<16>((uintptr_t)(cpy_src + data_s)); st_global<16>((uintptr_t)(cpy_dst + data_s), reg[0]); } } #ifndef PAGE_SIZE #define PAGE_SIZE 4096 #endif inline __device__ int ppw(const int nbytes, int nw) { int v = DIVUP(nbytes, SLICE_PAGE_SIZE); v = DIVUP(v, nw); while (v > WARP_SHM_PAGE_CNT) { v = DIVUP(v, 2); } return v; } // This function is called by all threads // Pack data from the internal iovec to the supplied flat buffer using all the // threads template inline __device__ void ncclNetDeviceUnpack( const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize); template <> inline __device__ void ncclNetDeviceUnpack( const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize) { // send unpack empty } inline __device__ void ncclNetDeviceUnpackInner( const int tid, const int tidInBlock, const int nworkers, const int group, const int index, void *src, const int nbytes, const uint64_t step); template <> inline __device__ void ncclNetDeviceUnpack( const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize) { while (mask != 0) { int ix = __ffs(mask)-1; // Get the first set bit of the mask (this should correlate to a peer index) mask &= mask-1; // Drop the first set bit of the mask // Pack data from the internal iovec to the supplied flat srcs buffer using all the threads // + Src is necessary in the case of accessing the user buffer directly ncclNetDeviceUnpackInner(tid, tidInBlock, nworkers, group /* in case they need to use split warps shared memory partitioning*/, ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head[ix]); } } inline __device__ void ncclNetDeviceUnpackInner( const int tid, const int tidInBlock, const int nworkers, const int group, const int index, void *src, const int nbytes, const uint64_t step) { // from src/collectives/device/common_kernel.h const int w = tid / WARP_SIZE; // Warp number const int nw = nworkers / WARP_SIZE; // Number of warps const int t = tid % WARP_SIZE; // Thread (inside the warp) BytePack<16> reg; loadMeta meta; uint64_t head; struct netUnpackMeta* 
g_meta_struct; void* bounce_buf; loadMeta* g_meta; loadMeta* s_meta; uint64_t meta_cnt; // hack head use per-warp head = step; g_meta_struct = ncclShmem.groups[group].devicePlugin.unpack.g_meta[index]; bounce_buf = ncclShmem.devicePlugin.unpack.bounce_buf; __syncwarp(); head %= NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH; g_meta = g_meta_struct->mem[head]; // Currently, even/odd groups perform send/recv separately. We don't really need space for send side. // Total size is N page per warp * 16 B per page * 20 WARPS max = 320 * N bytes, N == WARP_SHM_PAGE_CNT static_assert(ncclShmemScratchWarpSize() >= WARP_SHM_SIZE, "Each warp must have enough scratch space"); s_meta = (loadMeta*) ncclScratchForWarp(tidInBlock / WARP_SIZE); // (loadMeta*) (ncclShmem.devicePlugin.unpack.meta + shm_off); load64gpu(g_meta_struct->cnt + head, meta_cnt); int PPW = ppw(nbytes, nw); for (uint64_t meta_s = w * PPW; meta_s < meta_cnt; meta_s += nw * PPW) { uint64_t iter_meta_cnt = meta_cnt - meta_s; iter_meta_cnt = iter_meta_cnt < PPW ? iter_meta_cnt : PPW; // TODO: this load size needs to work if not aligned, but since the two are both 16... if (t < PPW * PAGE_META_SIZE / META_LOAD_SIZE && t < iter_meta_cnt) { // avoid last iter load garbage data load128((const uint64_t*) (g_meta + (meta_s + t)), reg.u64[0], reg.u64[1]); storeShmem128(shmemCvtPtr((uint64_t *)(s_meta + (w * PPW + t))), reg.u64[0], reg.u64[1]); } __syncwarp(); for (int x = 0; x < iter_meta_cnt; x++) { int meta_idx = x + w * PPW; // load page offs loadShmem128(shmemCvtPtr((uint64_t*) (s_meta + meta_idx)), meta.r64[0], meta.r64[1]); if (meta.len >= DATA_LOAD_SIZE) { // fast path, but need to adapt to alignment issue // bulk copy data uint8_t align_off = (meta.src_off | meta.dst_off) % DATA_LOAD_SIZE; align_off = align_off & -align_off; // keep the lowest bit if (align_off == 0) { // 0x16 bulkLoad<16>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, ®, w, g_meta, s_meta, meta.src_off, meta.dst_off); } else if (align_off & 0x8) { bulkLoad<8>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<8>*) ®, w, g_meta, s_meta, meta.src_off, meta.dst_off); } else if (align_off & 0x4) { bulkLoad<4>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<4>*) ®, w, g_meta, s_meta, meta.src_off, meta.dst_off); } else if (align_off & 0x2) { bulkLoad<2>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<2>*) ®, w, g_meta, s_meta, meta.src_off, meta.dst_off); } else { // if (align_off & 0x1) bulkLoad<1>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<1>*) ®, w, g_meta, s_meta, meta.src_off, meta.dst_off); } } // must be less than 16 bytes if (t < meta.len % DATA_LOAD_SIZE) { volatile char* cpy_src = (char*) bounce_buf + meta.src_off + (meta.len / DATA_LOAD_SIZE) * DATA_LOAD_SIZE + t; volatile char* cpy_dst = (char*) src + meta.dst_off + (meta.len / DATA_LOAD_SIZE) * DATA_LOAD_SIZE + t; *cpy_dst = *cpy_src; } } __syncwarp(); } } #endif // NET_DEVICE_UNPACK_DEFS_H_ nccl-2.22.3-1/src/device/network/unpack/unpack_defs.h000066400000000000000000000036301463451655400223270ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2023, Google LLC. All rights reserved. * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NET_DEVICE_UNPACK_DEFS_H #define NET_DEVICE_UNPACK_DEFS_H #include #include "device.h" #define NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH 16 union alignas(16) loadMeta { uint64_t r64[2]; struct { uint32_t src_off; uint32_t len; uint64_t dst_off; }; }; static_assert(sizeof(union loadMeta) == 16, "Must be 16-byte aligned"); /****** global memory ******/ #define NET_UNPACK_MAX_QUEUE_DEPTH 16 // MAX_REQUESTS #define NET_UNPACK_MAX_SLICE_SIZE 4194304 // 4MB per Irecv call #define SLICE_PAGE_SIZE 4096 #define NET_UNPACK_MAX_SLICE_PAGES \ (NET_UNPACK_MAX_SLICE_SIZE / SLICE_PAGE_SIZE * 2) // * 2 for slack, wasteful.. struct netUnpackMeta { loadMeta mem[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH][NET_UNPACK_MAX_SLICE_PAGES]; uint64_t cnt[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH]; }; struct unpackNetDeviceHandle { struct netUnpackMeta *meta; // mapped void* bounce_buf; uint64_t head; }; /****** shared memory ******/ #define NET_UNPACK_MAX_GROUPS 16 // Forked from NCCL_MAX_GROUPS in devcomm.h #define NET_UNPACK_MAX_NPEERS 2 // The most you should have is 2 network peers per-group (indexed by index) #define WARP_SHM_PAGE_CNT 4 #define WARP_SHM_SIZE (WARP_SHM_PAGE_CNT * sizeof(union loadMeta)) struct unpackShmem { void* bounce_buf; }; struct unpackGroupShmem { int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv uint64_t head[NET_UNPACK_MAX_NPEERS]; struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy }; #endif // NET_DEVICE_UNPACK_DEFS_H_ nccl-2.22.3-1/src/device/onerank.cu000066400000000000000000000062731463451655400167160ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "alloc.h" #include "collectives.h" #include "common_kernel.h" #include "common.h" #include namespace { template __global__ __launch_bounds__(512, 1) void oneRankReduce(void* dst, void* src, size_t nElts, uint64_t redOpArg, bool redOpArgIsPtr) { using T = typename RedOp::EltType; int tid = threadIdx.x; int tn = blockDim.x; int bid = blockIdx.x; int bn = gridDim.x; // each block/channel gets a roughly equal segment of 16 byte packs constexpr int EltPerPack = 16/sizeof(T); intptr_t i0 = (bid+0)*alignUp(nElts/bn, EltPerPack); intptr_t i1 = (bid+1)*alignUp(nElts/bn, EltPerPack); i0 = min(i0, nElts); i1 = min(i1, nElts); src = (T*)src + i0; dst = (T*)dst + i0; if (redOpArgIsPtr) { if (redOpArg%2 != 0) { redOpArg = *reinterpret_cast(redOpArg); } else if (redOpArg%4 != 0) { redOpArg = *reinterpret_cast(redOpArg); } else if (redOpArg%8 != 0) { redOpArg = *reinterpret_cast(redOpArg); } else { redOpArg = *reinterpret_cast(redOpArg); } } reduceCopy (tid, tn, redOpArg, &redOpArg, true, 1, &src, 1, &dst, i1-i0); } } ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t eltType, cudaStream_t stream) { size_t eltSize = ncclTypeSize(eltType); if (redOp.op != ncclDevPreMulSum) { if (dst != src) { NCCLCHECK(ncclCudaMemcpyAsync((char*)dst, (char*)src, nElts*eltSize, stream)); } return ncclSuccess; } void const* kernel; switch (eltType) { case ncclInt8: kernel = (void const*)&oneRankReduce>; break; case ncclUint8: kernel = (void const*)&oneRankReduce>; break; case ncclInt32: kernel = (void const*)&oneRankReduce>; break; case ncclUint32: kernel = (void const*)&oneRankReduce>; break; case ncclInt64: kernel = (void const*)&oneRankReduce>; break; case ncclUint64: kernel = (void const*)&oneRankReduce>; break; case ncclFloat16: kernel = (void const*)&oneRankReduce>; break; #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: kernel = (void const*)&oneRankReduce>; break; #endif case ncclFloat32: kernel = (void const*)&oneRankReduce>; break; case ncclFloat64: kernel = (void const*)&oneRankReduce>; break; default: return ncclInvalidArgument; } dim3 grid = {0, 1, 1}; grid.x = std::min(32, (int)divUp(nElts*eltSize, 16<<10)); dim3 block = {512, 1, 1}; void* args[5] = {&dst, &src, &nElts, &redOp.scalarArg, &redOp.scalarArgIsPtr}; CUDACHECK(cudaLaunchKernel(kernel, grid, block, args, 0, stream)); return ncclSuccess; } nccl-2.22.3-1/src/device/op128.h000066400000000000000000000376571463451655400157640ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef OP128_H_ #define OP128_H_ #include inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) { asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v0), "=l"(v1) : "l"(ptr)); } inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) { asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};" :: "l"(v0), "l"(v1), "l"(ptr)); } inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) { uint64_t* shmemAsmPtr; asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr)); return shmemAsmPtr; } inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) { asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];" : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr)); } inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) { asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};" :: "l"(v0), "l"(v1), "l"(shmemAsmPtr)); } template inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1) { union { uint32_t tmp4[4]; uint64_t tmp8[2]; }; if(sizeof(T) < 4) { uint32_t *ptr4 = reinterpret_cast(reinterpret_cast(ptr) & -uintptr_t(4)); #pragma unroll for(int e=0; e < 4; e++) { // Produce 4 bytes of sub-register type by reading 2 4-byte // aligned values and shifting. uint32_t lo, hi; asm("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0)); asm("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1)); tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast(ptr))%4)); } } else if(sizeof(T) == 4) { #pragma unroll for(int e=0; e < 4; e++) asm("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e)); } else /*sizeof(T)==8*/ { #pragma unroll for(int e=0; e < 2; e++) asm("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e)); } v0 = tmp8[0]; v1 = tmp8[1]; } template __device__ __forceinline__ uint32_t cvta_to_shared(T* ptr) { return (uint32_t)__cvta_generic_to_shared(ptr); } template __device__ __forceinline__ uintptr_t cvta_to_global(T* ptr) { return (uintptr_t)__cvta_generic_to_global(ptr); } template __device__ __forceinline__ T* cvta_from_shared(uint32_t shptr) { T* ans; asm("cvta.shared.u64 %0, %1;" : "=l"(ans) : "l"(uint64_t(shptr))); return ans; } template __device__ __forceinline__ T* cvta_from_global(uintptr_t gptr) { T* ans; asm("cvta.global.u64 %0, %1;" : "=l"(ans) : "l"(gptr)); return ans; } //////////////////////////////////////////////////////////////////////////////// // BytePack: struct of bytes. 
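// ---------------------------------------------------------------------------
// Illustrative sketch (added for documentation; not part of the original
// header). It recaps the 128-bit helpers defined above by pairing one
// volatile 16-byte global load with one volatile 16-byte global store.
// The function name is hypothetical and assumes 16-byte-aligned pointers.
inline __device__ void copyLine16Demo(uint64_t* dst, const uint64_t* src) {
  uint64_t v0, v1;
  load128(src, v0, v1);   // ld.volatile.global.v2.u64
  store128(dst, v0, v1);  // st.volatile.global.v2.u64
}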
template union BytePack; template<> union BytePack<0> {}; template<> union BytePack<1> { uint8_t u8, native; }; template<> union BytePack<2> { BytePack<1> half[2]; uint8_t u8[2]; uint16_t u16, native; }; template<> union BytePack<4> { BytePack<2> half[2]; uint8_t u8[4]; uint16_t u16[2]; uint32_t u32, native; }; template<> union BytePack<8> { BytePack<4> half[2]; uint8_t u8[8]; uint16_t u16[4]; uint32_t u32[2]; uint64_t u64, native; }; template<> union alignas(16) BytePack<16> { BytePack<8> half[2]; uint8_t u8[16]; uint16_t u16[8]; uint32_t u32[4]; uint64_t u64[2]; ulong2 ul2, native; }; template struct BytePackOf { static constexpr int Size = sizeof(T); using Pack = BytePack; }; template<> struct BytePackOf> { static constexpr int Size = 0; using Pack = BytePack<0>; }; template __device__ __forceinline__ typename BytePackOf::Pack toPack(T value) { union { typename BytePackOf::Pack p; T v; }; v = value; return p; } template __device__ __forceinline__ T fromPack(typename BytePackOf::Pack pack) { union { typename BytePackOf::Pack p; T v; }; p = pack; return v; } //////////////////////////////////////////////////////////////////////////////// // Load/store of BytePack using integral addresses. template __device__ BytePack ld_global(uintptr_t addr); template __device__ BytePack ld_shared(uint32_t addr); template __device__ BytePack ld_volatile_global(uintptr_t addr); template __device__ BytePack ld_volatile_shared(uint32_t addr); template __device__ BytePack ld_relaxed_gpu_global(uintptr_t addr); template __device__ void st_global(uintptr_t addr, BytePack value); template __device__ void st_shared(uint32_t addr, BytePack value); template __device__ void st_relaxed_gpu_global(uintptr_t addr, BytePack value); template<> __device__ __forceinline__ BytePack<0> ld_global<0>(uintptr_t addr) { return {}; } template<> __device__ __forceinline__ BytePack<0> ld_shared<0>(uint32_t addr) { return {}; } template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; } template<> __device__ __forceinline__ BytePack<0> ld_volatile_shared<0>(uint32_t addr) { return {}; } template<> __device__ __forceinline__ BytePack<0> ld_relaxed_gpu_global<0>(uintptr_t addr) { return {}; } template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack<0> value) {} template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<0> value) {} template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t addr, BytePack<0> value) {} // Used to define implementations for above prototypes. #define DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \ template<> \ __device__ __forceinline__ BytePack ld_##space(addr_cxx_ty addr) { \ data_cxx_ty tmp; \ asm("ld." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \ BytePack ans; \ ans.native = tmp; \ return ans; \ } \ template<> \ __device__ __forceinline__ BytePack ld_volatile_##space(addr_cxx_ty addr) { \ data_cxx_ty tmp; \ asm("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \ BytePack ans; \ ans.native = tmp; \ return ans; \ } \ template<> \ __device__ __forceinline__ void st_##space(addr_cxx_ty addr, BytePack value) { \ data_cxx_ty tmp = value.native; \ asm volatile("st." #space "." 
#data_ptx_ty " [%0], %1;" :: #addr_reg_ty(addr), #data_reg_ty(tmp) : "memory"); \ } #if __CUDA_ARCH__ >= 700 #define PTX_relaxed_gpu "relaxed.gpu" #else #define PTX_relaxed_gpu "volatile" #endif #define DEFINE_ld_st_gpu_relaxed__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) \ template<> \ __device__ __forceinline__ BytePack ld_relaxed_gpu_global(uintptr_t addr) { \ data_cxx_ty tmp; \ asm("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr)); \ BytePack ans; \ ans.native = tmp; \ return ans; \ } \ template<> \ __device__ __forceinline__ void st_relaxed_gpu_global(uintptr_t addr, BytePack value) { \ data_cxx_ty tmp = value.native; \ asm volatile("st." PTX_relaxed_gpu ".global." #data_ptx_ty " [%0], %1;" :: "l"(addr), #data_reg_ty(tmp) : "memory"); \ } #define DEFINE_ld_st__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) \ DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, global, uintptr_t, l) \ DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, shared, uint32_t, r) \ DEFINE_ld_st_gpu_relaxed__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) // Single-byte types use 4-byte registers since there is no 1-byte register // character for asm blocks. See https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints DEFINE_ld_st__size(1, uint32_t, b8, r) DEFINE_ld_st__size(2, uint16_t, b16, h) DEFINE_ld_st__size(4, uint32_t, b32, r) DEFINE_ld_st__size(8, uint64_t, b64, l) #undef DEFINE_ld_st__size_space #undef DEFINE_ld_st__size #define DEFINE_ld_st_16__space(space, addr_cxx_ty, addr_reg_ty) \ template<> \ __device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \ BytePack<16> ans; \ asm("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \ return ans; \ } \ template<> \ __device__ __forceinline__ BytePack<16> ld_volatile_##space<16>(addr_cxx_ty addr) { \ BytePack<16> ans; \ asm("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \ return ans; \ } \ template<> \ __device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \ asm("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \ } DEFINE_ld_st_16__space(global, uintptr_t, l) DEFINE_ld_st_16__space(shared, uint32_t, r) #undef DEFINE_ld_st_16 template<> __device__ __forceinline__ BytePack<16> ld_relaxed_gpu_global<16>(uintptr_t addr) { BytePack<16> ans; asm("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr)); return ans; } template<> __device__ __forceinline__ void st_relaxed_gpu_global<16>(uintptr_t addr, BytePack<16> value) { asm volatile("st." PTX_relaxed_gpu ".global.v2.b64 [%0], {%1,%2};" :: "l"(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); } #undef PTX_relaxed_gpu //////////////////////////////////////////////////////////////////////////////// // Atomic load/store using c++ pointers. 
__device__ __forceinline__ uint64_t ld_volatile_global(uint64_t *ptr) { uint64_t ans; asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); return ans; } __device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) { uint64_t ans; #if __CUDA_ARCH__ >= 700 asm("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); #else asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); #endif return ans; } __device__ __forceinline__ uint64_t ld_relaxed_gpu_global(uint64_t *ptr) { uint64_t ans; #if __CUDA_ARCH__ >= 700 asm("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); #else asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); #endif return ans; } __device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) { uint64_t ans; #if __CUDA_ARCH__ >= 700 asm("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); #else asm("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr))); #endif return ans; } __device__ __forceinline__ void st_volatile_global(uint64_t *ptr, uint64_t val) { asm volatile("st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory"); } __device__ __forceinline__ void st_relaxed_sys_global(uint64_t *ptr, uint64_t val) { #if __CUDA_ARCH__ >= 700 asm volatile("st.relaxed.sys.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory"); #else asm volatile("st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory"); #endif } __device__ __forceinline__ void st_release_sys_global(uint64_t *ptr, uint64_t val) { #if __CUDA_ARCH__ >= 700 asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory"); #else asm volatile("membar.sys; st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory"); #endif } __device__ __forceinline__ void fence_acq_rel_sys() { #if __CUDA_ARCH__ >= 700 asm volatile("fence.acq_rel.sys;" ::: "memory"); #else asm volatile("membar.sys;" ::: "memory"); #endif } __device__ __forceinline__ void fence_acq_rel_gpu() { #if __CUDA_ARCH__ >= 700 asm volatile("fence.acq_rel.gpu;" ::: "memory"); #else asm volatile("membar.gl;" ::: "memory"); #endif } //////////////////////////////////////////////////////////////////////////////// // Multimem stores of BytePack. 
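// Illustrative sketch (added for documentation; not part of the original
// header): a minimal producer/consumer step handshake built from the
// release/acquire helpers defined above, mirroring how head/tail step
// counters are exchanged elsewhere in NCCL. Both function names are
// hypothetical; `flag` is assumed to point at system-visible memory.
inline __device__ void publishStepDemo(uint64_t* flag, uint64_t step) {
  // Release: make prior data stores visible before advertising `step`.
  st_release_sys_global(flag, step);
}
inline __device__ void waitStepDemo(uint64_t* flag, uint64_t step) {
  // Acquire: spin until the peer has published at least `step`.
  while (ld_acquire_sys_global(flag) < step) { /* spin */ }
}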
template __device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack val); #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 template<> __device__ __forceinline__ void multimem_st_global<0>(uintptr_t addr, BytePack<0> val) { // nop } template<> __device__ __forceinline__ void multimem_st_global<1>(uintptr_t addr, BytePack<1> val) { asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.u8) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<2>(uintptr_t addr, BytePack<2> val) { asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.u16) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) { asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<8>(uintptr_t addr, BytePack<8> val) { asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.u64) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<16>(uintptr_t addr, BytePack<16> val) { asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" :: "l"(addr), "r"(val.u32[0]), "r"(val.u32[1]), "r"(val.u32[2]), "r"(val.u32[3]) : "memory"); } #else template __device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack val) { // nop } #endif // Warp-uniform memory copy from shared address (not generic) to global memory. // The number of bytes copied is `min(MaxBytes, nBytesAhead)`, a negative value // is interpeted as zero. EltSize is the guaranteed alignment of the addresses and sizes. template __device__ __forceinline__ void copyGlobalShared_WarpUnrolled( int lane, uintptr_t dstAddr, uint32_t srcAddr, IntBytes nBytesAhead ) { static_assert(std::is_signed::value, "`IntBytes` must be a signed integral type."); int nBytes = min(nBytesAhead, (IntBytes)MaxBytes); int nFrontBytes = min(nBytes, (16 - int(dstAddr%16))%16); int nMiddleBytes = (nBytes-nFrontBytes) & -16; int nBackBytes = (nBytes-nFrontBytes) % 16; { int backLane = WARP_SIZE-1 - lane; bool hasFront = lane*EltSize < nFrontBytes; bool hasBack = backLane*EltSize < nBackBytes; int offset = hasFront ? lane*EltSize : (nBytes - (backLane+1)*EltSize); if (hasFront | hasBack) { BytePack tmp = ld_shared(srcAddr+offset); // Can't use multimem_st since it doesn't support EltSize==2 st_global(dstAddr+offset, tmp); } } srcAddr += nFrontBytes; int srcMisalign = EltSize < 4 ? 
(srcAddr%4) : 0; srcAddr += -srcMisalign + lane*16; dstAddr += nFrontBytes + lane*16; nMiddleBytes -= lane*16; #pragma unroll for (int u=0; u < divUp(MaxBytes, WARP_SIZE*16); u++) { if (nMiddleBytes <= 0) break; union { BytePack<4> b4[4]; BytePack<16> b16; }; b4[0] = ld_shared<4>(srcAddr + 0*4); b4[1] = ld_shared<4>(srcAddr + 1*4); b4[2] = ld_shared<4>(srcAddr + 2*4); b4[3] = ld_shared<4>(srcAddr + 3*4); if (srcMisalign != 0) { BytePack<4> b4_4 = ld_shared<4>(srcAddr + 4*4); b4[0].u32 = __funnelshift_r(b4[0].u32, b4[1].u32, srcMisalign*8); b4[1].u32 = __funnelshift_r(b4[1].u32, b4[2].u32, srcMisalign*8); b4[2].u32 = __funnelshift_r(b4[2].u32, b4[3].u32, srcMisalign*8); b4[3].u32 = __funnelshift_r(b4[3].u32, b4_4.u32, srcMisalign*8); } if (Multimem) multimem_st_global<16>(dstAddr, b16); else st_global<16>(dstAddr, b16); srcAddr += WARP_SIZE*16; dstAddr += WARP_SIZE*16; nMiddleBytes -= WARP_SIZE*16; } } #endif nccl-2.22.3-1/src/device/primitives.h000066400000000000000000000126751463451655400172770ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_PRIMITIVES_H_ #define NCCL_PRIMITIVES_H_ #include #include "reduce_kernel.h" // for reduction funcs #include "common_kernel.h" #include "common.h" #define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000 /* Protocol classes: ProtoSimple, ProtoLL, ProtoLL128 * We use these as template args to the Primtiives class instead of integral * enums (e.g. NCCL_PROTO_LL) because for SIMPLE we need to carry a few extra * numbers. Also these types hold methods which let us compute numbers important * to how that protocol operates with a consistent interface so that our * algorithm code can operate protocol parametrically. */ template struct ProtoSimple { static constexpr int Id = NCCL_PROTO_SIMPLE; static constexpr int SlicePerChunk = SlicePerChunk_1; static constexpr int StepPerSlice = StepPerSlice_1; static constexpr int Unroll = Unroll_1; static constexpr int MultimemSrcs = MultimemSrcs_1; static constexpr int MultimemDsts = MultimemDsts_1; // Data bytes (no flags etc) in one step of the fifo queue. __device__ static int calcBytePerStep() { return ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; } // Granularity of data bytes transferred per thread. __device__ static int calcBytePerGrain() { return sizeof(uint64_t); // Bogus value? Nobody queries this metric for simple. } // Group width is how many consecutive group values a subchannel occupies. static constexpr int MaxGroupWidth = 2; }; struct ProtoLL { static constexpr int Id = NCCL_PROTO_LL; // Data bytes (no flags etc) in one step of the fifo queue. __device__ static int calcBytePerStep() { return ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/2; // Half is data } // Granularity of data bytes transferred per thread. __device__ static int calcBytePerGrain() { return sizeof(uint64_t); // One 16-byte line has 8-bytes of data } // Group width is how many consecutive group values a subchannel occupies. static constexpr int MaxGroupWidth = 1; }; struct ProtoLL128 { static constexpr int Id = NCCL_PROTO_LL128; // Data bytes (no flags etc) in one step of the fifo queue. 
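  // Added note: LL128 reserves one flag word per line, so only
  // NCCL_LL128_DATAELEMS of every NCCL_LL128_LINEELEMS 8-byte words carry
  // payload. With the usual 15-of-16 split, one step therefore carries
  // roughly buffSize/NCCL_STEPS * 15/16 bytes of user data.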
__device__ static int calcBytePerStep() { return (ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS)*NCCL_LL128_DATAELEMS/NCCL_LL128_LINEELEMS; } // Granularity of data bytes transferred per thread. __device__ static int calcBytePerGrain() { return NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_DATAELEMS*sizeof(uint64_t)/NCCL_LL128_LINEELEMS; } // Group width is how many consecutive group values a subchannel occupies. static constexpr int MaxGroupWidth = 1; }; /* Fan (as in fan-in & fan-out) classes hold recv and send counts. The template * arguments are static bounds on the maximum values. Asymmetric counts are * independent. Symmetric is a static guarantee that nrecv==nsend, so it only * stores one value at runtime. This optimization save 32-bit register, but more * importantly uses fewer predicate registers when unrolling loops. */ template struct FanAsymmetric { static constexpr int MaxRecv = MaxRecv_, MaxSend = MaxSend_; int nr, ns; FanAsymmetric() = default; __device__ FanAsymmetric(int nrecv, int nsend): nr(nrecv), ns(nsend) { // assert(nrecv <= MaxRecv && nsend <= MaxSend); } __device__ int nrecv() const { return MaxRecv ? nr : 0; } __device__ int nsend() const { return MaxSend ? ns : 0; } }; template struct FanSymmetric { static constexpr int MaxRecv = MaxArity, MaxSend = MaxArity; int n; FanSymmetric() = default; __device__ FanSymmetric(int nrecv, int nsend): n(nrecv) { // assert(nrecv == nsend && nrecv <= MaxArity); } __device__ int nrecv() const { return n; } __device__ int nsend() const { return n; } }; // The primitives class. Specialized per protocol in the other headers. template class Primitives; // Used by LL & LL128 to implement direct members in the naive way. template struct PrimitivesWithoutDirect { __device__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) { static_cast(this)->send(inpIx, eltN); } __device__ void directSendFromOutput(intptr_t outIx, int eltN) { static_cast(this)->sendFromOutput(outIx, eltN); } __device__ void directRecv(intptr_t outIx, int eltN) { static_cast(this)->recv(outIx, eltN, /*postOp=*/false); } __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { static_cast(this)->copySend(inpIx, outIx, eltN, postOp); } __device__ void directRecvCopySend(intptr_t outIx, int eltN) { static_cast(this)->recvCopySend(outIx, eltN, /*postOp=*/false); } __device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { // Direct is only for the send part static_cast(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp); } }; #include "prims_simple.h" #include "prims_ll.h" #include "prims_ll128.h" #endif nccl-2.22.3-1/src/device/prims_ll.h000066400000000000000000000327431463451655400167230ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ template class Primitives: public PrimitivesWithoutDirect> { // In the case of Fan::MaxRecv == 0, we need to force MaxRecv to 1 for this to compile // This is because of a recv buffer which is allocated to MaxRecv length in send-only cases static constexpr int MaxRecv = Fan::MaxRecv > 1 ? 
Fan::MaxRecv : 1; static constexpr int MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; RedOp redOp; const int tid; const int nthreads; const int wid; const int group; const int stepLines; Fan fan; T *userBufs[2]; struct ncclConnInfo* recvConn = NULL; volatile uint64_t* recvConnHeadPtr = NULL; uint64_t recvConnHead; struct ncclConnInfo* sendConn = NULL; volatile struct ncclConnFifo* sendConnFifo = NULL; volatile uint64_t* sendConnHeadPtr = NULL; uint64_t sendConnHead; uint64_t sendConnHeadCache; // Cache last seen value uint64_t recvStep[MaxRecv]; uint64_t sendStep[MaxSend]; union ncclLLFifoLine* recvBuff[MaxRecv]; union ncclLLFifoLine* sendBuff[MaxSend]; inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepLines; } inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepLines; } inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); } inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } inline __device__ void barrier() { if (nthreads == WARP_SIZE) { __syncwarp(); } else { barrier_sync(15-group, nthreads); } } uint32_t abort = 0; inline __device__ int checkAbort(int &spins, int send) { spins++; if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { abort = *ncclShmem.comm.abortFlag; spins = 0; } return abort; } inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; if (checkAbort(spins, 1)) break; } if (sendConnFifo) { int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes; sendConnFifo[sendConnHead%NCCL_STEPS].size = size; } sendConnHead += 1; } barrier(); } inline __device__ void incRecv(int i) { recvStep[i] += 1; } inline __device__ void postRecv() { barrier(); if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1; } inline __device__ void incSend(int i, int offset) { // LL Cleanup : write all flags in the slice to make sure we don't have // data corruption when flag loops over. 
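    // Added note: each LL line stores two 4-byte data words, each paired with
    // a 4-byte flag derived from the step counter (NCCL_LL_FLAG). When the
    // step reaches the NCCL_LL_CLEAN_MASK boundary, the loop below writes
    // flags for every line in the slice, so a receiver can never match a
    // stale flag once the flag value eventually wraps.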
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) { for (int o = offset; oi4)); if (checkAbort(spins, 0)) break; } while ((flag1 != flag) || (flag2 != flag)); uint64_t val64 = data1 + (((uint64_t)data2) << 32); return val64; } template __device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) { #pragma unroll for (int i=BeginIx; i < MaxRecv; i++) { if (i < fan.nrecv()) { union ncclLLFifoLine* src = recvPtr(i) + offset; asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4)); } } } __device__ uint64_t readLLFinish(int offset, ncclLLFifoLine(&line)[MaxRecv], int i) { union ncclLLFifoLine* src = recvPtr(i) + offset; uint32_t flag = recvFlag(i); int spins = 0; while (line[i].flag1 != flag || line[i].flag2 != flag) { asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4)); if (checkAbort(spins, 0)) break; } uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32); return val64; } __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); } static constexpr int EltPerLine = sizeof(uint64_t)/sizeof(T); template __device__ static U load(U *src) { union { U elt; uint16_t u2; uint32_t u4; uint64_t u8; }; if(sizeof(U) == 1) asm("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src)); else if(sizeof(U) == 2) asm("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src)); else if(sizeof(U) == 4) asm("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src)); else asm("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src)); return elt; } template __device__ static void store(U *dst, U val) { union { U elt; uint16_t u2; uint32_t u4; uint64_t u8; }; elt = val; if(sizeof(U) == 1) asm("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4)); else if(sizeof(U) == 2) asm("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2)); else if(sizeof(U) == 4) asm("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4)); else asm("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8)); } struct DataLoader { int misalign; union { uint32_t u4[sizeof(T) <= 2 ? 3 : 2]; uint64_t u8; T elt[EltPerLine]; }; __device__ void loadBegin(T *src, int eltN) { if (sizeof(T) <= 2) { misalign = reinterpret_cast(src)%4; uint32_t *p = reinterpret_cast(reinterpret_cast(src) & -uintptr_t(4)); u4[0] = load(p+0); u4[1] = misalign + eltN*sizeof(T) > 4 ? load(p+1) : 0; // u4[2] would be simpler, but that throws warnings on some compilers u4[sizeof(T) <= 2 ? 2 : 0] = misalign + eltN*sizeof(T) > 8 ? load(p+2) : 0; } else { #pragma unroll for(int i=0; i < EltPerLine; i++) { if(i==0 || i < eltN) elt[i] = load(src + i); } } } __device__ uint64_t loadFinish() { if (sizeof(T) <= 2) { u4[0] = __funnelshift_r(u4[0], u4[1], 8*misalign); // u4[2] would be simpler, but that throws warnings on some compilers u4[1] = __funnelshift_r(u4[1], u4[sizeof(T) <= 2 ? 
2 : 0], 8*misalign); } return u8; } }; __device__ void storeData(T *dst, uint64_t val, int eltN) { union { uint64_t u8; T elt[EltPerLine]; }; u8 = val; #pragma unroll for(int i=0; i < EltPerLine; i++) { if (i==0 || i < eltN) //store(dst+i, elt[i]); dst[i] = elt[i]; } } template __device__ __forceinline__ void LLGenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) { constexpr int SRC = SrcBuf != -1 ? 1 : 0; constexpr int DST = DstBuf != -1 ? 1 : 0; T *srcElts = SrcBuf == -1 ? nullptr : userBufs[SrcBuf] + srcIx; T *dstElts = DstBuf == -1 ? nullptr : userBufs[DstBuf] + dstIx; // Always waitSend in case of cleanup nelem = nelem < 0 ? 0 : nelem; if (SEND) waitSend(divUp(nelem, EltPerLine)*sizeof(ncclLLFifoLine)); nelem -= tid*EltPerLine; srcElts += tid*EltPerLine; dstElts += tid*EltPerLine; int offset = tid; int eltPerTrip = nthreads*EltPerLine; while (nelem > 0) { int eltInLine = EltPerLine < nelem ? EltPerLine : nelem; DataLoader dl; ncclLLFifoLine line[MaxRecv]; uint64_t data, peerData; if (SRC) { dl.loadBegin(srcElts, eltInLine); srcElts += eltPerTrip; } if (RECV) { readLLBeginAll<1>(offset, line); peerData = readLL(offset, 0); } if (SRC) { data = dl.loadFinish(); if (SrcBuf == Input) data = applyPreOp(redOp, data); } if (RECV) { data = !SRC ? peerData : applyReduce(redOp, peerData, data); #pragma unroll MaxRecv for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) { peerData = readLLFinish(offset, line, i); data = applyReduce(redOp, peerData, data); } } if (postOp) data = applyPostOp(redOp, data); // Send : inter-node, then intra-node, then local if (SEND) { for (int i=1; i < MaxSend && i < fan.nsend(); i++) storeLL(sendPtr(i)+offset, data, sendFlag(i)); storeLL(sendPtr(0)+offset, data, sendFlag(0)); } if (DST) { storeData(dstElts, data, eltInLine); dstElts += eltPerTrip; } nelem -= eltPerTrip; offset += nthreads; } if (RECV) { for (int i=0; i < MaxRecv; i++) incRecv(i); postRecv(); } if (SEND) { for (int i=1; i < MaxSend && i < fan.nsend(); i++) incSend(i, offset); incSend(0, offset); } } __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { recvBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL]; recvStep[i] = conn->step; if (wid == i) recvConn = conn; } __device__ __forceinline__ void loadRecvSync() { if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv()) { recvConnHeadPtr = recvConn->head; recvConnHead = recvConn->step; } } __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { sendBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL]; sendStep[i] = conn->step; if (wid == i) sendConn = conn; } __device__ __forceinline__ void loadSendSync() { if (tid < fan.nsend()) { sendConnHeadPtr = sendConn->head; sendConnHeadCache = *sendConnHeadPtr; sendConnHead = sendConn->step; sendConnFifo = sendConn->connFifo; } } public: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, bool userBufReg=false, int stepSize_=0 ): redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) { auto *channel = &ncclShmem.channel; // If we are going to support oneshot collNet + LL, then we would need to add connector index here int nrecv=0, nsend=0; // We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1 while 
(nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) { loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend); nsend++; } this->fan = Fan(nrecv, nsend); loadRecvSync(); loadSendSync(); setDataPtrs(inputBuf, outputBuf); } __device__ ~Primitives() { // Save steps for the next operation if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv()) recvConn->step = recvConnHead; if (tid < fan.nsend()) sendConn->step = sendConnHead; // Ensure all steps written back barrier(); } __device__ void setDataPtrs(void const *inputBuf, void *outputBuf) { userBufs[Input] = (T*)inputBuf; userBufs[Output] = (T*)outputBuf; } __device__ void moveDataPtrs(intptr_t delta) { userBufs[Input] += delta; userBufs[Output] += delta; } __device__ void send(intptr_t inpIx, int eltN) { return LLGenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); } __device__ void sendFromOutput(intptr_t outIx, int eltN) { return LLGenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); } __device__ void recv(intptr_t outIx, int eltN, bool postOp=false) { return LLGenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); } __device__ void recvReduceSend(intptr_t inpIx, int eltN) { return LLGenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); } __device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { return LLGenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { return LLGenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { return LLGenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp); } __device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { return LLGenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); } }; nccl-2.22.3-1/src/device/prims_ll128.h000066400000000000000000000371641463451655400171600ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "op128.h" #define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1) template class Primitives: public PrimitivesWithoutDirect> { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; RedOp redOp; const int tid; // thread index in primitives group const int nthreads; // thread count in primitives group const int wid; // lane index in warp const int stepSize; const int warp; // warp index in primitives group const int warpInBlock; // warp index in thread block const bool flagThread; const int group; Fan fan; T *userBufs[2]; struct ncclConnInfo* recvConn = NULL; volatile uint64_t* recvConnHeadPtr = NULL; uint64_t recvConnHead; struct ncclConnInfo* sendConn = NULL; volatile struct ncclConnFifo* sendConnFifo = NULL; volatile uint64_t* sendConnTailPtr = NULL; uint64_t sendConnTail; volatile uint64_t* sendConnHeadPtr = NULL; uint64_t sendConnHead; uint64_t sendConnHeadCache; // Cache last seen value uint64_t recvStep[MaxRecv]; uint64_t sendStep[MaxSend]; uint64_t* recvBuff[MaxRecv]; uint64_t* sendBuff[MaxSend]; inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; } inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; } inline __device__ void barrier() { barrier_sync(15-group, nthreads); } uint32_t abort = 0; inline __device__ int checkAbort(int &spins, int i, int send) { spins++; if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { abort = *ncclShmem.comm.abortFlag; spins = 0; } return abort; } inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; if (checkAbort(spins, wid, 1)) break; } if (sendConnFifo) { sendConnFifo[sendStep[wid]%NCCL_STEPS].size = nbytes; } sendConnHead += 1; } } inline __device__ void postRecv() { if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1; } inline __device__ void postSend() { if (sendConnTailPtr) { #if __CUDA_ARCH__ >= 900 __threadfence_system(); #else __threadfence(); #endif *sendConnTailPtr = sendConnTail += 1; } } template __device__ __forceinline__ void loadRegsBegin(uint64_t(®s)[WordPerThread], T const *src, int eltN) { constexpr int EltPer16B = 16/sizeof(T); if(reinterpret_cast(src)%16 == 0) { /* We are aligned to 16 bytes, so load directly to registers no shmem. * Flag threads load half as much data which gets shuffled to the even * registers during Finish. The point of splitting into two phases is to * defer that shuffle, which incurs a dependency stall, until after other * memops are launched by the caller. */ #pragma unroll for(int g=0; g < WordPerThread/2; g++) { int ix = g*WARP_SIZE - 4*(g/2) + wid - (g%2)*(wid/8); if(!flagThread || g%2==0) { if(ix*EltPer16B < eltN) load128((uint64_t*)(src + ix*EltPer16B), regs[2*g+0], regs[2*g+1]); } } } else { // Not aligned. Stage the smallest 16 byte aligned region subsuming the // buffer into shmem. 
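      // Added note: the staging below runs in two warp-wide phases:
      //   (1) 16-byte loads of the aligned superset of the buffer into this
      //       warp's scratch shmem (ncclScratchForWarp), then a __syncwarp;
      //   (2) misaligned re-reads from shmem (loadShmemMisaligned128) into the
      //       same pre-shuffled register layout the aligned fast path uses, so
      //       loadRegsFinish() treats both paths identically.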
int misalignment = reinterpret_cast(src) % 16; uint64_t *src8 = reinterpret_cast(reinterpret_cast(src) & -uintptr_t(16)); uint64_t *shm8 = shmemCvtPtr((uint64_t*)ncclScratchForWarp(warpInBlock)); #pragma unroll for(int g=0; g < WordPerThread/2; g++) if((g*WARP_SIZE + wid)*16 < misalignment + eltN*sizeof(T)) load128(src8 + 2*(g*WARP_SIZE + wid), regs[2*g+0], regs[2*g+1]); #pragma unroll for(int g=0; g < WordPerThread/2; g++) storeShmem128(shm8 + 2*(g*WARP_SIZE + wid), regs[2*g+0], regs[2*g+1]); __syncwarp(); // Now load from shmem stage to regs. Preserve the same pre-shuffled layout // as the aligned case since Finish() will be applied regardless. T *shm = (T*)shm8 + misalignment/sizeof(T); #pragma unroll for(int g=0; g < WordPerThread/2; g++) { int ix = g*WARP_SIZE - 4*(g/2) + wid - (g%2)*(wid/8); if(!flagThread || g%2==0) { if(ix*EltPer16B < eltN) loadShmemMisaligned128(shm + ix*EltPer16B, regs[2*g+0], regs[2*g+1]); } } } } template __device__ __forceinline__ void loadRegsFinish(uint64_t(®s)[WordPerThread]) { // Move data out of flag registers into the vacant registers. #pragma unroll for (int g=1; g < WordPerThread/2; g+=2) { if (flagThread) regs[2*g] = regs[2*g-1]; } } template __device__ __forceinline__ void storeRegs(T *dst, uint64_t(®s)[WordPerThread], int eltN) { constexpr int EltPer16B = 16/sizeof(T); // Reverse Finish() register permuatation. #pragma unroll for (int g=1; g < WordPerThread/2; g+=2) { if (flagThread) regs[2*g-1] = regs[2*g]; } // Write to dst if 16-byte aligned, shmem otherwise. int misalignment = reinterpret_cast(dst)%16; uint64_t *shm8 = shmemCvtPtr((uint64_t*)ncclScratchForWarp(warpInBlock)); #pragma unroll for(int g=0; g < WordPerThread/2; g++) { int ix = g*WARP_SIZE - 4*(g/2) + wid - (g%2)*(wid/8); if (!flagThread || g%2==0) { if(misalignment == 0 && (ix+1)*EltPer16B <= eltN) store128((uint64_t*)(dst + ix*EltPer16B), regs[2*g+0], regs[2*g+1]); else storeShmem128(shm8+2*ix, regs[2*g+0], regs[2*g+1]); } } __syncwarp(); // Write rest from shmem to dst. No need to coalesce stores to 16-bytes, // the hardware keeps up fine. T *shm = (T*)ncclScratchForWarp(warpInBlock); int skip = misalignment == 0 ? eltN & -EltPer16B : 0; for(int i=skip+wid; i < eltN; i += WARP_SIZE) dst[i] = shm[i]; } #define WARP_MASK 0xffffffff template __device__ __forceinline__ void recvReduceSendCopy(uint64_t(&v)[ELEMS_PER_THREAD], int ll128Offset, bool postOp) { constexpr int SRC = SrcBuf != -1 ? 1 : 0; uint64_t vr[ELEMS_PER_THREAD]; __syncwarp(); /************************ Wait first recv ********************/ if (RECV) { uint64_t* ptr = recvPtr(0)+ll128Offset; uint64_t flag = recvFlag(0); bool needReload; int spins = 0; do { needReload = false; #pragma unroll for (int u=0; u __device__ __forceinline__ void GenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) { constexpr int SRC = SrcBuf != -1 ? 1 : 0; constexpr int DST = DstBuf != -1 ? 1 : 0; T const *srcPtr = SrcBuf == -1 ? nullptr : userBufs[SrcBuf] + srcIx; T *dstPtr = DstBuf == -1 ? nullptr : userBufs[DstBuf] + dstIx; int wireOffset = WireWordPerSlice*warp + 2*wid; const int nwarps = nthreads/WARP_SIZE; nelem = nelem < 0 ? 
0 : nelem; if (SEND) waitSend(divUp(nelem, DataEltPerSlice)*WireWordPerSlice*sizeof(uint64_t)); barrier(); nelem -= DataEltPerSlice*warp; srcPtr += DataEltPerSlice*warp; dstPtr += DataEltPerSlice*warp; while (nelem > 0) { const int eltInSlice = min(nelem, DataEltPerSlice); uint64_t regs[NCCL_LL128_SHMEM_ELEMS_PER_THREAD]; if (SRC) loadRegsBegin(regs, srcPtr, eltInSlice); recvReduceSendCopy(regs, wireOffset, postOp); if (DST) storeRegs(dstPtr, regs, eltInSlice); wireOffset += WireWordPerSlice*nwarps; srcPtr += DataEltPerSlice*nwarps; dstPtr += DataEltPerSlice*nwarps; nelem -= DataEltPerSlice*nwarps; } barrier(); if (SEND) for (int i=0; i < MaxSend; i++) sendStep[i] += 1; if (SEND) postSend(); if (RECV) for (int i=0; i < MaxRecv; i++) recvStep[i] += 1; if (RECV) postRecv(); } __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { recvBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128]; recvStep[i] = conn->step; if (wid == i) recvConn = conn; } __device__ __forceinline__ void loadRecvSync() { if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv()) { recvConnHeadPtr = recvConn->head; recvConnHead = recvConn->step; } } __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { sendBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128]; sendStep[i] = conn->step; if (wid == i) sendConn = conn; } __device__ __forceinline__ void loadSendSync() { if (tid < fan.nsend()) { sendConnHeadPtr = sendConn->head; sendConnHeadCache = *sendConnHeadPtr; sendConnHead = sendConn->step; sendConnFifo = sendConn->connFifo; } if (tid >= nthreads-WARP_SIZE && widconnFifo) { sendConnTailPtr = sendConn->tail; sendConnTail = sendConn->step; } } } public: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr, bool userBufReg=false, int stepSize_=0 ): redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), warpInBlock(threadIdx.x/WARP_SIZE), flagThread((tid%8)==7), group(group), stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) { auto *channel = &ncclShmem.channel; int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) { loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend); nsend++; } this->fan = Fan(nrecv, nsend); loadRecvSync(); loadSendSync(); setDataPtrs(inputBuf, outputBuf); } __device__ ~Primitives() { // Save steps for the next operation if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv()) recvConn->step = recvConnHead; if (tid < fan.nsend()) sendConn->step = sendConnHead; // Ensure all steps written back barrier(); } __device__ void setDataPtrs(void const *inputBuf, void *outputBuf) { userBufs[Input] = (T*)inputBuf; userBufs[Output] = (T*)outputBuf; } __device__ void moveDataPtrs(intptr_t delta) { userBufs[Input] += delta; userBufs[Output] += delta; } __device__ void send(intptr_t inpIx, int eltN) { return GenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); } __device__ void sendFromOutput(intptr_t outIx, int eltN) { return GenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); } __device__ void recv(intptr_t outIx, int eltN, bool postOp=false) { return GenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); } __device__ void 
recvReduceSend(intptr_t inpIx, int eltN) { return GenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); } __device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { return GenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { return GenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { return GenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp); } __device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { return GenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); } }; nccl-2.22.3-1/src/device/prims_simple.h000066400000000000000000001063061463451655400176020ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "network/unpack/unpack.h" #include template class Primitives< T, RedOp, Fan, Direct, ProtoSimple, P2p > { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; static constexpr int RoleWaitRecv = 0x04, // 0x1 0x2 are free to use RoleWaitSend = 0x08, RolePostSend = 0x10, RolePostRecv = 0x20, Aborted = 0x40, UserBufferMode = 0x80, ConnFifoEnabled = 0x100, DirectWrite = 0x200, DirectRead = 0x400, // 0x800 is free to use NvlsMinPolling = 0x1000, NetDeviceUnpack = 0x2000, AnyNetDeviceUnpack = 0x4000, NvlsDirectRead = 0x8000, NvlsDirectWrite = 0x10000; const int tid, tidInBlock; const int nthreads; int nworkers; const int stepSize; Fan fan; int index; // Peer index I'm responsible for int flags; int group; uint64_t step; struct ncclConnFifo* connFifo = NULL; T* connEltsFifo; T* directBuff; uint64_t *connStepPtr; uint64_t connStepCache; // Cache last seen value of (*connStepPtr) int connStepSize; // Connection step size void* netDeviceHandle; // Don't use barrier 0 as it's used by the final sync __device__ void barrier() { if (nthreads == WARP_SIZE) __syncwarp(); else { int bar = 15-group; barrier_sync(bar, nthreads); } } __device__ void subBarrier() { if (nworkers == WARP_SIZE) __syncwarp(); else { int bar = 15-group - (nworkers!=nthreads ? 1 : 0); barrier_sync(bar, nworkers); } } __device__ bool barrierAny(int vote) { if (nthreads == WARP_SIZE) { return __any_sync(~0u, vote); } else { int name = 15-group; return barrier_red_or(vote, name, nthreads); } } __device__ bool subBarrierAny(int vote) { if (nworkers == WARP_SIZE) { return __any_sync(~0u, vote); } else { int name = 15-group - (nworkers!=nthreads ? 1 : 0); return barrier_red_or(vote, name, nworkers); } } inline __device__ bool checkAbort(int &spins) { spins++; if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { if (*ncclShmem.comm.abortFlag) { flags |= Aborted; ncclShmem.aborted = 1; } spins = 0; } return flags & Aborted; } inline __device__ uint64_t loadStepValue(uint64_t* ptr) { #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 if (flags & NvlsMinPolling) { uint64_t ans; asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); return ans; } #endif // volatile is faster than acquire but not as correct. Make sure reduceCopy // loads data using volatile so it doesn't see stale data in L1. 
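    // Added note: the NVLS branch above returns a multimem min-reduction,
    // i.e. the step of the slowest peer reachable through the multicast
    // mapping; this fallback reads a single peer's step counter directly.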
return ld_volatile_global(ptr); } template __device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) { const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send; const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) || ((flags & (Send*RoleWaitSend)) && !noSendWait)) { int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); if (checkAbort(spins)) break; //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice)); } } if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) { if (flags & ConnFifoEnabled) connFifo[step%NCCL_STEPS].size = nelts*sizeof(T); void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst) : (ncclShmem.groups[group].srcs + Src); if (flags & UserBufferMode) { // Do nothing } else if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) { ptrs[index] = connEltsFifo + loadInt(&connFifo[step%NCCL_STEPS].offset)/sizeof(T); } else if (isSendNotRecv && DirectSend) { if (flags & (DirectWrite | NvlsDirectWrite)) { ptrs[index] = directBuff + dstIx + offset; } else if (flags & DirectRead) { // empty send ptrs[index] = nullptr; } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } } else if (!isSendNotRecv && DirectRecv) { if (flags & (DirectRead | NvlsDirectRead)) { ptrs[index] = directBuff + srcIx + offset; } else if (flags & DirectWrite) { ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } if (flags & NetDeviceUnpack) { ncclNetDeviceIncrementHead(group, index); } step += StepPerSlice; } } template inline __device__ void postPeer(bool dataStored) { if (flags & (Recv*RolePostRecv | Send*RolePostSend)) { step += StepPerSlice; if (Send && (flags & RolePostSend) && (dataStored||(flags&ConnFifoEnabled))) { fence_acq_rel_sys(); } st_relaxed_sys_global(connStepPtr, step); } } template __device__ __forceinline__ void genericOp( intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp ) { constexpr int DirectRecv = 1 && Direct && DirectRecv1; constexpr int DirectSend = 1 && Direct && DirectSend1; constexpr int Src = SrcBuf != -1; constexpr int Dst = DstBuf != -1; nelem = nelem < 0 ? 0 : nelem; int sliceSize = stepSize*StepPerSlice; sliceSize = max(divUp(nelem, 16*SlicePerChunk)*16, sliceSize/32); int slice = 0; int offset = 0; if (tid < nworkers && offset < nelem && ((flags & UserBufferMode) == 0)) { // Worker-only loop for non-empty slices. Non-workers and empty slices are // processed in the loop following this if block. The benefit of splitting // the loop like this is we pull two branches out of the critical path. // Using "number of branch insns (taken or not) encountered dynamically" // as the performance metric, then: // perf_orig = 2*numslices // perf_new = 2+numslices // So the new code and old code behave the same for numslices=2, and for // numslices>2 the new code is superior. 
And note that in the case // numslices=1, the loop is trivially unrollable (single iteration) so we // don't incur that that tail branch and we still have perf_new=2. // // ORIGINAL CODE: // unrolled for(slices) { // if(worker) { // This branch removed // wait(); // subBarrier(); // if(slice not empty) // This branch removed // ReduceCopyMulti(); // } // barrier(); // post(); // } // Since we no longer unroll, new branch added here #if __CUDA_ARCH__ < 700 // Above doesn't matter on older hardware. #pragma unroll SlicePerChunk #else #pragma unroll 1 #endif do { sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset; if (tid == 0) { T* userInput = (T*)ncclShmem.groups[group].userInput; T* userOutput = (T*)ncclShmem.groups[group].userOutput; if (Src) ncclShmem.groups[group].srcs[0] = (SrcBuf==Input ? userInput : userOutput) + srcIx + offset; if (Dst) ncclShmem.groups[group].dsts[0] = (DstBuf==Input ? userInput : userOutput) + dstIx + offset; } waitPeer(srcIx, dstIx, offset, sliceSize); subBarrier(); /* if user abort the kernel, we don't need to actually perform copy/reduce; just set size * to 0 to avoid unnecessary workload. */ int workSize = ncclShmem.aborted ? 0 : sliceSize; if (flags & AnyNetDeviceUnpack) { ncclNetDeviceUnpack(tid, tidInBlock, nworkers, group, ncclShmem.groups[group].devicePlugin.unpack.unpackNetDeviceIndexMask, Src, workSize); // Sync here to make sure all workers are reading from the updated srcs) subBarrier(); } if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0] /* NVLS can have srcs[0] == dsts[0], but we cannot enter this "if branch", * so we need to check whether MultimemSrcs and MultimemDsts are 0. */ && MultimemSrcs == 0 && MultimemDsts == 0) { // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy if (Send) { reduceCopy (tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false, 1, ncclShmem.groups[group].srcs, fan.nsend(), ncclShmem.groups[group].dsts+1, workSize); } } else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) { // For broadcast in CollNet to do empty send reduceCopy (tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp, Recv, ncclShmem.groups[group].srcs, Dst, ncclShmem.groups[group].dsts, workSize); } else { constexpr int PreOpSrcs = SrcBuf != Input ? 0 : DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1; reduceCopy (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs, Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts, workSize); } barrier(); // This barrier has a counterpart in following loop postPeer(0 < sliceSize); offset += sliceSize; slice += 1; } while (slice < SlicePerChunk && offset < nelem); } // Non-workers come straight here. Workers too but only once the remaining // slices are all empty. Since empty slices are the uncommon case, and // worker perf is the limiter, perf-wise this loop is effectively unentered, // hence just a single branch insn. #pragma unroll 1 while (slice < SlicePerChunk) { sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset; { // Only workers could have Wait roles so we know the slice must be empty // since we've exited the loop above. waitPeer(0, 0, 0, 0); } barrier(); // Has couterpart in preceding worker-only loop. 
postPeer(0 < sliceSize); offset += sliceSize; slice += 1; } } public: static inline __device__ void sendPeerNotify(int peer, int connIndex, int steps) { ncclDevChannelPeer* peerPtr = ncclShmem.channel.peers[peer]; peerPtr->send[connIndex].step += steps; st_relaxed_sys_global(peerPtr->send[connIndex].tail, peerPtr->send[connIndex].step); } static inline __device__ void recvPeerNotify(int peer, int connIndex, int steps) { int spins = 0; ncclDevChannelPeer* peerPtr = ncclShmem.channel.peers[peer]; peerPtr->recv[connIndex].step += steps; st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step); while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) { if (spins++ == NCCL_SPINS_BEFORE_CHECK_ABORT) { if (*ncclShmem.comm.abortFlag) { ncclShmem.aborted = 1; break; } spins = 0; } } } template __device__ __forceinline__ void process(Fn &&fn) { #pragma unroll 1 for (int slice=0; slice < SlicePerChunk; slice++) { if (tid < nworkers) { if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) { bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send; int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); if (checkAbort(spins)) break; } void **ptrs = isSendNotRecv ? ncclShmem.groups[group].dsts : ncclShmem.groups[group].srcs; if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) { int offset = loadInt(&connFifo[step%NCCL_STEPS].offset); ptrs[index] = connEltsFifo + offset/sizeof(T); } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; } } subBarrier(); fn.template operator() (tid, nworkers, slice, stepSize*StepPerSlice, fan.nrecv(), ncclShmem.groups[group].srcs, fan.nsend(), ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes); } barrier(); int32_t dstSize = 0; if (flags & Send*RolePostSend) { dstSize = ncclShmem.groups[group].dstSizes[index]; ncclShmem.groups[group].dstSizes[index] = 0; if (flags & ConnFifoEnabled) connFifo[step%NCCL_STEPS].size = dstSize*sizeof(T); } barrier(); if (flags & (Recv*(RoleWaitRecv|RolePostRecv) | Send*(RoleWaitSend|RolePostSend))) { step += StepPerSlice; } if (flags & (Recv*RolePostRecv | Send*RolePostSend)) { if (Send && (!Recv || (flags & RolePostSend)) && (dstSize!=0 || (flags&ConnFifoEnabled))) { fence_acq_rel_sys(); } st_relaxed_sys_global(connStepPtr, step); } } } private: // Scatter/Gather generic op // skip: my own rank order in the buffer chunks // shift: peer offset to avoid all ranks sending to or receiving from same peer template __device__ __forceinline__ void ScatterGatherOp(intptr_t inpIx, intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift, bool postOp) { constexpr int DirectRecv = 1 && Direct && DirectRecv1; constexpr int DirectSend = 1 && Direct && DirectSend1; int offset = 0; // slice offset int sliceSize = stepSize*StepPerSlice; int dataSize = max(DIVUP(peerElem, 16*SlicePerChunk)*16, sliceSize/32); // per-peer slice size #pragma unroll for (int slice=0; slice(0, inpIx, offset, realSize); subBarrier(); #pragma unroll // Loop over peers for (int j=0; j= 0 && i >= skip) pOffset += peerElem; void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset; ssize_t realPeerSize = min(realSize, totalElem-pOffset); if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) { reduceCopy(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize); // 
Mark for threadfence at the end fenceNeeded |= true; } } } else if (Recv) { if (tid==0) ncclShmem.groups[group].dsts[0] = (T*)ncclShmem.groups[group].userOutput + outIx + offset; ssize_t pOffset = index*peerOffset; if (skip >= 0 && index >= skip) pOffset += peerElem; // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer waitPeer(outIx+pOffset, outIx+pOffset, offset, realSize); subBarrier(); #pragma unroll for (int j=0; j= 0 && i >= skip) pOffset += peerElem; void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset; ssize_t realPeerSize = min(realSize, totalElem-pOffset); if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0; if (realPeerSize > 0) reduceCopy(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize); } } } fenceNeeded = barrierAny(fenceNeeded); postPeer(fenceNeeded); offset += realSize; } } __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) { if (flags & (RoleWaitRecv|RolePostRecv)) { auto *conn = &peer->recv[connIndex]; if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) { // handle must be a device ptr netDeviceHandle = conn->netDeviceHandle.handle; // Cache the handle ncclNetDeviceUnpackSetup(netDeviceHandle, group, index); flags |= NetDeviceUnpack; } step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); if (flags & RolePostRecv) { connStepPtr = conn->head; *connStepPtr = step; // Return credits in case we rounded up. } if (flags & RoleWaitRecv) { ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs() flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0; connStepPtr = conn->tail; connStepCache = loadStepValue(connStepPtr); connStepSize = conn->stepSize/sizeof(T); connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; if (conn->connFifo != nullptr) { flags |= ConnFifoEnabled; connFifo = conn->connFifo; } else if (Direct) { // User buffers have been registered if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0; } } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { // direct read not allowed in non-register case // otherwise, in one-to-multi send, we could mix empty send and intermediate send flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0; } } else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) { /* NVLS direct */ flags |= NvlsDirectRead; } } } } } __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) { if (flags & (RoleWaitSend|RolePostSend)) { auto *conn = &peer->send[connIndex]; step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); connFifo = conn->connFifo; if (connFifo != nullptr) flags |= ConnFifoEnabled; if (flags & RolePostSend) { connStepPtr = conn->tail; connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; } if (flags & RoleWaitSend) { ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs() flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? 
NvlsMinPolling : 0; connStepPtr = conn->head; connStepCache = loadStepValue(connStepPtr); connStepSize = conn->stepSize/sizeof(T); connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; if (connFifo == nullptr && Direct) { // User buffers have been registered if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0; } } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { // direct read not allowed in non-register case // otherwise, in one-to-multi send, we could mix empty send and intermediate send flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0; } } else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) { /* NVLS direct */ flags |= NvlsDirectWrite; } } } } } public: __device__ Primitives( int tid, int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,bool userBufReg=false, int stepSize_=0 ): tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) { // For send operations, we need an extra warp to overlap the threadfence and the copy this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++; while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++; this->fan = Fan(nrecv, nsend); constexpr int ThreadPerSync = MaxSend >= 16 || MaxRecv >= 16 ? 32 : // NVLS may have an arity > 8. In that case increase the size of the groups MaxSend >= 8 || MaxRecv >= 8 ? 16 : 8; // Allows for all roles (WaitRecv/WaitSend/PostRecv/PostSend) within a single warp static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers"); index = -1; flags = 0; assert(2*(nrecv+nsend) <= nthreads); // Ensure no thread is assigned more than one role. if (tid < nrecv) { flags |= RoleWaitRecv; index = tid; } else if (tid < nrecv+nsend) { flags |= RoleWaitSend; index = tid-nrecv; } else if (nthreads-nsend <= tid) { flags |= RolePostSend; index = tid-(nthreads-nsend); } else if (nthreads-nrecv-nsend <= tid) { flags |= RolePostRecv; index = tid-(nthreads-nrecv-nsend); } int peer = 0; if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e); loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e); if (userBufReg) flags |= UserBufferMode; if (barrierAny(flags & NetDeviceUnpack)) { flags |= AnyNetDeviceUnpack; // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers // have NetDeviceUnpack. uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 
1 : 0); if (tid == 0) { ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask; } } setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e); } __device__ ~Primitives() { // Ensure ncclShmem.groups[].send/recvConns are available barrier(); // Save steps for the next operation if (flags & (RolePostSend|RolePostRecv)) { auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns; conns[index]->step = step; } if ((flags & UserBufferMode) && (flags & RoleWaitSend)) { // Make sure we wait until the proxy has sent data before we return. // We don't want the next CUDA kernel to overwrite the send buffer which // was accessed directly. uint64_t prevStep = step - StepPerSlice; volatile ssize_t* ptr = &(connFifo[prevStep%NCCL_STEPS].size); int spins = 0; while (*ptr != -1) if (checkAbort(spins)) break; } if (flags & NetDeviceUnpack) { ncclNetDeviceSaveHead(netDeviceHandle, group, index); } // Make sure all threads are done writing back conn->step and done using // ncclShmem.groups[group] barrier(); } __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* e) { if (tid==0) { ncclShmem.groups[group].userInput = (void*)inputBuf; ncclShmem.groups[group].userOutput = (void*)outputBuf; ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input } bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite); bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite)); bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched) bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer int regUsed = e != nullptr ? e->coll.regUsed : 0; if (Direct && recvProvider) { int spins = 0; void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; // Wait for consumer to consume previous value before trampling it. if (slot) { while (*slot != nullptr && !checkAbort(spins)); directBuff = (T*)outputBuf; // Encode pointer by XOR'ing against some address they definitely wouldn't send // since we want to allow them sending us nullptr while not colliding with // the empty slot value. *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); } } if (Direct && sendAcceptor) { int spins = 0; void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange; void *ptr; while (slot) { ptr = *slot; if (ptr != nullptr || checkAbort(spins)) break; } if (slot) { directBuff = regUsed ? (T*)(e->dnOutputs[index]) : reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); *slot = nullptr; } else { /* slot is NULL, it must be regUsed == 1 */ directBuff = (T*)e->dnOutputs[index]; } } if (Direct && sendProvider) { int spins = 0; void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange; volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange; volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1; // Wait for consumer to consume previous value before trampling it. if (slot && argSlot0 && argSlot1) { while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins)); // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter) // Otherwise, we are pulling from output buffer (e.g. 
recvCopyDirectSend) directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf; // Exchange pre-scalers for use in direct pull *argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg; *argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32); // Encode pointer by XOR'ing against some address they definitely wouldn't send // since we want to allow them sending us nullptr while not colliding with // the empty slot value. *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); } } if (Direct && recvAcceptor) { int spins = 0; void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange; volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1; void *ptr; while (slot) { ptr = *slot; if (ptr != nullptr || checkAbort(spins)) break; } if (slot && argSlot0 && argSlot1) { directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) : reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); if (MaxSend != 0) { // reduce group rather than gather group // Store scalers for remote inputs uint64_t arg0, arg1; while (true) { arg0 = *argSlot0; arg1 = *argSlot1; if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; } ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff); } *argSlot0 = 0; *argSlot1 = 0; *slot = nullptr; } else { directBuff = (T*)e->dnInputs[index]; } } } __device__ void moveDataPtrs(intptr_t delta) { if (tid==0) { ncclShmem.groups[group].userInput = (T*)ncclShmem.groups[group].userInput + delta; ncclShmem.groups[group].userOutput = (T*)ncclShmem.groups[group].userOutput + delta; } } __device__ __forceinline__ void send(intptr_t inpIx, int eltN) { genericOp<0, 0, 0, 1, Input, -1>(inpIx, -1, eltN, false); } __device__ __forceinline__ void sendFromOutput(intptr_t outIx, int eltN) { genericOp<0, 0, 0, 1, Output, -1>(outIx, -1, eltN, false); } __device__ __forceinline__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) { genericOp<0, 1, 0, 1, Input, -1>(inpIx, outIx, eltN, false); } __device__ __forceinline__ void directSendFromOutput(intptr_t outIx, int eltN) { genericOp<0, 1, 0, 1, Output, -1>(outIx, outIx, eltN, false); } __device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp); } __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) { genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false); } __device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) { genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false); } __device__ __forceinline__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void recvSend(int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, -1, -1>(-1, -1, eltN, postOp); } __device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp); } __device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) { genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false); } __device__ __forceinline__ void 
directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) { genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, false); } __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp); } __device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp); } __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) { genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp); } __device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { // Direct is only for the send part genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void scatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) { ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } __device__ __forceinline__ void directScatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) { ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } __device__ __forceinline__ void gather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift, bool postOp=false) { ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, postOp); } __device__ __forceinline__ void directGather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) { ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } }; nccl-2.22.3-1/src/device/reduce.h000066400000000000000000000054451463451655400163500ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "device.h" #include "collectives.h" #include "primitives.h" namespace { template __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; const int nranks = ncclShmem.comm.nRanks; const int rank = ncclShmem.comm.rank; const int prevRank = ring->userRanks[nranks-1]; const int root = work->root; size_t chunkCount; size_t channelCount; size_t gridOffset; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; Primitives, 0, Proto, 0> prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); if (prevRank == root) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.send(offset, nelem); } } else if (rank == root) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true); } } else { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.recvReduceSend(offset, nelem); } } } } template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; runRing(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { runRing(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { runRing(tid, nthreads, work); } }; nccl-2.22.3-1/src/device/reduce_kernel.h000066400000000000000000000661701463451655400177120ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_REDUCE_KERNEL_H_ #define NCCL_REDUCE_KERNEL_H_ #include "op128.h" #include #include template struct IsFloatingPoint: std::false_type {}; template<> struct IsFloatingPoint: std::true_type {}; #if defined(__CUDA_BF16_TYPES_EXIST__) template<> struct IsFloatingPoint<__nv_bfloat16>: std::true_type {}; #endif template<> struct IsFloatingPoint: std::true_type {}; template<> struct IsFloatingPoint: std::true_type {}; //////////////////////////////////////////////////////////////////////////////// // The reduction function classes. All classes must: // 1. Expose the `EltType` typedef. // 2. Have constructor taking no arguments (default constructible). // 3. Have constructor taking `uint64_t opArg`. 
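//
// For illustration only: the hypothetical functor below (FuncBitOr_Example is
// not one of the reduction classes defined in this file) sketches the minimal
// shape that satisfies the three requirements above. A single constructor with
// a defaulted opArg covers both constructor requirements, which is the same
// pattern the real classes that follow use.
template<typename T>
struct FuncBitOr_Example {
  using EltType = T;                                 // (1) expose the element type
  __device__ FuncBitOr_Example(uint64_t opArg=0) {}  // (2)+(3) default-constructible and accepts the 64-bit op argument
};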
template struct FuncCopy { using EltType = T; __device__ FuncCopy(uint64_t opArg=0) {}; }; template struct FuncSum { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; }; template struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; }; template struct FuncMinMax { using EltType = T; BytePack xormask; // only used by integers bool isMinNotMax; // only used by floats __device__ FuncMinMax(uint64_t opArg=0) { xormask.native = opArg; isMinNotMax = (opArg&1)==0; } }; template struct FuncPreMulSum; template struct FuncSumPostDiv; //////////////////////////////////////////////////////////////////////////////// // Trait class for handling the reduction argument. template struct RedOpArg { // default case: no argument static constexpr bool ArgUsed = false; __device__ static uint64_t loadArg(void *ptr) { return 0; } }; template struct RedOpArg> { static constexpr bool ArgUsed = true; __device__ static uint64_t loadArg(void *ptr) { union { uint64_t u64; T val; }; u64 = 0; val = *(T*)ptr; return u64; } }; //////////////////////////////////////////////////////////////////////////////// // Trait classes for reduction functions. Given a function (FuncSum, etc.) // and a number of elements in a pack, will reduce, preOp, or postOp a pack // of elements. These classes are intended to be specialized for specific // combinations of reduction function and pack size. template struct Apply_Reduce /*{ static BytePack reduce( Fn fn, BytePack a, BytePack b ); }*/; template struct Apply_PreOp/*{ static constexpr bool IsIdentity; static BytePack preOp(Fn fn, BytePack a); }*/; template struct Apply_PostOp/*{ static constexpr bool IsIdentity; static BytePack postOp(Fn fn, BytePack a); }*/; template struct LoadMultimem_BigPackSize/*{ // If non-zero, then this and sizeof(T) are valid pack sizes for LoadMultimem, // otherwise there are no valid pack sizes for LoadMultimem. static constexpr int BigPackSize = 0; }*/; template struct Apply_LoadMultimem/*{ static BytePack load(Fn fn, uintptr_t addr); }*/; //////////////////////////////////////////////////////////////////////////////// // Public API for calling the trait classes. These take the data elements as a // pack of any type, which could be a BytePack or any integral type (uint64_t, // uint32_t, etc.), and will return a new pack where each element has been // transformed appropriately. template __device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) { return fromPack( Apply_Reduce::Size/sizeof(typename Fn::EltType)> ::reduce(fn, toPack(a), toPack(b)) ); } template __device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) { return fromPack( Apply_PreOp::Size/sizeof(typename Fn::EltType)> ::preOp(fn, toPack(a)) ); } template __device__ __forceinline__ Pack applyPostOp(Fn fn, Pack a) { return fromPack( Apply_PostOp::Size/sizeof(typename Fn::EltType)> ::postOp(fn, toPack(a)) ); } template __device__ __forceinline__ BytePack applyLoadMultimem(Fn fn, uintptr_t addr) { return Apply_LoadMultimem::load(fn, addr); } //////////////////////////////////////////////////////////////////////////////// // Apply_Reduce // Nonsensical base case template struct Apply_Reduce { __device__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) { return {}; } }; // General recursive definition (EltPerPack > 1). This is how we iterate over // all elements in a pack of any size, by breaking it into halves. Eventually // we'll hit a base case (a more specific template specialization which takes // precedence). 
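//
// Worked illustration of the halving scheme described above (example only):
// reducing a 16-byte pack of floats means EltPerPack == 4, so the pack is
// split 4 -> 2+2 -> (1+1)+(1+1) and the scalar base case runs four times,
// once per element. When a wider specialization exists (for example the
// half2/__nv_bfloat162 cases declared later in this file), the recursion
// stops at that pack width instead of descending all the way to one element.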
template struct Apply_Reduce { template __device__ static BytePack reduce(Fn fn, BytePack a, BytePack b) { a.half[0] = Apply_Reduce::reduce(fn, a.half[0], b.half[0]); a.half[1] = Apply_Reduce::reduce(fn, a.half[1], b.half[1]); return a; } }; // Base case definitions (EltPerPack == 1) template struct Apply_Reduce, /*EltPerPack=*/1> { __device__ static BytePack reduce(FuncCopy fn, BytePack a, BytePack b) { return a; } }; template struct Apply_Reduce, /*EltPerPack=*/1> { __device__ static BytePack reduce(FuncSum fn, BytePack a, BytePack b) { return toPack(fromPack(a) + fromPack(b)); } }; template struct Apply_Reduce, /*EltPerPack=*/1> { __device__ static BytePack reduce(FuncProd fn, BytePack a, BytePack b) { return toPack(fromPack(a) * fromPack(b)); } }; template struct Apply_Reduce, /*EltPerPack=*/1> { __device__ static BytePack reduce(FuncMinMax fn, BytePack a, BytePack b) { return (a.native ^ fn.xormask.native) < (b.native ^ fn.xormask.native) ? a : b; } }; // Optimizations for specfic types and element count combinations: template<> struct Apply_Reduce, /*EltPerPack=*/4> { __device__ static BytePack<4> reduce(FuncSum fn, BytePack<4> a, BytePack<4> b) { constexpr uint32_t even = 0x00ff00ffu; uint32_t x = (a.native & even) + (b.native & even); uint32_t y = (a.native & ~even) + (b.native & ~even); //a.native = (x & even) | (y & ~even); a.native = __byte_perm(x, y, 0x7250); return a; } }; template<> struct Apply_Reduce, /*EltPerPack=*/4> { __device__ static BytePack<4> reduce(FuncMinMax fn, BytePack<4> a, BytePack<4> b) { constexpr uint32_t ones = 0x01010101u; constexpr uint32_t even = 0x00ff00ffu; // even byte mask // Replicate xormask to all bytes uint32_t x = fn.xormask.native * ones; // Transform inputs by xormask uint32_t ax = a.native ^ x; uint32_t bx = b.native ^ x; // Use 9-bit arithmetic to compute d=a-b uint32_t d0 = (ax & even) + (~bx & even) + ones; uint32_t d1 = (ax>>8 & even) + (~(bx>>8) & even) + ones; // Move sign bit of each 9-bit delta into the least bit of origin byte //uint32_t s = (d0>>8 & ones & even) | (d1 & ones & ~even); uint32_t s = __byte_perm(d0, d1, 0x7351) & ones; // Broadcast least bit across whole byte s *= 0xffu; // Compose result by selecting bytes via: signbit(a-b)==1 ? a : b a.native = (a.native & s) | (b.native & ~s); return a; } }; template<> struct Apply_Reduce, /*EltPerPack=*/4> { __device__ static BytePack<4> reduce(FuncProd fn, BytePack<4> apack, BytePack<4> bpack) { uint32_t a = apack.native; uint32_t b = bpack.native; uint32_t ab0 = (a*b) & 0xffu; asm("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u)); uint32_t ab1; asm("mul.hi.u32 %0, %1, %2;" : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000)); asm("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u)); apack.native = __byte_perm(ab0, ab1, 0x6420); return apack; } }; #define SPECIALIZE_REDUCE(Fn, T, EltPerPack, Vec, expr_of_fn_x_y) \ template<> \ struct Apply_Reduce, EltPerPack> { \ __device__ __forceinline__ static BytePack reduce( \ Fn fn, BytePack a, BytePack b \ ) { \ Vec x = fromPack(a); \ Vec y = fromPack(b); \ return toPack(expr_of_fn_x_y); \ } \ }; SPECIALIZE_REDUCE(FuncMinMax, float, 1, float, fn.isMinNotMax ? fminf(x, y) : fmaxf(x, y)) SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? 
fmin(x, y) : fmax(x, y)) #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 SPECIALIZE_REDUCE(FuncSum, half, 1, half, __hadd(x, y)) SPECIALIZE_REDUCE(FuncSum, half, 2, half2, __hadd2(x, y)) SPECIALIZE_REDUCE(FuncProd, half, 1, half, __hmul(x, y)) SPECIALIZE_REDUCE(FuncProd, half, 2, half2, __hmul2(x, y)) #else SPECIALIZE_REDUCE(FuncSum, half, 1, half, __float2half(__half2float(x) + __half2float(y))) SPECIALIZE_REDUCE(FuncProd, half, 1, half, __float2half(__half2float(x) * __half2float(y))) #endif #if __CUDA_ARCH__ >= 800 SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y)) SPECIALIZE_REDUCE(FuncMinMax, half, 2, half2, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y)) #else SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, __float2half(fn.isMinNotMax ? fminf(__half2float(x), __half2float(y)) : fmaxf(__half2float(x), __half2float(y)))) #endif #if defined(__CUDA_BF16_TYPES_EXIST__) #if __CUDA_ARCH__ >= 800 SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __hadd(x, y)) SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 2, __nv_bfloat162, __hadd2(x, y)) SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __hmul(x, y)) SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 2, __nv_bfloat162, __hmul2(x, y)) SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y)) SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 2, __nv_bfloat162, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y)) #else SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) + __bfloat162float(y))) SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) * __bfloat162float(y))) SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fn.isMinNotMax ? fminf(__bfloat162float(x), __bfloat162float(y)) : fmaxf(__bfloat162float(x), __bfloat162float(y)))) #endif #endif #undef SPECIALIZE_REDUCE //////////////////////////////////////////////////////////////////////////////// // Apply_PreOp // General recursive definition (EltPerPack > 1) template struct Apply_PreOp { static constexpr bool IsIdentity = Apply_PreOp::IsIdentity; template __device__ static BytePack preOp(Fn fn, BytePack a) { #if __cpp_if_constexpr if constexpr(!IsIdentity) { #else if (!IsIdentity) { #endif // The `if (!IsIdentity)` condition is not strictly necessary, but it may help // compiler in that it won't have to tear a register apart for no reason // just to put it back together again. a.half[0] = Apply_PreOp::preOp(fn, a.half[0]); a.half[1] = Apply_PreOp::preOp(fn, a.half[1]); } return a; } }; // Base case definition (EltPerPack == 1), by default is identity function. template struct Apply_PreOp { static constexpr bool IsIdentity = true; template __device__ static BytePack preOp(Fn fn, BytePack a) { return a; } }; // Base case definition (EltPerPack == 0), is nonsense! 
template struct Apply_PreOp { static constexpr bool IsIdentity = true; __device__ static BytePack<0> preOp(Fn fn, BytePack<0> a) { return {}; } }; //////////////////////////////////////////////////////////////////////////////// // Apply_PostOp // General recursive definition (EltPerPack > 1) template struct Apply_PostOp { static constexpr bool IsIdentity = Apply_PostOp::IsIdentity; template __device__ static BytePack postOp(Fn fn, BytePack a) { #if __cpp_if_constexpr if constexpr(!IsIdentity) { #else if (!IsIdentity) { #endif // The `if (!IsIdentity)` condition is not strictly necessary, but it may help // compiler in that it won't have to tear a register apart for no reason // just to put it back together again. a.half[0] = Apply_PostOp::postOp(fn, a.half[0]); a.half[1] = Apply_PostOp::postOp(fn, a.half[1]); } return a; } }; // Base case definition (EltPerPack == 1), by default is identity function. template struct Apply_PostOp { static constexpr bool IsIdentity = true; template __device__ static BytePack postOp(Fn fn, BytePack a) { return a; } }; // Base case definition (EltPerPack == 0), is nonsense! template struct Apply_PostOp { static constexpr bool IsIdentity = true; __device__ static BytePack<0> postOp(Fn fn, BytePack<0> a) { return {}; } }; //////////////////////////////////////////////////////////////////////////////// // FuncPreMulSum template struct RedOpArg> { static constexpr bool ArgUsed = true; __device__ static uint64_t loadArg(void *ptr) { union { uint64_t u64; T val; }; u64 = 0; val = *(T*)ptr; return u64; } }; // General definition for all integral types, float, and double. template struct FuncPreMulSum { using EltType = T; T scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; T val; }; u64 = opArg; scalar = val; } }; template<> struct FuncPreMulSum { using EltType = half; #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 half2 scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; half val; }; u64 = opArg; scalar.x = val; scalar.y = val; } #else float scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; half val; }; u64 = opArg; scalar = __half2float(val); } #endif }; #if defined(__CUDA_BF16_TYPES_EXIST__) template<> struct FuncPreMulSum<__nv_bfloat16> { using EltType = __nv_bfloat16; #if __CUDA_ARCH__ >= 800 __nv_bfloat162 scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; __nv_bfloat16 val; }; u64 = opArg; scalar.x = val; scalar.y = val; } #else float scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; __nv_bfloat16 val; }; u64 = opArg; scalar = __bfloat162float(val); } #endif }; #endif template struct Apply_Reduce, /*EltPerPack=*/1> { __device__ static BytePack reduce(FuncPreMulSum fn, BytePack a, BytePack b) { // FuncPreMulSum reduce dispatches to FuncSum. return Apply_Reduce, 1>::reduce(FuncSum(), a, b); } }; // PreOp of FuncPreMulSum for integral types, float, and double. template struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; __device__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { return toPack(fromPack(a) * fn.scalar); } }; //////////////////////////////////////////////////////////////////////////////// // Apply_PreOp of FuncPreMulSum for float16. 
template<> struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; __device__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 return toPack(__hmul(fromPack(a), fn.scalar.x)); #else return toPack(__float2half(__half2float(fromPack(a)) * fn.scalar)); #endif } }; #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 template<> struct Apply_PreOp, /*EltPerPack=*/2> { static constexpr bool IsIdentity = false; __device__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { return toPack(__hmul2(fromPack(a), fn.scalar)); } }; #endif //////////////////////////////////////////////////////////////////////////////// // Apply_PreOp of FuncPreMulSum for bfloat16. #if defined(__CUDA_BF16_TYPES_EXIST__) template<> struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; __device__ static BytePack preOp( FuncPreMulSum<__nv_bfloat16> fn, BytePack a ) { #if __CUDA_ARCH__ >= 800 return toPack<__nv_bfloat16>(__hmul(fromPack<__nv_bfloat16>(a), fn.scalar.x)); #else return toPack<__nv_bfloat16>(__float2bfloat16(__bfloat162float(fromPack<__nv_bfloat16>(a)) * fn.scalar)); #endif } }; #if __CUDA_ARCH__ >= 800 template<> struct Apply_PreOp, /*EltPerPack=*/2> { static constexpr bool IsIdentity = false; __device__ static BytePack preOp( FuncPreMulSum<__nv_bfloat16> fn, BytePack a ) { return toPack<__nv_bfloat162>(__hmul2(fromPack<__nv_bfloat162>(a), fn.scalar)); } }; #endif #endif //////////////////////////////////////////////////////////////////////////////// // FuncSumPostDiv template struct RedOpArg> { static constexpr bool ArgUsed = true; __device__ static uint64_t loadArg(void *ptr) { return *(uint64_t*)ptr; } }; template::value> struct FuncSumPostDiv_IntOnly; template struct FuncSumPostDiv: FuncSumPostDiv_IntOnly { __device__ FuncSumPostDiv(uint64_t opArg=0): FuncSumPostDiv_IntOnly(opArg) { } }; template struct FuncSumPostDiv_IntOnly: FuncSum { using EltType = T; int divisor; __device__ FuncSumPostDiv_IntOnly(uint64_t opArg=0): divisor(opArg) {} }; template struct FuncSumPostDiv_IntOnly { static_assert(sizeof(T)!=sizeof(T), "FuncSumPostDiv is only for implementing ncclAvg on integral types."); }; template struct Apply_Reduce, /*EltPerPack=*/1>: Apply_Reduce, 1> { __device__ static BytePack reduce(FuncSumPostDiv fn, BytePack a, BytePack b) { // FuncSumPostDiv reduce dispatches to FuncSum. return Apply_Reduce, 1>::reduce(FuncSum(), a, b); } }; template struct Apply_PostOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; __device__ static BytePack postOp(FuncSumPostDiv fn, BytePack a) { return toPack(fromPack(a) / fn.divisor); } }; //////////////////////////////////////////////////////////////////////////////// // Apply_LoadMultimem #define SIZEOF_BytePack_field_u16 2 #define PTX_REG_BytePack_field_u16 "h" #define SIZEOF_BytePack_field_u32 4 #define PTX_REG_BytePack_field_u32 "r" #define SIZEOF_BytePack_field_u64 8 #define PTX_REG_BytePack_field_u64 "l" #define DEFINE_Apply_LoadMultimem_sum(T, ptx_ty, pack_field) \ template<> \ struct Apply_LoadMultimem, SIZEOF_BytePack_field_##pack_field> { \ static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \ __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ BytePack ans; \ asm("multimem.ld_reduce.relaxed.sys.global.add." 
#ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ : "l"(addr)); \ return ans; \ } \ }; #define DEFINE_Apply_LoadMultimem_minmax(T, ptx_ty, pack_field) \ template<> \ struct Apply_LoadMultimem, SIZEOF_BytePack_field_##pack_field> { \ static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \ __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ BytePack ans; \ if (fn.isMinNotMax) { \ asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ : "l"(addr)); \ } else { \ asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ : "l"(addr)); \ } \ return ans; \ } \ }; #define DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \ template<> \ struct Apply_LoadMultimem, 4*(SIZEOF_BytePack_field_##pack_field)> { \ static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \ __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ BytePack ans; \ asm("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ : "l"(addr)); \ return ans; \ } \ }; #define DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \ template<> \ struct Apply_LoadMultimem, 4*(SIZEOF_BytePack_field_##pack_field)> { \ static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \ __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ BytePack ans; \ if (fn.isMinNotMax) { \ asm("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ : "l"(addr)); \ } else { \ asm("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ : "l"(addr)); \ } \ return ans; \ } \ }; #define DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(T, ptx_ty, pack_field) \ DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \ template<> \ struct Apply_LoadMultimem, sizeof(T)> { \ __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ BytePack<2*sizeof(T)> tmp; \ asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ : "l"(addr & -uintptr_t(2*sizeof(T)))); \ return tmp.half[(addr/sizeof(T))%2]; \ } \ }; #define DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(T, ptx_ty, pack_field) \ DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \ template<> \ struct Apply_LoadMultimem, sizeof(T)> { \ __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ BytePack<2*sizeof(T)> tmp; \ if (fn.isMinNotMax) { \ asm("multimem.ld_reduce.relaxed.sys.global.min." 
#ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ : "l"(addr & -uintptr_t(2*sizeof(T)))); \ } else { \ asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ : "l"(addr & -uintptr_t(2*sizeof(T)))); \ } \ return tmp.half[(addr/sizeof(T))%2]; \ } \ }; template struct Apply_LoadMultimem { __device__ static BytePack load(Fn fn, uintptr_t addr) { __trap(); return {}; } }; #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 template struct LoadMultimem_BigPackSize { using T = typename Fn::EltType; static constexpr bool IsSum = std::is_same>::value || std::is_same>::value || std::is_same>::value; static constexpr bool IsMinMax = std::is_same>::value; static constexpr bool IsFloat = IsFloatingPoint::value; static constexpr int BigPackSize = IsFloat && IsSum && sizeof(T) < 8 ? 16 : IsFloat && IsSum ? sizeof(T) : IsFloat && IsMinMax && sizeof(T)==2 ? 16 : !IsFloat && (IsSum||IsMinMax) && sizeof(T)>=4 ? sizeof(T) : /*multimem.ld_reduce not supported:*/ 0; }; DEFINE_Apply_LoadMultimem_sum(uint32_t, u32, u32) DEFINE_Apply_LoadMultimem_minmax(uint32_t, u32, u32) DEFINE_Apply_LoadMultimem_sum(int32_t, s32, u32) DEFINE_Apply_LoadMultimem_minmax(int32_t, s32, u32) DEFINE_Apply_LoadMultimem_sum(uint64_t, u64, u64) DEFINE_Apply_LoadMultimem_minmax(uint64_t, u64, u64) DEFINE_Apply_LoadMultimem_sum(int64_t, u64, u64) DEFINE_Apply_LoadMultimem_minmax(int64_t, s64, u64) DEFINE_Apply_LoadMultimem_sum(float, f32, u32) DEFINE_Apply_LoadMultimem_sum_v4(float, f32, u32) DEFINE_Apply_LoadMultimem_sum(double, f64, u64) DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(half, f16x2, u32) DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(half, f16x2, u32) #if defined(__CUDA_BF16_TYPES_EXIST__) DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32) DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32) #endif #else template struct LoadMultimem_BigPackSize { static constexpr int BigPackSize = 0; }; #endif #undef DEFINE_Apply_LoadMultimem #undef DEFINE_Apply_LoadMultimem_v4 #undef DEFINE_Apply_LoadMultimem_v4x2_and_subhalf #undef SIZEOF_BytePack_field_u64 #undef PTX_REG_BytePack_field_u64 #undef SIZEOF_BytePack_field_u32 #undef PTX_REG_BytePack_field_u32 #undef SIZEOF_BytePack_field_u16 #undef PTX_REG_BytePack_field_u16 #endif // REDUCE_KERNEL_H_ nccl-2.22.3-1/src/device/reduce_scatter.h000066400000000000000000000321131463451655400200650ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "device.h" #include "collectives.h" #include "primitives.h" namespace { template __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; int const *ringRanks = ring->userRanks; const int nranks = ncclShmem.comm.nRanks; size_t count; size_t gridOffset; size_t channelCount; size_t chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); size_t offset; size_t dataOffset; uint32_t nelem; int rankDest; Primitives, 0, Proto, 0> prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { nelem = min(chunkCount, channelCount - elemOffset); dataOffset = gridOffset + elemOffset; /////////////// begin ReduceScatter steps /////////////// // step 0: push data to next GPU rankDest = ringRanks[nranks-1]; offset = dataOffset + rankDest * count; prims.send(offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; j struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; runRing(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { runRing(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { runRing(tid, nthreads, work); } }; template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; size_t count; size_t gridOffset; size_t channelCount; size_t chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); const int rank = ncclShmem.comm.rank; const int nranks = ncclShmem.comm.nRanks; size_t offset; int nelem; /* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync; * if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth * and the rest are allocated to scatter. */ const int nThreadsReduce = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE); const int nThreadsScatter = work->regUsed ? 
WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce); const int tidEndScatter = nThreadsScatter; const int tidEndReduce = tidEndScatter + nThreadsReduce; if (!work->regUsed) { if (tid < tidEndScatter) { // Scatter using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.scatter(offset, nvls->nHeads * count, nelem, count, -1, 0); } } else if (tid < tidEndReduce) { // Reduce through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; Primitives, /*Direct=*/0, Proto, 0> prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, work->recvbuff, work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); prims.recv(offset, nelem); } } } else { if (tid < tidEndScatter) { // Scatter using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL, work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { prims.scatter(0, 0, 0, 0, -1, 0); } /* gather used as sync */ prims.gather(0, 0, 0, 0, -1, 0); } else if (tid < tidEndReduce) { // Reduce through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, work->recvbuff, work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { size_t outOffset = gridOffset + elemOffset; size_t inpOffset = outOffset + rank * count; nelem = min(chunkCount, channelCount - elemOffset); prims.directRecvCopy(inpOffset, outOffset, nelem); } /* send for sync */ prims.send(0, 0); } } } }; template struct RunWorkColl { template struct Scatterer { struct ncclDevWorkColl* work; int chunkSize; ssize_t railGridOffset; template __device__ __forceinline__ void operator()( int tid, int tn, int slice, int maxSliceSize, int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes ) { static_assert(SlicePerChunk==1, "require: SlicePerChunk==1"); static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1"); struct ncclDirect* direct = &ncclShmem.channel.collnetDirect; int nNodes = ncclShmem.comm.nNodes; int nRails = direct->nHeads; int part = ncclShmem.channelId - work->channelLo; void* inbuf = (void*)work->sendbuff; ssize_t sizePerRank = work->collnet.count; ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank); ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank); int railAllSize = railAllEnd - railAllBeg; if (tid < nDsts) dstSizes[tid] = railAllSize; int dst = 0; int rail; if (!ReduceSendNotRecv) { rail = direct->headRank; } else { rail = direct->headRank+1; if (rail == nRails) rail = 0; } do { int node = railAllBeg/sizePerRank; int railAllOffset = 0; while (railAllOffset < railAllSize) { ssize_t railOneBeg = node*sizePerRank; ssize_t railOneEnd = railOneBeg + sizePerRank; ssize_t railOneOffset = (railAllBeg+railAllOffset) - railOneBeg; int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset); 
int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail]; ssize_t userOneBeg = rank*sizePerRank + railOneOffset; reduceCopy (tid, tn, work->redOpArg, &work->redOpArg, false, /*nSrcs=*/1+nSrcs, [=]__device__(int s) { return s==0 ? (T*)inbuf + userOneBeg : (T*)srcPtrs[s-1] + railAllOffset; }, /*nDsts=*/1, [=]__device__(int d/*==0*/) { return (T*)dstPtrs[dst] + railAllOffset; }, delta); railAllOffset += delta; node += 1; } dst += 1; rail += 1; if (rail == nRails) rail = 0; } while (ReduceSendNotRecv && dst < nRails-1); } }; __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { const int part = ncclShmem.channelId - work->channelLo; const int nChannels = work->channelHi - work->channelLo + 1; struct ncclDirect* direct = &ncclShmem.channel.collnetDirect; int const &nNodes = ncclShmem.comm.nNodes; ssize_t chunkSize = int(work->collnet.chunkCount); ssize_t sizePerRank = work->collnet.count; if (direct->out == -1) __trap(); bool isMultiRail = (direct->nHeads > 1); int nWarps1 = (isMultiRail ? 2 : 0); int nWarps2 = (isMultiRail ? 2 : 1); int nWarps3 = 1; float denom = float(work->nWarps)/float(nWarps1+nWarps2+nWarps3); nWarps3 = int(denom*nWarps3); nWarps2 = int(denom*nWarps2); nWarps1 = work->nWarps - (nWarps2+nWarps3); using Proto = ProtoSimple<1, 1>; int tn = nWarps1*WARP_SIZE; if (tid < tn) { // Phase 1: Scatter inputs to peers Primitives, /*Direct=*/0, Proto, 0> prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr, work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; prims.template process(scat); } return; } tid -= tn; tn = nWarps2*WARP_SIZE; if (tid < tn) { if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == 0) { int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); } __syncwarp(); } else { // Phase 2: Reduce from peers + local input -> send to network Primitives, /*Direct=*/0, Proto, 0> prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr, work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; prims.template process(scat); } } return; } tid -= tn; tn = nWarps3*WARP_SIZE; if (tid < tn) { if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == 0) { int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); } __syncwarp(); } else { // Phase 3: recv from network Primitives, /*Direct=*/0, Proto, 0> prims(tid, tn, &direct->out, nullptr, nullptr, work->recvbuff, work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0); for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { ssize_t railAllBeg = railGridOffset + part * chunkSize; ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank); ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank; ssize_t railOneEnd = railOneBeg + sizePerRank; ssize_t beg = max(railAllBeg, railOneBeg); ssize_t end = min(railAllEnd, railOneEnd); prims.recv(beg - railOneBeg, max(ssize_t(0), end - beg), /*postOp=*/true); } } return; } } 
}; nccl-2.22.3-1/src/device/sendrecv.h000066400000000000000000000157421463451655400167130ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "device.h" #include "collectives.h" #include "primitives.h" template struct RunWorkBatch { static_assert(sizeof(T)==1, "SendRecv only works on single byte types T."); template __device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) { size_t bytes = work->sendBytes; int chunkSize = u32fp8Decode(work->sendChunkSize_u32fp8); Primitives, 1, Proto, 1> prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, /*userBufferMode=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize); size_t cursor = 0; do { int n = min(size_t(chunkSize), bytes-cursor); prims.directSend(cursor, cursor, n); cursor += n; } while (cursor < bytes && work->sendRegistered == 0); } template __device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) { size_t bytes = work->recvBytes; int chunkSize = u32fp8Decode(work->recvChunkSize_u32fp8); Primitives, 1, Proto, 1> prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, /*userBufferMode=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize); size_t cursor = 0; do { int n = min(size_t(chunkSize), bytes-cursor); prims.directRecv(cursor, n); cursor += n; } while (cursor < bytes && work->recvRegistered == 0); } __device__ __forceinline__ void run() { const int tid = threadIdx.x; const int tn = blockDim.x; const int wid = tid/WARP_SIZE; const int nWarps = tn/WARP_SIZE; const int lane = tid%WARP_SIZE; struct Shared { uint32_t workSendMask; // bitmasks of which work indices have send/recv uint32_t workRecvMask; }; Shared* shared = (Shared*)ncclScratchForWarp(0); struct ncclDevWorkP2p* works = (ncclDevWorkP2p*)ncclShmem.workStorage; int nWorks = ncclShmem.nWorks; if (wid == 0) { // Modify the memory range of each work[] to reflect this channel's // partition of the work. Since integer divides are very heavy it's // best to do them all in one warp. int workIx = lane%16; int isSend = lane < 16 ? 0 : 1; bool hasWork = false; if (workIx < nWorks) { struct ncclDevWorkP2p* work = &works[workIx]; size_t bytes = isSend ? work->sendBytes : work->recvBytes; int nParts = isSend ? work->nSendChannels : work->nRecvChannels; int part = ncclP2pChannelToPart(work->nP2pChannels, work->channelBase, ncclShmem.channelId); hasWork = (part < nParts); if (nParts != 0) { size_t partBeg, partEnd; ncclP2pPartBounds(nParts, part, bytes, &partBeg, &partEnd); (isSend ? work->sendAddr : work->recvAddr) = (char*)(isSend ? work->sendAddr : work->recvAddr) + partBeg; (isSend ? work->sendBytes : work->recvBytes) = partEnd - partBeg; } } uint32_t mask = __ballot_sync(~0u, hasWork); if (lane == 0) { shared->workSendMask = mask>>16; shared->workRecvMask = mask & 0xffff; } } // The fastest way to compute a warp uniform division x/y in [0,32) is to // use each lane to guess a solution and count the ones that don't exceed // the numerator: // __popc(__ballot_sync(~0u, y*(lane+1) <= x)) // That takes 1/3 the time of standard division and about 3/4 the time of // approximate floating point division: // __float2int_rd(__fdividef(float(x),float(y))). 
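    // Worked example of the ballot trick above (illustrative numbers, not taken
    // from a real run): with nWarps=8 and nWorks=3, only lanes 0 and 1 satisfy
    // 3*(lane+1) <= 8, so the ballot mask has two bits set and __popc() returns
    // 2, i.e. 8/3 rounded down. Every lane receives the same mask from
    // __ballot_sync, so the result is warp uniform without any integer division
    // instruction.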
// nWarpPerWork = nWarps/nWorks int nWarpPerWork = __popc(__ballot_sync(~0u, nWorks*(lane+1) <= nWarps)); int nRecvWarpPerWork = nWarpPerWork<=4 ? nWarpPerWork/2 : (nWarpPerWork-1)/2; int nSendWarpPerWork = nWarpPerWork<=4 ? nRecvWarpPerWork : nRecvWarpPerWork+1; // This might reduce nWarpPerWork which is probably desirable. It is better // to have a balanced number of reading and writing threads even if that // leaves warps unused. nWarpPerWork = nSendWarpPerWork + nRecvWarpPerWork; // The work index this warp belongs to: workIx = wid/nWarpPerWork int workIx = __popc(__ballot_sync(~0u, (lane+1)*nWarpPerWork <= wid)); __syncthreads(); // Wait for works[] and shared->* to be updated by warp=0 uint32_t workSendMask = shared->workSendMask; uint32_t workRecvMask = shared->workRecvMask; __syncthreads(); // release scratch space used by shared->* if (nWorks <= workIx) return; // Thread range for whole work (send & recv combined) int subtid = tid - workIx*nWarpPerWork*WARP_SIZE; int subtn = nWarpPerWork*WARP_SIZE; // A send primtive of sufficient size requires 2 cuda barrier ids. constexpr int nSendWarpsForExtraGroup = NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE/WARP_SIZE; // Count up all group ids used below this workIx: int group, extra; // Each recv gets one group id: group = __popc(workRecvMask & ((1<= nSendWarpsForExtraGroup) ? 1 : 0; group += __popc((workSendMask & workRecvMask) & ((1<= nSendWarpsForExtraGroup) ? 1 : 0; group += __popc((workSendMask & ~workRecvMask) & ((1<>workIx); bool hasRecv = 1 & (workRecvMask>>workIx); bool isCopy = work->sendRank == ncclShmem.comm.rank; bool isSend = !hasRecv || (hasSend && subtid < nSendWarpPerWork*WARP_SIZE); if (!isCopy && hasSend && hasRecv) { // Translate thread ids to reflect just this send or recv as opposed to whole work. if (isSend) { subtn = nSendWarpPerWork*WARP_SIZE; } else { subtid -= nSendWarpPerWork*WARP_SIZE; subtn = nRecvWarpPerWork*WARP_SIZE; group += 1 + (nSendWarpPerWork >= nSendWarpsForExtraGroup ? 1 : 0); } } if (isCopy) { reduceCopy (subtid, subtn, 0, nullptr, false, 1, &work->sendAddr, 1, &work->recvAddr, (ssize_t)work->sendBytes); } else if (isSend) { if (work->sendProtoLL) { runSend(subtid, subtn, group, work); } else { runSend>(subtid, subtn, group, work); } } else { if (work->recvProtoLL) { runRecv(subtid, subtn, group, work); } else { runRecv>(subtid, subtn, group, work); } } } }; nccl-2.22.3-1/src/enhcompat.cc000066400000000000000000000026111463451655400157460ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ /* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */ enum cudaError_t { cudaErrorStubLibrary = 34 }; extern "C" { cudaError_t cudaStreamGetCaptureInfo_v2(...) __attribute__((visibility("hidden"))) __attribute((weak)); cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorStubLibrary; } cudaError_t cudaUserObjectCreate(...) __attribute__((visibility("hidden"))) __attribute((weak)); cudaError_t cudaUserObjectCreate(...) { return cudaErrorStubLibrary; } cudaError_t cudaGraphRetainUserObject(...) __attribute__((visibility("hidden"))) __attribute((weak)); cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorStubLibrary; } cudaError_t cudaStreamUpdateCaptureDependencies(...) 
__attribute__((visibility("hidden"))) __attribute((weak)); cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; } cudaError_t cudaGetDriverEntryPoint(...) __attribute__((visibility("hidden"))) __attribute((weak)); cudaError_t cudaGetDriverEntryPoint(...) { return cudaErrorStubLibrary; } } nccl-2.22.3-1/src/enqueue.cc000066400000000000000000002662301463451655400154500ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "enqueue.h" #include "argcheck.h" #include "coll_net.h" #include "gdrwrap.h" #include "bootstrap.h" #include "channel.h" #include "cudawrap.h" #include "transport.h" #include // std::memcpy #include // PRIx64 NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); // Returns maximum kernel stack size of all CUDA kernels ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { ncclResult_t result = ncclSuccess; if (maxStackSize) *maxStackSize = 0; int carveout = ncclParamL1SharedMemoryCarveout(); for (int k=0; k < ncclDevKernelCount; k++) { void* fn = ncclDevKernelList[k]; if (fn == nullptr) continue; if (maxStackSize) { cudaFuncAttributes attr = {0}; CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0); if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; ignore0:; } if (carveout) { CUDACHECKGOTO(cudaFuncSetAttribute(fn, cudaFuncAttributePreferredSharedMemoryCarveout, carveout), result, ignore1); ignore1:; } if (ncclShmemDynamicSize(cudaArch) != 0) { CUDACHECKGOTO(cudaFuncSetAttribute(fn, cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)), result, next_kernel); } next_kernel:; } return result; } //////////////////////////////////////////////////////////////////////////////// // Data movement metrics. static inline int ncclFuncTrafficPerByte(ncclFunc_t func, int nRanks) { switch (func) { case ncclFuncAllReduce: return 2; case ncclFuncAllGather: return nRanks; case ncclFuncReduceScatter: return nRanks; default: return 1; } } static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) { return func == ncclFuncReduceScatter ? nRanks*count : count; } static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) { return func == ncclFuncAllGather ? nRanks*count : count; } static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) { return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? 
nRanks*count : count; } /*****************************************************************************/ /* Launch system : synchronization and CUDA kernel launch */ /*****************************************************************************/ static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) { bool needed = true; NCCLCHECK(ncclProxySaveOp(comm, op, &needed)); if (needed) { struct ncclProxyOp* q = ncclMemoryPoolAlloc(&comm->memPool_ncclProxyOp, &comm->memPermanent); *q = *op; // C++ struct assignment ncclIntruQueueEnqueue(&comm->planner.wipPlan.channels[op->channelId].proxyOpQueue, q); } return ncclSuccess; } static void addWorkBatchToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, enum ncclDevWorkType workType, int devFuncId, uint32_t workOffset, int p2pRound = -1 ) { ncclKernelPlanner::WipPlan::Channel* chan = &comm->planner.wipPlan.channels[channelId]; size_t workSize = ncclDevWorkSize(workType); // Conditions causing us to create a new blank batch. bool newBatch = (chan->workBatchQueue.tail == nullptr); struct ncclDevWorkBatch* batch = nullptr; if (!newBatch) { batch = &chan->workBatchQueue.tail->batch; // All of the conditions that prevent us from appending to current batch. newBatch |= batch->workType != (uint8_t)workType; newBatch |= batch->funcId != devFuncId; // The following ensure the device can handle a batch this large. They have to // account for all extension batches being fused together which is why // wipBatch.workBytes and wipBatch.nP2ps aren't reset to 0 for a new extension // batch further down. newBatch |= NCCL_MAX_DEV_WORK_BATCH_BYTES < chan->wipBatch.workBytes + workSize; if (workType == ncclDevWorkTypeP2p) { newBatch |= chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH; for (int i=0; i < chan->wipBatch.nP2ps; i++) { newBatch |= p2pRound == chan->wipBatch.p2pRounds[i]; } } } // Conditions causing us to create an extension batch (prev->nextExtends=1) uint32_t offset = newBatch ? 0 : (workOffset - batch->offsetBase); bool extendBatch = 63*workSize < offset; extendBatch |= 0 != offset%workSize; if (newBatch || extendBatch) { if (!newBatch) batch->nextExtends = extendBatch; // Extending the previous batch. struct ncclWorkBatchList* batchNode = ncclMemoryStackAlloc(&comm->memScoped); ncclIntruQueueEnqueue(&chan->workBatchQueue, batchNode); batch = &batchNode->batch; batch->nextExtends = 0; batch->workType = (uint32_t)workType; batch->funcId = devFuncId; batch->offsetBase = workOffset; batch->offsetBitset = 0; offset = 0; if (newBatch) { // Since extension batches are fused together on the device, and these values // account for constraints on the fused batch, we only reset the values on // a new batch chan->wipBatch.workBytes = 0; chan->wipBatch.nP2ps = 0; // We don't count extension batches since this is used to derive a proxyOpCount, // and we wan't all ops which are fused together to have the same value. chan->nWorkBatchesP2p += (workType == ncclDevWorkTypeP2p ? 1 : 0); } plan->nWorkBatches += 1; } batch->offsetBitset |= 1ull<<(offset/workSize); chan->wipBatch.workBytes += workSize; if (workType == ncclDevWorkTypeP2p) { // We need to ensure that a single batch doesn't have multiple p2p's // of the same round since they would use the same connections. 
    chan->wipBatch.p2pRounds[chan->wipBatch.nP2ps++] = p2pRound;
  }
}

static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  ncclKernelPlanner::WipPlan::Channel* wipChannels = comm->planner.wipPlan.channels;
  size_t workBytes = plan->workBytes;
  size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);
  plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MIN_NTHREADS);

  // If we can fit everything into the kernel args we do so.
  if (sizeof(ncclDevKernelArgs) + batchBytes + workBytes <= comm->workArgsBytes) {
    plan->workStorageType = ncclDevWorkStorageTypeArgs;
  }

  plan->kernelArgsSize = sizeof(struct ncclDevKernelArgs) + batchBytes;
  plan->kernelArgsSize += (plan->workStorageType == ncclDevWorkStorageTypeArgs) ? workBytes : 0;
  plan->kernelArgsSize = alignUp(plan->kernelArgsSize, 16);
  plan->kernelArgs = (struct ncclDevKernelArgs*)ncclMemoryStackAlloc(&comm->memScoped, plan->kernelArgsSize, /*align=*/16);
  plan->kernelArgs->comm = comm->devComm;
  plan->kernelArgs->channelMask = plan->channelMask;
  plan->kernelArgs->workStorageType = plan->workStorageType;

  // Put batches into the kernel arguments. The first batch for each channel
  // must be located at batchZero[blockIdx.x]. To achieve this we round robin
  // over the channels in ascending order until they're exhausted.
  uint64_t hasBatchMask = plan->channelMask;
  struct ncclDevWorkBatch* batchPrev[MAXCHANNELS] = {}; // {0...}
  struct ncclDevWorkBatch* batchZero = (struct ncclDevWorkBatch*)(plan->kernelArgs+1);
  int batchIx = 0;
  while (hasBatchMask != 0) {
    uint64_t tmpMask = hasBatchMask; // channels with a batch for this round.
    do {
      int c = popFirstOneBit(&tmpMask);
      if (!ncclIntruQueueEmpty(&wipChannels[c].workBatchQueue)) {
        struct ncclWorkBatchList* batchNode = ncclIntruQueueDequeue(&wipChannels[c].workBatchQueue);
        if (batchPrev[c] != nullptr) {
          batchPrev[c]->nextJump = int(&batchZero[batchIx] - batchPrev[c]);
        }
        batchPrev[c] = &batchZero[batchIx];
        batchZero[batchIx++] = batchNode->batch;
      }
      if (ncclIntruQueueEmpty(&wipChannels[c].workBatchQueue)) {
        hasBatchMask ^= 1ull<<c;
      }
    } while (tmpMask != 0);
  }

  // Merge-sort the per-channel proxy ops into plan->proxyOpQueue
  // Phase 1: scan first op of each channel, store opCount in headIds[c].
  uint64_t headIds[MAXCHANNELS];
  int nHeads = 0;
  int channelUbound = 0;
  for (int c=0; c < MAXCHANNELS; c++) {
    struct ncclProxyOp* op = ncclIntruQueueHead(&wipChannels[c].proxyOpQueue);
    headIds[c] = op ? op->opCount : uint64_t(-1);
    if (op) nHeads += 1;
    if (op) plan->hasProxyOps = true;
    if (op) channelUbound = c+1;
  }
  // Phase 2: Dequeue from planner->channels[c], enqueue in merged order to plan
  while (nHeads != 0) {
    int c = -1;
    uint64_t minId = uint64_t(-1);
    // Find channel with least proxy-op id. We store the heads[c]->opCount in
    // headIds[c] to remove indirect loads from this loop.
    for (int c1=0; c1 < channelUbound; c1++) {
      uint64_t id = headIds[c1];
      id = (id>>1 | id<<63); // Move tag bit to order collectives before p2p's
      if (id < minId) { c = c1; minId = id; }
    }
    struct ncclProxyOp* op = ncclIntruQueueDequeue(&wipChannels[c].proxyOpQueue);
    struct ncclProxyOp* opNext = ncclIntruQueueHead(&wipChannels[c].proxyOpQueue);
    headIds[c] = opNext ? opNext->opCount : uint64_t(-1);
    nHeads -= opNext ?
0 : 1; ncclIntruQueueEnqueue(&plan->proxyOpQueue, op); } } int64_t ncclParamLocalRegister(); NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 1); struct ncclIpcCleanupCallback { struct ncclCommCallback base; void* ptr; }; static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) { struct ncclIpcCleanupCallback* me = (struct ncclIpcCleanupCallback*)cb; CUDACHECKIGNORE(cudaIpcCloseMemHandle(me->ptr)); free(me); return ncclSuccess; } static ncclResult_t registerIntraNodeBuffers( struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue* cleanupQueue, bool* regNeedConnect ) { ncclResult_t result = ncclSuccess; info->regBufType = NCCL_REGULAR_BUFFER; *regNeedConnect = true; #if CUDART_VERSION >= 11030 if ((info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) && comm->nvlsRegSupport) { bool regBufUsed = false; const void *sendbuff = info->sendbuff; void *recvbuff = info->recvbuff; if (info->func == ncclFuncAllGather) sendbuff = NULL; if (info->func == ncclFuncReduceScatter) recvbuff = NULL; size_t elementSize = ncclTypeSize(info->datatype); size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); /* first try local registration. */ if (ncclParamLocalRegister()) { ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, ®BufUsed, outRegBufSend, outRegBufRecv); } if (regBufUsed == false && comm->planner.persistent && ncclParamGraphRegister()) { ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, ®BufUsed, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts); } if (regBufUsed) { *regNeedConnect = false; /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to * saturate bandwidth. */ if (comm->nNodes == 1) { if (info->func == ncclFuncReduceScatter) info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); else info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); } else { info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); } info->regBufType = NCCL_NVLS_REG_BUFFER; } } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other comm->intraRanks < comm->localRanks && // only with inter-process & intra-node peers comm->planner.persistent && 0) { /* Disable CollnetDirect registration since it does not support cuMem* allocated memory. 
*/ int localRank = comm->localRank; cudaPointerAttributes sattr, rattr; CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff)); CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff)); if (sattr.type != cudaMemoryTypeDevice || rattr.type != cudaMemoryTypeDevice) return ncclSuccess; if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess; struct HandlePair { cudaIpcMemHandle_t ipc[2]; // {send, recv} size_t offset[2]; // {send, recv} }; struct HandlePair handles[NCCL_MAX_LOCAL_RANKS]; CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback); CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback); void *baseSend, *baseRecv; size_t size; CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff)); handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend; CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff)); handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv; NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair))); // Open handles locally for (int i=0; i < comm->localRanks; i++) { if (i == localRank) { // Skip self outRegBufSend[i] = nullptr; outRegBufRecv[i] = nullptr; } else { for (int sr=0; sr < 2; sr++) { // Get base address of mapping void* base; CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess)); // Get real buffer address by adding offset in the mapping (sr == 0 ? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr]; // Enqueue reminder to close memory handle struct ncclIpcCleanupCallback* cb = (struct ncclIpcCleanupCallback*)malloc(sizeof(struct ncclIpcCleanupCallback)); cb->base.fn = cleanupIpc; cb->ptr = base; ncclIntruQueueEnqueue(cleanupQueue, &cb->base); info->nCleanupQueueElts += 1; } } } info->regBufType = NCCL_IPC_REG_BUFFER; } else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv) { size_t elementSize = ncclTypeSize(info->datatype); size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); int sendRegBufFlag = 0; int recvRegBufFlag = 0; void *sendHandle, *recvHandle; if (ncclParamLocalRegister()) { ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle); info->sendMhandle = sendHandle; if (sendRegBufFlag) { ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle); info->recvMhandle = recvHandle; } } if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && comm->planner.persistent && ncclParamGraphRegister()) { ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); info->sendMhandle = sendHandle; if (sendRegBufFlag) { ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); info->recvMhandle = recvHandle; } } if (sendRegBufFlag && recvRegBufFlag) { info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 1)); info->regBufType = 
NCCL_COLLNET_REG_BUFFER; if (sendRegBufFlag == 1 && recvRegBufFlag == 1) { INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, sendbuffSize, info->recvbuff, recvHandle, recvbuffSize); } } } fallback: #endif return result; } static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport); static ncclResult_t getAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* task, int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo = NULL ); static ncclResult_t calcCollChunking( struct ncclComm* comm, struct ncclTaskColl* task, int nChannels, size_t nBytes, /*outputs*/uint32_t* outChunkSize, uint32_t* outDirectFlags, struct ncclProxyOp* proxyOp ); struct ncclKernelPlanBudget { ssize_t inArgsBytes; // Space available within kernel args struct ssize_t outArgsBytes; // Space available outside of args struct (fifo or persistent buf) }; static bool testBudget( struct ncclKernelPlanBudget* budget, int nWorkBatches, ssize_t workBytes ) { ssize_t batchBytes = nWorkBatches*sizeof(struct ncclDevWorkBatch); bool ok = false; ok |= (batchBytes + workBytes <= budget->inArgsBytes); ok |= (batchBytes <= budget->inArgsBytes) && (workBytes <= budget->outArgsBytes); return ok; } // Called once per ncclGroup to organize the user submitted tasks in // comm->planner so that they can be peeled off into plans. ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo) { struct ncclKernelPlanner* planner = &comm->planner; // Tasks from the sorter come out ordered size descending. struct ncclTaskColl* task = ncclTaskCollSorterDequeueAll(&planner->collSorter); // Tasks are assembled by (fn,op,ty) size ascending. struct ncclTaskColl* tasksByFnOpTy[ncclNumFuncs*ncclNumDevRedOps*ncclNumTypes]; memset(tasksByFnOpTy, 0, sizeof(tasksByFnOpTy)); int fnOpTyIndices[ncclNumFuncs*ncclNumDevRedOps*ncclNumTypes]; int fnOpTyCount = 0; // Walk the size sorted tasks, binning them by (fn,op,ty). while (task != nullptr) { struct ncclTaskColl* next = task->next; int index = ((int)task->func*ncclNumDevRedOps + (int)task->opDev.op)*ncclNumTypes + (int)task->datatype; // Add to set of (fn,op,ty) indices on first occurrence if (tasksByFnOpTy[index] == nullptr) fnOpTyIndices[fnOpTyCount++] = index; // Add to LIFO for this (fn,op,ty) task->next = tasksByFnOpTy[index]; tasksByFnOpTy[index] = task; // Next task task = next; } // Walk (fn,op,ty) bins, compute algo and proto etc. Then bin them by their // scheduling constraints (collnet x nvls). struct ncclIntruQueue collBins[2][2] = {}; for (int cursor=0; cursor < fnOpTyCount; cursor++) { struct ncclTaskColl* aggBeg = tasksByFnOpTy[fnOpTyIndices[cursor]]; int collNetSupport = 0; NCCLCHECK(getCollNetSupport(comm, aggBeg, &collNetSupport)); int nvlsSupport = comm->nvlsSupport && (ncclNvlsSupported(aggBeg->opDev.op, aggBeg->datatype) || aggBeg->func == ncclFuncAllGather); // Crudely estimate number of tasks per channel. This is using the wrong number // of channels for NVLS algos, but knowing the algo requires having this value, // so either be crude our iterate until fixed point, we chose the former. int nTasksPerChannel = divUp(comm->planner.nTasksColl, comm->nChannels); do { struct ncclTaskColl* aggEnd = aggBeg->next; struct ncclTaskColl agg = *aggBeg; // We aggregate operations that are within 4X size of each other. 
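      // Illustrative example (sizes are made up): with per-task traffic of
      // 1KB, 2KB, 3KB and 16KB in ascending order, the first three all fall
      // under 4*1KB and are costed together as one 6KB aggregate, while the
      // 16KB task starts a new aggregate. The aggregate is only used to pick
      // algorithm/protocol/channel counts, which are then copied back onto
      // each member task just below.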
while (aggEnd != nullptr && aggEnd->trafficBytes < 4*aggBeg->trafficBytes) { agg.count += aggEnd->count; agg.trafficBytes += aggEnd->trafficBytes; aggEnd = aggEnd->next; } NCCLCHECK(getAlgoInfo(comm, &agg, collNetSupport, nvlsSupport, nTasksPerChannel, simInfo)); agg.devFuncId = ncclDevFuncId(agg.func, agg.opDev.op, agg.datatype, agg.algorithm, agg.protocol); int isCollnet=0, isNvls=0; switch (agg.algorithm) { case NCCL_ALGO_NVLS: case NCCL_ALGO_NVLS_TREE: isNvls = 1; isCollnet = agg.algorithm == NCCL_ALGO_NVLS && comm->nNodes > 1; break; case NCCL_ALGO_COLLNET_CHAIN: case NCCL_ALGO_COLLNET_DIRECT: isCollnet = 1; break; } // Update the aggregated tasks with the computed values. do { struct ncclTaskColl* next = aggBeg->next; aggBeg->algorithm = agg.algorithm; aggBeg->protocol = agg.protocol; aggBeg->nMaxChannels = agg.nMaxChannels; aggBeg->nWarps = agg.nWarps; aggBeg->devFuncId = agg.devFuncId; aggBeg->isCollnet = isCollnet; aggBeg->isNvls = isNvls; ncclIntruQueueEnqueue(&collBins[isCollnet][isNvls], aggBeg); aggBeg = next; } while (aggBeg != aggEnd); } while (aggBeg != nullptr); } // Concatenate `collBins[*][*]` together into final list `planner->collTaskQueue`. // Collnet is the outer dimension since that affects how we divide over the // channels. for (int isCollnet=0; isCollnet <= 1; isCollnet++) { for (int isNvls=0; isNvls <= 1; isNvls++) { ncclIntruQueueTransfer(&planner->collTaskQueue, &collBins[isCollnet][isNvls]); } } // Walk tasks again to: // 1. Possibly register buffers. // 2. Build ncclDevWorkColl structs. // 3. Bin the work structs according to the number of valid channels they // may be assigned to {collnet, nvls, standard} task = ncclIntruQueueHead(&planner->collTaskQueue); while (task != nullptr) { // Build a ncclDevWorkColl[Reg?] struct for each task. 
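    // A plain ncclDevWorkColl is enough for NCCL_REGULAR_BUFFER and
    // NCCL_COLLNET_REG_BUFFER. NCCL_IPC_REG_BUFFER and NCCL_NVLS_REG_BUFFER
    // additionally need the peer buffer pointers gathered by
    // registerIntraNodeBuffers(), so those cases emit the larger
    // ncclDevWorkCollReg wrapper instead (see the switch below).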
void* regBufSend[NCCL_MAX_LOCAL_RANKS]; void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; bool regNeedConnect = true; registerIntraNodeBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, ®NeedConnect); if (comm->runtimeConn && comm->initAlgoChannels[task->algorithm] == false) { if (task->algorithm == NCCL_ALGO_NVLS_TREE && comm->initAlgoChannels[NCCL_ALGO_NVLS] == false && regNeedConnect == true) { comm->initAlgoChannels[NCCL_ALGO_NVLS] = true; algoNeedConnect[NCCL_ALGO_NVLS] = true; } if (task->algorithm != NCCL_ALGO_NVLS || regNeedConnect == true) { comm->initAlgoChannels[task->algorithm] = true; algoNeedConnect[task->algorithm] = true; *needConnect = true; } } struct ncclDevWorkColl devWork = {}; devWork.sendbuff = (void*)task->sendbuff; devWork.recvbuff = (void*)task->recvbuff; devWork.root = task->root; devWork.nWarps = task->nWarps; devWork.redOpArg = task->opDev.scalarArg; devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr; devWork.oneNode = (comm->nNodes == 1); devWork.regUsed = task->regBufType; struct ncclWorkList* workNode; switch (task->regBufType) { case NCCL_REGULAR_BUFFER: case NCCL_COLLNET_REG_BUFFER: { workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); workNode->workType = ncclDevWorkTypeColl; workNode->size = sizeof(struct ncclDevWorkColl); memcpy((void*)(workNode+1), (void*)&devWork, workNode->size); } break; case NCCL_IPC_REG_BUFFER: { struct ncclDevWorkCollReg workReg = {}; workReg.coll = devWork; struct ncclChannel *channel0 = &comm->channels[0]; for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { int peer = channel0->collnetDirect.down[i]; if (peer == -1) break; int j = comm->rankToLocalRank[peer]; // Get intra-node slot workReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer workReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer } for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { int peer = channel0->collnetDirect.up[i]; if (peer == -1) break; int j = comm->rankToLocalRank[peer]; // Output buffer of root peer workReg.upOutputs[i] = regBufRecv[j]; } workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); workNode->workType = ncclDevWorkTypeCollReg; workNode->size = sizeof(struct ncclDevWorkCollReg); memcpy((void*)(workNode+1), (void*)&workReg, workNode->size); } break; case NCCL_NVLS_REG_BUFFER: { struct ncclDevWorkCollReg workReg = {}; workReg.coll = devWork; // C++ struct assignment /* NVLS only has one send and recv buffer registered */ workReg.dnInputs[0] = regBufSend[0]; workReg.dnOutputs[0] = regBufRecv[0]; workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); workNode->workType = ncclDevWorkTypeCollReg; workNode->size = sizeof(struct ncclDevWorkCollReg); memcpy((void*)(workNode+1), (void*)&workReg, workNode->size); } break; default: /* impossible value */ WARN("Invalid regBufType %d", task->regBufType); return ncclInvalidArgument; } ncclIntruQueueEnqueue(&planner->collWorkQueue, workNode); task = task->next; } return ncclSuccess; } static ncclResult_t scheduleCollTasksToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclKernelPlanBudget* budget ) { struct ncclKernelPlanner* planner = &comm->planner; // Estimate number of tasks that will fit in this plan. 
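  // This pass only sizes things up: it walks the task list without dequeuing,
  // checks the batch/work byte budget (assuming roughly 4 colls per batch),
  // and accumulates per-kind traffic and channel counts, where a "kind" is the
  // (isCollnet, isNvls) pair. The loop below does the real placement.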
int nPlanColls = 0; size_t trafficBytes[2*2] = {0, 0, 0, 0}; // [collnet][nvls] int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls] int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls] comm->nChannels, comm->nvlsChannels}; do { size_t workBytes = 0; struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue); struct ncclWorkList* workNode = ncclIntruQueueHead(&planner->collWorkQueue); while (task != nullptr) { int nBatches = divUp(nPlanColls, 4); // Rough guess: 4 colls per batch. if (!testBudget(budget, nBatches, workBytes + workNode->size)) goto plan_full; nPlanColls += 1; workBytes += workNode->size; int kind = 2*task->isCollnet + task->isNvls; trafficBytes[kind] += task->trafficBytes; nChannels[kind] += task->nMaxChannels; nChannels[kind] = std::min(nChannels[kind], nMaxChannels[kind]); task = task->next; workNode = workNode->next; } plan_full:; } while (0); int kindPrev = -1; constexpr size_t MinTrafficPerChannel = 512; size_t trafficPerChannel = 0; int channelId = 0; size_t currentTraffic = 0; while (nPlanColls!=0 && !ncclIntruQueueEmpty(&planner->collTaskQueue)) { struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue); struct ncclWorkList* workNode = ncclIntruQueueHead(&planner->collWorkQueue); struct ncclDevWorkColl* devWork = (struct ncclDevWorkColl*)(workNode+1); size_t elementSize = ncclTypeSize(task->datatype); int kind = 2*task->isCollnet + task->isNvls; if (kind != kindPrev) { trafficPerChannel = std::max(MinTrafficPerChannel, trafficBytes[kind]/nChannels[kind]); kindPrev = kind; channelId = 0; currentTraffic = 0; } if (task->isCollnet) { int nChannels = task->nMaxChannels; // Ensure room for worst case of one new batch per channel if (!testBudget(budget, plan->nWorkBatches + nChannels, plan->workBytes + workNode->size)) { return ncclSuccess; } size_t globalBytesPerElement = elementSize*ncclFuncMaxSendRecvCount(task->func, comm->nRanks, 1); struct ncclProxyOp proxyOp; uint32_t chunkSize, directFlags=0; NCCLCHECK(calcCollChunking(comm, task, nChannels, globalBytesPerElement*task->count, &chunkSize, &directFlags, &proxyOp)); devWork->channelLo = 0; devWork->channelHi = nChannels-1; devWork->collnet.count = task->count; devWork->collnet.chunkCount = chunkSize/ncclTypeSize(task->datatype); devWork->direct = directFlags; uint64_t proxyOpId = uint64_t(plan->collOpCount++)<<1 | 0; for (int c=devWork->channelLo; c <= (int)devWork->channelHi; c++) { proxyOp.channelId = c; proxyOp.opCount = proxyOpId; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); } } else { // not task->isCollnet constexpr size_t cellSize = 16; int elementsPerCell = cellSize/elementSize; size_t cells = divUp(task->count*elementSize, cellSize); int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks); size_t trafficPerElement = elementSize*trafficPerByte; size_t trafficPerCell = cellSize*trafficPerByte; size_t cellsPerChannel = std::min(cells, divUp(trafficPerChannel, trafficPerCell)); size_t cellsLo; if (channelId+1 == nMaxChannels[kind]) { // On last channel everything goes to "lo" cellsLo = cells; } else { cellsLo = std::min(cells, (trafficPerChannel-currentTraffic)/trafficPerCell); } int nMidChannels = (cells-cellsLo)/cellsPerChannel; size_t cellsHi = (cells-cellsLo)%cellsPerChannel; int nChannels = (cellsLo!=0 ? 1 : 0) + nMidChannels + (cellsHi!=0 ? 
1 : 0); if (nMaxChannels[kind] < channelId + nChannels) { // Overflowed available channels nMidChannels = nMaxChannels[kind] - channelId - 2; cellsPerChannel = (cells-cellsLo)/(nMidChannels+1); cellsHi = cellsPerChannel + (cells-cellsLo)%(nMidChannels+1); } if (cellsHi == 0 && nMidChannels != 0) { cellsHi = cellsPerChannel; nMidChannels -= 1; } if (cellsLo == 0) { // Least channel skipped. Make the next channel the new least. channelId += 1; if (nMidChannels == 0) { cellsLo = cellsHi; cellsHi = 0; } else { cellsLo = cellsPerChannel; nMidChannels -= 1; } } size_t countMid = nMidChannels!=0 ? cellsPerChannel*elementsPerCell : 0; size_t countLo = cellsLo*elementsPerCell; size_t countHi = cellsHi*elementsPerCell; (countHi != 0 ? countHi : countLo) -= cells*elementsPerCell - task->count; nChannels = (countLo!=0 ? 1 : 0) + nMidChannels + (cellsHi!=0 ? 1 : 0); // Ensure room for worst case of one new batch per channel if (!testBudget(budget, plan->nWorkBatches + nChannels, plan->workBytes + workNode->size)) { return ncclSuccess; } devWork->channelLo = channelId; devWork->channelHi = channelId + nChannels-1; devWork->cbd.countLo = countLo; devWork->cbd.countMid = countMid; devWork->cbd.countHi = countHi; // calcCollChunking() uses global bytes instead of traffic which differs // in that allreduce isn't multiplied by 2. size_t globalBytesPerElement = elementSize*ncclFuncMaxSendRecvCount(task->func, comm->nRanks, 1); struct ncclProxyOp proxyOpLo, proxyOpMid, proxyOpHi; uint32_t chunkSize, directFlags=0; size_t grainSize = ncclProtoGrainSize(task->protocol); if (countLo != 0) { NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countLo, &chunkSize, &directFlags, &proxyOpLo)); devWork->cbd.chunkGrainsLo = chunkSize/grainSize; } if (countHi != 0) { NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countHi, &chunkSize, &directFlags, &proxyOpHi)); devWork->cbd.chunkGrainsHi = chunkSize/grainSize; } if (nMidChannels != 0) { NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countMid, &chunkSize, &directFlags, &proxyOpMid)); devWork->cbd.chunkGrainsMid = chunkSize/grainSize; } devWork->direct = directFlags; // Update the current channel and vacant traffic budget. 
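      // Keep a cursor on the channel where the next task should start filling,
      // together with the traffic already assigned to it; once a channel holds
      // roughly trafficPerChannel of traffic we move on to a fresh channel.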
if (countHi != 0) { channelId += nChannels-1; currentTraffic = countHi*trafficPerElement; } else if (nMidChannels != 0) { channelId += nChannels; currentTraffic = 0; } else { currentTraffic += countLo*trafficPerElement; } if (currentTraffic >= trafficPerChannel && channelId+1 != nMaxChannels[kind]) { channelId += 1; currentTraffic = 0; } uint64_t proxyOpId = uint64_t(plan->collOpCount++)<<1 | 0; for (int c=devWork->channelLo; c <= (int)devWork->channelHi; c++) { struct ncclProxyOp* proxyOp; if (c == (int)devWork->channelLo) { proxyOp = &proxyOpLo; } else if (c == (int)devWork->channelHi) { proxyOp = &proxyOpHi; } else { proxyOp = &proxyOpMid; } proxyOp->channelId = c; proxyOp->opCount = proxyOpId; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp)); } } plan->channelMask |= (2ull<channelHi) - (1ull<channelLo); plan->threadPerBlock = std::max(plan->threadPerBlock, task->nWarps*WARP_SIZE); if (!plan->kernelSpecialized) { plan->kernelFn = ncclDevKernelForFunc[task->devFuncId]; plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[task->devFuncId]; } if (comm->rank == 0) { if (task->isCollnet) { TRACE(NCCL_COLL, "Collective %s(%s, %s, %s, %s) count=%ld devFuncId=%d channel{Lo..Hi}={%d..%d} count=%ld chunkCount=%d", ncclFuncToString(task->func), ncclDevRedOpToString(task->opDev.op), ncclDatatypeToString(task->datatype), ncclAlgoToString(task->algorithm), ncclProtoToString(task->protocol), (long)task->count, task->devFuncId, devWork->channelLo, devWork->channelHi, (long)devWork->collnet.count, devWork->collnet.chunkCount); } else { TRACE(NCCL_COLL, "Collective %s(%s, %s, %s, %s) count=%ld devFuncId=%d channel{Lo..Hi}={%d..%d} count{Lo,Mid,Hi}={%ld,%ld,%ld} chunkBytes{Lo,Mid,Hi}={%d,%d,%d}", ncclFuncToString(task->func), ncclDevRedOpToString(task->opDev.op), ncclDatatypeToString(task->datatype), ncclAlgoToString(task->algorithm), ncclProtoToString(task->protocol), (long)task->count, task->devFuncId, devWork->channelLo, devWork->channelHi, (long)devWork->cbd.countLo, (long)devWork->cbd.countMid, (long)devWork->cbd.countHi, int(devWork->cbd.chunkGrainsLo*ncclProtoGrainSize(task->protocol)), int(devWork->cbd.chunkGrainsMid*ncclProtoGrainSize(task->protocol)), int(devWork->cbd.chunkGrainsHi*ncclProtoGrainSize(task->protocol))); } } for (int i=0; i < task->nCleanupQueueElts; i++) { ncclIntruQueueEnqueue(&plan->cleanupQueue, ncclIntruQueueDequeue(&planner->collCleanupQueue)); } ncclIntruQueueDequeue(&planner->collTaskQueue); ncclIntruQueueDequeue(&planner->collWorkQueue); nPlanColls -= 1; planner->nTasksColl -= 1; ncclIntruQueueEnqueue(&plan->workQueue, workNode); plan->workBytes += workNode->size; } return ncclSuccess; } NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384); NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0); // Put p2p op in plan assuming there is sizeof(ncclDevWorkBatch) in batch budget // and sizeof(ncclDevWorkP2p) in work budget. "sendRank" and "recvRank" must // match the corresponding values for this round of the p2p schedule (no -1's). // No-op's are encoded with a -1 size. 
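// Typical call, as made from scheduleP2pTasksToPlan() for one round of the
// schedule (a peer with no send or no recv this round passes -1 bytes):
//   NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round,
//                          sendRank, sendBuff, sendBytes,
//                          recvRank, recvBuff, recvBytes));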
static ncclResult_t addP2pToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, int nChannelsMin, int nChannelsMax, int p2pRound, int sendRank, void* sendAddr, ssize_t sendBytes, int recvRank, void* recvAddr, ssize_t recvBytes ) { constexpr int connIndex = 1; bool selfSend = (sendRank == comm->rank); // recv: dir=0, send: dir=1 void* addrs[2] = {recvAddr, sendAddr}; ssize_t bytes[2] = {recvBytes, sendBytes}; bool protoLL[2] = {!selfSend, !selfSend}; bool network[2] = {false, false}; bool proxySameProcess[2] = {true, true}; uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound); if (!selfSend) { for (int part=0; part < nChannelsMax; part++) { int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers; for (int dir=0; dir <= 1; dir++) { int peerRank = dir ? sendRank : recvRank; struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex] : &channelPeers[peerRank]->recv[connIndex]; protoLL[dir] &= conn->conn.buffs[NCCL_PROTO_LL] != nullptr; network[dir] |= conn->transportComm == (dir ? &netTransport.send : &netTransport.recv); proxySameProcess[dir] &= conn->proxyConn.sameProcess; } } } ssize_t thresholdLL = nChannelsMax*ncclParamP2pLLThreshold(); ssize_t paramChunkSize = ncclParamChunkSize(); // Arrays indexed by dir where recv=0, send=1: int nChannels[2]; int protocol[2]; int stepSize[2]; int chunkSize[2]; int chunkDataSize[2]; int chunkDataSize_u32fp8[2]; bool registered[2]; for (int dir=0; dir < 2; dir++) { // 0=recv, 1=send if (bytes[dir] != -1) protoLL[dir] &= bytes[dir] <= thresholdLL; protocol[dir] = protoLL[dir] ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE; stepSize[dir] = comm->buffSizes[protocol[dir]]/NCCL_STEPS; if (protocol[dir] == NCCL_PROTO_SIMPLE) stepSize[dir] = comm->p2pChunkSize; chunkSize[dir] = stepSize[dir]; if (paramChunkSize != 0) { chunkSize[dir] = paramChunkSize; } else if (network[dir]) { // Tune chunk size for the network if (protocol[dir] == NCCL_PROTO_SIMPLE && bytes[dir] < stepSize[dir]) chunkSize[dir] /= 4; else if (bytes[dir] < 8*stepSize[dir]) chunkSize[dir] /= 2; } chunkDataSize[dir] = chunkSize[dir]; if (protocol[dir] == NCCL_PROTO_LL) chunkDataSize[dir] /= 2; chunkDataSize_u32fp8[dir] = u32fp8Encode(chunkDataSize[dir]); chunkDataSize[dir] = u32fp8Decode(chunkDataSize_u32fp8[dir]); chunkSize[dir] = chunkDataSize[dir]; if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2; registered[dir] = false; if (bytes[dir] > 0 && network[dir] && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) { struct ncclReg* regRecord; NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], ®Record)); registered[dir] = (regRecord && regRecord->nDevs); } if (bytes[dir] == -1) nChannels[dir] = 0; else if (bytes[dir] == 0) nChannels[dir] = 1; else { ssize_t minPartSize = comm->nNodes > 1 ? stepSize[dir]/2 : stepSize[dir]/8; ssize_t maxPartSize = comm->nNodes > 1 ? 
stepSize[dir] : stepSize[dir]*32; nChannels[dir] = std::min(nChannelsMin, divUp(bytes[dir], minPartSize)); size_t partSize = std::max(minPartSize, divUp(bytes[dir], nChannels[dir])); while (partSize > maxPartSize && nChannels[dir] <= nChannelsMax/2) { nChannels[dir] *= 2; partSize = divUp(bytes[dir], nChannels[dir]); } } } struct ncclWorkList* workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); workNode->workType = ncclDevWorkTypeP2p; workNode->size = sizeof(struct ncclDevWorkP2p); ncclIntruQueueEnqueue(&plan->workQueue, workNode); uint32_t workOffset = plan->workBytes; plan->workBytes += sizeof(struct ncclDevWorkP2p); struct ncclDevWorkP2p* work = (struct ncclDevWorkP2p*)(workNode+1); work->nP2pChannels = comm->p2pnChannels; work->channelBase = base; work->nSendChannels = nChannels[1]; work->sendProtoLL = protoLL[1]; work->sendRegistered = registered[1]; work->sendChunkSize_u32fp8 = chunkDataSize_u32fp8[1]; work->sendRank = sendRank; work->sendAddr = sendAddr; work->sendBytes = sendBytes==-1 ? 0 : sendBytes; work->nRecvChannels = nChannels[0]; work->recvProtoLL = protoLL[0]; work->recvRegistered = registered[0]; work->recvChunkSize_u32fp8 = chunkDataSize_u32fp8[0]; work->recvRank = recvRank; work->recvAddr = recvAddr; work->recvBytes = recvBytes==-1 ? 0 : recvBytes; struct ncclProxyOp proxyOps[2] = {}; int nProxyOps = selfSend ? 0 : 2; for (int dir=0; dir < nProxyOps; dir++) { struct ncclProxyOp* op = &proxyOps[dir]; op->root = dir ? sendRank : recvRank; op->sliceSteps = 1; op->chunkSteps = 1; op->dtype = ncclInt8; op->redOp = ncclSum; op->protocol = protocol[dir]; op->pattern = dir ? ncclPatternSend : ncclPatternRecv; op->chunkSize = chunkSize[dir]; op->reg = registered[dir]; // The following are modified per channel part in addWorkToChannels(): // op->buffer, op->nbytes, op->nsteps = ...; } nChannelsMax = std::max(nChannels[0], nChannels[1]); for (int part=0; part < nChannelsMax; part++) { int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); plan->channelMask |= uint64_t(1)<nSendChannels : work->nRecvChannels; void* addr = dir ? work->sendAddr : work->recvAddr; size_t bytes = dir ? work->sendBytes : work->recvBytes; proxyOps[dir].recvbuff = nullptr; if (nParts <= part) { proxyOps[dir].nsteps = 0; } else if (bytes == 0) { proxyOps[dir].nsteps = 1; proxyOps[dir].nbytes = 0; } else { size_t chunkDataSize = u32fp8Decode(dir ? work->sendChunkSize_u32fp8 : work->recvChunkSize_u32fp8); size_t partBeg, partEnd; ncclP2pPartBounds(nParts, part, bytes, &partBeg, &partEnd); if (proxyOps[dir].reg) { proxyOps[dir].nsteps = 1; proxyOps[dir].recvbuff = (uint8_t*)addr+partBeg; proxyOps[dir].nbytes = partEnd-partBeg; } else { proxyOps[dir].nsteps = divUp(partEnd-partBeg, chunkDataSize); proxyOps[dir].nbytes = std::min(partEnd-partBeg, chunkDataSize); } if (proxyOps[dir].protocol == NCCL_PROTO_LL) { proxyOps[dir].nbytes *= 2; proxyOps[dir].nbytes = roundUp(proxyOps[dir].nbytes, sizeof(union ncclLLFifoLine)); } } if (proxyOps[dir].nsteps != 0) { // Calculate the opCount after adding batch since then the batch count will // equal one plus the batch index this p2p settled in. 
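        // The low bit of opCount tags the op: 1 for p2p, 0 for collectives
        // (which use plan->collOpCount<<1 | 0). uploadProxyOps() later uses
        // this bit to rebase the two counter spaces separately, and the
        // merge-sort in finishPlan() rotates the tag into the top bit so that,
        // for equal counts, collective ops order ahead of p2p ops.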
proxyOps[dir].channelId = channelId; proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1; NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir])); } } } return ncclSuccess; } static int calcP2pChannelCount(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) { size_t size = std::max(minSize, divUp(totalSize, minChannels)); int nChannels = minChannels; while (size > maxSize && nChannels <= maxChannels/2) { nChannels *= 2; size = divUp(totalSize, nChannels); } return nChannels; } static ncclResult_t scheduleP2pTasksToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclKernelPlanBudget* budget ) { int nRanks = comm->nRanks; struct ncclKernelPlanner::Peer* peers = comm->planner.peers; plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS); if (!plan->kernelSpecialized) { plan->kernelFn = ncclDevKernelForFunc[ncclDevFuncId_P2p()]; plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[ncclDevFuncId_P2p()]; } // Compute how much to split operations // Try to use all channels int nChannelsMax = comm->p2pnChannelsPerPeer; int nChannelsMin = nChannelsMax; // Try to use all channels, but one channel per operation. while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2; while (comm->planner.nTasksP2p != 0) { for (int round=0; round < nRanks; round++) { int sendRank = comm->p2pSchedule[round].sendRank; int recvRank = comm->p2pSchedule[round].recvRank; struct ncclTaskP2p* send = ncclIntruQueueHead(&peers[sendRank].sendQueue); struct ncclTaskP2p* recv = ncclIntruQueueHead(&peers[recvRank].recvQueue); if (send == nullptr && recv == nullptr) continue; if (sendRank == comm->rank) { if (send != nullptr && recv == nullptr) { WARN("Trying to send to self without a matching recv"); return ncclInvalidUsage; } if (send == nullptr && recv != nullptr) { WARN("Trying to recv to self without a matching send"); return ncclInvalidUsage; } } ssize_t sendBytes = send ? send->bytes : -1; ssize_t recvBytes = recv ? recv->bytes : -1; void* sendBuff = send ? send->buff : nullptr; void* recvBuff = recv ? recv->buff : nullptr; if (sendRank == comm->rank && send->buff == recv->buff) { // Skip send to self in-place (we don't need to support this). ncclIntruQueueDequeue(&peers[sendRank].sendQueue); ncclIntruQueueDequeue(&peers[recvRank].recvQueue); comm->planner.nTasksP2p -= 2; } else { // Ensure room for worst case of one new batch per channel. if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) { return ncclSuccess; } NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes)); if (send != nullptr) { ncclIntruQueueDequeue(&peers[sendRank].sendQueue); comm->planner.nTasksP2p -= 1; } if (recv != nullptr) { ncclIntruQueueDequeue(&peers[recvRank].recvQueue); comm->planner.nTasksP2p -= 1; } } } } return ncclSuccess; } // Spin until its safe to increase comm->workFifoProduced to desiredProduced. static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduced) { bool hasRoom = (desiredProduced - comm->workFifoConsumedLeast) <= comm->workFifoBytes; if (hasRoom) return; while (true) { // We have to poll for notifications from device. 
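    // All counters here are free-running uint32_t byte counts, so the test
    // (produced - consumed) <= workFifoBytes stays correct across 32-bit
    // wraparound. For example (illustrative numbers), produced=0x00000010
    // just after wrapping and consumed=0xFFFFFFF0 gives a difference of 0x20,
    // i.e. 32 bytes still in flight.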
uint32_t* consumedLive = comm->workFifoConsumed; uint32_t consumed[MAXCHANNELS]; for (int c=0; c < MAXCHANNELS; c++) { consumed[c] = __atomic_load_n(&consumedLive[c], __ATOMIC_RELAXED); } // Compiler-only fence to prevent fusion of loops to encourage dense loads. __atomic_signal_fence(__ATOMIC_SEQ_CST); uint32_t produced = comm->workFifoProduced; uint32_t consumedLeast = produced; for (int c=0; c < MAXCHANNELS; c++) { // consumedLeast is min over all non-quiesced channels if (consumed[c] != comm->channels[c].workFifoProduced) { if ((produced - consumedLeast) < (produced - consumed[c])) { consumedLeast = consumed[c]; } } } // Compiler only fence to prevent fusion of loops to encourage dense stores. __atomic_signal_fence(__ATOMIC_SEQ_CST); for (int c=0; c < MAXCHANNELS; c++) { // Advance counter on quiesced channels so they don't lag behind // too far where they could get lost in 32-bit wraparound. if (consumed[c] == comm->channels[c].workFifoProduced) { comm->channels[c].workFifoProduced = consumedLeast; __atomic_store_n(&consumedLive[c], consumedLeast, __ATOMIC_RELAXED); } } comm->workFifoConsumedLeast = consumedLeast; hasRoom = (desiredProduced - comm->workFifoConsumedLeast) <= comm->workFifoBytes; if (hasRoom) break; sched_yield(); } } static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) { size_t workBytes = plan->workBytes; size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch); void* fifoBuf; uint32_t fifoCursor, fifoMask; switch (plan->workStorageType) { case ncclDevWorkStorageTypeArgs: plan->kernelArgs->workBuf = nullptr; fifoBuf = (void*)plan->kernelArgs; fifoCursor = sizeof(ncclDevKernelArgs) + batchBytes; fifoMask = ~0u; break; case ncclDevWorkStorageTypeFifo: fifoBuf = comm->workFifoBuf; fifoCursor = comm->workFifoProduced; fifoMask = comm->workFifoBytes-1; waitWorkFifoAvailable(comm, fifoCursor + workBytes); plan->kernelArgs->workBuf = comm->workFifoBufDev; break; case ncclDevWorkStorageTypePersistent: ncclMemoryStackPush(&comm->memScoped); fifoBuf = ncclMemoryStackAlloc(&comm->memScoped, workBytes, /*align=*/16); fifoCursor = 0; fifoMask = ~0u; break; default: return ncclInternalError; } plan->kernelArgs->workMask = fifoMask; // Batches were placed after kernelArgs by finishPlan(). Only thing left to // do is translate the work offset from zero based (in plan) to: // ncclDevWorkStorageTypeArgs: offset from beginning of kernel args // ncclDevWorkStorageTypeFifo: offset from base of fifo // ncclDevWorkStorageTypePersistent: no translation since our dedicated buffer will also begin at zero. struct ncclDevWorkBatch* batchZero = (struct ncclDevWorkBatch*)(plan->kernelArgs+1); for (int b=0; b < plan->nWorkBatches; b++) { batchZero[b].offsetBase += fifoCursor; } // Write the channel-shared work structs. 
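  // Work structs are copied in 16-byte chunks (their sizes are multiples of
  // 16). fifoMask is workFifoBytes-1 when writing into the device-visible
  // fifo, which is presumed power-of-two sized, so the cursor wraps in place;
  // for the args and persistent staging paths it is ~0u and the mask is a no-op.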
struct ncclWorkList* workNode = ncclIntruQueueHead(&plan->workQueue); while (workNode != nullptr) { char* dst = (char*)fifoBuf; char* src = (char*)(workNode+1); for (int n = workNode->size; n != 0; n -= 16) { memcpy( __builtin_assume_aligned(dst + (fifoCursor & fifoMask), 16), __builtin_assume_aligned(src, 16), 16 ); fifoCursor += 16; src += 16; } workNode = workNode->next; } switch (plan->workStorageType) { case ncclDevWorkStorageTypeFifo: comm->workFifoProduced = fifoCursor; if (comm->workFifoBufGdrHandle != nullptr) wc_store_fence(); break; case ncclDevWorkStorageTypePersistent: NCCLCHECK(ncclCudaMalloc(&plan->workBufPersistent, workBytes)); plan->kernelArgs->workBuf = plan->workBufPersistent; NCCLCHECK(ncclCudaMemcpy(plan->workBufPersistent, fifoBuf, workBytes)); ncclMemoryStackPop(&comm->memScoped); break; default: break; } return ncclSuccess; } static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* plan) { uint64_t collOpCount = comm->sharedRes->collOpCount; uint64_t p2pOpBump[MAXCHANNELS] = {/*0...*/}; // Advance comm's collOpCount by number of colls in this plan. comm->sharedRes->collOpCount += plan->collOpCount; struct ncclProxyOp* op = ncclIntruQueueHead(&plan->proxyOpQueue); while (op != nullptr) { uint64_t oldId = op->opCount; // Ignoring the bottom tag bit, opCount's are zero-based within plan so // translate them to the tip of the comm's history. if (oldId & 1) { // p2p // opCount is monotonic increasing within a plan's channel so just // remember last value to compute max. p2pOpBump[op->channelId] = (oldId>>1) + 1; // +1 to ensure next plan doesn't collide op->opCount = (comm->sharedRes->p2pOpCount[op->channelId]<<1) + oldId; } else { // coll op->opCount = (collOpCount<<1) + oldId; } NCCLCHECK(ncclProxySaveOp(comm, op, nullptr)); op->opCount = oldId; // Restore for next uploadProxyOps() struct ncclProxyOp* opNext = op->enqNext; if (!plan->persistent) { // Non-persistent kernels upload ops only once so can be free'd here. ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, op); } op = opNext; } // Erase proxyOpQueue since all ops were free'd back to mempool. if (!plan->persistent) ncclIntruQueueConstruct(&plan->proxyOpQueue); for (int c=0; c < MAXCHANNELS; c++) { // Advance channel's p2pOpCount by number of p2p's in this plan channel. comm->sharedRes->p2pOpCount[c] += p2pOpBump[c]; } return ncclSuccess; } static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelPlan* plan) { NCCLCHECK(uploadProxyOps(comm, plan)); NCCLCHECK(ncclProxyStart(comm)); if (!plan->persistent) { // Notify main thread of our reclaiming. This will reclaim plan concurrently. 
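    // plan->reclaimer.fn was set to reclaimPlan() when the plan was created;
    // the owning thread drains comm->callbackQueue in ncclCommPollCallbacks()
    // (see ncclLaunchPrepare below) and frees the plan back to its pool there.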
ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer); } return ncclSuccess; } static void CUDART_CB hostStreamPlanCallback(void *plan_) { NVTX3_FUNC_RANGE_IN(nccl_domain); struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_; ncclResult_t result = hostStreamPlanTask(plan->comm, plan); if (result != ncclSuccess) { WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result)); } } static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) { struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` if (plan->persistent) { comm->persistentRefs -= 1; NCCLCHECK(ncclCudaFree(plan->workBufPersistent)); struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue); while (q != nullptr) { struct ncclProxyOp* q1 = q->enqNext; ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q); q = q1; } ncclResult_t result = ncclSuccess; while (!ncclIntruQueueEmpty(&plan->cleanupQueue)) { struct ncclCommCallback* cb = ncclIntruQueueDequeue(&plan->cleanupQueue); ncclResult_t res1 = cb->fn(comm, cb); // Expect to reclaim memory of cb if (res1 != ncclSuccess) result = res1; } NCCLCHECK(result); } ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); return ncclSuccess; } static void persistentDestructor(void* plans_) { struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plans_; struct ncclComm* comm = plan->comm; while (plan != nullptr) { struct ncclKernelPlan* next = plan->next; ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer); plan = next; } } ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { ncclResult_t result = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; bool persistent = ncclCudaGraphValid(planner->capturingGraph); planner->persistent = persistent; int nPlans = 0; // Poll for callbacks sent to us from other threads. Typically these free // resources from to our memory pools. NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/false)); if (planner->nTasksColl + planner->nTasksP2p != 0) { do { memset(&planner->wipPlan, 0, sizeof(planner->wipPlan)); struct ncclKernelPlan* plan = ncclMemoryPoolAlloc(&comm->memPool_ncclKernelPlan, &comm->memPermanent); plan->comm = comm; plan->reclaimer.fn = reclaimPlan; plan->persistent = persistent; // uploadWork() promotes ncclDevWorkStorageType[Fifo|Buf]->Args if the work can fit. plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent : ncclDevWorkStorageTypeFifo; struct ncclKernelPlanBudget budget; budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs); // Non-persistent kernels fill up at most half of our fifo per kernel. budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2; // Drain coll tasks first. This is essential since we partition tasks based // on the work budget and p2p work isn't collective. If we were to drain p2p // first, the place where we cut the kernel could vary by rank which would // cause the "shortest channel first" channel picker to have divergent results. if (planner->nTasksColl != 0) { NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure); } // And only drain p2p tasks once colls are depleted. 
if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) { NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure); } finishPlan(comm, plan); if (plan->workBytes != 0) { ncclIntruQueueEnqueue(&planner->planQueue, plan); nPlans += 1; } } while (planner->nTasksColl + planner->nTasksP2p != 0); struct ncclKernelPlan* planHead = ncclIntruQueueHead(&planner->planQueue); planner->unlaunchedPlansHead = planHead; if (nPlans == 0) return ncclSuccess; // Semantically we want these dependencies for the kernels launched: // 1. Launch host task on hostStream. // 2. Launch kernel, depends on all of {deviceStream, hostStream, userStream[i]...} // 3. {deviceStream, userStream[i]...} depend on kernel. // We achieve this by: // 1. userStream[0] waits on deviceStream // 2. deviceStream waits on each of userStream[1...] // 3. host task launch on hostStream // 4. userStream[0] waits on hostStream // 5. kernel launch on userStream[0] // 6. deviceStream waits on userStream[0] // 7. userStream[1...] each waits on deviceStream // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires // at least one of the two streams to be strong-stream. cudaStream_t launchStream = planner->streams->stream; NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream), result, failure); // Create dependency for device stream on user streams. First from extra user // streams to deviceStream. Then deviceStream to first user stream. for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure); } NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) { // We have to launch host tasks to push proxy args. We are careful to only // do this if necessary since host tasks impose a high performance cost in CUDA. bool acquired = false; for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) { if (plan->hasProxyOps) { if (!acquired) { acquired = true; NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); } NCCLCHECKGOTO(ncclStrongStreamLaunchHost(planner->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); } } if (acquired) { // Make to-be-launched kernels dependent on just-launched host stream tasks. NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure); NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); } } if (persistent) { comm->persistentRefs += nPlans; NCCLCHECKGOTO(ncclCudaGraphAddDestructor(planner->capturingGraph, persistentDestructor, (void*)planHead), result, failure); } } failure: return result; } ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { // This code is called after we've checked in to the intra-process barrier // but before launching the kernel. We are not allowed to call CUDA unless the // kernel launch is captured. 
NCCLCHECK(uploadWork(comm, plan)); return ncclSuccess; } #if CUDART_VERSION >= 12000 // NCCL uses the "Remote" Mem Sync domain by default NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); #endif ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { struct ncclKernelPlanner* planner = &comm->planner; int nChannels = countOneBits(plan->channelMask); void* sym = plan->kernelFn; dim3 grid = {(unsigned)nChannels, 1, 1}; dim3 block = {(unsigned)plan->threadPerBlock, 1, 1}; int smem = ncclShmemDynamicSize(comm->cudaArch); cudaStream_t launchStream = planner->streams->stream; void* extra[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, plan->kernelArgs, CU_LAUNCH_PARAM_BUFFER_SIZE, &plan->kernelArgsSize, CU_LAUNCH_PARAM_END }; CUfunction fn; CUDACHECK(cudaGetFuncBySymbol(&fn, sym)); #if CUDART_VERSION >= 11080 int driverVersion; NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); if (driverVersion >= 11080) { int compCap = comm->compCap; unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0; CUlaunchConfig launchConfig = {0}; CUlaunchAttribute launchAttrs[3]; int attrs = 0; /* Cooperative Group Array (CGA) * On sm90 and later we have an extra level of hierarchy where we * can group together several blocks within the Grid, called * Thread Block Clusters. * Clusters enable multiple thread blocks running concurrently * across multiple SMs to synchronize and collaboratively fetch * and exchange data. A cluster of blocks are guaranteed to be * concurrently scheduled onto a group of SMs. * The maximum value is 8 and it must be divisible into the grid dimensions */ if (clusterSize) { // Grid dimension must be divisible by clusterSize if (grid.x % clusterSize) clusterSize = 1; launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; launchAttrs[attrs++].value.clusterDim = {clusterSize, 1, 1}; launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE; launchAttrs[attrs++].value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD; } #if CUDART_VERSION >= 12000 if (compCap >= 90 && driverVersion >= 12000) { // Set the NCCL Mem Sync domain on CUDA 12.0 and later (sm90) launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN; launchAttrs[attrs++].value.memSyncDomain = (CUlaunchMemSyncDomain) ncclParamMemSyncDomain(); } #endif launchConfig.gridDimX = grid.x; launchConfig.gridDimY = grid.y; launchConfig.gridDimZ = grid.z; launchConfig.blockDimX = block.x; launchConfig.blockDimY = block.y; launchConfig.blockDimZ = block.z; launchConfig.sharedMemBytes = smem; launchConfig.attrs = launchAttrs; launchConfig.numAttrs = attrs; launchConfig.hStream = launchStream; //CUDACHECK(cudaLaunchKernelExC(&launchConfig, fnAddr, args)); CUCHECK(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra)); return ncclSuccess; } #endif // Standard kernel launch CUCHECK(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra)); //CUDACHECK(cudaLaunchKernel(fnAddr, grid, block, args, smem, launchStream)); return ncclSuccess; } ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { if (!(plan->persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking)) { // We are not using the host stream for proxy ops and reclaimation submission. NCCLCHECK(hostStreamPlanTask(comm, plan)); } else { // We are using the host stream for proxy ops and reclaimation submission. // Only plans with proxy ops have a callback pushed by ncclLaunchPrepare. 
// Since non-persistent plans also require reclaimation, we have to do it // here. if (!plan->persistent && !plan->hasProxyOps) { ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer); } } return ncclSuccess; } ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { ncclResult_t result = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; if (!ncclIntruQueueEmpty(&planner->planQueue)) { // Reset queue to empty without destroying plans since those will be sent // back to us for reclaiming via callbackQueue. ncclIntruQueueConstruct(&planner->planQueue); cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch // Create dependency for deviceStream on launchStream. We know that deviceStream // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare), // so we can say that launchStream subsumes it. NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1); resume1: // Create dependency for other user streams (skip launch stream) on deviceStream. // Again, the user streams haven't been touched since deviceStream waited on them // so we can say they are subsumed by deviceStream. struct ncclCudaStreamList* sl = planner->streams->next; planner->streams = nullptr; // Reset comm->planner.streams to empty. while (sl != nullptr) { NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2); resume2: sl = sl->next; } // Release device stream as acquired in ncclLaunchPrepare() NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream), result, resume3); resume3:; } return result; } /*****************************************************************************/ /* Enqueueing system : computation of kernel and proxy operations parameters */ /*****************************************************************************/ static inline ncclResult_t getCollNetSupport( struct ncclComm* comm, struct ncclTaskColl* info, int* collNetSupport ) { // Translate ncclAvg and PreMulSum ncclRedOp_t netOp = info->opHost; if (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv) { netOp = ncclSum; } *collNetSupport = comm->collNetSupport; switch (info->func) { case ncclFuncAllReduce: case ncclFuncReduce: case ncclFuncReduceScatter: *collNetSupport &= comm->collNetSupportMatrix[netOp][info->datatype]; break; default: break; } return ncclSuccess; } static void initCollCostTable(float** collCostTable) { float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) { for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) { table[a][p] = NCCL_ALGO_PROTO_IGNORE; } } } // numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. Used to adjust latency. 
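// collCostTable is the [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] float table
// set up by initCollCostTable() above: entries left at NCCL_ALGO_PROTO_IGNORE
// are not eligible, all others receive a predicted execution time. Combinations
// that the model only allows as a fallback are tracked separately through
// backupAlgo/backupProto/backupTime, and topoGetAlgoInfo() then picks the
// minimum-time entry (or the backup if nothing else qualified).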
static ncclResult_t updateCollCostTable( struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes, int collNetSupport, int nvlsSupport, int numPipeOps, float** collCostTable, int* backupAlgo, int* backupProto, float* backupTime ) { float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; if (comm->nRanks == 1) { table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; return ncclSuccess; } for (int a=0; afunc != ncclFuncAllGather) continue; if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue; /* now we only support single-node NVLS allgather and reducescatter */ if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue; for (int p=0; pfunc, a, p, nBytes, numPipeOps, &time, &backup)); if (!backup) { table[a][p] = time; } else { if (time >= 0.0 && time < *backupTime) { *backupAlgo = a; *backupProto = p; *backupTime = time; } } } } return ncclSuccess; } static ncclResult_t topoGetAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes, float** collCostTable, int backupAlgo, int backupProto, float backupTime, ncclSimInfo_t* simInfo ) { float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; float minTime = 3600000000.0; int algorithm = info->algorithm = NCCL_ALGO_UNDEF; int protocol = info->protocol = NCCL_PROTO_UNDEF; for (int a=0; a= 0.0 && table[a][p] < minTime) { algorithm = a; protocol = p; minTime = table[a][p]; } } } info->algorithm = algorithm; info->protocol = protocol; float time = minTime; if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) { WARN("Error : no algorithm/protocol available"); return ncclInternalError; } info->algorithm = backupAlgo; info->protocol = backupProto; time = backupTime; } if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time); if (simInfo) simInfo->estimatedTime = time; TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time); int nc = comm->nChannels; int nt = comm->maxThreads[info->algorithm][info->protocol]; int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol]; if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // CollNet channel tuning int ncSwitch = 16; bool flag = true; while (ncSwitch >= 1 && flag) { while ((flag = nBytes < nc*nt*comm->channels[0].collnetDirect.nHeads*threadThreshold) && nc > ncSwitch) { if (nc == ncSwitch+ncSwitch/2) threadThreshold /= 2; nc--; } ncSwitch /= 2; } } else if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { // NVLS should not need more than 16 channels to get peak BW. nc = comm->nvlsChannels; } else { // Ring/Tree channel tuning while (nBytes < nc * nt * threadThreshold) { if (nc >= 2) nc--; else break; } } if (info->algorithm != NCCL_ALGO_NVLS && info->algorithm != NCCL_ALGO_NVLS_TREE && info->algorithm != NCCL_ALGO_COLLNET_DIRECT) { while (nBytes < nc * nt * threadThreshold) { if (nt % 128 == 0) nt /= 2; else break; } } if (info->protocol == NCCL_PROTO_SIMPLE) { if (info->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync // More threads or sync warps needed due to split thread model if (info->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE; } nt = nt/WARP_SIZE < 3 ? 
3*WARP_SIZE : nt; if (info->algorithm == NCCL_ALGO_TREE) nt = NCCL_MAX_NTHREADS; // Tree now uses all threads always. info->nMaxChannels = nc; info->nWarps = nt/WARP_SIZE; return ncclSuccess; } // Use the default topo-based tuner if tuner plugin is not successful. // Call the plugin first. Let it set algo+proto, and/or nChannels. // Then, topoGetAlgoInfo will set algo/proto if not set, then nChannels and nThreads based on algo/proto. // Finally, nChannels will be overriden by the plugin setting. static ncclResult_t getAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* info, int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo/* = NULL*/ ) { size_t nBytes = ncclTypeSize(info->datatype)*ncclFuncMaxSendRecvCount(info->func, comm->nRanks, info->count); info->algorithm = NCCL_ALGO_UNDEF; info->protocol = NCCL_PROTO_UNDEF; int nMaxChannels = 0; int backupAlgo = NCCL_ALGO_UNDEF; int backupProto = NCCL_PROTO_UNDEF; float backupTime = 3600000000.0; float collCostTable[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; initCollCostTable((float **)collCostTable); NCCLCHECK(updateCollCostTable(comm, info, nBytes, collNetSupport, nvlsSupport, numPipeOps, (float **)collCostTable, &backupAlgo, &backupProto, &backupTime)); if (comm->tuner != NULL) { NCCLCHECK(comm->tuner->getCollInfo( comm->tunerContext, info->func, nBytes, numPipeOps, (float **)collCostTable, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, &nMaxChannels)); } NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, backupAlgo, backupProto, backupTime, simInfo)); info->nMaxChannels = nMaxChannels == 0 ? info->nMaxChannels : nMaxChannels; return ncclSuccess; } NCCL_PARAM(NvlsTreeMaxChunkSize, "NVLSTREE_MAX_CHUNKSIZE", -2); static ncclResult_t calcCollChunking( struct ncclComm* comm, struct ncclTaskColl* info, int nChannels, size_t nBytes, /*outputs*/uint32_t* outChunkSize, uint32_t* outDirectFlags, struct ncclProxyOp* proxyOp ) { ncclPattern_t pattern; size_t grainSize = ncclProtoGrainSize(info->protocol); switch (info->func) { case ncclFuncBroadcast: pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break; case ncclFuncReduce: pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break; case ncclFuncReduceScatter: case ncclFuncAllGather: pattern = info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : ncclPatternRing; break; case ncclFuncAllReduce: pattern = info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : info->algorithm == NCCL_ALGO_NVLS_TREE ? ncclPatternNvlsTree : info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain : info->algorithm == NCCL_ALGO_TREE ? 
ncclPatternTreeUpDown : ncclPatternRingTwice; break; default: WARN("Unknown pattern for collective %d algorithm %d", info->func, info->algorithm); return ncclInternalError; } int nstepsPerLoop, nchunksPerLoop; switch (pattern) { case ncclPatternTreeUp: case ncclPatternTreeDown: case ncclPatternTreeUpDown: case ncclPatternPipelineFrom: case ncclPatternPipelineTo: case ncclPatternCollnetChain: nstepsPerLoop = nchunksPerLoop = 1; break; case ncclPatternNvls: nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; break; case ncclPatternCollnetDirect: nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].collnetDirect.nHeads; break; case ncclPatternRing: nstepsPerLoop = comm->nRanks-1; nchunksPerLoop = comm->nRanks; break; case ncclPatternRingTwice: nstepsPerLoop = 2*(comm->nRanks-1); nchunksPerLoop = comm->nRanks; break; case ncclPatternNvlsTree: nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; break; default: WARN("Unknown pattern %d", pattern); return ncclInternalError; } int stepSize = comm->buffSizes[info->protocol]/NCCL_STEPS; int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1; int chunkSize = stepSize*chunkSteps; if (info->protocol == NCCL_PROTO_LL) chunkSize /= 2; if (info->protocol == NCCL_PROTO_LL128) chunkSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS; if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // Optimize chunkSize / nSteps while (nBytes / (nChannels * comm->channels[0].collnetDirect.nHeads * chunkSize) < comm->channels[0].collnetDirect.depth * 64 && chunkSize > 131072) chunkSize /= 2; while (nBytes / (nChannels * comm->channels[0].collnetDirect.nHeads * chunkSize) < comm->channels[0].collnetDirect.depth * 8 && chunkSize > 65536) chunkSize /= 2; while (nBytes / (nChannels * comm->channels[0].collnetDirect.nHeads * chunkSize) < comm->channels[0].collnetDirect.depth * 8 && chunkSize > 32768) chunkSize /= 2; } else if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) { stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS; chunkSize = std::min(256 * 1024, stepSize * chunkSteps); while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth * 64 && chunkSize > 131072) chunkSize /= 2; while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth * 8 && chunkSize > 65536) chunkSize /= 2; while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2; } else if (info->algorithm == NCCL_ALGO_NVLS) { int maxChunkSize = comm->nvlsChunkSize; if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768; if (chunkSize > maxChunkSize) chunkSize = maxChunkSize; // Use uint64_t so that concurrentOps*chunkSize*X does not overflow uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads; if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384; } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) { // Use uint64_t so that concurrentOps*chunkSize*X does not overflow uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads; chunkSize = comm->nvlsChunkSize; int maxChunkSize = 
(int)ncclParamNvlsTreeMaxChunkSize(); if (maxChunkSize == -2) maxChunkSize = comm->nNodes >= 4 ? 65536 : chunkSize; chunkSize = std::min(chunkSize, maxChunkSize); if ((nBytes < (32 * (concurrentOps * chunkSize))) && (chunkSize > 262144)) chunkSize = 262144; if ((nBytes < (16 * (concurrentOps * chunkSize))) && (chunkSize > 131072)) chunkSize = 131072; if ((nBytes < (4 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; if ((nBytes < (1 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) { int nNodes = comm->nNodes; float ppn = comm->nRanks / (float)nNodes; float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn; while (nBytes / (nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2; while (nBytes / (nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2; } // Compute directFlags of work struct. if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // Set direct direction for broadcast-gather (read or write) *outDirectFlags = (nBytes/nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ; } else { *outDirectFlags = 0; } // Compute nSteps for proxies //if (comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->func, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol); chunkSize = chunkSize / grainSize * grainSize; // align chunkSize to multiple grainSize int nLoops = (int)DIVUP(nBytes, size_t(nChannels)*nchunksPerLoop*chunkSize); memset(proxyOp, 0, sizeof(*proxyOp)); proxyOp->nsteps = nstepsPerLoop * nLoops * chunkSteps; proxyOp->sliceSteps = sliceSteps; proxyOp->chunkSteps = chunkSteps; proxyOp->chunkSize = chunkSize; proxyOp->protocol = info->protocol; proxyOp->dtype = info->datatype; if (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv) { proxyOp->redOp = ncclSum; // Network sees avg as sum } else { proxyOp->redOp = info->opHost; } proxyOp->pattern = pattern; proxyOp->coll = info->func; proxyOp->root = info->root; // This is used by P2P to reduce the receive buffer size. 
We don't use it in collectives // because some protocols need to transmit more than the total size, plus they sometimes // round up proxyOp->nbytes = stepSize*sliceSteps; if (info->regBufType == NCCL_COLLNET_REG_BUFFER) { proxyOp->reg = 1; proxyOp->nsteps = DIVUP(nBytes, NCCL_MAX_COLLNET_SIZE); proxyOp->sendMhandle = info->sendMhandle; proxyOp->recvMhandle = info->recvMhandle; proxyOp->sendbuff = (uint8_t*)info->sendbuff; proxyOp->recvbuff = (uint8_t*)info->recvbuff; proxyOp->nbytes = nBytes; } else { proxyOp->reg = 0; } if (pattern == ncclPatternCollnetDirect) { proxyOp->specifics.collnetDirect.nNodes = comm->nNodes; proxyOp->specifics.collnetDirect.node = comm->node; if (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) { proxyOp->specifics.collnetDirect.sizePerRank = info->count*ncclTypeSize(info->datatype); } } *outChunkSize = chunkSize; return ncclSuccess; } static ncclResult_t hostToDevRedOp( ncclDevRedOpFull *opFull, ncclRedOp_t op, ncclDataType_t datatype, ncclComm *comm ) { union { int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; half f16; float f32; double f64; #if defined(__CUDA_BF16_TYPES_EXIST__) __nv_bfloat16 bf16; #endif void *ptr; }; u64 = 0; opFull->scalarArgIsPtr = false; opFull->proxyOp = op; int nbits = 8*ncclTypeSize(datatype); uint64_t allBits = uint64_t(-1)>>(64-nbits); uint64_t signBit = allBits^(allBits>>1); switch (int(op)) { case ncclSum: opFull->op = ncclDevSum; break; case ncclProd: opFull->op = ncclDevProd; break; case ncclMin: case ncclMax: opFull->op = ncclDevMinMax; opFull->scalarArg = 0; // The xormask used by ncclFuncMinMax<[u]int> is the XOR of the sign bit // for signed (opposed to unsigned) types and all the bits for max (opposed to min). if (datatype==ncclInt8 || datatype==ncclInt32 || datatype==ncclInt64) { opFull->scalarArg ^= signBit; } opFull->scalarArg ^= (op == ncclMax) ? allBits : 0; break; case ncclAvg: switch ((int)datatype) { case ncclInt8: case ncclInt32: case ncclInt64: case ncclUint8: case ncclUint32: case ncclUint64: opFull->op = ncclDevSumPostDiv; u64 = comm->nRanks; break; case ncclFloat16: opFull->op = ncclDevPreMulSum; f16 = __float2half(float(1.0/comm->nRanks)); // __double2half not supported pre CUDA 11.x break; #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: opFull->op = ncclDevPreMulSum; bf16 = __float2bfloat16(float(1.0/comm->nRanks)); break; #endif case ncclFloat32: opFull->op = ncclDevPreMulSum; f32 = float(1.0/comm->nRanks); break; case ncclFloat64: opFull->op = ncclDevPreMulSum; f64 = 1.0/comm->nRanks; break; } opFull->scalarArgIsPtr = false; opFull->scalarArg = u64; break; default: // user created int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps); ncclUserRedOp *user = &comm->userRedOps[ix]; if (datatype != user->datatype) { WARN("Data type supplied to user-created ncclRedOp_t does not match type " "given to reduction operation"); return ncclInvalidArgument; } *opFull = user->opFull; break; } return ncclSuccess; } // Converts `info` to a task and adds it to `comm->planner`. The exception is with // single rank communicators, collectives are issued as `ncclMemcpyAsync`s and // thus don't need a task. 
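// Illustrative sketch (not NCCL code): hostToDevRedOp() above lowers ncclAvg into one of two
// device operations. Floating-point types become ncclDevPreMulSum with a scalar of 1/nRanks
// (each contribution is pre-scaled before the sum), while integer types become
// ncclDevSumPostDiv (sum exactly first, divide by nRanks once at the end, avoiding per-addend
// truncation). The stand-alone sketch below mirrors that decision; the enum and struct are
// local placeholders, not NCCL's types.
#if 0
enum class SketchType { Int32, Float32 };
struct SketchAvgOp { bool preMulSum; double scalar; int divisor; };

static SketchAvgOp lowerAvgSketch(SketchType t, int nRanks) {
  SketchAvgOp op{};
  if (t == SketchType::Float32) {
    op.preMulSum = true;            // multiply every input by 1/nRanks, then sum
    op.scalar = 1.0 / nRanks;
  } else {
    op.preMulSum = false;           // sum as integers, then divide once at the end
    op.divisor = nRanks;
  }
  return op;
}
#endif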
static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { struct ncclKernelPlanner *planner = &comm->planner; if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { int peer = info->root; ssize_t nBytes = info->count*ncclTypeSize(info->datatype); bool isSendNotRecv = info->coll == ncclFuncSend; // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. ncclGroupCommJoin(info->comm); struct ncclTaskP2p* p2p = ncclMemoryStackAlloc(&comm->memScoped); p2p->buff = (void*)info->recvbuff; p2p->bytes = nBytes; ncclIntruQueueEnqueue( isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, p2p); planner->nTasksP2p += 1; // Mark channels that need pre-connect if (comm->rank != peer) { if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) { (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true; int round = 0; while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank : comm->p2pSchedule[round].recvRank)) { round += 1; } uint8_t base = ncclP2pChannelBaseForRound(comm, round); for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c); if (isSendNotRecv) { if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector comm->connectSend[peer] |= (1UL<channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector comm->connectRecv[peer] |= (1UL<count == 0) return ncclSuccess; // Copy reduction op state from op handle into info struct here since the // op handle may be destroyed before ncclGroupEnd(). struct ncclDevRedOpFull opDev; NCCLCHECK(hostToDevRedOp(&opDev, info->op, info->datatype, comm)); if (comm->nRanks == 1) { NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, opDev, info->datatype, info->stream)); return ncclSuccess; } else { // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. ncclGroupCommJoin(info->comm); struct ncclTaskColl* t = ncclMemoryStackAlloc(&comm->memScoped); t->func = info->coll; t->sendbuff = info->sendbuff; t->recvbuff = info->recvbuff; t->count = info->count; t->root = info->root; t->datatype = info->datatype; size_t elementSize = ncclTypeSize(t->datatype); if (t->func == ncclFuncAllGather || t->func == ncclFuncBroadcast) { t->count *= elementSize; t->datatype = ncclInt8; elementSize = 1; } t->trafficBytes = t->count*elementSize*ncclFuncTrafficPerByte(t->func, comm->nRanks); t->opHost = info->op; t->opDev = opDev; // C++ struct assignment t->chunkSteps = info->chunkSteps; t->sliceSteps = info->sliceSteps; planner->nTasksColl += 1; ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes); } } if (info->stream != planner->streamRecent || planner->streams == nullptr) { planner->streamRecent = info->stream; struct ncclCudaStreamList* l = planner->streams; while (true) { if (l == nullptr) { // Got to the end, this must be a new stream. 
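// Illustrative sketch (not NCCL code): the send/recv branch earlier in taskAppend() records
// which channels still need a transport connection for a peer by OR-ing one bit per channel
// into a 64-bit mask (comm->connectSend / comm->connectRecv). A minimal version of that
// bookkeeping looks like the fragment below; the mask array and channel-id list are
// placeholders for the real per-peer structures.
#if 0
#include <cstdint>

static void markChannelsToConnectSketch(uint64_t* connectMask /* one entry per peer */,
                                        int peer, const int* channelIds, int nChannels) {
  for (int c = 0; c < nChannels; c++) {
    // Bit i of connectMask[peer] means "channel i must be connected before launch".
    connectMask[peer] |= (1ULL << channelIds[c]);
  }
}
#endif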
struct ncclCudaGraph graph; NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream)) if (planner->streams != nullptr && !ncclCudaGraphSame(planner->capturingGraph, graph)) { WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph."); return ncclInvalidUsage; } planner->capturingGraph = graph; // C++ struct assignment // Add stream to list l = ncclMemoryStackAlloc(&comm->memScoped); l->stream = info->stream; l->next = planner->streams; planner->streams = l; break; } if (l->stream == info->stream) break; // Already seen stream. l = l->next; } } return ncclSuccess; } ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { NCCLCHECK(ncclGroupStartInternal()); ncclResult_t ret = ncclSuccess; int devOld = -1; NCCLCHECKGOTO(CommCheck(info->comm, info->opName, "comm"), ret, fail); // Check whether communicator is ready to communicate NCCLCHECKGOTO(ncclCommEnsureReady(info->comm), ret, fail); if (info->comm->checkPointers) { CUDACHECKGOTO(cudaGetDevice(&devOld), ret, fail); CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, fail); } NCCLCHECKGOTO(ArgsCheck(info), ret, fail); INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zu datatype %d op %d root %d comm %p [nranks=%d] stream %p", info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zu,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast(info->sendbuff), reinterpret_cast(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream); NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail); exit: if (devOld != -1) CUDACHECK(cudaSetDevice(devOld)); ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); /* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change * so we have to check state here. */ if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) }; return ret; fail: if (info->comm && !info->comm->config.blocking) (void) ncclCommSetAsyncError(info->comm, ret); goto exit; } NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) { NCCLCHECK(CommCheck(comm, "ncclRedOpCreatePreMulSum", "comm")); /* join init thread before creating PreMulSum op. 
*/
  NCCLCHECK(ncclCommEnsureReady(comm));

  if (comm->userRedOpFreeHead == comm->userRedOpCapacity) {
    // double capacity and resize
    int cap = 2*comm->userRedOpCapacity;
    if (cap < 4) cap = 4;
    ncclUserRedOp *ops = new ncclUserRedOp[cap];
    std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp));
    for(int ix=comm->userRedOpCapacity; ix < cap; ix++)
      ops[ix].freeNext = ix + 1;
    delete[] comm->userRedOps;
    comm->userRedOps = ops;
    comm->userRedOpCapacity = cap;
  }
  // pop from free list
  int ix = comm->userRedOpFreeHead;
  ncclUserRedOp *user = &comm->userRedOps[ix];
  comm->userRedOpFreeHead = user->freeNext;
  user->freeNext = -1; // allocated
  user->datatype = datatype;
  user->opFull.op = ncclDevPreMulSum;
  if (residence == ncclScalarHostImmediate) {
    user->opFull.scalarArgIsPtr = false;
    std::memcpy(&user->opFull.scalarArg, scalar, ncclTypeSize(datatype));
  } else {
    user->opFull.scalarArgIsPtr = true;
    user->opFull.scalarArg = reinterpret_cast<uint64_t>(scalar);
  }
  *op = ncclRedOp_t(int(ncclNumOps) + ix);
  *op = ncclUserRedOpMangle(comm, *op);
  TRACE_CALL("ncclRedOpCreatePreMulSum(%d,%p,%d,%d,%p)", *op, scalar, datatype, residence, comm);
  return ncclSuccess;
}

NCCL_API(ncclResult_t, ncclRedOpDestroy, ncclRedOp_t op, ncclComm_t comm);
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) {
  if (0 <= int(op) && int(op) < int(ncclNumOps)) {
    WARN("ncclRedOpDestroy : operator is a NCCL builtin.");
    return ncclInvalidArgument;
  }
  if (int(op) < 0 || int(ncclMaxRedOp) < int(op)) {
    WARN("ncclRedOpDestroy : operator is garbage.");
    return ncclInvalidArgument;
  }
  if (comm == NULL) {
    WARN("ncclRedOpDestroy : invalid communicator passed.");
    return ncclInvalidArgument;
  }
  int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps);
  if (comm->userRedOpCapacity <= ix || comm->userRedOps[ix].freeNext != -1) {
    WARN("ncclRedOpDestroy : operator unknown to this communicator.");
    return ncclInvalidArgument;
  }
  // push to free list
  comm->userRedOps[ix].freeNext = comm->userRedOpFreeHead;
  comm->userRedOpFreeHead = ix;
  TRACE_CALL("ncclRedOpDestroy(%d,%p)", op, comm);
  return ncclSuccess;
}
nccl-2.22.3-1/src/graph/connect.cc
/*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "comm.h"
#include "device.h"
#include "graph.h"
#include "transport.h"
#include "trees.h"
#include "rings.h"
#include "topo.h"

/******************************************************************/
/********************* Internode connection ***********************/
/******************************************************************/

ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
  int rank = comm->rank;
  int localRanks = comm->topo->nodes[GPU].count;
  int nvlsRanks = comm->MNNVL ?
comm->clique.size : localRanks; int nChannels = comm->nChannels; topoRanks->nvlsHeadNum = 0; for (int c=0; cchannels+c; channel->ring.prev = channel->ring.next = -1; channel->tree.up = -1; channel->collnetChain.up = -1; for (int i=0; itree.down[i] = -1; for (int i=0; icollnetChain.down[i] = -1; channel->collnetDirect.out = -1; channel->collnetDirect.headRank = -1; channel->collnetDirect.nHeads = 0; channel->collnetDirect.shift = 0; for (int i=0; icollnetDirect.heads[i] = -1; for (int i=0; icollnetDirect.up[i] = -1; for (int i=0; icollnetDirect.down[i] = -1; int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks; int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks; int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks; for (int i=0; iringRecv[c] = ringIntra[0]; topoRanks->ringSend[c] = ringIntra[localRanks-1]; topoRanks->ringPrev[c] = (i == 0) ? -1 : ringIntra[i-1]; topoRanks->ringNext[c] = (i == localRanks-1) ? -1 : ringIntra[i+1]; } if (treeIntra[i] == rank) { int parentIndex = 0; int child0Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; int child1Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0; topoRanks->treeToParent[c] = treeIntra[parentIndex]; topoRanks->treeToChild0[c] = treeIntra[child0Index]; topoRanks->treeToChild1[c] = treeIntra[child1Index]; channel->tree.up = i == 0 ? -1 : treeIntra[i-1]; channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1]; } if (collNetIntra[i] == rank) { channel->collnetChain.up = i == 0 ? comm->nRanks : collNetIntra[i-1]; channel->collnetChain.down[0] = i == localRanks-1 ? -1 : collNetIntra[i+1]; } } } // Duplicate channels trees struct ncclChannel* channel0 = comm->channels; struct ncclChannel* channel1 = channel0+nChannels; memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel)); // Get nvls heads and the number of heads. Duplicate head is not allowed. for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) { bool addHead = true; int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * nvlsRanks; for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) { if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) { addHead = false; break; } } if (addHead) { topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0]; } } memcpy(comm->nvlsHeads, topoRanks->nvlsHeads, sizeof(int) * topoRanks->nvlsHeadNum); return ncclSuccess; } static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) { int nChannels = comm->nChannels; int nNodes = comm->nNodes; for (int c=0; cnNodes; int* send = ringSend+c*comm->nNodes; int* prev = ringPrev+c*comm->nRanks; int* next = ringNext+c*comm->nRanks; for (int n=0; nup = indexes[u]; return ncclSuccess; } static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) { if (d == -1) return ncclSuccess; int x = 0; while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++; if (x == NCCL_MAX_TREE_ARITY) { WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]); return ncclInternalError; } tree->down[x] = indexes[d]; return ncclSuccess; } static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) { const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node; // Compute tree depth. 
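// Illustrative sketch (not NCCL code): connectRings() above stitches the per-node intra-node
// chains into one global ring per channel: the first rank of node n (its "recv" end) gets its
// prev set to the "send" end of node n-1, and the last rank of node n gets its next set to the
// "recv" end of node n+1, modulo the node count. A self-contained version of that stitching,
// using plain arrays instead of NCCL's channel structures:
#if 0
static void stitchRingSketch(int nNodes, const int* recvRank, const int* sendRank,
                             int* prevOfRecv, int* nextOfSend) {
  for (int n = 0; n < nNodes; n++) {
    int prevNode = (n - 1 + nNodes) % nNodes;
    int nextNode = (n + 1) % nNodes;
    prevOfRecv[n] = sendRank[prevNode];  // the ring enters node n from the previous node's tail
    nextOfSend[n] = recvRank[nextNode];  // the ring leaves node n toward the next node's head
  }
}
#endif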
Not an exact value but a good approximation in most // cases int depth = comm->nRanks/nNodes - 1 + log2i(nNodes); int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType; int* ttp, *ttc0, *ttc1; NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType)); for (int c=0; cchannels+c; struct ncclChannel* channel1 = channel0+nChannels; ttp = treeToParent+c*comm->nNodes; ttc0 = treeToChild0+c*comm->nNodes; ttc1 = treeToChild1+c*comm->nNodes; if (comm->rank == ttp[node]) { NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u)); NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u)); } if (comm->rank == ttc0[node]) { NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0)); NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0)); } if (comm->rank == ttc1[node]) { NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1)); NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1)); } if (comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) { INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]); INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]); } channel0->tree.depth = channel1->tree.depth = depth; } return ncclSuccess; } static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) { int rank = comm->rank; int localRanks = comm->localRanks; int nHeads = 0; int *heads; NCCLCHECK(ncclCalloc(&heads, localRanks)); // Find all head ranks // Head index is always 0 for (int c=0; cnChannels; c++) { int* collNetIntra = collNetGraph->intra+c*localRanks; int head = collNetIntra[0]; for (int h=0; hnChannels; c++) { struct ncclChannel* channel = comm->channels+c; char line[1024]; sprintf(line, "CollNetDirect channel %d rank %d ", c, rank); int nDown = 0; for (int i=0; icollnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks int* collNetIntra = collNetGraph->intra+i*localRanks; sprintf(line+strlen(line), "down "); for (int r=0; rcollnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers sprintf(line+strlen(line), " %d ", collNetIntra[r]); } sprintf(line+strlen(line), "nDown %d ", nDown); break; } } // Connect to all heads int nUp = 0; sprintf(line+strlen(line), "up "); for (int h=0; hcollnetDirect.up[nUp++] = heads[h]; sprintf(line+strlen(line), " %d ", heads[h]); } sprintf(line+strlen(line), "heads "); { // heads[] is the list of heads ordered in head order startubg with self int h0 = (channel->collnetDirect.headRank == -1) ? 0 : channel->collnetDirect.headRank; for (int h1=0; h1 < nHeads; h1++) { int h = (h0+h1)%nHeads; channel->collnetDirect.heads[h1] = heads[h]; sprintf(line+strlen(line), " %d ", heads[h]); } } channel->collnetDirect.nHeads = nHeads; channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 
1 : 2; sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads); sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift); INFO(NCCL_GRAPH, "%s", line); channel->collnetChain.depth = comm->nRanks/comm->nNodes; } free(heads); return ncclSuccess; } static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHeads) { int headRank = -1; if (nHeads == 0) { comm->nvlsChannels = 0; return ncclSuccess; } for (int h = 0; h < nHeads; h++) { if (nvlsHeads[h * comm->nNodes + comm->node] == comm->rank) headRank = h; } for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; channel->nvls.nHeads = nHeads; for (int h=0; hnvls.up[h] = comm->nRanks+1+h; for (int h=nHeads; hnvls.up[h] = -1; channel->nvls.down = comm->nRanks+1+headRank; channel->nvls.out = -1; // NVLS+SHARP not yet implemented. channel->nvls.headRank = headRank; channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1; channel->nvls.node = comm->node; channel->nvls.nNodes = comm->nNodes; if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks; } if (comm->nNodes == 1) return ncclSuccess; // Connect Trees int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1; int pc0, pc1; // ignored NCCLCHECK(ncclGetDtree(comm->nNodes, comm->node, &tree0Parent, &tree0Child0, &tree0Child1, &pc0, &tree1Parent, &tree1Child0, &tree1Child1, &pc1)); int* heads = NULL; int treeUp[2] = { -1, -1 }; int treeDown0[2] = { -1, -1 }; int treeDown1[2] = { -1, -1 }; if (comm->node == 0) { for (int h=0; hnNodes; for (int n=0; nnNodes && n<20; n++) { sprintf(line+strlen(line), " %2d", heads[n]); } INFO(NCCL_INIT, "%s", line); } } // Find the heads where I'm the head rank and retain tree up/down for (int h=0; hnNodes; if (heads[comm->node] == comm->rank) { treeUp[0] = tree0Parent == -1 ? -1: heads[tree0Parent]; treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0]; treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1]; treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent]; treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0]; treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1]; break; } } // Set prev/next in all channels (NVLS compute channels work // orthogonally to NVLS search channels). 
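// Illustrative sketch (not NCCL code): the loop below alternates between the two inter-node
// NVLS trees computed by ncclGetDtree(): even channels use tree 0, odd channels use tree 1,
// and children equal to -1 (absent) are skipped. A reduced version of that assignment, with a
// placeholder channel struct:
#if 0
struct SketchNvlsChannel { int treeUp; int treeDown[3]; };

static void assignNvlsTreesSketch(SketchNvlsChannel* channels, int nChannels,
                                  const int treeUp[2], const int child0[2], const int child1[2],
                                  int localDown) {
  for (int c = 0; c < nChannels; c++) {
    int t = c % 2;                              // alternate between the two trees
    channels[c].treeUp = treeUp[t];
    channels[c].treeDown[0] = localDown;        // local NVLS "down" link always comes first
    channels[c].treeDown[1] = channels[c].treeDown[2] = -1;
    int ix = 1;
    if (child0[t] != -1) channels[c].treeDown[ix++] = child0[t];
    if (child1[t] != -1) channels[c].treeDown[ix] = child1[t];
  }
}
#endif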
for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; channel->nvls.treeUp = treeUp[c%2]; channel->nvls.treeDown[0] = channel->nvls.down; int ix = 1; if (treeDown0[c%2] != -1) channel->nvls.treeDown[ix++] = treeDown0[c%2]; if (treeDown1[c%2] != -1) channel->nvls.treeDown[ix] = treeDown1[c%2]; } struct ncclNvls* nvls0 = &comm->channels[0].nvls; struct ncclNvls* nvls1 = &comm->channels[1].nvls; INFO(NCCL_GRAPH, "NVLS Trees : %d/%d/%d->%d->%d %d/%d/%d->%d->%d", nvls0->treeDown[0], nvls0->treeDown[1], nvls0->treeDown[2], comm->rank, nvls0->treeUp, nvls1->treeDown[0], nvls1->treeDown[1], nvls1->treeDown[2], comm->rank, nvls1->treeUp); return ncclSuccess; } // Legacy naming NCCL_PARAM(MinNrings, "MIN_NRINGS", -2); NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2); // New naming NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2); NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2); int ncclMinNchannels() { int minNchannels = 0; if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings(); if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels(); if (minNchannels > MAXCHANNELS) { WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS); minNchannels = MAXCHANNELS; } if (minNchannels < 0) minNchannels = 0; return minNchannels; } extern int64_t ncclParamWorkArgsBytes(); int ncclMaxNchannels() { int maxNchannels = MAXCHANNELS; if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings(); if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels(); maxNchannels = std::min(maxNchannels, ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes())); if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS; if (maxNchannels < 1) { WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels); maxNchannels = 1; } return maxNchannels; } static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) { int nranks = comm->nRanks; int c; for (c=start; cchannels+c, comm->channels+c-start, sizeof(struct ncclChannel)); } return c; } void exchangeValues(int* v0, int* v1) { int tmp = *v1; *v1 = *v0; *v0 = tmp; } NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1); ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) { // Gather data from all ranks int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads; int nranks = comm->nRanks; int nNodes = comm->nNodes; int nChannels = comm->nChannels; int minHeadNum = INT_MAX; int shared = parent && parent->nvlsSupport && parent->config.splitShare; NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS)); // Alternate rings to avoid crossing rails if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) { for (int r=0; rnRanks; r++) { if (comm->rankToNode[r] % 2 == 1) { // Exchange rings for (int c=0; cringRecv+c, allTopoRanks[r]->ringRecv+(c^1)); exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1)); 
exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1)); exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1)); } } } } for (int c=0; cringRecv[c]; ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c]; treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c]; treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c]; treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c]; } for (int r=0; rringPrev[c]; ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c]; } } for (int n = 0; n < nNodes; n++) { int r = firstRanks[n]; if (minHeadNum > allTopoRanks[r]->nvlsHeadNum) minHeadNum = allTopoRanks[r]->nvlsHeadNum; } for (int c = 0; c < minHeadNum; c++) { for (int n = 0; n < nNodes; n++) { int r = firstRanks[n]; nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c]; } } // Connect rings and trees. This should also duplicate the channels. NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext)); NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns)); // Duplicate ringPrev/ringNext for ncclBuildRing memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int)); // Set ring prev/next for my rank for (int c=0; cchannels+c; struct ncclChannel* channel1 = channel0+nChannels; channel0->ring.prev = channel1->ring.prev = ringPrev[c*nranks+comm->rank]; channel0->ring.next = channel1->ring.next = ringNext[c*nranks+comm->rank]; } // Duplication should be complete now nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2); // Setup CollNet if (comm->collNetSupport == 1) { struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN]; // Add more channels to saturate intra-node bandwidth, except the 1 PPN case if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) { int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2); nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext); } NCCLCHECK(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT])); } // Use 4 compute channels per search channel to reach peak BW on <8 PPN if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) { nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); } // Double the number of channels when using unpack networking (greater than 1 node) // We won't automatically double past 16 channels, users can specify 32 if they want if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) { nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); } // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS. // We permit combining max, then min, to only use the first channels, then duplicate them. if (comm->sharedRes->owner != comm) { /* child comm #channels cannot exceed top parent #channels. 
*/ nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels); nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels), ringPrev, ringNext); } else { nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs); nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->config.minCTAs), ringPrev, ringNext); } comm->collChannels = comm->nChannels; #if CUDART_VERSION >= 12010 // Support maximal channel usage for aggregation if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) { comm->nvlsChannels = parent->nvlsResources->nChannels; } if (comm->nChannels < comm->nvlsChannels) { nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext); } NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum)); #endif if (shared && comm->nChannels > parent->sharedRes->tpNChannels) { nChannels = comm->nChannels = parent->sharedRes->tpNChannels; comm->collChannels = std::min(comm->collChannels, comm->nChannels); } // Create rings array and check all is fine NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext)); free(ringRecv); free(ringSend); free(ringPrev); free(ringNext); free(treeToParent); free(treeToChild0); free(treeToChild1); free(nvlsHeads); return ncclSuccess; } nccl-2.22.3-1/src/graph/paths.cc000066400000000000000000000750211463451655400162150ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "core.h" #include "graph.h" #include "topo.h" #include "comm.h" #include "net.h" #include "channel.h" #include "transport.h" #include "device.h" // Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths struct ncclTopoNodeList { struct ncclTopoNode* list[NCCL_TOPO_MAX_NODES]; int count; }; static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* node, int t, int64_t id, struct ncclTopoLinkList** path) { for (int i=0; inodes[t].count; i++) { if (system->nodes[t].nodes[i].id == id) { *path = node->paths[t]+i; return ncclSuccess; } } WARN("Could not find node of type %d id %lx", t, id); return ncclInternalError; } NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0); static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) { if (baseNode->paths[baseNode->type] == NULL) { NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count)); } // breadth-first search to set all paths to that node in the system struct ncclTopoNodeList nodeList; struct ncclTopoNodeList nextNodeList; nodeList.count = 1; nodeList.list[0] = baseNode; nextNodeList.count = 0; struct ncclTopoLinkList* basePath; NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath)); basePath->count = 0; basePath->bw = LOC_BW; basePath->type = PATH_LOC; while (nodeList.count) { nextNodeList.count = 0; for (int n=0; ntype, baseNode->id, &path)); for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node->links+l; struct ncclTopoNode* remNode = link->remNode; if (remNode->paths[baseNode->type] == NULL) { NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, 
system->nodes[baseNode->type].count)); for (int i=0; inodes[baseNode->type].count; i++) remNode->paths[baseNode->type][i].type = PATH_DIS; } struct ncclTopoLinkList* remPath; NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath)); float bw = std::min(path->bw, link->bw); // allow routing through a GPU only as 1 hop if (node != baseNode && node->type == GPU && (ncclParamNvbDisable() || link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue; if ((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) { // Find reverse link for (int l=0; lnlinks; l++) { if (remNode->links[l].remNode == node && remNode->links[l].type == link->type) { remPath->list[0] = remNode->links+l; break; } } if (remPath->list[0] == NULL) { WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx", remNode->type, remNode->id, remNode->nlinks, node->type, node->id); return ncclInternalError; } // Copy the rest of the path for (int i=0; icount; i++) remPath->list[i+1] = path->list[i]; remPath->count = path->count + 1; remPath->bw = bw; // Start with path type = link type. PATH and LINK types are supposed to match. // Don't consider LINK_NET as we only care about the NIC->GPU path. int type = link->type == LINK_NET ? LINK_LOC : link->type; // Differentiate between one and multiple PCI switches if (node->type == PCI && remNode->type == PCI) type = PATH_PXB; // Consider a path going through the CPU as PATH_PHB if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB; // Set 1 hop NVLink as NVB if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB; remPath->type = std::max(path->type, type); // Add to the list for the next iteration if not already in the list int i; for (i=0; itype], node->id); #else snprintf(line, linesize, "%s/%lX :", topoNodeTypeStr[node->type], node->id); int offset = strlen(line); #endif for (int t=0; tpaths[t] == NULL) continue; for (int n = 0; nnodes[t].count; n++) { #ifdef ENABLE_TRACE line[0] = 0; int offset = 0; for (int i=0; ipaths[t][n].count; i++) { struct ncclTopoLink* link = node->paths[t][n].list[i]; struct ncclTopoNode* remNode = link->remNode; snprintf(line+offset, linesize-offset, "--%s(%g)->%s/%lx-%lx", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[remNode->type], NCCL_TOPO_ID_SYSTEM_ID(remNode->id), NCCL_TOPO_ID_LOCAL_ID(remNode->id)); offset = strlen(line); } INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].bw); #else snprintf(line+offset, linesize-offset, "%s/%lx-%lx (%d/%.1f/%s) ", topoNodeTypeStr[t], NCCL_TOPO_ID_SYSTEM_ID(system->nodes[t].nodes[n].id), NCCL_TOPO_ID_LOCAL_ID(system->nodes[t].nodes[n].id), node->paths[t][n].count, node->paths[t][n].bw, topoPathTypeStr[node->paths[t][n].type]); offset = strlen(line); #endif } } #ifndef ENABLE_TRACE INFO(NCCL_GRAPH, "%s", line); #endif } ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) { for (int i=0; inodes[GPU].count; i++) { printNodePaths(system, system->nodes[GPU].nodes+i); } for (int i=0; inodes[NET].count; i++) { printNodePaths(system, system->nodes[NET].nodes+i); } return ncclSuccess; } static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) { // Find the closest CPU to a GPU int minHops = 0; int localCpu = -1; struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU]; for (int c=0; cnodes[CPU].count; c++) { int hops = paths[c].count; if (minHops == 0 || hops < minHops) { localCpu = c; 
minHops = hops; } } if (localCpu == -1) { WARN("Error : could not find CPU close to GPU %d", gpu); return ncclInternalError; } *retCpu = localCpu; return ncclSuccess; } static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) { struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix; struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1; int l=0; // Node 1 -> CPU for (int i=0; ipaths[tx][ix].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i]; // CPU -> Node 2 for (int i=0; ipaths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i]; // Update path characteristics srcNode->paths[t2][i2].count = l; srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN; srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw); return ncclSuccess; } // Remove/free paths for a given type static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) { for (int t=0; tnodes[t].count; n++) { struct ncclTopoNode* node = system->nodes[t].nodes+n; free(node->paths[nodeType]); node->paths[nodeType] = NULL; } // Remove links _from_ the given type for (int n=0; nnodes[nodeType].count; n++) { struct ncclTopoNode* node = system->nodes[nodeType].nodes+n; free(node->paths[t]); node->paths[t] = NULL; } } } static const int levelsOldToNew[] = { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS }; ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelEnv) { if (*level == -1) { int l = -1; if (disableEnv) { const char* str = ncclGetEnv(disableEnv); if (str) { int disable = strtol(str, NULL, 0); if (disable == 1) l = 0; } } if (l == -1) { const char* str = ncclGetEnv(levelEnv); if (str) { for (int i=0; i<=PATH_SYS; i++) { if (strcmp(str, topoPathTypeStr[i]) == 0) { l = i; break; } } // Old style numbering // levelsOldToNew to is an array with each index corresponding to the // "old level" int, and each value mapping to the correct value defined in topo.h // maxOldLevel is a quick check to handle out of bounds (based on the length of levelsOldToNew) if (l == -1 && str[0] >= '0' && str[0] <= '9') { int oldLevel = strtol(str, NULL, 0); const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1; if (oldLevel > maxOldLevel) oldLevel = maxOldLevel; l = levelsOldToNew[oldLevel]; } } } if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]); *level = l >= 0 ? l : -2; } return ncclSuccess; } NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0); int ncclTopoUserP2pLevel = -1; ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) { *p2p = 0; if (read) *read = 0; if (intermediateRank) *intermediateRank = -1; // Get GPUs from topology int g1, g2; NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1)); struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1; if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) { // GPU not found, we can't use p2p. return ncclSuccess; } int intermediateIndex = -1; // Set intermediate GPU rank, if routing through an intermediate GPU. 
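// Illustrative sketch (not NCCL code): ncclGetLevel() above accepts either a path-type keyword
// or a legacy numeric level for variables such as NCCL_P2P_LEVEL / NCCL_NET_GDR_LEVEL; legacy
// numbers are clamped and translated through the levelsOldToNew[] table. The fragment below
// reproduces just that translation with local placeholder values (the real PATH_* constants
// and name table live in topo.h and cover more levels).
#if 0
#include <cstdlib>
#include <cstring>

static int parseLevelSketch(const char* str) {
  enum { LOC = 0, PIX = 1, PXB = 2, PHB = 3, SYS = 4 };      // placeholder subset of PATH_*
  static const char* names[] = { "LOC", "PIX", "PXB", "PHB", "SYS" };
  static const int oldToNew[] = { LOC, PIX, PXB, PHB, SYS, SYS };  // legacy 0..5 -> new levels
  for (int i = 0; i <= SYS; i++)
    if (strcmp(str, names[i]) == 0) return i;
  if (str[0] >= '0' && str[0] <= '9') {
    int old = (int)strtol(str, nullptr, 0);
    const int maxOld = (int)(sizeof(oldToNew)/sizeof(int)) - 1;
    if (old > maxOld) old = maxOld;            // clamp out-of-range legacy values
    return oldToNew[old];
  }
  return -1;                                    // unrecognized
}
#endif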
struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2; if (path->count == 2) { struct ncclTopoNode* intermediateNode = path->list[0]->remNode; if (intermediateNode->type == GPU) { intermediateIndex = intermediateNode - system->nodes[GPU].nodes; if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank; } } // In general, use P2P whenever we can. int p2pLevel = PATH_SYS; // User override if (ncclTopoUserP2pLevel == -1) NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL")); if (ncclTopoUserP2pLevel != -2) { p2pLevel = ncclTopoUserP2pLevel; goto compare; } // Don't use P2P through ARM CPUs int arch, vendor, model; NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB; if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { p2pLevel = PATH_PXB; } if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { p2pLevel = PATH_PXB; } compare: // Compute the PCI distance and compare with the p2pLevel. if (path->type <= p2pLevel) *p2p = 1; if (*p2p == 1) { // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to // validate against NVML at all since they are pretending to be on other hw. if (g1 != g2 && ncclParamIgnoreDisabledP2p() != 2) { int indexes[3] = {-1,-1,-1}; int verticeN = 0; NCCLCHECK(ncclNvmlEnsureInitialized()); indexes[verticeN++] = system->nodes[GPU].nodes[g1].gpu.dev; if (intermediateIndex != -1) indexes[verticeN++] = system->nodes[GPU].nodes[intermediateIndex].gpu.dev; indexes[verticeN++] = system->nodes[GPU].nodes[g2].gpu.dev; for (int i=1; i < verticeN; i++) { nvmlGpuP2PStatus_t status; status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusRead; bool good = status == NVML_P2P_STATUS_OK; status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusWrite; good &= status == NVML_P2P_STATUS_OK; if (!good) { if (!ncclParamIgnoreDisabledP2p()) { if (path->type <= PATH_NVB) { WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]); return ncclUnhandledCudaError; } else if (path->type < PATH_SYS) { INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. 
You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]); } } *p2p = 0; } } } } if (path->type == PATH_NVL) { struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2; // Enable P2P Read for Ampere/NVLink only if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1; } return ncclSuccess; } // MNNVL: Check whether peers are in the same fabric cluster and clique ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret) { *ret = 0; nvmlGpuFabricInfoV_t *fabricInfo1 = &info1->fabricInfo; nvmlGpuFabricInfoV_t *fabricInfo2 = &info2->fabricInfo; // A zero UUID means we don't have MNNVL fabric info if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess; if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { INFO(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x", info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId); *ret = 1; } return ncclSuccess; } NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); int ncclTopoUserGdrLevel = -1; ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int64_t netId, int read, int* useGdr) { *useGdr = 0; // Get GPU and NET int n, g; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n)); struct ncclTopoNode* net = system->nodes[NET].nodes+n; NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Check that both the NIC and GPUs support it if (net->net.gdrSupport == 0) return ncclSuccess; if (gpu->gpu.gdrSupport == 0) return ncclSuccess; if (read) { // For reads (sends) only enable under certain conditions int gdrReadParam = ncclParamNetGdrRead(); if (gdrReadParam == 0) return ncclSuccess; // Disable GDR Reads pre-Ampere when we have other PCI flows if (gdrReadParam < 0 && gpu->gpu.cudaCompCap < 80) { int nvlink = 0; // Since we don't know whether there are other communicators, // it's better to keep things local if we have a single GPU. 
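// Illustrative sketch (not NCCL code): the remainder of ncclTopoCheckGdr() below comes down to
// a distance test: GPU Direct RDMA stays enabled only if the path type between the GPU and the
// NIC is at most netGdrLevel (PATH_PXB by default, overridable via NCCL_NET_GDR_LEVEL); in the
// PXN case the intermediate GPU's distance is used instead. Reduced to its core predicate:
#if 0
static bool gdrAllowedSketch(int pathTypeGpuToNic, int netGdrLevel) {
  // Smaller path-type values mean a closer/faster path, so "close enough for GDR"
  // is a simple <= comparison against the configured threshold.
  return pathTypeGpuToNic <= netGdrLevel;
}
#endif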
if (system->nodes[GPU].count == 1) nvlink = 1; for (int i=0; inodes[GPU].count; i++) { if (i == g) continue; if (gpu->paths[GPU][i].type == PATH_NVL) { nvlink = 1; break; } } if (!nvlink) return ncclSuccess; } } // Check if we are close enough that it makes sense to enable GDR int netGdrLevel = PATH_PXB; NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL")); if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel; int distance = gpu->paths[NET][n].type; if (distance == PATH_PXN) { // In case of PXN, use the intermediate GPU distance instead int proxyRank, g; NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netId, &proxyRank)); NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g)); struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g; distance = proxyGpu->paths[NET][n].type; } if (distance > netGdrLevel) { INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %lx (distance %d > %d)", busId, netId, distance, netGdrLevel); return ncclSuccess; } *useGdr = 1; INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %lx (distance %d <= %d), read %d", busId, netId, distance, netGdrLevel, read); return ncclSuccess; } // Set to 0 to disable the flush on Hopper when using GDR NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0); // Determine whether we need to flush the GDR recv buffers ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush) { int g; NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Flush is required on Ampere and earlier *flush = gpu->gpu.cudaCompCap < 90 ? 1 : ncclParamNetForceFlush(); return ncclSuccess; } NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0); // Check whether going through the network would be faster than going through P2P/SHM. ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net) { if (ncclParamNetDisableIntra() == 1) { *net = 0; return ncclSuccess; } *net = 1; // First check the current GPU-to-GPU speed. 
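// Illustrative sketch (not NCCL code): ncclTopoCheckNet() below compares the direct GPU<->GPU
// bandwidth against the best NIC bandwidth each GPU can reach through a PXB-or-better path,
// and only reports "use the network" when both GPUs would go faster through their NICs.
// In reduced form:
#if 0
static bool preferNetworkSketch(float p2pBw,
                                const float* nicBw1, const int* nicPathType1,
                                const float* nicBw2, const int* nicPathType2,
                                int nNics, int pathPxb) {
  float best1 = 0.0f, best2 = 0.0f;
  for (int n = 0; n < nNics; n++) {
    if (nicPathType1[n] <= pathPxb && nicBw1[n] > best1) best1 = nicBw1[n];
    if (nicPathType2[n] <= pathPxb && nicBw2[n] > best2) best2 = nicBw2[n];
  }
  return best1 > p2pBw && best2 > p2pBw;   // both sides must win for the NIC path to be chosen
}
#endif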
int g1, g2; if (ncclTopoIdToIndex(system, GPU, id1, &g1) != ncclSuccess || ncclTopoIdToIndex(system, GPU, id2, &g2) != ncclSuccess) { return ncclSuccess; } struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1; struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2; float speed = gpu1->paths[GPU][g2].bw; // Now check the speed each GPU can access the network through PXB or better float netSpeed1 = 0, netSpeed2 = 0; for (int n=0; nnodes[NET].count; n++) { struct ncclTopoLinkList* path = gpu1->paths[NET]+n; if (path->type <= PATH_PXB && path->bw > netSpeed1) netSpeed1 = path->bw; path = gpu2->paths[NET]+n; if (path->type <= PATH_PXB && path->bw > netSpeed2) netSpeed2 = path->bw; } if (netSpeed1 > speed && netSpeed2 > speed) return ncclSuccess; *net = 0; return ncclSuccess; } ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank) { // Get GPU and NET int n, g; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n)); NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; struct ncclTopoLinkList* path = gpu->paths[NET]+n; if (path->type == PATH_PXN) { struct ncclTopoNode* node; int type = NVS; for (int i=0; icount && type == NVS; i++) { node = path->list[i]->remNode; type = node->type; } if (type != GPU) { WARN("Could not find intermediate GPU between GPU rank %d and NIC %lx", rank, netId); return ncclInternalError; } *intermediateRank = node->gpu.rank; } else { *intermediateRank = rank; } return ncclSuccess; } NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0); // Net v4 plugins don't have non-blocking connect/accept. We can't therefore use // remote proxies without risking deadlocks int ncclPxnDisable(struct ncclComm* comm) { static int pxnDisable = -1; if (pxnDisable == -1) { if (comm && ncclNetVersion(comm) == 4) { INFO(NCCL_INIT, "PXN Disabled as plugin is v4"); pxnDisable = 1; } else { pxnDisable = ncclParamPxnDisable(); } } return pxnDisable; } ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks) { struct ncclTopoSystem* system = comm->topo; *nranks = 0; *intermediateRanks = NULL; if (system->nodes[NET].count == 0) return ncclSuccess; int nr = 0; int* ranks = NULL; for (int rank=0; ranknRanks; rank++) { int64_t netId; int proxyRank; NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank)); if (proxyRank == comm->rank) continue; int useGdr; NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netId, 1, &useGdr)); if (useGdr == 0) continue; int found = 0; for (int r=0; rnodes[CPU].count; c++) { NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system)); } // Set direct paths to GPUs. for (int g=0; gnodes[GPU].count; g++) { NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system)); } // Set direct paths to NICs. for (int n=0; nnodes[NET].count; n++) { NCCLCHECK(ncclTopoSetPaths(system->nodes[NET].nodes+n, system)); } // Set direct paths to NVSwitches. 
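// Illustrative sketch (not NCCL code): each ncclTopoSetPaths() call runs a breadth-first sweep
// from the base node and keeps, for every other node, the "widest" path: a candidate replaces
// the stored path when the stored one is unset or longer in hops, and the candidate's bottleneck
// bandwidth (min of the path bandwidth and the traversed link's bandwidth) is higher. The
// relaxation step in isolation:
#if 0
#include <algorithm>

struct SketchPath { int hops; float bw; };   // bw == 0 means "not set yet"

static bool relaxSketch(SketchPath* stored, const SketchPath& base, float linkBw) {
  float candidateBw = std::min(base.bw, linkBw);   // bottleneck bandwidth through this link
  if ((stored->bw == 0 || stored->hops > base.hops) && stored->bw < candidateBw) {
    stored->hops = base.hops + 1;
    stored->bw = candidateBw;
    return true;   // caller re-queues this node for the next BFS wave
  }
  return false;
}
#endif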
  for (int n=0; n<system->nodes[NVS].count; n++) {
    NCCLCHECK(ncclTopoSetPaths(system->nodes[NVS].nodes+n, system));
  }

  // Update path for GPUs when we don't want to / can't use GPU Direct P2P
  for (int g=0; g<system->nodes[GPU].count; g++) {
    for (int p=0; p<system->nodes[GPU].count; p++) {
      int p2p;
      NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
      if (p2p == 0) {
        // Divert all traffic through the CPU
        int cpu;
        NCCLCHECK(getLocalCpu(system, g, &cpu));
        NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
      }
    }

    if (comm == NULL) continue;
    // Remove GPUs we can't (or don't want to) communicate with through P2P or SHM
    struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank;
    for (int p=0; p<system->nodes[GPU].count; p++) {
      if (p == g) continue;
      struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank;
      int p2p;
      NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
      if (p2p == 0) {
        int shm;
        NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
        if (shm == 0) {
          // Mark this peer as inaccessible. We'll trim it later.
          system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
        }
      }
    }
  }

  // Update paths for NICs (no GPU Direct, PXN, ...)
  for (int n=0; n<system->nodes[NET].count; n++) {
    struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;

    for (int g=0; g<system->nodes[GPU].count; g++) {
      // Check whether we can access the NIC through another NVLink-connected GPU (PXN)
      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
      if (ncclPxnDisable(comm) != 1) {
        int localGpuIndex;
        NCCLCHECK(ncclTopoGetLocalGpu(system, netNode->id, &localGpuIndex));
        if (localGpuIndex != g && localGpuIndex != -1) {
          // PXN = PCI + NVLink.
          struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex;
          // Only use PXN for NIC n if remote GPU p ...
          if (peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
              peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
              NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && // Is on the same node as us
              (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
               gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
            // We can use that GPU as relay to communicate with that NIC.
            // Only enabling it in the GPU->NIC direction for now to favor
            // receiving locally and sending remotely (consistent with net.cc)
            NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
        }
      }
      if (gpu->paths[NET][n].type < PATH_PHB) {
        // Update path when we don't want to / can't use GPU Direct RDMA.
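        // Sketch of the fallback (path shapes are illustrative): with GDR the
        // data path is GPU <-> PCIe <-> NIC; without it, both directions get an
        // extra hop through the CPU local to the GPU (a host-memory bounce),
        // which is what the two addInterStep(..., CPU, localCpu, ...) calls
        // below install.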
int gdr; NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr)); if (gdr == 0) { // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU int localCpu; NCCLCHECK(getLocalCpu(system, g, &localCpu)); NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g)); NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n)); } } } } return ncclSuccess; } ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) { int *domains; int64_t *ids; NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count)); NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count)); int myDomain = 0; for (int g=0; gnodes[GPU].count; g++) { struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; domains[g] = g; ids[g] = gpu->id; for (int p=0; ppaths[GPU][p].type < PATH_NET) { domains[g] = std::min(domains[g], domains[p]); } } if (gpu->gpu.rank == comm->rank) myDomain = domains[g]; } int ngpus = system->nodes[GPU].count; for (int i=0; inodes[GPU].count /* This one varies over the loops */; g++) { gpu = system->nodes[GPU].nodes+g; if (gpu->id == ids[i]) break; else gpu=NULL; } if (gpu == NULL) { WARN("Could not find id %lx", ids[i]); free(domains); free(ids); return ncclInternalError; } NCCLCHECK(ncclTopoRemoveNode(system, GPU, g)); } if (system->nodes[GPU].count == comm->nRanks) { for (int n=system->nodes[NET].count-1; n>=0; n--) NCCLCHECK(ncclTopoRemoveNode(system, NET, n)); } free(domains); free(ids); return ncclSuccess; } void ncclTopoFree(struct ncclTopoSystem* system) { for (int t=0; ttopo; struct ncclTopoLinkList* path = NULL; if (ncclTopoRankToIndex(system, peerRank, &peer) == ncclSuccess) { // Same rank if (g == peer) { *nChannels = -1; return ncclSuccess; } // Local rank path = system->nodes[GPU].nodes[peer].paths[GPU]+g; if (path->type == PATH_NVL) { float nvlBw = ncclTopoNVLinkBw(system->nodes[GPU].nodes[g].gpu.cudaCompCap); *nChannels = 2*std::max(1, (int)(path->bw / nvlBw)); } else { *nChannels = 2; } } else { // Remote rank, use network int nNetChannels = ncclParamNChannelsPerNetPeer(); if (nNetChannels == -1) { //start from 2 channels per NIC and reduce with scale nNetChannels = 2; // check if we need to use more than one NIC, hence more than one channel int netCountByBw = 1, nChannelsMax = nNetChannels; NCCLCHECK(getLocalNetCountByBw(system, g, &netCountByBw)); // Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth. while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2; //allow upto channels requires to drive the NICs nNetChannels = std::max(netCountByBw, nChannelsMax); } *nChannels = nNetChannels; } return ncclSuccess; } NCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 1); NCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS); extern int64_t ncclParamWorkArgsBytes(); ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { /* here we already honor comm->max/minCTAs for p2pnChannels. 
*/ if (comm->sharedRes->owner != comm) { comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels()); comm->p2pnChannels = std::min(std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()), comm->sharedRes->tpP2pNChannels); } else { comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels()); comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()); } int minChannels = comm->p2pnChannels; // We need to loop through all local GPUs to have a global picture for (int g=0; gtopo->nodes[GPU].count; g++) { for (int r=0; rnRanks; r++) { int nChannels; NCCLCHECK(ncclTopoGetNchannels(comm, g, r, &nChannels)); if (nChannels >= 0) minChannels = std::min(minChannels, nChannels); } } // Make nChannelsPerPeer and nChannels powers of 2. This is relied on when // mapping p2p peers to channels. comm->p2pnChannelsPerPeer = pow2Up(minChannels); comm->p2pnChannels = pow2Up(comm->p2pnChannels); comm->p2pnChannels = std::min(comm->p2pnChannels, pow2Down(ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes()))); comm->p2pnChannelsPerPeer = std::min(comm->p2pnChannelsPerPeer, comm->p2pnChannels); // Init channels that weren't used so far for (int c=comm->nChannels; cp2pnChannels; c++) NCCLCHECK(initChannel(comm, c)); return ncclSuccess; } ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks) { int ngpus = system->nodes[GPU].count; NCCLCHECK(ncclCalloc(ranks, ngpus)); int nvbGpus = 0; for (int g=0; gnodes[GPU].nodes+g; if (gpu->gpu.rank != rank) continue; for (int p=0; ppaths[GPU][p].type == PATH_NVB) { (*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank; } } } *nranks = nvbGpus; return ncclSuccess; } int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) { int minPath = PATH_DIS; for (int i=0; inodes[GPU].count; i++) { struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU]; for (int j=0; jnodes[GPU].count; j++) { if (i == j) continue; minPath = std::min(minPath, paths[j].type); } } return minPath >= PATH_PIX ? 0 : 1; } nccl-2.22.3-1/src/graph/rings.cc000066400000000000000000000035141463451655400162160ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "core.h" #define MAXWIDTH 20 #define PREFIXLEN 15 #define STRLENGTH (PREFIXLEN+5*MAXWIDTH) void dumpLine(int* values, int nranks, const char* prefix) { int prefixlen = strlen(prefix); char line[STRLENGTH+1]; line[STRLENGTH] = '\0'; memset(line, ' ', STRLENGTH); strncpy(line, prefix, PREFIXLEN); for (int i=0; i NCCL_PARAM(CrossNic, "CROSS_NIC", 2); // Initialize system->maxBw. This is the per-channel (i.e. per-SM) // max bw. 
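// Example with hypothetical numbers: a GPU whose NVLinks add up to 300 GB/s but
// whose best single path to a NIC is 25 GB/s would give totalBw = 300 and
// maxBw = 25 (or the best GPU-GPU path instead of NET when there is no NIC).
// The search uses these two caps to pick a realistic starting speed per channel.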
static float getMaxBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) { float maxBw = 0.0; for (int i=0; inodes[type].count; i++) { struct ncclTopoLinkList* path = gpu->paths[type]+i; float bw = path->bw; if (path->count == 0) continue; maxBw = std::max(maxBw, bw); } return maxBw; } static float getTotalBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { float nvlinkBw = 0.0, pciBw = 0.0; for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = gpu->links+l; if (link->type == LINK_NVL) nvlinkBw += link->bw; if (link->type == LINK_PCI) pciBw = link->bw; } return std::max(pciBw, nvlinkBw); } ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) { system->maxBw = 0.0; system->totalBw = 0.0; int inter = system->nodes[NET].count; if (inter == 0 && system->nodes[GPU].count == 1) { system->maxBw = LOC_BW; system->totalBw = LOC_BW; return ncclSuccess; } for (int g=0; gnodes[GPU].count; g++) { struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; system->maxBw = std::max(system->maxBw, getMaxBw(system, gpu, inter ? NET : GPU)); system->totalBw = std::max(system->totalBw, getTotalBw(system, gpu)); } return ncclSuccess; } ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm) { // We assume there is at least one CPU and that the CPUs have the same // architecture and vendor. const struct ncclTopoNodeSet* cpus = &comm->topo->nodes[CPU]; comm->cpuArch = cpus->nodes[0].cpu.arch; comm->cpuVendor = cpus->nodes[0].cpu.vendor; return ncclSuccess; } static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, int type, struct ncclTopoLink** revLink) { for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node2->links+l; if (link->remNode == node1 && link->type == type) { *revLink = link; return ncclSuccess; } } WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id); return ncclInternalError; } // This is unfortunately needed since manipulating floats often results in rounding errors. #define SUB_ROUND(a, b) (a = roundf((a-b)*1000)/1000) static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float bw, int* steps) { float pciBw = bw; for (int step=0; stepcount; step++) { struct ncclTopoNode* node = path->list[step]->remNode; if (node->type == CPU) { // Account for P2P inefficiency through Intel CPU RC if (path->type == PATH_PHB && start->type == GPU && node->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && node->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { pciBw = INTEL_P2P_OVERHEAD(bw); } } } struct ncclTopoNode* node = start; for (int step=0; steplist[step]; struct ncclTopoLink* revLink = NULL; float fwBw = link->type == LINK_PCI ? pciBw : bw; float revBw = 0; if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) { if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink)); revBw += fwBw/8; } if (link->remNode->type == CPU && link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER && link->type == LINK_NVL) { if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink)); revBw += fwBw; } if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; } SUB_ROUND(link->bw, fwBw); if (revBw) SUB_ROUND(revLink->bw, revBw); node = link->remNode; } *steps = maxSteps; return ncclSuccess; } // Try to go from node type1/index1 to no type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing (-1). 
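// (i.e. from node type1/index1 to node type2/index2.) The helper tentatively
// reserves "bw" on every link of the path when mult is 1 and releases it again
// when called with mult = -1, so the graph search can try a channel, check that
// enough link bandwidth is left, and cleanly undo the reservation on backtrack.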
static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, float mult, struct ncclTopoNode** node) { // First handle easy cases *node = system->nodes[type2].nodes+index2; if (type1 == -1) return ncclSuccess; struct ncclTopoNode* node1 = system->nodes[type1].nodes+index1; struct ncclTopoLinkList* path = node1->paths[type2]+index2; struct ncclTopoNode* node2 = system->nodes[type2].nodes+index2; struct ncclTopoLinkList* revPath = node2->paths[type1]+index1; if (path == NULL) { WARN("No path computed to go from %s/%d to %s/%d", topoNodeTypeStr[type1], index1, topoNodeTypeStr[type2], index2); return ncclInternalError; } // Now check link type *node = NULL; int intra = (type1 == GPU || type1 == NVS) && (type2 == GPU || type2 == NVS); float bw = intra ? graph->bwIntra : graph->bwInter; int type = intra ? graph->typeIntra : graph->typeInter; if (mult == 1 && (path->type > type)) return ncclSuccess; if (mult == 1 && (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE || graph->pattern == NCCL_TOPO_PATTERN_TREE || graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) && (revPath->type > type)) return ncclSuccess; bw *= mult; // Check there is enough bandwidth on paths. int step = 0; NCCLCHECK(followPath(path, node1, path->count, bw, &step)); if (step < path->count) goto rewind; // Enough bandwidth : return destination node. graph->nHops += mult*path->count; *node = system->nodes[type2].nodes+index2; return ncclSuccess; rewind: // Not enough bandwidth : rewind and exit. NCCLCHECK(followPath(path, node1, step, -bw, &step)); return ncclSuccess; } static int gpuPciBw(struct ncclTopoNode* gpu) { for (int l=0; lnlinks; l++) { struct ncclTopoLink* gpuLink = gpu->links+l; if (gpuLink->type != LINK_PCI) continue; struct ncclTopoNode* pci = gpuLink->remNode; for (int l=0; lnlinks; l++) { struct ncclTopoLink* pciLink = pci->links+l; if (pciLink->remNode != gpu) continue; return std::min(gpuLink->bw, pciLink->bw); } } return -1; } /* Choose the order in which we try next GPUs. This is critical for the search to quickly converge to the best solution even if it eventually times out. 
*/ struct ncclGpuScore { int g; // Retain the index int startIndex; // Least important int intraNhops; int intraBw; int interNhops; int interPciBw; int interBw; // Most important }; static int cmpScore(const void * g1, const void * g2) { struct ncclGpuScore *s1 = (struct ncclGpuScore*)g1; struct ncclGpuScore *s2 = (struct ncclGpuScore*)g2; int d; if ((d = (s2->interBw - s1->interBw))) return d; if ((d = (s2->interPciBw - s1->interPciBw))) return d; if ((d = (s1->interNhops - s2->interNhops))) return d; if ((d = (s2->intraBw - s1->intraBw))) return d; if ((d = (s1->intraNhops - s2->intraNhops))) return d; return s1->startIndex - s2->startIndex; } static int cmpIntraScores(struct ncclGpuScore* scores, int count) { int intraBw = scores[0].intraBw; int intraNhops = scores[0].intraNhops; for (int i=1; inodes[GPU].count; g++) { if (system->nodes[GPU].nodes[g].gpu.rank == rank) { *index = g; return ncclSuccess; } } WARN("Could not find gpu rank %d", rank); return ncclInternalError; } static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int* index) { for (int n=0; nnodes[NET].count; n++) { if (system->nodes[NET].nodes[n].id == id) { *index = n; return ncclSuccess; } } WARN("Could not find net id %lx", id); return ncclInternalError; } static ncclResult_t getNetPaths(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoLinkList** netPaths) { int64_t netId = graph->inter[graph->nChannels*2]; int n; NCCLCHECK(getNetIndex(system, netId, &n)); *netPaths=system->nodes[NET].nodes[n].paths[GPU]; return ncclSuccess; } ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* gpu, int* next, int* countPtr, int sortNet) { const uint64_t flag = 1ULL<<(graph->nChannels); int ngpus = system->nodes[GPU].count; struct ncclTopoLinkList* paths = gpu->paths[GPU]; struct ncclTopoLinkList* netPaths = NULL; if (sortNet) NCCLCHECK(getNetPaths(system, graph, &netPaths)); struct ncclGpuScore scores[NCCL_TOPO_MAX_NODES]; memset(scores, 0, ngpus*sizeof(struct ncclGpuScore)); int start = gpu-system->nodes[GPU].nodes; int count = 0; for (int i=1; inodes[GPU].nodes[g].used & flag) continue; scores[count].g = g; scores[count].startIndex = i; scores[count].intraNhops = paths[g].count; scores[count].intraBw = paths[g].bw; if (netPaths) { scores[count].interNhops = netPaths[g].count; scores[count].interPciBw = gpuPciBw(system->nodes[GPU].nodes+g); scores[count].interBw = netPaths[g].bw; } count++; } // Sort GPUs qsort(scores, count, sizeof(struct ncclGpuScore), cmpScore); // Check if all have the same intra-node score in which case we go reverse for sortNet = -1 if (sortNet == -1 && cmpIntraScores(scores, count) == 0) { for (int i=0; inodes[NVS].count) { // NVSwitches prefer when we talk to a limited set of peers. Try to use neighbors first. 
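    // Concretely, for a ring the code below moves the two index neighbors
    // ((index+1)%ngpus and (index-1+ngpus)%ngpus) to the front of the candidate
    // list, and for the tree patterns it favors the previous neighbor, so
    // orderings that keep NVSwitch traffic between adjacent GPUs are tried first.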
int index = gpu-system->nodes[GPU].nodes; int i; int prevGpu = (index-1+ngpus)%ngpus; int nextGpu = (index+1)%ngpus; int firstGpus[2]; int firstGpuCount = 0; if (graph->pattern == NCCL_TOPO_PATTERN_RING) { firstGpus[0] = nextGpu; firstGpus[1] = prevGpu; firstGpuCount = 2; } else if (graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) { firstGpus[0] = prevGpu; firstGpus[1] = nextGpu; firstGpuCount = 2; } else { firstGpus[0] = nextGpu; firstGpuCount = 1; } if (nextGpu == prevGpu && firstGpuCount == 2) firstGpuCount = 1; int firstGpuRealCount = 0; for (int g=0; g0; i--) next[i] = next[i-1]; next[0] = firstGpus[g]; firstGpuRealCount++; } } *countPtr = firstGpuRealCount; } return ncclSuccess; } ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time); // Try to keep all searchs within one second #define NCCL_SEARCH_GLOBAL_TIMEOUT (1ULL<<19) #define NCCL_SEARCH_TIMEOUT (1<<14) #define NCCL_SEARCH_TIMEOUT_TREE (1<<14) #define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<8) #define FORCED_ORDER_PCI 1 #define FORCED_ORDER_REPLAY 2 ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int step, int* g) { *g = -1; if (graph->nChannels == 0) return ncclInternalError; int ngpus = system->nodes[GPU].count; int nextRank = graph->intra[(graph->nChannels-1)*ngpus+step+1]; for (int i=0; inodes[GPU].nodes[i].gpu.rank == nextRank) { *g = i; return ncclSuccess; } if (*g == -1) return ncclInternalError; return ncclSuccess; } ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time); ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time, int type, int index, int g) { const uint64_t flag = 1ULL<<(graph->nChannels); struct ncclTopoNode* gpu; NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu)); if (gpu) { gpu->used ^= flag; NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, time)); gpu->used ^= flag; NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, -1, &gpu)); } return ncclSuccess; } ncclResult_t ncclTopoSearchTryCollnetDirect(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) { int fwdg = 0; int bwdg = 0; struct ncclTopoNode* gpu = NULL; float mul = 1.0 / (float)(system->nodes[GPU].count - 1); do { NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, fwdg, mul, &gpu)); } while (gpu && ++fwdg < system->nodes[GPU].count); if (gpu != NULL) { do { NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, bwdg, GPU, g, mul, &gpu)); } while (gpu && ++bwdg < system->nodes[GPU].count); if (gpu != NULL) { // Both directions worked. Now we already have head, so pop the all other intra ranks. 
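    // "Pop" here means: GPU g stays in slot 0 of this channel as the head, and
    // every other local GPU is appended in index order to graph->intra[] before
    // recursing with step == ngpus, i.e. the channel is then complete.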
int step = 1; for (int index = 0; index < ngpus; ++index) { if (index != g) { graph->intra[graph->nChannels * ngpus + step] = system->nodes[GPU].nodes[index].gpu.rank; step++; } } NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time)); } while (bwdg) { bwdg--; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, bwdg, GPU, g, -mul, &gpu)); } } while (fwdg) { fwdg--; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, fwdg, -mul, &gpu)); } return ncclSuccess; } ncclResult_t ncclTopoSearchTryNvls(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) { struct ncclTopoNode* nvs; struct ncclTopoNode* gpu; int d0=0; // See if there is enough bandwidth for NVS->GPU traffic do { NCCLCHECK(ncclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? 2 : 1, &gpu)); d0++; } while (gpu && d0 < system->nodes[GPU].count); if (gpu == NULL) { d0--; } else { int d1=0; // See if there is enough bandwidth for GPU->NVS traffic do { NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? 2 : 1, &nvs)); d1++; } while (nvs && d1 < system->nodes[GPU].count); if (nvs == NULL) { d1--; } else { // Both directions worked. Move on to the next path. NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time)); } while (d1) { d1--; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? -2 : -1, &nvs)); } } while (d0) { d0--; NCCLCHECK(ncclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? -2 : -1, &gpu)); } return ncclSuccess; } ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) { // 1. Try to get the same nChannels between Rings and Trees if (graph->nChannels < graph->minChannels) return ncclSuccess; if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // NVLS channels correspond to GPUs pulling from NVLS. So the more the better. if (graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count) *copy = 1; if (graph->nChannels*graph->bwInter > refGraph->nChannels*refGraph->bwInter) *copy = 1; return ncclSuccess; } // 2. Try to get better bandwidth if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) { *copy = 1; return ncclSuccess; } if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra) return ncclSuccess; // 3. Less hops if (graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1; return ncclSuccess; } // Build a sorted list of the NETs to try. // // "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu // index when trying to get back to the NIC. // // The list is built the following way: // 1. Select NETs starting with those close to GPU(s), based on paths[n].type. // 2. add other NETs satisfying typeInter but not already in the list. 
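// For example (hypothetical 2-GPU / 4-NIC node): if GPU0's preferred NIC is 0
// and GPU1's is 2, the list starts as [0, 2]; the remaining NICs whose path
// type still satisfies typeInter are then appended, e.g. giving [0, 2, 1, 3].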
ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) { int netCount = 0; int localNetCount; int* localNets; NCCLCHECK(ncclCalloc(&localNets, MAXCHANNELS)); // First add the preferred NICs for (int g=0; gnodes[GPU].count; g++) { if (gpu != -1 && gpu != g) continue; localNetCount = 0; struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; for (int c = 0; cgpu.rank, c, &netId, NULL)); NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount)); if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; localNetCount++; } // Append NICs to list for (int i=0; inodes[GPU].count; g++) { if (gpu != -1 && gpu != g) continue; localNetCount = 0; struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; struct ncclTopoLinkList* paths = gpu->paths[NET]; for (int n=0; nnodes[NET].count && nnodes[GPU].count; if (step == ngpus) { // Determine whether we found a better solution or not int copy = 0; graph->nChannels++; NCCLCHECK(ncclTopoCompareGraphs(system, graph, saveGraph, ©)); if (copy) { memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph)); if (graph->nChannels == graph->maxChannels) *time = -1; } if (graph->nChannels < graph->maxChannels) { NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, time)); } graph->nChannels--; return ncclSuccess; } graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank; int g = gpu - system->nodes[GPU].nodes; if (step == backToNet) { // first get back to NIC if (system->nodes[NET].count) { int startNetIndex; NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex)); struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; int netCount; int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount)); for (int i=0; inodes[NET].nodes+n; if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue; if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2]) continue; // Balanced Tree : count half of the bandwidth on first two GPUs int nextBackToNet = -1; float bwInterSave = graph->bwInter; if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) { // Count half of the bandwidth on each of the first two GPUs if (step == 0) nextBackToNet = 1; else if (net->id != graph->inter[graph->nChannels*2+1]) continue; graph->bwInter /= 2; } NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); graph->bwInter = bwInterSave; if (net) { graph->inter[graph->nChannels*2+1] = net->id; NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time)); if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net)); graph->bwInter = bwInterSave; } } free(nets); } } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time)); } else if (graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) { NCCLCHECK(ncclTopoSearchTryCollnetDirect(system, graph, saveGraph, g, ngpus, time)); } else if (step < system->nodes[GPU].count-1) { // Go to next GPU int next[NCCL_TOPO_MAX_NODES]; int count; if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order next[0] = step+1; count = 1; } else if 
(forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next)); count = 1; } else { // Normal search NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 )); } for (int i=0; iintra[graph->nChannels*ngpus], &p)); struct ncclTopoNode* firstGpu; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, 1, &firstGpu)); if (firstGpu) { NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, time)); NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, -1, &firstGpu)); } } else { // Next path NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time)); } return ncclSuccess; } ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { const int bw = graph->bwInter; int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); int netCount; int graphFound = 0; NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount)); for (int i=0; ipattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) continue; int n = nets[(graph->nChannels+i)%netCount]; struct ncclTopoNode* net = system->nodes[NET].nodes+n; if (graph->collNet && net->net.collSupport == 0) continue; if (net->net.bw < bw) continue; if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue; graph->inter[graph->nChannels*2] = net->id; graph->latencyInter = net->net.latency; for (int i=0; inodes[NET].count; i++) { if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) { system->nodes[NET].nodes[i].net.bw -= bw; } } if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) { // NVLS search only tries to find NIC:GPU combinations to compute the heads. 
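      // In other words, each NVLS / COLLNET_DIRECT channel is identified by a
      // (NIC, head GPU) pair: the code below looks up the GPU closest to NIC n
      // and, if that GPU is not already the head of a previous channel, tries
      // it as the head of this one.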
if (graph->nChannels < netCount) { int gpu; int duplicate = 0; NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); // check whether there is duplicate head when one GPU connects with multiple NICs for (int gc = 0; gc < graph->nChannels; gc++) { if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) { duplicate = 1; break; } } if (duplicate) continue; if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu)); graphFound = 1; } } else { if (graph->nChannels > 0) { // Try to replay the last channel int g; NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); } if (graph->nChannels == 0 || graph->sameChannels == 0) { if (graph->nChannels == 0 && system->nodes[NVS].count == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long int t = 1 << 10; NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0)); if (t == -1) *time = -1; } // Then try the most local GPUs float maxBw = 0; int minHops = 0xfffffff; struct ncclTopoLinkList* paths = net->paths[GPU]; for (int g=0; gnodes[GPU].count; g++) { if (paths[g].bw > maxBw) { maxBw = paths[g].bw; minHops = paths[g].count; } else if (paths[g].bw == maxBw && paths[g].count < minHops) { minHops = paths[g].count; } } if (maxBw >= bw) { for (int i=0; inodes[GPU].count; i++) { int g = (graph->nChannels+i)%system->nodes[GPU].count; if (paths[g].bw == maxBw && paths[g].count == minHops) { NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); } } } } } for (int i=0; inodes[NET].count; i++) { if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) { system->nodes[NET].nodes[i].net.bw += bw; } } } free(nets); return ncclSuccess; } /* Search Patterns * * Intra-node * Ring : GPU a -> GPU b -> .. -> GPU x -> GPU a * (=Split Tree Loop) * Tree : GPU a -> GPU b -> .. -> GPU x * (=Split Tree) * * Inter-node * Ring : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic) * Tree : NET n -> GPU a -> GPU b -> .. -> GPU x * `--> NET n (or m if crossNic) * Split Tree : NET n -> GPU a -> GPU b -> .. -> GPU x * `--> NET n (or m if crossNic) * Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a * `--> NET n (or m if crossNic) */ ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) { if (system->nodes[NET].count) { if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1; else if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) *backToNet = 1; else *backToNet = 0; *backToFirstRank = -1; } else { *backToNet = -1; if (pattern == NCCL_TOPO_PATTERN_RING) *backToFirstRank = system->nodes[GPU].count-1; else *backToFirstRank = -1; } return ncclSuccess; } ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time) { int backToNet, backToFirstRank; NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank)); if (system->nodes[NET].count) { // Start from NET ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time); } else { // Intra-node only. 
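    // With no NET nodes the search only has to pick a starting GPU: PCI order
    // for the very first channel as a baseline, a replay of the previous
    // channel when one exists, and then (unless identical channels are forced)
    // every other GPU as an alternative starting point.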
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, graph->nChannels)); return ncclSuccess; } else if (graph->nChannels == 0) { // Try PCI order first NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0)); } else { // Also try to replay previous channel int g; NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, -1, -1, g)); } if (graph->sameChannels == 0 || graph->nChannels == 0) { // Finally, try all other possibilities unless we are forced to use the same channels for (int g=0; gnodes[GPU].count; g++) { NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g)); } } } return ncclSuccess; } /************************************/ /* User defined graph from XML file */ /************************************/ struct kvDict kvDictLinkType[] = { { "LOC", PATH_LOC }, { "NVL", PATH_NVL }, { "NVB", PATH_NVB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "PXN", PATH_PXN }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } }; ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; int64_t* inter = graph->inter+2*c; int* intra = graph->intra+ngpus*c; int n=0, g=0; for (int s=0; snSubs; s++) { struct ncclXmlNode* sub = xmlChannel->subs[s]; int64_t dev; const char* str; NCCLCHECK(xmlGetAttrStr(sub, "dev", &str)); dev = strtol(str, NULL, 16); if (strcmp(sub->name, "net") == 0) { inter[n++] = dev; } else if (strcmp(sub->name, "gpu") == 0) { int rank = -1; for (int g=0; gnodes[GPU].nodes[g].id); if (NCCL_TOPO_ID(systemId, system->nodes[GPU].nodes[g].gpu.dev) == dev) rank = system->nodes[GPU].nodes[g].gpu.rank; } if (rank == -1) { WARN("XML Import Channel : dev %ld not found.", dev); return ncclSystemError; } intra[g++] = rank; } } return ncclSuccess; } ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels) { int id; NCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &id)); if (graph->id != id) return ncclSuccess; int crossNic; NCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic)); if (ncclParamCrossNic() == 0 && crossNic == 1) return ncclSuccess; graph->crossNic = crossNic; NCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern)); NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter)); if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0; const char* str; NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str)); NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType)); NCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str)); NCCLCHECK(kvConvertToInt(str, &graph->typeInter, kvDictLinkType)); NCCLCHECK(xmlGetAttrInt(xmlGraph, "samechannels", &graph->sameChannels)); for (int s=0; snSubs; s++) { NCCLCHECK(ncclTopoGetChannelFromXml(xmlGraph->subs[s], s, system, graph)); } *nChannels = xmlGraph->nSubs; return ncclSuccess; } ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels) 
{ for (int s=0; snSubs; s++) { NCCLCHECK(ncclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph, nChannels)); } return ncclSuccess; } /* And the reverse : graph->xml */ ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) { struct ncclXmlNode* xmlChannel; int ngpus = system->nodes[GPU].count; int64_t* inter = graph->inter+2*c; int* intra = graph->intra+ngpus*c; NCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel)); struct ncclXmlNode* node; if (system->nodes[NET].count) { NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); NCCLCHECK(xmlSetAttrLong(node, "dev", inter[0])); } for (int g=0; gnodes[GPU].nodes[i].gpu.rank == intra[g]) { int systemId = NCCL_TOPO_ID_SYSTEM_ID(system->nodes[GPU].nodes[i].id); dev = NCCL_TOPO_ID(systemId, system->nodes[GPU].nodes[i].gpu.dev); } } if (dev == -1) { WARN("XML Export Channel : rank %d not found.", intra[g]); return ncclInternalError; } NCCLCHECK(xmlSetAttrLong(node, "dev", dev)); if (graph->id == 3) break; // NVLS graphs only use the first GPU } if (system->nodes[NET].count) { NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); NCCLCHECK(xmlSetAttrLong(node, "dev", inter[1])); } return ncclSuccess; } ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) { struct ncclXmlNode* xmlGraph; NCCLCHECK(xmlAddNode(xml, parent, "graph", &xmlGraph)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "id", graph->id)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "pattern", graph->pattern)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "crossnic", graph->crossNic)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels)); NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->bwIntra)); NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->bwInter)); NCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter)); const char* str; NCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType)); NCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str)); NCCLCHECK(kvConvertToStr(graph->typeInter, &str, kvDictLinkType)); NCCLCHECK(xmlSetAttr(xmlGraph, "typeinter", str)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "samechannels", graph->sameChannels)); for (int c=0; cnChannels; c++) { NCCLCHECK(ncclTopoGetXmlFromChannel(graph, c, system, xml, xmlGraph)); } return ncclSuccess; } ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml) { xml->maxIndex = 0; struct ncclXmlNode* xmlGraphs; NCCLCHECK(xmlAddNode(xml, NULL, "graphs", &xmlGraphs)); NCCLCHECK(xmlSetAttrInt(xmlGraphs, "version", NCCL_GRAPH_XML_VERSION)); for (int g=0; gnChannels == 0) return ncclSuccess; if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess; if (graph->bwIntra < 25.0) return ncclSuccess; if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess; int dupChannels = std::min(graph->nChannels*2, graph->maxChannels); memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int)); memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int64_t)); graph->bwIntra /= DIVUP(dupChannels, graph->nChannels); graph->bwInter /= DIVUP(dupChannels, graph->nChannels); graph->nChannels = dupChannels; return ncclSuccess; } float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 }; float 
speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float)) #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float)) float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 11.0, 6.0, 3.0 }; float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; int crossNic = (system->nodes[NET].count > 1) && (graph->pattern == NCCL_TOPO_PATTERN_RING || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE || graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0; graph->crossNic = crossNic == 1 ? 1 : 0; graph->bwIntra = graph->bwInter = 0; graph->latencyInter = 0; graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; graph->typeInter = PATH_PIX; graph->nChannels = 0; int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 0 : 1; graph->sameChannels = trySameChannels; int cpuArch, cpuVendor, cpuModel; NCCLCHECK(ncclTopoCpuType(system, &cpuArch, &cpuVendor, &cpuModel)); const char* str = ncclGetEnv("NCCL_GRAPH_FILE"); if (str) { INFO(NCCL_ENV, "NCCL_GRAPH_FILE set by environment to %s", str); struct ncclXml* xml; NCCLCHECK(xmlAlloc(&xml, NCCL_GRAPH_XML_MAX_NODES)); NCCLCHECK(ncclTopoGetXmlGraphFromFile(str, xml)); int nChannels; NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels)); INFO(NCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels); free(xml); if (graph->nChannels > 0) return ncclSuccess; } int ccMin; NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL)); if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess; // NVLS and COLLNET_DIRECT search must have ngpus heads at most. if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) graph->maxChannels = system->nodes[GPU].count; if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // Force intra-node NVLS algorithm to pull evenly from all GPUs. graph->minChannels = graph->maxChannels = system->nodes[GPU].count; } struct ncclTopoGraph tmpGraph; memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); // First try crossnic, then decrease bw and finally increase bwIntra. int nspeeds = 0; float* speedArray = NULL; if (system->nodes[NET].count == 0) { nspeeds = ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA; speedArray = ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra; } else { nspeeds = ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER; speedArray = ccMin >= 90 ? 
sm90SpeedArrayInter : speedArrayInter; } int pass = 1; int speedIndex = 0; float maxBw = system->maxBw; float totalBw = system->totalBw; if (ngpus > 1 && graph->pattern != NCCL_TOPO_PATTERN_RING) totalBw *= ngpus*1.0/(ngpus-1); while ((speedArray[speedIndex] > maxBw || speedArray[speedIndex]*graph->minChannels > totalBw) && speedIndex < nspeeds-1) speedIndex++; tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT; search: int time = tmpGraph.sameChannels ? NCCL_SEARCH_TIMEOUT_SAMECHANNELS : tmpGraph.pattern == NCCL_TOPO_PATTERN_TREE ? NCCL_SEARCH_TIMEOUT_TREE : NCCL_SEARCH_TIMEOUT; tmpGraph.nChannels = 0; globalTimeout -= time; NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time)); #if 0 printf("Id %d Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.id, tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : ""); for (int c=0; cnChannels; c++) { printf("%2d : ", c); for (int g=0; gintra[c*ngpus+g]); } printf("[%lx %lx]", graph->inter[c*2+0], graph->inter[c*2+1]); printf("\n"); } #endif // Optimal solution, stop here if (time == -1) goto done; if (graph->nChannels*graph->bwInter >= system->totalBw) goto done; if (pass == 1) { // First pass, we don't have a solution yet ; try other options // Try having different channels (except when going through AMD CPUs) if (tmpGraph.sameChannels == 1 && !(cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD && tmpGraph.typeIntra == PATH_SYS)) { tmpGraph.sameChannels = 0; goto search; } tmpGraph.sameChannels = trySameChannels; if (time != -1) globalTimeout += time; else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT; if (globalTimeout < 0 && graph->nChannels) goto done; // Try a simpler tree if (ccMin >= 90 && tmpGraph.pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) { tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE; goto search; } tmpGraph.pattern = graph->pattern; int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS; if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { tmpGraph.typeIntra += 1; goto search; } tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { tmpGraph.typeInter += 1; goto search; } tmpGraph.typeInter = PATH_PIX; if (crossNic == 2 && tmpGraph.crossNic == 0) { // Try again with crossNic if permitted tmpGraph.crossNic = 1; goto search; } tmpGraph.crossNic = crossNic == 1 ? 1 : 0; // Decrease bw until we find a solution if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) { tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex]; goto search; } speedIndex = 0; while (speedArray[speedIndex] > maxBw && speedIndex < nspeeds-1) speedIndex++; tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; } done: // We have a solution. Start from that solution and move to pass 2. 
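  // Pass 2 keeps the pass-1 topology and only tries to raise the bandwidth,
  // stepping bwIntra/bwInter one speedArray entry at a time and stopping as
  // soon as a higher speed no longer yields a valid graph.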
if (pass == 1) { time = -1; NCCLCHECK(ncclTopoDupChannels(graph, ccMin, ngpus)); memcpy(&tmpGraph, graph, sizeof(tmpGraph)); speedIndex = 0; while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++; tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; tmpGraph.minChannels = graph->nChannels; pass = 2; } if (pass == 2) { // See if we can increase bw if (time != 0 && speedIndex > 0) { if (graph->pattern == NCCL_TOPO_PATTERN_RING) { // increase bw for Ring tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[--speedIndex]; goto search; } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2) { tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels; tmpGraph.bwInter = speedArray[--speedIndex]; goto search; } else if (tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2) { // increase bwIntra for trees (2 nodes or collnet) tmpGraph.bwIntra = speedArray[--speedIndex]; goto search; } } time = -1; memcpy(&tmpGraph, graph, sizeof(tmpGraph)); } if (graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != NCCL_TOPO_PATTERN_NVLS) { WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern); for (int i=0; iintra[i] = system->nodes[GPU].nodes[i].gpu.rank; graph->inter[0] = graph->inter[1] = 0; graph->bwIntra = graph->bwInter = 0.1; graph->typeIntra = graph->typeInter = PATH_SYS; graph->nChannels = 1; } return ncclSuccess; } ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, bw %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->bwIntra, graph->bwInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels); int ngpus = system->nodes[GPU].count; char line[1024]; for (int c=0; cnChannels; c++) { sprintf(line, "%2d :", c); int offset = strlen(line); if (system->nodes[NET].count > 0) { sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c]); offset = strlen(line); } for (int i=0; iintra[ngpus*c+i]); offset = strlen(line); } if (system->nodes[NET].count > 0) { sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c+1]); offset = strlen(line); } INFO(NCCL_GRAPH, "%s", line); } return ncclSuccess; } ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) { const char* str = ncclGetEnv("NCCL_GRAPH_DUMP_FILE"); if (str) { INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str); struct ncclXml* xml; NCCLCHECK(xmlAlloc(&xml, NCCL_GRAPH_XML_MAX_NODES)); NCCLCHECK(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml)); NCCLCHECK(ncclTopoDumpXmlToFile(str, xml)); free(xml); } return ncclSuccess; } #include "comm.h" // NVLS channels aren't compute channels. 
Find which NIC corresponds to our rank being the head ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int channelId, int64_t* netId) { ncclResult_t ret = ncclSuccess; int localRanks = comm->topo->nodes[GPU].count; int netNum = 0; int64_t net[MAXCHANNELS]; for (int c = 0; c < graph->nChannels; c++) { if (graph->intra[c * localRanks] == comm->rank) { net[netNum++] = graph->inter[c * 2]; } } if (netNum) { *netId = net[channelId % netNum]; } else { ret = ncclInternalError; goto fail; } exit: return ret; fail: WARN("Could not find NIC for rank %d in NVLS graph", comm->rank); goto exit; } // 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2); ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank) { int64_t netId = -1; int netDev = -1; if (graph) { // Honor the net device in the graph int channel = channelId%graph->nChannels; int ngpus = comm->topo->nodes[GPU].count; int index = graph->intra[channel*ngpus] == rank ? 0 : 1; if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) { netId = graph->inter[channel*2+index]; } else { NCCLCHECK(getNvlsNetDev(comm, graph, channelId, &netId)); } NCCLCHECK(ncclTopoIdToNetDev(comm->topo, netId, &netDev)); if (dev) *dev = netDev; if (id) *id = netId; NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, netId, proxyRank)); } else if (peerRank == -1) { return ncclInternalError; } else { // Start with our local NIC and local Rank NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, &netId, &netDev)); if (dev) *dev = netDev; if (id) *id = netId; *proxyRank = rank; int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel(); // See whether we can use the remote rank preferred device. if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) { // Find local NIC number close to local nvmlDev int nvmlDev = comm->peerInfo[peerRank].nvmlDev; int localRank; if (ncclTopoDevToRank(comm->topo, nvmlDev, &localRank) != ncclSuccess) return ncclSuccess; NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netId, &netDev)); // Check that device exists on our node if (ncclParamCrossNic() == 0) { if (dev) *dev = netDev; if (id) *id = netId; } if (pxnLevel == 1) { int g, n; NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g)); NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &n)); struct ncclTopoNode* gpu = comm->topo->nodes[GPU].nodes+g; if (gpu->paths[NET][n].type <= PATH_PXN) { if (dev) *dev = netDev; if (id) *id = netId; NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank)); } } else if (pxnLevel == 2) { // Check which local GPU corresponds to that NIC and see if we can use PXN. int n, g1, g2; NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &n)); NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1)); NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netId, &g2)); if (g2 != -1) { struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2; if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) { *proxyRank = peerGpu->gpu.rank; if (dev) *dev = netDev; if (id) *id = netId; return ncclSuccess; } } } } } return ncclSuccess; } nccl-2.22.3-1/src/graph/topo.cc000066400000000000000000001213701463451655400160560ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "core.h" #include "graph.h" #include "topo.h" #include "comm.h" #include "nvmlwrap.h" #include "net.h" #include "coll_net.h" #include "transport.h" #include #include #include "xml.h" #include "cpuset.h" #include "bootstrap.h" #define BUSID_SIZE (sizeof("0000:00:00.0")) #define BUSID_REDUCED_SIZE (sizeof("0000:00")) const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" }; const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ /******************************************************************/ // Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000. ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) { char* str = path+offset; // Remove trailing "/" if (*str == '/') str--; // Find next / while (*str != '/') str--; str++; int64_t numid; NCCLCHECK(busIdToInt64(str, &numid)); // Ignore subdevice because those should use the same PCI link so we want to merge nodes. numid -= numid & 0xf; *id = numid; return ncclSuccess; } static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) { *cpu = NULL; if (node->type == CPU) { *cpu = node; return ncclSuccess; } for (int l=0; lnlinks; l++) { // Go up the PCI tree to find the CPU. Follow only PCI switches. if (node->links[l].type == LINK_PCI && (node->links[l].remNode->type == PCI || node->links[l].remNode->type == CPU)) { NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); } if (*cpu != NULL) return ncclSuccess; } return ncclSuccess; } int interCpuBw = 0; int cpuPciBw = 0; static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) { *bw = LOC_BW; if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) { *bw = P9_BW; return ncclSuccess; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_ARM) { *bw = ARM_BW; return ncclSuccess; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) { *bw = AMD_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? 
YONGFENG_ZPI_BW : ZPI_BW; } return ncclSuccess; } enum ncclNvLinkDeviceType { ncclNvLinkDeviceUnknown, ncclNvLinkDeviceGpu, ncclNvLinkDeviceSwitch, ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea) }; ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) { for (int i=0; inodes[type].count; i++) { if (system->nodes[type].nodes[i].id == id) { *node = system->nodes[type].nodes+i; return ncclSuccess; } } return ncclSuccess; } ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) { if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) { WARN("Error : tried to create too many nodes of type %d", type); return ncclInternalError; } struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count; system->nodes[type].count++; n->type = type; n->id = id; if (type == GPU) { n->gpu.dev = NCCL_TOPO_UNDEF; n->gpu.rank = NCCL_TOPO_UNDEF; n->gpu.cudaCompCap = NCCL_TOPO_UNDEF; } else if (type == CPU) { n->cpu.arch = NCCL_TOPO_UNDEF; n->cpu.vendor = NCCL_TOPO_UNDEF; n->cpu.model = NCCL_TOPO_UNDEF; } else if (type == NET) { n->net.asic = 0ULL; n->net.port = NCCL_TOPO_UNDEF; n->net.bw = 0.0; n->net.latency = 0.0; } *node = n; return ncclSuccess; } ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int index) { struct ncclTopoNode* delNode = system->nodes[type].nodes+index; for (int t=0; tpaths[t]); for (int n=0; nnodes[t].count; n++) { struct ncclTopoNode* node = system->nodes[t].nodes+n; if (node == delNode) continue; for (int l=0; lnlinks; l++) { while (lnlinks && node->links[l].remNode == delNode) { memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink)); node->nlinks--; } if (lnlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) { node->links[l].remNode--; } } } } memmove(delNode, delNode+1, (system->nodes[type].count-index-1)*sizeof(struct ncclTopoNode)); system->nodes[type].count--; return ncclSuccess; } ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw) { // Aggregate links into higher bw for NVLink struct ncclTopoLink* link; for (link = node->links; link - node->links != NCCL_TOPO_MAX_LINKS && link->remNode; link++) { if (link->remNode == remNode && link->type == type) break; } if (link - node->links == NCCL_TOPO_MAX_LINKS) { WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS); return ncclInternalError; } if (link->remNode == NULL) node->nlinks++; link->type = type; link->remNode = remNode; link->bw += bw; // Sort links in BW descending order struct ncclTopoLink linkSave; memcpy(&linkSave, link, sizeof(struct ncclTopoLink)); while (link != node->links) { if ((link-1)->bw >= linkSave.bw) break; memcpy(link, link-1, sizeof(struct ncclTopoLink)); link--; } memcpy(link, &linkSave, sizeof(struct ncclTopoLink)); return ncclSuccess; } // BCM Gen4 Switches present themselves as a two-level hierarchical switch // even though they're supposed to sustain full BW across all ports. // Flatten the switch as this extra level can break the search and make // NCCL take wrong topology decisions. 
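// Roughly, such a part enumerates as
//   parent switch -> { sub switch 0, sub switch 1, ... } -> endpoints
// and ncclTopoFlattenBcmSwitches() below re-attaches every endpoint directly to
// the parent, then drops the sub switches, so the search sees one flat switch.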
int getBcmGen(uint64_t id, int level) { if ((id & 0xfffffffffffff000) == 0x1000c0101000a000) return 4; if ((id & 0xfffffffffffff000) == (0x1000c03010000000 | level*0x1000)) return 5; return 0; } ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) { for (int s=0; snodes[PCI].count; s++) { struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s; int gen = getBcmGen(pciSwitch->pci.device, 0); // Flatten Gen4 PEX switches in base mode if (gen) { // Find sub switches with the same device ID. int64_t* subSwIds; NCCLCHECK(ncclCalloc(&subSwIds, pciSwitch->nlinks)); int subs = 0; for (int l=0; lnlinks; l++) { struct ncclTopoNode* sub = pciSwitch->links[l].remNode; // Only fuse sub switches with the same device ID. if (sub->type != PCI || getBcmGen(sub->pci.device, 1) != gen) continue; // Save sub switch for later subSwIds[subs++] = sub->id; // Remove link to that sub switch memmove(pciSwitch->links+l, pciSwitch->links+l+1, (pciSwitch->nlinks-l-1)*(sizeof(struct ncclTopoLink))); pciSwitch->nlinks--; // Don't increase l for the next iteration as we just shifted all links by one. l--; } for (int s=0; snodes[PCI].nodes is changing every time we remove a node) int index; NCCLCHECK(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index)); struct ncclTopoNode* sub = system->nodes[PCI].nodes+index; // Connect all sub PCI devices to the parent switch for (int l=0; lnlinks; l++) { struct ncclTopoNode* remNode = sub->links[l].remNode; if (remNode == pciSwitch) continue; // Add link from parent PCI switch -> PCI device if (pciSwitch->nlinks == NCCL_TOPO_MAX_LINKS) { WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS); return ncclInternalError; } memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink)); pciSwitch->nlinks++; // Update link from PCI device -> parent PCI switch for (int rl=0; rlnlinks; rl++) { if (remNode->links[rl].remNode == sub) { remNode->links[rl].remNode = pciSwitch; break; } } } NCCLCHECK(ncclTopoRemoveNode(system, PCI, index)); } // Set subdevice to 0xffff to make sure we don't merge this switch again. pciSwitch->pci.device |= 0xffff; free(subSwIds); // Restart, as system->nodes[PCI].nodes has changed. 
s = 0; } } return ncclSuccess; } ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) { // And connect all CPU nodes together for (int n=0; nnodes[CPU].count; n++) { struct ncclTopoNode* cpu1 = system->nodes[CPU].nodes+n; for (int p=0; pnodes[CPU].count; p++) { struct ncclTopoNode* cpu2 = system->nodes[CPU].nodes+p; if (n == p || (NCCL_TOPO_ID_SYSTEM_ID(cpu1->id) != NCCL_TOPO_ID_SYSTEM_ID(cpu2->id))) continue; float bw; NCCLCHECK(ncclTopoGetInterCpuBw(cpu1, &bw)); NCCLCHECK(ncclTopoConnectNodes(cpu1, cpu2, LINK_SYS, bw)); } } return ncclSuccess; } static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) { if (node->type == GPU) { sprintf(line+offset, "%s/%lx-%lx (%d)", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id), node->gpu.rank); } else if (node->type == CPU) { sprintf(line+offset, "%s/%lx-%lx (%d/%d/%d)", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id), node->cpu.arch, node->cpu.vendor, node->cpu.model); } else if (node->type == PCI) { sprintf(line+offset, "%s/%lx-%lx (%lx)", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id), node->pci.device); } else { sprintf(line+offset, "%s/%lx-%lx", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id)); } INFO(NCCL_GRAPH, "%s", line); for (int i=0; inlinks; l++) { struct ncclTopoLink* link = node->links+l; if (link->type == LINK_LOC) { sprintf(line+offset, "+ %s[%2.1f] - %s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], link->remNode->id); INFO(NCCL_GRAPH, "%s", line); } else if (link->type != LINK_PCI || link->remNode != prevNode) { sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw); int nextOffset = strlen(line); if (link->type == LINK_PCI) { NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset)); } else { if (link->remNode->type == NET) { sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw); } else { sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id); } INFO(NCCL_GRAPH, "%s", line); } } } return ncclSuccess; } ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) { INFO(NCCL_GRAPH, "=== System : maxBw %2.1f totalBw %2.1f ===", s->maxBw, s->totalBw); char line[1024]; for (int n=0; nnodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0)); INFO(NCCL_GRAPH, "=========================================="); NCCLCHECK(ncclTopoPrintPaths(s)); return ncclSuccess; } static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) { // Shift all links to have upLink as last link if (upNode) { int l=0; while (node->links[l].remNode != upNode) l++; struct ncclTopoLink upLink; memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink)); while (node->links[l+1].remNode) { memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink)); l++; } memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink)); } // Recursively sort the PCI tree for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node->links+l; if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node)); } return ncclSuccess; } // We want the graph to be organized to ease/accelerate traversal : // 1. 
NVLinks (already the case) // 2. PCI down // 3. PCI up // 4. SYS (already the case) ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) { for (int n=0; nnodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL)); return ncclSuccess; } ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int systemId) { int dev; NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev)); struct ncclTopoNode* net; NCCLCHECK(ncclTopoCreateNode(system, &net, NET, NCCL_TOPO_ID(systemId, dev))); net->net.dev = dev; const char* str; NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str)); if (str) sscanf(str, "0x%lx", &net->net.asic); else net->net.asic = dev; ncclDebugNoWarn = NCCL_GRAPH; int mbps; NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0)); if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1 net->net.bw = mbps / 8000.0; if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0; NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0)); ncclDebugNoWarn = 0; NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.bw)); NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.bw)); return ncclSuccess; } ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int systemId) { for (int s=0; snSubs; s++) { struct ncclXmlNode* xmlNet = xmlNic->subs[s]; if (strcmp(xmlNet->name, "net") != 0) continue; int index; NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index)); if (index == -1) continue; NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic, systemId)); } return ncclSuccess; } ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap)); NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank)); NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev)); NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport)); // Do not go any further, nvlinks will be added in a second pass return ncclSuccess; } struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } }; struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */ { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 }, { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent, int systemId) { const char* str; int type; NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str)); NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass)); int64_t busId; NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str)); NCCLCHECK(busIdToInt64(str, &busId)); struct ncclTopoNode* node = NULL; struct ncclXmlNode* xmlGpu = NULL; NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu)); if (xmlGpu != NULL) { type = GPU; int index; NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index)); if (index == -1) return ncclSuccess; NCCLCHECK(ncclTopoCreateNode(system, &node, 
type, NCCL_TOPO_ID(systemId, busId))); NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node)); } struct ncclXmlNode* xmlNic = NULL; NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic)); if (xmlNic != NULL) { type = NIC; // Ignore sub device ID and merge multi-port NICs into one PCI device. busId &= 0xfffffffffffffff0; struct ncclTopoNode* nicNode = NULL; int64_t id = NCCL_TOPO_ID(systemId, busId); NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, id)); if (nicNode == NULL) { NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, id)); node = nicNode; // Connect it to parent later on } NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode, systemId)); } else if (type == PCI) { NCCLCHECK(ncclTopoCreateNode(system, &node, type, NCCL_TOPO_ID(systemId, busId))); NCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str)); if (str) node->pci.device += strtol(str, NULL, 0) << 48; NCCLCHECK(xmlGetAttr(xmlPci, "device", &str)); if (str) node->pci.device += strtol(str, NULL, 0) << 32; NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str)); if (str) node->pci.device += strtol(str, NULL, 0) << 16; NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str)); if (str) node->pci.device += strtol(str, NULL, 0); for (int s=0; snSubs; s++) { struct ncclXmlNode* xmlSubPci = xmlPci->subs[s]; if (strcmp(xmlSubPci->name, "pcilink") != 0) { // PCI links will be added later NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId)); } } } if (node) { int width, speed; NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width)); NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str)); // Manage cases where speed was not indicated in /sys if (width == 0) width = 16; NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end) NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0)); NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0)); } return ncclSuccess; } struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } }; struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { " Shanghai ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } }; ncclResult_t ncclGetSystemId(struct ncclTopoSystem* system, struct ncclXmlNode* xmlCpu, int* systemIdPtr) { const char* hostHashStr; NCCLCHECK(xmlGetAttr(xmlCpu, "host_hash", &hostHashStr)); uint64_t hostHash = hostHashStr ? 
strtoull(hostHashStr, NULL, 16) : 0; int systemId; for (systemId=0; systemIdnHosts; systemId++) if (system->hostHashes[systemId] == hostHash) break; if (systemId == system->nHosts) system->hostHashes[system->nHosts++] = hostHash; *systemIdPtr = systemId; return ncclSuccess; } ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) { int numaId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId)); int systemId; NCCLCHECK(ncclGetSystemId(system, xmlCpu, &systemId)); struct ncclTopoNode* cpu; NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, NCCL_TOPO_ID(systemId, numaId))); const char* str; NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str)); if (str != NULL) { NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity)); } NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str)); NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch)); if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) { NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str)); NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor)); if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { int familyId, modelId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW; } else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { int familyId, modelId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG; } } for (int s=0; snSubs; s++) { struct ncclXmlNode* node = xmlCpu->subs[s]; if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId)); if (strcmp(node->name, "nic") == 0) { struct ncclTopoNode* nic = NULL; NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0)); if (nic == NULL) { NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, NCCL_TOPO_ID(systemId, 0))); NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW)); NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW)); } NCCLCHECK(ncclTopoAddNic(node, system, nic, systemId)); } } return ncclSuccess; } ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) { if (strcmp(node->name, "nvlink") == 0) { struct ncclTopoNode* gpu = NULL; int64_t pBusId; NCCLCHECK(busIdToInt64(parentBusId, &pBusId)); pBusId = NCCL_TOPO_ID(systemId, pBusId); NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId)); if (gpu == NULL) { WARN("Add NVLink error : could not find GPU %lx", pBusId); return ncclInternalError; } int count; NCCLCHECK(xmlGetAttrInt(node, "count", &count)); const char* targetClass; NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass)); int targetType; NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass)); struct ncclTopoNode* remote = NULL; if (targetType == GPU) { // NVL P2P connection to another GPU const char* target; NCCLCHECK(xmlGetAttrStr(node, "target", &target)); int64_t busId; NCCLCHECK(busIdToInt64(target, &busId)); NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, NCCL_TOPO_ID(systemId, busId))); } else if (targetType == CPU) { // NVL connection to the local CPU NCCLCHECK(findLocalCpu(gpu, &remote)); } else { if (system->nodes[NVS].count == 0) { NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0)); } else { remote = system->nodes[NVS].nodes; } } if (remote) { float nvlBw = ncclTopoNVLinkBw(gpu->gpu.cudaCompCap); NCCLCHECK(ncclTopoConnectNodes(gpu, 
remote, LINK_NVL, count*nvlBw)); if (remote->type != GPU) { NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlBw)); } } } else { if (strcmp(node->name, "cpu") == 0) { NCCLCHECK(ncclGetSystemId(system, node, &systemId)); } const char* busId; NCCLCHECK(xmlGetAttr(node, "busid", &busId)); for (int s=0; snSubs; s++) { NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId, systemId)); } } return ncclSuccess; } ncclResult_t ncclTopoAddPciLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) { if (strcmp(node->name, "pcilink") == 0) { struct ncclTopoNode* pci = NULL; int64_t pBusId; NCCLCHECK(busIdToInt64(parentBusId, &pBusId)); pBusId = NCCL_TOPO_ID(systemId, pBusId); NCCLCHECK(ncclTopoGetNode(system, &pci, PCI, pBusId)); if (pci == NULL) { WARN("Add PCI Link error : could not find PCI SW %lx", pBusId); return ncclInternalError; } struct ncclTopoNode* remote = NULL; const char* target; NCCLCHECK(xmlGetAttrStr(node, "target", &target)); int64_t busId; NCCLCHECK(busIdToInt64(target, &busId)); NCCLCHECK(ncclTopoGetNode(system, &remote, PCI, NCCL_TOPO_ID(systemId, busId))); if (remote) NCCLCHECK(ncclTopoConnectNodes(pci, remote, LINK_LOC, LOC_BW)); } else { if (strcmp(node->name, "cpu") == 0) { NCCLCHECK(ncclGetSystemId(system, node, &systemId)); } const char* busId; NCCLCHECK(xmlGetAttr(node, "busid", &busId)); for (int s=0; snSubs; s++) { NCCLCHECK(ncclTopoAddPciLinks(node->subs[s], system, busId ? busId : parentBusId, systemId)); } } return ncclSuccess; } ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) { if (strcmp(node->name, "c2c") == 0) { struct ncclTopoNode* gpu = NULL; int64_t pBusId; NCCLCHECK(busIdToInt64(parentBusId, &pBusId)); pBusId = NCCL_TOPO_ID(systemId, pBusId); NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId)); if (gpu == NULL) { WARN("Add NVLink error : could not find GPU %lx", pBusId); return ncclInternalError; } int count = 0; NCCLCHECK(xmlGetAttrInt(node, "count", &count)); int bw = 0; NCCLCHECK(xmlGetAttrInt(node, "bw", &bw)); double c2cBw = (bw*count)/1000.0; struct ncclTopoNode* cpu = NULL; NCCLCHECK(findLocalCpu(gpu, &cpu)); if (cpu == NULL) return ncclSuccess; NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw)); NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw)); } else { if (strcmp(node->name, "cpu") == 0) { NCCLCHECK(ncclGetSystemId(system, node, &systemId)); } const char* busId; NCCLCHECK(xmlGetAttr(node, "busid", &busId)); for (int s=0; snSubs; s++) { NCCLCHECK(ncclTopoAddC2c(node->subs[s], system, busId ? 
busId : parentBusId, systemId)); } } return ncclSuccess; } ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, const uint64_t localHostHash) { NCCLCHECK(ncclCalloc(topoSystem, 1)); struct ncclTopoSystem* system = *topoSystem; struct ncclXmlNode* topNode; NCCLCHECK(xmlFindTag(xml, "system", &topNode)); for (int s=0; snSubs; s++) { struct ncclXmlNode* node = topNode->subs[s]; if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem)); } for (int systemId=0; systemIdnHosts; systemId++) if (system->hostHashes[systemId] == localHostHash) system->systemId = systemId; NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0)); NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0)); NCCLCHECK(ncclTopoAddPciLinks(topNode, *topoSystem, NULL, 0)); NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem)); NCCLCHECK(ncclTopoConnectCpus(*topoSystem)); NCCLCHECK(ncclTopoSortSystem(*topoSystem)); return ncclSuccess; } NCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0); // Only set values if not already set static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value); } return ncclSuccess; } static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attrName, const uint64_t value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%lx", value); } return ncclSuccess; } static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); snprintf(node->attrs[index].value, MAX_STR_LEN, "%f", value); } return ncclSuccess; } ncclResult_t ncclTopoRefreshBcmP2pLinks(void) { //refresh the switch topology by reading the link below FILE *fp = fopen("/sys/kernel/pci_switch_link/refresh_switch_toplogy", "r"); if (fp != NULL) { int tmp; size_t r = fread(&tmp, sizeof(tmp), 1, fp); if (r != 1) INFO(NCCL_GRAPH, "Failed to read refresh_switch_toplogy"); fclose(fp); } return ncclSuccess; } ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { struct ncclXml* xml; NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES)); const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE"); if (xmlTopoFile) { INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile); NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1)); } else { // Try default XML topology location NCCLCHECK(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0)); } if (xml->maxIndex == 0) { // Create top tag struct ncclXmlNode* top; NCCLCHECK(xmlAddNode(xml, NULL, "system", &top)); NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION)); } NCCLCHECK(ncclTopoRefreshBcmP2pLinks()); // Detect only the GPU managed by this process. We'll get any others through XML fusion. 
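// Rough sketch of the per-rank XML fragment assembled by the code below. Attribute values are
// illustrative only, not taken from a real system; the tag and attribute names match the ones
// set or read elsewhere in this file (keep/rank/gdr are added just below, sm/dev are filled by
// ncclTopoFillGpu, numaid/busid/link_* are consumed by ncclTopoAddCpu/ncclTopoAddPci):
//   <system version="...">
//     <cpu numaid="0" arch="x86_64" vendor="GenuineIntel" ...>
//       <pci busid="0000:17:00.0" class="0x030000" link_speed="16.0 GT/s PCIe" link_width="16">
//         <gpu dev="0" sm="90" rank="0" gdr="1" keep="1"/>
//       </pci>
//     </cpu>
//   </system>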
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; NCCLCHECK(int64ToBusId(comm->peerInfo[comm->rank].busId, busId)); struct ncclXmlNode* node; NCCLCHECK(ncclTopoFillGpu(xml, busId, &node)); if (node) { NCCLCHECK(xmlSetAttrInt(node, "keep", 1)); NCCLCHECK(xmlSetAttrInt(node, "rank", comm->rank)); NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport)); } // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. int netDevCount = 0; if (collNetSupport(comm)) { NCCLCHECK(collNetDevices(comm, &netDevCount)); for (int n=0; ndmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name); NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1)); } } if (netDevCount == 0) { NCCLCHECK(comm->ncclNet->devices(&netDevCount)); } for (int n=0; nncclNet->getProperties(n, &props)); comm->netDeviceType = props.netDeviceType; struct ncclXmlNode* netNode; NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode)); NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1)); NCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed)); NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port)); NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency)); NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid)); NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms)); bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name); NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); } // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) NCCLCHECK(ncclTopoTrimXml(xml)); // XML topo fusion. int* localRanks; int localRank = -1, nLocalRanks = 0; if (comm->MNNVL) { // MNNVL clique support nLocalRanks = comm->clique.size; localRank = comm->cliqueRank; localRanks = comm->clique.ranks; } else { // Intra-node fusion. Much of the comm is not initialized yet at this point so we need to do our own calculations. NCCLCHECK(ncclCalloc(&localRanks, comm->nRanks)); for (int i = 0; i < comm->nRanks; i++) { if (comm->peerInfo[i].hostHash == comm->peerInfo[comm->rank].hostHash) { if (i == comm->rank) localRank = nLocalRanks; localRanks[nLocalRanks++] = i; } } } char* mem; NCCLCHECK(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES))); struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank); memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)); NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1)); NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES))); if (comm->MNNVL) { // Ensure that we have enough room when fusing topos from multiple nodes. free(xml); NCCLCHECK(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES)); } else { // In the intra-node case there's no need to enlarge the topo xml. 
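// Note on the reset below: clearing maxIndex empties the destination tree. Our own topology is
// not lost because the copy serialized into the gathered buffer (slot localRank) is fused back
// in by the loop that follows, together with the peers' topologies.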
xml->maxIndex = 0; free(localRanks); } for (int i = 0; i < nLocalRanks; i++) { struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i); NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0)); NCCLCHECK(ncclTopoFuseXml(xml, peerXml)); } free(mem); xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE"); if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) { INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile); NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml)); } NCCLCHECK(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash)); free(xml); return ncclSuccess; } ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) { int minType = PATH_DIS; float maxBw = 0; int count = 0; NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count)); struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType]; for (int i=0; inodes[resultType].count; i++) { if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) { maxBw = paths[i].bw; minType = paths[i].type; if (pathType) *pathType = minType; count = 0; } if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i; } *localCount = count; return ncclSuccess; } ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count) { int localNetCount = 0, netCountByBw = 0; int* localNets; float totalNetBw = 0, gpuBw = 0; for (int l=0; lnodes[GPU].nodes[gpu].nlinks; l++) { //assuming BW to CPU reflects the GPU bandwidth via P2P or C2C //caveat, this could be wrong if there is a PCIe switch, //and a narrower link to the CPU if (system->nodes[GPU].nodes[gpu].links[l].remNode->type == CPU) { gpuBw = system->nodes[GPU].nodes[gpu].links[l].bw; } } NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL)); for (int l=0; (l < localNetCount) && (totalNetBw < gpuBw); l++, netCountByBw++) { totalNetBw += system->nodes[GPU].nodes[gpu].paths[NET][localNets[l]].bw; } *count = netCountByBw; free(localNets); return ncclSuccess; } ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { int gpu; NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu)); int* localNets; int localNetCount; NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL)); int* localGpus = NULL; int localGpuCount; NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL)); int net = system->nodes[GPU].nodes[gpu].gpu.dev; if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount); net += channelId%(DIVUP(localNetCount,localGpuCount)); if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id; if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev; free(localNets); free(localGpus); return ncclSuccess; } ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) { int netIndex; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex)); int* localGpus = NULL; int localGpuCount; NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL)); for (int c=0; cnodes[GPU].nodes+g; int64_t id; NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL)); if (netId == id) { *gpuIndex = g; free(localGpus); return ncclSuccess; } } } free(localGpus); *gpuIndex = -1; return ncclSuccess; } /****************************/ 
/* External query functions */ /****************************/ ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model) { *arch = system->nodes[CPU].nodes[0].cpu.arch; *vendor = system->nodes[CPU].nodes[0].cpu.vendor; *model = system->nodes[CPU].nodes[0].cpu.model; return ncclSuccess; } NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0); ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) { struct ncclTopoNode* cpu = NULL, *gpu = NULL; for (int g=0; gnodes[GPU].count; g++) { if (system->nodes[GPU].nodes[g].gpu.rank == rank) { gpu = system->nodes[GPU].nodes+g; // Find closer CPU int cpuIndex = -1, minHops = 0; for (int c=0; cnodes[CPU].count; c++) { int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count; if (cpuIndex == -1 || nHops < minHops) { cpuIndex = c; minHops = nHops; } } cpu = system->nodes[CPU].nodes+cpuIndex; } } if (cpu == NULL) { WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank); return ncclInternalError; } // Query the CPU affinity set we were provided cpu_set_t mask; SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity"); #ifdef ENABLE_TRACE { char affinityStr[sizeof(cpu_set_t)*2]; NCCLCHECK(ncclCpusetToStr(&mask, affinityStr)); TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, affinityStr); } #endif // Get the affinity of the CPU close to our GPU. cpu_set_t cpuMask = cpu->cpu.affinity; #ifdef ENABLE_TRACE { char affinityStr[sizeof(cpu_set_t)*2]; NCCLCHECK(ncclCpusetToStr(&cpuMask, affinityStr)); TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, affinityStr); } #endif cpu_set_t finalMask; if (ncclParamIgnoreCpuAffinity()) // Ignore the CPU affinity set and use the GPU one instead finalMask = cpuMask; else // Use a subset of the GPU affinity set CPU_AND(&finalMask, &mask, &cpuMask); memcpy(affinity, &finalMask, sizeof(cpu_set_t)); // If there is a non empty set, use it to set affinity if (CPU_COUNT(&finalMask)) { char affinityStr[sizeof(cpu_set_t)*2]; NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr)); INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr); } return ncclSuccess; } ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count) { *count = system->nodes[GPU].count; return ncclSuccess; } ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) { *count = system->nodes[NET].count; return ncclSuccess; } ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count) { *count = system->nodes[NVS].count; return ncclSuccess; } ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) { if (system->nodes[GPU].count == 0) return ncclInternalError; int min, max; min = max = system->nodes[GPU].nodes[0].gpu.cudaCompCap; for (int g=1; gnodes[GPU].count; g++) { min = std::min(min, system->nodes[GPU].nodes[g].gpu.cudaCompCap); max = std::max(max, system->nodes[GPU].nodes[g].gpu.cudaCompCap); } if (ccMin) *ccMin = min; if (ccMax) *ccMax = max; return ncclSuccess; } nccl-2.22.3-1/src/graph/topo.h000066400000000000000000000157371463451655400157310ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_TOPO_H_ #define NCCL_TOPO_H_ #include "graph.h" #include "core.h" #define LOC_BW 5000.0 #define SM60_NVLINK_BW 18.0 #define SM70_NVLINK_BW 20.0 #define SM80_NVLINK_BW 20.0 #define SM90_NVLINK_BW 20.6 #define SM86_NVLINK_BW 12.0 #define PCI_BW 12.0 // PCI Gen3 x16 #define QPI_BW 6.0 #define AMD_BW 16.0 #define SKL_QPI_BW 10.0 #define ZPI_BW 6.0 #define YONGFENG_ZPI_BW 9.0 #define P9_BW 32.0 #define ARM_BW 6.0 #define NET_BW 12.0 // 100Gbit // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU // to GPU traffic consumes more PCI bandwidth. #define INTEL_P2P_OVERHEAD(bw) (bw*6/5) #define NCCL_TOPO_NODE_TYPES 7 #define GPU 0 #define PCI 1 #define NVS 2 #define CPU 3 // Actually NUMA domains #define NIC 4 #define NET 5 extern const char* topoNodeTypeStr[]; // We want link types and path types to match as much as possible #define LINK_LOC 0 #define LINK_NVL 1 // Skipping 2 for PATH_NVB #define LINK_PCI 3 // Skipping 4 for PATH_PXB // Skipping 5 for PATH_PXN // Skipping 6 for PATH_PHB #define LINK_SYS 7 #define LINK_NET 8 extern const char* topoLinkTypeStr[]; // Local (myself) #define PATH_LOC 0 // Connection traversing NVLink #define PATH_NVL 1 // Connection through NVLink using an intermediate GPU #define PATH_NVB 2 // Connection traversing at most a single PCIe bridge #define PATH_PIX 3 // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) #define PATH_PXB 4 // Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. #define PATH_PXN 5 // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) #define PATH_PHB 6 // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) #define PATH_SYS 7 // Connection through the network #define PATH_NET 8 // Disconnected #define PATH_DIS 9 extern const char* topoPathTypeStr[]; struct ncclTopoNode; struct ncclTopoLink { int type; float bw; struct ncclTopoNode* remNode; }; #define NCCL_TOPO_MAX_LINKS 128 #define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES) struct ncclTopoLinkList { struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS]; int count; float bw; int type; }; #define NCCL_TOPO_CPU_INTEL_BDW 1 #define NCCL_TOPO_CPU_INTEL_SKL 2 #define NCCL_TOPO_UNDEF (-1) #define NCCL_TOPO_ID_SYSTEM_ID(id) (id >> 56) #define NCCL_TOPO_ID_LOCAL_ID(id) (id & 0x00ffffffffffffff) #define NCCL_TOPO_ID(systemid, localid) (((int64_t)systemid << 56) + localid) struct ncclTopoNode { int type; int64_t id; // Type specific data union { struct { int dev; // NVML dev number int rank; int cudaCompCap; int gdrSupport; }gpu; struct { int dev; // Plugin dev number uint64_t asic; int port; float bw; float latency; int gdrSupport; int collSupport; int maxChannels; }net; struct { int arch; int vendor; int model; cpu_set_t affinity; }cpu; struct { uint64_t device; }pci; }; int nlinks; struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS]; // Pre-computed paths to GPUs and NICs struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES]; // Used during search uint64_t used; }; struct ncclTopoNodeSet { int count; struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES]; }; struct ncclTopoSystem { int systemId; uint64_t hostHashes[NCCL_TOPO_MAX_NODES]; int nHosts; struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]; float maxBw; float totalBw; }; ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, 
struct ncclTopoNode** node, int type, uint64_t id); ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id); ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id); ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw); ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system); ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank); #define NCCL_TOPO_XML_MAX_NODES 256 #define NCCL_GRAPH_XML_MAX_NODES 4096 ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, uint64_t localHostHash); ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels); ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml); ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax); static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) { *index = -1; for (int i=0; inodes[type].count; i++) { if (system->nodes[type].nodes[i].id == id) { *index = i; return ncclSuccess; } } return ncclInternalError; } static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index) { *index = -1; for (int i=0; inodes[GPU].count; i++) { if (system->nodes[GPU].nodes[i].gpu.rank == rank) { *index = i; return ncclSuccess; } } return ncclInternalError; } static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, int* rank) { *rank = -1; for (int i=0; inodes[GPU].count; i++) { if (NCCL_TOPO_ID_SYSTEM_ID(system->nodes[GPU].nodes[i].id) != system->systemId) continue; // Only consider GPUs on our node if (system->nodes[GPU].nodes[i].gpu.dev == dev) { *rank = system->nodes[GPU].nodes[i].gpu.rank; return ncclSuccess; } } return ncclInternalError; } static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id, int* netDev) { *netDev = -1; for (int i=0; inodes[NET].count; i++) { if (system->nodes[NET].nodes[i].id == id) { *netDev = system->nodes[NET].nodes[i].net.dev; return ncclSuccess; } } WARN("Could not find NET with id %lx", id); return ncclInternalError; } // Returns NVLink bw in GB/s static float ncclTopoNVLinkBw(int cudaCompCap) { return cudaCompCap >= 90 ? SM90_NVLINK_BW : cudaCompCap == 86 ? SM86_NVLINK_BW : cudaCompCap >= 80 ? SM80_NVLINK_BW : cudaCompCap >= 70 ? SM70_NVLINK_BW : cudaCompCap >= 60 ? SM60_NVLINK_BW : SM80_NVLINK_BW; } // Mirror bits static bool isPow2(int val) { return (val & (val-1)) == 0; } static int mirrorBits(int val, int pow2) { int mirror = 0; for (int b=1, mb=(pow2>>1); b>=1) if (val & b) mirror |= mb; return mirror; } #endif nccl-2.22.3-1/src/graph/trees.cc000066400000000000000000000074171463451655400162240ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank) /* Btree which alternates leaves and nodes. 
* Assumes root is 0, which conveniently builds a tree on powers of two, * (because we have pow2-1 ranks) which lets us manipulate bits. * Find first non-zero bit, then : * Find the parent : * xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below) * xx11[0] -> xx10[0] (3,7,11 below) * Find the children : * xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13) * xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13) * * Illustration : * 0---------------8 * ______/ \______ * 4 12 * / \ / \ * 2 6 10 \ * / \ / \ / \ \ * 1 3 5 7 9 11 13 */ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) { int up, down0, down1; int bit; for (bit=1; bit 0 so it has to be our child 1, not 0. *d1 = nranks > 1 ? bit >> 1 : -1; return ncclSuccess; } up = (rank ^ bit) | (bit << 1); // if smaller than the parent, we are his first child, otherwise we're his second if (up >= nranks) up = (rank ^ bit); *parentChildType = (rank < up) ? 0 : 1; *u = up; int lowbit = bit >> 1; // down0 is always within bounds down0 = lowbit == 0 ? -1 : rank-lowbit; down1 = lowbit == 0 ? -1 : rank+lowbit; // Make sure down1 is within bounds while (down1 >= nranks) { down1 = lowbit == 0 ? -1 : rank+lowbit; lowbit >>= 1; } *d0 = down0; *d1 = down1; return ncclSuccess; } /* Build a double binary tree. Take the previous tree for the first tree. * For the second tree, we use a mirror tree (if nranks is even) * * 0---------------8 3----------------11 * ______/ \ / \______ * 4 \ / 7 * / \ \ / / \ * 2 6 10 1 5 9 * / \ / \ / \ / \ / \ / \ * 1 3 5 7 9 11 0 2 4 6 8 10 * * or shift it by one rank (if nranks is odd). * * 0---------------8 1---------------9 * ______/ \______ ______/ \______ * 4 12 5 0 * / \ / / \ / * 2 6 10 3 7 11 * / \ / \ / \ / \ / \ / \ * 1 3 5 7 9 11 2 4 6 8 10 12 */ ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) { // First tree ... use a btree ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0); // Second tree ... mirror or shift if (nranks % 2 == 1) { // shift int shiftrank = (rank-1+nranks) % nranks; int u, d0, d1; ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1); *s1 = u == -1 ? -1 : (u+1) % nranks; *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks; *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks; } else { // mirror int u, d0, d1; ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1); *s1 = u == -1 ? -1 : nranks-1-u; *d1_0 = d0 == -1 ? -1 : nranks-1-d0; *d1_1 = d1 == -1 ? -1 : nranks-1-d1; } return ncclSuccess; } nccl-2.22.3-1/src/graph/tuning.cc000066400000000000000000000512001463451655400163730ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "core.h" #include "device.h" #include "comm.h" #include "topo.h" NCCL_PARAM(Nthreads, "NTHREADS", -2); NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2); static int getNthreads(const char* name, int env, int min, int max, int def) { int nt = env; if (nt > 0) { if (nt % WARP_SIZE != 0) { WARN("Invalid %s %d (must be a multiple of %d)", name, nt, WARP_SIZE); nt = max; } else if (nt > max) { WARN("Invalid %s %d (maximum %d).", name, nt, max); nt = max; } else if (nt < min) { WARN("Invalid %s %d (minimum %d).", name, nt, min); nt = min; } } else { nt = def; } return nt; } ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) { int def, set; if (str[0] == '^') { def = 1; set = 0; str++; } else { def = 0; set = 1; } for (int i=0; icpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0; if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0; return 1.0; } ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) { int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads); comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS); comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] = NCCL_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS); comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS); int nNodes = comm->nNodes; int nRanks = comm->nRanks; if (nRanks <= 1) return ncclSuccess; int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX; int cpuArch, cpuVendor, cpuModel; NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); int index2 = nNodes <= 2 ? nNodes-1 : 2; // LL: for single node, we look at GPU type; for multi-node, we look at CPU type int index1 = nNodes == 1 ? compCapIndex : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 
1 : 0; double llMaxBw = llMaxBws[index1][index2]; double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2]; double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2]; double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2]; // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; for (int a=0; atypeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI; for (int a=0; a 1 ? 2*nNodes :0) : coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 : nNodes; for (int a=0; abwIntra : graphs[a]->bwInter; if (a == NCCL_ALGO_NVLS) bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter); if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2); float busBw = graphs[a]->nChannels * bw; // Various model refinements if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); } if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw); if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw); if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85; if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) { if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) { busBw = ppn * bw; // AllGather/ReduceScatter requires 1:1 GPU:NIC int nicPerNode = comm->collNetHeadsNum; if (coll == ncclFuncAllGather && comm->nNodes > 1) { if (!comm->ncclCollNet || !comm->ncclCollNet->iallgather || ppn > nicPerNode) busBw = 0; } if (coll == ncclFuncReduceScatter && comm->nNodes > 1) { if (!comm->ncclCollNet || !comm->ncclCollNet->ireducescatter || ppn > nicPerNode) busBw = 0; } // Measured corrective ratio needed at 1 ppn and 8ppn. Here we hackishly // interpolate the two. 
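// Numerically (following directly from the formula below): at ppn=1, w=0 and busBw is scaled by
// 0.95; at ppn=8, w=1 and the scale is 0.85; in between, e.g. ppn=4, w=3/7~0.43 and the scale is
// ~0.43*0.85 + 0.57*0.95 ~ 0.91.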
float w = (ppn-1)/(8-1); busBw *= w*0.85 + (1-w)*0.95; } else { // Collnet+Direct requires all GPUs to have a local NIC to work at full speed float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio factor -= (factor-1)/2; busBw /= factor; if (minCompCap >= 90) busBw *= .85; } } // Convert bus BW to algorithm BW if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) { float ratio = 1.0f; if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps; else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0; else ratio *= .5; busBw *= ratio; } comm->bandwidths[coll][a][p] = busBw; /* Ring bandwidth backup */ if (a == NCCL_ALGO_RING) comm->ringbdw[coll][p] = comm->bandwidths[coll][NCCL_ALGO_RING][p]; comm->latencies[coll][a][p] = baseLat[a][p]; float intraLat = hwLat[intraHw[a]][a][p]; float interLat = hwLat[NCCL_HW_NET][a][p] + graphs[a]->latencyInter; // Also add the flush extra latency if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter; if (a == NCCL_ALGO_RING) { float lat = hwLat[hw[a]][a][p]; if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) { if (graphs[a]->sameChannels) { comm->latencies[coll][a][p] += lat; } else { if (p == NCCL_PROTO_SIMPLE) lat = hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling comm->latencies[coll][a][p] += nsteps*lat; } } else { // Inter-node rings still have to launch nsteps * net overhead. float netOverhead = 0.0; if (nNodes > 1) { netOverhead = getNetOverhead(comm); if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3; } intraLat = std::max(intraLat, netOverhead); comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat; } } else if (a == NCCL_ALGO_TREE) { comm->latencies[coll][a][p] += 2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat); } else if (a == NCCL_ALGO_COLLNET_DIRECT) { comm->latencies[coll][a][p] += 2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.4) + interLat; // Add 0.4 us arity serialization latency } else if (a == NCCL_ALGO_COLLNET_CHAIN) { comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat; } else if (a == NCCL_ALGO_NVLS) { comm->latencies[coll][a][p] = intraLat; if (nNodes > 1) comm->latencies[coll][a][p] += interLat; } else if (a == NCCL_ALGO_NVLS_TREE) { comm->latencies[coll][a][p] += intraLat + 2 * log2i(nNodes) * interLat; } } } } // Protocols/Algorithms enable/disable, and user overrides. // All are enabled except ll128 which is enabled by default only in certain cases. 
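// Example of how the user overrides behave, as implemented by parseList() above (the protocol and
// algorithm name strings are taken from ncclProtoStr/ncclAlgoStr): a comma-separated list such as
// NCCL_PROTO="LL,Simple" enables only the listed protocols, whereas the negated form
// NCCL_PROTO="^LL128" starts from everything enabled and disables just LL128. The value 2 used for
// LL128 below marks it as "enabled by default only where known safe", which is resolved against
// the topology checks later in this function.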
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 }; const char *protoStr = ncclGetEnv("NCCL_PROTO"); if (protoStr) { INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr); NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable)); } const char *algoStr = ncclGetEnv("NCCL_ALGO"); if (algoStr) { INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr); NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable)); } if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0; // Disable CollNet if it is not supported if (comm->collNetSupport == 0) { algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0; if (nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0; // If user has hard set NCCL_ALGO=COLLNET, ignore it if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 && algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) { algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1; } } else { // Disable CollNet+Direct if not on an NVSwitch system int nvsCount = 0; NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount)); if (nvsCount == 0) algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; } for (int c=0; ctypeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN)); pEnable &= (graphs[a]->typeIntra <= PATH_NVB); pEnable &= (minCompCap == maxCompCap); switch (minCompCap) { case 70: pEnable &= 1; break; case 80: pEnable &= 1; break; case 90: pEnable &= !(CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2); break; default: pEnable &= 0; break; } } if (pEnable == 0) comm->bandwidths[c][a][p] = 0; if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; if (a == NCCL_ALGO_RING && pEnable == 0) comm->ringbdw[c][p] = 0; } for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) { bool available = false; for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) if (comm->bandwidths[c][a][p] != 0) { available = true; goto check_avail; } check_avail: if (available == false) { /* at least set ring algo available */ for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) comm->bandwidths[c][NCCL_ALGO_RING][p] = comm->ringbdw[c][p]; } } if (comm->rank == 0) { char line[1024]; for (int block=0; block<2; block++) { sprintf(line, " Algorithm |"); for (int ba=0; bamaxThreads[a][p]); } } INFO(NCCL_TUNING, "%s", line); for (int c=0; clatencies[c][a][p], comm->bandwidths[c][a][p]); } } INFO(NCCL_TUNING, "%s", line); } } } // Set per-thread amount of work before we increase nThreads and nChannels for (int a=0; athreadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD; comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD; comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD; } comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks; comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = 512; comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = 512; // Override defaults with user env const char* str = ncclGetEnv("NCCL_THREAD_THRESHOLDS"); if (str) { INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str); ssize_t t[2][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }}; sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2); for (int a=0; a<2; a++) { for (int p=0; p= 0) comm->threadThresholds[a][p] = t[a][p]; } } } INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | 
%ld/%ld/%ld | %ld | %ld", comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL], comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128], comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE], comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE]); return ncclSuccess; } // Trees are not perfectly sticking to the model for medium sizes. Applying a static correction // factor is not ideal but works quite well. Powers of two, 64 B to 256MB. static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = { { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .4, .4, .5, .6, .7, .8, .9, 1.0, 1.0, 1.0, 1.0 }, { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .7, .6, .6, .6, .6, .6, .6, .8, .9, .9, .9, .9, 1.0, 1.0 }, { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 } }; ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup) { float bw = comm->bandwidths[coll][algorithm][protocol]; float lat = comm->latencies[coll][algorithm][protocol]; if (backup) { *backup = false; if (algorithm == NCCL_ALGO_RING && bw == 0.0f) { /* try back up RING algorithm */ bw = comm->ringbdw[coll][protocol]; *backup = true; } } if (bw == 0) { *time = -1.0; return ncclSuccess; } int logSize = log2i(nBytes>>6); if (algorithm == NCCL_ALGO_TREE && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize]; if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && comm->nNodes > 1 && coll == ncclFuncAllReduce && nBytes/(comm->nChannels*comm->nRanks) >= 64) { lat *= comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring } // Tree pipelining saves latency in aggregation cases int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_DEV_WORK_BATCH_COLLS); *time = lat * latCount + nBytes / (1000 * bw); return ncclSuccess; } nccl-2.22.3-1/src/graph/xml.cc000066400000000000000000001060321463451655400156730ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include #include #include #include #include #include #include "core.h" #include "nvmlwrap.h" #include "xml.h" #if defined(__x86_64__) #include #endif /*******************/ /* XML File Parser */ /*******************/ ncclResult_t xmlGetChar(FILE* file, char* c) { if (fread(c, 1, 1, file) == 0) { WARN("XML Parse : Unexpected EOF"); return ncclInternalError; } return ncclSuccess; } ncclResult_t xmlGetValue(FILE* file, char* value, char* last) { char c; NCCLCHECK(xmlGetChar(file, &c)); if (c != '"' && c != '\'') { #if INT_OK int o = 0; do { value[o++] = c; NCCLCHECK(xmlGetChar(file, &c)); } while (c >= '0' && c <= '9'); value[o] = '\0'; *last = c; return ncclSuccess; #else WARN("XML Parse : Expected (double) quote."); return ncclInternalError; #endif } int o = 0; do { NCCLCHECK(xmlGetChar(file, &c)); value[o++] = c; } while (c != '"'); value[o-1] = '\0'; NCCLCHECK(xmlGetChar(file, last)); return ncclSuccess; } ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) { char c; char* ptr = name; int o = 0; do { NCCLCHECK(xmlGetChar(file, &c)); if (c == '=') { ptr[o] = '\0'; if (value == NULL) { WARN("XML Parse : Unexpected value with name %s", ptr); return ncclInternalError; } return xmlGetValue(file, value, last); } ptr[o] = c; if (o == MAX_STR_LEN-1) { ptr[o] = '\0'; WARN("Error : name %s too long (max %d)", ptr, MAX_STR_LEN); return ncclInternalError; } o++; } while (c != ' ' && c != '>' && c != '/' && c != '\n' && c != '\r'); ptr[o-1] = '\0'; *last = c; return ncclSuccess; } // Shift the 3-chars string by one char and append c at the end #define SHIFT_APPEND(s, c) do { s[0]=s[1]; s[1]=s[2]; s[2]=c; } while(0) ncclResult_t xmlSkipComment(FILE* file, char* start, char next) { // Start from something neutral with \0 at the end. char end[4] = "..."; // Inject all trailing chars from previous reads. We don't need // to check for --> here because there cannot be a > in the name. for (int i=0; i" while (strcmp(end, "-->") != 0) { int c; if (fread(&c, 1, 1, file) != 1) { WARN("XML Parse error : unterminated comment"); return ncclInternalError; } SHIFT_APPEND(end, c); } return ncclSuccess; } ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) { node->type = NODE_TYPE_NONE; char c = ' '; while (c == ' ' || c == '\n' || c == '\r') { if (fread(&c, 1, 1, file) == 0) return ncclSuccess; } if (c != '<') { WARN("XML Parse error : expecting '<', got '%c'", c); return ncclInternalError; } // Read XML element name NCCLCHECK(xmlGetToken(file, node->name, NULL, &c)); // Check for comments if (strncmp(node->name, "!--", 3) == 0) { NCCLCHECK(xmlSkipComment(file, node->name+3, c)); return xmlGetNode(file, node); } // Check for closing tag if (node->name[0] == '\0' && c == '/') { node->type = NODE_TYPE_CLOSE; // Re-read the name, we got '/' in the first call NCCLCHECK(xmlGetToken(file, node->name, NULL, &c)); if (c != '>') { WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name); return ncclInternalError; } return ncclSuccess; } node->type = NODE_TYPE_OPEN; // Get Attributes int a = 0; while (c == ' ') { NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c)); if (a == MAX_ATTR_COUNT) { INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT); // Actually we need to still consume the extra attributes so we have an extra one. 
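// (In other words, the parser keeps writing into a spare attribute slot past MAX_ATTR_COUNT so the
// token stream stays in sync; the assumption here is that attrs[] is sized with one extra entry for
// exactly this purpose, and only the first MAX_ATTR_COUNT attributes are ultimately kept.)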
} else a++; } node->nAttrs = a; if (c == '/') { node->type = NODE_TYPE_SINGLE; char str[MAX_STR_LEN]; NCCLCHECK(xmlGetToken(file, str, NULL, &c)); } if (c != '>') { WARN("XML Parse : expected >, got '%c'", c); return ncclInternalError; } return ncclSuccess; } typedef ncclResult_t (*xmlHandlerFunc_t)(FILE*, struct ncclXml*, struct ncclXmlNode*); struct xmlHandler { const char * name; xmlHandlerFunc_t func; }; ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head, struct xmlHandler handlers[], int nHandlers) { if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess; while (1) { if (xml->maxIndex == xml->maxNodes) { WARN("Error : XML parser is limited to %d nodes", xml->maxNodes); return ncclInternalError; } struct ncclXmlNode* node = xml->nodes+xml->maxIndex; memset(node, 0, sizeof(struct ncclXmlNode)); NCCLCHECK(xmlGetNode(file, node)); if (node->type == NODE_TYPE_NONE) { if (head) { WARN("XML Parse : unterminated %s", head->name); return ncclInternalError; } else { // All done return ncclSuccess; } } if (head && node->type == NODE_TYPE_CLOSE) { if (strcmp(node->name, head->name) != 0) { WARN("XML Mismatch : %s / %s", head->name, node->name); return ncclInternalError; } return ncclSuccess; } int found = 0; for (int h=0; hname, handlers[h].name) == 0) { if (head) { if (head->nSubs == MAX_SUBS) { WARN("Error : XML parser is limited to %d subnodes", MAX_SUBS); return ncclInternalError; } head->subs[head->nSubs++] = node; } node->parent = head; node->nSubs = 0; xml->maxIndex++; NCCLCHECK(handlers[h].func(file, xml, node)); found = 1; break; } } if (!found) { if (nHandlers) INFO(NCCL_GRAPH, "Ignoring element %s", node->name); NCCLCHECK(xmlLoadSub(file, xml, node, NULL, 0)); } } } /**************/ /* XML Writer */ /**************/ // exp == 1 -- serialize; exp == 0 -- deserialize ncclResult_t ncclTopoConvertXml(struct ncclXml* xml, uintptr_t base, int exp) { for (int n = 0; n < xml->maxIndex; n++) { struct ncclXmlNode *node = &xml->nodes[n]; // For "parent", we shift the base by 1 so that we can distinguish actual // NULL pointers from pointers pointing to the first node. if (node->parent) node->parent = (struct ncclXmlNode *) (exp ? ((uintptr_t)node->parent - base + 1) : (base - 1 + (uintptr_t)node->parent)); for (int s = 0; s < node->nSubs; s++) { node->subs[s] = (struct ncclXmlNode *) (exp ? 
((uintptr_t)node->subs[s] - base) : (base + (uintptr_t)node->subs[s])); } } return ncclSuccess; } ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNode* node) { for (int i=0; iname); for (int a=0; anAttrs; a++) { fprintf(file, " %s=\"%s\"", node->attrs[a].key, node->attrs[a].value); } if (node->nSubs == 0) { fprintf(file, "/>\n"); } else { fprintf(file, ">\n"); for (int s=0; snSubs; s++) { NCCLCHECK(ncclTopoDumpXmlRec(indent+2, file, node->subs[s])); } for (int i=0; i\n", node->name); } return ncclSuccess; } ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml) { FILE* file = fopen(xmlTopoFile, "w"); if (file == NULL) { WARN("Unable to open %s, not dumping topology.", xmlTopoFile); return ncclSuccess; } NCCLCHECK(ncclTopoDumpXmlRec(0, file, xml->nodes)); fclose(file); return ncclSuccess; } static ncclResult_t xmlTopoFuseXmlRecursive(struct ncclXml* dst, struct ncclXmlNode* dstParent, struct ncclXmlNode* srcParent) { for (int i = 0; i < srcParent->nSubs; i++) { struct ncclXmlNode* srcNode = srcParent->subs[i]; struct ncclXmlNode* dstNode; NCCLCHECK(xmlFindNode(dstParent, srcNode, &dstNode)); if (dstNode == NULL) { NCCLCHECK(xmlAddTree(dst, dstParent, srcNode)); } else { NCCLCHECK(xmlTopoFuseXmlRecursive(dst, dstNode, srcNode)); } } return ncclSuccess; } ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src) { struct ncclXmlNode* topNodeDst; NCCLCHECK(xmlFindTag(dst, "system", &topNodeDst)); if (topNodeDst == NULL) { xmlAddTree(dst, NULL, src->nodes); return ncclSuccess; } struct ncclXmlNode* topNodeSrc; NCCLCHECK(xmlFindTag(src, "system", &topNodeSrc)); NCCLCHECK(xmlTopoFuseXmlRecursive(dst, topNodeDst, topNodeSrc)); return ncclSuccess; } /****************************************/ /* Parser rules for our specific format */ /****************************************/ ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadPciLink(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadC2c(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink }, { "c2c", ncclTopoXmlLoadC2c } }; NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadNic(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { struct xmlHandler handlers[] = { { "net", ncclTopoXmlLoadNet } }; NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadPci(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic}, { "pcilink", ncclTopoXmlLoadPciLink} }; NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 4)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadCpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "nic", ncclTopoXmlLoadNic } }; 
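// As with the other loaders above, parsing is table-driven: xmlLoadSub reads the
// children of this <cpu> element and dispatches each to the matching handler
// ("pci" or "nic" here); elements with no handler are logged and their subtree is
// consumed and dropped. A well-formed input therefore looks roughly like
// (illustrative only, attribute values are made up):
//   <cpu numaid="0" affinity="...">
//     <pci busid="0000:3b:00.0" ...> ... </pci>
//     <nic> <net name="mlx5_0" .../> </nic>
//   </cpu>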
NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadSystem(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { int version; NCCLCHECK(xmlGetAttrInt(head, "version", &version)); if (version != NCCL_TOPO_XML_VERSION) { WARN("XML Topology has wrong version %d, %d needed", version, NCCL_TOPO_XML_VERSION); return ncclInvalidUsage; } const char* name; NCCLCHECK(xmlGetAttr(head, "name", &name)); if (name != NULL) INFO(NCCL_GRAPH, "Loading topology %s", name); else INFO(NCCL_GRAPH, "Loading unnamed topology"); struct xmlHandler handlers[] = { { "cpu", ncclTopoXmlLoadCpu } }; NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); return ncclSuccess; } ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml, int warn) { FILE* file = fopen(xmlTopoFile, "r"); if (file == NULL) { if (warn) { WARN("Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno)); } return ncclSuccess; } INFO(NCCL_GRAPH, "Loading topology file %s", xmlTopoFile); struct xmlHandler handlers[] = { { "system", ncclTopoXmlLoadSystem } }; xml->maxIndex = 0; NCCLCHECK(xmlLoadSub(file, xml, NULL, handlers, 1)); fclose(file); return ncclSuccess; } /**********************/ /* XML creation */ /* from autodetection */ /**********************/ #define BUSID_SIZE (sizeof("0000:00:00.0")) #define BUSID_REDUCED_SIZE (sizeof("0000:00")) static void memcpylower(char* dst, const char* src, const size_t size) { for (int i=0; i static ncclResult_t getBcmLinks(const char* busId, int* nlinks, char** peers) { *nlinks = 0; *peers = NULL; char dirPath[] = "/sys/kernel/pci_switch_link/virtual_switch_links/0000:00:00.0"; memcpylower(dirPath+sizeof("/sys/kernel/pci_switch_link/virtual_switch_links/")-1, busId, BUSID_SIZE-1); DIR *dir = opendir(dirPath); if (dir) { struct dirent* file; while ((file = readdir(dir)) != NULL) { if (strlen(file->d_name) != BUSID_SIZE-1) continue; char* path; if (getPciPath(file->d_name, &path) == ncclSystemError) continue; free(path); NCCLCHECK(ncclRealloc(peers, (*nlinks)*BUSID_SIZE, ((*nlinks)+1)*BUSID_SIZE)); memcpy((*peers)+BUSID_SIZE*(*nlinks)++, file->d_name, BUSID_SIZE); } closedir(dir); } return ncclSuccess; } ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) { char filePath[PATH_MAX]; sprintf(filePath, "%s/%s", path, fileName); int offset = 0; FILE* file; if ((file = fopen(filePath, "r")) != NULL) { while (feof(file) == 0 && ferror(file) == 0 && offset < MAX_STR_LEN) { int len = fread(strValue+offset, 1, MAX_STR_LEN-offset, file); offset += len; } fclose(file); } if (offset == 0) { strValue[0] = '\0'; INFO(NCCL_GRAPH, "Topology detection : could not read %s, ignoring", filePath); } else { strValue[offset-1] = '\0'; } return ncclSuccess; } ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* path, const char* fileName, const char* attrName) { char strValue[MAX_STR_LEN]; NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue)); if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); } TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, strValue); return ncclSuccess; } ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml* xml) { int index; NCCLCHECK(xmlGetAttrIndex(cpuNode, "affinity", &index)); if (index == -1) { const char* numaId; NCCLCHECK(xmlGetAttr(cpuNode, "numaid", &numaId)); if (numaId == NULL) { WARN("GetXmlFromCpu : could not find CPU numa ID."); return ncclInternalError; } // 
Set affinity char cpumaskPath[] = "/sys/devices/system/node/node0000"; sprintf(cpumaskPath, "/sys/devices/system/node/node%s", numaId); NCCLCHECK(ncclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity")); } NCCLCHECK(xmlGetAttrIndex(cpuNode, "arch", &index)); if (index == -1) { // Fill CPU type / vendor / model #if defined(__PPC__) NCCLCHECK(xmlSetAttr(cpuNode, "arch", "ppc64")); #elif defined(__aarch64__) NCCLCHECK(xmlSetAttr(cpuNode, "arch", "arm64")); #elif defined(__x86_64__) NCCLCHECK(xmlSetAttr(cpuNode, "arch", "x86_64")); #endif } #if defined(__x86_64__) NCCLCHECK(xmlGetAttrIndex(cpuNode, "vendor", &index)); if (index == -1) { union { struct { // CPUID 0 String register order uint32_t ebx; uint32_t edx; uint32_t ecx; }; char vendor[12]; } cpuid0; unsigned unused; __cpuid(0, unused, cpuid0.ebx, cpuid0.ecx, cpuid0.edx); char vendor[13]; strncpy(vendor, cpuid0.vendor, 12); vendor[12] = '\0'; NCCLCHECK(xmlSetAttr(cpuNode, "vendor", vendor)); } NCCLCHECK(xmlGetAttrIndex(cpuNode, "familyid", &index)); if (index == -1) { union { struct { unsigned steppingId:4; unsigned modelId:4; unsigned familyId:4; unsigned processorType:2; unsigned resv0:2; unsigned extModelId:4; unsigned extFamilyId:8; unsigned resv1:4; }; uint32_t val; } cpuid1; unsigned unused; __cpuid(1, cpuid1.val, unused, unused, unused); int familyId = cpuid1.familyId + (cpuid1.extFamilyId << 4); int modelId = cpuid1.modelId + (cpuid1.extModelId << 4); NCCLCHECK(xmlSetAttrInt(cpuNode, "familyid", familyId)); NCCLCHECK(xmlSetAttrInt(cpuNode, "modelid", modelId)); } #endif return ncclSuccess; } ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct ncclXmlNode** pciNode) { NCCLCHECK(xmlFindTagKv(xml, "pci", pciNode, "busid", busId)); if (*pciNode == NULL) { NCCLCHECK(xmlAddNode(xml, NULL, "pci", pciNode)); NCCLCHECK(xmlSetAttr(*pciNode, "busid", busId)); } return ncclSuccess; } // Check whether a string is in BDF format or not. // BDF (Bus-Device-Function) is "BBBB:BB:DD.F" where B, D and F are hex digits. // There can be trailing chars. 
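// For example, "0000:3b:00.0" is accepted (12 chars, hex digits with ':' at
// offsets 4 and 7 and '.' at offset 10), while "0000:3b:00" (too short) or
// "0000-3b-00.0" (wrong separators) are rejected. checkBDFFormat() below is what
// ncclTopoGetXmlFromSys() uses to decide whether a sysfs path component names a
// PCI bridge or the CPU root complex.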
int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); } int checkBDFFormat(char* bdf) { if (strlen(bdf) != 12) return 0; if ((bdf[4] != ':') || (bdf[7] != ':') || (bdf[10] != '.')) return 0; if ((isHex(bdf[0]) == 0) || (isHex(bdf[1]) == 0) || (isHex(bdf[2]) == 0) || (isHex(bdf[3]) == 0) || (isHex(bdf[5]) == 0) || (isHex(bdf[6]) == 0) || (isHex(bdf[8]) == 0) || (isHex(bdf[9]) == 0) || (isHex(bdf[11]) == 0)) return 0; return 1; } ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* xml) { // Fill info, then parent const char* busId; NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); char* path = NULL; ncclDebugNoWarn = NCCL_GRAPH; getPciPath(busId, &path); ncclDebugNoWarn = 0; if (path) { NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class")); } int index; ncclDebugNoWarn = NCCL_GRAPH; NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index)); if (index == -1) { if (path) ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"); } NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index)); if (index == -1) { if (path) ncclTopoSetAttrFromSys(pciNode, path, "device", "device"); } NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index)); if (index == -1) { if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"); } NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index)); if (index == -1) { if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"); } ncclDebugNoWarn = 0; NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index)); if (index == -1) { if (path) { char deviceSpeedStr[MAX_STR_LEN]; float deviceSpeed = FLT_MAX; NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_speed", deviceSpeedStr)); sscanf(deviceSpeedStr, "%f GT/s", &deviceSpeed); char portSpeedStr[MAX_STR_LEN]; float portSpeed = FLT_MAX; NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_speed", portSpeedStr)); sscanf(portSpeedStr, "%f GT/s", &portSpeed); NCCLCHECK(xmlSetAttr(pciNode, "link_speed", portSpeed < deviceSpeed ? portSpeedStr : deviceSpeedStr)); } else { NCCLCHECK(xmlSetAttr(pciNode, "link_speed", "")); } } NCCLCHECK(xmlGetAttrIndex(pciNode, "link_width", &index)); if (index == -1) { if (path) { char strValue[MAX_STR_LEN]; NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_width", strValue)); int deviceWidth = strtol(strValue, NULL, 0); NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_width", strValue)); int portWidth = strtol(strValue, NULL, 0); NCCLCHECK(xmlSetAttrInt(pciNode, "link_width", std::min(deviceWidth,portWidth))); } else { NCCLCHECK(xmlSetAttr(pciNode, "link_width", "")); } } const char* vendor; NCCLCHECK(xmlGetAttr(pciNode, "vendor", &vendor)); if (vendor != NULL && strcmp(vendor, "0x1000") == 0) { // BCM switch, look for P2P connections int nlinks; char* peers; NCCLCHECK(getBcmLinks(busId, &nlinks, &peers)); for (int l=0; lparent; if (parent == NULL) { if (path) { // Save that for later in case next step is a CPU char numaIdStr[MAX_STR_LEN]; NCCLCHECK(ncclTopoGetStrFromSys(path, "numa_node", numaIdStr)); // Go up one level in the PCI tree. Rewind two "/" and follow the upper PCI // switch, or stop if we reach a CPU root complex. int slashCount = 0; int parentOffset; for (parentOffset = strlen(path)-1; parentOffset>0; parentOffset--) { if (path[parentOffset] == '/') { slashCount++; path[parentOffset] = '\0'; int start = parentOffset - 1; while (start>0 && path[start] != '/') start--; // Check whether the parent path looks like "BBBB:BB:DD.F" or not. 
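// Two outcomes: if the component is not in BDF form we have reached the CPU root
// complex, so the device is attached under a <cpu> node keyed by its NUMA id
// (created on demand). If it is a BDF and we have rewound two path components,
// the component names the upstream PCI switch and we attach under that <pci>
// node instead, creating it if it does not exist yet.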
if (checkBDFFormat(path+start+1) == 0) { // This a CPU root complex. Create a CPU tag and stop there. struct ncclXmlNode* topNode; NCCLCHECK(xmlFindTag(xml, "system", &topNode)); NCCLCHECK(xmlGetSubKv(topNode, "cpu", &parent, "numaid", numaIdStr)); if (parent == NULL) { NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent)); NCCLCHECK(xmlSetAttrLong(parent, "host_hash", getHostHash())); NCCLCHECK(xmlSetAttr(parent, "numaid", numaIdStr)); } } else if (slashCount == 2) { // Continue on the upper PCI switch for (int i = strlen(path)-1; i>0; i--) { if (path[i] == '/') { NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", path+i+1)); if (parent == NULL) { NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent)); NCCLCHECK(xmlSetAttr(parent, "busid", path+i+1)); } break; } } } } if (parent) break; } } else { // No information on /sys, attach GPU to unknown CPU NCCLCHECK(xmlFindTagKv(xml, "cpu", &parent, "numaid", "-1")); if (parent == NULL) { struct ncclXmlNode* topNode; NCCLCHECK(xmlFindTag(xml, "system", &topNode)); NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent)); NCCLCHECK(xmlSetAttrLong(parent, "host_hash", getHostHash())); NCCLCHECK(xmlSetAttr(parent, "numaid", "-1")); NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml)); } } pciNode->parent = parent; // Keep PCI sub devices ordered by PCI Bus ID (Issue #820) int subIndex = parent->nSubs; const char* newBusId; NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId)); for (int s=0; snSubs; s++) { const char* busId; NCCLCHECK(xmlGetAttr(parent->subs[s], "busid", &busId)); if (busId != NULL && strcmp(newBusId, busId) < 0) { subIndex = s; break; } } if (parent->nSubs == MAX_SUBS) { WARN("Error : XML parser is limited to %d subnodes", MAX_SUBS); return ncclInternalError; } for (int s = parent->nSubs; s > subIndex; s--) parent->subs[s] = parent->subs[s-1]; parent->subs[subIndex] = pciNode; parent->nSubs++; } if (strcmp(parent->name, "pci") == 0) { NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); } else if (strcmp(parent->name, "cpu") == 0) { NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml)); } free(path); return ncclSuccess; } ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvmlDev, struct ncclXml* xml, struct ncclXmlNode** gpuNodeRet) { struct ncclXmlNode* gpuNode = NULL; NCCLCHECK(xmlGetSub(pciNode, "gpu", &gpuNode)); if (gpuNode == NULL) NCCLCHECK(xmlAddNode(xml, pciNode, "gpu", &gpuNode)); int index = -1; int dev = -1; NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index)); if (index == -1) { NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev)); NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev)); } NCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev)); if (dev == -1) { *gpuNodeRet = NULL; return ncclSuccess; } NCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index)); if (index == -1) { int cudaMajor, cudaMinor; if (nvmlDev == NULL) { cudaDeviceProp devProp; CUDACHECK(cudaGetDeviceProperties(&devProp, dev)); cudaMajor = devProp.major; cudaMinor = devProp.minor; } else { NCCLCHECK(ncclNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor)); } NCCLCHECK(xmlSetAttrInt(gpuNode, "sm", cudaMajor*10+cudaMinor)); } int sm; NCCLCHECK(xmlGetAttrInt(gpuNode, "sm", &sm)); struct ncclXmlNode* nvlNode = NULL; NCCLCHECK(xmlGetSub(gpuNode, "nvlink", &nvlNode)); if (nvlNode == NULL) { // NVML NVLink detection int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : (sm < 90) ? 12 : 18; if (maxNvLinks > 0 && nvmlDev == NULL) { WARN("No NVML device handle. 
Skipping nvlink detection."); maxNvLinks = 0; } for (int l=0; l= 11080 if (sm >= 90) { nvmlFieldValue_t fv; fv.fieldId = NVML_FI_DEV_NVLINK_GET_STATE; fv.scopeId = l; // fv.value will contain NV_FEATURE_ENABLED or NV_FEATURE_DISABLED if ((ncclNvmlDeviceGetFieldValues(nvmlDev, 1, &fv) == ncclSuccess) && (fv.nvmlReturn == NVML_SUCCESS)) isActive = (nvmlEnableState_t) fv.value.uiVal; } else /* FALLTHRU to GetNvLinkState if before SM90 */ #endif { (void) ncclNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive); } if (isActive != NVML_FEATURE_ENABLED) continue; // Try to figure out what's on the other side of the NVLink nvmlPciInfo_t remoteProc; if (ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue; // Make a lower case copy of the bus ID for calling ncclDeviceType // PCI system path is in lower case char* p = remoteProc.busId; char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; for (int c=0; c= 11080 struct ncclXmlNode* c2cNode = NULL; NCCLCHECK(xmlGetSub(gpuNode, "c2c", &c2cNode)); if (c2cNode == NULL) { if (sm >= 90) { int c2cLinksCount = 0; nvmlFieldValue_t fv; fv.fieldId = NVML_FI_DEV_C2C_LINK_COUNT; if ((ncclNvmlDeviceGetFieldValues(nvmlDev, 1, &fv) == ncclSuccess) && (fv.nvmlReturn == NVML_SUCCESS)) { c2cLinksCount = fv.value.uiVal; int bw = 0; int count = 0; for (int l=0; l 0) { NCCLCHECK(xmlAddNode(xml, gpuNode, "c2c", &c2cNode)); NCCLCHECK(xmlSetAttrInt(c2cNode, "bw", bw)); NCCLCHECK(xmlSetAttrInt(c2cNode, "count", count)); } } } } #endif // Fill target classes for (int s=0; snSubs; s++) { struct ncclXmlNode* sub = gpuNode->subs[s]; if (strcmp(sub->name, "nvlink") != 0) continue; int index; NCCLCHECK(xmlGetAttrIndex(sub, "tclass", &index)); if (index == -1) { const char* busId; NCCLCHECK(xmlGetAttr(sub, "target", &busId)); char* path; ncclDebugNoWarn = NCCL_GRAPH; getPciPath(busId, &path); ncclDebugNoWarn = 0; if (path == NULL || strcmp(busId, "fffffff:ffff:ff") == 0) { // Remote NVLink device is not visible inside this VM. Assume NVSwitch. NCCLCHECK(xmlSetAttr(sub, "tclass", "0x068000")); } else { NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass")); free(path); } } } *gpuNodeRet = gpuNode; return ncclSuccess; } ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode) { struct ncclXmlNode* node; NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node)); NCCLCHECK(xmlSetAttrIfUnset(node, "class", "0x03")); NCCLCHECK(ncclTopoGetXmlFromSys(node, xml)); nvmlDevice_t nvmlDev; NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev)); NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode)); return ncclSuccess; } // Returns the subsystem name of a path, i.e. the end of the path // where sysPath/subsystem points to. ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) { char subSysPath[PATH_MAX]; sprintf(subSysPath, "%s/subsystem", sysPath); char* path = realpath(subSysPath, NULL); if (path == NULL) { subSys[0] = '\0'; } else { int offset; for (offset = strlen(path); offset > 0 && path[offset] != '/'; offset--); strcpy(subSys, path+offset+1); free(path); } return ncclSuccess; } ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode) { NCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName)); if (*netNode != NULL) return ncclSuccess; const char* pciSysPath = pciPath; if (pciSysPath) { char subSystem[PATH_MAX]; NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem)); // This is not a PCI device (virtual, usb, ...). 
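// In that case the NIC is treated as virtual: pciSysPath is cleared below and the
// "nic"/"net" nodes are attached under the first <cpu> node instead of a <pci>
// parent. For real PCI NICs, the bus ID is taken from the last component of the
// sysfs path and the PCI parent chain is filled in via ncclTopoGetXmlFromSys().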
if (strcmp(subSystem, "pci") != 0) { INFO(NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem); pciSysPath = NULL; } } struct ncclXmlNode* parent = NULL; if (pciSysPath) { int offset; for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--); char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; strcpy(busId, pciSysPath+offset+1); NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent)); NCCLCHECK(xmlSetAttrIfUnset(parent, "class", "0x02")); NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); } else { // Virtual NIC, no PCI device, attach to first CPU NCCLCHECK(xmlFindTag(xml, "cpu", &parent)); } struct ncclXmlNode* nicNode = NULL; NCCLCHECK(xmlGetSub(parent, "nic", &nicNode)); if (nicNode == NULL) { NCCLCHECK(xmlAddNode(xml, parent, "nic", &nicNode)); } // We know that this net does not exist yet (we searched for it at the // beginning of this function), so we can add it. NCCLCHECK(xmlAddNode(xml, nicNode, "net", netNode)); NCCLCHECK(xmlSetAttr(*netNode, "name", netName)); return ncclSuccess; } ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node, int* keep) { const char* str; NCCLCHECK(xmlGetAttr(node, "keep", &str)); if (str && strcmp(str, "1") == 0) { NCCLCHECK(xmlUnsetAttr(node, "keep")); *keep = 1; } else { // Copy nSubs and subs as they could change as we trim recursively. struct ncclXmlNode* subs[MAX_SUBS]; int nSubs = node->nSubs; memcpy(subs, node->subs, node->nSubs*sizeof(struct ncclXmlNode*)); *keep = 0; for (int s=0; sname, "pci") == 0 || strcmp(node->name, "cpu") == 0)) { NCCLCHECK(xmlRemoveNode(node)); } } return ncclSuccess; } ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) { int keep = 0; NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes, &keep)); return ncclSuccess; } /**************************************************/ /* Parser rules for the user-defined graph search */ /**************************************************/ ncclResult_t ncclTopoXmlGraphLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); return ncclSuccess; } ncclResult_t ncclTopoXmlGraphLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); return ncclSuccess; } ncclResult_t ncclTopoXmlGraphLoadChannel(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { struct xmlHandler handlers[] = { { "net", ncclTopoXmlGraphLoadNet }, { "gpu", ncclTopoXmlGraphLoadGpu } }; NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2)); return ncclSuccess; } ncclResult_t ncclTopoXmlGraphLoadGraph(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { struct xmlHandler handlers[] = { { "channel", ncclTopoXmlGraphLoadChannel } }; NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); return ncclSuccess; } ncclResult_t ncclTopoXmlGraphLoadGraphs(FILE* file, struct ncclXml* xmlGraph, struct ncclXmlNode* head) { int version; NCCLCHECK(xmlGetAttrInt(head, "version", &version)); if (version != NCCL_GRAPH_XML_VERSION) { WARN("XML Graph has wrong version %d, %d needed", version, NCCL_GRAPH_XML_VERSION); return ncclInvalidUsage; } const char* name; NCCLCHECK(xmlGetAttr(head, "name", &name)); if (name != NULL) INFO(NCCL_GRAPH, "Loading graphs for topology %s", name); else INFO(NCCL_GRAPH, "Loading graphs"); struct xmlHandler handlers[] = { { "graph", ncclTopoXmlGraphLoadGraph } }; NCCLCHECK(xmlLoadSub(file, xmlGraph, head, handlers, 1)); return ncclSuccess; } ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* 
xml) { FILE* file = fopen(xmlGraphFile, "r"); if (file == NULL) { WARN("Could not open XML graph file %s : %s", xmlGraphFile, strerror(errno)); return ncclSystemError; } struct xmlHandler handlers[] = { { "graphs", ncclTopoXmlGraphLoadGraphs } }; xml->maxIndex = 0; NCCLCHECK(xmlLoadSub(file, xml, NULL, handlers, 1)); fclose(file); return ncclSuccess; } nccl-2.22.3-1/src/graph/xml.h000066400000000000000000000310631463451655400155360ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef XML_H_ #define XML_H_ #include "nccl.h" #include "debug.h" #include "checks.h" #include "alloc.h" #include // A few constraints to make the implementation easy #define MAX_STR_LEN 255 #define MAX_ATTR_COUNT 16 #define MAX_SUBS 128 #define NODE_TYPE_NONE 0 #define NODE_TYPE_OPEN 1 #define NODE_TYPE_CLOSE 2 #define NODE_TYPE_SINGLE 3 struct ncclXmlNode { char name[MAX_STR_LEN+1]; struct { char key[MAX_STR_LEN+1]; char value[MAX_STR_LEN+1]; } attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params int nAttrs; int type; struct ncclXmlNode* parent; struct ncclXmlNode* subs[MAX_SUBS]; int nSubs; }; struct ncclXml { int maxIndex, maxNodes; struct ncclXmlNode nodes[1]; }; /* File functions */ #define NCCL_TOPO_XML_VERSION 1 ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml, int warn); ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml); #define NCCL_GRAPH_XML_VERSION 1 ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml); /* Auto-detect functions */ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode); ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode); /* Remove unneeded parts */ ncclResult_t ncclTopoTrimXml(struct ncclXml* xml); /* Fuse multiple system XMLs into one, skipping duplicate entries */ ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src); /* Relocate pointers in XML to (de-)serialize the structure */ ncclResult_t ncclTopoConvertXml(struct ncclXml* xml, uintptr_t base, int exp); /**************/ /* XML Struct */ /* Functions */ /**************/ static size_t xmlMemSize(int maxNodes) { return offsetof(struct ncclXml, nodes) + sizeof(struct ncclXmlNode)*maxNodes; } static ncclResult_t xmlAlloc(struct ncclXml** xml, int maxNodes) { char* mem; NCCLCHECK(ncclCalloc(&mem, xmlMemSize(maxNodes))); *xml = (struct ncclXml*)mem; (*xml)->maxNodes = maxNodes; return ncclSuccess; } static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrName, int* index) { *index = -1; const int nAttrs = node->nAttrs; for (int a=0; aattrs[a].key, attrName, MAX_STR_LEN) == 0) { *index = a; return ncclSuccess; } } return ncclSuccess; } static ncclResult_t xmlGetAttr(struct ncclXmlNode* node, const char* attrName, const char** value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); *value = index == -1 ? 
NULL : node->attrs[index].value; return ncclSuccess; } static ncclResult_t xmlGetAttrStr(struct ncclXmlNode* node, const char* attrName, const char** value) { NCCLCHECK(xmlGetAttr(node, attrName, value)); if (*value == NULL) { WARN("Attribute %s of node %s not found", attrName, node->name); return ncclInternalError; } return ncclSuccess; } static ncclResult_t xmlGetAttrInt(struct ncclXmlNode* node, const char* attrName, int* value) { const char* str; NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); *value = strtol(str, NULL, 0); return ncclSuccess; } static ncclResult_t xmlGetAttrIntDefault(struct ncclXmlNode* node, const char* attrName, int* value, int defaultValue) { const char* str; NCCLCHECK(xmlGetAttr(node, attrName, &str)); *value = str ? strtol(str, NULL, 0) : defaultValue; return ncclSuccess; } static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrName, int64_t* value) { const char* str; NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); *value = strtol(str, NULL, 0); return ncclSuccess; } static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) { const char* str; NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); *value = strtof(str, NULL); return ncclSuccess; } static ncclResult_t xmlFindTag(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node) { *node = NULL; for (int i=0; imaxIndex; i++) { struct ncclXmlNode* n = xml->nodes+i; if (strcmp(n->name, tagName) == 0) { *node = n; return ncclSuccess; } } return ncclSuccess; } static ncclResult_t xmlFindNextTag(struct ncclXml* xml, const char* tagName, struct ncclXmlNode* prev, struct ncclXmlNode** node) { *node = NULL; for (int i=prev-xml->nodes+1; imaxIndex; i++) { struct ncclXmlNode* n = xml->nodes+i; if (strcmp(n->name, tagName) == 0) { *node = n; return ncclSuccess; } } return ncclSuccess; } static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node, const char* attrName, const char* attrValue) { *node = NULL; for (int i=0; imaxIndex; i++) { struct ncclXmlNode* n = xml->nodes+i; if (strcmp(n->name, tagName) == 0) { const char* value; NCCLCHECK(xmlGetAttr(n, attrName, &value)); if (value && strcmp(value, attrValue) == 0) { *node = n; return ncclSuccess; } } } return ncclSuccess; } static ncclResult_t xmlFindNode(struct ncclXmlNode* parentNode, struct ncclXmlNode* searchNode, struct ncclXmlNode** node) { *node = NULL; // Search for the node at the current level only. for (int i=0; inSubs; i++) { struct ncclXmlNode* n = parentNode->subs[i]; if (strcmp(n->name, searchNode->name) == 0 && n->type == searchNode->type && n->nAttrs == searchNode->nAttrs) { int a; // Ensure that all the attributes are the same. 
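// A candidate only matches if the tag name, node type and attribute count are
// identical and every (key, value) pair of searchNode is present with the same
// value. ncclTopoFuseXml relies on this to skip subtrees that already exist when
// fusing multiple system XMLs into one.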
for (a=0; anAttrs; a++) { const char* val; NCCLCHECK(xmlGetAttr(n, searchNode->attrs[a].key, &val)); if (!val || strcmp(val, searchNode->attrs[a].value)) break; } if (a == searchNode->nAttrs) { *node = n; return ncclSuccess; } } } return ncclSuccess; } static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, const char* value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); node->attrs[index].key[MAX_STR_LEN] = '\0'; } strncpy(node->attrs[index].value, value, MAX_STR_LEN); node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } static ncclResult_t xmlSetAttrIfUnset(struct ncclXmlNode* node, const char* attrName, const char* value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index != -1) return ncclSuccess; index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); node->attrs[index].key[MAX_STR_LEN] = '\0'; strncpy(node->attrs[index].value, value, MAX_STR_LEN); node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); node->attrs[index].key[MAX_STR_LEN] = '\0'; } snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value); node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); node->attrs[index].key[MAX_STR_LEN] = '\0'; } snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value); node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } static ncclResult_t xmlSetAttrLong(struct ncclXmlNode* node, const char* attrName, const int64_t value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); node->attrs[index].key[MAX_STR_LEN] = '\0'; } snprintf(node->attrs[index].value, MAX_STR_LEN, "%#lx", value); node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } static ncclResult_t xmlUnsetAttr(struct ncclXmlNode* node, const char* attrName) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) return ncclSuccess; for (int i=index+1; inAttrs; i++) { strcpy(node->attrs[i-1].key, node->attrs[i].key); strcpy(node->attrs[i-1].value, node->attrs[i].value); } node->nAttrs--; return ncclSuccess; } static ncclResult_t xmlGetSub(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub) { *sub = NULL; for (int s=0; snSubs; s++) { if (strcmp(node->subs[s]->name, subName) == 0) { *sub = node->subs[s]; return ncclSuccess; } } return ncclSuccess; } static ncclResult_t xmlGetSubKv(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub, const char* attrName, const char* attrValue) { *sub = NULL; for (int s=0; snSubs; s++) { struct ncclXmlNode* subNode = node->subs[s]; if (strcmp(subNode->name, subName) == 0) { const char* value; NCCLCHECK(xmlGetAttr(subNode, attrName, &value)); if (value && strcmp(value, attrValue) == 0) { *sub = node->subs[s]; return ncclSuccess; } } } return 
ncclSuccess; } static ncclResult_t xmlGetSubKvInt(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub, const char* attrName, const int attrValue) { char strValue[10]; snprintf(strValue, 10, "%d", attrValue); NCCLCHECK(xmlGetSubKv(node, subName, sub, attrName, strValue)); return ncclSuccess; } static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent, const char* subName, struct ncclXmlNode** sub) { if (xml->maxIndex == xml->maxNodes) { WARN("Error : too many XML nodes (max %d)", xml->maxNodes); return ncclInternalError; } struct ncclXmlNode* s = xml->nodes+xml->maxIndex++; s->nSubs = 0; s->nAttrs = 0; *sub = s; s->parent = parent; if (parent) { if (parent->nSubs == MAX_SUBS) { WARN("Error : too many XML subnodes (max %d)", MAX_SUBS); return ncclInternalError; } parent->subs[parent->nSubs++] = s; } strncpy(s->name, subName, MAX_STR_LEN); s->name[MAX_STR_LEN] = '\0'; return ncclSuccess; } static ncclResult_t xmlRemoveNode(struct ncclXmlNode* node) { node->type = NODE_TYPE_NONE; struct ncclXmlNode* parent = node->parent; if (parent == NULL) return ncclSuccess; int shift = 0; for (int s=0; snSubs; s++) { if (parent->subs[s] == node) shift = 1; else if (shift) parent->subs[s-1] = parent->subs[s]; } parent->nSubs--; return ncclSuccess; } static ncclResult_t xmlAddTree(struct ncclXml* dst, struct ncclXmlNode* parent, struct ncclXmlNode* srcNode) { if (dst->maxIndex == dst->maxNodes) { WARN("Error : too many XML nodes (max %d)", dst->maxNodes); return ncclInternalError; } struct ncclXmlNode* dstNode = dst->nodes+dst->maxIndex++; *dstNode = *srcNode; dstNode->parent = parent; if (parent) { if (parent->nSubs == MAX_SUBS) { WARN("Error : too many XML subnodes (max %d)", MAX_SUBS); return ncclInternalError; } parent->subs[parent->nSubs++] = dstNode; } dstNode->nSubs = 0; // Recursively copy the subtree(s) for (int i=0; inSubs; i++) NCCLCHECK(xmlAddTree(dst, dstNode, srcNode->subs[i])); return ncclSuccess; } // Dictionary for STR -> INT conversions. No dictionary size information, // there needs to be a last element with str == NULL. struct kvDict { const char* str; int value; }; static ncclResult_t kvConvertToInt(const char* str, int* value, struct kvDict* dict) { struct kvDict* d = dict; while (d->str) { if (strncmp(str, d->str, strlen(d->str)) == 0) { *value = d->value; return ncclSuccess; } d++; } INFO(NCCL_GRAPH, "KV Convert to int : could not find value of '%s' in dictionary, falling back to %d", str, d->value); *value = d->value; return ncclSuccess; } static ncclResult_t kvConvertToStr(int value, const char** str, struct kvDict* dict) { struct kvDict* d = dict; while (d->str) { if (value == d->value) { *str = d->str; return ncclSuccess; } d++; } WARN("KV Convert to str : could not find value %d in dictionary", value); return ncclInternalError; } #endif nccl-2.22.3-1/src/group.cc000066400000000000000000000507711463451655400151360ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "group.h" #include "debug.h" #include "enqueue.h" #include "transport.h" #include "channel.h" #include #include "bootstrap.h" __thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting __thread ncclResult_t ncclGroupError = ncclSuccess; __thread struct ncclComm* ncclGroupCommHead = nullptr; __thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr; __thread struct ncclIntruQueue ncclAsyncJobs; __thread struct ncclGroupJob *ncclGroupJobMainPtr = NULL; __thread struct ncclGroupJob ncclGroupJobMain; __thread int ncclGroupBlocking = -1; /* default mode */ __thread bool ncclGroupJobAbortFlag = false; void* ncclAsyncJobMain(void* arg); ncclResult_t ncclAsyncLaunch( struct ncclAsyncJob* job, ncclResult_t(*func)(struct ncclAsyncJob*), void(*undo)(struct ncclAsyncJob*), void(*destructor)(void*), ncclComm_t comm ) { ncclResult_t ret = ncclSuccess; job->destroyFlag = comm->destroyFlag; if (ncclGroupDepth == 0) { ret = func(job); if (ret != ncclSuccess && undo) undo(job); if (destructor) destructor(job); } else { job->func = func; job->undo = undo; job->destructor = destructor; job->abortFlag = comm->abortFlag; job->abortFlagDev = comm->abortFlagDev; job->childAbortFlag = comm->childAbortFlag; job->childAbortFlagDev = comm->childAbortFlagDev; job->state = ncclGroupJobRunning; job->comm = comm; /* check if there are blocking and nonblocking comms at the same time in group. */ if (comm->destroyFlag) { ncclGroupBlocking = 1; } else if (ncclGroupBlocking == -1) { /* first met communicator */ ncclGroupBlocking = comm->config.blocking; } else if (ncclGroupBlocking != comm->config.blocking) { WARN("Blocking and nonblocking communicators are not allowed in the same group."); ret = ncclInvalidArgument; } ncclIntruQueueEnqueue(&ncclAsyncJobs, job); } return ret; } void* ncclAsyncJobMain(void* arg) { struct ncclAsyncJob* job = (struct ncclAsyncJob*)arg; job->result = job->func(job); if (job->result != ncclSuccess) { INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, job->result); } __atomic_store_n(&job->state, ncclGroupJobDone, __ATOMIC_RELEASE); return arg; } ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) { ncclResult_t ret; SYSCHECK(pthread_join(job->thread, NULL), "pthread_join"); if (job->result != ncclSuccess) { WARN("ncclAsyncJobComplete: job %p failed, job error %d", job, job->result); } ret = job->result; if (job->destructor) job->destructor((void*)job); return ret; } NCCL_API(ncclResult_t, ncclGroupStart); ncclResult_t ncclGroupStart() { ncclResult_t ret = ncclSuccess; NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECK(ncclGroupStartInternal()); TRACE_CALL("ncclGroupStart()"); return ret; } NCCL_API(ncclResult_t, ncclGroupEnd); ncclResult_t ncclGroupEnd() { ncclResult_t ret = ncclSuccess; NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECKGOTO(ncclGroupEndInternal(), ret, exit); TRACE_CALL("ncclGroupEnd()"); exit: return ret; } NCCL_API(ncclResult_t, ncclGroupSimulateEnd, ncclSimInfo_t* simInfo); ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo) { ncclResult_t ret = ncclSuccess; NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECKGOTO(ncclGroupEndInternal(simInfo), ret, exit); TRACE_CALL("ncclGroupSimulateEnd()"); exit: return ret; } struct ncclPreconnectJob { struct ncclAsyncJob base; struct ncclComm* comm; bool* algoNeedConnect; }; ncclResult_t ncclP2PPreconnectFunc(struct ncclAsyncJob* job_) { struct ncclPreconnectJob* job = (struct 
ncclPreconnectJob*)job_; struct ncclComm* comm = job->comm; CUDACHECK(cudaSetDevice(comm->cudaDev)); if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1)); return ncclSuccess; } ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) { struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; struct ncclComm* comm = job->comm; ncclResult_t ret = ncclSuccess; CUDACHECK(cudaSetDevice(comm->cudaDev)); if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); for (int i = 0; i < NCCL_NUM_ALGORITHMS; ++i) { if (job->algoNeedConnect[i]) { switch (i) { case NCCL_ALGO_RING: { NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail); break; } case NCCL_ALGO_TREE: { NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail); break; } case NCCL_ALGO_NVLS: { /* If we are using NVLS_TREE algo, we must mark NVLS algo to set up * NVLS intra-node buffer */ NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail); break; } case NCCL_ALGO_NVLS_TREE: { NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail); break; } case NCCL_ALGO_COLLNET_CHAIN: { NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail); break; } case NCCL_ALGO_COLLNET_DIRECT: { NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail); break; } default: { ret = ncclInternalError; goto fail; } } } } exit: free(job->algoNeedConnect); return ret; fail: goto exit; } static ncclResult_t doLaunches(struct ncclComm* head) { ncclResult_t result = ncclSuccess; struct ncclComm* cliqueComm0 = head->intraComm0; struct ncclComm* cliqueHead = head; struct ncclComm* cliqueNextHead; bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup; // This outer loop iterates over cliques of comms which are siblings of the // same global entity. We calculate a clique as all comms which have the same // `intraComm0` value. do { struct ncclComm* comm = cliqueHead; bool capturingYes = false, capturingNo = false; do { (ncclCudaGraphValid(comm->planner.capturingGraph) ? capturingYes : capturingNo) = true; CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure); if (useBarrier) ncclCommIntraBarrierIn(comm, 1); comm = comm->groupNext; } while (comm != nullptr && comm->intraComm0 == cliqueComm0); cliqueNextHead = comm; if (capturingYes && capturingNo) { // We have entered barriers but are aborting without leaving them. Thus // these comms are permanently trashed. We need a good mechanism for // tracking and reporting that. WARN("Either none or all communicators in a ncclGroup() can be CUDA graph captured."); result = ncclInvalidUsage; goto failure; } while (true) { // Iterate rounds of launches for clique. bool moreRounds = false; comm = cliqueHead; do { // Iterate clique members. struct ncclComm* next = comm->groupNext; if (useBarrier) { // Barrier reduction result tells us if this was the final round. 
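// Barrier protocol for a clique: every comm votes via ncclCommIntraBarrierIn()
// with 1 if it still has unlaunched plans and 0 otherwise; the reduced result
// comes back from ncclCommIntraBarrierOut(), so the launch loop keeps running as
// long as at least one clique member has work left. Without the barrier,
// moreRounds is simply OR-accumulated from each comm's unlaunchedPlansHead.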
moreRounds = 0 != ncclCommIntraBarrierOut(comm); } else { moreRounds |= comm->planner.unlaunchedPlansHead != nullptr; } if (moreRounds) { // Pop next unlaunched kernel struct ncclKernelPlan* plan = comm->planner.unlaunchedPlansHead; if (plan != nullptr) { comm->planner.unlaunchedPlansHead = plan->next; CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure); NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure); } // Barrier reduction input indicates if we require further rounds. if (useBarrier) ncclCommIntraBarrierIn(comm, comm->planner.unlaunchedPlansHead != nullptr ? 1 : 0); if (plan != nullptr) { NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure); } } else { // Final round. CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchFinish(comm), result, failure); } comm = next; } while (comm != cliqueNextHead); if (!moreRounds) break; } cliqueHead = cliqueNextHead; } while (cliqueHead != nullptr); failure: return result; } static inline void groupResetJobState(struct ncclGroupJob* job) { if (job) { if (job->groupBlockingPtr) *job->groupBlockingPtr = -1; if (job->abortFlagPtr) *job->abortFlagPtr = false; if (job->groupErrorPtr) *job->groupErrorPtr = ncclSuccess; if (job->groupCommHeadPtr) *job->groupCommHeadPtr = NULL; if (job->groupCommPreconnectHeadPtr) *job->groupCommPreconnectHeadPtr = NULL; memset(job, 0, sizeof(struct ncclGroupJob)); } return; } static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue* asyncJobsPtr, ncclResult_t* groupErrorPtr, int* groupBlockingPtr, volatile bool* groupJobAbortFlagPtr, ncclResult_t error) { struct ncclComm* comm = *groupCommHeadPtr; /* reset all thread local variables */ *groupCommHeadPtr = NULL; *groupCommPreconnectHeadPtr = NULL; *groupErrorPtr = ncclSuccess; *groupBlockingPtr = -1; *groupJobAbortFlagPtr = false; while (comm != nullptr) { struct ncclComm* next = comm->groupNext; (void) ncclGroupCommLeave(comm); // overwrites comm->groupNext // We don't know if preconnect succeeded or happened at all, so clear // the flags that let `taskAppend()` skip over checking if preconnect // is needed. comm->preconnectNext = reinterpret_cast(0x1); for (int i = 0; i < comm->nRanks; i++) { comm->connectSend[i] = 0UL; comm->connectRecv[i] = 0UL; } // Reclaim abandoned kernel plan memory. Note ncclWork structs were already // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. while (!ncclIntruQueueEmpty(&comm->planner.planQueue)) { struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planner.planQueue); // Persistent plans will be reclaimed via the callbackQueue when the // graph drops its UserObject reference. if (!plan->persistent) { while (!ncclIntruQueueEmpty(&plan->proxyOpQueue)) { struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->proxyOpQueue); ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); } ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); } } { // Reset comm->planner to empty. 
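// planner.peers is a per-communicator allocation sized for nRanks, so it is saved
// across the memset and re-zeroed in place rather than freed and reallocated.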
ncclKernelPlanner::Peer* tmp = comm->planner.peers; memset(&comm->planner, 0, sizeof(comm->planner)); comm->planner.peers = tmp; memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0])); } if (!comm->config.blocking) (void) ncclCommSetAsyncError(comm, error); comm = next; } /* reset everything */ while (!ncclIntruQueueEmpty(asyncJobsPtr)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr); if (job->comm && !job->comm->config.blocking) (void) ncclCommSetAsyncError(job->comm, error); if (job->undo) job->undo(job); if (job->destructor) job->destructor((void*)job); } return; } static ncclResult_t asyncJobLaunch(struct ncclIntruQueue *asyncJobsMain, volatile bool *groupAbortFlag) { ncclResult_t ret = ncclSuccess; bool jobsDone = false; bool errorJobAbortFlag = false; if (!ncclIntruQueueEmpty(asyncJobsMain)) { struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain); do { SYSCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), ret, fail); job = job->next; } while (job != nullptr); do { jobsDone = true; job = ncclIntruQueueHead(asyncJobsMain); do { ncclGroupJobState_t state = __atomic_load_n(&job->state, __ATOMIC_ACQUIRE); if (state == ncclGroupJobRunning) { jobsDone = false; } else if (state == ncclGroupJobDone) { if (pthread_join(job->thread, nullptr) != 0) { WARN("Error waiting for pthread_join : %s", strerror(errno)); ret = ncclSystemError; } job->state = ncclGroupJobJoined; if (job->result != ncclSuccess && ret == ncclSuccess) { ret = job->result; errorJobAbortFlag = true; } } else { /* safety check */ assert(state == ncclGroupJobJoined); } if (!job->destroyFlag && (__atomic_load_n(groupAbortFlag, __ATOMIC_ACQUIRE) || errorJobAbortFlag == true)) { __atomic_store_n(job->abortFlag, 1, __ATOMIC_RELEASE); __atomic_store_n(job->abortFlagDev, 1, __ATOMIC_RELEASE); if (job->childAbortFlag) { __atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELEASE); __atomic_store_n(job->childAbortFlagDev, 1, __ATOMIC_RELEASE); } } job = job->next; } while (job != nullptr); // Let preconnect threads progress. 
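// This loop is a simple poll/join scheme: acquire-load each job's state, join the
// thread once it reports done, remember the first failing result, and fan the
// abort out by raising each job's abort flags (host and device, plus any child
// flags). The usleep(1) just yields the CPU between polls.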
if (jobsDone == false) usleep(1); } while (jobsDone == false); if (ret != ncclSuccess) goto fail; } while (!ncclIntruQueueEmpty(asyncJobsMain)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain); if (!job->destroyFlag && job->comm && !job->comm->config.blocking) (void) ncclCommSetAsyncError(job->comm, ret); if (job->destructor) job->destructor((void*)job); } exit: return ret; fail: goto exit; } static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) { int savedDev; ncclResult_t ret = ncclSuccess; struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_; struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr; struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr; struct ncclIntruQueue *asyncJobsMain = gjob->asyncJobsPtr; bool *groupAbortFlag = gjob->abortFlagPtr; CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); if (!simInfo && groupCommPreconnectHeadMain != nullptr) { struct ncclComm* comm = groupCommPreconnectHeadMain; do { struct ncclPreconnectJob* job; NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); job->base.func = ncclP2PPreconnectFunc; job->base.undo = nullptr; job->base.destructor = free; job->base.state = ncclGroupJobRunning; job->base.abortFlag = comm->abortFlag; job->base.abortFlagDev = comm->abortFlagDev; job->comm = comm; ncclIntruQueueEnqueue(asyncJobsMain, &job->base); struct ncclComm* next = comm->preconnectNext; comm->preconnectNext = reinterpret_cast(0x1); comm = next; } while (comm != nullptr); } NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail); /* Connect channels at runtime if cumem is supported */ if (groupCommHeadMain != nullptr) { struct ncclComm* comm = groupCommHeadMain; do { bool needConnect = false; bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail); if (comm->cuMemSupport && needConnect) { struct ncclPreconnectJob* job; NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); job->base.func = ncclCollPreconnectFunc; job->base.undo = nullptr; job->base.destructor = free; job->base.state = ncclGroupJobRunning; job->base.abortFlag = comm->abortFlag; job->comm = comm; NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail); memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); ncclIntruQueueEnqueue(asyncJobsMain, &job->base); } comm = comm->groupNext; } while (comm); NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail); } if ((!simInfo) && (groupCommHeadMain != nullptr)) { NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail); } while (groupCommHeadMain != nullptr) { struct ncclComm* comm = groupCommHeadMain; struct ncclComm* next = comm->groupNext; (void) ncclGroupCommLeave(comm); if (!comm->config.blocking) { (void) ncclCommSetAsyncError(comm, ret); } groupCommHeadMain = next; } CUDACHECK(cudaSetDevice(savedDev)); exit: return ret; fail: groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, gjob->groupBlockingPtr, gjob->abortFlagPtr, ret); goto exit; } static ncclResult_t groupLaunchNonBlocking(struct ncclAsyncJob *job_) { return groupLaunch(job_ /* estimatedTime = NULL */); } ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { ncclResult_t ret = ncclSuccess; ncclSimInfo_t internalSimInfo = NCCL_SIM_INFO_INITIALIZER; ncclSimInfo_t* internalSimInfoPtr = NULL; size_t realSize = 0; 
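// Only the outermost ncclGroupEnd() does real work: nested group calls merely
// decrement ncclGroupDepth and return. When the depth reaches zero, queued async
// jobs and preconnects are launched; blocking communicators run groupLaunch()
// inline, while nonblocking ones hand the ncclGroupJob to a thread and return
// ncclInProgress. A typical caller-side pattern (sketch; buffers, streams and
// comms are illustrative):
//   ncclGroupStart();
//   for (int i = 0; i < nDev; i++)
//     ncclAllReduce(sendbuf[i], recvbuf[i], count, ncclFloat, ncclSum, comms[i], streams[i]);
//   ncclGroupEnd();   // all nDev operations are launched here as one group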
internalSimInfo.magic = 0; if (ncclGroupDepth == 0) { WARN("ncclGroupEnd: not in a group call."); ret = ncclInvalidUsage; goto exit; } if ((--ncclGroupDepth) > 0) goto exit; if ((ret = ncclGroupError) != ncclSuccess) goto fail; if (simInfo) { memcpy((void*)&realSize, (void*)&simInfo->size, sizeof(size_t)); realSize = realSize > sizeof(ncclSimInfo_t) ? sizeof(ncclSimInfo_t) : realSize; memcpy((void*)&internalSimInfo, (void*)simInfo, realSize); if (internalSimInfo.magic != 0x74685283) { WARN("ncclSimInfo_t argument not initialized via NCCL_SIM_INFO_INITIALIZER"); ret = ncclInvalidArgument; goto fail; } internalSimInfoPtr = &internalSimInfo; } if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) { ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead; ncclGroupJobMain.groupCommPreconnectHeadPtr = &ncclGroupCommPreconnectHead; ncclGroupJobMain.groupErrorPtr = &ncclGroupError; ncclGroupJobMain.asyncJobsPtr = &ncclAsyncJobs; ncclGroupJobMain.abortFlagPtr = &ncclGroupJobAbortFlag; ncclGroupJobMain.groupBlockingPtr = &ncclGroupBlocking; ncclGroupJobMain.initialized = true; ncclGroupJobMainPtr = &ncclGroupJobMain; /* make sure ncclGroupBlocking has been set. */ assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1); if (ncclGroupBlocking == 0 && (ncclGroupCommPreconnectHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs))) { /* nonblocking group */ if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs); do { NCCLCHECKGOTO(ncclCommSetAsyncError(job->comm, ncclInProgress), ret, fail); job->comm->groupJob = ncclGroupJobMainPtr; job = job->next; } while (job); } if (ncclGroupCommHead) { ncclComm_t comm = ncclGroupCommHead; do { NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail); /* link group job to communicators. */ comm->groupJob = ncclGroupJobMainPtr; comm = comm->groupNext; } while (comm); } ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking; SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail); ret = ncclInProgress; } else { /* blocking group */ NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail); if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize); groupResetJobState(ncclGroupJobMainPtr); } } exit: return ret; fail: groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, &ncclGroupBlocking, &ncclGroupJobAbortFlag, ret); goto exit; } ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) { ncclResult_t ret = ncclSuccess; if (groupJob && groupJob->initialized) { ret = ncclAsyncJobComplete(&groupJob->base); groupResetJobState(groupJob); } return ret; } ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) { if (groupJob && groupJob->initialized) { __atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELEASE); NCCLCHECK(ncclGroupJobComplete(groupJob)); } return ncclSuccess; } nccl-2.22.3-1/src/include/000077500000000000000000000000001463451655400151045ustar00rootroot00000000000000nccl-2.22.3-1/src/include/alloc.h000066400000000000000000000261101463451655400163470ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_ALLOC_H_ #define NCCL_ALLOC_H_ #include "nccl.h" #include "checks.h" #include "bitops.h" #include "utils.h" #include "p2p.h" #include #include #include #include uint64_t clockNano(); // from utils.h with which we have a circular dependency template constexpr size_t ncclSizeOfT() { return sizeof(T); } template<> constexpr size_t ncclSizeOfT() { return 1; } template ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (nelem > 0) { CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*ncclSizeOfT(), cudaHostAllocMapped), result, finish); memset(*ptr, 0, nelem*ncclSizeOfT()); } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA host alloc %ld bytes", nelem*ncclSizeOfT()); INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } #define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) inline ncclResult_t ncclCudaHostFree(void* ptr) { CUDACHECK(cudaFreeHost(ptr)); return ncclSuccess; } template ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { if (nelem > 0) { void* p = malloc(nelem*ncclSizeOfT()); if (p == NULL) { WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT()); return ncclSystemError; } //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), p); memset(p, 0, nelem*ncclSizeOfT()); *ptr = (T*)p; } else { *ptr = NULL; } return ncclSuccess; } #define ncclCalloc(...) 
ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) template ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { if (nelem < oldNelem) return ncclInternalError; if (nelem == oldNelem) return ncclSuccess; T* oldp = *ptr; T* p = (T*)malloc(nelem*ncclSizeOfT()); if (p == NULL) { WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT()); return ncclSystemError; } memcpy(p, oldp, oldNelem*ncclSizeOfT()); free(oldp); memset(p+oldNelem, 0, (nelem-oldNelem)*ncclSizeOfT()); *ptr = (T*)p; INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*ncclSizeOfT(), nelem*ncclSizeOfT(), *ptr); return ncclSuccess; } #if CUDART_VERSION >= 11030 #include #include "cudawrap.h" static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) { ncclResult_t result = ncclSuccess; size_t granularity = 0; CUdevice currentDev; CUmemAllocationProp prop = {}; CUmemAccessDesc accessDesc = {}; CUmemGenericAllocationHandle handle; CUmemAllocationHandleType type = ncclCuMemHandleType; int cudaDev; int flag = 0; CUDACHECK(cudaGetDevice(&cudaDev)); CUCHECK(cuDeviceGet(¤tDev, cudaDev)); prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.requestedHandleTypes = type; prop.location.id = currentDev; // Query device to see if RDMA support is available CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev)); if (flag) prop.allocFlags.gpuDirectRDMACapable = 1; CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); ALIGN_SIZE(size, granularity); /* Allocate the physical memory on the device */ CUCHECK(cuMemCreate(&handle, size, &prop, 0)); /* Reserve a virtual address range */ CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); /* Map the virtual address range to the physical allocation */ CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); /* Now allow RW access to the newly mapped memory */ accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = currentDev; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); if (handlep) *handlep = handle; TRACE(NCCL_ALLOC, "CuMem Alloc Size %zu pointer %p handle %llx", size, *ptr, handle); return result; } static inline ncclResult_t ncclCuMemFree(void *ptr) { if (ptr == NULL) return ncclSuccess; ncclResult_t result = ncclSuccess; CUmemGenericAllocationHandle handle; size_t size = 0; CUCHECK(cuMemRetainAllocationHandle(&handle, ptr)); CUCHECK(cuMemRelease(handle)); CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); TRACE(NCCL_ALLOC, "CuMem Free Size %zu pointer %p handle 0x%llx", size, ptr, handle); CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size)); CUCHECK(cuMemRelease(handle)); CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); return result; } #else extern int ncclCuMemEnable(); static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) { WARN("CUMEM not supported prior to CUDA 11.3"); return ncclInternalError; } static inline ncclResult_t ncclCuMemFree(void *ptr) { WARN("CUMEM not supported prior to CUDA 11.3"); return ncclInternalError; } #endif template ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (nelem > 0) { if (ncclCuMemEnable()) { 
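      // cuMem path: ncclCuMemAlloc() above reserves a VA range and maps
      // physical memory through the CUDA driver VMM API (cuMemCreate /
      // cuMemAddressReserve / cuMemMap) instead of cudaMalloc, so the
      // allocation can later be exported as a shareable handle and, when the
      // device reports support, be marked GPU Direct RDMA capable.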
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); } } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA malloc %ld bytes", nelem*ncclSizeOfT()); INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } #define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) template ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (nelem > 0) { // Need a side stream so as not to interfere with graph capture. cudaStream_t stream; CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); if (ncclCuMemEnable()) { NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); } CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT(), stream), result, finish); CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc %ld bytes", nelem*ncclSizeOfT()); INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } #define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) template ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (nelem > 0) { if (ncclCuMemEnable()) { NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); } CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT(), stream), result, finish); } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc async %ld bytes", nelem*ncclSizeOfT()); INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } #define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) template ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); // Need a side stream so as not to interfere with graph capture. 
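  // Together with cudaThreadExchangeStreamCaptureMode(Relaxed) above, this
  // lets the copy proceed even while another thread is capturing a CUDA
  // graph: the relaxed mode permits these API calls during an ongoing
  // capture, and the short-lived non-blocking stream keeps the memcpy off
  // any user stream that might currently be captured.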
cudaStream_t stream; CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish); NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish); CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); return result; } template ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*ncclSizeOfT(), cudaMemcpyDefault, stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); return result; } template ncclResult_t ncclCudaFree(T* ptr) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr); CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (ncclCuMemEnable()) { NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish); } else { CUDACHECKGOTO(cudaFree(ptr), result, finish); } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); return result; } // Allocate memory to be potentially ibv_reg_mr'd. This needs to be // allocated on separate pages as those pages will be marked DONTFORK // and if they are shared, that could cause a crash in a child process inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { if (size > 0) { size_t page_size = sysconf(_SC_PAGESIZE); void* p; int size_aligned = ROUNDUP(size, page_size); int ret = posix_memalign(&p, page_size, size_aligned); if (ret != 0) return ncclSystemError; memset(p, 0, size); *ptr = p; } else { *ptr = NULL; } INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr); return ncclSuccess; } #define ncclIbMalloc(...) ncclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__) #endif nccl-2.22.3-1/src/include/argcheck.h000066400000000000000000000012461463451655400170270ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_ARGCHECK_H_ #define NCCL_ARGCHECK_H_ #include "core.h" #include "info.h" ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); ncclResult_t CommCheck(struct ncclComm* ptr, const char* opname, const char* ptrname); ncclResult_t ArgsCheck(struct ncclInfo* info); ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname); #endif nccl-2.22.3-1/src/include/bitops.h000066400000000000000000000213201463451655400165530ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_BITOPS_H_ #define NCCL_BITOPS_H_ #include #if !__NVCC__ #ifndef __host__ #define __host__ #endif #ifndef __device__ #define __device__ #endif #endif #define DIVUP(x, y) \ (((x)+(y)-1)/(y)) #define ROUNDUP(x, y) \ (DIVUP((x), (y))*(y)) #define ALIGN_POWER(x, y) \ ((x) > (y) ? 
ROUNDUP(x, y) : ((y)/((y)/(x)))) #define ALIGN_SIZE(size, align) \ size = ((size + (align) - 1) / (align)) * (align); template __host__ __device__ constexpr Z divUp(X x, Y y) { return (x+y-1)/y; } template __host__ __device__ constexpr Z roundUp(X x, Y y) { return (x+y-1) - (x+y-1)%y; } template __host__ __device__ constexpr Z roundDown(X x, Y y) { return x - x%y; } // assumes second argument is a power of 2 template __host__ __device__ constexpr Z alignUp(X x, int a) { return (x + a-1) & Z(-a); } // assumes second argument is a power of 2 template __host__ __device__ constexpr Z alignDown(X x, int a) { return x & Z(-a); } template inline __host__ __device__ int countOneBits(Int x) { #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(unsigned int)) { return __popc((unsigned int)x); } else if (sizeof(Int) <= sizeof(unsigned long long)) { return __popcll((unsigned long long)x); } else { static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); return -1; } #else if (sizeof(Int) <= sizeof(unsigned int)) { return __builtin_popcount((unsigned int)x); } else if (sizeof(Int) <= sizeof(unsigned long)) { return __builtin_popcountl((unsigned long)x); } else if (sizeof(Int) <= sizeof(unsigned long long)) { return __builtin_popcountll((unsigned long long)x); } else { static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); return -1; } #endif } // Returns index of first one bit or returns -1 if mask is zero. template inline __host__ __device__ int firstOneBit(Int mask) { int i; #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(int)) { i = __ffs((int)mask); } else if (sizeof(Int) <= sizeof(long long)) { i = __ffsll((long long)mask); } else { static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); } #else if (sizeof(Int) <= sizeof(int)) { i = __builtin_ffs((int)mask); } else if (sizeof(Int) <= sizeof(long)) { i = __builtin_ffsl((long)mask); } else if (sizeof(Int) <= sizeof(long long)) { i = __builtin_ffsll((long long)mask); } else { static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); } #endif return i-1; } template inline __host__ __device__ int popFirstOneBit(Int* mask) { Int tmp = *mask; *mask &= *mask-1; return firstOneBit(tmp); } template inline __host__ __device__ int log2Down(Int x) { int w, n; #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(int)) { w = 8*sizeof(int); n = __clz((int)x); } else if (sizeof(Int) <= sizeof(long long)) { w = 8*sizeof(long long); n = __clzll((long long)x); } else { static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); } #else if (x == 0) { return -1; } else if (sizeof(Int) <= sizeof(unsigned int)) { w = 8*sizeof(unsigned int); n = __builtin_clz((unsigned int)x); } else if (sizeof(Int) <= sizeof(unsigned long)) { w = 8*sizeof(unsigned long); n = __builtin_clzl((unsigned long)x); } else if (sizeof(Int) <= sizeof(unsigned long long)) { w = 8*sizeof(unsigned long long); n = __builtin_clzll((unsigned long long)x); } else { static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); } #endif return (w-1)-n; } template inline __host__ __device__ int log2Up(Int x) { int w, n; if (x != 0) x -= 1; #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(int)) { w = 8*sizeof(int); n = __clz((int)x); } else if (sizeof(Int) <= sizeof(long long)) { w = 8*sizeof(long long); n = __clzll((long long)x); } else { static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); } #else if (x == 0) { return 0; } else if (sizeof(Int) <= sizeof(unsigned 
int)) { w = 8*sizeof(unsigned int); n = __builtin_clz((unsigned int)x); } else if (sizeof(Int) <= sizeof(unsigned long)) { w = 8*sizeof(unsigned long); n = __builtin_clzl((unsigned long)x); } else if (sizeof(Int) <= sizeof(unsigned long long)) { w = 8*sizeof(unsigned long long); n = __builtin_clzll((unsigned long long)x); } else { static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); } #endif return w-n; } template inline __host__ __device__ Int pow2Up(Int x) { return Int(1)< inline __host__ __device__ Int pow2Down(Int x) { return Int(1)< inline __host__ UInt reverseSubBits(UInt x) { if (nSubBits >= 16 && 8*sizeof(UInt) == nSubBits) { switch (8*sizeof(UInt)) { case 16: x = __builtin_bswap16(x); break; case 32: x = __builtin_bswap32(x); break; case 64: x = __builtin_bswap64(x); break; default: static_assert(8*sizeof(UInt) <= 64, "Unsupported integer type."); } return reverseSubBits(x); } else if (nSubBits == 1) { return x; } else { UInt m = UInt(-1)/((UInt(1)<<(nSubBits/2))+1); x = (x & m)<<(nSubBits/2) | (x & ~m)>>(nSubBits/2); return reverseSubBits(x); } } template struct ncclToUnsigned; template<> struct ncclToUnsigned { using type = unsigned char; }; template<> struct ncclToUnsigned { using type = unsigned char; }; template<> struct ncclToUnsigned { using type = unsigned char; }; template<> struct ncclToUnsigned { using type = unsigned short; }; template<> struct ncclToUnsigned { using type = unsigned short; }; template<> struct ncclToUnsigned { using type = unsigned int; }; template<> struct ncclToUnsigned { using type = unsigned int; }; template<> struct ncclToUnsigned { using type = unsigned long; }; template<> struct ncclToUnsigned { using type = unsigned long; }; template<> struct ncclToUnsigned { using type = unsigned long long; }; template<> struct ncclToUnsigned { using type = unsigned long long; }; // Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's. template inline __host__ __device__ Int reverseBits(Int x, int nBits) { using UInt = typename ncclToUnsigned::type; union { UInt ux; Int sx; }; sx = x; #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(unsigned int)) { ux = __brev(ux); } else if (sizeof(Int) <= sizeof(unsigned long long)) { ux = __brevll(ux); } else { static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer type."); } #else ux = reverseSubBits(ux); #endif ux = nBits==0 ? 0 : ux>>(8*sizeof(UInt)-nBits); return sx; } //////////////////////////////////////////////////////////////////////////////// // Custom 8 bit floating point format for approximating 32 bit uints. This format // has nearly the full range of uint32_t except it only keeps the top 3 bits // beneath the leading 1 bit and thus has a max value of 0xf0000000. inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) { int log2x; #if __CUDA_ARCH__ log2x = 31-__clz(x|1); #else log2x = 31-__builtin_clz(x|1); #endif uint32_t mantissa = x>>(log2x >= bitsPerPow2 ? log2x-bitsPerPow2 : 0) & ((1u<= bitsPerPow2 ? 
log2x-(bitsPerPow2-1) : 0; return exponent<>bitsPerPow2; uint32_t mantissa = (x & ((1u< ncclResult_t initChannel(struct ncclComm* comm, int channelid); ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) { if (comm->nNodes > 1) { int nodeDelta = p2pRound/comm->maxLocalRanks; int localDelta = p2pRound%comm->maxLocalRanks; int base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH; return base & 0xff; } else { return p2pRound & 0xff; } } #endif nccl-2.22.3-1/src/include/checks.h000066400000000000000000000131341463451655400165170ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_CHECKS_H_ #define NCCL_CHECKS_H_ #include "debug.h" // Check CUDA RT calls #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ return ncclUnhandledCudaError; \ } \ } while(false) #define CUDACHECKGOTO(cmd, RES, label) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ RES = ncclUnhandledCudaError; \ goto label; \ } \ } while(false) // Report failure but clear error and continue #define CUDACHECKIGNORE(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \ (void) cudaGetLastError(); \ } \ } while(false) #include // Check system calls #define SYSCHECK(call, name) do { \ int retval; \ SYSCHECKVAL(call, name, retval); \ } while (false) #define SYSCHECKVAL(call, name, retval) do { \ SYSCHECKSYNC(call, name, retval); \ if (retval == -1) { \ WARN("Call to " name " failed : %s", strerror(errno)); \ return ncclSystemError; \ } \ } while (false) #define SYSCHECKSYNC(call, name, retval) do { \ retval = call; \ if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ } else { \ break; \ } \ } while(true) #define SYSCHECKGOTO(statement, RES, label) do { \ if ((statement) == -1) { \ /* Print the back trace*/ \ RES = ncclSystemError; \ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ goto label; \ } \ } while (0); #define NEQCHECK(statement, value) do { \ if ((statement) != value) { \ /* Print the back trace*/ \ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ return ncclSystemError; \ } \ } while (0); #define NEQCHECKGOTO(statement, value, RES, label) do { \ if ((statement) != value) { \ /* Print the back trace*/ \ RES = ncclSystemError; \ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ goto label; \ } \ } while (0); #define EQCHECK(statement, value) do { \ if ((statement) == value) { \ /* Print the back trace*/ \ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ return ncclSystemError; \ 
} \ } while (0); #define EQCHECKGOTO(statement, value, RES, label) do { \ if ((statement) == value) { \ /* Print the back trace*/ \ RES = ncclSystemError; \ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ goto label; \ } \ } while (0); // Propagate errors up #define NCCLCHECK(call) do { \ ncclResult_t RES = call; \ if (RES != ncclSuccess && RES != ncclInProgress) { \ /* Print the back trace*/ \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ return RES; \ } \ } while (0); #define NCCLCHECKGOTO(call, RES, label) do { \ RES = call; \ if (RES != ncclSuccess && RES != ncclInProgress) { \ /* Print the back trace*/ \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ goto label; \ } \ } while (0); #define NCCLWAIT(call, cond, abortFlagPtr) do { \ uint32_t* tmpAbortFlag = (abortFlagPtr); \ ncclResult_t RES = call; \ if (RES != ncclSuccess && RES != ncclInProgress) { \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ return ncclInternalError; \ } \ if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECK(*tmpAbortFlag, 0); \ } while (!(cond)); #define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \ uint32_t* tmpAbortFlag = (abortFlagPtr); \ RES = call; \ if (RES != ncclSuccess && RES != ncclInProgress) { \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ goto label; \ } \ if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \ } while (!(cond)); #define NCCLCHECKTHREAD(a, args) do { \ if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \ return args; \ } \ } while(0) #define CUDACHECKTHREAD(a) do { \ if ((a) != cudaSuccess) { \ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ args->ret = ncclUnhandledCudaError; \ return args; \ } \ } while(0) #endif nccl-2.22.3-1/src/include/coll_net.h000066400000000000000000000064521463451655400170630ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef COLL_NET_H_ #define COLL_NET_H_ #include "nccl.h" #include "nccl_net.h" typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; // Translation to external API static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } /* DMA-BUF support */ static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; } static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } #endif nccl-2.22.3-1/src/include/collectives.h000066400000000000000000000033201463451655400175670ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_COLLECTIVES_H_ #define NCCL_COLLECTIVES_H_ #include "nccl.h" #include "nccl_common.h" #include "device.h" // CHUNKSIZE must be a multiple of SLICESIZE #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) #define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2) #define ALLGATHER_SLICESTEPS (NCCL_STEPS/4) #define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2) #define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4) #define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2) #define BROADCAST_SLICESTEPS 1 #define BROADCAST_CHUNKSTEPS 1 #define REDUCE_SLICESTEPS 1 #define REDUCE_CHUNKSTEPS 1 #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above const char* ncclFuncToString(ncclFunc_t op); const char* ncclDevRedOpToString(ncclDevRedOp_t op); const char* ncclDatatypeToString(ncclDataType_t type); const char* ncclAlgoToString(int algo); const char* ncclProtoToString(int proto); inline int ncclTypeSize(ncclDataType_t type) { switch (type) { case ncclInt8: case ncclUint8: return 1; case ncclFloat16: #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: #endif return 2; case ncclInt32: case ncclUint32: case ncclFloat32: return 4; case ncclInt64: case ncclUint64: case ncclFloat64: return 8; default: return -1; } } #include #define NCCL_MODE_NORMAL 0 #define NCCL_MODE_OFFSET 1 #define NCCL_MODE_PTR 2 struct ncclConnFifo { int mode; int offset; ssize_t size; void* ptr; }; #endif nccl-2.22.3-1/src/include/comm.h000066400000000000000000000520461463451655400162170ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_COMM_H_ #define NCCL_COMM_H_ //#include "transport.h" #include "p2p.h" #include "collectives.h" #include "nccl_tuner.h" #include "proxy.h" #include "strongstream.h" #include "nccl_net.h" #include "register.h" #include "graph.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { void *func; dim3 gridDim; dim3 blockDim; void **args; size_t sharedMem; cudaStream_t stream; }; #endif #define CACHE_LINE_SIZE 128 #define MEM_ALIGN 4096 #define CUDA_IPC_MIN 2097152UL // Channels / LL tuning #define NCCL_LL_THREAD_THRESHOLD 8 #define NCCL_LL128_THREAD_THRESHOLD 8 #define NCCL_SIMPLE_THREAD_THRESHOLD 64 struct ncclSendMem { union { struct { uint64_t head; char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; void* ptrExchange; uint64_t redOpArgExchange[2]; char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)]; int offsFifo[NCCL_STEPS]; }; char pad3[MEM_ALIGN]; }; }; struct ncclRecvMem { union { struct { uint64_t tail; char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; struct ncclConnFifo connFifo[NCCL_STEPS]; int flush; // For GDRCopy-based flush }; char pad4[MEM_ALIGN]; }; }; enum helperThreadState {ThreadStart, ThreadStop}; #define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS) struct ncclGraphHelperResources { ncclComm* comm; pthread_mutex_t threadLock; pthread_cond_t threadCond; enum helperThreadState threadState; void* ipcBases[NCCL_IPC_POOL_SIZE]; int ipcTail; int ipcHead; }; struct ncclUserRedOp { int freeNext; // -1=allocated, otherwise index of next free entry in array ncclDataType_t datatype; ncclDevRedOpFull opFull; }; struct ncclNodeRanks { int localRanks; int* localRankToRank; }; struct cliqueInfo { int id; int size; int 
*ranks; }; struct ncclDestructor { struct ncclDestructor* next; void* obj; ncclResult_t(*fn)(struct ncclDestructor* me); }; struct ncclCommCallback { struct ncclCommCallback* next; ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb); }; struct ncclSharedResources { int refCount; struct ncclComm* owner; /* comm which creates this shared res. */ struct ncclChannelPeer* peers[MAXCHANNELS]; struct ncclDevChannelPeer* devPeers[MAXCHANNELS]; /* P2P operation counter, one per channel */ uint64_t p2pOpCount[MAXCHANNELS]; /* Collective operation counter */ uint64_t collOpCount; int tpNRanks; int tpNLocalRanks; int tpNChannels; int tpP2pNChannels; int tpP2pChunkSize; uint64_t magic; // top parent rank to localRank translation table int* tpRankToLocalRank; // Internal streams struct ncclStrongStream deviceStream, hostStream; /* proxy related shared res */ struct ncclProxyState* proxyState; }; struct ncclChannel { struct ncclChannelPeer** peers; struct ncclDevChannelPeer** devPeers; /* devPeer pointer array used for host side access */ struct ncclDevChannelPeer** devPeersHostPtr; struct ncclRing ring; int* devRingUserRanks; struct ncclTree tree; struct ncclTree collnetChain; struct ncclDirect collnetDirect; struct ncclNvls nvls; int id; // index of this channel uint32_t workFifoProduced; // +1 successor of last used work fifo byte /* comm split sharable resources */ struct ncclChannelPeer* collnetPeers; struct ncclDevChannelPeer* collnetDevPeers; struct ncclChannelPeer* nvlsPeers; struct ncclDevChannelPeer* nvlsDevPeers; }; struct ncclWorkBatchList { struct ncclWorkBatchList* next; struct ncclDevWorkBatch batch; }; struct alignas(16) ncclWorkList { struct ncclWorkList* next; enum ncclDevWorkType workType; int size; // Size of struct following this node // ncclDevWorkColl, ncclDevWorkColLReg, ncclDevWorkP2p[]... }; struct ncclCollnetHandleList { struct ncclCollnetHandleList *next; void* collnetHandle; size_t size; const void* buffer; struct ncclProxyConnector* proxyconn; }; struct ncclKernelPlan { // A kernel plan is also a callback that reclaims itself. Hence this must // be the first member. struct ncclCommCallback reclaimer; struct ncclComm* comm; struct ncclKernelPlan* next; bool persistent; // aka captured in a graph enum ncclDevWorkStorageType workStorageType; bool kernelSpecialized; void *kernelFn; struct ncclDevKernelArgs* kernelArgs; size_t kernelArgsSize; uint64_t channelMask; // bitset of which channels are present bool hasProxyOps; // does any channel have a non-empty proxyOpQueue int threadPerBlock; int collOpCount; // Number of collectives in this plan. int nWorkBatches; // Number of work batches. size_t workBytes; // Sum size of all work (in the fifo) in bytes. 
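  // Queues owned by the plan: the work metadata that will be copied into the
  // device work fifo (or kept alive in workBufPersistent for persistent /
  // graph-captured plans), cleanup entries to run after completion, and the
  // proxy ops to post when the plan is launched. As noted above, `reclaimer`
  // must stay the first member so the plan can sit on the comm's callback
  // queue and reclaim itself once it is retired.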
struct ncclIntruQueue workQueue; struct ncclIntruQueue cleanupQueue; void* workBufPersistent; struct ncclIntruQueue proxyOpQueue; }; //////////////////////////////////////////////////////////////////////////////// struct ncclTaskColl { struct ncclTaskColl* next; ncclFunc_t func; void const* sendbuff; void* recvbuff; size_t count; int root; ncclDataType_t datatype; ncclRedOp_t opHost; struct ncclDevRedOpFull opDev; int chunkSteps, sliceSteps; // Computed later: size_t trafficBytes; int32_t nMaxChannels:8; int32_t nWarps:8; int32_t algorithm:8, protocol:8; uint32_t isCollnet:1, isNvls:1; uint32_t devFuncId:30; enum ncclRegBufferType regBufType; // number of elements in planner->ipcMemQueue associated with this collective int nCleanupQueueElts; void* sendMhandle; void* recvMhandle; }; struct ncclTaskP2p { struct ncclTaskP2p* next; void* buff; size_t bytes; }; //////////////////////////////////////////////////////////////////////////////// // Roughly sorts ncclTaskColl's by their size descending. This structure is // self-referential, meaning that pointers it contains internally may point // into the structure itself. This means that it is NOT memcpy-moveable: struct ncclTaskCollSorter { static constexpr int UnitLog2 = 10; // 1K static constexpr size_t UnitSize = 1<>UnitLog2, BitsPerPow2); bin = BinCount-1 - bin; // descending bin if (me->bins[bin] == nullptr) { if (me->binEdge <= bin) { me->binEdge = bin+1; me->bins[bin] = me->tail ? &me->tail->next : &me->head; me->tail = x; } else { // Find successor non-empty bin after this one. int succ = bin+1; while (me->bins[succ] == nullptr) succ++; // What was our successor's head's previous is now our head's previous. me->bins[bin] = me->bins[succ]; // The first node we insert is our tail, so that becomes our successor's // head's new previous. me->bins[succ] = &x->next; } } // Push a new head for this bin. x->next = *me->bins[bin]; *me->bins[bin] = x; } inline bool ncclTaskCollSorterEmpty(struct ncclTaskCollSorter* me) { return me->head == nullptr; } // Reset sorter and return sorted linked list of its coll tasks. inline struct ncclTaskColl* ncclTaskCollSorterDequeueAll(struct ncclTaskCollSorter* me) { struct ncclTaskColl* head = me->head; if (head != nullptr) memset(me, 0, sizeof(*me)); return head; } //////////////////////////////////////////////////////////////////////////////// struct ncclCudaStreamList { struct ncclCudaStreamList *next; cudaStream_t stream; }; struct ncclKernelPlanner { ////////////////////////////////////////////////////////////////////////////// // State for accumulating tasks between ncclGroupStart/End() ////////////////////////////////////////////////////////////////////////////// struct Peer { bool sendSeen, recvSeen; struct ncclIntruQueue sendQueue; struct ncclIntruQueue recvQueue; }; struct ncclTaskCollSorter collSorter; struct Peer* peers/*[nRanks]*/; int nTasksColl, nTasksP2p; bool persistent; // The list of user streams aggregated over all tasks present. struct ncclCudaStreamList* streams; // The most recent user stream. Ignored if streams==nullptr cudaStream_t streamRecent; // The graph capturing all user streams or invalid if none. Thus we restrict the // user that all streams must be captured in the same graph or not captured // at all. Technically we could probably relax this, but that would mean // collecting a different `ncclTasks` per graph and one for non-graph. 
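  // In practice this means that, within one ncclGroupStart()/ncclGroupEnd(),
  // every stream an operation was issued on must either be captured into the
  // same cudaGraph or not be captured at all; mixing captured and
  // non-captured streams (or two different capturing graphs) in the same
  // group is not supported.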
struct ncclCudaGraph capturingGraph; ////////////////////////////////////////////////////////////////////////////// // Lists of tasks to be assembled into plans. ////////////////////////////////////////////////////////////////////////////// struct ncclIntruQueue collTaskQueue; struct ncclIntruQueue collWorkQueue; struct ncclIntruQueue collCleanupQueue; ////////////////////////////////////////////////////////////////////////////// // State for building current (Work-In-Progress) plan: ////////////////////////////////////////////////////////////////////////////// struct WipPlan { struct Channel { struct { int workBytes; // Sum size of work metadata referenced by this batch. int nP2ps; // Number of p2p works in this batch int p2pRounds[NCCL_MAX_DEV_WORK_P2P_PER_BATCH]; // which rounds are present in this batch. } wipBatch; // work-in-progress batch which will be next tail of workBatchQueue int nWorkBatchesP2p; // number of p2p batches for this channel. struct ncclIntruQueue workBatchQueue; struct ncclIntruQueue proxyOpQueue; } channels[MAXCHANNELS]; } wipPlan; ////////////////////////////////////////////////////////////////////////////// // State for launching built plans: ////////////////////////////////////////////////////////////////////////////// // List of kernel plans built form tasks. struct ncclIntruQueue planQueue; // First of the unlaunched kernels in `planQueue` struct ncclKernelPlan* unlaunchedPlansHead; }; #define NCCL_MAGIC 0x0280028002800280 // Nickel atomic number is 28. struct ncclComm { uint64_t startMagic; struct ncclMemoryStack memPermanent, memScoped; // List of destructors to run when comm is destructed struct ncclDestructor* destructorHead; struct ncclSharedResources* sharedRes; /* map to top parent ranks. */ int* topParentRanks; int* topParentLocalRanks; struct ncclChannel channels[MAXCHANNELS]; struct ncclPeerInfo* peerInfo; struct ncclTopoSystem* topo; int netPluginLoaded; ncclNet_t* ncclNet; ncclNetDeviceType netDeviceType; ncclCollNet_t* ncclCollNet; void* bootstrap; // Bitmasks for ncclTransportP2pSetup uint64_t* connectSend; uint64_t* connectRecv; struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS]; bool initAlgoChannels[NCCL_NUM_ALGORITHMS]; bool runtimeConn; // if dynamic connection is supported int cuMemSupport; uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. uint64_t commHash; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator int cudaDev; // my cuda device index int nvmlDev; // my nvml device index int compCap; // compute capability of the GPU int minCompCap, maxCompCap; // min/max compute capability in the communicator int64_t busId; // my PCI bus ID in int format cpu_set_t cpuAffinity; // CPU affinity of the GPU int cudaArch; // matches __CUDA_ARCH__ of device int cpuArch; // architecture - As defined in src/include/graph.h, e.g. 
x86/arm/ppc/mixed int cpuVendor; // vendor - As defined in src/include/graph.h int node; int nNodes; int localRank; int localRanks; int maxLocalRanks; int* rankToNode; int* rankToLocalRank; int* localRankToRank; // localRanks and localRanktoRank for all nodes struct ncclNodeRanks* nodeRanks; // MNNVL: Multi-Node NVLink int MNNVL; // true when MNNVL is available struct cliqueInfo clique; // Our MNNVL clique information int cliqueRank; // Our rank within the MNNVL clique bool checkPointers; bool dmaBufSupport; // Counter for tracking CUDA launches (P2P and collectives included) uint64_t opCount; // Channels for collectives int nChannels; // connection nChannels int collChannels; // enqueue nChannels int nvlsChannels; // enqueue nChannels // all nvls heads stored to check if we can splitShare int nvlsHeads[MAXCHANNELS]; // Channels (per peer) for p2p int p2pnChannels; int p2pnChannelsPerPeer; // Should this comm allocate LL buffers for network P2P connections? bool allocP2pNetLLBuffers; // Buffer sizes int buffSizes[NCCL_NUM_PROTOCOLS]; int p2pChunkSize; int nvlsChunkSize; // Algorithm/Protocols thresholds ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS]; int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; /* This attribute can indicate the states of communicators and return code of * asynchronous NCCL operations. */ ncclResult_t asyncResult; // Flag to ask NCCL kernels to abort uint32_t* abortFlag; uint32_t* abortFlagDev; int* abortFlagRefCount; uint32_t* childAbortFlag; uint32_t* childAbortFlagDev; uint32_t destroyFlag; // Device side of the communicator (for cudaFree's) struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm uint32_t workArgsBytes; // max size of kernel args uint32_t workFifoBytes; // size of workFifoBuf, power of 2 void* workFifoBuf; void* workFifoBufDev; void* workFifoBufGdrHandle; // Monotonic number of bytes (mod 1<<32) consumed per channel. In cudaHost memory. uint32_t* workFifoConsumed/*[MAXCHANNELS]*/; // Last observed value of: min(workFifoConsumed[c] for c < MAXCHANNELS) uint32_t workFifoConsumedLeast; // Monotonic number of bytes (mod 1<<32) sent to fifo. uint32_t workFifoProduced; // Intra-process sync struct ncclComm* intraComm0; // leader of intra-process comms (self possible) struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head int intraRank; int intraRanks; uint32_t intraBarrierPhase; char intraPad1[64 - sizeof(uint64_t)]; uint64_t intraBarrierCounter; // only used if this is intraComm0 char intraPad2[64 - sizeof(uint64_t)]; uint64_t intraBarrierGate; // only used if this is intraComm0 struct ncclProxyState* proxyState; int proxyRefCountOld; /* store proxy post-atomic-sub refcount */ // Whether this communicator uses collNet int collNetSupport; bool collNetRegSupport; uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes]; int intraHighestTransportType; int* collNetHeads; int collNetHeadsNum; int* collNetDenseToUserRank; int* collNetUserToDenseRank; /* sharable collNet proxy progress resource. */ struct ncclCollNetSharedRes* collNetSharedRes; // NVLink SHARP (NVLS) support int nvlsSupport; int nvlsRegSupport; /* sharable NVLS resource. 
*/ struct ncclNvlsSharedRes* nvlsResources; // pools backed by comm->memPermanent struct ncclMemoryPool memPool_ncclProxyOp; struct ncclMemoryPool memPool_ncclKernelPlan; // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when // this comm is not yet in a group. struct ncclComm* groupNext; // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. struct ncclComm* preconnectNext; int persistentRefs; // number of persistent plan-lists capturing this comm struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule; struct ncclKernelPlanner planner; // user-created reduction ops int userRedOpCapacity, userRedOpFreeHead; ncclUserRedOp *userRedOps; // Queue of things for the main thread to do struct ncclIntruQueueMpsc callbackQueue; ncclConfig_t config; // initState is to more conveniently reclaim resources when errors happen. ncclResult_t initState; // flag to indicate if ncclCommFinalize() is called bool finalizeCalled; // shared structures for finalization int finalizeRankCnt; // group job to support multi-thread FT struct ncclGroupJob *groupJob; // Tuning plugin int tunerPluginLoaded; ncclTuner_t* tuner; void *tunerContext; // buffer registration cache struct ncclRegCache regCache; uint64_t endMagic; }; enum ncclLaunchMode { ncclLaunchModeInvalid=0, ncclLaunchModeParallel, ncclLaunchModeGroup }; extern enum ncclLaunchMode ncclParamLaunchMode; void ncclCommPushFree(struct ncclComm* comm, void* buf); void ncclCommPushCudaFree(struct ncclComm* comm, void* buf); void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf); void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle); inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) { ncclResult_t result = ncclSuccess; struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome); while (cb != nullptr) { struct ncclCommCallback* next = cb->next; ncclResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb if (res1 != ncclSuccess) result = res1; cb = next; } NCCLCHECK(result); return ncclSuccess; } inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) { int phase = comm->intraBarrierPhase; if (comm->intraRanks == 1) { // Release everyone (just me). comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1); } else { struct ncclComm* comm0 = comm->intraComm0; uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE); if (uint32_t(count) == uint32_t(comm->intraRanks)) { // Reset. __atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED); // Release everyone. __atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE); } } } // returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x) inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) { struct ncclComm* comm0 = comm->intraComm0; comm->intraBarrierPhase ^= 1; uint32_t phase = comm->intraBarrierPhase; uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); if ((gate & 1) != phase) { uint64_t t0 = clockNano(); do { // Spin vigorously for first 5us. if (clockNano()-t0 >= 5*1000) sched_yield(); gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); } while ((gate & 1) != phase); } if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE); return gate>>32; } // Scrambles the bits of non-builtin values of ncclRedOp_t according to the // communicator memory address. 
Used to catch bugs so that integer handles // associated with this communicator won't collide with handles of other // communicatrs. This function is its own inverse. static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) { // Preserve the built-in values. if(int(op) < int(ncclNumOps)) return op; uint64_t h = reinterpret_cast(comm); h ^= h >> 32; h *= 0x9e3779b97f4a7c13u; // Knuth's 64-bit magical hash constant h >>= 32; // h is now an excellent 32-bit hash of the comm pointer h &= int(ncclMaxRedOp); // ncclMaxRedOp is a power of 2 minus 1 int op1 = int(h) ^ int(op); // Since builtin values are preserved, we also have to preserve their preimage. return op1 < int(ncclNumOps) ? op : ncclRedOp_t(op1); } ncclResult_t ncclCommEnsureReady(ncclComm_t comm); ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState); #endif nccl-2.22.3-1/src/include/core.h000066400000000000000000000022121463451655400162020ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_CORE_H_ #define NCCL_CORE_H_ #include #include #include #include #include // For std::min/std::max #include "nccl.h" #ifdef PROFAPI #define NCCL_API(ret, func, args...) \ __attribute__ ((visibility("default"))) \ __attribute__ ((alias(#func))) \ ret p##func (args); \ extern "C" \ __attribute__ ((visibility("default"))) \ __attribute__ ((weak)) \ ret func(args) #else #define NCCL_API(ret, func, args...) \ extern "C" \ __attribute__ ((visibility("default"))) \ ret func(args) #endif // end PROFAPI #include "debug.h" #include "checks.h" #include "cudawrap.h" #include "alloc.h" #include "utils.h" #include "param.h" #include "nvtx.h" #endif // end include guard nccl-2.22.3-1/src/include/cpuset.h000066400000000000000000000027441463451655400165670ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_CPUSET_H_ #define NCCL_CPUSET_H_ // Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t static int hexToInt(char c) { int v = c - '0'; if (v < 0) return -1; if (v > 9) v = 10 + c - 'a'; if ((v < 0) || (v > 15)) return -1; return v; } #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { uint32_t cpumasks[CPU_SET_N_U32]; int m = CPU_SET_N_U32-1; cpumasks[m] = 0; for (int o=0; o=0; o--) { if (c == 0 && m8[o] == 0) continue; sprintf(str+c, "%02x", m8[o]); c+=2; if (o && o%4 == 0) { sprintf(str+c, ","); c++; } } str[c] = '\0'; return ncclSuccess; } #endif nccl-2.22.3-1/src/include/cudawrap.h000066400000000000000000000103371463451655400170670ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_CUDAWRAP_H_ #define NCCL_CUDAWRAP_H_ #include #include #include "checks.h" // Is cuMem API usage enabled extern int ncclCuMemEnable(); #if CUDART_VERSION >= 11030 #include // Handle type used for cuMemCreate() extern CUmemAllocationHandleType ncclCuMemHandleType; #endif #define CUPFN(symbol) pfn_##symbol // Check CUDA PFN driver calls #define CUCHECK(cmd) do { \ CUresult err = pfn_##cmd; \ if( err != CUDA_SUCCESS ) { \ const char *errStr; \ (void) pfn_cuGetErrorString(err, &errStr); \ WARN("Cuda failure %d '%s'", err, errStr); \ return ncclUnhandledCudaError; \ } \ } while(false) #define CUCHECKGOTO(cmd, res, label) do { \ CUresult err = pfn_##cmd; \ if( err != CUDA_SUCCESS ) { \ const char *errStr; \ (void) pfn_cuGetErrorString(err, &errStr); \ WARN("Cuda failure %d '%s'", err, errStr); \ res = ncclUnhandledCudaError; \ goto label; \ } \ } while(false) // Report failure but clear error and continue #define CUCHECKIGNORE(cmd) do { \ CUresult err = pfn_##cmd; \ if( err != CUDA_SUCCESS ) { \ const char *errStr; \ (void) pfn_cuGetErrorString(err, &errStr); \ INFO(NCCL_ALL,"%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, err, errStr); \ } \ } while(false) #define CUCHECKTHREAD(cmd, args) do { \ CUresult err = pfn_##cmd; \ if (err != CUDA_SUCCESS) { \ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \ args->ret = ncclUnhandledCudaError; \ return args; \ } \ } while(0) #define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol #if CUDART_VERSION >= 11030 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ DECLARE_CUDA_PFN_EXTERN(cuDeviceGet); DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute); DECLARE_CUDA_PFN_EXTERN(cuGetErrorString); DECLARE_CUDA_PFN_EXTERN(cuGetErrorName); DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange); DECLARE_CUDA_PFN_EXTERN(cuCtxCreate); DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy); DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent); DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent); DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice); DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute); DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel); #if CUDART_VERSION >= 11080 DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx); #endif // cuMem API support DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve); DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree); DECLARE_CUDA_PFN_EXTERN(cuMemCreate); DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity); DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle); DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle); DECLARE_CUDA_PFN_EXTERN(cuMemMap); DECLARE_CUDA_PFN_EXTERN(cuMemRelease); DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle); DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess); DECLARE_CUDA_PFN_EXTERN(cuMemUnmap); #if CUDA_VERSION >= 11070 DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support #endif #if CUDA_VERSION >= 12010 /* NVSwitch Multicast support */ DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice); DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem); DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr); DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate); DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity); DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind); #endif #endif ncclResult_t ncclCudaLibraryInit(void); extern int ncclCudaDriverVersionCache; extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit() inline ncclResult_t ncclCudaDriverVersion(int* driver) { int version = __atomic_load_n(&ncclCudaDriverVersionCache, 
__ATOMIC_RELAXED); if (version == -1) { CUDACHECK(cudaDriverGetVersion(&version)); __atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED); } *driver = version; return ncclSuccess; } #endif nccl-2.22.3-1/src/include/debug.h000066400000000000000000000026031463451655400163440ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_INT_DEBUG_H_ #define NCCL_INT_DEBUG_H_ #include "nccl.h" #include "nccl_common.h" #include #include // Conform to pthread and NVTX standard #define NCCL_THREAD_NAMELEN 16 extern int ncclDebugLevel; extern FILE *ncclDebugFile; void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); // Let code temporarily downgrade WARN into INFO extern thread_local int ncclDebugNoWarn; extern char ncclLastError[]; #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) #define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) #ifdef ENABLE_TRACE #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) #else #define TRACE(...) #endif void ncclSetThreadName(pthread_t thread, const char *fmt, ...); #endif nccl-2.22.3-1/src/include/device.h000066400000000000000000000464251463451655400165270ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_DEVICE_H_ #define NCCL_DEVICE_H_ #include "nccl.h" #include "nccl_common.h" #include "bitops.h" #include #include #include extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS]; extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; #define NCCL_MAX_OPS 2048 #define NCCL_STEPS 8 #ifdef __CUDA_ARCH__ #define NCCL_CUDA_ARCH __CUDA_ARCH__ #else #define NCCL_CUDA_ARCH 0 #endif #include "net_device.h" enum ncclDevRedOp_t { ncclDevSum, ncclDevProd, ncclDevMinMax, ncclDevPreMulSum, ncclDevSumPostDiv, ncclNumDevRedOps }; struct ncclDevRedOpFull { ncclDevRedOp_t op; ncclRedOp_t proxyOp; bool scalarArgIsPtr; uint64_t scalarArg; }; union ncclLLFifoLine { /* Flags have to be *after* data, because otherwise, an incomplete receive from the network may receive the flag but not the data. Note this is assuming that either we receive contiguous chunks of data (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). 
*/ struct { uint32_t data1; uint32_t flag1; uint32_t data2; uint32_t flag2; }; uint64_t v[2]; int4 i4; }; #define WARP_SIZE 32 #define MAXCHANNELS 32 #define NCCL_MAX_LOCAL_RANKS 64 #define NCCL_MAX_NTHREADS 640 #define NCCL_MIN_NTHREADS (4*WARP_SIZE) #define NCCL_SIMPLE_MAX_NTHREADS 512 #define NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE (3*WARP_SIZE) #define NCCL_LL_MAX_NTHREADS 512 #define NCCL_LL_LINES_PER_THREAD 8 #ifdef TEST_LL_CLEANUP #define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup #define NCCL_LL_FLAG_MAX 0x100 #define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX)) #else #define NCCL_LL_CLEAN_MASK 0x7ffffff8 #define NCCL_LL_FLAG(a) ((uint32_t)(a)) #endif // Make sure the clean mask will last for at least NCCL_NSTEPS static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value"); #define NCCL_LL128_LINESIZE 128 #define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t)) #define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1) #define NCCL_LL128_MAX_NTHREADS 640 #define NCCL_LL128_ELEMS_PER_THREAD 120 #define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) #define NCCL_DIRECT_WRITE 0x01 #define NCCL_DIRECT_READ 0x02 #define NCCL_DIRECT_NIC 0x04 #define NCCL_IPC_WRITE 0x08 #define NCCL_IPC_READ 0x10 #define NCCL_NVLS_MIN_POLL 0x20 // Number of named barriers supported by CUDA #define NCCL_MAX_GROUPS 16 #define NCCL_MAX_COLLNET_SIZE (1L << 29) enum ncclRegBufferType { NCCL_REGULAR_BUFFER = 0, NCCL_IPC_REG_BUFFER = 1, NCCL_NVLS_REG_BUFFER = 2, NCCL_COLLNET_REG_BUFFER = 3 }; struct ncclConnInfo { // Regular comm mechanism char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t *tail; // Local for recv, remote for send uint64_t *head; // Local for send, remote for recv int flags; // Direct communication / other flags int shared; // Buffers are shared int stepSize; // Step size for the SIMPLE buffer void **ptrExchange; // Pointer exchange for direct communication uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case struct ncclConnFifo* connFifo; // Used for GPU - Proxy communication uint64_t step; // Keep where we are uint64_t llLastCleaning; ncclNetDeviceHandle_t netDeviceHandle; }; struct ncclProxyConnector { int tpRank; int tpLocalRank; int sameProcess; struct ncclProxyConnection* connection; ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); // Copied from transport if necessary }; struct ncclConnector { int connected; struct ncclProxyConnector proxyConn; struct ncclTransportComm* transportComm; void* transportResources; struct ncclConnInfo conn; }; struct ncclRing { // Shortcuts for userRanks[1] and userRanks[n-1] int prev; int next; // Maps an internal nccl index to user-specified rank order. This is necessary // since we need to know how the user expects data to be ordered across // devices. Ordered from current device. int* userRanks; int index; // This rank's index in the ring }; // The root of each tree only has one node down (+1 intra-node). #define NCCL_MAX_TREE_ARITY_TOP 2 // Nodes inside the binary tree can have to two nodes down (+1 intra-node). 
#define NCCL_MAX_TREE_ARITY 3 struct ncclTree { int depth; int up; int down[NCCL_MAX_TREE_ARITY]; }; #define NCCL_MAX_DIRECT_ARITY 7 struct ncclDirect { int depth; int out; int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC) int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads // The heads[...] are guaranteed to be in rotated order start with self: // headRank, (headRank+1)%nHeads, (headRank+2)%nHeads, ... int heads[NCCL_MAX_DIRECT_ARITY+1]; int up[NCCL_MAX_DIRECT_ARITY]; int down[NCCL_MAX_DIRECT_ARITY]; }; #define NCCL_MAX_NVLS_ARITY 32 #define NCCL_MAX_NVLS_TREE_ARITY 3 struct ncclNvls { int out; int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC) int up[NCCL_MAX_NVLS_ARITY]; int down; int treeUp; int treeDown[NCCL_MAX_NVLS_TREE_ARITY]; int node; int nNodes; }; #if __CUDA_ARCH__ >= 900 #define NCCL_MAX_ARITY NCCL_MAX_NVLS_ARITY #else #define NCCL_MAX_ARITY NCCL_MAX_DIRECT_ARITY #endif #define NCCL_MAX_CONNS 2 struct ncclChannelPeer { struct ncclConnector send[NCCL_MAX_CONNS]; struct ncclConnector recv[NCCL_MAX_CONNS]; int refCount; }; struct ncclDevComm; struct alignas(16) ncclDevWorkP2p { void *sendAddr, *recvAddr; size_t sendBytes, recvBytes; int sendRank, recvRank; // From the part index, nP2pChannels, and channelBase the device code can // calculate which part of the transfer a channel is responsible for. uint8_t nP2pChannels; // Always equal to comm->p2pnChannels uint8_t channelBase; // Channel owning first part. // Zero channels indicates no work in that direction. uint8_t nSendChannels, nRecvChannels; // Chunk size stored in 8 bits via u32fp8Encode/Decode. uint8_t sendChunkSize_u32fp8, recvChunkSize_u32fp8; uint8_t sendProtoLL:1, recvProtoLL:1; uint8_t sendRegistered:1, recvRegistered:1; }; // Compute the subset of the data transfer corresponding to the given part index. inline __host__ __device__ void ncclP2pPartBounds(int nParts, int part, size_t bytes, size_t* partBeg, size_t* partEnd) { size_t partBytes = alignUp(divUp(bytes, nParts), 4<<10); #if __CUDA_ARCH__ *partBeg = min((part+0)*partBytes, bytes); *partEnd = min((part+1)*partBytes, bytes); #else *partBeg = std::min((part+0)*partBytes, bytes); *partEnd = std::min((part+1)*partBytes, bytes); #endif } // implemented in channel.h inline __host__ uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound); // ncclP2pChannelToPart and ncclP2pChannelForPart are inverses. The device code // uses ncclP2pChannelToPart to determine which part "this" channel is responsible for. inline __host__ int ncclP2pChannelForPart(int nP2pChannels, int base, int part) { // Only works because nP2pChannels is pow2 int nChannelsLog2 = countOneBits(nP2pChannels-1); int delta = reverseBits(part, nChannelsLog2); return (base + delta) & (nP2pChannels-1); } inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, int channel) { // Only works because nP2pChannels is pow2 int nChannelsLog2 = countOneBits(nP2pChannels-1); int delta = (channel-base) & (nP2pChannels-1); return reverseBits(delta, nChannelsLog2); } struct alignas(16) ncclDevWorkColl { // Running on channels [channelLo..channelHi], hi is inclusive. 
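// (Example for the p2p channel mapping helpers above, illustrative: with
//  nP2pChannels=8 and base=2, ncclP2pChannelForPart(8, 2, 3) bit-reverses the
//  part index, reverseBits(3,3)=6, and returns (2+6)&7 = 0; the device then
//  recovers the part with ncclP2pChannelToPart(8, 2, 0): delta=(0-2)&7=6 and
//  reverseBits(6,3)=3. ncclP2pPartBounds() then gives each part a byte range
//  rounded up to 4KB and clamped to the total transfer size.)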
// nChannels == (channelHi - channelLo) + 1 uint32_t channelLo:8, channelHi:8; uint32_t nWarps:8; uint32_t redOpArgIsPtr:1, regUsed:2, oneNode:1, direct:4; uint32_t root; void* recvbuff; void* sendbuff; union { // Continuous-byte-distribution scheduling. The lo and hi channels are of // different size than the channels in the middle. struct { size_t countLo, countMid, countHi; // Chunk counts where units are ncclProtoGrainSize(protocol) bytes uint64_t chunkGrainsLo:21, chunkGrainsMid:21, chunkGrainsHi:21; } cbd; // Collnet scheduling. All channels divide work evenly. struct { size_t count; // Total size, not divided per channel. uint32_t chunkCount; } collnet; }; uint64_t redOpArg; }; __host__ __device__ constexpr int ncclProtoGrainSize(int proto) { return proto == NCCL_PROTO_LL ? 16 : proto == NCCL_PROTO_LL128 ? WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS*NCCL_LL128_DATAELEMS*sizeof(uint64_t) : proto == NCCL_PROTO_SIMPLE ? 512 : -1; } template __host__ __device__ inline void ncclCollCbdPart( struct ncclDevWorkColl* work, uint32_t channelId, int proto, int eltSize, Int* count, Int* partOffset, Int* partCount, Int* chunkCount ) { int eltPerGrain = ncclProtoGrainSize(proto)/eltSize; int nMidChannels = work->channelHi - work->channelLo - 1; // We can assum that nMidChannels<0 implies countMid==0, which let's us assume // that countMid*nMidChannels == 0. if (count != nullptr) { *count = work->cbd.countLo + work->cbd.countMid*nMidChannels + work->cbd.countHi; } if (channelId == work->channelLo) { *partOffset = 0; *partCount = work->cbd.countLo; *chunkCount = work->cbd.chunkGrainsLo*eltPerGrain; } else if (channelId == work->channelHi) { *partOffset = work->cbd.countLo + nMidChannels*work->cbd.countMid; *partCount = work->cbd.countHi; *chunkCount = work->cbd.chunkGrainsHi*eltPerGrain; } else { int mid = channelId - work->channelLo - 1; *partOffset = work->cbd.countLo + mid*work->cbd.countMid; *partCount = work->cbd.countMid; *chunkCount = work->cbd.chunkGrainsMid*eltPerGrain; } } struct alignas(16) ncclDevWorkCollReg { struct ncclDevWorkColl coll; void* dnInputs[NCCL_MAX_DIRECT_ARITY+1]; void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1]; void* upOutputs[NCCL_MAX_DIRECT_ARITY+1]; }; enum ncclDevWorkType: uint8_t { ncclDevWorkTypeP2p, ncclDevWorkTypeColl, ncclDevWorkTypeCollReg }; constexpr size_t ncclDevWorkSize(enum ncclDevWorkType type) { return type == ncclDevWorkTypeP2p ? sizeof(ncclDevWorkP2p) : type == ncclDevWorkTypeColl ? sizeof(ncclDevWorkColl) : sizeof(ncclDevWorkCollReg); } #define NCCL_MAX_DEV_WORK_BATCH_BYTES 1024 #define NCCL_MAX_DEV_WORK_BATCH_COLLS (NCCL_MAX_DEV_WORK_BATCH_BYTES/sizeof(ncclDevWorkColl)) #define NCCL_MAX_DEV_WORK_P2P_PER_BATCH 8 struct alignas(16) ncclDevWorkBatch { union { struct { // nextExtends: should next one be merged into this one. // nextJump=0: end of this channel's batch list // nextJump>0: batches[thisIndex+nextJump] is next batch in this list uint32_t nextJump:14, nextExtends:1; uint32_t workType:2, funcId:15; }; // Unioning bitfields with underlying type hints compiler to emit the best // SASS LD/ST accesses. 
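// (Worked example for the CBD helpers above, illustrative: ncclProtoGrainSize()
//  evaluates to 16B for LL, 1920B for LL128 and 512B for SIMPLE. With float
//  elements and the SIMPLE protocol, eltPerGrain = 512/4 = 128, so the
//  chunkGrains* bitfields express chunk sizes in units of 128 elements; a
//  middle channel gets countMid elements starting at
//  countLo + (channelId - channelLo - 1)*countMid, exactly as
//  ncclCollCbdPart() computes.)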
uint32_t flags; }; // Rolling offset in fifo where this batch's work structs begin uint32_t offsetBase; // Set of relative offsets from offsetBase for this channel's subset of the batch: // For each bit index i in offsetMask, find work at fifo offset: offsetBase + i*sizeof(WorkStructType) uint64_t offsetBitset; }; struct ncclDevChannelPeer { // Stripped version of ncclChannelPeer where we only keep the ncclConnInfo // instead of the full ncclConnector. struct ncclConnInfo send[NCCL_MAX_CONNS]; struct ncclConnInfo recv[NCCL_MAX_CONNS]; }; struct alignas(16) ncclDevChannel { struct ncclDevChannelPeer** peers; struct ncclRing ring; struct ncclTree tree; struct ncclTree collnetChain; struct ncclDirect collnetDirect; struct ncclNvls nvls; uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed }; struct ncclDevComm { int rank; int nRanks; int node; int nNodes; int buffSizes[NCCL_NUM_PROTOCOLS]; int p2pChunkSize; // Work fifo return credits uint32_t* workConsumed/*[MAXCHANNELS]*/; int* collNetDenseToUserRank; // Flag to ask NCCL kernels to abort volatile uint32_t* abortFlag; // Channels, device side struct ncclDevChannel* channels/*[MAXCHANNELS]*/; }; struct alignas(16) ncclDevCommAndChannels { struct ncclDevComm comm; struct ncclDevChannel channels[MAXCHANNELS]; }; enum ncclDevWorkStorageType: uint8_t { ncclDevWorkStorageTypeArgs=0, ncclDevWorkStorageTypeFifo=1, ncclDevWorkStorageTypePersistent=2 }; struct alignas(16) ncclDevKernelArgs { struct ncclDevComm* comm; uint64_t channelMask; enum ncclDevWorkStorageType workStorageType; uint32_t workMask; void* workBuf; // A channel's first batch is at `blockIdx.x`. Use `nextJump` to follow rest of list. // struct ncclDevWorkBatch batches[]; }; __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { //return (cudaArch < 700 || cudaDriver < 12010) ? 4<<10 : (32<<10)-4; return 4<<10; } template struct alignas(16) ncclDevKernelArgsStorage { union { struct ncclDevKernelArgs args; ulong2 storage[capacity/sizeof(ulong2)]; }; }; typedef ncclDevKernelArgsStorage<(4<<10)> ncclDevKernelArgs4K; //typedef ncclDevKernelArgsStorage<(32<<10)-4> ncclDevKernelArgs31K; template __host__ __device__ constexpr T min_constexpr(T a) { return a; } template __host__ __device__ constexpr T min_constexpr(T a, T b, Ts ...c) { return min_constexpr((a < b ? a : b), c...); } template __host__ __device__ constexpr T max_constexpr(T a) { return a; } template __host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) { return max_constexpr((a > b ? a : b), c...); } constexpr int ncclDevMaxChannelsForArgsBytes(size_t argsBytes) { return min_constexpr(MAXCHANNELS, (argsBytes - sizeof(struct ncclDevKernelArgs))/sizeof(struct ncclDevWorkBatch)); } // Calculate the unroll factor given: // * bytePerPack: number of bytes accessed per instruction // * insns: max permissible unroll value // * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack) __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) { return min_constexpr(insns, (bytes + bytePerPack-1)/bytePerPack); } // Note that all unroll value logic should depend on a given cudaArch argument // and not __CUDA_ARCH__ since these need to be host-side executable where the // arch value is strictly runtime only. By defaulting to NCCL_CUDA_ARCH, device // side code can elide passing the arch for brevity. 
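// Illustrative sanity checks (not in the original header; they only restate the
// arithmetic of the helpers above). With the default 4KB kernel-argument budget,
// the number of per-channel batch headers that fit is capped at MAXCHANNELS, and
// ncclCalcUnroll() is simply "in-flight bytes / bytes per instruction" clamped to
// the instruction budget:
static_assert(ncclDevMaxChannelsForArgsBytes(4<<10) <= MAXCHANNELS,
              "batch headers in a 4KB arg block never exceed MAXCHANNELS");
static_assert(ncclCalcUnroll(/*bytePerPack=*/16, /*insns=*/16, /*bytes=*/64) == 4,
              "64 in-flight bytes at 16B per instruction unrolls 4x");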
__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) { // Our collective unroll should move to the same bytes&insns model as NVLS. return cudaArch >= 800 ? 8 : 4; } __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } __host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_ARCH) { return ncclCalcUnroll(bytePerPack, ncclNvlsUnrollInsns(cudaArch), ncclNvlsUnrollBytes(cudaArch)); } // The amount of dynamic shmem per warp __host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) { return (max_constexpr( /*LL */0, /*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t), /*SIMPLE*/(ncclCollUnroll(cudaArch)*WARP_SIZE + 1)*16, // NVLS needs an extra 16B to read unaligned data. /*NVLS */WARP_SIZE*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16 ) + 15) & -16; // pad to 16 bytes } // The amount of dynamic shmem per block __host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) { return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE); } // Host-side table of kernel function pointers. extern int const ncclDevKernelCount; extern void* const ncclDevKernelList[/*ncclDevKernelCount*/]; // Table of most specialized kernel function to run given func index. extern int const ncclDevFuncIdCount; extern int const ncclDevFuncRowToId[]; extern void* const ncclDevKernelForFunc[/*funcIndex*/]; extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/]; // Launch a one-rank reduction on stream. ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t type, cudaStream_t stream); // `ncclNvlsSupported()` needs to be in sync with "func_valid" in "src/device/generate.py" inline bool ncclNvlsSupported(int devRedOp, int type) { switch (type) { case ncclInt32: case ncclUint32: case ncclInt64: case ncclUint64: case ncclFloat16: #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: #endif return devRedOp == ncclDevSum || devRedOp == ncclDevMinMax; case ncclFloat: case ncclDouble: return devRedOp == ncclDevSum; default: return false; } } // `ncclDevFuncIndex()` needs to be in sync with "all_functions()" in "src/device/generate.py" inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) { #if defined(__CUDA_BF16_TYPES_EXIST__) constexpr int NumTypes = ncclNumTypes; #else constexpr int NumTypes = ncclNumTypes + 1; #endif int row; do { row = 0; // ncclDevFuncIndex_P2p if (coll == ncclFuncSendRecv) break; row += 1; int nAlgos = 3; if (coll == ncclFuncAllGather) { int algo1 = algo == NCCL_ALGO_RING ? 0 : algo == NCCL_ALGO_COLLNET_DIRECT ? 
1 : /*algo == NCCL_ALGO_NVLS*/ 2; row += algo1*NCCL_NUM_PROTOCOLS + proto; break; } row += nAlgos*NCCL_NUM_PROTOCOLS; nAlgos = 1; if (coll == ncclFuncBroadcast) { row += proto; break; } row += nAlgos*NCCL_NUM_PROTOCOLS; nAlgos = NCCL_NUM_ALGORITHMS; if (coll == ncclFuncAllReduce) { row += ((devRedOp*NumTypes + type)*nAlgos + algo)*NCCL_NUM_PROTOCOLS + proto; break; } row += ncclNumDevRedOps*NumTypes*nAlgos*NCCL_NUM_PROTOCOLS; nAlgos = 1; if (coll == ncclFuncReduce) { row += (devRedOp*NumTypes + type)*NCCL_NUM_PROTOCOLS + proto; break; } row += ncclNumDevRedOps*NumTypes*nAlgos*NCCL_NUM_PROTOCOLS; nAlgos = 3; if (coll == ncclFuncReduceScatter) { int algo1 = algo == NCCL_ALGO_RING ? 0 : algo == NCCL_ALGO_COLLNET_DIRECT ? 1 : /*algo == NCCL_ALGO_NVLS*/ 2; row += ((devRedOp*NumTypes + type)*nAlgos + algo1)*NCCL_NUM_PROTOCOLS + proto; break; } row += ncclNumDevRedOps*NumTypes*nAlgos*NCCL_NUM_PROTOCOLS; } while (false); return ncclDevFuncRowToId[row]; } inline int ncclDevFuncId_P2p() { return ncclDevFuncRowToId[0]; } #endif nccl-2.22.3-1/src/include/enqueue.h000066400000000000000000000023441463451655400167270ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_ENQUEUE_H_ #define NCCL_ENQUEUE_H_ #include "comm.h" #include "group.h" #include "collectives.h" #include "utils.h" #define NCCL_LL_ALIGNMENT_PER_THREAD sizeof(uint64_t) #define NCCL_LL128_ALIGNMENT_PER_WARP 480 #define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL) #define NCCL_BYTES_ALIGNMENT 16 ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchFinish(struct ncclComm* comm); ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo); #endif // End include guard nccl-2.22.3-1/src/include/gdrwrap.h000066400000000000000000000210731463451655400167260ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_GDRWRAP_H_ #define NCCL_GDRWRAP_H_ #include "nccl.h" #include "alloc.h" #include // for standard [u]intX_t types #include #include // These can be used if the GDR library isn't thread safe #include extern pthread_mutex_t gdrLock; #define GDRLOCK() pthread_mutex_lock(&gdrLock) #define GDRUNLOCK() pthread_mutex_unlock(&gdrLock) #define GDRLOCKCALL(cmd, ret) do { \ GDRLOCK(); \ ret = cmd; \ GDRUNLOCK(); \ } while(false) #define GDRCHECK(cmd) do { \ int e; \ /* GDRLOCKCALL(cmd, e); */ \ e = cmd; \ if( e != 0 ) { \ WARN("GDRCOPY failure %d", e); \ return ncclSystemError; \ } \ } while(false) // This is required as the GDR memory is mapped WC #if !defined(__NVCC__) #if defined(__PPC__) static inline void wc_store_fence(void) { asm volatile("sync") ; } #elif defined(__x86_64__) #include static inline void wc_store_fence(void) { _mm_sfence(); } #elif defined(__aarch64__) #ifdef __cplusplus #include static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); } #else #include static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); } #endif #endif #endif //#define GDR_DIRECT 1 #ifdef GDR_DIRECT // Call the GDR API library code directly rather than via // dlopen() wrappers #include static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; } static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; } static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; } static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) { GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle)); return ncclSuccess; } static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) { GDRCHECK(gdr_unpin_buffer(g, handle)); return ncclSuccess; } static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) { GDRCHECK(gdr_get_info(g, handle, info)); return ncclSuccess; } static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) { GDRCHECK(gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size)); return ncclSuccess; } static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) { GDRCHECK(gdr_unmap(gdr_t g, gdr_mh_t handle, void **va, size_t size)); return ncclSuccess; } static void wrap_gdr_runtime_get_version(int *major, int *minor) { gdr_runtime_get_version(major, minor); return ncclSuccess; } static void wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) { gdr_driver_get_version(g, major, minor); return ncclSuccess; } static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) { GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size)); return ncclSuccess; } static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) { GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size)); return ncclSuccess; } #else // Dynamically handle dependency the GDR API library /* Extracted from gdrapi.h (v2.1 Nov 2020) */ #define GPU_PAGE_SHIFT 16 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT) #define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1) #define GPU_PAGE_MASK (~GPU_PAGE_OFFSET) struct gdr; typedef struct gdr *gdr_t; typedef struct gdr_mh_s { unsigned long h; } gdr_mh_t; struct gdr_info { uint64_t va; uint64_t mapped_size; 
uint32_t page_size; uint64_t tm_cycles; uint32_t cycles_per_ms; unsigned mapped:1; unsigned wc_mapping:1; }; typedef struct gdr_info gdr_info_t; /* End of gdrapi.h */ ncclResult_t wrap_gdr_symbols(void); gdr_t wrap_gdr_open(void); ncclResult_t wrap_gdr_close(gdr_t g); ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle); ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle); ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info); ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size); ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size); ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor); ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor); ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size); ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size); #endif // GDR_DIRECT // Global GDR driver handle extern gdr_t ncclGdrCopy; #include "alloc.h" typedef struct gdr_mem_desc { void *gdrDevMem; void *gdrMap; size_t gdrOffset; size_t gdrMapSize; gdr_mh_t gdrMh; } gdr_mem_desc_t; static gdr_t ncclGdrInit() { int libMajor, libMinor, drvMajor, drvMinor; gdr_t handle = NULL; // Dynamically load the GDRAPI library symbols if (wrap_gdr_symbols() == ncclSuccess) { handle = wrap_gdr_open(); if (handle != NULL) { ncclResult_t res; // Query the version of libgdrapi NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error); // Query the version of gdrdrv driver NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error); // Only support GDRAPI 2.1 and later if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) { goto error; } else INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor); } } return handle; error: if (handle != NULL) (void) wrap_gdr_close(handle); return NULL; } template static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) { gdr_info_t info; size_t mapSize; gdr_mh_t mh; char *devMem; void *gdrMap; mapSize = ncclSizeOfT()*nelem; // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE ALIGN_SIZE(mapSize, GPU_PAGE_SIZE); // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1)); uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK; size_t align = alignedAddr - (uint64_t)devMem; //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zu size %zu", alignedAddr, devMem, align, mapSize); NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh)); NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize)); //TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap); NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info)); // Will offset ever be non zero ? 
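// (Expected answer: normally no. gdr_get_info() reports the GPU-page-aligned base
//  VA of the pinned range in info.va, and alignedAddr was already rounded up to a
//  GPU_PAGE_SIZE boundary above, so `off` should come out as 0; it is still folded
//  into gdrOffset below as a safety net in case the mapping base ever differs.)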
ssize_t off = info.va - alignedAddr; gdr_mem_desc_t* md; NCCLCHECK(ncclCalloc(&md, 1)); md->gdrDevMem = devMem; md->gdrMap = gdrMap; md->gdrMapSize = mapSize; md->gdrOffset = off+align; md->gdrMh = mh; *gdrHandle = md; *ptr = (T *)((char *)gdrMap+off); if (devPtr) *devPtr = (T *)(devMem+off+align); TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zu at %p", md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr); return ncclSuccess; } template static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) { gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*ncclSizeOfT())); return ncclSuccess; } static ncclResult_t ncclGdrCudaFree(void* gdrHandle) { gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize)); NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh)); NCCLCHECK(ncclCudaFree(md->gdrDevMem)); free(md); return ncclSuccess; } #endif // End include guard nccl-2.22.3-1/src/include/graph.h000066400000000000000000000127511463451655400163640ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_GRAPH_H_ #define NCCL_GRAPH_H_ #include "nccl.h" #include "device.h" #include #include #include #include #include ncclResult_t ncclTopoCudaPath(int cudaDev, char** path); struct ncclTopoSystem; // Build the topology ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system); ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm); void ncclTopoFree(struct ncclTopoSystem* system); ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); int ncclTopoPathAllNVLink(struct ncclTopoSystem* system); ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank); ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret); ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int64_t netId, int read, int* useGdr); ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush); ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net); int ncclPxnDisable(struct ncclComm* comm); ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); // Find CPU affinity ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); #define NCCL_TOPO_CPU_ARCH_X86 1 #define NCCL_TOPO_CPU_ARCH_POWER 2 #define NCCL_TOPO_CPU_ARCH_ARM 3 #define NCCL_TOPO_CPU_ARCH_MIXED 4 #define 
NCCL_TOPO_CPU_VENDOR_INTEL 1 #define NCCL_TOPO_CPU_VENDOR_AMD 2 #define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 #define NCCL_TOPO_CPU_VENDOR_MIXED 4 #define NCCL_TOPO_CPU_TYPE_BDW 1 #define NCCL_TOPO_CPU_TYPE_SKL 2 #define NCCL_TOPO_CPU_TYPE_YONGFENG 1 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev); ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex); ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count); #define NCCL_TOPO_MAX_NODES 256 // Init search. Needs to be done before calling ncclTopoCompute ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); #define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU) #define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU) #define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU #define NCCL_TOPO_PATTERN_RING 4 // Ring #define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree #define NCCL_TOPO_PATTERN_COLLNET_DIRECT 6 // Collnet Direct struct ncclTopoGraph { // Input / output int id; // ring : 0, tree : 1, collnet : 2 int pattern; int crossNic; int collNet; int minChannels; int maxChannels; // Output int nChannels; float bwIntra; float bwInter; float latencyInter; int typeIntra; int typeInter; int sameChannels; int nHops; int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; int64_t inter[MAXCHANNELS*2]; }; ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs); struct ncclTopoRanks { int ringRecv[MAXCHANNELS]; int ringSend[MAXCHANNELS]; int ringPrev[MAXCHANNELS]; int ringNext[MAXCHANNELS]; int treeToParent[MAXCHANNELS]; int treeToChild0[MAXCHANNELS]; int treeToChild1[MAXCHANNELS]; int nvlsHeads[MAXCHANNELS]; int nvlsHeadNum; }; ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks); ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent); ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup=nullptr); #endif nccl-2.22.3-1/src/include/group.h000066400000000000000000000113231463451655400164110ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_GROUP_H_ #define NCCL_GROUP_H_ #include "nccl.h" #include "comm.h" ncclResult_t ncclGroupErrCheck(ncclResult_t ret); void ncclGroupCommJoin(struct ncclComm* comm); void ncclGroupCommPreconnect(struct ncclComm* comm); ncclResult_t ncclGroupCommLeave(struct ncclComm* comm); ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob); ncclResult_t ncclGroupJobComplete(struct ncclGroupJob *groupJob); typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); typedef enum ncclGroupJobState { ncclGroupJobRunning = 0, ncclGroupJobDone = 1, ncclGroupJobJoined = 2, } ncclGroupJobState_t; struct ncclAsyncJob { struct ncclAsyncJob* next; pthread_t thread; ncclResult_t result; ncclResult_t(*func)(struct ncclAsyncJob*); void(*undo)(struct ncclAsyncJob*); void(*destructor)(void*); ncclGroupJobState_t state; uint32_t* abortFlag; /* point to comm abortFlag */ uint32_t* abortFlagDev; /* point to comm abortFlagDev */ uint32_t* childAbortFlag; /* point to child abortFlag */ uint32_t* childAbortFlagDev; /* point to child abortFlagDev */ ncclComm_t comm; int destroyFlag; }; ncclResult_t ncclAsyncLaunch( struct ncclAsyncJob* job, ncclResult_t(*func)(struct ncclAsyncJob*), void(*undo)(struct ncclAsyncJob*), void(*destructor)(void*), ncclComm_t comm ); struct ncclGroupJob { struct ncclAsyncJob base; struct ncclComm **groupCommHeadPtr; struct ncclComm **groupCommPreconnectHeadPtr; ncclResult_t *groupErrorPtr; bool *abortFlagPtr; int *groupBlockingPtr; struct ncclIntruQueue *asyncJobsPtr; bool initialized; }; ncclResult_t ncclGroupStartInternal(); ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo = NULL); ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job); //////////////////////////////////////////////////////////////////////////////// extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting extern __thread ncclResult_t ncclGroupError; extern __thread struct ncclComm* ncclGroupCommHead; extern __thread struct ncclComm* ncclGroupCommPreconnectHead; extern __thread int ncclGroupBlocking; extern __thread struct ncclGroupJob *ncclGroupJobMainPtr; extern __thread struct ncclGroupJob ncclGroupJobMain; static inline void groupResetJobState() { ncclGroupBlocking = -1; ncclGroupJobMainPtr = NULL; memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob)); return; } static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) { ncclResult_t ret = ncclSuccess; if (job) { ret = ncclAsyncJobComplete(&job->base); groupResetJobState(); } return ret; } inline ncclResult_t ncclGroupStartInternal() { ncclGroupDepth++; return ncclSuccess; } inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { if (ncclGroupDepth > 0) { if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret; } return ret; } // Add comm to this thread's group inline void ncclGroupCommJoin(struct ncclComm* comm) { if (comm->groupNext == reinterpret_cast(0x1)) { // Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves // the users program order yet insures siblings occur consecutively. This // is required by doLaunches() in "group.cc". 
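// (For context, illustrative only: this is the list being built when an
//  application batches several communicators in one group, e.g.
//
//    ncclGroupStart();
//    for (int i = 0; i < nDev; i++)
//      ncclAllReduce(sendbuf[i], recvbuf[i], count, ncclFloat, ncclSum,
//                    comms[i], streams[i]);
//    ncclGroupEnd();
//
//  Comms sharing the same intraComm0 are siblings and must end up adjacent in
//  ncclGroupCommHead; the insertion below walks the list to find that spot while
//  otherwise keeping the user's program order.)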
struct ncclComm** pp = &ncclGroupCommHead; while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) pp = &(*pp)->groupNext; comm->groupNext = *pp; *pp = comm; // Comms gets a new memory stack scope upon joining. Each task batched for // this comm is allocated there. ncclMemoryStackPush(&comm->memScoped); // Initialize planner ncclKernelPlanner::Peer* tmp = comm->planner.peers; memset(&comm->planner, 0, sizeof(comm->planner)); comm->planner.peers = tmp; } ncclGroupBlocking = comm->config.blocking; } // Add comm to this thread's group needing preconnect inline void ncclGroupCommPreconnect(struct ncclComm* comm) { if (comm->preconnectNext == reinterpret_cast(0x1)) { comm->preconnectNext = ncclGroupCommPreconnectHead; ncclGroupCommPreconnectHead = comm; } } // Comm has left group inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) { comm->groupNext = reinterpret_cast(0x1); ncclMemoryStackPop(&comm->memScoped); return ncclSuccess; } #endif nccl-2.22.3-1/src/include/ibvcore.h000066400000000000000000000617711463451655400167220ustar00rootroot00000000000000#ifndef NCCL_IBV_CORE_H_ #define NCCL_IBV_CORE_H_ /* Basic IB verbs structs. Needed to dynamically load IB verbs functions without * explicit including of IB verbs header. */ #include #include #include #include #if __GNUC__ >= 3 # define __attribute_const __attribute__((const)) #else # define __attribute_const #endif union ibv_gid { uint8_t raw[16]; struct { uint64_t subnet_prefix; uint64_t interface_id; } global; }; #ifndef container_of /** * container_of - cast a member of a structure out to the containing structure * @ptr: the pointer to the member. * @type: the type of the container struct this is embedded in. * @member: the name of the member within the struct. * */ #define container_of(ptr, type, member) \ ((type *) ((uint8_t *)(ptr) - offsetof(type, member))) #endif #define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz)) /*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ //static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1; enum ibv_node_type { IBV_NODE_UNKNOWN = -1, IBV_NODE_CA = 1, IBV_NODE_SWITCH, IBV_NODE_ROUTER, IBV_NODE_RNIC, /* Leave a gap for future node types before starting with * experimental node types. */ IBV_EXP_NODE_TYPE_START = 32, IBV_EXP_NODE_MIC = IBV_EXP_NODE_TYPE_START }; enum ibv_transport_type { IBV_TRANSPORT_UNKNOWN = -1, IBV_TRANSPORT_IB = 0, IBV_TRANSPORT_IWARP, /* Leave a gap for future transport types before starting with * experimental transport types. 
*/ IBV_EXP_TRANSPORT_TYPE_START = 32, IBV_EXP_TRANSPORT_SCIF = IBV_EXP_TRANSPORT_TYPE_START }; enum ibv_device_cap_flags { IBV_DEVICE_RESIZE_MAX_WR = 1, IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1, IBV_DEVICE_BAD_QKEY_CNTR = 1 << 2, IBV_DEVICE_RAW_MULTI = 1 << 3, IBV_DEVICE_AUTO_PATH_MIG = 1 << 4, IBV_DEVICE_CHANGE_PHY_PORT = 1 << 5, IBV_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6, IBV_DEVICE_CURR_QP_STATE_MOD = 1 << 7, IBV_DEVICE_SHUTDOWN_PORT = 1 << 8, IBV_DEVICE_INIT_TYPE = 1 << 9, IBV_DEVICE_PORT_ACTIVE_EVENT = 1 << 10, IBV_DEVICE_SYS_IMAGE_GUID = 1 << 11, IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12, IBV_DEVICE_SRQ_RESIZE = 1 << 13, IBV_DEVICE_N_NOTIFY_CQ = 1 << 14, IBV_DEVICE_XRC = 1 << 20, IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29 }; enum ibv_atomic_cap { IBV_ATOMIC_NONE, IBV_ATOMIC_HCA, IBV_ATOMIC_GLOB }; struct ibv_device_attr { char fw_ver[64]; uint64_t node_guid; uint64_t sys_image_guid; uint64_t max_mr_size; uint64_t page_size_cap; uint32_t vendor_id; uint32_t vendor_part_id; uint32_t hw_ver; int max_qp; int max_qp_wr; int device_cap_flags; int max_sge; int max_sge_rd; int max_cq; int max_cqe; int max_mr; int max_pd; int max_qp_rd_atom; int max_ee_rd_atom; int max_res_rd_atom; int max_qp_init_rd_atom; int max_ee_init_rd_atom; enum ibv_atomic_cap atomic_cap; int max_ee; int max_rdd; int max_mw; int max_raw_ipv6_qp; int max_raw_ethy_qp; int max_mcast_grp; int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; int max_fmr; int max_map_per_fmr; int max_srq; int max_srq_wr; int max_srq_sge; uint16_t max_pkeys; uint8_t local_ca_ack_delay; uint8_t phys_port_cnt; }; enum ibv_mtu { IBV_MTU_256 = 1, IBV_MTU_512 = 2, IBV_MTU_1024 = 3, IBV_MTU_2048 = 4, IBV_MTU_4096 = 5 }; enum ibv_port_state { IBV_PORT_NOP = 0, IBV_PORT_DOWN = 1, IBV_PORT_INIT = 2, IBV_PORT_ARMED = 3, IBV_PORT_ACTIVE = 4, IBV_PORT_ACTIVE_DEFER = 5 }; enum { IBV_LINK_LAYER_UNSPECIFIED, IBV_LINK_LAYER_INFINIBAND, IBV_LINK_LAYER_ETHERNET, /* Leave a gap for future link layer types before starting with * experimental link layer. 
*/ IBV_EXP_LINK_LAYER_START = 32, IBV_EXP_LINK_LAYER_SCIF = IBV_EXP_LINK_LAYER_START }; enum ibv_port_cap_flags { IBV_PORT_SM = 1 << 1, IBV_PORT_NOTICE_SUP = 1 << 2, IBV_PORT_TRAP_SUP = 1 << 3, IBV_PORT_OPT_IPD_SUP = 1 << 4, IBV_PORT_AUTO_MIGR_SUP = 1 << 5, IBV_PORT_SL_MAP_SUP = 1 << 6, IBV_PORT_MKEY_NVRAM = 1 << 7, IBV_PORT_PKEY_NVRAM = 1 << 8, IBV_PORT_LED_INFO_SUP = 1 << 9, IBV_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, IBV_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, IBV_PORT_CM_SUP = 1 << 16, IBV_PORT_SNMP_TUNNEL_SUP = 1 << 17, IBV_PORT_REINIT_SUP = 1 << 18, IBV_PORT_DEVICE_MGMT_SUP = 1 << 19, IBV_PORT_VENDOR_CLASS = 1 << 24, IBV_PORT_CLIENT_REG_SUP = 1 << 25, IBV_PORT_IP_BASED_GIDS = 1 << 26, }; struct ibv_port_attr { enum ibv_port_state state; enum ibv_mtu max_mtu; enum ibv_mtu active_mtu; int gid_tbl_len; uint32_t port_cap_flags; uint32_t max_msg_sz; uint32_t bad_pkey_cntr; uint32_t qkey_viol_cntr; uint16_t pkey_tbl_len; uint16_t lid; uint16_t sm_lid; uint8_t lmc; uint8_t max_vl_num; uint8_t sm_sl; uint8_t subnet_timeout; uint8_t init_type_reply; uint8_t active_width; uint8_t active_speed; uint8_t phys_state; uint8_t link_layer; uint8_t reserved; }; enum ibv_event_type { IBV_EVENT_CQ_ERR, IBV_EVENT_QP_FATAL, IBV_EVENT_QP_REQ_ERR, IBV_EVENT_QP_ACCESS_ERR, IBV_EVENT_COMM_EST, IBV_EVENT_SQ_DRAINED, IBV_EVENT_PATH_MIG, IBV_EVENT_PATH_MIG_ERR, IBV_EVENT_DEVICE_FATAL, IBV_EVENT_PORT_ACTIVE, IBV_EVENT_PORT_ERR, IBV_EVENT_LID_CHANGE, IBV_EVENT_PKEY_CHANGE, IBV_EVENT_SM_CHANGE, IBV_EVENT_SRQ_ERR, IBV_EVENT_SRQ_LIMIT_REACHED, IBV_EVENT_QP_LAST_WQE_REACHED, IBV_EVENT_CLIENT_REREGISTER, IBV_EVENT_GID_CHANGE, /* new experimental events start here leaving enough * room for 14 events which should be enough */ IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32, IBV_EXP_EVENT_DCT_ACCESS_ERR, IBV_EXP_EVENT_DCT_REQ_ERR, }; struct ibv_async_event { union { struct ibv_cq *cq; struct ibv_qp *qp; struct ibv_srq *srq; struct ibv_exp_dct *dct; int port_num; /* For source compatible with Legacy API */ uint32_t xrc_qp_num; } element; enum ibv_event_type event_type; }; enum ibv_wc_status { IBV_WC_SUCCESS, IBV_WC_LOC_LEN_ERR, IBV_WC_LOC_QP_OP_ERR, IBV_WC_LOC_EEC_OP_ERR, IBV_WC_LOC_PROT_ERR, IBV_WC_WR_FLUSH_ERR, IBV_WC_MW_BIND_ERR, IBV_WC_BAD_RESP_ERR, IBV_WC_LOC_ACCESS_ERR, IBV_WC_REM_INV_REQ_ERR, IBV_WC_REM_ACCESS_ERR, IBV_WC_REM_OP_ERR, IBV_WC_RETRY_EXC_ERR, IBV_WC_RNR_RETRY_EXC_ERR, IBV_WC_LOC_RDD_VIOL_ERR, IBV_WC_REM_INV_RD_REQ_ERR, IBV_WC_REM_ABORT_ERR, IBV_WC_INV_EECN_ERR, IBV_WC_INV_EEC_STATE_ERR, IBV_WC_FATAL_ERR, IBV_WC_RESP_TIMEOUT_ERR, IBV_WC_GENERAL_ERR }; const char *ibv_wc_status_str(enum ibv_wc_status status); enum ibv_wc_opcode { IBV_WC_SEND, IBV_WC_RDMA_WRITE, IBV_WC_RDMA_READ, IBV_WC_COMP_SWAP, IBV_WC_FETCH_ADD, IBV_WC_BIND_MW, /* * Set value of IBV_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IBV_WC_RECV). 
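 * For example, a completion-polling loop can classify work completions with
 *   if (wc[i].opcode & IBV_WC_RECV) { ... }
 * instead of enumerating every receive opcode explicitly.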
*/ IBV_WC_RECV = 1 << 7, IBV_WC_RECV_RDMA_WITH_IMM }; enum ibv_wc_flags { IBV_WC_GRH = 1 << 0, IBV_WC_WITH_IMM = 1 << 1 }; struct ibv_wc { uint64_t wr_id; enum ibv_wc_status status; enum ibv_wc_opcode opcode; uint32_t vendor_err; uint32_t byte_len; uint32_t imm_data; /* in network byte order */ uint32_t qp_num; uint32_t src_qp; int wc_flags; uint16_t pkey_index; uint16_t slid; uint8_t sl; uint8_t dlid_path_bits; }; enum ibv_access_flags { IBV_ACCESS_LOCAL_WRITE = 1, IBV_ACCESS_REMOTE_WRITE = (1<<1), IBV_ACCESS_REMOTE_READ = (1<<2), IBV_ACCESS_REMOTE_ATOMIC = (1<<3), IBV_ACCESS_MW_BIND = (1<<4), IBV_ACCESS_RELAXED_ORDERING = (1<<20), }; struct ibv_pd { struct ibv_context *context; uint32_t handle; }; enum ibv_xrcd_init_attr_mask { IBV_XRCD_INIT_ATTR_FD = 1 << 0, IBV_XRCD_INIT_ATTR_OFLAGS = 1 << 1, IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2 }; struct ibv_xrcd_init_attr { uint32_t comp_mask; int fd; int oflags; }; struct ibv_xrcd { struct ibv_context *context; }; enum ibv_rereg_mr_flags { IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), IBV_REREG_MR_CHANGE_PD = (1 << 1), IBV_REREG_MR_CHANGE_ACCESS = (1 << 2), IBV_REREG_MR_KEEP_VALID = (1 << 3) }; struct ibv_mr { struct ibv_context *context; struct ibv_pd *pd; void *addr; size_t length; uint32_t handle; uint32_t lkey; uint32_t rkey; }; enum ibv_mw_type { IBV_MW_TYPE_1 = 1, IBV_MW_TYPE_2 = 2 }; struct ibv_mw { struct ibv_context *context; struct ibv_pd *pd; uint32_t rkey; }; struct ibv_global_route { union ibv_gid dgid; uint32_t flow_label; uint8_t sgid_index; uint8_t hop_limit; uint8_t traffic_class; }; struct ibv_grh { uint32_t version_tclass_flow; uint16_t paylen; uint8_t next_hdr; uint8_t hop_limit; union ibv_gid sgid; union ibv_gid dgid; }; enum ibv_rate { IBV_RATE_MAX = 0, IBV_RATE_2_5_GBPS = 2, IBV_RATE_5_GBPS = 5, IBV_RATE_10_GBPS = 3, IBV_RATE_20_GBPS = 6, IBV_RATE_30_GBPS = 4, IBV_RATE_40_GBPS = 7, IBV_RATE_60_GBPS = 8, IBV_RATE_80_GBPS = 9, IBV_RATE_120_GBPS = 10, IBV_RATE_14_GBPS = 11, IBV_RATE_56_GBPS = 12, IBV_RATE_112_GBPS = 13, IBV_RATE_168_GBPS = 14, IBV_RATE_25_GBPS = 15, IBV_RATE_100_GBPS = 16, IBV_RATE_200_GBPS = 17, IBV_RATE_300_GBPS = 18 }; /** * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the * base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. * @rate: rate to convert. */ int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const; /** * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum. * @mult: multiple to convert. */ enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const; /** * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec. * For example, IBV_RATE_5_GBPS will return the value 5000. * @rate: rate to convert. */ int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const; /** * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum. * @mbps: value to convert. 
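 * For example, mbps_to_ibv_rate(5000) returns IBV_RATE_5_GBPS, matching
 * ibv_rate_to_mbps(IBV_RATE_5_GBPS) == 5000.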
*/ enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const; struct ibv_ah_attr { struct ibv_global_route grh; uint16_t dlid; uint8_t sl; uint8_t src_path_bits; uint8_t static_rate; uint8_t is_global; uint8_t port_num; }; enum ibv_srq_attr_mask { IBV_SRQ_MAX_WR = 1 << 0, IBV_SRQ_LIMIT = 1 << 1 }; struct ibv_srq_attr { uint32_t max_wr; uint32_t max_sge; uint32_t srq_limit; }; struct ibv_srq_init_attr { void *srq_context; struct ibv_srq_attr attr; }; enum ibv_srq_type { IBV_SRQT_BASIC, IBV_SRQT_XRC }; enum ibv_srq_init_attr_mask { IBV_SRQ_INIT_ATTR_TYPE = 1 << 0, IBV_SRQ_INIT_ATTR_PD = 1 << 1, IBV_SRQ_INIT_ATTR_XRCD = 1 << 2, IBV_SRQ_INIT_ATTR_CQ = 1 << 3, IBV_SRQ_INIT_ATTR_RESERVED = 1 << 4 }; struct ibv_srq_init_attr_ex { void *srq_context; struct ibv_srq_attr attr; uint32_t comp_mask; enum ibv_srq_type srq_type; struct ibv_pd *pd; struct ibv_xrcd *xrcd; struct ibv_cq *cq; }; enum ibv_qp_type { IBV_QPT_RC = 2, IBV_QPT_UC, IBV_QPT_UD, /* XRC compatible code */ IBV_QPT_XRC, IBV_QPT_RAW_PACKET = 8, IBV_QPT_RAW_ETH = 8, IBV_QPT_XRC_SEND = 9, IBV_QPT_XRC_RECV, /* Leave a gap for future qp types before starting with * experimental qp types. */ IBV_EXP_QP_TYPE_START = 32, IBV_EXP_QPT_DC_INI = IBV_EXP_QP_TYPE_START }; struct ibv_qp_cap { uint32_t max_send_wr; uint32_t max_recv_wr; uint32_t max_send_sge; uint32_t max_recv_sge; uint32_t max_inline_data; }; struct ibv_qp_init_attr { void *qp_context; struct ibv_cq *send_cq; struct ibv_cq *recv_cq; struct ibv_srq *srq; struct ibv_qp_cap cap; enum ibv_qp_type qp_type; int sq_sig_all; /* Below is needed for backwards compatabile */ struct ibv_xrc_domain *xrc_domain; }; enum ibv_qp_init_attr_mask { IBV_QP_INIT_ATTR_PD = 1 << 0, IBV_QP_INIT_ATTR_XRCD = 1 << 1, IBV_QP_INIT_ATTR_RESERVED = 1 << 2 }; struct ibv_qp_init_attr_ex { void *qp_context; struct ibv_cq *send_cq; struct ibv_cq *recv_cq; struct ibv_srq *srq; struct ibv_qp_cap cap; enum ibv_qp_type qp_type; int sq_sig_all; uint32_t comp_mask; struct ibv_pd *pd; struct ibv_xrcd *xrcd; }; enum ibv_qp_open_attr_mask { IBV_QP_OPEN_ATTR_NUM = 1 << 0, IBV_QP_OPEN_ATTR_XRCD = 1 << 1, IBV_QP_OPEN_ATTR_CONTEXT = 1 << 2, IBV_QP_OPEN_ATTR_TYPE = 1 << 3, IBV_QP_OPEN_ATTR_RESERVED = 1 << 4 }; struct ibv_qp_open_attr { uint32_t comp_mask; uint32_t qp_num; struct ibv_xrcd *xrcd; void *qp_context; enum ibv_qp_type qp_type; }; enum ibv_qp_attr_mask { IBV_QP_STATE = 1 << 0, IBV_QP_CUR_STATE = 1 << 1, IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, IBV_QP_ACCESS_FLAGS = 1 << 3, IBV_QP_PKEY_INDEX = 1 << 4, IBV_QP_PORT = 1 << 5, IBV_QP_QKEY = 1 << 6, IBV_QP_AV = 1 << 7, IBV_QP_PATH_MTU = 1 << 8, IBV_QP_TIMEOUT = 1 << 9, IBV_QP_RETRY_CNT = 1 << 10, IBV_QP_RNR_RETRY = 1 << 11, IBV_QP_RQ_PSN = 1 << 12, IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13, IBV_QP_ALT_PATH = 1 << 14, IBV_QP_MIN_RNR_TIMER = 1 << 15, IBV_QP_SQ_PSN = 1 << 16, IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17, IBV_QP_PATH_MIG_STATE = 1 << 18, IBV_QP_CAP = 1 << 19, IBV_QP_DEST_QPN = 1 << 20 }; enum ibv_qp_state { IBV_QPS_RESET, IBV_QPS_INIT, IBV_QPS_RTR, IBV_QPS_RTS, IBV_QPS_SQD, IBV_QPS_SQE, IBV_QPS_ERR, IBV_QPS_UNKNOWN }; enum ibv_mig_state { IBV_MIG_MIGRATED, IBV_MIG_REARM, IBV_MIG_ARMED }; struct ibv_qp_attr { enum ibv_qp_state qp_state; enum ibv_qp_state cur_qp_state; enum ibv_mtu path_mtu; enum ibv_mig_state path_mig_state; uint32_t qkey; uint32_t rq_psn; uint32_t sq_psn; uint32_t dest_qp_num; int qp_access_flags; struct ibv_qp_cap cap; struct ibv_ah_attr ah_attr; struct ibv_ah_attr alt_ah_attr; uint16_t pkey_index; uint16_t alt_pkey_index; uint8_t en_sqd_async_notify; uint8_t 
sq_draining; uint8_t max_rd_atomic; uint8_t max_dest_rd_atomic; uint8_t min_rnr_timer; uint8_t port_num; uint8_t timeout; uint8_t retry_cnt; uint8_t rnr_retry; uint8_t alt_port_num; uint8_t alt_timeout; }; enum ibv_wr_opcode { IBV_WR_RDMA_WRITE, IBV_WR_RDMA_WRITE_WITH_IMM, IBV_WR_SEND, IBV_WR_SEND_WITH_IMM, IBV_WR_RDMA_READ, IBV_WR_ATOMIC_CMP_AND_SWP, IBV_WR_ATOMIC_FETCH_AND_ADD }; enum ibv_send_flags { IBV_SEND_FENCE = 1 << 0, IBV_SEND_SIGNALED = 1 << 1, IBV_SEND_SOLICITED = 1 << 2, IBV_SEND_INLINE = 1 << 3 }; struct ibv_sge { uint64_t addr; uint32_t length; uint32_t lkey; }; struct ibv_send_wr { uint64_t wr_id; struct ibv_send_wr *next; struct ibv_sge *sg_list; int num_sge; enum ibv_wr_opcode opcode; int send_flags; uint32_t imm_data; /* in network byte order */ union { struct { uint64_t remote_addr; uint32_t rkey; } rdma; struct { uint64_t remote_addr; uint64_t compare_add; uint64_t swap; uint32_t rkey; } atomic; struct { struct ibv_ah *ah; uint32_t remote_qpn; uint32_t remote_qkey; } ud; } wr; union { union { struct { uint32_t remote_srqn; } xrc; } qp_type; uint32_t xrc_remote_srq_num; }; }; struct ibv_recv_wr { uint64_t wr_id; struct ibv_recv_wr *next; struct ibv_sge *sg_list; int num_sge; }; struct ibv_mw_bind { uint64_t wr_id; struct ibv_mr *mr; void *addr; size_t length; int send_flags; int mw_access_flags; }; struct ibv_srq { struct ibv_context *context; void *srq_context; struct ibv_pd *pd; uint32_t handle; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t events_completed; /* below are for source compatabilty with legacy XRC, * padding based on ibv_srq_legacy. */ uint32_t xrc_srq_num_bin_compat_padding; struct ibv_xrc_domain *xrc_domain_bin_compat_padding; struct ibv_cq *xrc_cq_bin_compat_padding; void *ibv_srq_padding; /* legacy fields */ uint32_t xrc_srq_num; struct ibv_xrc_domain *xrc_domain; struct ibv_cq *xrc_cq; }; /* Not in use in new API, needed for compilation as part of source compat layer */ enum ibv_event_flags { IBV_XRC_QP_EVENT_FLAG = 0x80000000, }; struct ibv_qp { struct ibv_context *context; void *qp_context; struct ibv_pd *pd; struct ibv_cq *send_cq; struct ibv_cq *recv_cq; struct ibv_srq *srq; uint32_t handle; uint32_t qp_num; enum ibv_qp_state state; enum ibv_qp_type qp_type; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t events_completed; }; struct ibv_comp_channel { struct ibv_context *context; int fd; int refcnt; }; struct ibv_cq { struct ibv_context *context; struct ibv_comp_channel *channel; void *cq_context; uint32_t handle; int cqe; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t comp_events_completed; uint32_t async_events_completed; }; struct ibv_ah { struct ibv_context *context; struct ibv_pd *pd; uint32_t handle; }; enum ibv_flow_flags { IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1, IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1, }; enum ibv_flow_attr_type { /* steering according to rule specifications */ IBV_FLOW_ATTR_NORMAL = 0x0, /* default unicast and multicast rule - * receive all Eth traffic which isn't steered to any QP */ IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, /* default multicast rule - * receive all Eth multicast traffic which isn't steered to any QP */ IBV_FLOW_ATTR_MC_DEFAULT = 0x2, }; enum ibv_flow_spec_type { IBV_FLOW_SPEC_ETH = 0x20, IBV_FLOW_SPEC_IPV4 = 0x30, IBV_FLOW_SPEC_TCP = 0x40, IBV_FLOW_SPEC_UDP = 0x41, }; struct ibv_flow_eth_filter { uint8_t dst_mac[6]; uint8_t src_mac[6]; uint16_t ether_type; /* * same layout as 802.1q: prio 3, cfi 1, vlan id 12 */ uint16_t vlan_tag; }; struct ibv_flow_spec_eth { enum ibv_flow_spec_type type; 
uint16_t size; struct ibv_flow_eth_filter val; struct ibv_flow_eth_filter mask; }; struct ibv_flow_ipv4_filter { uint32_t src_ip; uint32_t dst_ip; }; struct ibv_flow_spec_ipv4 { enum ibv_flow_spec_type type; uint16_t size; struct ibv_flow_ipv4_filter val; struct ibv_flow_ipv4_filter mask; }; struct ibv_flow_tcp_udp_filter { uint16_t dst_port; uint16_t src_port; }; struct ibv_flow_spec_tcp_udp { enum ibv_flow_spec_type type; uint16_t size; struct ibv_flow_tcp_udp_filter val; struct ibv_flow_tcp_udp_filter mask; }; struct ibv_flow_spec { union { struct { enum ibv_flow_spec_type type; uint16_t size; } hdr; struct ibv_flow_spec_eth eth; struct ibv_flow_spec_ipv4 ipv4; struct ibv_flow_spec_tcp_udp tcp_udp; }; }; struct ibv_flow_attr { uint32_t comp_mask; enum ibv_flow_attr_type type; uint16_t size; uint16_t priority; uint8_t num_of_specs; uint8_t port; uint32_t flags; /* Following are the optional layers according to user request * struct ibv_flow_spec_xxx [L2] * struct ibv_flow_spec_yyy [L3/L4] */ }; struct ibv_flow { uint32_t comp_mask; struct ibv_context *context; uint32_t handle; }; struct ibv_device; struct ibv_context; struct ibv_device_ops { struct ibv_context * (*alloc_context)(struct ibv_device *device, int cmd_fd); void (*free_context)(struct ibv_context *context); }; enum { IBV_SYSFS_NAME_MAX = 64, IBV_SYSFS_PATH_MAX = 256 }; struct ibv_device { struct ibv_device_ops ops; enum ibv_node_type node_type; enum ibv_transport_type transport_type; /* Name of underlying kernel IB device, eg "mthca0" */ char name[IBV_SYSFS_NAME_MAX]; /* Name of uverbs device, eg "uverbs0" */ char dev_name[IBV_SYSFS_NAME_MAX]; /* Path to infiniband_verbs class device in sysfs */ char dev_path[IBV_SYSFS_PATH_MAX]; /* Path to infiniband class device in sysfs */ char ibdev_path[IBV_SYSFS_PATH_MAX]; }; struct verbs_device { struct ibv_device device; /* Must be first */ size_t sz; size_t size_of_context; int (*init_context)(struct verbs_device *device, struct ibv_context *ctx, int cmd_fd); void (*uninit_context)(struct verbs_device *device, struct ibv_context *ctx); /* future fields added here */ }; struct ibv_context_ops { int (*query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); int (*query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); struct ibv_pd * (*alloc_pd)(struct ibv_context *context); int (*dealloc_pd)(struct ibv_pd *pd); struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); struct ibv_mr * (*rereg_mr)(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr, size_t length, int access); int (*dereg_mr)(struct ibv_mr *mr); struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type); int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw, struct ibv_mw_bind *mw_bind); int (*dealloc_mw)(struct ibv_mw *mw); struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector); int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc); int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only); void (*cq_event)(struct ibv_cq *cq); int (*resize_cq)(struct ibv_cq *cq, int cqe); int (*destroy_cq)(struct ibv_cq *cq); struct ibv_srq * (*create_srq)(struct ibv_pd *pd, struct ibv_srq_init_attr *srq_init_attr); int (*modify_srq)(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, int srq_attr_mask); int (*query_srq)(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr); int (*destroy_srq)(struct ibv_srq *srq); int (*post_srq_recv)(struct ibv_srq *srq, 
struct ibv_recv_wr *recv_wr, struct ibv_recv_wr **bad_recv_wr); struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); int (*destroy_qp)(struct ibv_qp *qp); int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr); int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr); int (*destroy_ah)(struct ibv_ah *ah); int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); void (*async_event)(struct ibv_async_event *event); }; struct ibv_context { struct ibv_device *device; struct ibv_context_ops ops; int cmd_fd; int async_fd; int num_comp_vectors; pthread_mutex_t mutex; void *abi_compat; }; enum verbs_context_mask { VERBS_CONTEXT_XRCD = (uint64_t)1 << 0, VERBS_CONTEXT_SRQ = (uint64_t)1 << 1, VERBS_CONTEXT_QP = (uint64_t)1 << 2, VERBS_CONTEXT_RESERVED = (uint64_t)1 << 3, VERBS_CONTEXT_EXP = (uint64_t)1 << 62 }; struct verbs_context { /* "grows up" - new fields go here */ int (*_reserved_2) (void); int (*destroy_flow) (struct ibv_flow *flow); int (*_reserved_1) (void); struct ibv_flow * (*create_flow) (struct ibv_qp *qp, struct ibv_flow_attr *flow_attr); struct ibv_qp * (*open_qp)(struct ibv_context *context, struct ibv_qp_open_attr *attr); struct ibv_qp * (*create_qp_ex)(struct ibv_context *context, struct ibv_qp_init_attr_ex *qp_init_attr_ex); int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, struct ibv_srq_init_attr_ex *srq_init_attr_ex); struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, struct ibv_xrcd_init_attr *xrcd_init_attr); int (*close_xrcd)(struct ibv_xrcd *xrcd); uint64_t has_comp_mask; size_t sz; /* Must be immediately before struct ibv_context */ struct ibv_context context;/* Must be last field in the struct */ }; /*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ /*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) { return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ? NULL : container_of(ctx, struct verbs_context, context); } #define verbs_get_ctx_op(ctx, op) ({ \ struct verbs_context *_vctx = verbs_get_ctx(ctx); \ (!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \ !_vctx->op) ? NULL : _vctx; })*/ #define verbs_set_ctx_op(_vctx, op, ptr) ({ \ struct verbs_context *vctx = _vctx; \ if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \ vctx->op = ptr; }) static inline struct verbs_device *verbs_get_device(struct ibv_device *dev) { return (dev->ops.alloc_context) ? NULL : container_of(dev, struct verbs_device, device); } static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { return qp->context->ops.post_send(qp, wr, bad_wr); } struct ibv_ece { /* * Unique identifier of the provider vendor on the network. * The providers will set IEEE OUI here to distinguish * itself in non-homogenius network. */ uint32_t vendor_id; /* * Provider specific attributes which are supported or * needed to be enabled by ECE users. 
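 * These options are exposed to the transport layer through the
 * wrap_ibv_query_ece()/wrap_ibv_set_ece() wrappers declared in ibvwrap.h.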
*/ uint32_t options; uint32_t comp_mask; }; #endif // NCCL_IBV_CORE_H_ nccl-2.22.3-1/src/include/ibvsymbols.h000066400000000000000000000051521463451655400174510ustar00rootroot00000000000000#ifndef NCCL_IBV_SYMBOLS_H_ #define NCCL_IBV_SYMBOLS_H_ #ifdef NCCL_BUILD_RDMA_CORE #include #else #include "ibvcore.h" #endif #include "nccl.h" /* IB Verbs Function Pointers*/ struct ncclIbvSymbols { int (*ibv_internal_fork_init)(void); struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); void (*ibv_internal_free_device_list)(struct ibv_device **list); const char * (*ibv_internal_get_device_name)(struct ibv_device *device); struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device); int (*ibv_internal_close_device)(struct ibv_context *context); int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event); void (*ibv_internal_ack_async_event)(struct ibv_async_event *event); int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access); /* DMA-BUF support */ struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); int (*ibv_internal_destroy_qp)(struct ibv_qp *qp); const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); int (*ibv_internal_query_ece)(struct ibv_qp *qp, struct ibv_ece *ece); int (*ibv_internal_set_ece)(struct ibv_qp *qp, struct ibv_ece *ece); }; /* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */ ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols); #endif // NCCL_IBV_SYMBOLS_H_ nccl-2.22.3-1/src/include/ibvwrap.h000066400000000000000000000116371463451655400167370ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_IBVWRAP_H_ #define NCCL_IBVWRAP_H_ #ifdef NCCL_BUILD_RDMA_CORE #include #else #include "ibvcore.h" #endif #include "core.h" #include #include typedef enum ibv_return_enum { IBV_SUCCESS = 0, //!< The operation was successful } ibv_return_t; ncclResult_t wrap_ibv_symbols(void); /* NCCL wrappers of IB verbs functions */ ncclResult_t wrap_ibv_fork_init(void); ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices); ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list); const char *wrap_ibv_get_device_name(struct ibv_device *device); ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device); ncclResult_t wrap_ibv_close_device(struct ibv_context *context); ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event); ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event); ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr); ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context); ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd); ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access); struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); /* DMA-BUF support */ ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr); ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context); ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel); ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq); static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) { int done = cq->context->ops.poll_cq(cq, num_entries, wc); /*returns the number of wcs or 0 on success, a negative number otherwise*/ if (done < 0) { WARN("Call to ibv_poll_cq() returned %d", done); return ncclSystemError; } *num_done = done; return ncclSuccess; } ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp); ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); static inline ncclResult_t 
wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ if (ret != IBV_SUCCESS) { WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr); return ncclSystemError; } return ncclSuccess; } static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { int ret = qp->context->ops.post_recv(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ if (ret != IBV_SUCCESS) { WARN("ibv_post_recv() failed with error %s", strerror(ret)); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event); #endif //End include guard nccl-2.22.3-1/src/include/info.h000066400000000000000000000014131463451655400162070ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_INFO_H_ #define NCCL_INFO_H_ #include "nccl.h" #include "collectives.h" #include "core.h" #include "utils.h" // Used to pass NCCL call information between functions struct ncclInfo { ncclFunc_t coll; const char* opName; // NCCL Coll Args const void* sendbuff; void* recvbuff; size_t count; ncclDataType_t datatype; ncclRedOp_t op; int root; // peer for p2p operations ncclComm_t comm; cudaStream_t stream; // Algorithm details int chunkSteps; int sliceSteps; }; #endif nccl-2.22.3-1/src/include/ipcsocket.h000066400000000000000000000023351463451655400172440ustar00rootroot00000000000000/* * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. * * See COPYRIGHT for license information */ #ifndef NCCL_IPCSOCKET_H #define NCCL_IPCSOCKET_H #include "nccl.h" #include #include #include #include #include #include #include #include #include #include #include #define NCCL_IPC_SOCKNAME_LEN 64 struct ncclIpcSocket { int fd; char socketName[NCCL_IPC_SOCKNAME_LEN]; volatile uint32_t* abortFlag; }; ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag); ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle); ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd); ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd); ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash); ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash); ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd); #endif /* NCCL_IPCSOCKET_H */ nccl-2.22.3-1/src/include/nccl_common.h000066400000000000000000000033051463451655400175450ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_DEBUG_H_ #define NCCL_DEBUG_H_ typedef enum { NCCL_LOG_NONE = 0, NCCL_LOG_VERSION = 1, NCCL_LOG_WARN = 2, NCCL_LOG_INFO = 3, NCCL_LOG_ABORT = 4, NCCL_LOG_TRACE = 5 } ncclDebugLogLevel; typedef enum { NCCL_INIT = 0x1, NCCL_COLL = 0x2, NCCL_P2P = 0x4, NCCL_SHM = 0x8, NCCL_NET = 0x10, NCCL_GRAPH = 0x20, NCCL_TUNING = 0x40, NCCL_ENV = 0x80, NCCL_ALLOC = 0x100, NCCL_CALL = 0x200, NCCL_PROXY = 0x400, NCCL_NVLS = 0x800, NCCL_BOOTSTRAP = 0x1000, NCCL_REG = 0x2000, NCCL_PROFILE = 0x4000, NCCL_ALL = ~0 } ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now typedef enum { ncclFuncBroadcast = 0, ncclFuncReduce = 1, ncclFuncAllGather = 2, ncclFuncReduceScatter = 3, ncclFuncAllReduce = 4, ncclFuncSendRecv = 5, ncclFuncSend = 6, ncclFuncRecv = 7, ncclNumFuncs = 8 } ncclFunc_t; #define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* #define NCCL_ALGO_UNDEF -1 #define NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 #define NCCL_ALGO_COLLNET_DIRECT 2 #define NCCL_ALGO_COLLNET_CHAIN 3 #define NCCL_ALGO_NVLS 4 #define NCCL_ALGO_NVLS_TREE 5 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 #define NCCL_PROTO_UNDEF -1 #define NCCL_PROTO_LL 0 #define NCCL_PROTO_LL128 1 #define NCCL_PROTO_SIMPLE 2 #define NCCL_ALGO_PROTO_IGNORE -1.0 #endif nccl-2.22.3-1/src/include/nccl_net.h000066400000000000000000000624501463451655400170510ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_NET_H_ #define NCCL_NET_H_ #include "nccl.h" #include "nccl_common.h" #include "net_device.h" #include #define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 #define NCCL_PTR_DMABUF 0x4 // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int regIsGlobal; // regMr is not tied to a particular comm int speed; // Port speed in Mbps. int port; // Port number. float latency; // Network latency int maxComms; // Maximum number of comms we can create int maxRecvs; // Maximum number of grouped receives. ncclNetDeviceType netDeviceType; // Network offload type int netDeviceVersion; // Version number for network offload } ncclNetProperties_v8_t; typedef ncclNetProperties_v8_t ncclNetProperties_t; typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. 
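  //
  // Illustrative usage sketch (editorial addition, not part of the plugin ABI;
  // "net" is an assumed pointer to this struct, error handling and the
  // out-of-band exchange of the handle are assumed to exist elsewhere):
  //   void* listenComm; char handle[NCCL_NET_HANDLE_MAXSIZE];
  //   net->listen(dev, handle, &listenComm);                   // receiver side
  //   /* handle is shipped to the peer out-of-band */
  //   void* sendComm = NULL; ncclNetDeviceHandle_v8_t* devHandle = NULL;
  //   do { net->connect(dev, handle, &sendComm, &devHandle); } while (sendComm == NULL);
  //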
ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* sizes); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); // Copy the given mhandle to a dptr in a format usable by this plugin's device code ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); // Notify the plugin that a recv has completed by the device ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); } ncclNet_v8_t; typedef ncclNet_v8_t ncclNet_t; #define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v8 typedef struct { void* mhandle; void* address; uint32_t size; } ncclNetSGE_v8_t; typedef struct { // Name of the collective network (mainly for logs) const char* name; // Initialize the collective network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters capable of doing collective operations. // If ndev returns 0, all other functions might be set to NULL. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create connections. 
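  //
  // Illustrative bootstrap sketch (editorial addition; "collNet" is an assumed
  // pointer to this struct and the handle exchange happens out-of-band):
  //   collNet->listen(dev, myHandle, &listenComm);
  //   /* gather every rank's handle into handles[0..nRanks-1] out-of-band */
  //   collNet->connect((void**)handles, nRanks, myRank, listenComm, &collComm);
  //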
ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Create a group for collective operations. handles have been created // using listen() above. rank indicates caller's rank in the collective network. ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); // Returns whether a reduction operation on a data type is supported. // 1 for supported, 0 otherwise. ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* collComm, void* mhandle); // Performs an asynchronous allreduce operation on the collective group. // May return request == NULL if the call cannot be performed (or would block). ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts, size_t bytesPerRank, size_t windowOffset, size_t windowBytes, void* sendMhandle, void** request); ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData, size_t bytesPerRank, size_t windowOffset, size_t windowBytes, ncclDataType_t dataType, ncclRedOp_t redOp, void* recvMhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* size); // Close and free collective comm objects ncclResult_t (*closeColl)(void* collComm); ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v8_t; typedef ncclCollNet_v8_t ncclCollNet_t; #define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v8 typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int speed; // Port speed in Mbps. int port; // Port number. float latency; // Network latency int maxComms; // Maximum number of comms we can create int maxRecvs; // Maximum number of grouped receives. ncclNetDeviceType netDeviceType; // Network offload type int netDeviceVersion; // Version number for network offload } ncclNetProperties_v7_t; typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. 
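  //
  // Editorial note: this v7 struct is kept for plugins built against the older
  // interface (presumably exported as ncclNetPlugin_v7, following the naming
  // pattern above); compared to v8, regMr() below takes an int size rather than
  // size_t and the v7 properties carry no regIsGlobal field.
  //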
ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* sizes); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); // Copy the given mhandle to a dptr in a format usable by this plugin's device code ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); // Notify the plugin that a recv has completed by the device ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); } ncclNet_v7_t; typedef struct { // Name of the collective network (mainly for logs) const char* name; // Initialize the collective network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters capable of doing collective operations. // If ndev returns 0, all other functions might be set to NULL. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create connections. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Create a group for collective operations. handles have been created // using listen() above. 
rank indicates caller's rank in the collective network. ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); // Returns whether a reduction operation on a data type is supported. // 1 for supported, 0 otherwise. ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* collComm, void* mhandle); // Performs an asynchronous allreduce operation on the collective group. // May return request == NULL if the call cannot be performed (or would block). ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* size); // Close and free collective comm objects ncclResult_t (*closeColl)(void* collComm); ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v7_t; #define NCCL_NET_MAX_REQUESTS_V6 8 // v6 struct for backwards compatibility typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int speed; // Port speed in Mbps. int port; // Port number. float latency; // Network latency int maxComms; // Maximum number of comms we can create int maxRecvs; // Maximum number of grouped receives. } ncclNetProperties_v6_t; typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 
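  //
  // Illustrative registration sketch (editorial addition; "net", "dmabufFd" and
  // "props" are assumed to come from the caller and from getProperties()):
  //   if (props.ptrSupport & NCCL_PTR_DMABUF)
  //     net->regMrDmaBuf(comm, data, size, NCCL_PTR_CUDA, 0 /*offset*/, dmabufFd, &mhandle);
  //   else
  //     net->regMr(comm, data, size, NCCL_PTR_CUDA, &mhandle);
  //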
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* sizes); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v6_t; typedef struct { // Name of the collective network (mainly for logs) const char* name; // Initialize the collective network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters capable of doing collective operations. // If ndev returns 0, all other functions might be set to NULL. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create connections. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Create a group for collective operations. handles have been created // using listen() above. rank indicates caller's rank in the collective network. ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); // Returns whether a reduction operation on a data type is supported. // 1 for supported, 0 otherwise. ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* collComm, void* mhandle); // Performs an asynchronous allreduce operation on the collective group. // May return request == NULL if the call cannot be performed (or would block). ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. 
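  //
  // Illustrative polling sketch (editorial addition; buffer and handle names are
  // placeholders): requests returned by the i* calls above are retried while
  // they come back NULL, then completed by polling test() until done != 0:
  //   void* req = NULL;
  //   while (req == NULL)
  //     collNet->iallreduce(collComm, sbuf, rbuf, count, dtype, op, smh, rmh, &req);
  //   int done = 0, bytes = 0;
  //   while (!done) collNet->test(req, &done, &bytes);
  //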
ncclResult_t (*test)(void* request, int* done, int* size); // Close and free collective comm objects ncclResult_t (*closeColl)(void* collComm); ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v6_t; // v5 struct for backwards compatibility typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* sizes); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v5_t; // v5 struct for backwards compatibility typedef struct { // Name of the collective network (mainly for logs) const char* name; // Initialize the collective network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters capable of doing collective operations. // If ndev returns 0, all other functions might be set to NULL. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create connections. 
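  //
  // Editorial note: the v5 and v6 structs in this header are retained for
  // backward compatibility only; as visible above, they register memory with an
  // int length (v8 uses size_t) and the v5 variants have no regMrDmaBuf() entry.
  //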
ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Create a group for collective operations. handles have been created // using listen() above. rank indicates caller's rank in the collective network. ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); // Returns whether a reduction operation on a data type is supported. // 1 for supported, 0 otherwise. ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* collComm, void* mhandle); // Performs an asynchronous allreduce operation on the collective group. // May return request == NULL if the call cannot be performed (or would block). ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* size); // Close and free collective comm objects ncclResult_t (*closeColl)(void* collComm); ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v5_t; #endif // end include guard nccl-2.22.3-1/src/include/nccl_tuner.h000066400000000000000000000106441463451655400174160ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_TUNER_H_ #define NCCL_TUNER_H_ #include "nccl.h" #include "nccl_common.h" // API to be implemented by external tuner typedef struct { // Name of the tuner const char* name; // Initializes tuner states. // Inputs: // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. // - nNodes: number of nodes in current communicator. // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. // Outputs: // - context: tuner context object ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); // Gets info (algo, protocol, number of ctas and threads) for a given collective. // Inputs: // - context: tuner context object // - collType: collective type , e.g., allreduce, allgather… // - nBytes: collective size in bytes // - numPipeOps: number of operations in the group // - numAlgo: number of algorithms in collCostTable // - numProto: number of protocols in collCostTable // // Outputs: // - nChannels: number of channels (hence SMs) to be used. // // InOut: // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). // // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the // default tuning for the given collective. 
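  //
  // Illustrative sketch (editorial addition, based on the description of
  // collCostTable above; not a normative implementation): a tuner can bias the
  // selection by editing the cost table in place, e.g. to favor ring/simple:
  //   float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
  //   if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE)
  //     table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
  //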
  // Also, the plugin is allowed to leave all outputs unset, or to set both the
  // algorithm and the protocol, but it may not set only one of the two.
  // Unset fields will be set automatically by NCCL.
  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
                              int* nChannels);

  // Terminates the plugin and cleans up any resources that the plugin allocated.
  // context: tuner context object
  ncclResult_t (*destroy)(void* context);
} ncclTuner_v3_t;

typedef ncclTuner_v3_t ncclTuner_t;

#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"

// API to be implemented by external tuner
typedef struct {
  // Name of the tuner
  const char* name;

  // Initializes tuner states.
  // Inputs:
  // - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
  // - nNodes: number of nodes in current communicator.
  // - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
  // Outputs:
  // - context: tuner context object
  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);

  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
  // Inputs:
  // - context: tuner context object
  // - collType: collective type, e.g., allreduce, allgather…
  // - nBytes: collective size in bytes
  // - collNetTypeSupport: whether collnet supports this type
  // - nvlsTypeSupport: whether NVLink SHARP supports this type
  // - numPipeOps: number of operations in the group
  //
  // Outputs:
  // - algorithm: selected algorithm to be used for the given collective
  // - protocol: selected protocol to be used for the given collective
  // - nChannels: number of channels (hence SMs) to be used.
  //
  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
  // default tuning for the given collective.
  // Also, the plugin is allowed to leave all outputs unset, or to set both the
  // algorithm and the protocol, but it may not set only one of the two.
  // Unset fields will be set automatically by NCCL.
  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
                              int collNetSupport, int nvlsSupport, int numPipeOps,
                              int* algorithm, int* protocol, int* nChannels);

  // Terminates the plugin and cleans up any resources that the plugin allocated.
  // context: tuner context object
  ncclResult_t (*destroy)(void* context);
} ncclTuner_v2_t;

#endif
nccl-2.22.3-1/src/include/net.h000066400000000000000000000016121463451655400160430ustar00rootroot00000000000000
/*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_INT_NET_H_
#define NCCL_INT_NET_H_

#include "nccl.h"
#include "nccl_net.h"
#include "comm.h"
#include "checks.h"

typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];

ncclResult_t ncclNetPluginLoad(struct ncclComm* comm);
ncclResult_t ncclNetPluginUnload(struct ncclComm* comm);

ncclResult_t ncclNetInit(struct ncclComm* comm);
ncclResult_t ncclNetFinalize(struct ncclComm* comm);

int ncclNetVersion(struct ncclComm* comm);

// Test whether the current GPU supports GPU Direct RDMA.
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); extern ncclNet_t ncclNetIb; extern ncclNet_t ncclNetSocket; #endif nccl-2.22.3-1/src/include/net_device.h000066400000000000000000000021641463451655400173650ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_NET_DEVICE_H_ #define NCCL_NET_DEVICE_H_ #define NCCL_NET_DEVICE_INVALID_VERSION 0x0 #define NCCL_NET_MTU_SIZE 4096 // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; typedef struct { ncclNetDeviceType netDeviceType; // Network offload type int netDeviceVersion; // Version number for network offload void* handle; size_t size; int needsProxyProgress; } ncclNetDeviceHandle_v7_t; typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_t; #endif nccl-2.22.3-1/src/include/nvmlwrap.h000066400000000000000000000333011463451655400171230ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_NVMLWRAP_H_ #define NCCL_NVMLWRAP_H_ #include "nccl.h" //#define NCCL_NVML_DIRECT 1 #ifndef NCCL_NVML_DIRECT #define NCCL_NVML_DIRECT 0 #endif #if NCCL_NVML_DIRECT #include "nvml.h" #else // Dynamically handle dependencies on NVML /* Extracted from nvml.h */ #define NVML_API_VERSION 12 #define NVML_STRUCT_VERSION(data, ver) (unsigned int)(sizeof(nvml ## data ## _v ## ver ## _t) | \ (ver << 24U)) typedef struct nvmlDevice_st* nvmlDevice_t; #define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16 typedef enum nvmlEnableState_enum { NVML_FEATURE_DISABLED = 0, //!< Feature disabled NVML_FEATURE_ENABLED = 1 //!< Feature enabled } nvmlEnableState_t; typedef enum nvmlNvLinkCapability_enum { NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device // should be last NVML_NVLINK_CAP_COUNT } nvmlNvLinkCapability_t; typedef enum nvmlReturn_enum { NVML_SUCCESS = 0, //!< The operation was successful NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is 
not large enough NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred } nvmlReturn_t; typedef struct nvmlPciInfo_st { char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffff unsigned int bus; //!< The bus on which the device resides, 0 to 0xff unsigned int device; //!< The device's id on the bus, 0 to 31 unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id // Added in NVML 2.285 API unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID // NVIDIA reserved for internal use only unsigned int reserved0; unsigned int reserved1; unsigned int reserved2; unsigned int reserved3; } nvmlPciInfo_t; /* P2P Capability Index Status*/ typedef enum nvmlGpuP2PStatus_enum { NVML_P2P_STATUS_OK = 0, NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, NVML_P2P_STATUS_GPU_NOT_SUPPORTED, NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, NVML_P2P_STATUS_DISABLED_BY_REGKEY, NVML_P2P_STATUS_NOT_SUPPORTED, NVML_P2P_STATUS_UNKNOWN } nvmlGpuP2PStatus_t; /* P2P Capability Index*/ typedef enum nvmlGpuP2PCapsIndex_enum { NVML_P2P_CAPS_INDEX_READ = 0, NVML_P2P_CAPS_INDEX_WRITE, NVML_P2P_CAPS_INDEX_NVLINK, NVML_P2P_CAPS_INDEX_ATOMICS, NVML_P2P_CAPS_INDEX_PROP, NVML_P2P_CAPS_INDEX_UNKNOWN } nvmlGpuP2PCapsIndex_t; /** * Represents the type for sample value returned */ typedef enum nvmlValueType_enum { NVML_VALUE_TYPE_DOUBLE = 0, NVML_VALUE_TYPE_UNSIGNED_INT = 1, NVML_VALUE_TYPE_UNSIGNED_LONG = 2, NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, // Keep this last NVML_VALUE_TYPE_COUNT }nvmlValueType_t; /** * Union to represent different types of Value */ typedef union nvmlValue_st { double dVal; //!< If the value is double unsigned int uiVal; //!< If the value is unsigned int unsigned long ulVal; //!< If the value is unsigned long unsigned long long ullVal; //!< If the value is unsigned long long signed long long sllVal; //!< If the value is signed long long }nvmlValue_t; /** * Field Identifiers. * * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change. 
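 *
 * Illustrative sketch (editorial addition, not from the original header): a
 * field is queried by filling fieldId (and scopeId where it applies, e.g. an
 * NVLink index) and checking the per-field return code before using the value:
 *   nvmlFieldValue_t fv = {};
 *   fv.fieldId = NVML_FI_DEV_NVLINK_GET_SPEED;
 *   fv.scopeId = linkId;                       // assumed link index
 *   ncclNvmlDeviceGetFieldValues(device, 1, &fv);
 *   if (fv.nvmlReturn == NVML_SUCCESS)
 *     useSpeed(fv.value);                      // interpret according to fv.valueType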
*/ /* NVLink Speed */ #define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links #define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device /** * Remote device NVLink ID * * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. */ #define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID /** * NVSwitch: connected NVLink count */ #define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch #define NVML_FI_DEV_NVLINK_GET_SPEED 164 #define NVML_FI_DEV_NVLINK_GET_STATE 165 #define NVML_FI_DEV_NVLINK_GET_VERSION 166 #define NVML_FI_DEV_C2C_LINK_COUNT 170 //!< Number of C2C Links present on the device #define NVML_FI_DEV_C2C_LINK_GET_STATUS 171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE #define NVML_FI_DEV_C2C_LINK_GET_MAX_BW 172 //!< C2C Link Speed in MBps for active links #define NVML_FI_MAX 173 //!< One greater than the largest field ID defined above /** * Information for a Field Value Sample */ typedef struct nvmlFieldValue_st { unsigned int fieldId; //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above. unsigned int scopeId; //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId can represent linkId. long long timestamp; //!< CPU Timestamp of this value in microseconds since 1970 long long latencyUsec; //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call. nvmlValueType_t valueType; //!< Type of the value stored in value nvmlReturn_t nvmlReturn; //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS nvmlValue_t value; //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS } nvmlFieldValue_t; #define NVML_GPU_FABRIC_UUID_LEN 16 #define NVML_GPU_FABRIC_STATE_NOT_SUPPORTED 0 #define NVML_GPU_FABRIC_STATE_NOT_STARTED 1 #define NVML_GPU_FABRIC_STATE_IN_PROGRESS 2 #define NVML_GPU_FABRIC_STATE_COMPLETED 3 typedef unsigned char nvmlGpuFabricState_t; typedef struct { unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete". unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs nvmlGpuFabricState_t state; //!< Current state of GPU registration process } nvmlGpuFabricInfo_t; #define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED 0 #define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE 1 #define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE 2 #define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW 0 #define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW 0x11 /** * GPU Fabric Health Status Mask for various fields can be obtained * using the below macro. * Ex - NVML_GPU_FABRIC_HEALTH_GET(var, _DEGRADED_BW) */ #define NVML_GPU_FABRIC_HEALTH_GET(var, type) \ (((var) >> NVML_GPU_FABRIC_HEALTH_MASK_SHIFT##type) & \ (NVML_GPU_FABRIC_HEALTH_MASK_WIDTH##type)) /** * GPU Fabric Health Status Mask for various fields can be tested * using the below macro. 
* Ex - NVML_GPU_FABRIC_HEALTH_TEST(var, _DEGRADED_BW, _TRUE) */ #define NVML_GPU_FABRIC_HEALTH_TEST(var, type, val) \ (NVML_GPU_FABRIC_HEALTH_GET(var, type) == \ NVML_GPU_FABRIC_HEALTH_MASK##type##val) /** * GPU Fabric information (v2). * * Version 2 adds the \ref nvmlGpuFabricInfo_v2_t.version field * to the start of the structure, and the \ref nvmlGpuFabricInfo_v2_t.healthMask * field to the end. This structure is not backwards-compatible with * \ref nvmlGpuFabricInfo_t. */ typedef struct { unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuFabricInfo_v2) unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete". unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs nvmlGpuFabricState_t state; //!< Current state of GPU registration process unsigned int healthMask; //!< GPU Fabric health Status Mask } nvmlGpuFabricInfo_v2_t; typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t; /** * Version identifier value for \ref nvmlGpuFabricInfo_v2_t.version. */ #define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2) /** * Confidential Compute Feature Status values */ #define NVML_CC_SYSTEM_FEATURE_DISABLED 0 #define NVML_CC_SYSTEM_FEATURE_ENABLED 1 typedef struct nvmlConfComputeSystemState_st { unsigned int environment; unsigned int ccFeature; unsigned int devToolsMode; } nvmlConfComputeSystemState_t; /** * Confidential Compute Multigpu mode values */ #define NVML_CC_SYSTEM_MULTIGPU_NONE 0 #define NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE 1 /** * Confidential Compute System settings */ typedef struct { unsigned int version; unsigned int environment; unsigned int ccFeature; unsigned int devToolsMode; unsigned int multiGpuMode; } nvmlSystemConfComputeSettings_v1_t; typedef nvmlSystemConfComputeSettings_v1_t nvmlSystemConfComputeSettings_t; #define nvmlSystemConfComputeSettings_v1 NVML_STRUCT_VERSION(SystemConfComputeSettings, 1) /* End of nvml.h */ #endif // NCCL_NVML_DIRECT constexpr int ncclNvmlMaxDevices = 32; struct ncclNvmlDeviceInfo { nvmlDevice_t handle; int computeCapabilityMajor, computeCapabilityMinor; }; struct ncclNvmlDevicePairInfo { nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite; }; extern int ncclNvmlDeviceCount; extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; struct ncclNvmlCCStatus { bool CCEnabled; bool multiGpuCCEnabled; }; // All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. // Outsiders need only call it if they want to inspect the ncclNvml global // tables above. 
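//
// Illustrative sketch (editorial addition; NCCLCHECK as used elsewhere in this
// tree): code that wants to read the tables directly first forces initialization:
//   NCCLCHECK(ncclNvmlEnsureInitialized());
//   for (int i = 0; i < ncclNvmlDeviceCount; i++) {
//     int cc = 10*ncclNvmlDevices[i].computeCapabilityMajor
//            + ncclNvmlDevices[i].computeCapabilityMinor;
//     /* use cc and ncclNvmlDevices[i].handle as needed */
//   }
//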
ncclResult_t ncclNvmlEnsureInitialized(); ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor); ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus); ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo); ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status); #endif // End include guard nccl-2.22.3-1/src/include/nvtx.h000066400000000000000000000061501463451655400162560ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_NVTX_H_ #define NCCL_NVTX_H_ #include "nvtx3/nvtx3.hpp" #if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14) #define NVTX3_CONSTEXPR_IF_CPP14 constexpr #else #define NVTX3_CONSTEXPR_IF_CPP14 #endif // Define all NCCL-provided static schema IDs here (avoid duplicates). #define NVTX_SID_CommInitRank 0 #define NVTX_SID_CommInitAll 1 #define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_AllGather 4 #define NVTX_SID_AllReduce 5 #define NVTX_SID_Broadcast 6 #define NVTX_SID_ReduceScatter 7 #define NVTX_SID_Reduce 8 #define NVTX_SID_Send 9 #define NVTX_SID_Recv 10 // Define static schema ID for the reduction operation. 
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START extern const nvtxDomainHandle_t ncclNvtxDomainHandle; struct nccl_domain{static constexpr char const* name{"NCCL"};}; class payload_schema { public: explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept { schema_attr.name = schemaName; schema_attr.entries = entries; schema_attr.numEntries = numEntries; schema_attr.schemaId = schemaId; nvtxPayloadSchemaRegister(nvtx3::domain::get(), &schema_attr); } payload_schema() = delete; ~payload_schema() = default; payload_schema(payload_schema const&) = default; payload_schema& operator=(payload_schema const&) = default; payload_schema(payload_schema&&) = default; payload_schema& operator=(payload_schema&&) = default; private: nvtxPayloadSchemaAttr_t schema_attr{ NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, nullptr, NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, nullptr, 0, 0, 0, 0, nullptr}; }; // Create NVTX push/pop range with parameters // @param name of the operation (see `NVTX_SID_*`) // @param N schema name // @param S schema (entries) // @param P payload (struct) #define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ static const payload_schema schema{S, std::extent::value, \ NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ nvtxPayloadData_t nvtx3_bpl__[] = { \ {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; extern void initNvtxRegisteredEnums(); #endif nccl-2.22.3-1/src/include/nvtx3/000077500000000000000000000000001463451655400161665ustar00rootroot00000000000000nccl-2.22.3-1/src/include/nvtx3/nvToolsExt.h000066400000000000000000001431221463451655400204670ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ /** \file nvToolsExt.h */ /* ========================================================================= */ /** \mainpage * \tableofcontents * \section INTRODUCTION Introduction * * The NVIDIA Tools Extension library is a set of functions that a * developer can use to provide additional information to tools. * The additional information is used by the tool to improve * analysis and visualization of data. * * The library introduces close to zero overhead if no tool is * attached to the application. The overhead when a tool is * attached is specific to the tool. * * \section INITIALIZATION_SECTION Initialization * * Typically the tool's library that plugs into NVTX is indirectly * loaded via enviromental properties that are platform specific. * For some platform or special cases, the user may be required * to instead explicity initialize instead though. This can also * be helpful to control when the API loads a tool's library instead * of what would typically be the first function call to emit info. * For these rare case, see \ref INITIALIZATION for additional information. 
* * \section MARKERS_AND_RANGES Markers and Ranges * * Markers and ranges are used to describe events at a specific time (markers) * or over a time span (ranges) during the execution of the application * respectively. * * \subsection MARKERS Markers * * Markers denote specific moments in time. * * * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on * how to specify the domain. * * \subsection THREAD_RANGES Thread Ranges * * Thread ranges denote nested time ranges. Nesting is maintained per thread * per domain and does not require any additional correlation mechanism. The * duration of a thread range is defined by the corresponding pair of * nvtxRangePush* to nvtxRangePop API calls. * * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on * how to specify the domain. * * \subsection PROCESS_RANGES Process Ranges * * Process ranges denote a time span that can expose arbitrary concurrency, as * opposed to thread ranges that only support nesting. In addition the range * start event can happen on a different thread than the end marker. For the * correlation of a start/end pair an unique correlation ID is used that is * returned from the start API call and needs to be passed into the end API * call. * * \subsection EVENT_ATTRIBUTES Event Attributes * * \ref MARKERS_AND_RANGES can be annotated with various attributes to provide * additional information for an event or to guide the tool's visualization of * the data. Each of the attributes is optional and if left unused the * attributes fall back to a default value. The attributes include: * - color * - category * * To specify any attribute other than the text message, the \ref * EVENT_ATTRIBUTE_STRUCTURE "Event Attribute Structure" must be used. * * \section DOMAINS Domains * * Domains enable developers to scope annotations. By default all events and * annotations are in the default domain. Additional domains can be registered. * This allows developers to scope markers, ranges, and resources names to * avoid conflicts. * * The function ::nvtxDomainCreateA or ::nvtxDomainCreateW is used to create * a named domain. * * Each domain maintains its own * - categories * - thread range stacks * - registered strings * * The function ::nvtxDomainDestroy marks the end of the domain. Destroying * a domain unregisters and destroys all objects associated with it such as * registered strings, resource objects, named categories, and started ranges. * * \section RESOURCE_NAMING Resource Naming * * This section covers calls that allow to annotate objects with user-provided * names in order to allow for a better analysis of complex trace data. All of * the functions take the handle or the ID of the object to name and the name. * The functions can be called multiple times during the execution of an * application, however, in that case it is implementation dependent which * name will be reported by the tool. * * \subsection CATEGORY_NAMING Category Naming * * Some function in this library support associating an integer category * to enable filtering and sorting. The category naming functions allow * the application to associate a user friendly name with the integer * category. Support for domains have been added in NVTX_VERSION_2 to * avoid collisions when domains are developed independantly. * * \subsection RESOURCE_OBJECTS Resource Objects * * Resource objects are a generic mechanism for attaching data to an application * resource. 
The identifier field makes the association to a pointer or handle, * while the type field helps provide deeper understanding of the identifier as * well as enabling differentiation in cases where handles generated by different * APIs may collide. The resource object may also have an associated message to * associate with the application resource, enabling further annotation of this * object and how it is used. * * The resource object was introduced in NVTX_VERSION_2 to supersede existing naming * functions and allow the application resource identified by those functions to be * associated to a domain. The other naming functions are still supported for backward * compatibility but will be associated only to the default domain. * * \subsection RESOURCE_NAMING_OS Resource Naming * * Some operating system resources creation APIs do not support providing a user friendly * name, such as some OS thread creation APIs. This API support resource naming though * both through resource objects and functions following the pattern * nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2 * supersede the other functions with a a more general method of assigning names to OS resources, * along with associating them to domains too. The older nvtxName* functions are only associated * with the default domain. * \section EXTENSIONS Optional Extensions * Optional extensions will either appear within the existing sections the extend or appear * in the "Related Pages" when they introduce new concepts. */ /** * Tools Extension API version */ #if defined(NVTX_VERSION) && NVTX_VERSION < 3 #error "Trying to #include NVTX version 3 in a source file where an older NVTX version has already been included. If you are not directly using NVTX (the NVIDIA Tools Extension library), you are getting this error because libraries you are using have included different versions of NVTX. Suggested solutions are: (1) reorder #includes so the newest NVTX version is included first, (2) avoid using the conflicting libraries in the same .c/.cpp file, or (3) update the library using the older NVTX version to use the newer version instead." #endif /* Header guard */ #if !defined(NVTX_VERSION) #define NVTX_VERSION 3 #if defined(_MSC_VER) #define NVTX_API __stdcall #define NVTX_INLINE_STATIC __inline static #else /*defined(__GNUC__)*/ #define NVTX_API #if defined(__cplusplus) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) #define NVTX_INLINE_STATIC inline static #else #define NVTX_INLINE_STATIC __inline__ static #endif #endif /* Platform */ #if defined(NVTX_NO_IMPL) /* When omitting implementation, avoid declaring functions inline */ /* without definitions, since this causes compiler warnings. */ #define NVTX_DECLSPEC #elif defined(NVTX_EXPORT_API) /* Allow overriding definition of NVTX_DECLSPEC when exporting API. */ /* Default is empty, meaning non-inline with external linkage. */ #if !defined(NVTX_DECLSPEC) #define NVTX_DECLSPEC #endif #else /* Normal NVTX usage defines the NVTX API inline with static */ /* (internal) linkage. */ #define NVTX_DECLSPEC NVTX_INLINE_STATIC #endif #include "nvtxDetail/nvtxLinkOnce.h" #define NVTX_VERSIONED_IDENTIFIER_L3(NAME, VERSION) NAME##_v##VERSION #define NVTX_VERSIONED_IDENTIFIER_L2(NAME, VERSION) NVTX_VERSIONED_IDENTIFIER_L3(NAME, VERSION) #define NVTX_VERSIONED_IDENTIFIER(NAME) NVTX_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION) /** * The nvToolsExt library depends on stdint.h. 
If the build tool chain in use * does not include stdint.h then define NVTX_STDINT_TYPES_ALREADY_DEFINED * and define the following types: *
 *
 *   - uint8_t
 *   - int8_t
 *   - uint16_t
 *   - int16_t
 *   - uint32_t
 *   - int32_t
 *   - uint64_t
 *   - int64_t
 *   - uintptr_t
 *   - intptr_t
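 *
 * A minimal sketch of that fallback (illustrative only; the underlying types
 * chosen here are platform-dependent assumptions, not a recommendation):
 * \code
 * #define NVTX_STDINT_TYPES_ALREADY_DEFINED
 * typedef unsigned char      uint8_t;
 * typedef signed char        int8_t;
 * typedef unsigned short     uint16_t;
 * typedef short              int16_t;
 * typedef unsigned int       uint32_t;
 * typedef int                int32_t;
 * typedef unsigned long long uint64_t;
 * typedef long long          int64_t;
 * typedef unsigned long      uintptr_t;
 * typedef long               intptr_t;
 * #include "nvtx3/nvToolsExt.h"
 * \endcode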
* #define NVTX_STDINT_TYPES_ALREADY_DEFINED if you are using your own header file. */ #ifndef NVTX_STDINT_TYPES_ALREADY_DEFINED #include #endif #include #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /** * Result Codes */ #define NVTX_SUCCESS 0 #define NVTX_FAIL 1 #define NVTX_ERR_INIT_LOAD_PROPERTY 2 #define NVTX_ERR_INIT_ACCESS_LIBRARY 3 #define NVTX_ERR_INIT_LOAD_LIBRARY 4 #define NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT 5 #define NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT 6 #define NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE 7 /** * Size of the nvtxEventAttributes_t structure. */ #define NVTX_EVENT_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxEventAttributes_t) ) ) #define NVTX_NO_PUSH_POP_TRACKING ((int)-2) typedef uint64_t nvtxRangeId_t; /* Forward declaration of opaque domain registration structure */ struct nvtxDomainRegistration_st; typedef struct nvtxDomainRegistration_st nvtxDomainRegistration; /* \brief Domain Handle Structure. * \anchor DOMAIN_HANDLE_STRUCTURE * * This structure is opaque to the user and is used as a handle to reference * a domain. This type is returned from tools when using the NVTX API to * create a domain. * */ typedef nvtxDomainRegistration* nvtxDomainHandle_t; /* Forward declaration of opaque string registration structure */ struct nvtxStringRegistration_st; typedef struct nvtxStringRegistration_st nvtxStringRegistration; /* \brief Registered String Handle Structure. * \anchor REGISTERED_STRING_HANDLE_STRUCTURE * * This structure is opaque to the user and is used as a handle to reference * a registered string. This type is returned from tools when using the NVTX * API to create a registered string. * */ typedef nvtxStringRegistration* nvtxStringHandle_t; /* ========================================================================= */ /** \defgroup GENERAL General * @{ */ /** --------------------------------------------------------------------------- * Color Types * ------------------------------------------------------------------------- */ typedef enum nvtxColorType_t { NVTX_COLOR_UNKNOWN = 0, /**< Color attribute is unused. */ NVTX_COLOR_ARGB = 1 /**< An ARGB color is provided. */ } nvtxColorType_t; /** --------------------------------------------------------------------------- * Message Types * ------------------------------------------------------------------------- */ typedef enum nvtxMessageType_t { NVTX_MESSAGE_UNKNOWN = 0, /**< Message attribute is unused. */ NVTX_MESSAGE_TYPE_ASCII = 1, /**< A character sequence is used as payload. */ NVTX_MESSAGE_TYPE_UNICODE = 2, /**< A wide character sequence is used as payload. */ /* NVTX_VERSION_2 */ NVTX_MESSAGE_TYPE_REGISTERED = 3, /**< A unique string handle that was registered with \ref nvtxDomainRegisterStringA() or \ref nvtxDomainRegisterStringW(). */ } nvtxMessageType_t; typedef union nvtxMessageValue_t { const char* ascii; const wchar_t* unicode; /* NVTX_VERSION_2 */ nvtxStringHandle_t registered; } nvtxMessageValue_t; /** @} */ /*END defgroup*/ /* ------------------------------------------------------------------------- */ /** \brief Force initialization (optional) * * Force NVTX library to initialize. The first call to any NVTX API function * will automatically initialize the entire API. This can make the first call * much slower than subsequent calls. In applications where the first call to * NVTX may be in a performance-critical section, calling nvtxInitialize before * any performance-critical sections will ensure NVTX initialization occurs at * an acceptable time. 
Since nvtxInitialize takes no parameters and has no * expected behavior besides initialization, it is convenient to add a call to * nvtxInitialize in NVTX-instrumented applications that need to force earlier * initialization without changing any other code. For example, if an app's * first NVTX call is nvtxDomainCreate, and it is difficult to move that call * earlier because the domain handle must be stored in an object only created * at that point, adding a call to nvtxInitialize at the top of main() will * ensure the later call to nvtxDomainCreate is as fast as possible. * * \version \NVTX_VERSION_3 * * \param reserved - must be zero or NULL. * * @{ */ NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved); /** @} */ /** @} */ /*END defgroup*/ /* ========================================================================= */ /** \defgroup EVENT_ATTRIBUTES Event Attributes * @{ */ /** --------------------------------------------------------------------------- * Payload Types * ------------------------------------------------------------------------- */ typedef enum nvtxPayloadType_t { NVTX_PAYLOAD_UNKNOWN = 0, /**< Payload attribute is unused. */ NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 = 1, /**< A 64 bit unsigned integer value is used as payload. */ NVTX_PAYLOAD_TYPE_INT64 = 2, /**< A 64 bit signed integer value is used as payload. */ NVTX_PAYLOAD_TYPE_DOUBLE = 3, /**< A 64 bit floating point value is used as payload. */ /* NVTX_VERSION_2 */ NVTX_PAYLOAD_TYPE_UNSIGNED_INT32 = 4, /**< A 32 bit floating point value is used as payload. */ NVTX_PAYLOAD_TYPE_INT32 = 5, /**< A 32 bit floating point value is used as payload. */ NVTX_PAYLOAD_TYPE_FLOAT = 6 /**< A 32 bit floating point value is used as payload. */ } nvtxPayloadType_t; /** \brief Event Attribute Structure. * \anchor EVENT_ATTRIBUTE_STRUCTURE * * This structure is used to describe the attributes of an event. The layout of * the structure is defined by a specific version of the tools extension * library and can change between different versions of the Tools Extension * library. * * \par Initializing the Attributes * * The caller should always perform the following three tasks when using * attributes: *
 *
 *   - Zero the structure
 *   - Set the version field
 *   - Set the size field
* * Zeroing the structure sets all the event attributes types and values * to the default value. * * The version and size field are used by the Tools Extension * implementation to handle multiple versions of the attributes structure. * * It is recommended that the caller use one of the following to methods * to initialize the event attributes structure: * * \par Method 1: Initializing nvtxEventAttributes for future compatibility * \code * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * \endcode * * \par Method 2: Initializing nvtxEventAttributes for a specific version * \code * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = 1; * eventAttrib.size = (uint16_t)(sizeof(nvtxEventAttributes_v1)); * \endcode * * If the caller uses Method 1 it is critical that the entire binary * layout of the structure be configured to 0 so that all fields * are initialized to the default value. * * The caller should either use both NVTX_VERSION and * NVTX_EVENT_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values * and a versioned type (Method 2). Using a mix of the two methods * will likely cause either source level incompatibility or binary * incompatibility in the future. * * \par Settings Attribute Types and Values * * * \par Example: * \code * // Initialize * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * * // Configure the Attributes * eventAttrib.colorType = NVTX_COLOR_ARGB; * eventAttrib.color = 0xFF880000; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "Example"; * \endcode * * In the example the caller does not have to set the value of * \ref ::nvtxEventAttributes_v2::category or * \ref ::nvtxEventAttributes_v2::payload as these fields were set to * the default value by {0}. * \sa * ::nvtxDomainMarkEx * ::nvtxDomainRangeStartEx * ::nvtxDomainRangePushEx */ typedef struct nvtxEventAttributes_v2 { /** * \brief Version flag of the structure. * * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs * supported in this header file. This can optionally be overridden to * another version of the tools extension library. */ uint16_t version; /** * \brief Size of the structure. * * Needs to be set to the size in bytes of the event attribute * structure used to specify the event. */ uint16_t size; /** * \brief ID of the category the event is assigned to. * * A category is a user-controlled ID that can be used to group * events. The tool may use category IDs to improve filtering or * enable grouping of events in the same category. The functions * \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used * to name a category. * * Default Value is 0 */ uint32_t category; /** \brief Color type specified in this attribute structure. * * Defines the color format of the attribute structure's \ref COLOR_FIELD * "color" field. * * Default Value is NVTX_COLOR_UNKNOWN */ int32_t colorType; /* nvtxColorType_t */ /** \brief Color assigned to this event. \anchor COLOR_FIELD * * The color that the tool should use to visualize the event. */ uint32_t color; /** * \brief Payload type specified in this attribute structure. * * Defines the payload format of the attribute structure's \ref PAYLOAD_FIELD * "payload" field. * * Default Value is NVTX_PAYLOAD_UNKNOWN */ int32_t payloadType; /* nvtxPayloadType_t */ int32_t reserved0; /** * \brief Payload assigned to this event. 
\anchor PAYLOAD_FIELD * * A numerical value that can be used to annotate an event. The tool could * use the payload data to reconstruct graphs and diagrams. */ union payload_t { uint64_t ullValue; int64_t llValue; double dValue; /* NVTX_VERSION_2 */ uint32_t uiValue; int32_t iValue; float fValue; } payload; /** \brief Message type specified in this attribute structure. * * Defines the message format of the attribute structure's \ref MESSAGE_FIELD * "message" field. * * Default Value is NVTX_MESSAGE_UNKNOWN */ int32_t messageType; /* nvtxMessageType_t */ /** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD * * The text message that is attached to an event. */ nvtxMessageValue_t message; } nvtxEventAttributes_v2; typedef struct nvtxEventAttributes_v2 nvtxEventAttributes_t; /** @} */ /*END defgroup*/ /* ========================================================================= */ /** \defgroup MARKERS_AND_RANGES Markers and Ranges * * See \ref MARKERS_AND_RANGES for more details * * @{ */ /** \name Marker */ /* ------------------------------------------------------------------------- */ /** \brief Marks an instantaneous event in the application. * * A marker can contain a text message or specify additional information * using the event attributes structure. These attributes include a text * message, color, category, and a payload. Each of the attributes is optional * and can only be sent out using the \ref nvtxDomainMarkEx function. * * nvtxDomainMarkEx(NULL, event) is equivalent to calling * nvtxMarkEx(event). * * \param domain - The domain of scoping the category. * \param eventAttrib - The event attribute structure defining the marker's * attribute types and attribute values. * * \sa * ::nvtxMarkEx * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Marks an instantaneous event in the application. * * A marker can contain a text message or specify additional information * using the event attributes structure. These attributes include a text * message, color, category, and a payload. Each of the attributes is optional * and can only be sent out using the \ref nvtxMarkEx function. * If \ref nvtxMarkA or \ref nvtxMarkW are used to specify the marker * or if an attribute is unspecified then a default value will be used. * * \param eventAttrib - The event attribute structure defining the marker's * attribute types and attribute values. * * \par Example: * \code * // zero the structure * nvtxEventAttributes_t eventAttrib = {0}; * // set the version and the size information * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * // configure the attributes. 0 is the default for all attributes. * eventAttrib.colorType = NVTX_COLOR_ARGB; * eventAttrib.color = 0xFF880000; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "Example nvtxMarkEx"; * nvtxMarkEx(&eventAttrib); * \endcode * * \sa * ::nvtxDomainMarkEx * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Marks an instantaneous event in the application. * * A marker created using \ref nvtxMarkA or \ref nvtxMarkW contains only a * text message. 
* * \param message - The message associated to this marker event. * * \par Example: * \code * nvtxMarkA("Example nvtxMarkA"); * nvtxMarkW(L"Example nvtxMarkW"); * \endcode * * \sa * ::nvtxDomainMarkEx * ::nvtxMarkEx * * \version \NVTX_VERSION_0 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message); NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message); /** @} */ /** \name Process Ranges */ /* ------------------------------------------------------------------------- */ /** \brief Starts a process range in a domain. * * \param domain - The domain of scoping the category. * \param eventAttrib - The event attribute structure defining the range's * attribute types and attribute values. * * \return The unique ID used to correlate a pair of Start and End events. * * \remarks Ranges defined by Start/End can overlap. * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain"); * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "my range"; * nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib); * // ... * nvtxDomainRangeEnd(rangeId); * \endcode * * \sa * ::nvtxDomainRangeEnd * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Starts a process range. * * \param eventAttrib - The event attribute structure defining the range's * attribute types and attribute values. * * \return The unique ID used to correlate a pair of Start and End events. * * \remarks Ranges defined by Start/End can overlap. * * \par Example: * \code * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib.category = 3; * eventAttrib.colorType = NVTX_COLOR_ARGB; * eventAttrib.color = 0xFF0088FF; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "Example Range"; * nvtxRangeId_t rangeId = nvtxRangeStartEx(&eventAttrib); * // ... * nvtxRangeEnd(rangeId); * \endcode * * \sa * ::nvtxRangeEnd * ::nvtxDomainRangeStartEx * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Starts a process range. * * \param message - The event message associated to this range event. * * \return The unique ID used to correlate a pair of Start and End events. * * \remarks Ranges defined by Start/End can overlap. * * \par Example: * \code * nvtxRangeId_t r1 = nvtxRangeStartA("Range 1"); * nvtxRangeId_t r2 = nvtxRangeStartW(L"Range 2"); * nvtxRangeEnd(r1); * nvtxRangeEnd(r2); * \endcode * * \sa * ::nvtxRangeEnd * ::nvtxRangeStartEx * ::nvtxDomainRangeStartEx * * \version \NVTX_VERSION_0 * @{ */ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message); NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Ends a process range. * * \param domain - The domain * \param id - The correlation ID returned from a nvtxRangeStart call. 
* * \remarks This function is offered completeness but is an alias for ::nvtxRangeEnd. * It does not need a domain param since that is associated iwth the range ID at ::nvtxDomainRangeStartEx * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain"); * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "my range"; * nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib); * // ... * nvtxDomainRangeEnd(rangeId); * \endcode * * \sa * ::nvtxDomainRangeStartEx * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Ends a process range. * * \param id - The correlation ID returned from an nvtxRangeStart call. * * \sa * ::nvtxDomainRangeStartEx * ::nvtxRangeStartEx * ::nvtxRangeStartA * ::nvtxRangeStartW * * \version \NVTX_VERSION_0 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id); /** @} */ /** \name Thread Ranges */ /* ------------------------------------------------------------------------- */ /** \brief Starts a nested thread range. * * \param domain - The domain of scoping. * \param eventAttrib - The event attribute structure defining the range's * attribute types and attribute values. * * \return The 0 based level of range being started. This value is scoped to the domain. * If an error occurs, a negative value is returned. * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain"); * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib.colorType = NVTX_COLOR_ARGB; * eventAttrib.color = 0xFFFF0000; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "Level 0"; * nvtxDomainRangePushEx(domain, &eventAttrib); * * // Re-use eventAttrib * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE; * eventAttrib.message.unicode = L"Level 1"; * nvtxDomainRangePushEx(domain, &eventAttrib); * * nvtxDomainRangePop(domain); //level 1 * nvtxDomainRangePop(domain); //level 0 * \endcode * * \sa * ::nvtxDomainRangePop * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Starts a nested thread range. * * \param eventAttrib - The event attribute structure defining the range's * attribute types and attribute values. * * \return The 0 based level of range being started. This level is per domain. * If an error occurs a negative value is returned. 
* * \par Example: * \code * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib.colorType = NVTX_COLOR_ARGB; * eventAttrib.color = 0xFFFF0000; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "Level 0"; * nvtxRangePushEx(&eventAttrib); * * // Re-use eventAttrib * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE; * eventAttrib.message.unicode = L"Level 1"; * nvtxRangePushEx(&eventAttrib); * * nvtxRangePop(); * nvtxRangePop(); * \endcode * * \sa * ::nvtxDomainRangePushEx * ::nvtxRangePop * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Starts a nested thread range. * * \param message - The event message associated to this range event. * * \return The 0 based level of range being started. If an error occurs a * negative value is returned. * * \par Example: * \code * nvtxRangePushA("Level 0"); * nvtxRangePushW(L"Level 1"); * nvtxRangePop(); * nvtxRangePop(); * \endcode * * \sa * ::nvtxDomainRangePushEx * ::nvtxRangePop * * \version \NVTX_VERSION_0 * @{ */ NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message); NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Ends a nested thread range. * * \return The level of the range being ended. If an error occurs a negative * value is returned on the current thread. * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreate("example library"); * nvtxDomainRangePushA(domain, "Level 0"); * nvtxDomainRangePushW(domain, L"Level 1"); * nvtxDomainRangePop(domain); * nvtxDomainRangePop(domain); * \endcode * * \sa * ::nvtxRangePushEx * ::nvtxRangePushA * ::nvtxRangePushW * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Ends a nested thread range. * * \return The level of the range being ended. If an error occurs a negative * value is returned on the current thread. * * \par Example: * \code * nvtxRangePushA("Level 0"); * nvtxRangePushW(L"Level 1"); * nvtxRangePop(); * nvtxRangePop(); * \endcode * * \sa * ::nvtxRangePushEx * ::nvtxRangePushA * ::nvtxRangePushW * * \version \NVTX_VERSION_0 * @{ */ NVTX_DECLSPEC int NVTX_API nvtxRangePop(void); /** @} */ /** @} */ /*END defgroup*/ /* ========================================================================= */ /** \defgroup RESOURCE_NAMING Resource Naming * * See \ref RESOURCE_NAMING for more details * * @{ */ /* ------------------------------------------------------------------------- */ /** \name Functions for Generic Resource Naming*/ /* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */ /** \cond SHOW_HIDDEN * \brief Resource typing helpers. 
* * Classes are used to make it easy to create a series of resource types * per API without collisions */ #define NVTX_RESOURCE_MAKE_TYPE(CLASS, INDEX) ((((uint32_t)(NVTX_RESOURCE_CLASS_ ## CLASS))<<16)|((uint32_t)(INDEX))) #define NVTX_RESOURCE_CLASS_GENERIC 1 /** \endcond */ /* ------------------------------------------------------------------------- */ /** \brief Generic resource type for when a resource class is not available. * * \sa * ::nvtxDomainResourceCreate * * \version \NVTX_VERSION_2 */ typedef enum nvtxResourceGenericType_t { NVTX_RESOURCE_TYPE_UNKNOWN = 0, NVTX_RESOURCE_TYPE_GENERIC_POINTER = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 1), /**< Generic pointer assumed to have no collisions with other pointers. */ NVTX_RESOURCE_TYPE_GENERIC_HANDLE = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 2), /**< Generic handle assumed to have no collisions with other handles. */ NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 3), /**< OS native thread identifier. */ NVTX_RESOURCE_TYPE_GENERIC_THREAD_POSIX = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 4) /**< POSIX pthread identifier. */ } nvtxResourceGenericType_t; /** \brief Resource Attribute Structure. * \anchor RESOURCE_ATTRIBUTE_STRUCTURE * * This structure is used to describe the attributes of a resource. The layout of * the structure is defined by a specific version of the tools extension * library and can change between different versions of the Tools Extension * library. * * \par Initializing the Attributes * * The caller should always perform the following three tasks when using * attributes: *
 *
 *   - Zero the structure
 *   - Set the version field
 *   - Set the size field
* * Zeroing the structure sets all the resource attributes types and values * to the default value. * * The version and size field are used by the Tools Extension * implementation to handle multiple versions of the attributes structure. * * It is recommended that the caller use one of the following to methods * to initialize the event attributes structure: * * \par Method 1: Initializing nvtxEventAttributes for future compatibility * \code * nvtxResourceAttributes_t attribs = {0}; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE; * \endcode * * \par Method 2: Initializing nvtxEventAttributes for a specific version * \code * nvtxResourceAttributes_v0 attribs = {0}; * attribs.version = 2; * attribs.size = (uint16_t)(sizeof(nvtxResourceAttributes_v0)); * \endcode * * If the caller uses Method 1 it is critical that the entire binary * layout of the structure be configured to 0 so that all fields * are initialized to the default value. * * The caller should either use both NVTX_VERSION and * NVTX_RESOURCE_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values * and a versioned type (Method 2). Using a mix of the two methods * will likely cause either source level incompatibility or binary * incompatibility in the future. * * \par Settings Attribute Types and Values * * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain"); * * // Initialize * nvtxResourceAttributes_t attribs = {0}; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE; * * // Configure the Attributes * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER; * attribs.identifier.pValue = (const void*)pMutex; * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; * attribs.message.ascii = "Single thread access to database."; * * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs); * \endcode * * \sa * ::nvtxDomainResourceCreate */ typedef struct nvtxResourceAttributes_v0 { /** * \brief Version flag of the structure. * * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs * supported in this header file. This can optionally be overridden to * another version of the tools extension library. */ uint16_t version; /** * \brief Size of the structure. * * Needs to be set to the size in bytes of this attribute * structure. */ uint16_t size; /** * \brief Identifier type specifies how to interpret the identifier field * * Defines the identifier format of the attribute structure's \ref RESOURCE_IDENTIFIER_FIELD * "identifier" field. * * Default Value is NVTX_RESOURCE_TYPE_UNKNOWN */ int32_t identifierType; /* values from enums following the pattern nvtxResource[name]Type_t */ /** * \brief Identifier for the resource. * \anchor RESOURCE_IDENTIFIER_FIELD * * An identifier may be a pointer or a handle to an OS or middleware API object. * The resource type will assist in avoiding collisions where handles values may collide. */ union identifier_t { const void* pValue; uint64_t ullValue; } identifier; /** \brief Message type specified in this attribute structure. * * Defines the message format of the attribute structure's \ref RESOURCE_MESSAGE_FIELD * "message" field. * * Default Value is NVTX_MESSAGE_UNKNOWN */ int32_t messageType; /* nvtxMessageType_t */ /** \brief Message assigned to this attribute structure. \anchor RESOURCE_MESSAGE_FIELD * * The text message that is attached to a resource. 
*/ nvtxMessageValue_t message; } nvtxResourceAttributes_v0; typedef struct nvtxResourceAttributes_v0 nvtxResourceAttributes_t; /* \cond SHOW_HIDDEN * \version \NVTX_VERSION_2 */ #define NVTX_RESOURCE_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxResourceAttributes_v0) ) ) typedef struct nvtxResourceHandle* nvtxResourceHandle_t; /** \endcond */ /* ------------------------------------------------------------------------- */ /** \brief Create a resource object to track and associate data with OS and middleware objects * * Allows users to associate an API handle or pointer with a user-provided name. * * * \param domain - Domain to own the resource object * \param attribs - Attributes to be associated with the resource * * \return A handle that represents the newly created resource object. * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain"); * nvtxResourceAttributes_t attribs = {0}; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE; * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER; * attribs.identifier.pValue = (const void*)pMutex; * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; * attribs.message.ascii = "Single thread access to database."; * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs); * \endcode * * \sa * ::nvtxResourceAttributes_t * ::nvtxDomainResourceDestroy * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Destroy a resource object to track and associate data with OS and middleware objects * * Allows users to associate an API handle or pointer with a user-provided name. * * \param resource - Handle to the resource in which to operate. * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain"); * nvtxResourceAttributes_t attribs = {0}; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE; * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER; * attribs.identifier.pValue = (const void*)pMutex; * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; * attribs.message.ascii = "Single thread access to database."; * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs); * nvtxDomainResourceDestroy(handle); * \endcode * * \sa * ::nvtxDomainResourceCreate * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource); /** @} */ /** \name Functions for NVTX Category Naming*/ /* ------------------------------------------------------------------------- */ /** * \brief Annotate an NVTX category used within a domain. * * Categories are used to group sets of events. Each category is identified * through a unique ID and that ID is passed into any of the marker/range * events to assign that event to a specific category. The nvtxDomainNameCategory * function calls allow the user to assign a name to a category ID that is * specific to the domain. * * nvtxDomainNameCategory(NULL, category, name) is equivalent to calling * nvtxNameCategory(category, name). * * \param domain - The domain of scoping the category. * \param category - The category ID to name. * \param name - The name of the category. * * \remarks The category names are tracked per domain. 
* * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("example"); * nvtxDomainNameCategoryA(domain, 1, "Memory Allocation"); * nvtxDomainNameCategoryW(domain, 2, L"Memory Transfer"); * \endcode * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name); NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name); /** @} */ /** \brief Annotate an NVTX category. * * Categories are used to group sets of events. Each category is identified * through a unique ID and that ID is passed into any of the marker/range * events to assign that event to a specific category. The nvtxNameCategory * function calls allow the user to assign a name to a category ID. * * \param category - The category ID to name. * \param name - The name of the category. * * \remarks The category names are tracked per process. * * \par Example: * \code * nvtxNameCategory(1, "Memory Allocation"); * nvtxNameCategory(2, "Memory Transfer"); * nvtxNameCategory(3, "Memory Object Lifetime"); * \endcode * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name); /** @} */ /** \name Functions for OS Threads Naming*/ /* ------------------------------------------------------------------------- */ /** \brief Annotate an OS thread. * * Allows the user to name an active thread of the current process. If an * invalid thread ID is provided or a thread ID from a different process is * used the behavior of the tool is implementation dependent. * * Tools expect thread ID to be a number that uniquely identifies the thread * at the time of the call. Note that a thread's ID can be reused after * it is destroyed. Tools may choose how to handle aliasing of thread IDs. * * POSIX pthread_t type returned by pthread_self() may not comply with these * expectations. Please use OS-specific thread ID instead of pthread_t. * * The thread name is associated to the default domain. To support domains * use resource objects via ::nvtxDomainResourceCreate. * * \param threadId - The ID of the thread to name. * \param name - The name of the thread. 
* * \par Examples: * MS Windows: * \code * #include * nvtxNameOsThread(GetCurrentThreadId(), "Current thread"); * nvtxNameOsThread(GetThreadId(SomeThreadHandle), "Other thread"); * \endcode * * Android: * \code * #include * nvtxNameOsThreadA(gettid(), "Current thread"); * nvtxNameOsThreadA(getpid(), "Main thread"); * \endcode * * Linux: * \code * #include * nvtxNameOsThreadA(syscall(SYS_gettid), "Current thread"); * \endcode * \code * #include * nvtxNameOsThreadA(getpid(), "Main thread"); * \endcode * * OS X: * \code * #include * nvtxNameOsThreadA(syscall(SYS_thread_selfid), "Current thread"); * \endcode * \code * #include * __uint64_t id; * pthread_threadid_np(pthread_self(), &id); * nvtxNameOsThreadA(id, "Current thread"); * pthread_threadid_np(somePThreadId, &id); * nvtxNameOsThreadA(id, "Other thread"); * \endcode * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name); /** @} */ /** @} */ /*END defgroup*/ /* ========================================================================= */ /** \defgroup STRING_REGISTRATION String Registration * * Registered strings are intended to increase performance by lowering instrumentation * overhead. String may be registered once and the handle may be passed in place of * a string where an the APIs may allow. * * See \ref STRING_REGISTRATION for more details * * @{ */ /* ------------------------------------------------------------------------- */ /** \brief Register a string. * Registers an immutable string with NVTX. Once registered the pointer used * to register the domain name can be used in nvtxEventAttributes_t * \ref MESSAGE_FIELD. This allows NVTX implementation to skip copying the * contents of the message on each event invocation. * * String registration is an optimization. It is recommended to use string * registration if the string will be passed to an event many times. * * String are not unregistered, except that by unregistering the entire domain * * \param domain - Domain handle. If NULL then the global domain is used. * \param string - A unique pointer to a sequence of characters. * * \return A handle representing the registered string. * * \par Example: * \code * nvtxDomainCreateA("com.nvidia.nvtx.example"); * nvtxStringHandle_t message = nvtxDomainRegisterStringA(domain, "registered string"); * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_REGISTERED; * eventAttrib.message.registered = message; * \endcode * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string); NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string); /** @} */ /** @} */ /*END defgroup*/ /* ========================================================================= */ /** \defgroup DOMAINS Domains * * Domains are used to group events to a developer defined scope. Middleware * vendors may also scope their own events to avoid collisions with the * the application developer's events, so that the application developer may * inspect both parts and easily differentiate or filter them. 
By default * all events are scoped to a global domain where NULL is provided or when * using APIs provided b versions of NVTX below v2 * * Domains are intended to be typically long lived objects with the intention * of logically separating events of large modules from each other such as * middleware libraries from each other and the main application. * * See \ref DOMAINS for more details * * @{ */ /* ------------------------------------------------------------------------- */ /** \brief Register a NVTX domain. * * Domains are used to scope annotations. All NVTX_VERSION_0 and NVTX_VERSION_1 * annotations are scoped to the global domain. The function nvtxDomainCreate * creates a new named domain. * * Each domain maintains its own nvtxRangePush and nvtxRangePop stack. * * \param name - A unique string representing the domain. * * \return A handle representing the domain. * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example"); * * nvtxMarkA("nvtxMarkA to global domain"); * * nvtxEventAttributes_t eventAttrib1 = {0}; * eventAttrib1.version = NVTX_VERSION; * eventAttrib1.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib1.message.ascii = "nvtxDomainMarkEx to global domain"; * nvtxDomainMarkEx(NULL, &eventAttrib1); * * nvtxEventAttributes_t eventAttrib2 = {0}; * eventAttrib2.version = NVTX_VERSION; * eventAttrib2.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib2.message.ascii = "nvtxDomainMarkEx to com.nvidia.nvtx.example"; * nvtxDomainMarkEx(domain, &eventAttrib2); * nvtxDomainDestroy(domain); * \endcode * * \sa * ::nvtxDomainDestroy * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* name); NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Unregister a NVTX domain. * * Unregisters the domain handle and frees all domain specific resources. 
* * \param domain - the domain handle * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example"); * nvtxDomainDestroy(domain); * \endcode * * \sa * ::nvtxDomainCreateA * ::nvtxDomainCreateW * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain); /** @} */ /** @} */ /*END defgroup*/ /* ========================================================================= */ /** \cond SHOW_HIDDEN */ #ifdef UNICODE #define nvtxMark nvtxMarkW #define nvtxRangeStart nvtxRangeStartW #define nvtxRangePush nvtxRangePushW #define nvtxNameCategory nvtxNameCategoryW #define nvtxNameOsThread nvtxNameOsThreadW /* NVTX_VERSION_2 */ #define nvtxDomainCreate nvtxDomainCreateW #define nvtxDomainRegisterString nvtxDomainRegisterStringW #define nvtxDomainNameCategory nvtxDomainNameCategoryW #else #define nvtxMark nvtxMarkA #define nvtxRangeStart nvtxRangeStartA #define nvtxRangePush nvtxRangePushA #define nvtxNameCategory nvtxNameCategoryA #define nvtxNameOsThread nvtxNameOsThreadA /* NVTX_VERSION_2 */ #define nvtxDomainCreate nvtxDomainCreateA #define nvtxDomainRegisterString nvtxDomainRegisterStringA #define nvtxDomainNameCategory nvtxDomainNameCategoryA #endif /** \endcond */ #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ #define NVTX_IMPL_GUARD /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxTypes.h" #ifndef NVTX_NO_IMPL #include "nvtxDetail/nvtxImpl.h" #endif /*NVTX_NO_IMPL*/ #undef NVTX_IMPL_GUARD #endif /* !defined(NVTX_VERSION) */ nccl-2.22.3-1/src/include/nvtx3/nvToolsExtCounters.h000066400000000000000000000253401463451655400222130ustar00rootroot00000000000000/** * The NVTX counters extension is intended to collect counter values of various * sources. It uses the NVTX payload extension to specify the data layout a * counter group. * * A counter group is a set of counters that are collected together (at the same * time). Counters are always registered as a group. Hence, a single counter is * represented by a group with one counter. * * A sample refers to all values for a given timestamp. These values must * include counter values and may include multiple instances of a counter group. * * The NVTX domain handle is the first argument to all counter collect * functions. 0/NULL/nullptr represents the default domain (no domain). */ #include "nvToolsExtPayload.h" #ifndef NVTOOLSEXT_COUNTERS_H #define NVTOOLSEXT_COUNTERS_H /** * \brief The compatibility ID is used for versioning of this extension. */ #ifndef NVTX_EXT_COUNTERS_COMPATID #define NVTX_EXT_COUNTERS_COMPATID 0x0101 #endif /** * \brief The module ID identifies the payload extension. It has to be unique * among the extension modules. */ #ifndef NVTX_EXT_COUNTERS_MODULEID #define NVTX_EXT_COUNTERS_MODULEID 4 #endif /** Identifies an invalid scope and indicates an error if returned by `nvtxScopeRegister`. */ #define NVTX_SCOPE_NONE 0 /* no scope */ #define NVTX_SCOPE_ROOT 1 #define NVTX_SCOPE_CURRENT_HW_MACHINE 2 /* Node/machine name, Device? 
*/ #define NVTX_SCOPE_CURRENT_HW_SOCKET 3 #define NVTX_SCOPE_CURRENT_HW_CPU 4 #define NVTX_SCOPE_CURRENT_HW_CPU_LOGICAL 5 /* Innermost HW execution context at registration time */ #define NVTX_SCOPE_CURRENT_HW_INNERMOST 6 /* Virtualized hardware, virtual machines, OS (if you don't know any better) */ #define NVTX_SCOPE_CURRENT_HYPERVISOR 7 #define NVTX_SCOPE_CURRENT_VM 8 #define NVTX_SCOPE_CURRENT_KERNEL 9 #define NVTX_SCOPE_CURRENT_CONTAINER 10 #define NVTX_SCOPE_CURRENT_OS 11 /* Software scopes */ #define NVTX_SCOPE_CURRENT_SW_PROCESS 12 /* Process scope */ #define NVTX_SCOPE_CURRENT_SW_THREAD 13 /* Thread scope */ #define NVTX_SCOPE_CURRENT_SW_FIBER 14 /* Innermost SW execution context at registration time */ #define NVTX_SCOPE_CURRENT_SW_INNERMOST 15 /** Static (user-provided) scope IDs (feed forward) */ #define NVTX_SCOPE_ID_STATIC_START (1 << 24) /** Dynamically (tool) generated scope IDs */ #define NVTX_SCOPE_ID_DYNAMIC_START 4294967296 /* 1 << 32 */ /** Identifier of the semantic extension for counters. */ #define NVTX_SEMANTIC_ID_COUNTERS_V1 5 /*** Flags to augment the counter value. ***/ #define NVTX_COUNTERS_FLAG_NONE 0 /** * Convert the fixed point value to a normalized floating point. * Use the sign/unsign from the underlying type this flag is applied to. * Unsigned [0f : 1f] or signed [-1f : 1f] */ #define NVTX_COUNTERS_FLAG_NORM (1 << 1) /** * Tools should apply scale and limits when graphing, ideally in a "soft" way to * to see when limits are exceeded. */ #define NVTX_COUNTERS_FLAG_LIMIT_MIN (1 << 2) #define NVTX_COUNTERS_FLAG_LIMIT_MAX (1 << 3) #define NVTX_COUNTERS_FLAG_LIMITS \ (NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX) /** Counter time scope **/ #define NVTX_COUNTERS_FLAG_TIME_POINT (1 << 5) #define NVTX_COUNTERS_FLAG_TIME_SINCE_LAST (2 << 5) #define NVTX_COUNTERS_FLAG_TIME_UNTIL_NEXT (3 << 5) #define NVTX_COUNTERS_FLAG_TIME_SINCE_START (4 << 5) /** Counter value type **/ #define NVTX_COUNTERS_FLAG_VALUE_ABSOLUTE (1 << 10) #define NVTX_COUNTERS_FLAG_VALUE_DELTA (2 << 10) // delta to previous counter sample /** Counter visualization hints **/ #define NVTX_COUNTERS_FLAG_INTERPOLATE (1 << 14) /** Datatypes for limits union (value of `limitType`). */ #define NVTX_COUNTERS_LIMIT_I64 0 #define NVTX_COUNTERS_LIMIT_U64 1 #define NVTX_COUNTERS_LIMIT_F64 2 /** Reasons for the missing sample value. */ #define NVTX_COUNTERS_SAMPLE_ZERO 0 #define NVTX_COUNTERS_SAMPLE_UNCHANGED 1 #define NVTX_COUNTERS_SAMPLE_UNAVAILABLE 2 #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /** * \brief Specify additional properties of a counter or counter group. */ typedef struct nvtxSemanticsCounter_v1 { /** Header of the semantic extension (with identifier, version, etc.). */ struct nvtxSemanticsHeader_v1 header; /** * Flag if normalization, scale limits, etc. should be applied to counter * values. */ uint64_t flags; /** Unit of the counter value (case insensitive) */ const char* unit; /** Should be 1 if not used. */ uint64_t unitScaleNumerator; /** Should be 1 if not used. */ uint64_t unitScaleDenominator; /** Determines the used union member. Use defines `NVTX_COUNTERS_LIMIT_*`. */ int64_t limitType; /** Soft graph limit. */ union limits_t { int64_t i64[2]; uint64_t u64[2]; double d[2]; } limits; } nvtxSemanticsCounter_t; typedef struct nvtxCountersAttr_v1 { size_t structSize; /** * A schema ID referring to the data layout of the counter group or a * predefined NVTX payloads number type. */ uint64_t schemaId; /** Name of the counter group. 
*/ const char* name; /** Identifier of the scope of the counters. */ uint64_t scopeId; /** * (Optional) Specify additional semantics for a counter (group). The * semantics provided are applied to the all counters in a group. If the * semantics should only refer to a single counter in a group, the semantics * field of the payload entry has to be used. Accepted semantics are * `nvtxSemanticsCounter_t` and `nvtxSemanticsTime_t`. */ const nvtxSemanticsHeader_t* semantics; } nvtxCountersAttr_t; /* Forward declaration of opaque counter group registration structure */ struct nvtxCountersRegistration_st; typedef struct nvtxCountersRegistration_st nvtxCountersRegistration; /* \brief Counters Handle Structure. * \anchor COUNTERS_HANDLE_STRUCTURE * * This structure is opaque to the user and is used as a handle to reference a counter group. * This type is returned from tools when using the NVTX API to create a counters group. */ typedef nvtxCountersRegistration* nvtxCountersHandle_t; typedef struct nvtxCountersBatch_v1 { /** Handle to attributes (data layout, scope, etc.) of a counter (group). */ nvtxCountersHandle_t hCounter; /** Array of counter samples. */ const void* counters; /** Size of the `counters` array (in bytes). */ size_t cntArrSize; /** Array of timestamps or reference-time plus delta pair. `NULL` is used, if timestamps are part of the counter (group) layout.) */ const void* timestamps; /** Size of the `timestamps` array or definition (in bytes). */ size_t tsSize; } nvtxCountersBatch_t; /** * \brief Register a counter group. * * @param hDomain NVTX domain handle. * @param attr Pointer to the attributes of the counter (group). * * @return Counter handle identifying a counter or counter (group). * The counter handle is unique within the NVTX domain. */ NVTX_DECLSPEC nvtxCountersHandle_t NVTX_API nvtxCountersRegister( nvtxDomainHandle_t hDomain, const nvtxCountersAttr_t* attr); /** * \brief Sample one integer counter by value immediately (the NVTX tool determines the timestamp). * * @param hDomain handle of the NVTX domain. * @param hCounter handle of the NVTX counter (group). * @param value 64-bit integer counter value. */ NVTX_DECLSPEC void NVTX_API nvtxCountersSampleInt64( nvtxDomainHandle_t hDomain, nvtxCountersHandle_t hCounter, int64_t value); /** * \brief Sample one floating point counter by value immediately (the NVTX tool determines the timestamp). * * @param hDomain handle of the NVTX domain. * @param hCounter handle of the NVTX counter (group). * @param value 64-bit floating-point counter value. */ NVTX_DECLSPEC void NVTX_API nvtxCountersSampleFloat64( nvtxDomainHandle_t hDomain, nvtxCountersHandle_t hCounter, double value); /** * \brief Sample a counter group by reference immediately (the NVTX tool determines the timestamp). * * @param hDomain handle of the NVTX domain. * @param hCounter handle of the NVTX counter (group). * @param counters pointer to one or more counter values. * @param size size of the counter value(s) in bytes. */ NVTX_DECLSPEC void NVTX_API nvtxCountersSample( nvtxDomainHandle_t hDomain, nvtxCountersHandle_t hCounter, void* values, size_t size); /** * \brief Sample without value. * * @param hDomain handle of the NVTX domain. * @param hCounter handle of the NVTX counter (group). * @param reason reason for the missing sample value. */ NVTX_DECLSPEC void NVTX_API nvtxCountersSampleNoValue( nvtxDomainHandle_t hDomain, nvtxCountersHandle_t hCounter, uint8_t reason); /** * \brief Submit a batch of counters in the given domain. 
* Timestamps are part of the counter sample data. * * The size of a data sampling point is defined by the `staticSize` field of the * payload schema. An NVTX tool can assume that the counter samples are stored * as an array with each entry being `staticSize` bytes. * * @param hDomain handle of the NVTX domain * @param hCounter handle of the counter group (includes counter data decoding schema) * @param counters blob containing counter data and timestamps * @param size size of the counter data blob in bytes */ NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatch( nvtxDomainHandle_t hDomain, nvtxCountersHandle_t hCounter, const void* counters, size_t size); /** * \brief Submit a batch of counters in the given domain. * Timestamps are separated from the counter data. * * @param hDomain handle of the NVTX domain * @param counterBatch Pointer to the counter data to be submitted. */ NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatchEx( nvtxDomainHandle_t hDomain, const nvtxCountersBatch_t* counterBatch); #define NVTX3EXT_CBID_nvtxCountersRegister 0 #define NVTX3EXT_CBID_nvtxCountersSampleInt64 1 #define NVTX3EXT_CBID_nvtxCountersSampleFloat64 2 #define NVTX3EXT_CBID_nvtxCountersSample 3 #define NVTX3EXT_CBID_nvtxCountersSampleNoValue 4 #define NVTX3EXT_CBID_nvtxCountersSubmitBatch 5 #define NVTX3EXT_CBID_nvtxCountersSubmitBatchEx 6 #ifdef __GNUC__ #pragma GCC visibility push(internal) #endif #define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxExtTypes.h" #undef NVTX_EXT_TYPES_GUARD #ifndef NVTX_NO_IMPL #define NVTX_EXT_IMPL_COUNTERS_GUARD /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxExtImplCounters_v1.h" #undef NVTX_EXT_IMPL_COUNTERS_GUARD #endif /*NVTX_NO_IMPL*/ #ifdef __GNUC__ #pragma GCC visibility pop #endif #ifdef __cplusplus } #endif /* __cplusplus */ #endif /* NVTOOLSEXT_COUNTERS_H */nccl-2.22.3-1/src/include/nvtx3/nvToolsExtCuda.h000066400000000000000000000110131463451655400212550ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #include "nvToolsExt.h" #include "cuda.h" #ifndef NVTOOLSEXT_CUDA_V3 #define NVTOOLSEXT_CUDA_V3 #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /* ========================================================================= */ /** \name Functions for CUDA Resource Naming */ /** \addtogroup RESOURCE_NAMING * \section RESOURCE_NAMING_CUDA CUDA Resource Naming * * This section covers the API functions that allow to annotate CUDA resources * with user-provided names. 
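 *
 * A minimal usage sketch for the functions in this section (illustrative
 * only; it assumes cuInit() has already been called and omits error checking):
 * \code
 * CUdevice device;
 * CUcontext context;
 * CUstream stream;
 * cuDeviceGet(&device, 0);
 * cuCtxCreate(&context, 0, device);
 * cuStreamCreate(&stream, CU_STREAM_DEFAULT);
 * nvtxNameCuDeviceA(device, "GPU 0");
 * nvtxNameCuContextA(context, "Main context");
 * nvtxNameCuStreamA(stream, "Compute stream");
 * \endcode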
* * @{ */ /* ------------------------------------------------------------------------- */ /* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ #define NVTX_RESOURCE_CLASS_CUDA 4 /** \endcond */ /* ------------------------------------------------------------------------- */ /** \brief Resource types for CUDA */ typedef enum nvtxResourceCUDAType_t { NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */ NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */ NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */ NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */ } nvtxResourceCUDAType_t; /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA device. * * Allows the user to associate a CUDA device with a user-provided name. * * \param device - The handle of the CUDA device to name. * \param name - The name of the CUDA device. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA context. * * Allows the user to associate a CUDA context with a user-provided name. * * \param context - The handle of the CUDA context to name. * \param name - The name of the CUDA context. * * \par Example: * \code * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice ); * if ( CUDA_SUCCESS != status ) * goto Error; * nvtxNameCuContext(cuContext, "CTX_NAME"); * \endcode * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA stream. * * Allows the user to associate a CUDA stream with a user-provided name. * * \param stream - The handle of the CUDA stream to name. * \param name - The name of the CUDA stream. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA event. * * Allows the user to associate a CUDA event with a user-provided name. * * \param event - The handle of the CUDA event to name. * \param name - The name of the CUDA event. 
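 *
 * \par Example (sketch; error checking abbreviated):
 * \code
 * CUevent cuEvent;
 * CUresult status = cuEventCreate(&cuEvent, CU_EVENT_DEFAULT);
 * if (CUDA_SUCCESS == status)
 *     nvtxNameCuEventA(cuEvent, "EVENT_NAME");
 * \endcode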
* * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name); /** @} */ /** @} */ /* END RESOURCE_NAMING */ /* ========================================================================= */ #ifdef UNICODE #define nvtxNameCuDevice nvtxNameCuDeviceW #define nvtxNameCuContext nvtxNameCuContextW #define nvtxNameCuStream nvtxNameCuStreamW #define nvtxNameCuEvent nvtxNameCuEventW #else #define nvtxNameCuDevice nvtxNameCuDeviceA #define nvtxNameCuContext nvtxNameCuContextA #define nvtxNameCuStream nvtxNameCuStreamA #define nvtxNameCuEvent nvtxNameCuEventA #endif #ifdef __cplusplus } #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL #define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxImplCuda_v3.h" #undef NVTX_IMPL_GUARD_CUDA #endif /*NVTX_NO_IMPL*/ #endif /* NVTOOLSEXT_CUDA_V3 */ nccl-2.22.3-1/src/include/nvtx3/nvToolsExtCudaRt.h000066400000000000000000000073401463451655400215730ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #include "nvToolsExt.h" #include "cuda.h" #include "driver_types.h" #ifndef NVTOOLSEXT_CUDART_V3 #define NVTOOLSEXT_CUDART_V3 #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /* ========================================================================= */ /** \name Functions for CUDA Resource Naming */ /** \addtogroup RESOURCE_NAMING * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming * * This section covers the API functions that allow to annotate CUDA resources * with user-provided names. * * @{ */ /* ------------------------------------------------------------------------- */ /* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ #define NVTX_RESOURCE_CLASS_CUDART 5 /** \endcond */ /* ------------------------------------------------------------------------- */ /** \brief Resource types for CUDART */ typedef enum nvtxResourceCUDARTType_t { NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */ NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */ NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */ } nvtxResourceCUDARTType_t; /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA device. * * Allows the user to associate a CUDA device with a user-provided name. * * \param device - The id of the CUDA device to name. * \param name - The name of the CUDA device. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA stream. * * Allows the user to associate a CUDA stream with a user-provided name. * * \param stream - The handle of the CUDA stream to name. * \param name - The name of the CUDA stream. 
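 *
 * \par Example (sketch; error checking abbreviated):
 * \code
 * cudaStream_t stream;
 * if (cudaSuccess == cudaStreamCreate(&stream))
 *     nvtxNameCudaStreamA(stream, "STREAM_NAME");
 * \endcode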
* * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA event. * * Allows the user to associate a CUDA event with a user-provided name. * * \param event - The handle of the CUDA event to name. * \param name - The name of the CUDA event. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name); /** @} */ /** @} */ /* END RESOURCE_NAMING */ /* ========================================================================= */ #ifdef UNICODE #define nvtxNameCudaDevice nvtxNameCudaDeviceW #define nvtxNameCudaStream nvtxNameCudaStreamW #define nvtxNameCudaEvent nvtxNameCudaEventW #else #define nvtxNameCudaDevice nvtxNameCudaDeviceA #define nvtxNameCudaStream nvtxNameCudaStreamA #define nvtxNameCudaEvent nvtxNameCudaEventA #endif #ifdef __cplusplus } #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL #define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxImplCudaRt_v3.h" #undef NVTX_IMPL_GUARD_CUDART #endif /*NVTX_NO_IMPL*/ #endif /* NVTOOLSEXT_CUDART_V3 */ nccl-2.22.3-1/src/include/nvtx3/nvToolsExtMem.h000066400000000000000000000644101463451655400211300ustar00rootroot00000000000000/* * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #include "nvToolsExt.h" #ifndef NVTOOLSEXTV3_MEM_V1 #define NVTOOLSEXTV3_MEM_V1 #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ #define NVTX_EXT_MODULEID_MEM 1 /* \cond SHOW_HIDDEN * \brief A compatibility ID value used in structures and initialization to * identify version differences. */ #define NVTX_EXT_COMPATID_MEM 0x0102 /* \cond SHOW_HIDDEN * \brief This value is returned by functions that return `nvtxMemHeapHandle_t`, * if a tool is not attached. */ #define NVTX_MEM_HEAP_HANDLE_NO_TOOL ((nvtxMemHeapHandle_t)(intptr_t)-1) /* \cond SHOW_HIDDEN * \brief This value is returned by functions that return `nvtxMemRegionHandle_t` * if a tool is not attached. */ #define NVTX_MEM_REGION_HANDLE_NO_TOOL ((nvtxMemRegionHandle_t)(intptr_t)-1) /* \cond SHOW_HIDDEN * \brief This value is returned by functions that return `nvtxMemPermissionsHandle_t` * if a tool is not attached. */ #define NVTX_MEM_PERMISSIONS_HANDLE_NO_TOOL ((nvtxMemPermissionsHandle_t)-1) /* \cond SHOW_HIDDEN * \brief This should not be used and is considered an error but defined to * detect an accidental use of zero or NULL. */ #define NVTX_MEM_HEAP_USAGE_UNKNOWN 0x0 /* \cond SHOW_HIDDEN * \brief This should not be used and is considered an error but defined to * detect an accidental use of zero or NULL. */ #define NVTX_MEM_TYPE_UNKNOWN 0x0 /* ------------------------------------------------------------------------- */ /** \defgroup MEMORY Memory * See page \ref PAGE_MEMORY. * @{ */ /** * \brief To indicate the full process virtual address space as a heap for * functions where a nvtxMemHeapHandle_t is accepted. * * The heap by default is always read-write-execute permissions without creating regions. 
* Regions created in this heap have read-write access by default but not execute. */ #define NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE ((nvtxMemHeapHandle_t)0) /** \brief This heap is a sub-allocator. * * Heap created with this usage should not be accessed by the user until regions are registered. * Regions from a heap with this usage have read-write access by default but not execute. */ #define NVTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR 0x1 /** * \brief This is a heap of memory that has an explicit layout. * * The layout could be static or dynamic (calculated). This often represents an algorithm's * structures that are packed together. By default this heap is assumed to be accessible for * scopes where the memory is naturally accessible by hardware. Regions may be use to further * annotate or restrict access. A tool may have an option to be more strict, but special * consideration must be made for `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`. * * The behavior of this usage is similar to NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE but * a tool can use it to track special behaviors and reservation. * * Memory in a heap with this usage has read-write permissions by default but not execute without * creating regions. Regions created in this heap have the same default permission access. */ #define NVTX_MEM_HEAP_USAGE_TYPE_LAYOUT 0x2 /** * \brief Standard process userspace virtual addresses for linear allocations. * * APIs that map into this space, such as CUDA UVA should use this type. * * Relevant functions: cudaMalloc, cudaMallocManaged, cudaHostAlloc, cudaMallocHost * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is supported * * nvtxMemHeapRegister receives a heapDesc of type nvtxMemVirtualRangeDesc_t */ #define NVTX_MEM_TYPE_VIRTUAL_ADDRESS 0x1 /** * \brief To indicate you are modifying permissions to the process-wide * full virtual address space. * * This is a companion object to `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`. */ #define NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE ((nvtxMemPermissionsHandle_t)0) #define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_NONE 0x0 #define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_READ 0x1 #define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE 0x2 #define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_ATOMIC 0x4 /* \cond SHOW_HIDDEN * \brief Forward declaration of opaque memory heap structure. */ struct nvtxMemHeap_v1; typedef struct nvtxMemHeap_v1 nvtxMemHeap_t; /** \brief A handle returned by a tool to represent a memory heap. */ typedef nvtxMemHeap_t* nvtxMemHeapHandle_t; /* \cond SHOW_HIDDEN * \brief Forward declaration of opaque memory heap structure. */ struct nvtxMemRegion_v1; typedef struct nvtxMemRegion_v1 nvtxMemRegion_t; /** \brief A handle returned by a tool to represent a memory region. */ typedef nvtxMemRegion_t* nvtxMemRegionHandle_t; /** \brief A reference to a memory region (by pointer or handle). * Which member of the union will be determined by a type or flag field outside. */ typedef union nvtxMemRegionRef_t { void const* pointer; nvtxMemRegionHandle_t handle; } nvtxMemRegionRef_t; /* \cond SHOW_HIDDEN * \brief Forward declaration of opaque memory permissions structure */ struct nvtxMemPermissions_v1; typedef struct nvtxMemPermissions_v1 nvtxMemPermissions_t; /** \brief A handle returned by a tool to represent a memory permissions mask. 
*/ typedef nvtxMemPermissions_t* nvtxMemPermissionsHandle_t; typedef struct nvtxMemVirtualRangeDesc_v1 { size_t size; void const* ptr; } nvtxMemVirtualRangeDesc_v1 ; typedef nvtxMemVirtualRangeDesc_v1 nvtxMemVirtualRangeDesc_t; /** \brief structure to describe a heap in process virtual memory. */ typedef struct nvtxMemHeapDesc_v1 { uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ uint16_t structSize; /* Size of the structure. */ uint32_t reserved0; /** \brief Usage characteristics of the heap * * Usage characteristics help tools like memcheckers, santiizer, * as well as other debugging and profiling tools to determine some * special behaviors they should apply to the heap and it's regions. * The value follows the convention NVTX_MEM_HEAP_USAGE_* * * Default Value is 0, which is invalid. */ uint32_t usage; /** \brief Memory type characteristics of the heap * * The 'type' indicates how to interpret the ptr field of the heapDesc. * This is intended to support many additional types of memory, beyond * standard process virtual memory, such as API specific memory only * addressed by handles or multi-dimensional memory requiring more complex * descriptions to handle features like strides, tiling, or interlace. * * The values conforms to NVTX_MEM_TYPE_* * * The value in the field 'type' identifies the descriptor type that will * be in the field 'typeSpecificDesc'. 'typeSpecificDesc' is void* because * it is extensible. Example usage is if type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS, * then typeSpecificDesc points to a nvtxMemVirtualRangeDesc_t. * * Default Value is 0, which is invalid. */ uint32_t type; /** \brief size of the heap memory descriptor pointed to by typeSpecificDesc * * Default Value is 0 which is invalid. */ size_t typeSpecificDescSize; /** \brief Pointer to the heap memory descriptor * * The value in the field 'type' identifies the descriptor type that will * be in the field 'typeSpecificDesc'. 'typeSpecificDesc' is void* because * it is extensible. Example usage is if type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS, * then typeSpecificDesc points to a nvtxMemVirtualRangeDesc_t. * * Default Value is 0, which is invalid. */ void const* typeSpecificDesc; /** \brief ID of the category the event is assigned to. * * A category is a user-controlled ID that can be used to group * events. The tool may use category IDs to improve filtering or * enable grouping of events in the same category. The functions * \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used * to name a category. * * Default Value is 0. */ uint32_t category; /** \brief Message type specified in this attribute structure. * * Defines the message format of the attribute structure's \ref MESSAGE_FIELD * "message" field. * * Default Value is `NVTX_MESSAGE_UNKNOWN`. */ uint32_t messageType; /* nvtxMessageType_t */ /** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD * * The text message that is attached to an event. */ nvtxMessageValue_t message; } nvtxMemHeapDesc_v1 ; typedef nvtxMemHeapDesc_v1 nvtxMemHeapDesc_t; /** * \brief Create a memory heap to represent a object or range of memory that will be further * sub-divided into regions. * * The handle used to addrss the heap will depend on the heap's type. Where the heap is virtual * memory accessible, the addrss of the heap's memory itself is it's handle. This will likewise * be returned from the function. 
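 *
 * The following sketch registers a pool of virtual memory used as a
 * sub-allocator and then registers one region inside it. The names `domain`,
 * `pool`, `poolSize`, `suballocPtr` and `suballocSize` are placeholders for
 * illustration only (`nvtxMemRegionsRegister` is declared further below):
 * \code
 * nvtxMemVirtualRangeDesc_t range = {0};
 * range.ptr  = pool;
 * range.size = poolSize;
 *
 * nvtxMemHeapDesc_t heapDesc = {0};
 * heapDesc.extCompatID = NVTX_EXT_COMPATID_MEM;
 * heapDesc.structSize  = sizeof(heapDesc);
 * heapDesc.usage       = NVTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR;
 * heapDesc.type        = NVTX_MEM_TYPE_VIRTUAL_ADDRESS;
 * heapDesc.typeSpecificDescSize = sizeof(range);
 * heapDesc.typeSpecificDesc     = &range;
 * nvtxMemHeapHandle_t heap = nvtxMemHeapRegister(domain, &heapDesc);
 *
 * nvtxMemVirtualRangeDesc_t regionRange = {0};
 * regionRange.ptr  = suballocPtr;   // pointer handed out by the sub-allocator
 * regionRange.size = suballocSize;
 *
 * nvtxMemRegionsRegisterBatch_t regions = {0};
 * regions.extCompatID = NVTX_EXT_COMPATID_MEM;
 * regions.structSize  = sizeof(regions);
 * regions.regionType  = NVTX_MEM_TYPE_VIRTUAL_ADDRESS;
 * regions.heap        = heap;
 * regions.regionCount = 1;
 * regions.regionDescElementSize = sizeof(regionRange);
 * regions.regionDescElements    = &regionRange;
 * nvtxMemRegionsRegister(domain, &regions);
 * \endcode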
 *
 * For more advanced types, where the heap is not virtual-memory accessible, the tool may be
 * responsible for returning a void const * that uniquely identifies the object. Please see
 * the description of each heap type for more details on whether this is expected to be
 * uniquely generated by the tool or otherwise.
 */
NVTX_DECLSPEC nvtxMemHeapHandle_t NVTX_API nvtxMemHeapRegister(
    nvtxDomainHandle_t domain,
    nvtxMemHeapDesc_t const* desc);

/** \brief Destroy a memory heap. */
NVTX_DECLSPEC void NVTX_API nvtxMemHeapUnregister(
    nvtxDomainHandle_t domain,
    nvtxMemHeapHandle_t heap); /* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported */

/**
 * \brief Resetting the memory heap wipes out any changes, as if it were a fresh heap.
 *
 * This includes invalidating all regions and their handles.
 */
NVTX_DECLSPEC void NVTX_API nvtxMemHeapReset(
    nvtxDomainHandle_t domain,
    nvtxMemHeapHandle_t heap); /* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is supported */

/**
 * \brief Register a region of memory inside of a heap.
 *
 * The heap refers to the heap within which the region resides. This can be from
 * `nvtxMemHeapRegister`, `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`, or one provided
 * by another extension API.
 *
 * The regionType arg defines which type is used in regionDescArray.
 * The most commonly used type is `NVTX_MEM_TYPE_VIRTUAL_ADDRESS`.
 * In this case regionDescElements is an array of `nvtxMemVirtualRangeDesc_t`.
 *
 * The regionCount arg is how many elements are in regionDescArray and regionHandleArrayOut.
 *
 * The regionHandleArrayOut arg points to an array where the tool will provide region handles. If
 * a pointer is provided, it is expected to have regionCount elements. This pointer can be NULL if
 * regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case, the user can use the pointer to the
 * virtual memory to reference the region in other related functions which accept nvtxMemRegionRef_t.
 */
typedef struct nvtxMemRegionsRegisterBatch_v1
{
    uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
    uint16_t structSize; /* Size of the structure. */

    uint32_t regionType; /* NVTX_MEM_TYPE_* */

    nvtxMemHeapHandle_t heap;
    size_t regionCount;
    size_t regionDescElementSize;
    void const* regionDescElements; /* This will also become the handle for this region. */
    nvtxMemRegionHandle_t* regionHandleElementsOut; /* This will also become the handle for this region. */
} nvtxMemRegionsRegisterBatch_v1;
typedef nvtxMemRegionsRegisterBatch_v1 nvtxMemRegionsRegisterBatch_t;

/** \brief Register a region of memory inside of a heap of linear process virtual memory */
NVTX_DECLSPEC void NVTX_API nvtxMemRegionsRegister(
    nvtxDomainHandle_t domain,
    nvtxMemRegionsRegisterBatch_t const* desc);

/**
 * \brief Resize regions of memory inside of a heap.
 *
 * The heap refers to the heap within which the region resides.
 * This can be from nvtxMemHeapRegister, NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE, or
 * one provided by another extension API.
 *
 * The regionType arg defines which type is used in regionDescArray.
 * The most commonly used type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS.
 *
 * The regionCount arg is how many elements are in regionDescArray and regionHandleArrayOut.
 *
 * The regionHandleArrayOut arg points to an array where the tool will provide region handles. If
 * a pointer is provided, it is expected to have regionCount elements. This pointer can be NULL if
 * regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS.
In this case, the user can use the pointer to the * virtual memory to reference the region in other related functions which accept nvtMemRegionRef_t. */ typedef struct nvtxMemRegionsResizeBatch_v1 { uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ uint16_t structSize; /* Size of the structure. */ uint32_t regionType; /* NVTX_MEM_TYPE_* */ size_t regionDescCount; size_t regionDescElementSize; void const* regionDescElements; /* This will also become the handle for this region. */ } nvtxMemRegionsResizeBatch_v1; typedef nvtxMemRegionsResizeBatch_v1 nvtxMemRegionsResizeBatch_t; /** \brief Register a region of memory inside of a heap of linear process virtual memory */ NVTX_DECLSPEC void NVTX_API nvtxMemRegionsResize( nvtxDomainHandle_t domain, nvtxMemRegionsResizeBatch_t const* desc); #define NVTX_MEM_REGION_REF_TYPE_UNKNOWN 0x0 #define NVTX_MEM_REGION_REF_TYPE_POINTER 0x1 #define NVTX_MEM_REGION_REF_TYPE_HANDLE 0x2 /** * \brief Register a region of memory inside of a heap. * * The heap refers the the heap within which the region resides. * This can be from nvtxMemHeapRegister, `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`, or * one provided from other extension API. * * The regionType arg will define which type is used in `regionDescArray`. * The most commonly used type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. * * The regionCount arg is how many element are in regionDescArray and regionHandleArrayOut. * * The regionHandleArrayOut arg points to an array where the tool will provide region handles. * If a pointer if provided, it is expected to have regionCount elements. * This pointer can be NULL if regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case, * the user can use the pointer to the virtual memory to reference the region in other * related functions which accept a nvtMemRegionRef_t. */ typedef struct nvtxMemRegionsUnregisterBatch_v1 { uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ uint16_t structSize; /* Size of the structure. */ uint32_t refType; /* NVTX_MEM_REGION_REF_TYPE_* */ size_t refCount; /* count of elements in refArray */ size_t refElementSize; nvtxMemRegionRef_t const* refElements; /* This will also become the handle for this region. */ } nvtxMemRegionsUnregisterBatch_v1; typedef nvtxMemRegionsUnregisterBatch_v1 nvtxMemRegionsUnregisterBatch_t; /** * \brief Unregistration for regions of process virtual memory * * This is not necessary if the nvtx heap destroy function has been called that * contains this object. */ NVTX_DECLSPEC void NVTX_API nvtxMemRegionsUnregister( nvtxDomainHandle_t domain, nvtxMemRegionsUnregisterBatch_t const* desc); typedef struct nvtxMemRegionNameDesc_v1 { uint32_t regionRefType; /* NVTX_MEM_REGION_REF_TYPE_* */ uint32_t nameType; /* nvtxMessageType_t */ nvtxMemRegionRef_t region; nvtxMessageValue_t name; uint32_t category; uint32_t reserved0; } nvtxMemRegionNameDesc_v1; typedef nvtxMemRegionNameDesc_v1 nvtxMemRegionNameDesc_t; typedef struct nvtxMemRegionsNameBatch_v1 { uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ uint16_t structSize; /* Size of the structure. */ uint32_t reserved0; size_t regionCount; size_t regionElementSize; nvtxMemRegionNameDesc_t const* regionElements; size_t reserved1; } nvtxMemRegionsNameBatch_v1 ; typedef nvtxMemRegionsNameBatch_v1 nvtxMemRegionsNameBatch_t; /** \brief Name or rename a region. */ NVTX_DECLSPEC void NVTX_API nvtxMemRegionsName( nvtxDomainHandle_t domain, nvtxMemRegionsNameBatch_t const* desc); /** \brief There are no permissions for this memory. 
*/ #define NVTX_MEM_PERMISSIONS_REGION_FLAGS_NONE 0x0 /** \brief The memory is readable. */ #define NVTX_MEM_PERMISSIONS_REGION_FLAGS_READ 0x1 /** \brief The memory is writable. */ #define NVTX_MEM_PERMISSIONS_REGION_FLAGS_WRITE 0x2 /** \brief The memory is for atomic RW. */ #define NVTX_MEM_PERMISSIONS_REGION_FLAGS_ATOMIC 0x4 /** * \brief The memory access permissions are reset for a region. * * This is as if never set, rather than documented defaults. As as result any flags * indicating how unspecified regions are handle will affect this area. * * This should not be used with READ, WRITE, nor ATOMIC, as those flags would have no effect. */ #define NVTX_MEM_PERMISSIONS_REGION_FLAGS_RESET 0x8 typedef struct nvtxMemPermissionsAssignRegionDesc_v1 { uint32_t flags; /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */ uint32_t regionRefType; /* NVTX_MEM_REGION_REF_TYPE_* */ nvtxMemRegionRef_t region; } nvtxMemPermissionsAssignRegionDesc_v1 ; typedef nvtxMemPermissionsAssignRegionDesc_v1 nvtxMemPermissionsAssignRegionDesc_t; typedef struct nvtxMemPermissionsAssignBatch_v1 { uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ uint16_t structSize; /* Size of the structure. */ uint32_t reserved0; nvtxMemPermissionsHandle_t permissions; size_t regionCount; size_t regionElementSize; nvtxMemPermissionsAssignRegionDesc_t const* regionElements; size_t reserved1; } nvtxMemPermissionsAssignBatch_v1 ; typedef nvtxMemPermissionsAssignBatch_v1 nvtxMemPermissionsAssignBatch_t; /** \brief Change the permissions of a region of process virtual memory. */ NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsAssign( nvtxDomainHandle_t domain, nvtxMemPermissionsAssignBatch_t const* desc); /** * \brief Create a permissions object for fine grain thread-local control in * multi-threading scenarios * * Unlike the global permissions object (NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE), a new * permissions object is empty. There are no regions registered to it, so more memory is accessible * if bound(bind) without calls to nvtxMemPermissionsSetAccess* first. The permissions are not * active until nvtxMemPermissionsBind. See `nvtxMemPermissionsBind` for more details. * * Use the flags NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_* to control how the regions in * this permission object will interact with global permissions when bound. You may choose to * either replace global memory regions setting or overlay on top of them. The most common uses are * as follows: * * To limit tools to validate writing exclusively specified in this object but inherit all * global read access regions use `NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE` * * To limit tools to validate both read & write permissions exclusively specified in this * object use NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_READ * & NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE * * Also see `nvtxMemPermissionsBind` & `nvtxMemPermissionsSetAccess*`. */ NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemPermissionsCreate( nvtxDomainHandle_t domain, int32_t creationflags); /* NVTX_MEM_PERMISSIONS_CREATE_FLAGS_* */ /** * \brief Destroy the permissions object. * * If bound(bind), destroy will also unbind it. */ NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsDestroy( nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissionsHandle); /* only supported on objects from nvtxMemPermissionsCreate */ /** \brief Reset the permissions object back to its created state. 
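 *
 * For context, a typical lifecycle of a permissions object might look like the
 * sketch below (with `domain` and a registered region at `regionPtr` assumed to
 * exist); resetting returns the object to the empty state it had right after
 * nvtxMemPermissionsCreate:
 * \code
 * nvtxMemPermissionsHandle_t perm = nvtxMemPermissionsCreate(
 *     domain, NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE);
 *
 * nvtxMemPermissionsAssignRegionDesc_t rdesc = {0};
 * rdesc.flags          = NVTX_MEM_PERMISSIONS_REGION_FLAGS_READ
 *                      | NVTX_MEM_PERMISSIONS_REGION_FLAGS_WRITE;
 * rdesc.regionRefType  = NVTX_MEM_REGION_REF_TYPE_POINTER;
 * rdesc.region.pointer = regionPtr;
 *
 * nvtxMemPermissionsAssignBatch_t batch = {0};
 * batch.extCompatID       = NVTX_EXT_COMPATID_MEM;
 * batch.structSize        = sizeof(batch);
 * batch.permissions       = perm;
 * batch.regionCount       = 1;
 * batch.regionElementSize = sizeof(rdesc);
 * batch.regionElements    = &rdesc;
 * nvtxMemPermissionsAssign(domain, &batch);
 *
 * // ... nvtxMemPermissionsBind/Unbind (declared below) activate the object ...
 *
 * nvtxMemPermissionsReset(domain, perm);   // back to the created (empty) state
 * nvtxMemPermissionsDestroy(domain, perm);
 * \endcode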
 */
NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsReset(
    nvtxDomainHandle_t domain,
    nvtxMemPermissionsHandle_t permissionsHandle); /* NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE and other special handles are supported */

#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_NONE 0x0

/** \brief Upon binding, with the thread, exclude parent scope write regions instead of overlaying on top of them.
 *
 * EX: A developer may choose to first prevent all writes except the ones specified to avoid
 * OOB writes, since there are typically fewer regions written to than read from.
 */
#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_WRITE 0x2

/** \brief Upon binding, with the thread, exclude parent scope read regions instead of overlaying on top of them.
 *
 * EX: After eliminating any errors when applying strict writes, a developer may then choose to
 * annotate and enforce strict read behavior in segments of code.
 */
#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_READ 0x1

/** \brief Upon binding, with the thread, exclude parent scope atomic RW regions instead of overlaying on top of them.
 *
 * EX: After eliminating any errors from read and write, a developer may choose to ensure
 * that atomics are in their own region, removing standard read/write access and replacing it
 * with this strict atomic-only access. This way they know that conventional reads or writes
 * will not cause unexpected issues.
 */
#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_ATOMIC 0x4

#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_UNKNOWN 0x0

/** \brief Bind to thread scope. In this case, tools should validate that the local thread's
 * execution honors the permissions as well as the state of NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE
 * at the time of binding. If this is not bound then NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE should be
 * used to validate the memory.
 *
 * Not all tools, such as a GPU sanitizer, will support every scope.
 */
#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD 0x1

/**
 * \brief Bind to CUDA stream scope.
 *
 * In this case, work enqueued to a CUDA stream should be validated by the tool,
 * when it executes, to respect the permissions in effect at the point
 * of binding, as well as the appropriate nvtxMemCudaGetDevicePermissions at the
 * time of binding. If this is not bound then nvtxMemCudaGetDevicePermissions at
 * the time of stream enqueue should be used to validate the memory.
 *
 * This could apply to work done either on the GPU, like a kernel launch, or to
 * CPU based callbacks like cudaStreamAddCallback, if the tool supports it.
 *
 * Binding applies locally to a CPU thread so that, if N CPU threads are enqueuing
 * work to the same stream (like the default stream), there cannot be a race
 * condition between thread binding vs launching their work. I.e., users should
 * expect the permissions bound in the thread to be honored by the subsequent
 * work (launches, copies, etc.) invoked from the CPU thread until unbound.
 */
#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM 0x2

/**
 * \brief Bind the permissions object into a particular scope on the caller thread.
 *
 * Permissions do not take effect until binding. Binding permissions is a thread-local
 * activity that overrides global behaviors. This is to avoid multi-threaded race conditions.
 *
 * The scope dictates what type of processing it applies to, and when in some cases.
 * EX1: NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD applies to CPU code accessing memory while bound.
* EX2: NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM applies to CUDA streams, and the permissions * must be recorded and applied when the work in the stream dequeues to executes. In this case * it could be GPU or CPU, if the tool support both. * * Bind can be called again on the same object and thread to take any updates to the * specified permission object or the inherited properties. * * Bind flags support changing how the binding process inherits region access control. * In the case of thread scope this is NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE and from CUDA_STREAM * this is nvtxMemCudaGetDevicePermissions. Choosing stricter modes allows the user to * further reduce the access with less work, since memory by default, behaves as natural * until the NVTX annotations instructs a tool to treat it anther way. See strict flags * for more details. * * Also see nvtxMemPermissionsUnbind */ NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsBind( nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, /* special object like NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE are not supported */ uint32_t bindScope, /* NVTX_MEM_PERMISSIONS_BIND_SCOPE_* */ uint32_t bindFlags); /* NVTX_MEM_PERMISSIONS_BIND_FLAGS_* */ /** * \brief Unbind the permissions object bound to the caller thread. * * Upon unbind, the thread local permissions for a scope are restored to the default * behavior defined by the scope. */ NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsUnbind( nvtxDomainHandle_t domain, uint32_t bindScope); /** @} */ /*END defgroup*/ typedef enum NvtxExtMemCallbackId { /* CBID 0 is invalid */ NVTX3EXT_CBID_nvtxMemHeapRegister = 1, NVTX3EXT_CBID_nvtxMemHeapUnregister = 2, NVTX3EXT_CBID_nvtxMemHeapReset = 3, NVTX3EXT_CBID_nvtxMemRegionsRegister = 4, NVTX3EXT_CBID_nvtxMemRegionsResize = 5, NVTX3EXT_CBID_nvtxMemRegionsUnregister = 6, NVTX3EXT_CBID_nvtxMemRegionsName = 7, NVTX3EXT_CBID_nvtxMemPermissionsAssign = 8, NVTX3EXT_CBID_nvtxMemPermissionsCreate = 9, NVTX3EXT_CBID_nvtxMemPermissionsDestroy = 10, NVTX3EXT_CBID_nvtxMemPermissionsReset = 11, NVTX3EXT_CBID_nvtxMemPermissionsBind = 12, NVTX3EXT_CBID_nvtxMemPermissionsUnbind = 13, /* 14-16 in nvtExtImplMemCudaRt1.h */ NVTX3EXT_CBID_nvtxMemCudaGetProcessWidePermissions = 14, NVTX3EXT_CBID_nvtxMemCudaGetDeviceWidePermissions = 15, NVTX3EXT_CBID_nvtxMemCudaSetPeerAccess = 16, NVTX3EXT_CBID_MEM_FN_NUM = 17 } NvtxExtMemCallbackId; #ifdef __GNUC__ #pragma GCC visibility push(internal) #endif /* Extension types are required for the implementation and the NVTX handler. */ #define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxExtTypes.h" #undef NVTX_EXT_TYPES_GUARD #ifndef NVTX_NO_IMPL /* Ensure other headers cannot be included directly */ #define NVTX_EXT_IMPL_MEM_GUARD #include "nvtxDetail/nvtxExtImplMem_v1.h" #undef NVTX_EXT_IMPL_MEM_GUARD #endif /*NVTX_NO_IMPL*/ #ifdef __GNUC__ #pragma GCC visibility pop #endif #ifdef __cplusplus } #endif /* __cplusplus */ #endif /* NVTOOLSEXTV3_MEM_V1 */ nccl-2.22.3-1/src/include/nvtx3/nvToolsExtMemCudaRt.h000066400000000000000000000121031463451655400222230ustar00rootroot00000000000000/* * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. 
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTOOLSEXTV3_MEM_CUDART_V1 #define NVTOOLSEXTV3_MEM_CUDART_V1 #include "nvToolsExtMem.h" #include "cuda.h" #include "cuda_runtime.h" #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /** \brief The memory is from a CUDA runtime array. * * Relevant functions: cudaMallocArray, cudaMalloc3DArray * Also cudaArray_t from other types such as cudaMipmappedArray_t * * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported * * nvtxMemHeapRegister receives a heapDesc of type cudaArray_t because the description can be retrieved by tools through cudaArrayGetInfo() * nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCudaArrayRangeDesc_t */ #define NVTX_MEM_TYPE_CUDA_ARRAY 0x11 /** \brief structure to describe memory in a CUDA array object */ typedef struct nvtxMemCudaArrayRangeDesc_v1 { uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ uint16_t structSize; /* Size of the structure. */ uint32_t reserved0; cudaArray_t src; size_t offset[3]; size_t extent[3]; } nvtxMemCudaArrayRangeDesc_v1; typedef nvtxMemCudaArrayRangeDesc_v1 nvtxMemCudaArrayRangeDesc_t; /** \brief The memory is from a CUDA device array. * * Relevant functions: cuArrayCreate, cuArray3DCreate * Also CUarray from other types such as CUmipmappedArray * * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported * * nvtxMemHeapRegister receives a heapDesc of type cudaArray_t because the description can be retrieved by tools through cudaArrayGetInfo() * nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCuArrayRangeDesc_t */ #define NVTX_MEM_TYPE_CU_ARRAY 0x12 /** \brief structure to describe memory in a CUDA array object */ typedef struct nvtxMemCuArrayRangeDesc_v1 { uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ uint16_t structSize; /* Size of the structure. */ uint32_t reserved0; CUarray src; size_t offset[3]; size_t extent[3]; } nvtxMemCuArrayRangeDesc_v1; typedef nvtxMemCuArrayRangeDesc_v1 nvtxMemCuArrayRangeDesc_t; /* Reserving 0x2-0xF for more common types */ #define NVTX_MEM_CUDA_PEER_ALL_DEVICES -1 /** \brief Get the permission object that represent the CUDA runtime device * or cuda driver context * * This object will allow developers to adjust permissions applied to work executed * on the GPU. It may be inherited or overridden by permissions object bound * with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags. * * Ex. change the peer to peer access permissions between devices in entirety * or punch through special holes * * By default, all memory is accessible that naturally would be to a CUDA kernel until * modified otherwise by nvtxMemCudaSetPeerAccess or changing regions. * * This object should also represent the CUDA driver API level context. */ NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetProcessWidePermissions( nvtxDomainHandle_t domain); /** \brief Get the permission object that represent the CUDA runtime device * or cuda driver context * * This object will allow developers to adjust permissions applied to work executed * on the GPU. It may be inherited or overridden by permissions object bound * with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags. * * Ex. change the peer to peer access permissions between devices in entirety * or punch through special holes * * By default, all memory is accessible that naturally would be to a CUDA kernel until * modified otherwise by nvtxMemCudaSetPeerAccess or changing regions. 
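 *
 * For example, the following sketch (assuming `domain` exists) tells tools that
 * work running on device 0 is only expected to read, not write, memory mapped
 * in from any peer device:
 * \code
 * nvtxMemPermissionsHandle_t devPerm = nvtxMemCudaGetDeviceWidePermissions(domain, 0);
 * nvtxMemCudaSetPeerAccess(domain, devPerm, NVTX_MEM_CUDA_PEER_ALL_DEVICES,
 *                          NVTX_MEM_PERMISSIONS_REGION_FLAGS_READ);
 * \endcode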
* * This object should also represent the CUDA driver API level context. */ NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetDeviceWidePermissions( nvtxDomainHandle_t domain, int device); /** \brief Change the default behavior for all memory mapped in from a particular device. * * While typically all memory defaults to readable and writable, users may desire to limit * access to reduced default permissions such as read-only and a per-device basis. * * Regions can used to further override smaller windows of memory. * * devicePeer can be NVTX_MEM_CUDA_PEER_ALL_DEVICES * */ NVTX_DECLSPEC void NVTX_API nvtxMemCudaSetPeerAccess( nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, int devicePeer, /* device number such as from cudaGetDevice() or NVTX_MEM_CUDA_PEER_ALL_DEVICES */ uint32_t flags); /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */ /** @} */ /*END defgroup*/ #ifdef __GNUC__ #pragma GCC visibility push(internal) #endif #ifndef NVTX_NO_IMPL #define NVTX_EXT_IMPL_MEM_CUDART_GUARD /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxExtImplMemCudaRt_v1.h" #undef NVTX_EXT_IMPL_MEM_CUDART_GUARD #endif /*NVTX_NO_IMPL*/ #ifdef __GNUC__ #pragma GCC visibility pop #endif #ifdef __cplusplus } #endif /* __cplusplus */ #endif /* NVTOOLSEXTV3_MEM_CUDART_V1 */ nccl-2.22.3-1/src/include/nvtx3/nvToolsExtOpenCL.h000066400000000000000000000155011463451655400215270ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #include "nvToolsExt.h" #include #ifndef NVTOOLSEXT_OPENCL_V3 #define NVTOOLSEXT_OPENCL_V3 #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /* ========================================================================= */ /** \name Functions for OpenCL Resource Naming */ /** \addtogroup RESOURCE_NAMING * \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming * * This section covers the API functions that allow to annotate OpenCL resources * with user-provided names. * * @{ */ /* ------------------------------------------------------------------------- */ /* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ #define NVTX_RESOURCE_CLASS_OPENCL 6 /** \endcond */ /* ------------------------------------------------------------------------- */ /** \brief Resource types for OpenCL */ typedef enum nvtxResourceOpenCLType_t { NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1), NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2), NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3), NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4), NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5), NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6), NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7), } nvtxResourceOpenCLType_t; /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL device. * * Allows to associate an OpenCL device with a user-provided name. * * \param device - The handle of the OpenCL device to name. * \param name - The name of the OpenCL device. 
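 *
 * \par Example (sketch; `platform` is assumed to have been queried beforehand,
 * error checking omitted):
 * \code
 * cl_device_id device;
 * clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
 * nvtxNameClDeviceA(device, "DEVICE_NAME");
 * \endcode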
* * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL context. * * Allows to associate an OpenCL context with a user-provided name. * * \param context - The handle of the OpenCL context to name. * \param name - The name of the OpenCL context. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL command queue. * * Allows to associate an OpenCL command queue with a user-provided name. * * \param command_queue - The handle of the OpenCL command queue to name. * \param name - The name of the OpenCL command queue. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL memory object. * * Allows to associate an OpenCL memory object with a user-provided name. * * \param memobj - The handle of the OpenCL memory object to name. * \param name - The name of the OpenCL memory object. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL sampler. * * Allows to associate an OpenCL sampler with a user-provided name. * * \param sampler - The handle of the OpenCL sampler to name. * \param name - The name of the OpenCL sampler. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL program. * * Allows to associate an OpenCL program with a user-provided name. * * \param program - The handle of the OpenCL program to name. * \param name - The name of the OpenCL program. * * \code * cpProgram = clCreateProgramWithSource(cxGPUContext, 1, * (const char **) &cSourceCL, &program_length, &ciErrNum); * shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); * nvtxNameClProgram(cpProgram, L"PROGRAM_NAME"); * \endcode * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL event. * * Allows to associate an OpenCL event with a user-provided name. * * \param evnt - The handle of the OpenCL event to name. * \param name - The name of the OpenCL event. 
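 *
 * \par Example (sketch; `context` is assumed to exist, error checking omitted):
 * \code
 * cl_int err;
 * cl_event evnt = clCreateUserEvent(context, &err);
 * nvtxNameClEventA(evnt, "EVENT_NAME");
 * \endcode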
* * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name); /** @} */ /** @} */ /* END RESOURCE_NAMING */ /* ========================================================================= */ #ifdef UNICODE #define nvtxNameClDevice nvtxNameClDeviceW #define nvtxNameClContext nvtxNameClContextW #define nvtxNameClCommandQueue nvtxNameClCommandQueueW #define nvtxNameClMemObject nvtxNameClMemObjectW #define nvtxNameClSampler nvtxNameClSamplerW #define nvtxNameClProgram nvtxNameClProgramW #define nvtxNameClEvent nvtxNameClEventW #else #define nvtxNameClDevice nvtxNameClDeviceA #define nvtxNameClContext nvtxNameClContextA #define nvtxNameClCommandQueue nvtxNameClCommandQueueA #define nvtxNameClMemObject nvtxNameClMemObjectA #define nvtxNameClSampler nvtxNameClSamplerA #define nvtxNameClProgram nvtxNameClProgramA #define nvtxNameClEvent nvtxNameClEventA #endif #ifdef __cplusplus } #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL #define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxImplOpenCL_v3.h" #undef NVTX_IMPL_GUARD_OPENCL #endif /*NVTX_NO_IMPL*/ #endif /* NVTOOLSEXT_OPENCL_V3 */ nccl-2.22.3-1/src/include/nvtx3/nvToolsExtPayload.h000066400000000000000000001111231463451655400217750ustar00rootroot00000000000000/* * Copyright 2021-2024 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #include "nvToolsExt.h" /* Optionally include helper macros. */ /* #include "nvToolsExtPayloadHelper.h" */ /** * If needed, semantic extension headers can be included after this header. */ /** * \brief The compatibility ID is used for versioning of this extension. */ #ifndef NVTX_EXT_PAYLOAD_COMPATID #define NVTX_EXT_PAYLOAD_COMPATID 0x0103 #endif /** * \brief The module ID identifies the payload extension. It has to be unique * among the extension modules. */ #ifndef NVTX_EXT_PAYLOAD_MODULEID #define NVTX_EXT_PAYLOAD_MODULEID 2 #endif /** * \brief Additional value for the enum @ref nvtxPayloadType_t */ #ifndef NVTX_PAYLOAD_TYPE_EXT #define NVTX_PAYLOAD_TYPE_EXT ((int32_t)0xDFBD0009) #endif /** --------------------------------------------------------------------------- * Payload schema entry flags. Used for @ref nvtxPayloadSchemaEntry_t::flags. * ------------------------------------------------------------------------- */ #ifndef NVTX_PAYLOAD_ENTRY_FLAGS_V1 #define NVTX_PAYLOAD_ENTRY_FLAGS_V1 #define NVTX_PAYLOAD_ENTRY_FLAG_UNUSED 0 /** * Absolute pointer into a payload (entry) of the same event. */ #define NVTX_PAYLOAD_ENTRY_FLAG_POINTER (1 << 1) /** * Offset from base address of the payload. */ #define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_BASE (1 << 2) /** * Offset from the end of this payload entry. */ #define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_HERE (1 << 3) /** * The value is an array with fixed length, set with the field `arrayLength`. */ #define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE (1 << 4) /** * The value is a zero-/null-terminated array. */ #define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED (2 << 4) /** * \brief A single or multi-dimensional array of variable length. * * The field `arrayOrUnionDetail` contains the index of the schema entry that * holds the length(s). If the length entry is a scalar, then this entry is a 1D * array. 
If the length entry is a fixed-size array, then the number of * dimensions is defined with the registration of the schema. If the length * entry is a zero-terminated array, then the array of the dimensions can be * determined at runtime. * For multidimensional arrays, values are stored in row-major order, with rows * being stored consecutively in contiguous memory. The size of the entry (in * bytes) is the product of the dimensions multiplied with size of the array * element. */ #define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX (3 << 4) /** * \brief A single or multi-dimensional array of variable length, where the * dimensions are stored in a different payload (index) of the same event. * * This enables an existing address to an array to be directly passed, while the * dimensions are defined in a separate payload (with only one payload entry). */ #define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_PAYLOAD_INDEX (4 << 4) /** * \brief The value or data that is pointed to by this payload entry value shall * be copied by the NVTX handler. * * A tool may not support deep copy and just ignore this flag. * See @ref NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY for more details. */ #define NVTX_PAYLOAD_ENTRY_FLAG_DEEP_COPY (1 << 8) /** * Notifies the NVTX handler to hide this entry in case of visualization. */ #define NVTX_PAYLOAD_ENTRY_FLAG_HIDE (1 << 9) /** * The entry specifies the event message. Any string type can be used. */ #define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE (1 << 10) /** * \brief The entry contains an event timestamp. * * The time source might be provided via the entry semantics field. In most * cases, the timestamp (entry) type is @ref NVTX_PAYLOAD_ENTRY_TYPE_UINT64. */ #define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP (2 << 10) /** * These flags specify the NVTX event type to which an entry refers. */ #define NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN (1 << 12) #define NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END (2 << 12) #define NVTX_PAYLOAD_ENTRY_FLAG_MARK (3 << 12) #define NVTX_PAYLOAD_ENTRY_FLAG_COUNTER (4 << 12) #endif /* NVTX_PAYLOAD_ENTRY_FLAGS_V1 */ /** --------------------------------------------------------------------------- * END: Payload schema entry flags. * ------------------------------------------------------------------------- */ /** \todo: Keep this in the header? */ /** * @note The ‘array’ flags assume that the array is embedded. Otherwise, * @ref NVTX_PAYLOAD_ENTRY_FLAG_POINTER has to be additionally specified. Some * combinations may be invalid based on the `NVTX_PAYLOAD_SCHEMA_TYPE_*` this * entry is enclosed. For instance, variable length embedded arrays are valid * within @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC but invalid with * @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC. See `NVTX_PAYLOAD_SCHEMA_TYPE_*` for * additional details. */ /* Helper macro to check if an entry represents an array. */ #define NVTX_PAYLOAD_ENTRY_FLAG_IS_ARRAY (\ NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE | \ NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED | \ NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX) #define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_TYPE(F) \ (F & NVTX_PAYLOAD_ENTRY_FLAG_IS_ARRAY) /** \todo end */ /** --------------------------------------------------------------------------- * Types of entries in a payload schema. * * @note Several of the predefined types contain the size (in bits) in their * names. For some data types the size (in bytes) is not fixed and may differ * for different platforms/operating systems/compilers. 
To provide portability, * an array of sizes (in bytes) for type 1 to 28 ( @ref * NVTX_PAYLOAD_ENTRY_TYPE_CHAR to @ref NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE) * is passed to the NVTX extension initialization function * @ref InitializeInjectionNvtxExtension via the `extInfo` field of * @ref nvtxExtModuleInfo_t. * ------------------------------------------------------------------------- */ #ifndef NVTX_PAYLOAD_ENTRY_TYPES_V1 #define NVTX_PAYLOAD_ENTRY_TYPES_V1 #define NVTX_PAYLOAD_ENTRY_TYPE_INVALID 0 /** * Basic integer types. */ #define NVTX_PAYLOAD_ENTRY_TYPE_CHAR 1 #define NVTX_PAYLOAD_ENTRY_TYPE_UCHAR 2 #define NVTX_PAYLOAD_ENTRY_TYPE_SHORT 3 #define NVTX_PAYLOAD_ENTRY_TYPE_USHORT 4 #define NVTX_PAYLOAD_ENTRY_TYPE_INT 5 #define NVTX_PAYLOAD_ENTRY_TYPE_UINT 6 #define NVTX_PAYLOAD_ENTRY_TYPE_LONG 7 #define NVTX_PAYLOAD_ENTRY_TYPE_ULONG 8 #define NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG 9 #define NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG 10 /** * Integer types with explicit size. */ #define NVTX_PAYLOAD_ENTRY_TYPE_INT8 11 #define NVTX_PAYLOAD_ENTRY_TYPE_UINT8 12 #define NVTX_PAYLOAD_ENTRY_TYPE_INT16 13 #define NVTX_PAYLOAD_ENTRY_TYPE_UINT16 14 #define NVTX_PAYLOAD_ENTRY_TYPE_INT32 15 #define NVTX_PAYLOAD_ENTRY_TYPE_UINT32 16 #define NVTX_PAYLOAD_ENTRY_TYPE_INT64 17 #define NVTX_PAYLOAD_ENTRY_TYPE_UINT64 18 /** * Floating point types */ #define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT 19 #define NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE 20 #define NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE 21 /** * Size type (`size_t` in C). */ #define NVTX_PAYLOAD_ENTRY_TYPE_SIZE 22 /** * Any address, e.g. `void*`. If the pointer type matters, use the flag @ref * NVTX_PAYLOAD_ENTRY_FLAG_POINTER and the respective type instead. */ #define NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS 23 /** * Special character types. */ #define NVTX_PAYLOAD_ENTRY_TYPE_WCHAR 24 /* wide character (since C90) */ #define NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 25 /* since C2x and C++20 */ #define NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 26 #define NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 27 /** * There is type size and alignment information for all previous types. */ #define NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE (NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 + 1) /** * Store raw 8-bit binary data. As with `char`, 1-byte alignment is assumed. * Typically, a tool will display this as hex or binary. */ #define NVTX_PAYLOAD_ENTRY_TYPE_BYTE 32 /** * These types do not have standardized equivalents. It is assumed that the * number at the end corresponds to the bits used to store the value and that * the alignment corresponds to standardized types of the same size. * A tool may not support these types. */ #define NVTX_PAYLOAD_ENTRY_TYPE_INT128 33 #define NVTX_PAYLOAD_ENTRY_TYPE_UINT128 34 #define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT16 42 #define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT32 43 #define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT64 44 #define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT128 45 #define NVTX_PAYLOAD_ENTRY_TYPE_BF16 50 #define NVTX_PAYLOAD_ENTRY_TYPE_TF32 52 /** * Data types are as defined by NVTXv3 core. */ #define NVTX_PAYLOAD_ENTRY_TYPE_CATEGORY 68 /* uint32_t */ #define NVTX_PAYLOAD_ENTRY_TYPE_COLOR_ARGB 69 /* uint32_t */ /** * The scope of events or counters (see `nvtxScopeRegister`). */ #define NVTX_PAYLOAD_ENTRY_TYPE_SCOPE_ID 70 /* uint64_t */ /** * Thread ID as scope. */ #define NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT32 73 #define NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT64 74 /** * \brief String types. * * If no flags are set for the entry and `arrayOrUnionDetail > 0`, the entry is * assumed to be a fixed-size string with the given length, embedded in the payload. 
* `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE` is redundant for fixed-size strings. * * \todo(Revise the following paragraph.) * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED` specifies a * zero-terminated string. If `arrayOrUnionDetail > 0`, the entry is handled as * a zero-terminated array of fixed-size strings. * * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX` specifies a * variable-length string with the length given in the entry specified by the * field `arrayOrUnionDetail`. */ #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING 75 /* `char*`, system LOCALE */ #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF8 76 #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF16 77 #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF32 78 /** * The entry value is of type @ref nvtxStringHandle_t returned by * @ref nvtxDomainRegisterString. */ #define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_REGISTERED_STRING_HANDLE 80 /** * This type marks the union selector member (entry index) in schemas used by * a union with internal selector. * See @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR. */ #define NVTX_PAYLOAD_ENTRY_TYPE_UNION_SELECTOR 100 /** * \brief Predefined schema ID for payload data that is referenced in another payload. * * This schema ID can be used in @ref nvtxPayloadData_t::schema_id to indicate that the * payload is a blob of memory which other payload entries may point into. * A tool will not expose this payload directly. * * This schema ID cannot be used as schema entry type! */ #define NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED 1022 /** * \brief Predefined schema ID for raw payload data. * * This schema ID can be used in @ref nvtxPayloadData_t::schema_id to indicate * that the payload is a blob, which can be shown with an arbitrary data viewer. * This schema ID cannot be used as schema entry type! */ #define NVTX_TYPE_PAYLOAD_SCHEMA_RAW 1023 /** * \deprecated: Remove for official release! * In the initial version of this header custom schema IDs started * here. Unless predefined types require more than 16 bits we can keep this * value to preserve backwards compatibility. The value is not used as first * ID for custom schemas any more, but in the analysis every entry type >= this * value is assumed to be a custom schema. */ #define NVTX_PAYLOAD_ENTRY_TYPE_CUSTOM_BASE 65536 /* Custom (static) schema IDs. */ #define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START (1 << 24) /* Dynamic schema IDs (generated by the tool) start here. */ #define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START 4294967296 /* 1 << 32 */ #endif /* NVTX_PAYLOAD_ENTRY_TYPES_V1 */ /** --------------------------------------------------------------------------- * END: Payload schema entry types. * ------------------------------------------------------------------------- */ #ifndef NVTX_PAYLOAD_SCHEMA_TYPES_V1 #define NVTX_PAYLOAD_SCHEMA_TYPES_V1 /** * \brief The payload schema type. * * A schema can be either of the following types. It is set with * @ref nvtxPayloadSchemaAttr_t::type. */ #define NVTX_PAYLOAD_SCHEMA_TYPE_INVALID 0 #define NVTX_PAYLOAD_SCHEMA_TYPE_STATIC 1 #define NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC 2 #define NVTX_PAYLOAD_SCHEMA_TYPE_UNION 3 #define NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR 4 #endif /* NVTX_PAYLOAD_SCHEMA_TYPES_V1 */ #ifndef NVTX_PAYLOAD_SCHEMA_FLAGS_V1 #define NVTX_PAYLOAD_SCHEMA_FLAGS_V1 /** * \brief Flags for static and dynamic schemas. * * The schema flags are used with @ref nvtxPayloadSchemaAttr_t::flags. 
*/ #define NVTX_PAYLOAD_SCHEMA_FLAG_NONE 0 /** * This flag indicates that a schema and the corresponding payloads can * contain fields which require a deep copy. */ #define NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY (1 << 1) /** * This flag indicates that a schema and the corresponding payload can be * referenced by another payload of the same event. If the schema is not * intended to be visualized directly, it is possible use * @ref NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED instead. */ #define NVTX_PAYLOAD_SCHEMA_FLAG_REFERENCED (1 << 2) /** * The schema defines a counter group. An NVTX handler can expect that the schema * contains entries with counter semantics. */ #define NVTX_PAYLOAD_SCHEMA_FLAG_COUNTER_GROUP (1 << 3) #endif /* NVTX_PAYLOAD_SCHEMA_FLAGS_V1 */ #ifndef NVTX_PAYLOAD_SCHEMA_ATTRS_V1 #define NVTX_PAYLOAD_SCHEMA_ATTRS_V1 /** * The values allow the valid fields in @ref nvtxPayloadSchemaAttr_t to be * specified via setting the field `fieldMask`. */ #define NVTX_PAYLOAD_SCHEMA_ATTR_NAME (1 << 1) #define NVTX_PAYLOAD_SCHEMA_ATTR_TYPE (1 << 2) #define NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS (1 << 3) #define NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES (1 << 4) #define NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES (1 << 5) #define NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE (1 << 6) #define NVTX_PAYLOAD_SCHEMA_ATTR_ALIGNMENT (1 << 7) #define NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID (1 << 8) #define NVTX_PAYLOAD_SCHEMA_ATTR_EXTENSION (1 << 9) #endif /* NVTX_PAYLOAD_SCHEMA_ATTRS_V1 */ #ifndef NVTX_PAYLOAD_ENUM_ATTRS_V1 #define NVTX_PAYLOAD_ENUM_ATTRS_V1 /** * The values are used to set the field `fieldMask` and specify which fields in * @ref nvtxPayloadEnumAttr_t are set. */ #define NVTX_PAYLOAD_ENUM_ATTR_NAME (1 << 1) #define NVTX_PAYLOAD_ENUM_ATTR_ENTRIES (1 << 2) #define NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES (1 << 3) #define NVTX_PAYLOAD_ENUM_ATTR_SIZE (1 << 4) #define NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID (1 << 5) #define NVTX_PAYLOAD_ENUM_ATTR_EXTENSION (1 << 6) #endif /* NVTX_PAYLOAD_ENUM_ATTRS_V1 */ /** * An NVTX scope specifies the execution scope or source of events or counters. */ #ifndef NVTX_SCOPES_V1 #define NVTX_SCOPES_V1 /** Identifies an invalid scope and indicates an error if returned by `nvtxScopeRegister`. */ #define NVTX_SCOPE_NONE 0 /* no scope */ #define NVTX_SCOPE_ROOT 1 #define NVTX_SCOPE_CURRENT_HW_MACHINE 2 /* Node/machine name */ #define NVTX_SCOPE_CURRENT_HW_SOCKET 3 #define NVTX_SCOPE_CURRENT_HW_CPU_PHYSICAL 4 /* Physical CPU core */ #define NVTX_SCOPE_CURRENT_HW_CPU_LOGICAL 5 /* Logical CPU core */ /* Innermost HW execution context at registration time */ #define NVTX_SCOPE_CURRENT_HW_INNERMOST 15 /* Virtualized hardware, virtual machines, OS (if you don't know any better) \todo: Need to be more precise what information is expected for each of these scopes. 
*/ #define NVTX_SCOPE_CURRENT_HYPERVISOR 16 #define NVTX_SCOPE_CURRENT_VM 17 #define NVTX_SCOPE_CURRENT_KERNEL 18 #define NVTX_SCOPE_CURRENT_CONTAINER 19 #define NVTX_SCOPE_CURRENT_OS 20 /* Software scopes */ #define NVTX_SCOPE_CURRENT_SW_PROCESS 21 /* Process scope */ #define NVTX_SCOPE_CURRENT_SW_THREAD 22 /* Thread scope */ /* Innermost SW execution context at registration time */ #define NVTX_SCOPE_CURRENT_SW_INNERMOST 31 /** Static (user-provided) scope IDs (feed forward) */ #define NVTX_SCOPE_ID_STATIC_START (1 << 24) /** Dynamically (tool) generated scope IDs */ #define NVTX_SCOPE_ID_DYNAMIC_START 4294967296 /* 1 << 32 */ #endif /* NVTX_SCOPES_V1 */ #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ #ifndef NVTX_PAYLOAD_TYPEDEFS_V1 #define NVTX_PAYLOAD_TYPEDEFS_V1 /** * \brief Size and alignment information for predefined payload entry types. * * The struct contains the size and the alignment size in bytes. A respective * array for the predefined types is passed via nvtxExtModuleInfo_t to the NVTX * client/handler. The type (ID) is used as index into this array. */ typedef struct nvtxPayloadEntryTypeInfo_v1 { uint16_t size; uint16_t align; } nvtxPayloadEntryTypeInfo_t; /** * \brief Binary payload data, size and decoding information. * * An array of type `nvtxPayloadData_t` is passed to the NVTX event attached to * an NVTX event via the `payload.ullvalue` field of NVTX event attributes. * * The `schemaId` be a predefined schema entry type (`NVTX_PAYLOAD_ENTRY_TYPE*`), * a schema ID (statically specified or dynamically created) or one of * `NVTX_PAYLOAD_TYPE_REFERENCED` or `NVTX_PAYLOAD_TYPE_RAW`. * * Setting the size of a payload to `MAX_SIZE` can be useful to reduce the * overhead of NVTX instrumentation, when no NVTX handler is attached. However, * a tool might not be able to detect the size of a payload and thus skip it. * A reasonable use case is a payload that represents a null-terminated * C string, where the NVTX handler can call `strlen()`. */ typedef struct nvtxPayloadData_v1 { /** * The schema ID, which defines the layout of the binary data. */ uint64_t schemaId; /** * Size of the payload (blob) in bytes. `SIZE_MAX` (`-1`) indicates the tool * that it should figure out the size, which might not be possible. */ size_t size; /** * Pointer to the binary payload data. */ const void* payload; } nvtxPayloadData_t; /** * \brief Header of the payload entry's semantic field. * * If the semantic field of the payload schema entry is set, the first four * fields (header) are defined with this type. A tool can iterate through the * extensions and check, if it supports (can handle) it. */ typedef struct nvtxSemanticsHeader_v1 { uint32_t structSize; /** Size of semantic extension struct. */ uint16_t semanticId; uint16_t version; const struct nvtxSemanticsHeader_v1* next; /** linked list */ /* Additional fields are defined by the specific semantic extension. */ } nvtxSemanticsHeader_t; /** * \brief Entry in a schema. * * A payload schema consists of an array of payload schema entries. It is * registered with @ref nvtxPayloadSchemaRegister. `flag` can be set to `0` for * simple values, 'type' is the only "required" field. If not set explicitly, * all other fields are zero-initialized, which means that the entry has no name * and the offset is determined based on self-alignment rules. 
* * Example schema: * nvtxPayloadSchemaEntry_t schema[] = { * {0, NVTX_EXT_PAYLOAD_TYPE_UINT8, "one byte"}, * {0, NVTX_EXT_PAYLOAD_TYPE_INT32, "four bytes"} * }; */ typedef struct nvtxPayloadSchemaEntry_v1 { /** * \brief Flags to augment the basic type. * * This field allows additional properties of the payload entry to be * specified. Valid values are `NVTX_PAYLOAD_ENTRY_FLAG_*`. */ uint64_t flags; /** * \brief Predefined payload schema entry type or custom schema ID. * * Predefined types are `NVTX_PAYLOAD_ENTRY_TYPE_*`. Passing a schema ID * enables nesting of schemas. */ uint64_t type; /** * \brief Name or label of the payload entry. (Optional) * * A meaningful name or label can help organizing and interpreting the data. */ const char* name; /** * \brief Description of the payload entry. (Optional) * * A more detail description of the data that is stored with this entry. */ const char* description; /** * \brief String length, array length or member selector for union types. * * If @ref type is a C string type, this field specifies the string length. * * If @ref flags specify that the entry is an array, this field specifies * the array length. See `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_*` for more details. * * If @ref type is a union with schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION * (external selection of the union member), this field contains the index * (starting with 0) to an entry of integral type in the same schema. The * associated field value specifies the selected union member. * * @note An array of schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION is not * supported. @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR can * be used instead. */ uint64_t arrayOrUnionDetail; /** * \brief Offset in the binary payload data (in bytes). * * This field specifies the byte offset from the base address of the actual * binary data (blob) to the start address of the data of this entry. * * It is recommended (but not required) to provide the offset it. Otherwise, * the NVTX handler will determine the offset from natural alignment rules. * In some cases, e.g. dynamic schema layouts, the offset cannot be set and * has to be determined based on the data of prior entries. * * Setting the offset can also be used to skip entries during payload parsing. */ uint64_t offset; /** * \brief Additional semantics of the payload entry. * * The field points to the first element in a linked list, which enables * multiple semantic extensions. */ const nvtxSemanticsHeader_t* semantics; /** * \brief Reserved for future use. Do not use it! */ const void* reserved; } nvtxPayloadSchemaEntry_t; /** * \brief Header of the schema attribute extension field. */ typedef struct nvtxPayloadSchemaExtension_v1 { uint32_t structSize; /** Size of schema extension struct. */ uint16_t schemaExtId; uint16_t version; const struct nvtxPayloadSchemaExtension_v1* next; /** linked list */ /* Additional fields are defined by the specific schema extension. */ } nvtxPayloadSchemaExtension_t; /** * \brief NVTX payload schema attributes. */ typedef struct nvtxPayloadSchemaAttr_v1 { /** * \brief Mask of valid fields in this struct. * * Use the `NVTX_PAYLOAD_SCHEMA_ATTR_*` defines. */ uint64_t fieldMask; /** * \brief Name of the payload schema. (Optional) */ const char* name; /** * \brief Payload schema type. (Mandatory) \anchor PAYLOAD_TYPE_FIELD * * Use the `NVTX_PAYLOAD_SCHEMA_TYPE_*` defines. */ uint64_t type; /** * \brief Payload schema flags. 
(Optional) * * Flags defined by `NVTX_PAYLOAD_SCHEMA_FLAG_*` can be used to set * additional properties of the schema. */ uint64_t flags; /** * \brief Entries of a payload schema. (Mandatory) \anchor ENTRIES_FIELD * * This field is a pointer to an array of schema entries, each describing a * field in a data structure, e.g. in a C struct or union. */ const nvtxPayloadSchemaEntry_t* entries; /** * \brief Number of entries in the payload schema. (Mandatory) * * Number of entries in the array of payload entries \ref ENTRIES_FIELD. */ size_t numEntries; /** * \brief The binary payload size in bytes for static payload schemas. * * If \ref PAYLOAD_TYPE_FIELD is @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC this * value is ignored. If this field is not specified for a schema of type * @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, the size can be automatically * determined by a tool. */ size_t payloadStaticSize; /** * \brief The byte alignment for packed structures. * * If not specified, this field defaults to `0`, which means that the fields * in the data structure are not packed and natural alignment rules can be * applied. */ size_t packAlign; /* Static/custom schema ID must be >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */ uint64_t schemaId; /* Flexible extension for schema attributes. */ void* extension; } nvtxPayloadSchemaAttr_t; /** * \brief This type is used to describe an enumeration. * * Since the value of an enum entry might not be meaningful for the analysis * and/or visualization, a tool can show the name of enum entry instead. * * An array of this struct is passed to @ref nvtxPayloadEnumAttr_t::entries to be * finally registered via @ref nvtxPayloadEnumRegister with the NVTX handler. * * @note EXPERIMENTAL */ typedef struct nvtxPayloadEnum_v1 { /** * Name of the enum value. */ const char* name; /** * Value of the enum entry. */ uint64_t value; /** * Indicates that this entry sets a specific set of bits, which can be used * to define bitsets. */ int8_t isFlag; } nvtxPayloadEnum_t; /** * \brief NVTX payload enumeration type attributes. * * A pointer to this struct is passed to @ref nvtxPayloadEnumRegister. */ typedef struct nvtxPayloadEnumAttr_v1 { /** * Mask of valid fields in this struct. See `NVTX_PAYLOAD_ENUM_ATTR_*`. */ uint64_t fieldMask; /** * Name of the enum. (Optional) */ const char* name; /** * Entries of the enum. (Mandatory) */ const nvtxPayloadEnum_t* entries; /** * Number of entries in the enum. (Mandatory) */ size_t numEntries; /** * Size of enumeration type in bytes */ size_t sizeOfEnum; /** * Static/custom schema ID must be * >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and * < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */ uint64_t schemaId; /* Flexible extension for enumeration attributes. */ void* extension; } nvtxPayloadEnumAttr_t; typedef struct nvtxScopeAttr_v1 { size_t structSize; /** Path delimited by '/' characters, relative to parentScope. Leading slashes are ignored. Nodes in the path may use name[key] syntax to indicate an array of sibling nodes, which may be combined with other non-array nodes or different arrays at the same scope. Node names should be UTF8 printable characters, excluding '/', '[', and ']' characters which have special meaning here. An empty C string "" and `NULL` are valid inputs and treated equivalently. */ const char* path; uint64_t parentScope; /** The static scope ID must be unique within the domain, >= NVTX_EVENT_SCOPE_ID_STATIC_START, and < NVTX_EVENT_SCOPE_ID_DYNAMIC_START. 
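
        For illustration (placeholder values; `domain` is an existing domain
        handle), a scope might be registered as follows:
        \code{.c}
        nvtxScopeAttr_t attr = {0};
        attr.structSize = sizeof(nvtxScopeAttr_t);
        attr.path = "Devices/GPU[0]";
        attr.parentScope = NVTX_SCOPE_ROOT;
        attr.scopeId = NVTX_SCOPE_ID_STATIC_START + 1;
        uint64_t myScopeId = nvtxScopeRegister(domain, &attr);
        \endcode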
*/ uint64_t scopeId; } nvtxScopeAttr_t; #endif /* NVTX_PAYLOAD_TYPEDEFS_V1 */ #ifndef NVTX_PAYLOAD_API_FUNCTIONS_V1 #define NVTX_PAYLOAD_API_FUNCTIONS_V1 /** * \brief Register a payload schema. * * @param domain NVTX domain handle. * @param attr NVTX payload schema attributes. */ NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadSchemaRegister( nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr); /** * \brief Register an enumeration type with the payload extension. * * @param domain NVTX domain handle * @param attr NVTX payload enumeration type attributes. */ NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadEnumRegister( nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr); /** * \brief Register a scope. * * @param domain NVTX domain handle (0 for default domain) * @param attr Scope attributes. * * @return an identifier for the scope. If the operation was not successful, * `NVTX_SCOPE_NONE` is returned. */ NVTX_DECLSPEC uint64_t NVTX_API nvtxScopeRegister( nvtxDomainHandle_t domain, const nvtxScopeAttr_t* attr); /** * \brief Marks an instantaneous event in the application with the attributes * being passed via the extended payload. * * An NVTX handler can assume that the payload contains the event message. * Otherwise, it might ignore the event. * * @param domain NVTX domain handle * @param payloadData pointer to an array of structured payloads. * @param count number of payload BLOBs. */ NVTX_DECLSPEC void NVTX_API nvtxMarkPayload( nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count); /** * \brief Begin a nested thread range with the attributes being passed via the * payload. * * @param domain NVTX domain handle * @param payloadData pointer to an array of structured payloads. * @param count number of payload BLOBs. * * @return The level of the range being ended. If an error occurs a negative * value is returned on the current thread. */ NVTX_DECLSPEC int NVTX_API nvtxRangePushPayload( nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count); /** * \brief End a nested thread range with an additional custom payload. * * NVTX event attributes passed to this function (via the payloads) overwrite * event attributes (message and color) that have been set in the push event. * Other payload entries extend the data of the range. * * @param domain NVTX domain handle * @param payloadData pointer to an array of structured payloads. * @param count number of payload BLOBs. * * @return The level of the range being ended. If an error occurs a negative * value is returned on the current thread. */ NVTX_DECLSPEC int NVTX_API nvtxRangePopPayload( nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count); /** * \brief Start a thread range with attributes passed via the extended payload. * * @param domain NVTX domain handle * @param payloadData pointer to an array of structured payloads. * @param count number of payload BLOBs. * * @return The level of the range being ended. If an error occurs a negative * value is returned on the current thread. */ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartPayload( nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count); /** * \brief End a thread range and pass a custom payload. * * NVTX event attributes passed to this function (via the payloads) overwrite * event attributes (message and color) that have been set in the start event. * Other payload entries extend the data of the range. 
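 *
 * A minimal usage sketch (illustrative; `mySchemaId` and `myData` are
 * placeholders, and schema registration is omitted):
 * \code{.c}
 * nvtxPayloadData_t pl = {mySchemaId, sizeof(myData), &myData};
 * nvtxRangePushPayload(domain, &pl, 1);
 * // ... work annotated by the range ...
 * nvtxRangePopPayload(domain, &pl, 1);
 * \endcode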
* * @param domain NVTX domain handle * @param id The correlation ID returned from a NVTX range start call. * @param payloadData pointer to an array of structured payloads. * @param count number of payload BLOBs. */ NVTX_DECLSPEC void NVTX_API nvtxRangeEndPayload( nvtxDomainHandle_t domain, nvtxRangeId_t id, const nvtxPayloadData_t* payloadData, size_t count); /** * @brief Checks if an NVTX domain is enabled (unofficial and may not work) * * @param domain NVTX domain handle * @return 0 if the domain is not enabled. */ NVTX_DECLSPEC uint8_t NVTX_API nvtxDomainIsEnabled( nvtxDomainHandle_t domain); #endif /* NVTX_PAYLOAD_API_FUNCTIONS_V1 */ #ifndef NVTX_PAYLOAD_CALLBACK_ID_V1 #define NVTX_PAYLOAD_CALLBACK_ID_V1 /** * \brief Callback Ids of API functions in the payload extension. * * The NVTX handler can use these values to register a handler function. When * InitializeInjectionNvtxExtension(nvtxExtModuleInfo_t* moduleInfo) is * executed, a handler routine 'handlenvtxPayloadRegisterSchema' can be * registered as follows: * \code{.c} * moduleInfo->segments->slots[NVTX3EXT_CBID_nvtxPayloadSchemaRegister] = * (intptr_t)YourPayloadRegisterSchemaHandlerFn; * \endcode */ #define NVTX3EXT_CBID_nvtxPayloadSchemaRegister 0 #define NVTX3EXT_CBID_nvtxPayloadEnumRegister 1 #define NVTX3EXT_CBID_nvtxMarkPayload 2 #define NVTX3EXT_CBID_nvtxRangePushPayload 3 #define NVTX3EXT_CBID_nvtxRangePopPayload 4 #define NVTX3EXT_CBID_nvtxRangeStartPayload 5 #define NVTX3EXT_CBID_nvtxRangeEndPayload 6 #define NVTX3EXT_CBID_nvtxDomainIsEnabled 7 #define NVTX3EXT_CBID_nvtxScopeRegister 12 #endif /* NVTX_PAYLOAD_CALLBACK_ID_V1 */ /*** Helper utilities ***/ /** \brief Helper macro for safe double-cast of pointer to uint64_t value. */ #ifndef NVTX_POINTER_AS_PAYLOAD_ULLVALUE # ifdef __cplusplus # define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) \ static_cast(reinterpret_cast(p)) # else #define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) ((uint64_t)(uintptr_t)p) # endif #endif #ifndef NVTX_PAYLOAD_EVTATTR_SET_DATA /** * \brief Helper macro to attach a single payload to an NVTX event attribute. * * @param evtAttr NVTX event attribute (variable name) * @param pldata_addr Adress of `nvtxPayloadData_t` variable. * @param schema_id NVTX binary payload schema ID. * @param pl_addr Address of the (actual) payload. * @param sz size of the (actual) payload. */ #define NVTX_PAYLOAD_EVTATTR_SET_DATA(evtAttr, pldata_addr, schema_id, pl_addr, sz) \ (pldata_addr)->schemaId = schema_id; \ (pldata_addr)->size = sz; \ (pldata_addr)->payload = pl_addr; \ (evtAttr).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(pldata_addr); \ (evtAttr).payloadType = NVTX_PAYLOAD_TYPE_EXT; \ (evtAttr).reserved0 = 1; #endif /* NVTX_PAYLOAD_EVTATTR_SET_DATA */ #ifndef NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE /** * \brief Helper macro to attach multiple payloads to an NVTX event attribute. * * @param evtAttr NVTX event attribute (variable name) * @param pldata Payload data array (of type `nvtxPayloadData_t`) */ #define NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE(evtAttr, pldata) \ (evtAttr).payloadType = NVTX_PAYLOAD_TYPE_EXT; \ (evtAttr).reserved0 = sizeof(pldata)/sizeof(nvtxPayloadData_t); \ (evtAttr).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(pldata); #endif /* NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE */ #ifndef NVTX_PAYLOAD_EVTATTR_SET /* * Do not use this macro directly! It is a helper to attach a single payload to * an NVTX event attribute. * @warning The NVTX push, start or mark operation must not be in an outer scope. 
*/ #define NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schema_id, pl_addr, sz) \ nvtxPayloadData_t _NVTX_PAYLOAD_DATA_VAR[] = \ {{schema_id, sz, pl_addr}}; \ (evtAttr)->payload.ullValue = \ NVTX_POINTER_AS_PAYLOAD_ULLVALUE(_NVTX_PAYLOAD_DATA_VAR); \ (evtAttr)->payloadType = NVTX_PAYLOAD_TYPE_EXT; \ (evtAttr)->reserved0 = 1; #endif /* NVTX_PAYLOAD_EVTATTR_SET */ #ifndef nvtxPayloadRangePush /** * \brief Helper macro to push a range with extended payload. * * @param domain NVTX domain handle (0 for default domain) * @param evtAttr pointer to NVTX event attribute. * @param schemaId NVTX payload schema ID * @param plAddr Pointer to the binary data (actual payload) * @param size Size of the binary payload data in bytes. */ #define nvtxPayloadRangePush(domain, evtAttr, schemaId, plAddr, size) \ do { \ NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schemaId, plAddr, size) \ nvtxDomainRangePushEx(domain, evtAttr); \ } while (0) #endif /* nvtxPayloadRangePush */ #ifndef nvtxPayloadMark /** * \brief Helper macro to set a marker with extended payload. * * @param domain NVTX domain handle (0 for default domain) * @param evtAttr pointer to NVTX event attribute. * @param schemaId NVTX payload schema ID * @param plAddr Pointer to the binary data (actual payload) * @param size Size of the binary payload data in bytes. */ #define nvtxPayloadMark(domain, evtAttr, schemaId, plAddr, size) \ do { \ NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schemaId, plAddr, size) \ nvtxDomainMarkEx(domain, evtAttr); \ } while (0) #endif /* nvtxPayloadMark */ #ifdef __GNUC__ #pragma GCC visibility push(internal) #endif /* Extension types are required for the implementation and the NVTX handler. */ #define NVTX_EXT_TYPES_GUARD #include "nvtxDetail/nvtxExtTypes.h" #undef NVTX_EXT_TYPES_GUARD #ifndef NVTX_NO_IMPL #define NVTX_EXT_IMPL_PAYLOAD_GUARD #include "nvtxDetail/nvtxExtImplPayload_v1.h" #undef NVTX_EXT_IMPL_PAYLOAD_GUARD #endif /* NVTX_NO_IMPL */ #ifdef __GNUC__ #pragma GCC visibility pop #endif #ifdef __cplusplus } #endif /* __cplusplus */ nccl-2.22.3-1/src/include/nvtx3/nvToolsExtPayloadHelper.h000066400000000000000000000152221463451655400231400ustar00rootroot00000000000000/* * Copyright 2023 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #include "nvtxDetail/nvtxExtPayloadHelperInternal.h" /* This is just an empty marker (for readability), which can be omitted. */ /* TODO: Fix issue with trailing comma at end of entry list. */ #define NVTX_PAYLOAD_ENTRIES /** * Use this macro for payload entries that are defined by a schema (nested * payload schema). */ #define NVTX_PAYLOAD_NESTED(schemaId) _NVTX_PAYLOAD_NESTED(schemaId) /** * \brief Define a payload schema for an existing C `struct` definition. * * This macro does * 1) create schema description (array of schema entries). * 2) set the schema attributes for a static data layout. * * It can be used in static code or within a function context. * * Example: * NVTX_DEFINE_SCHEMA_FOR_STRUCT(your_struct, "SchemaName", * NVTX_PAYLOAD_ENTRIES( * (index, TYPE_INT, "integer value"), * (dpfloat, TYPE_DOUBLE, "fp64 value"), * (text, TYPE_CSTRING, "text", NULL, 24) * ) * ) * * It is required to at least provide the struct name and the payload entries. * The first two fields (member name and NVTX entry type) of each payload entry * are required. * * The optional parameters are only allowed to be passed in the predefined order. 
* Hence, `payload_flags` requires `payload_schema` to be given and * `prefix` requires `payload_flags` and `payload_schema` to be given. * The payload entries are always the last parameter. A maximum of 16 schema * entries is supported. * * It is recommended to use `NVTX_PAYLOAD_SCHEMA_REGISTER` to register the schema. * * @param struct_id The name of the struct. * @param schema_name (Optional 1) name of the payload schema. Default is `NULL`. * @param prefix (Optional 2) prefix before the schema and attributes variables, * e.g. `static const`. Leave this empty, if no prefix is desired. * @param schema_flags (Optional 2) flags to augment the payload schema. * Default is `NVTX_PAYLOAD_SCHEMA_FLAG_NONE`. * @param schema_id (Optional 4) User-defined payload schema ID. * @param entries (Mandatory) Payload schema entries. This is always the last * parameter to the macro. */ #define NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, ...) \ _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, __VA_ARGS__) /** * \brief Define a C struct together with a matching schema. * * This macro does * 1) define the payload type (typedef struct). * 2) create schema description (array of schema entries). * 3) set the schema attributes for a static data layout. * * The macro can be used in static code or within a function context. * * It defines the schema attributes in `struct_id##Attr`. Thus, it is recommended * to use `NVTX_PAYLOAD_SCHEMA_REGISTER(domain, struct_id)` to register the schema. * * Example: * NVTX_DEFINE_STRUCT_WITH_SCHEMA(your_struct_name, "Your schema name", * NVTX_PAYLOAD_ENTRIES( * (int, index, TYPE_INT, "integer value"), * (double, dpfloat, TYPE_DOUBLE, "fp64 value"), * (const char, (text, 24), TYPE_CSTRING, "text", NULL, 24) * ) * ) * * The first three fields (C type, member, entry type) of each entry are required. * A fixed-size array or string requires a special notation with the member * name and the size separated by comma and put into brackets (see last entry * in the example). * * The optional parameters are positional (only allowed to be passed in the * predefined order). A maximum of 16 schema entries is supported. * * @param struct_id The name of the struct. * @param schema_name (Optional 1) name of the payload schema. Default is `NULL`. * @param prefix (Optional 2) prefix before the schema and attributes variables, * e.g. `static const`. Leave this empty, if no prefix is desired. * @param schema_flags (Optional 3) flags to augment the payload schema. * Default is `NVTX_PAYLOAD_SCHEMA_FLAG_NONE`. * @param schema_id (Optional 4) User-defined payload schema ID. * @param entries (Mandatory) The schema entries. This is always the last * parameter to the macro. */ #define NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, ...) \ _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, __VA_ARGS__) /** * \brief Initialize and register the NVTX binary payload schema. * * This does essentially the same as `NVTX_DEFINE_STRUCT_WITH_SCHEMA`, but in * addition the schema is registered. The schema ID will be defined as follows: * `const uint64_t struct_id##_schemaId`. * * @param domain The NVTX domain handle (0 for default domain). * All other parameters are similar to `NVTX_DEFINE_STRUCT_WITH_SCHEMA`. */ #define NVTX_DEFINE_STRUCT_WITH_SCHEMA_AND_REGISTER(domain, struct_id, ...) \ _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, __VA_ARGS__) \ const uint64_t struct_id##_schemaId = nvtxPayloadSchemaRegister(domain, &struct_id##Attr); /** * \brief Define payload schema for an existing `struct` and register the schema. 
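 *
 * For illustration (struct name, schema name, and entries are placeholders):
 *   NVTX_DEFINE_SCHEMA_FOR_STRUCT_AND_REGISTER(domain, your_struct, "SchemaName",
 *      NVTX_PAYLOAD_ENTRIES(
 *          (index, TYPE_INT, "integer value"),
 *          (dpfloat, TYPE_DOUBLE, "fp64 value")
 *      )
 *   )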
* * This does essentially the same as `NVTX_PAYLOAD_STATIC_SCHEMA_DEFINE`, but in * addition, the schema is registered and `uint64_t struct_id##_schemaId` set. * * @param domain The NVTX domain handle (0 for default domain). * All other parameters are similar to `NVTX_PAYLOAD_STATIC_SCHEMA_DEFINE`. */ #define NVTX_DEFINE_SCHEMA_FOR_STRUCT_AND_REGISTER(domain, struct_id, ...) \ _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, __VA_ARGS__) \ const uint64_t struct_id##_schemaId = nvtxPayloadSchemaRegister(domain, &struct_id##Attr); /** * \brief Create a type definition for the given struct ID and members. * * This is a convenience macro. A normal `typedef` can be used instead. * * Example usage: * NVTX_DEFINE_STRUCT(your_struct, * (double, fp64), * (uint8_t, u8), * (float, fp32[3]) * ) * * @param struct_id The name of the struct. * @param members The members of the struct. */ #define NVTX_DEFINE_STRUCT(struct_id, ...) \ _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, __VA_ARGS__) /** * \brief Register an NVTX binary payload schema. * * This is a convenience macro, which takes the same `struct_id` that has been * used in other helper macros. Instead, `nvtxPayloadSchemaRegister` can also be * used, but `&struct_id##Attr` has to be passed. * * @param domain The NVTX domain handle (0 for default domain). * @param struct_id The name of the struct. * * @return NVTX schema ID */ #define NVTX_PAYLOAD_SCHEMA_REGISTER(domain, struct_id) \ nvtxPayloadSchemaRegister(domain, &struct_id##Attr); nccl-2.22.3-1/src/include/nvtx3/nvToolsExtSemanticsCounters.h000066400000000000000000000046501463451655400240630ustar00rootroot00000000000000/* * Copyright 2024 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ /** * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand. */ #ifndef NVTX_SEMANTIC_ID_COUNTERS_V1 #define NVTX_SEMANTIC_ID_COUNTERS_V1 2 /** * Flags to extend the semantics of counters. */ #define NVTX_COUNTERS_FLAGS_NONE 0 /** * Convert the fixed point value to a normalized floating point value. * Unsigned [0f : 1f] or signed [-1f : 1f] is determined by the underlying type * this flag is applied to. */ #define NVTX_COUNTERS_FLAG_NORMALIZE (1 << 1) /** * Visual tools should apply scale and limits when graphing. */ #define NVTX_COUNTERS_FLAG_LIMIT_MIN (1 << 2) #define NVTX_COUNTERS_FLAG_LIMIT_MAX (1 << 3) #define NVTX_COUNTERS_FLAG_LIMITS \ (NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX) /** * Counter time scopes. */ #define NVTX_COUNTERS_FLAG_TIMESCOPE_POINT (1 << 5) #define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_LAST (2 << 5) #define NVTX_COUNTERS_FLAG_TIMESCOPE_UNTIL_NEXT (3 << 5) #define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_START (4 << 5) /** * Counter value types. */ #define NVTX_COUNTERS_FLAG_VALUETYPE_ABSOLUTE (1 << 10) /** Delta to previous value of same counter type. */ #define NVTX_COUNTERS_FLAG_VALUETYPE_DELTA (2 << 10) /** * Datatypes for the `limits` union. */ #define NVTX_COUNTERS_LIMIT_I64 0 #define NVTX_COUNTERS_LIMIT_U64 1 #define NVTX_COUNTERS_LIMIT_F64 2 /** *\brief Specify counter semantics. */ typedef struct nvtxSemanticsCounter_v1 { /** Header of the semantic extensions (with identifier, version, etc.). */ struct nvtxSemanticsHeader_v1 header; /** Flags to provide more context about the counter value. */ uint64_t flags; /** Unit of the counter value (case-insensitive). 
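      Illustrative examples of unit strings: "bytes", "ms", "MiB/s".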
*/ const char* unit; /** Should be 1 if not used. */ uint64_t unitScaleNumerator; /** Should be 1 if not used. */ uint64_t unitScaleDenominator; /** Determines the used union member. Use defines `NVTX_COUNTER_LIMIT_*`. */ int64_t limitType; /** Graph limits {minimum, maximum}. */ union limits_t { int64_t i64[2]; uint64_t u64[2]; double d[2]; } limits; } nvtxSemanticsCounter_t; #endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */nccl-2.22.3-1/src/include/nvtx3/nvToolsExtSemanticsScope.h000066400000000000000000000015331463451655400233270ustar00rootroot00000000000000/* * Copyright 2024 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ /** * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand. */ #ifndef NVTX_SEMANTIC_ID_SCOPE_V1 #define NVTX_SEMANTIC_ID_SCOPE_V1 1 /** * \brief Specify the NVTX scope for a payload entry. * * This allows the scope to be set for a specific value or counter in a payload. * The scope must be known at schema registration time. */ typedef struct nvtxSemanticsScope_v1 { struct nvtxSemanticsHeader_v1 header; /** Specifies the scope of a payload entry, e.g. a counter or timestamp. */ uint64_t scopeId; } nvtxSemanticsScope_t; #endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */nccl-2.22.3-1/src/include/nvtx3/nvToolsExtSync.h000066400000000000000000000315471463451655400213330ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #include "nvToolsExt.h" #ifndef NVTOOLSEXT_SYNC_V3 #define NVTOOLSEXT_SYNC_V3 #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /* \cond SHOW_HIDDEN * \version \NVTX_VERSION_2 */ #define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) ) /** \endcond */ /** * \page PAGE_SYNCHRONIZATION Synchronization * * This section covers a subset of the API that allow users to track additional * synchronization details of their application. Naming OS synchronization primitives * may allow users to better understand the data collected by traced synchronization * APIs. Additionally, a user defined synchronization object can allow the users to * to tell the tools when the user is building their own synchronization system * that do not rely on the OS to provide behaviors and instead use techniques like * atomic operations and spinlocks. * * See module \ref SYNCHRONIZATION for details. 
* * \par Example: * \code * class MyMutex * { * volatile long bLocked; * nvtxSyncUser_t hSync; * public: * MyMutex(const char* name, nvtxDomainHandle_t d){ * bLocked = 0; * * nvtxSyncUserAttributes_t attribs = { 0 }; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE; * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; * attribs.message.ascii = name; * hSync = nvtxDomainSyncUserCreate(d, &attribs); * } * * ~MyMutex() { * nvtxDomainSyncUserDestroy(hSync); * } * * bool Lock() { * nvtxDomainSyncUserAcquireStart(hSync); * bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic * if (acquired) { * nvtxDomainSyncUserAcquireSuccess(hSync); * } * else { * nvtxDomainSyncUserAcquireFailed(hSync); * } * return acquired; * } * void Unlock() { * nvtxDomainSyncUserReleasing(hSync); * bLocked = false; * } * }; * \endcode * * \version \NVTX_VERSION_2 */ /* ------------------------------------------------------------------------- */ /* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ #define NVTX_RESOURCE_CLASS_SYNC_OS 2 /**< Synchronization objects that are OS specific. */ #define NVTX_RESOURCE_CLASS_SYNC_PTHREAD 3 /**< Synchronization objects that are from the POSIX Threads API (pthread)*/ /** \endcond */ /* ------------------------------------------------------------------------- */ /** \defgroup SYNCHRONIZATION Synchronization * See page \ref PAGE_SYNCHRONIZATION. * @{ */ /** \brief Resource type values for OSs with POSIX Thread API support */ typedef enum nvtxResourceSyncPosixThreadType_t { NVTX_RESOURCE_TYPE_SYNC_PTHREAD_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 1), /* pthread_mutex_t */ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_CONDITION = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 2), /* pthread_cond_t */ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_RWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 3), /* pthread_rwlock_t */ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_BARRIER = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 4), /* pthread_barrier_t */ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 5), /* pthread_spinlock_t */ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_ONCE = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 6) /* pthread_once_t */ } nvtxResourceSyncPosixThreadType_t; /** \brief Resource type values for Windows OSs */ typedef enum nvtxResourceSyncWindowsType_t { NVTX_RESOURCE_TYPE_SYNC_WINDOWS_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1), NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2), NVTX_RESOURCE_TYPE_SYNC_WINDOWS_EVENT = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3), NVTX_RESOURCE_TYPE_SYNC_WINDOWS_CRITICAL_SECTION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4), NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SRWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5) } nvtxResourceSyncWindowsType_t; /** \brief Resource type values for Linux and Linux derived OSs such as Android * \sa * ::nvtxResourceSyncPosixThreadType_t */ typedef enum nvtxResourceSyncLinuxType_t { NVTX_RESOURCE_TYPE_SYNC_LINUX_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1), NVTX_RESOURCE_TYPE_SYNC_LINUX_FUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2), NVTX_RESOURCE_TYPE_SYNC_LINUX_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3), NVTX_RESOURCE_TYPE_SYNC_LINUX_COMPLETION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4), NVTX_RESOURCE_TYPE_SYNC_LINUX_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5), NVTX_RESOURCE_TYPE_SYNC_LINUX_SEQLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 6), NVTX_RESOURCE_TYPE_SYNC_LINUX_RCU = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 7) 
} nvtxResourceSyncLinuxType_t; /** \brief Resource type values for Android come from Linux. * \sa * ::nvtxResourceSyncLinuxType_t * ::nvtxResourceSyncPosixThreadType_t */ typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t; /** \brief User Defined Synchronization Object Handle . * \anchor SYNCUSER_HANDLE_STRUCTURE * * This structure is opaque to the user and is used as a handle to reference * a user defined syncrhonization object. The tools will return a pointer through the API for the application * to hold on it's behalf to reference the string in the future. * */ typedef struct nvtxSyncUser* nvtxSyncUser_t; /** \brief User Defined Synchronization Object Attributes Structure. * \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE * * This structure is used to describe the attributes of a user defined synchronization * object. The layout of the structure is defined by a specific version of the tools * extension library and can change between different versions of the Tools Extension * library. * * \par Initializing the Attributes * * The caller should always perform the following three tasks when using * attributes: *
    *
   - Zero the structure *
   - Set the version field *
   - Set the size field *
* * Zeroing the structure sets all the event attributes types and values * to the default value. * * The version and size field are used by the Tools Extension * implementation to handle multiple versions of the attributes structure. * * It is recommended that the caller use one of the following to methods * to initialize the event attributes structure: * * \par Method 1: Initializing nvtxEventAttributes for future compatibility * \code * nvtxSyncUserAttributes_t attribs = {0}; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE; * \endcode * * \par Method 2: Initializing nvtxSyncUserAttributes_t for a specific version * \code * nvtxSyncUserAttributes_t attribs = {0}; * attribs.version = 1; * attribs.size = (uint16_t)(sizeof(nvtxSyncUserAttributes_t)); * \endcode * * If the caller uses Method 1 it is critical that the entire binary * layout of the structure be configured to 0 so that all fields * are initialized to the default value. * * The caller should either use both NVTX_VERSION and * NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values * and a versioned type (Method 2). Using a mix of the two methods * will likely cause either source level incompatibility or binary * incompatibility in the future. * * \par Settings Attribute Types and Values * * * \par Example: * \code * // Initialize * nvtxSyncUserAttributes_t attribs = {0}; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE; * * // Configure the Attributes * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; * attribs.message.ascii = "Example"; * \endcode * * \sa * ::nvtxDomainSyncUserCreate */ typedef struct nvtxSyncUserAttributes_v0 { /** * \brief Version flag of the structure. * * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs * supported in this header file. This can optionally be overridden to * another version of the tools extension library. */ uint16_t version; /** * \brief Size of the structure. * * Needs to be set to the size in bytes of the event attribute * structure used to specify the event. */ uint16_t size; /** \brief Message type specified in this attribute structure. * * Defines the message format of the attribute structure's \ref nvtxSyncUserAttributes_v0::message * "message" field. * * Default Value is NVTX_MESSAGE_UNKNOWN */ int32_t messageType; /* nvtxMessageType_t */ /** \brief Message assigned to this attribute structure. * * The text message that is attached to an event. */ nvtxMessageValue_t message; } nvtxSyncUserAttributes_v0; typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t; /* ------------------------------------------------------------------------- */ /** \brief Create a user defined synchronization object * This is used to track non-OS synchronization working with spinlocks and atomics * * \param domain - Domain to own the resource * \param attribs - A structure to assign multiple attributes to the object. * * \return A handle that represents the newly created user defined synchronization object. 
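 *
 * A minimal creation sketch (illustrative; `domain` is an existing domain
 * handle and error checking is omitted):
 * \code
 * nvtxSyncUserAttributes_t attribs = {0};
 * attribs.version = NVTX_VERSION;
 * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
 * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
 * attribs.message.ascii = "my sync object";
 * nvtxSyncUser_t handle = nvtxDomainSyncUserCreate(domain, &attribs);
 * \endcode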
* * \sa * ::nvtxDomainSyncUserCreate * ::nvtxDomainSyncUserDestroy * ::nvtxDomainSyncUserAcquireStart * ::nvtxDomainSyncUserAcquireFailed * ::nvtxDomainSyncUserAcquireSuccess * ::nvtxDomainSyncUserReleasing * * \version \NVTX_VERSION_2 */ NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs); /* ------------------------------------------------------------------------- */ /** \brief Destroy a user defined synchronization object * This is used to track non-OS synchronization working with spinlocks and atomics * * \param handle - A handle to the object to operate on. * * \sa * ::nvtxDomainSyncUserCreate * ::nvtxDomainSyncUserDestroy * ::nvtxDomainSyncUserAcquireStart * ::nvtxDomainSyncUserAcquireFailed * ::nvtxDomainSyncUserAcquireSuccess * ::nvtxDomainSyncUserReleasing * * \version \NVTX_VERSION_2 */ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle); /* ------------------------------------------------------------------------- */ /** \brief Signal to tools that an attempt to acquire a user defined synchronization object * * \param handle - A handle to the object to operate on. * * \sa * ::nvtxDomainSyncUserCreate * ::nvtxDomainSyncUserDestroy * ::nvtxDomainSyncUserAcquireStart * ::nvtxDomainSyncUserAcquireFailed * ::nvtxDomainSyncUserAcquireSuccess * ::nvtxDomainSyncUserReleasing * * \version \NVTX_VERSION_2 */ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle); /* ------------------------------------------------------------------------- */ /** \brief Signal to tools of failure in acquiring a user defined synchronization object * This should be called after \ref nvtxDomainSyncUserAcquireStart * * \param handle - A handle to the object to operate on. * * \sa * ::nvtxDomainSyncUserCreate * ::nvtxDomainSyncUserDestroy * ::nvtxDomainSyncUserAcquireStart * ::nvtxDomainSyncUserAcquireFailed * ::nvtxDomainSyncUserAcquireSuccess * ::nvtxDomainSyncUserReleasing * * \version \NVTX_VERSION_2 */NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle); /* ------------------------------------------------------------------------- */ /** \brief Signal to tools of success in acquiring a user defined synchronization object * This should be called after \ref nvtxDomainSyncUserAcquireStart. * * \param handle - A handle to the object to operate on. * * \sa * ::nvtxDomainSyncUserCreate * ::nvtxDomainSyncUserDestroy * ::nvtxDomainSyncUserAcquireStart * ::nvtxDomainSyncUserAcquireFailed * ::nvtxDomainSyncUserAcquireSuccess * ::nvtxDomainSyncUserReleasing * * \version \NVTX_VERSION_2 */NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle); /* ------------------------------------------------------------------------- */ /** \brief Signal to tools of releasing a reservation on user defined synchronization object * This should be called after \ref nvtxDomainSyncUserAcquireSuccess. * * \param handle - A handle to the object to operate on. 
* * \sa * ::nvtxDomainSyncUserCreate * ::nvtxDomainSyncUserDestroy * ::nvtxDomainSyncUserAcquireStart * ::nvtxDomainSyncUserAcquireFailed * ::nvtxDomainSyncUserAcquireSuccess * ::nvtxDomainSyncUserReleasing * * \version \NVTX_VERSION_2 */ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle); /** @} */ /*END defgroup*/ #ifdef __cplusplus } #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL #define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxImplSync_v3.h" #undef NVTX_IMPL_GUARD_SYNC #endif /*NVTX_NO_IMPL*/ #endif /* NVTOOLSEXT_SYNC_V3 */ nccl-2.22.3-1/src/include/nvtx3/nvtx3.hpp000066400000000000000000003063361463451655400177740ustar00rootroot00000000000000/* * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ /* Temporary helper #defines, #undef'ed at end of header */ #define NVTX3_CPP_VERSION_MAJOR 1 #define NVTX3_CPP_VERSION_MINOR 0 /* This section handles the decision of whether to provide unversioned symbols. * If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is #defined, unversioned symbols are * not provided, and explicit-version symbols such as nvtx3::v1::scoped_range * and NVTX3_V1_FUNC_RANGE must be used. By default, the first #include of this * header will define the unversioned symbols such as nvtx3::scoped_range and * NVTX3_FUNC_RANGE. Subsequently including a different major version of this * header without #defining NVTX3_CPP_REQUIRE_EXPLICIT_VERSION triggers an error * since the symbols would conflict. Subsequently including of a different * minor version within the same major version is allowed. Functionality of * minor versions is cumulative, regardless of include order. * * Since NVTX3_CPP_REQUIRE_EXPLICIT_VERSION allows all combinations of versions * to coexist without problems within a translation unit, the recommended best * practice for instrumenting header-based libraries with NVTX C++ Wrappers is * is to #define NVTX3_CPP_REQUIRE_EXPLICIT_VERSION before including nvtx3.hpp, * #undef it afterward, and only use explicit-version symbols. This is not * necessary in common cases, such as instrumenting a standalone application, or * static/shared libraries in .cpp files or headers private to those projects. */ /* clang-format off */ #if !defined(NVTX3_CPP_REQUIRE_EXPLICIT_VERSION) /* Define macro used by all definitions in this header to indicate the * unversioned symbols should be defined in addition to the versioned ones. */ #define NVTX3_INLINE_THIS_VERSION #if !defined(NVTX3_CPP_INLINED_VERSION_MAJOR) /* First occurrence of this header in the translation unit. Define macros * indicating which version shall be used for unversioned symbols. 
*/ /** * @brief Semantic major version number for NVTX C++ wrappers of unversioned symbols * * Breaking changes may occur between major versions, and different major versions * cannot provide unversioned symbols in the same translation unit (.cpp file). * * Note: If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is defined, this macro is not defined. * * Not to be confused with the version number of the NVTX core library. */ #define NVTX3_CPP_INLINED_VERSION_MAJOR 1 // NVTX3_CPP_VERSION_MAJOR /** * @brief Semantic minor version number for NVTX C++ wrappers of unversioned symbols * * No breaking changes occur between minor versions -- minor version changes within * a major version are purely additive. * * Note: If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is defined, this macro is not defined. * * Not to be confused with the version number of the NVTX core library. */ #define NVTX3_CPP_INLINED_VERSION_MINOR 0 // NVTX3_CPP_VERSION_MINOR #elif NVTX3_CPP_INLINED_VERSION_MAJOR != NVTX3_CPP_VERSION_MAJOR /* Unsupported case -- cannot define unversioned symbols for different major versions * in the same translation unit. */ #error \ "Two different major versions of the NVTX C++ Wrappers are being included in a single .cpp file, with unversioned symbols enabled in both. Only one major version can enable unversioned symbols in a .cpp file. To disable unversioned symbols, #define NVTX3_CPP_REQUIRE_EXPLICIT_VERSION before #including nvtx3.hpp, and use the explicit-version symbols instead -- this is the preferred way to use nvtx3.hpp from a header file." #elif (NVTX3_CPP_INLINED_VERSION_MAJOR == NVTX3_CPP_VERSION_MAJOR) && \ (NVTX3_CPP_INLINED_VERSION_MINOR < NVTX3_CPP_VERSION_MINOR) /* An older minor version of the same major version already defined unversioned * symbols. The new features provided in this header will be inlined * redefine the minor version macro to this header's version. */ #undef NVTX3_CPP_INLINED_VERSION_MINOR #define NVTX3_CPP_INLINED_VERSION_MINOR 0 // NVTX3_CPP_VERSION_MINOR // else, already have this version or newer, nothing to do #endif #endif /* clang-format on */ /** * @file nvtx3.hpp * * @brief Provides C++ constructs making the NVTX library safer and easier to * use with zero overhead. */ /** * \mainpage * \tableofcontents * * \section QUICK_START Quick Start * * To add NVTX ranges to your code, use the `nvtx3::scoped_range` RAII object. A * range begins when the object is created, and ends when the object is * destroyed. * * \code{.cpp} * #include "nvtx3.hpp" * void some_function() { * // Begins a NVTX range with the messsage "some_function" * // The range ends when some_function() returns and `r` is destroyed * nvtx3::scoped_range r{"some_function"}; * * for(int i = 0; i < 6; ++i) { * nvtx3::scoped_range loop{"loop range"}; * std::this_thread::sleep_for(std::chrono::seconds{1}); * } * } // Range ends when `r` is destroyed * \endcode * * The example code above generates the following timeline view in Nsight * Systems: * * \image html * https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/docs/images/example_range.png * * Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add * ranges to your code that automatically use the name of the enclosing function * as the range's message. * * \code{.cpp} * #include "nvtx3.hpp" * void some_function() { * // Creates a range with a message "some_function" that ends when the * // enclosing function returns * NVTX3_FUNC_RANGE(); * ... 
* } * \endcode * * * \section Overview * * The NVTX library provides a set of functions for users to annotate their code * to aid in performance profiling and optimization. These annotations provide * information to tools like Nsight Systems to improve visualization of * application timelines. * * \ref RANGES are one of the most commonly used NVTX constructs for annotating * a span of time. For example, imagine a user wanted to see every time a * function, `my_function`, is called and how long it takes to execute. This can * be accomplished with an NVTX range created on the entry to the function and * terminated on return from `my_function` using the push/pop C APIs: * * \code{.cpp} * void my_function(...) { * nvtxRangePushA("my_function"); // Begins NVTX range * // do work * nvtxRangePop(); // Ends NVTX range * } * \endcode * * One of the challenges with using the NVTX C API is that it requires manually * terminating the end of the range with `nvtxRangePop`. This can be challenging * if `my_function()` has multiple returns or can throw exceptions as it * requires calling `nvtxRangePop()` before all possible return points. * * NVTX C++ solves this inconvenience through the "RAII" technique by providing * a `nvtx3::scoped_range` class that begins a range at construction and ends * the range on destruction. The above example then becomes: * * \code{.cpp} * void my_function(...) { * nvtx3::scoped_range r{"my_function"}; // Begins NVTX range * // do work * } // Range ends on exit from `my_function` when `r` is destroyed * \endcode * * The range object `r` is deterministically destroyed whenever `my_function` * returns---ending the NVTX range without manual intervention. For more * information, see \ref RANGES and `nvtx3::scoped_range_in`. * * Another inconvenience of the NVTX C APIs are the several constructs where the * user is expected to initialize an object at the beginning of an application * and reuse that object throughout the lifetime of the application. For example * see domains, categories, and registered messages. * * Example: * \code{.cpp} * nvtxDomainHandle_t D = nvtxDomainCreateA("my domain"); * // Reuse `D` throughout the rest of the application * \endcode * * This can be problematic if the user application or library does not have an * explicit initialization function called before all other functions to * ensure that these long-lived objects are initialized before being used. * * NVTX C++ makes use of the "construct on first use" technique to alleviate * this inconvenience. In short, a function local static object is constructed * upon the first invocation of a function and returns a reference to that * object on all future invocations. See the documentation for `nvtx3::domain`, * `nvtx3::named_category`, `nvtx3::registered_string`, and * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use for more * information. * * Using construct on first use, the above example becomes: * \code{.cpp} * struct my_domain{ static constexpr char const* name{"my domain"}; }; * * // The first invocation of `domain::get` for the type `my_domain` will * // construct a `nvtx3::domain` object and return a reference to it. Future * // invocations simply return a reference. * nvtx3::domain const& D = nvtx3::domain::get(); * \endcode * For more information about NVTX and how it can be used, see * https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx and * https://devblogs.nvidia.com/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ * for more information. 
* * \section RANGES Ranges * * Ranges are used to describe a span of time during the execution of an * application. Common examples are using ranges to annotate the time it takes * to execute a function or an iteration of a loop. * * NVTX C++ uses RAII to automate the generation of ranges that are tied to the * lifetime of objects. Similar to `std::lock_guard` in the C++ Standard * Template Library. * * \subsection scoped_range Scoped Range * * `nvtx3::scoped_range_in` is a class that begins a range upon construction * and ends the range at destruction. This is one of the most commonly used * constructs in NVTX C++ and is useful for annotating spans of time on a * particular thread. These ranges can be nested to arbitrary depths. * * `nvtx3::scoped_range` is an alias for a `nvtx3::scoped_range_in` in the * global NVTX domain. For more information about Domains, see \ref DOMAINS. * * Various attributes of a range can be configured constructing a * `nvtx3::scoped_range_in` with a `nvtx3::event_attributes` object. For * more information, see \ref ATTRIBUTES. * * Example: * * \code{.cpp} * void some_function() { * // Creates a range for the duration of `some_function` * nvtx3::scoped_range r{}; * * while(true) { * // Creates a range for every loop iteration * // `loop_range` is nested inside `r` * nvtx3::scoped_range loop_range{}; * } * } * \endcode * * \subsection unique_range Unique Range * * `nvtx3::unique_range` is similar to `nvtx3::scoped_range`, with a few key differences: * - `unique_range` objects can be destroyed in any order whereas `scoped_range` objects must be * destroyed in exact reverse creation order * - `unique_range` can start and end on different threads * - `unique_range` is moveable * - `unique_range` objects can be constructed as heap objects * * There is extra overhead associated with `unique_range` constructs and therefore use of * `nvtx3::scoped_range_in` should be preferred. * * \section MARKS Marks * * `nvtx3::mark` annotates an instantaneous point in time with a "marker". * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::mark("operation failed!"); * } * \endcode * * \section DOMAINS Domains * * Similar to C++ namespaces, domains allow for scoping NVTX events. By default, * all NVTX events belong to the "global" domain. Libraries and applications * should scope their events to use a custom domain to differentiate where the * events originate from. * * It is common for a library or application to have only a single domain and * for the name of that domain to be known at compile time. Therefore, Domains * in NVTX C++ are represented by _tag types_. * * For example, to define a custom domain, simply define a new concrete type * (a `class` or `struct`) with a `static` member called `name` that contains * the desired name of the domain. * * \code{.cpp} * struct my_domain{ static constexpr char const* name{"my domain"}; }; * \endcode * * For any NVTX C++ construct that can be scoped to a domain, the type * `my_domain` can be passed as an explicit template argument to scope it to * the custom domain. * * The tag type `nvtx3::domain::global` represents the global NVTX domain. 
* * \code{.cpp} * // By default, `scoped_range_in` belongs to the global domain * nvtx3::scoped_range_in<> r0{}; * * // Alias for a `scoped_range_in` in the global domain * nvtx3::scoped_range r1{}; * * // `r` belongs to the custom domain * nvtx3::scoped_range_in r{}; * \endcode * * When using a custom domain, it is recommended to define type aliases for NVTX * constructs in the custom domain. * \code{.cpp} * using my_scoped_range = nvtx3::scoped_range_in; * using my_registered_string = nvtx3::registered_string_in; * using my_named_category = nvtx3::named_category_in; * \endcode * * See `nvtx3::domain` for more information. * * \section ATTRIBUTES Event Attributes * * NVTX events can be customized with various attributes to provide additional * information (such as a custom message) or to control visualization of the * event (such as the color used). These attributes can be specified per-event * via arguments to a `nvtx3::event_attributes` object. * * NVTX events can be customized via four "attributes": * - \ref COLOR : color used to visualize the event in tools. * - \ref MESSAGES : Custom message string. * - \ref PAYLOAD : User-defined numerical value. * - \ref CATEGORY : Intra-domain grouping. * * It is possible to construct a `nvtx3::event_attributes` from any number of * attribute objects (nvtx3::color, nvtx3::message, nvtx3::payload, * nvtx3::category) in any order. If an attribute is not specified, a tool * specific default value is used. See `nvtx3::event_attributes` for more * information. * * \code{.cpp} * // Set message, same as passing nvtx3::message{"message"} * nvtx3::event_attributes attr{"message"}; * * // Set message and color * nvtx3::event_attributes attr{"message", nvtx3::rgb{127, 255, 0}}; * * // Set message, color, payload, category * nvtx3::event_attributes attr{"message", * nvtx3::rgb{127, 255, 0}, * nvtx3::payload{42}, * nvtx3::category{1}}; * * // Same as above -- can use any order of arguments * nvtx3::event_attributes attr{nvtx3::payload{42}, * nvtx3::category{1}, * "message", * nvtx3::rgb{127, 255, 0}}; * * // Multiple arguments of the same type are allowed, but only the first is * // used -- in this example, payload is set to 42: * nvtx3::event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; * * // Using the nvtx3 namespace in a local scope makes the syntax more succinct: * using namespace nvtx3; * event_attributes attr{"message", rgb{127, 255, 0}, payload{42}, category{1}}; * \endcode * * \subsection MESSAGES message * * `nvtx3::message` sets the message string for an NVTX event. * * Example: * \code{.cpp} * // Create an `event_attributes` with the message "my message" * nvtx3::event_attributes attr{nvtx3::message{"my message"}}; * * // strings and string literals implicitly assumed to be a `nvtx3::message` * nvtx3::event_attributes attr{"my message"}; * \endcode * * \subsubsection REGISTERED_MESSAGE Registered Messages * * Associating a `nvtx3::message` with an event requires copying the contents of * the message every time the message is used, i.e., copying the entire message * string. This may cause non-trivial overhead in performance sensitive code. * * To eliminate this overhead, NVTX allows registering a message string, * yielding a "handle" that is inexpensive to copy that may be used in place of * a message string. When visualizing the events, tools such as Nsight Systems * will take care of mapping the message handle to its string. * * A message should be registered once and the handle reused throughout the rest * of the application. 
This can be done by either explicitly creating static * `nvtx3::registered_string` objects, or using the * `nvtx3::registered_string::get` construct on first use helper (recommended). * * Similar to \ref DOMAINS, `nvtx3::registered_string::get` requires defining a * custom tag type with a static `message` member whose value will be the * contents of the registered string. * * Example: * \code{.cpp} * // Explicitly constructed, static `registered_string` in my_domain: * static registered_string_in static_message{"my message"}; * * // Or use construct on first use: * // Define a tag type with a `message` member string to register * struct my_message{ static constexpr char const* message{ "my message" }; }; * * // Uses construct on first use to register the contents of * // `my_message::message` * auto& msg = nvtx3::registered_string_in::get(); * \endcode * * \subsection COLOR color * * Associating a `nvtx3::color` with an event allows controlling how the event * is visualized in a tool such as Nsight Systems. This is a convenient way to * visually differentiate among different events. * * \code{.cpp} * // Define a color via rgb color values * nvtx3::color c{nvtx3::rgb{127, 255, 0}}; * nvtx3::event_attributes attr{c}; * * // rgb color values can be passed directly to an `event_attributes` * nvtx3::event_attributes attr1{nvtx3::rgb{127,255,0}}; * \endcode * * \subsection CATEGORY category * * A `nvtx3::category` is simply an integer id that allows for fine-grain * grouping of NVTX events. For example, one might use separate categories for * IO, memory allocation, compute, etc. * * \code{.cpp} * nvtx3::event_attributes{nvtx3::category{1}}; * \endcode * * \subsubsection NAMED_CATEGORIES Named Categories * * Associates a `name` string with a category `id` to help differentiate among * categories. * * For any given category id `Id`, a `named_category{Id, "name"}` should only * be constructed once and reused throughout an application. This can be done by * either explicitly creating static `nvtx3::named_category` objects, or using * the `nvtx3::named_category::get` construct on first use helper (recommended). * * Similar to \ref DOMAINS, `nvtx3::named_category::get` requires defining a * custom tag type with static `name` and `id` members. * * \code{.cpp} * // Explicitly constructed, static `named_category` in my_domain: * static nvtx3::named_category_in static_category{42, "my category"}; * * // Or use construct on first use: * // Define a tag type with `name` and `id` members * struct my_category { * static constexpr char const* name{"my category"}; // category name * static constexpr uint32_t id{42}; // category id * }; * * // Use construct on first use to name the category id `42` * // with name "my category": * auto& cat = named_category_in::get(); * * // Range `r` associated with category id `42` * nvtx3::event_attributes attr{cat}; * \endcode * * \subsection PAYLOAD payload * * Allows associating a user-defined numerical value with an event. 
* * \code{.cpp} * // Constructs a payload from the `int32_t` value 42 * nvtx3:: event_attributes attr{nvtx3::payload{42}}; * \endcode * * * \section EXAMPLE Example * * Putting it all together: * \code{.cpp} * // Define a custom domain tag type * struct my_domain{ static constexpr char const* name{"my domain"}; }; * * // Define a named category tag type * struct my_category{ * static constexpr char const* name{"my category"}; * static constexpr uint32_t id{42}; * }; * * // Define a registered string tag type * struct my_message{ static constexpr char const* message{"my message"}; }; * * // For convenience, use aliases for domain scoped objects * using my_scoped_range = nvtx3::scoped_range_in; * using my_registered_string = nvtx3::registered_string_in; * using my_named_category = nvtx3::named_category_in; * * // Default values for all attributes * nvtx3::event_attributes attr{}; * my_scoped_range r0{attr}; * * // Custom (unregistered) message, and unnamed category * nvtx3::event_attributes attr1{"message", nvtx3::category{2}}; * my_scoped_range r1{attr1}; * * // Alternatively, pass arguments of `event_attributes` ctor directly to * // `my_scoped_range` * my_scoped_range r2{"message", nvtx3::category{2}}; * * // construct on first use a registered string * auto& msg = my_registered_string::get(); * * // construct on first use a named category * auto& cat = my_named_category::get(); * * // Use registered string and named category with a custom payload * my_scoped_range r3{msg, cat, nvtx3::payload{42}}; * * // Any number of arguments in any order * my_scoped_range r{nvtx3::rgb{127, 255,0}, msg}; * * \endcode * \section MACROS Convenience Macros * * Oftentimes users want to quickly and easily add NVTX ranges to their library * or application to aid in profiling and optimization. * * A convenient way to do this is to use the \ref NVTX3_FUNC_RANGE and * \ref NVTX3_FUNC_RANGE_IN macros. These macros take care of constructing an * `nvtx3::scoped_range_in` with the name of the enclosing function as the * range's message. * * \code{.cpp} * void some_function() { * // Automatically generates an NVTX range for the duration of the function * // using "some_function" as the event's message. * NVTX3_FUNC_RANGE(); * } * \endcode * */ /* Temporary helper #defines, removed with #undef at end of header */ /* Some compilers do not correctly support SFINAE, which is used in this API * to detect common usage errors and provide clearer error messages (by using * static_assert) than the compiler would produce otherwise. These compilers * will generate errors while compiling this file such as: * * error: ‘name’ is not a member of ‘nvtx3::v1::domain::global’ * * The following compiler versions are known to have this problem, and so are * set by default to disable the SFINAE-based checks: * * - All MSVC versions prior to VS2017 Update 7 (15.7) * - GCC 8.1-8.3 (the problem was fixed in GCC 8.4) * * If you find your compiler hits this problem, you can work around it by * defining NVTX3_USE_CHECKED_OVERLOADS_FOR_GET to 0 before including this * header, or you can add a check for your compiler version to this #if. * Also, please report the issue on the NVTX github page. 
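 *
 * For example (the include path below is illustrative; it depends on how this
 * header is vendored into a project):
 *
 *   #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 0
 *   #include "nvtx3.hpp"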
*/ #if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET) #if defined(_MSC_VER) && _MSC_VER < 1914 \ || defined(__GNUC__) && __GNUC__ == 8 && __GNUC_MINOR__ < 4 #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 0 #else #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 1 #endif #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET_DEFINED_HERE #endif /* Within this header, nvtx3::NVTX3_VERSION_NAMESPACE resolves to nvtx3::vX, * where "X" is the major version number. */ #define NVTX3_CONCAT(A, B) A##B #define NVTX3_NAMESPACE_FOR(VERSION) NVTX3_CONCAT(v, VERSION) #define NVTX3_VERSION_NAMESPACE NVTX3_NAMESPACE_FOR(NVTX3_CPP_VERSION_MAJOR) /* Avoid duplicating #if defined(NVTX3_INLINE_THIS_VERSION) for namespaces * in each minor version by making a macro to use unconditionally, which * resolves to "inline" or nothing as appropriate. */ #if defined(NVTX3_INLINE_THIS_VERSION) #define NVTX3_INLINE_IF_REQUESTED inline #else #define NVTX3_INLINE_IF_REQUESTED #endif /* Enables the use of constexpr when support for C++14 constexpr is present. * * Initialization of a class member that is a union to a specific union member * can only be done in the body of a constructor, not in a member initializer * list. A constexpr constructor must have an empty body until C++14, so there * is no way to make an initializer of a member union constexpr in C++11. This * macro allows making functions constexpr in C++14 or newer, but non-constexpr * in C++11 compilation. It is used here on constructors that initialize their * member unions. */ #if __cpp_constexpr >= 201304L #define NVTX3_CONSTEXPR_IF_CPP14 constexpr #else #define NVTX3_CONSTEXPR_IF_CPP14 #endif /* Use a macro for static asserts, which defaults to static_assert, but that * testing tools can replace with a logging function. For example: * #define NVTX3_STATIC_ASSERT(c, m) \ * do { if (!(c)) printf("static_assert would fail: %s\n", m); } while (0) */ #if !defined(NVTX3_STATIC_ASSERT) #define NVTX3_STATIC_ASSERT(condition, message) static_assert(condition, message); #define NVTX3_STATIC_ASSERT_DEFINED_HERE #endif /* Implementation sections, enclosed in guard macros for each minor version */ #ifndef NVTX3_CPP_DEFINITIONS_V1_0 #define NVTX3_CPP_DEFINITIONS_V1_0 #include "nvToolsExt.h" #include "nvToolsExtPayload.h" #include #include #include #include #include namespace nvtx3 { NVTX3_INLINE_IF_REQUESTED namespace NVTX3_VERSION_NAMESPACE { namespace detail { template struct always_false : std::false_type {}; template struct has_name : std::false_type {}; template struct has_name : std::true_type {}; template struct has_id : std::false_type {}; template struct has_id : std::true_type {}; template struct has_message : std::false_type {}; template struct has_message : std::true_type {}; template struct is_c_string : std::false_type {}; template struct is_c_string::value || std::is_convertible::value >::type> : std::true_type {}; template using is_uint32 = std::is_same::type, uint32_t>; } // namespace detail /** * @brief `domain`s allow for grouping NVTX events into a single scope to * differentiate them from events in other `domain`s. * * By default, all NVTX constructs are placed in the "global" NVTX domain. * * A custom `domain` may be used in order to differentiate a library's or * application's NVTX events from other events. * * `domain`s are expected to be long-lived and unique to a library or * application. As such, it is assumed a domain's name is known at compile * time. 
Therefore, all NVTX constructs that can be associated with a domain * require the domain to be specified via a *type* `D` passed as an * explicit template parameter. * * The type `domain::global` may be used to indicate that the global NVTX * domain should be used. * * None of the C++ NVTX constructs require the user to manually construct a * `domain` object. Instead, if a custom domain is desired, the user is * expected to define a type `D` that contains a member * `D::name` which resolves to either a `char const*` or `wchar_t * const*`. The value of `D::name` is used to name and uniquely * identify the custom domain. * * Upon the first use of an NVTX construct associated with the type * `D`, the "construct on first use" pattern is used to construct a * function local static `domain` object. All future NVTX constructs * associated with `D` will use a reference to the previously * constructed `domain` object. See `domain::get`. * * Example: * \code{.cpp} * // The type `my_domain` defines a `name` member used to name and identify * // the `domain` object identified by `my_domain`. * struct my_domain{ static constexpr char const* name{"my_domain"}; }; * * // The NVTX range `r` will be grouped with all other NVTX constructs * // associated with `my_domain`. * nvtx3::scoped_range_in r{}; * * // An alias can be created for a `scoped_range_in` in the custom domain * using my_scoped_range = nvtx3::scoped_range_in; * my_scoped_range my_range{}; * * // `domain::global` indicates that the global NVTX domain is used * nvtx3::scoped_range_in r2{}; * * // For convenience, `nvtx3::scoped_range` is an alias for a range in the * // global domain * nvtx3::scoped_range r3{}; * \endcode */ class domain { public: domain(domain const&) = delete; domain& operator=(domain const&) = delete; domain(domain&&) = delete; domain& operator=(domain&&) = delete; /** * @brief Tag type for the "global" NVTX domain. * * This type may be passed as a template argument to any function/class * expecting a type to identify a domain to indicate that the global domain * should be used. * * All NVTX events in the global domain across all libraries and * applications will be grouped together. * */ struct global { }; #if NVTX3_USE_CHECKED_OVERLOADS_FOR_GET /** * @brief Returns reference to an instance of a function local static * `domain` object. * * Uses the "construct on first use" idiom to safely ensure the `domain` * object is initialized exactly once upon first invocation of * `domain::get()`. All following invocations will return a * reference to the previously constructed `domain` object. See * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use * * None of the constructs in this header require the user to directly invoke * `domain::get`. It is automatically invoked when constructing objects like * a `scoped_range_in` or `category`. Advanced users may wish to use * `domain::get` for the convenience of the "construct on first use" idiom * when using domains with their own use of the NVTX C API. * * This function is threadsafe as of C++11. If two or more threads call * `domain::get` concurrently, exactly one of them is guaranteed * to construct the `domain` object and the other(s) will receive a * reference to the object after it is fully constructed. * * The domain's name is specified via the type `D` pass as an * explicit template parameter. `D` is required to contain a * member `D::name` that resolves to either a `char const*` or * `wchar_t const*`. 
The value of `D::name` is used to name and * uniquely identify the `domain`. * * Example: * \code{.cpp} * // The type `my_domain` defines a `name` member used to name and identify * // the `domain` object identified by `my_domain`. * struct my_domain{ static constexpr char const* name{"my domain"}; }; * * auto& D1 = domain::get(); // First invocation constructs a * // `domain` with the name "my domain" * * auto& D2 = domain::get(); // Quickly returns reference to * // previously constructed `domain`. * \endcode * * @tparam D Type that contains a `D::name` member used to * name the `domain` object. * @return Reference to the `domain` corresponding to the type `D`. */ template ::value , int>::type = 0> static domain const& get() noexcept { static domain const d(D::name); return d; } /** * @brief Overload of `domain::get` to provide a clear compile error when * `D` has a `name` member that is not directly convertible to either * `char const*` or `wchar_t const*`. */ template ::value , int>::type = 0> static domain const& get() noexcept { NVTX3_STATIC_ASSERT(detail::always_false::value, "Type used to identify an NVTX domain must contain a static constexpr member " "called 'name' of type const char* or const wchar_t* -- 'name' member is not " "convertible to either of those types"); static domain const unused; return unused; // Function must compile for static_assert to be triggered } /** * @brief Overload of `domain::get` to provide a clear compile error when * `D` does not have a `name` member. */ template ::value , int>::type = 0> static domain const& get() noexcept { NVTX3_STATIC_ASSERT(detail::always_false::value, "Type used to identify an NVTX domain must contain a static constexpr member " "called 'name' of type const char* or const wchar_t* -- 'name' member is missing"); static domain const unused; return unused; // Function must compile for static_assert to be triggered } #else template static domain const& get() noexcept { static domain const d(D::name); return d; } #endif /** * @brief Conversion operator to `nvtxDomainHandle_t`. * * Allows transparently passing a domain object into an API expecting a * native `nvtxDomainHandle_t` object. */ operator nvtxDomainHandle_t() const noexcept { return _domain; } private: /** * @brief Construct a new domain with the specified `name`. * * This constructor is private as it is intended that `domain` objects only * be created through the `domain::get` function. * * @param name A unique name identifying the domain */ explicit domain(char const* name) noexcept : _domain{nvtxDomainCreateA(name)} {} /** * @brief Construct a new domain with the specified `name`. * * This constructor is private as it is intended that `domain` objects only * be created through the `domain::get` function. * * @param name A unique name identifying the domain */ explicit domain(wchar_t const* name) noexcept : _domain{nvtxDomainCreateW(name)} {} /** * @brief Construct a new domain with the specified `name`. * * This constructor is private as it is intended that `domain` objects only * be created through the `domain::get` function. * * @param name A unique name identifying the domain */ explicit domain(std::string const& name) noexcept : domain{name.c_str()} {} /** * @brief Construct a new domain with the specified `name`. * * This constructor is private as it is intended that `domain` objects only * be created through the `domain::get` function. 
* * @param name A unique name identifying the domain */ explicit domain(std::wstring const& name) noexcept : domain{name.c_str()} {} /** * @brief Default constructor creates a `domain` representing the * "global" NVTX domain. * * All events not associated with a custom `domain` are grouped in the * "global" NVTX domain. * */ domain() noexcept {} /** * @brief Intentionally avoid calling nvtxDomainDestroy on the `domain` object. * * No currently-available tools attempt to free domain resources when the * nvtxDomainDestroy function is called, due to the thread-safety and * efficiency challenges of freeing thread-local storage for other threads. * Since libraries may be disallowed from introducing static destructors, * and destroying the domain is likely to have no effect, the destructor * for `domain` intentionally chooses to not destroy the domain. * * In a situation where domain destruction is necessary, either manually * call nvtxDomainDestroy on the domain's handle, or make a class that * derives from `domain` and calls nvtxDomainDestroy in its destructor. */ ~domain() = default; private: nvtxDomainHandle_t const _domain{}; ///< The `domain`s NVTX handle }; /** * @brief Returns reference to the `domain` object that represents the global * NVTX domain. * * This specialization for `domain::global` returns a default constructed, * `domain` object for use when the "global" domain is desired. * * All NVTX events in the global domain across all libraries and applications * will be grouped together. * * @return Reference to the `domain` corresponding to the global NVTX domain. * */ template <> inline domain const& domain::get() noexcept { static domain const d{}; return d; } /** * @brief Indicates the values of the red, green, and blue color channels for * an RGB color to use as an event attribute (assumes no transparency). * */ struct rgb { /// Type used for component values using component_type = uint8_t; /** * @brief Construct a rgb with red, green, and blue channels * specified by `red_`, `green_`, and `blue_`, respectively. * * Valid values are in the range `[0,255]`. * * @param red_ Value of the red channel * @param green_ Value of the green channel * @param blue_ Value of the blue channel */ constexpr rgb( component_type red_, component_type green_, component_type blue_) noexcept : red{red_}, green{green_}, blue{blue_} { } component_type red{}; ///< Red channel value component_type green{}; ///< Green channel value component_type blue{}; ///< Blue channel value }; /** * @brief Indicates the value of the alpha, red, green, and blue color * channels for an ARGB color to use as an event attribute. * */ struct argb final : rgb { /** * @brief Construct an argb with alpha, red, green, and blue channels * specified by `alpha_`, `red_`, `green_`, and `blue_`, respectively. * * Valid values are in the range `[0,255]`. * * @param alpha_ Value of the alpha channel (opacity) * @param red_ Value of the red channel * @param green_ Value of the green channel * @param blue_ Value of the blue channel * */ constexpr argb( component_type alpha_, component_type red_, component_type green_, component_type blue_) noexcept : rgb{red_, green_, blue_}, alpha{alpha_} { } component_type alpha{}; ///< Alpha channel value }; /** * @brief Represents a custom color that can be associated with an NVTX event * via it's `event_attributes`. * * Specifying colors for NVTX events is a convenient way to visually * differentiate among different events in a visualization tool such as Nsight * Systems. 
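 *
 * A brief usage sketch (the particular color values are illustrative):
 * \code{.cpp}
 * // Opaque green from red/green/blue components
 * nvtx3::color c1{nvtx3::rgb{0, 255, 0}};
 *
 * // Half-transparent red from alpha/red/green/blue components
 * nvtx3::color c2{nvtx3::argb{128, 255, 0, 0}};
 *
 * // Equivalent to `c1`, constructed directly from a 0xAARRGGBB hex code
 * nvtx3::color c3{0xFF00FF00};
 *
 * // Colors are typically passed to an `event_attributes`
 * nvtx3::event_attributes attr{"message", c1};
 * \endcode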
* */ class color { public: /// Type used for the color's value using value_type = uint32_t; /** * @brief Constructs a `color` using the value provided by `hex_code`. * * `hex_code` is expected to be a 4 byte argb hex code. * * The most significant byte indicates the value of the alpha channel * (opacity) (0-255) * * The next byte indicates the value of the red channel (0-255) * * The next byte indicates the value of the green channel (0-255) * * The least significant byte indicates the value of the blue channel * (0-255) * * @param hex_code The hex code used to construct the `color` */ constexpr explicit color(value_type hex_code) noexcept : _value{hex_code} {} /** * @brief Construct a `color` using the alpha, red, green, blue components * in `argb`. * * @param argb The alpha, red, green, blue components of the desired `color` */ constexpr color(argb argb_) noexcept : color{from_bytes_msb_to_lsb(argb_.alpha, argb_.red, argb_.green, argb_.blue)} { } /** * @brief Construct a `color` using the red, green, blue components in * `rgb`. * * Uses maximum value for the alpha channel (opacity) of the `color`. * * @param rgb The red, green, blue components of the desired `color` */ constexpr color(rgb rgb_) noexcept : color{from_bytes_msb_to_lsb(0xFF, rgb_.red, rgb_.green, rgb_.blue)} { } /** * @brief Returns the `color`s argb hex code * */ constexpr value_type get_value() const noexcept { return _value; } /** * @brief Return the NVTX color type of the color. * */ constexpr nvtxColorType_t get_type() const noexcept { return _type; } color() = delete; ~color() = default; color(color const&) = default; color& operator=(color const&) = default; color(color&&) = default; color& operator=(color&&) = default; private: /** * @brief Constructs an unsigned, 4B integer from the component bytes in * most to least significant byte order. * */ constexpr static value_type from_bytes_msb_to_lsb( uint8_t byte3, uint8_t byte2, uint8_t byte1, uint8_t byte0) noexcept { return uint32_t{byte3} << 24 | uint32_t{byte2} << 16 | uint32_t{byte1} << 8 | uint32_t{byte0}; } value_type _value{}; ///< color's argb color code nvtxColorType_t _type{NVTX_COLOR_ARGB}; ///< NVTX color type code }; /** * @brief Object for intra-domain grouping of NVTX events. * * A `category` is simply an integer id that allows for fine-grain grouping of * NVTX events. For example, one might use separate categories for IO, memory * allocation, compute, etc. * * Example: * \code{.cpp} * nvtx3::category cat1{1}; * * // Range `r1` belongs to the category identified by the value `1`. * nvtx3::scoped_range r1{cat1}; * * // Range `r2` belongs to the same category as `r1` * nvtx3::scoped_range r2{nvtx3::category{1}}; * \endcode * * To associate a name string with a category id, see `named_category`. * */ class category { public: /// Type used for `category`s integer id. using id_type = uint32_t; /** * @brief Construct a `category` with the specified `id`. * * The `category` will be unnamed and identified only by its `id` value. * * All `category`s in a domain sharing the same `id` are equivalent. * * @param[in] id The `category`'s identifying value */ constexpr explicit category(id_type id) noexcept : id_{id} {} /** * @brief Returns the id of the category. 
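 *
 * @return The `category`'s identifying value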
* */ constexpr id_type get_id() const noexcept { return id_; } category() = delete; ~category() = default; category(category const&) = default; category& operator=(category const&) = default; category(category&&) = default; category& operator=(category&&) = default; private: id_type id_{}; ///< category's unique identifier }; /** * @brief A `category` with an associated name string. * * Associates a `name` string with a category `id` to help differentiate among * categories. * * For any given category id `Id`, a `named_category(Id, "name")` should only * be constructed once and reused throughout an application. This can be done * by either explicitly creating static `named_category` objects, or using the * `named_category::get` construct on first use helper (recommended). * * Creating two or more `named_category` objects with the same value for `id` * in the same domain results in undefined behavior. * * Similarly, behavior is undefined when a `named_category` and `category` * share the same value of `id`. * * Example: * \code{.cpp} * // Explicitly constructed, static `named_category` in global domain: * static nvtx3::named_category static_category{42, "my category"}; * * // Range `r` associated with category id `42` * nvtx3::scoped_range r{static_category}; * * // OR use construct on first use: * * // Define a type with `name` and `id` members * struct my_category { * static constexpr char const* name{"my category"}; // category name * static constexpr uint32_t id{42}; // category id * }; * * // Use construct on first use to name the category id `42` * // with name "my category" * auto& cat = named_category_in::get(); * * // Range `r` associated with category id `42` * nvtx3::scoped_range r{cat}; * \endcode * * `named_category_in`'s association of a name to a category id is local to * the domain specified by the type `D`. An id may have a different name in * another domain. * * @tparam D Type containing `name` member used to identify the `domain` to * which the `named_category_in` belongs. Else, `domain::global` to indicate * that the global NVTX domain should be used. */ template class named_category_in final : public category { public: #if NVTX3_USE_CHECKED_OVERLOADS_FOR_GET /** * @brief Returns a global instance of a `named_category_in` as a * function-local static. * * Creates a `named_category_in` with name and id specified by the contents * of a type `C`. `C::name` determines the name and `C::id` determines the * category id. * * This function is useful for constructing a named `category` exactly once * and reusing the same instance throughout an application. * * Example: * \code{.cpp} * // Define a type with `name` and `id` members * struct my_category { * static constexpr char const* name{"my category"}; // category name * static constexpr uint32_t id{42}; // category id * }; * * // Use construct on first use to name the category id `42` * // with name "my category" * auto& cat = named_category_in::get(); * * // Range `r` associated with category id `42` * nvtx3::scoped_range r{cat}; * \endcode * * Uses the "construct on first use" idiom to safely ensure the `category` * object is initialized exactly once. See * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use * * @tparam C Type containing a member `C::name` that resolves to either a * `char const*` or `wchar_t const*` and `C::id`. 
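 * @return Reference to the `named_category_in` corresponding to the type `C`.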
*/ template ::value && detail::is_uint32::value , int>::type = 0> static named_category_in const& get() noexcept { static named_category_in const cat(C::id, C::name); return cat; } /** * @brief Overload of `named_category_in::get` to provide a clear compile error * when `C` has the required `name` and `id` members, but they are not the * required types. `name` must be directly convertible to `char const*` or * `wchar_t const*`, and `id` must be `uint32_t`. */ template ::value || !detail::is_uint32::value , int>::type = 0> static named_category_in const& get() noexcept { NVTX3_STATIC_ASSERT(detail::is_c_string::value, "Type used to name an NVTX category must contain a static constexpr member " "called 'name' of type const char* or const wchar_t* -- 'name' member is not " "convertible to either of those types"); NVTX3_STATIC_ASSERT(detail::is_uint32::value, "Type used to name an NVTX category must contain a static constexpr member " "called 'id' of type uint32_t -- 'id' member is the wrong type"); static named_category_in const unused; return unused; // Function must compile for static_assert to be triggered } /** * @brief Overload of `named_category_in::get` to provide a clear compile error * when `C` does not have the required `name` and `id` members. */ template ::value || !detail::has_id::value , int>::type = 0> static named_category_in const& get() noexcept { NVTX3_STATIC_ASSERT(detail::has_name::value, "Type used to name an NVTX category must contain a static constexpr member " "called 'name' of type const char* or const wchar_t* -- 'name' member is missing"); NVTX3_STATIC_ASSERT(detail::has_id::value, "Type used to name an NVTX category must contain a static constexpr member " "called 'id' of type uint32_t -- 'id' member is missing"); static named_category_in const unused; return unused; // Function must compile for static_assert to be triggered } #else template static named_category_in const& get() noexcept { static named_category_in const cat(C::id, C::name); return cat; } #endif private: // Default constructor is only used internally for static_assert(false) cases. named_category_in() noexcept : category{0} {} public: /** * @brief Construct a `named_category_in` with the specified `id` and `name`. * * The name `name` will be registered with `id`. * * Every unique value of `id` should only be named once. * * @param[in] id The category id to name * @param[in] name The name to associated with `id` */ named_category_in(id_type id, char const* name) noexcept : category{id} { #ifndef NVTX_DISABLE nvtxDomainNameCategoryA(domain::get(), get_id(), name); #else (void)id; (void)name; #endif }; /** * @brief Construct a `named_category_in` with the specified `id` and `name`. * * The name `name` will be registered with `id`. * * Every unique value of `id` should only be named once. * * @param[in] id The category id to name * @param[in] name The name to associated with `id` */ named_category_in(id_type id, wchar_t const* name) noexcept : category{id} { #ifndef NVTX_DISABLE nvtxDomainNameCategoryW(domain::get(), get_id(), name); #else (void)id; (void)name; #endif }; }; /** * @brief Alias for a `named_category_in` in the global NVTX domain. * */ using named_category = named_category_in; /** * @brief A message registered with NVTX. * * Normally, associating a `message` with an NVTX event requires copying the * contents of the message string. This may cause non-trivial overhead in * highly performance sensitive regions of code. 
* * message registration is an optimization to lower the overhead of * associating a message with an NVTX event. Registering a message yields a * handle that is inexpensive to copy that may be used in place of a message * string. * * A particular message should only be registered once and the handle * reused throughout the rest of the application. This can be done by either * explicitly creating static `registered_string_in` objects, or using the * `registered_string_in::get` construct on first use helper (recommended). * * Example: * \code{.cpp} * // Explicitly constructed, static `registered_string` in my_domain: * static registered_string_in static_message{"message"}; * * // "message" is associated with the range `r` * nvtx3::scoped_range r{static_message}; * * // Or use construct on first use: * * // Define a type with a `message` member that defines the contents of the * // registered string * struct my_message{ static constexpr char const* message{ "my message" }; }; * * // Uses construct on first use to register the contents of * // `my_message::message` * auto& msg = registered_string_in::get(); * * // "my message" is associated with the range `r` * nvtx3::scoped_range r{msg}; * \endcode * * `registered_string_in`s are local to a particular domain specified via * the type `D`. * * @tparam D Type containing `name` member used to identify the `domain` to * which the `registered_string_in` belongs. Else, `domain::global` to indicate * that the global NVTX domain should be used. */ template class registered_string_in { public: #if NVTX3_USE_CHECKED_OVERLOADS_FOR_GET /** * @brief Returns a global instance of a `registered_string_in` as a function * local static. * * Provides a convenient way to register a message with NVTX without having * to explicitly register the message. * * Upon first invocation, constructs a `registered_string_in` whose contents * are specified by `message::message`. * * All future invocations will return a reference to the object constructed * in the first invocation. * * Example: * \code{.cpp} * // Define a type with a `message` member that defines the contents of the * // registered string * struct my_message{ static constexpr char const* message{ "my message" }; * }; * * // Uses construct on first use to register the contents of * // `my_message::message` * auto& msg = registered_string_in::get(); * * // "my message" is associated with the range `r` * nvtx3::scoped_range r{msg}; * \endcode * * @tparam M Type required to contain a member `M::message` that * resolves to either a `char const*` or `wchar_t const*` used as the * registered string's contents. * @return Reference to a `registered_string_in` associated with the type `M`. */ template ::value , int>::type = 0> static registered_string_in const& get() noexcept { static registered_string_in const regstr(M::message); return regstr; } /** * @brief Overload of `registered_string_in::get` to provide a clear compile error * when `M` has a `message` member that is not directly convertible to either * `char const*` or `wchar_t const*`. 
*/ template ::value , int>::type = 0> static registered_string_in const& get() noexcept { NVTX3_STATIC_ASSERT(detail::always_false::value, "Type used to register an NVTX string must contain a static constexpr member " "called 'message' of type const char* or const wchar_t* -- 'message' member is " "not convertible to either of those types"); static registered_string_in const unused; return unused; // Function must compile for static_assert to be triggered } /** * @brief Overload of `registered_string_in::get` to provide a clear compile error when * `M` does not have a `message` member. */ template ::value , int>::type = 0> static registered_string_in const& get() noexcept { NVTX3_STATIC_ASSERT(detail::always_false::value, "Type used to register an NVTX string must contain a static constexpr member " "called 'message' of type const char* or const wchar_t* -- 'message' member " "is missing"); static registered_string_in const unused; return unused; // Function must compile for static_assert to be triggered } #else template static registered_string_in const& get() noexcept { static registered_string_in const regstr(M::message); return regstr; } #endif /** * @brief Constructs a `registered_string_in` from the specified `msg` string. * * Registers `msg` with NVTX and associates a handle with the registered * message. * * A particular message should should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message */ explicit registered_string_in(char const* msg) noexcept : handle_{nvtxDomainRegisterStringA(domain::get(), msg)} { } /** * @brief Constructs a `registered_string_in` from the specified `msg` string. * * Registers `msg` with NVTX and associates a handle with the registered * message. * * A particular message should should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message */ explicit registered_string_in(std::string const& msg) noexcept : registered_string_in{msg.c_str()} {} /** * @brief Constructs a `registered_string_in` from the specified `msg` string. * * Registers `msg` with NVTX and associates a handle with the registered * message. * * A particular message should should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message */ explicit registered_string_in(wchar_t const* msg) noexcept : handle_{nvtxDomainRegisterStringW(domain::get(), msg)} { } /** * @brief Constructs a `registered_string_in` from the specified `msg` string. * * Registers `msg` with NVTX and associates a handle with the registered * message. * * A particular message should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message */ explicit registered_string_in(std::wstring const& msg) noexcept : registered_string_in{msg.c_str()} {} /** * @brief Returns the registered string's handle * */ nvtxStringHandle_t get_handle() const noexcept { return handle_; } private: // Default constructor is only used internally for static_assert(false) cases. 
registered_string_in() noexcept {}; public: ~registered_string_in() = default; registered_string_in(registered_string_in const&) = default; registered_string_in& operator=(registered_string_in const&) = default; registered_string_in(registered_string_in&&) = default; registered_string_in& operator=(registered_string_in&&) = default; private: nvtxStringHandle_t handle_{}; ///< The handle returned from ///< registering the message with NVTX }; /** * @brief Alias for a `registered_string_in` in the global NVTX domain. * */ using registered_string = registered_string_in; /** * @brief Allows associating a message string with an NVTX event via * its `EventAttribute`s. * * Associating a `message` with an NVTX event through its `event_attributes` * allows for naming events to easily differentiate them from other events. * * Every time an NVTX event is created with an associated `message`, the * contents of the message string must be copied. This may cause non-trivial * overhead in highly performance sensitive sections of code. Use of a * `nvtx3::registered_string` is recommended in these situations. * * Example: * \code{.cpp} * // Creates an `event_attributes` with message "message 0" * nvtx3::event_attributes attr0{nvtx3::message{"message 0"}}; * * // `range0` contains message "message 0" * nvtx3::scoped_range range0{attr0}; * * // `std::string` and string literals are implicitly assumed to be * // the contents of an `nvtx3::message` * // Creates an `event_attributes` with message "message 1" * nvtx3::event_attributes attr1{"message 1"}; * * // `range1` contains message "message 1" * nvtx3::scoped_range range1{attr1}; * * // `range2` contains message "message 2" * nvtx3::scoped_range range2{nvtx3::Mesage{"message 2"}}; * * // `std::string` and string literals are implicitly assumed to be * // the contents of an `nvtx3::message` * // `range3` contains message "message 3" * nvtx3::scoped_range range3{"message 3"}; * \endcode */ class message { public: using value_type = nvtxMessageValue_t; /** * @brief Construct a `message` whose contents are specified by `msg`. * * @param msg The contents of the message */ NVTX3_CONSTEXPR_IF_CPP14 message(char const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_ASCII} { value_.ascii = msg; } /** * @brief Construct a `message` whose contents are specified by `msg`. * * @param msg The contents of the message */ message(std::string const& msg) noexcept : message{msg.c_str()} {} /** * @brief Disallow construction for `std::string` r-value * * `message` is a non-owning type and therefore cannot take ownership of an * r-value. Therefore, constructing from an r-value is disallowed to prevent * a dangling pointer. * */ message(std::string&&) = delete; /** * @brief Construct a `message` whose contents are specified by `msg`. * * @param msg The contents of the message */ NVTX3_CONSTEXPR_IF_CPP14 message(wchar_t const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_UNICODE} { value_.unicode = msg; } /** * @brief Construct a `message` whose contents are specified by `msg`. * * @param msg The contents of the message */ message(std::wstring const& msg) noexcept : message{msg.c_str()} {} /** * @brief Disallow construction for `std::wstring` r-value * * `message` is a non-owning type and therefore cannot take ownership of an * r-value. Therefore, constructing from an r-value is disallowed to prevent * a dangling pointer. * */ message(std::wstring&&) = delete; /** * @brief Construct a `message` from a `registered_string_in`. 
* * @tparam D Type containing `name` member used to identify the `domain` * to which the `registered_string_in` belongs. Else, `domain::global` to * indicate that the global NVTX domain should be used. * @param msg The message that has already been registered with NVTX. */ template NVTX3_CONSTEXPR_IF_CPP14 message(registered_string_in const& msg) noexcept : type_{NVTX_MESSAGE_TYPE_REGISTERED} { value_.registered = msg.get_handle(); } /** * @brief Construct a `message` from NVTX C API type and value. * * @param type nvtxMessageType_t enum value indicating type of the payload * @param value nvtxMessageValue_t union containing message */ constexpr message( nvtxMessageType_t const& type, nvtxMessageValue_t const& value) noexcept : type_{type}, value_(value) { } /** * @brief Construct a `message` from NVTX C API registered string handle. * * @param handle nvtxStringHandle_t value of registered string handle */ NVTX3_CONSTEXPR_IF_CPP14 message(nvtxStringHandle_t handle) noexcept : type_{NVTX_MESSAGE_TYPE_REGISTERED} { value_.registered = handle; } /** * @brief Return the union holding the value of the message. * */ constexpr value_type get_value() const noexcept { return value_; } /** * @brief Return the type information about the value the union holds. * */ constexpr nvtxMessageType_t get_type() const noexcept { return type_; } private: nvtxMessageType_t type_{}; ///< message type nvtxMessageValue_t value_{}; ///< message contents }; /** * @brief A numerical value that can be associated with an NVTX event via * its `event_attributes`. * * Example: * \code{.cpp} * // Constructs a payload from the int32_t value 42 * nvtx3:: event_attributes attr{nvtx3::payload{42}}; * * // `range0` will have an int32_t payload of 42 * nvtx3::scoped_range range0{attr}; * * // range1 has double payload of 3.14 * nvtx3::scoped_range range1{nvtx3::payload{3.14}}; * \endcode */ class payload { public: using value_type = typename nvtxEventAttributes_v2::payload_t; /** * @brief Construct a `payload` from a signed, 8 byte integer. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(int64_t value) noexcept : type_{NVTX_PAYLOAD_TYPE_INT64}, value_{} { value_.llValue = value; } /** * @brief Construct a `payload` from a signed, 4 byte integer. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(int32_t value) noexcept : type_{NVTX_PAYLOAD_TYPE_INT32}, value_{} { value_.iValue = value; } /** * @brief Construct a `payload` from an unsigned, 8 byte integer. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(uint64_t value) noexcept : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT64}, value_{} { value_.ullValue = value; } /** * @brief Construct a `payload` from an unsigned, 4 byte integer. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(uint32_t value) noexcept : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT32}, value_{} { value_.uiValue = value; } /** * @brief Construct a `payload` from a single-precision floating point * value. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(float value) noexcept : type_{NVTX_PAYLOAD_TYPE_FLOAT}, value_{} { value_.fValue = value; } /** * @brief Construct a `payload` from a double-precision floating point * value. 
* * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(double value) noexcept : type_{NVTX_PAYLOAD_TYPE_DOUBLE}, value_{} { value_.dValue = value; } /** * @brief Construct a `payload` from NVTX C API type and value. * * @param type nvtxPayloadType_t enum value indicating type of the payload * @param value nvtxEventAttributes_t::payload_t union containing payload */ constexpr payload( nvtxPayloadType_t const& type, value_type const& value) noexcept : type_{type}, value_(value) { } /** * @brief Return the union holding the value of the payload * */ constexpr value_type get_value() const noexcept { return value_; } /** * @brief Return the information about the type the union holds. * */ constexpr nvtxPayloadType_t get_type() const noexcept { return type_; } private: nvtxPayloadType_t type_; ///< Type of the payload value value_type value_; ///< Union holding the payload value }; /** * @brief Describes the attributes of a NVTX event. * * NVTX events can be customized via four "attributes": * * - color: color used to visualize the event in tools such as Nsight * Systems. See `color`. * - message: Custom message string. See `message`. * - payload: User-defined numerical value. See `payload`. * - category: Intra-domain grouping. See `category`. * * These component attributes are specified via an `event_attributes` object. * See `nvtx3::color`, `nvtx3::message`, `nvtx3::payload`, and * `nvtx3::category` for how these individual attributes are constructed. * * While it is possible to specify all four attributes, it is common to want * to only specify a subset of attributes and use default values for the * others. For convenience, `event_attributes` can be constructed from any * number of attribute components in any order. * * Example: * \code{.cpp} * // Set message, same as using nvtx3::message{"message"} * event_attributes attr{"message"}; * * // Set message and color * event_attributes attr{"message", nvtx3::rgb{127, 255, 0}}; * * // Set message, color, payload, category * event_attributes attr{"message", * nvtx3::rgb{127, 255, 0}, * nvtx3::payload{42}, * nvtx3::category{1}}; * * // Same as above -- can use any order of arguments * event_attributes attr{nvtx3::payload{42}, * nvtx3::category{1}, * "message", * nvtx3::rgb{127, 255, 0}}; * * // Multiple arguments of the same type are allowed, but only the first is * // used -- in this example, payload is set to 42: * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; * * // Range `r` will be customized according the attributes in `attr` * nvtx3::scoped_range r{attr}; * * // For convenience, `event_attributes` constructor arguments may be passed * // to the `scoped_range_in` contructor -- they are forwarded to the * // `event_attributes` constructor * nvtx3::scoped_range r{nvtx3::payload{42}, nvtx3::category{1}, "message"}; * * // Using the nvtx3 namespace in a local scope makes the syntax more succinct: * using namespace nvtx3; * scoped_range r{payload{42}, category{1}, "message"}; * \endcode * */ class event_attributes { public: using value_type = nvtxEventAttributes_t; /** * @brief Default constructor creates an `event_attributes` with no * category, color, payload, nor message. 
*/ constexpr event_attributes() noexcept : attributes_{ NVTX_VERSION, // version sizeof(nvtxEventAttributes_t), // size 0, // category NVTX_COLOR_UNKNOWN, // color type 0, // color value NVTX_PAYLOAD_UNKNOWN, // payload type 0, // reserved 4B {0}, // payload value (union) NVTX_MESSAGE_UNKNOWN, // message type {0} // message value (union) } { } /** * @brief Variadic constructor where the first argument is a `category`. * * Sets the value of the `EventAttribute`s category based on `c` and * forwards the remaining variadic parameter pack to the next constructor. * */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(category const& c, Args const&... args) noexcept : event_attributes(args...) { attributes_.category = c.get_id(); } /** * @brief Variadic constructor where the first argument is a `color`. * * Sets the value of the `EventAttribute`s color based on `c` and forwards * the remaining variadic parameter pack to the next constructor. * */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(color const& c, Args const&... args) noexcept : event_attributes(args...) { attributes_.color = c.get_value(); attributes_.colorType = c.get_type(); } /** * @brief Variadic constructor where the first argument is a `payload`. * * Sets the value of the `EventAttribute`s payload based on `p` and forwards * the remaining variadic parameter pack to the next constructor. * */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(payload const& p, Args const&... args) noexcept : event_attributes(args...) { attributes_.payload = p.get_value(); attributes_.payloadType = p.get_type(); } /** * @brief Variadic constructor where the first argument is a `message`. * * Sets the value of the `EventAttribute`s message based on `m` and forwards * the remaining variadic parameter pack to the next constructor. * */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(message const& m, Args const&... args) noexcept : event_attributes(args...) { attributes_.message = m.get_value(); attributes_.messageType = m.get_type(); } /** * @brief Variadic constructor where the first argument is an extended payload. * * Sets the `ullValue` of the `EventAttribute`s payload and forwards * the remaining variadic parameter pack to the next constructor. * */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(nvtxPayloadData_t const* p, Args const&... args) noexcept : event_attributes(args...) { attributes_.payloadType = NVTX_PAYLOAD_TYPE_EXT; attributes_.reserved0 = 1; // NCCL uses only a single binary payload per event. attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p); } ~event_attributes() = default; event_attributes(event_attributes const&) = default; event_attributes& operator=(event_attributes const&) = default; event_attributes(event_attributes&&) = default; event_attributes& operator=(event_attributes&&) = default; /** * @brief Get raw pointer to underlying NVTX attributes object. * */ constexpr value_type const* get() const noexcept { return &attributes_; } private: value_type attributes_{}; ///< The NVTX attributes structure }; /** * @brief A RAII object for creating a NVTX range local to a thread within a * domain. * * When constructed, begins a nested NVTX range on the calling thread in the * specified domain. Upon destruction, ends the NVTX range. * * Behavior is undefined if a `scoped_range_in` object is * created/destroyed on different threads. * * `scoped_range_in` is neither moveable nor copyable. * * `scoped_range_in`s may be nested within other ranges. 
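 *
 * A short illustrative sketch: because copy/move construction and
 * `operator new` are deleted (see below), a `scoped_range_in` can only be
 * created as an automatic (stack) object, which keeps every push paired with
 * a pop on the same thread:
 *
 * \code{.cpp}
 * nvtx3::scoped_range ok{"on the stack"};        // OK
 * // auto bad = new nvtx3::scoped_range{"heap"}; // error: operator new is deleted
 * // nvtx3::scoped_range copy{ok};               // error: copy constructor is deleted
 * \endcode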
* * The domain of the range is specified by the template type parameter `D`. * By default, the `domain::global` is used, which scopes the range to the * global NVTX domain. The convenience alias `scoped_range` is provided for * ranges scoped to the global domain. * * A custom domain can be defined by creating a type, `D`, with a static * member `D::name` whose value is used to name the domain associated with * `D`. `D::name` must resolve to either `char const*` or `wchar_t const*` * * Example: * \code{.cpp} * // Define a type `my_domain` with a member `name` used to name the domain * // associated with the type `my_domain`. * struct my_domain{ * static constexpr char const* name{"my domain"}; * }; * \endcode * * Usage: * \code{.cpp} * nvtx3::scoped_range_in r1{"range 1"}; // Range in my domain * * // Three equivalent ways to make a range in the global domain: * nvtx3::scoped_range_in r2{"range 2"}; * nvtx3::scoped_range_in<> r3{"range 3"}; * nvtx3::scoped_range r4{"range 4"}; * * // Create an alias to succinctly make ranges in my domain: * using my_scoped_range = nvtx3::scoped_range_in; * * my_scoped_range r3{"range 3"}; * \endcode */ template class scoped_range_in { public: /** * @brief Construct a `scoped_range_in` with the specified * `event_attributes` * * Example: * \code{cpp} * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; * nvtx3::scoped_range range{attr}; // Creates a range with message contents * // "msg" and green color * \endcode * * @param[in] attr `event_attributes` that describes the desired attributes * of the range. */ explicit scoped_range_in(event_attributes const& attr) noexcept { #ifndef NVTX_DISABLE nvtxDomainRangePushEx(domain::get(), attr.get()); #else (void)attr; #endif } /** * @brief Constructs a `scoped_range_in` from the constructor arguments * of an `event_attributes`. * * Forwards the arguments `args...` to construct an * `event_attributes` object. The `event_attributes` object is then * associated with the `scoped_range_in`. * * For more detail, see `event_attributes` documentation. * * Example: * \code{cpp} * // Creates a range with message "message" and green color * nvtx3::scoped_range r{"message", nvtx3::rgb{127,255,0}}; * \endcode * * @param[in] args Arguments to used to construct an `event_attributes` associated with this * range. * */ template explicit scoped_range_in(Args const&... args) noexcept : scoped_range_in{event_attributes{args...}} { } /** * @brief Default constructor creates a `scoped_range_in` with no * message, color, payload, nor category. * */ scoped_range_in() noexcept : scoped_range_in{event_attributes{}} {} /** * @brief Delete `operator new` to disallow heap allocated objects. * * `scoped_range_in` must follow RAII semantics to guarantee proper push/pop semantics. * */ void* operator new(std::size_t) = delete; scoped_range_in(scoped_range_in const&) = delete; scoped_range_in& operator=(scoped_range_in const&) = delete; scoped_range_in(scoped_range_in&&) = delete; scoped_range_in& operator=(scoped_range_in&&) = delete; /** * @brief Destroy the scoped_range_in, ending the NVTX range event. */ ~scoped_range_in() noexcept { #ifndef NVTX_DISABLE nvtxDomainRangePop(domain::get()); #endif } }; /** * @brief Alias for a `scoped_range_in` in the global NVTX domain. 
* */ using scoped_range = scoped_range_in; namespace detail { /// @cond internal template class optional_scoped_range_in { public: optional_scoped_range_in() = default; void begin(event_attributes const& attr) noexcept { #ifndef NVTX_DISABLE // This class is not meant to be part of the public NVTX C++ API and should // only be used in the `NVTX3_FUNC_RANGE_IF` and `NVTX3_FUNC_RANGE_IF_IN` // macros. However, to prevent developers from misusing this class, make // sure to not start multiple ranges. if (initialized) { return; } nvtxDomainRangePushEx(domain::get(), attr.get()); initialized = true; #endif } ~optional_scoped_range_in() noexcept { #ifndef NVTX_DISABLE if (initialized) { nvtxDomainRangePop(domain::get()); } #endif } void* operator new(std::size_t) = delete; optional_scoped_range_in(optional_scoped_range_in const&) = delete; optional_scoped_range_in& operator=(optional_scoped_range_in const&) = delete; optional_scoped_range_in(optional_scoped_range_in&&) = delete; optional_scoped_range_in& operator=(optional_scoped_range_in&&) = delete; private: #ifndef NVTX_DISABLE bool initialized = false; #endif }; /// @endcond } // namespace detail /** * @brief Handle used for correlating explicit range start and end events. * * A handle is "null" if it does not correspond to any range. * */ struct range_handle { /// Type used for the handle's value using value_type = nvtxRangeId_t; /** * @brief Construct a `range_handle` from the given id. * */ constexpr explicit range_handle(value_type id) noexcept : _range_id{id} {} /** * @brief Constructs a null range handle. * * A null range_handle corresponds to no range. Calling `end_range` on a * null handle is undefined behavior when a tool is active. * */ constexpr range_handle() noexcept = default; /** * @brief Checks whether this handle is null * * Provides contextual conversion to `bool`. * * \code{cpp} * range_handle handle{}; * if (handle) {...} * \endcode * */ constexpr explicit operator bool() const noexcept { return get_value() != null_range_id; }; /** * @brief Implicit conversion from `nullptr` constructs a null handle. * * Satisfies the "NullablePointer" requirement to make `range_handle` comparable with `nullptr`. * */ constexpr range_handle(std::nullptr_t) noexcept {} /** * @brief Returns the `range_handle`'s value * * @return value_type The handle's value */ constexpr value_type get_value() const noexcept { return _range_id; } private: /// Sentinel value for a null handle that corresponds to no range static constexpr value_type null_range_id = nvtxRangeId_t{0}; value_type _range_id{null_range_id}; ///< The underlying NVTX range id }; /** * @brief Compares two range_handles for equality * * @param lhs The first range_handle to compare * @param rhs The second range_handle to compare */ inline constexpr bool operator==(range_handle lhs, range_handle rhs) noexcept { return lhs.get_value() == rhs.get_value(); } /** * @brief Compares two range_handles for inequality * * @param lhs The first range_handle to compare * @param rhs The second range_handle to compare */ inline constexpr bool operator!=(range_handle lhs, range_handle rhs) noexcept { return !(lhs == rhs); } /** * @brief Manually begin an NVTX range. * * Explicitly begins an NVTX range and returns a unique handle. To end the * range, pass the handle to `end_range_in()`. * * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and * `nvtx3::start_range_in(...)`. 
* * `start_range_in/end_range_in` are the most explicit and lowest level APIs * provided for creating ranges. Use of `nvtx3::unique_range_in` should be * preferred unless one is unable to tie the range to the lifetime of an object. * * Example: * \code{.cpp} * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; * // Manually begin a range * nvtx3::range_handle h = nvtx3::start_range_in(attr); * ... * nvtx3::end_range_in(h); // End the range * \endcode * * @tparam D Type containing `name` member used to identify the `domain` * to which the range belongs. Else, `domain::global` to indicate that the * global NVTX domain should be used. * @param[in] attr `event_attributes` that describes the desired attributes * of the range. * @return Unique handle to be passed to `end_range_in` to end the range. */ template inline range_handle start_range_in(event_attributes const& attr) noexcept { #ifndef NVTX_DISABLE return range_handle{nvtxDomainRangeStartEx(domain::get(), attr.get())}; #else (void)attr; return {}; #endif } /** * @brief Manually begin an NVTX range. * * Explicitly begins an NVTX range and returns a unique handle. To end the * range, pass the handle to `end_range_in()`. * * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and * `nvtx3::start_range_in(...)`. * * `start_range_in/end_range_in` are the most explicit and lowest level APIs * provided for creating ranges. Use of `nvtx3::unique_range_in` should be * preferred unless one is unable to tie the range to the lifetime of an object. * * This overload uses `args...` to construct an `event_attributes` to * associate with the range. For more detail, see `event_attributes`. * * Example: * \code{cpp} * // Manually begin a range * nvtx3::range_handle h = nvtx3::start_range_in("msg", nvtx3::rgb{127,255,0}); * ... * nvtx3::end_range_in(h); // Ends the range * \endcode * * @tparam D Type containing `name` member used to identify the `domain` * to which the range belongs. Else, `domain::global` to indicate that the * global NVTX domain should be used. * @param args[in] Variadic parameter pack of the arguments for an `event_attributes`. * @return Unique handle to be passed to `end_range` to end the range. */ template inline range_handle start_range_in(Args const&... args) noexcept { #ifndef NVTX_DISABLE return start_range_in(event_attributes{args...}); #else return {}; #endif } /** * @brief Manually begin an NVTX range in the global domain. * * Explicitly begins an NVTX range and returns a unique handle. To end the * range, pass the handle to `end_range()`. * * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and * `nvtx3::start_range_in(...)`. * * `start_range/end_range` are the most explicit and lowest level APIs * provided for creating ranges. Use of `nvtx3::unique_range` should be * preferred unless one is unable to tie the range to the lifetime of an object. * * Example: * \code{.cpp} * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; * // Manually begin a range * nvtx3::range_handle h = nvtx3::start_range(attr); * ... * nvtx3::end_range(h); // End the range * \endcode * * @param[in] attr `event_attributes` that describes the desired attributes * of the range. * @return Unique handle to be passed to `end_range_in` to end the range. */ inline range_handle start_range(event_attributes const& attr) noexcept { #ifndef NVTX_DISABLE return start_range_in(attr); #else (void)attr; return {}; #endif } /** * @brief Manually begin an NVTX range in the global domain. 
* * Explicitly begins an NVTX range and returns a unique handle. To end the * range, pass the handle to `end_range_in()`. * * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and * `nvtx3::start_range_in(...)`. * * `start_range_in/end_range_in` are the most explicit and lowest level APIs * provided for creating ranges. Use of `nvtx3::unique_range_in` should be * preferred unless one is unable to tie the range to the lifetime of an object. * * This overload uses `args...` to construct an `event_attributes` to * associate with the range. For more detail, see `event_attributes`. * * Example: * \code{cpp} * // Manually begin a range * nvtx3::range_handle h = nvtx3::start_range("msg", nvtx3::rgb{127,255,0}); * ... * nvtx3::end_range(h); // Ends the range * \endcode * * @param args[in] Variadic parameter pack of the arguments for an `event_attributes`. * @return Unique handle to be passed to `end_range` to end the range. */ template inline range_handle start_range(Args const&... args) noexcept { #ifndef NVTX_DISABLE return start_range_in(args...); #else return {}; #endif } /** * @brief Manually end the range associated with the handle `r` in domain `D`. * * Explicitly ends the NVTX range indicated by the handle `r` returned from a * prior call to `start_range_in`. The range may end on a different thread * from where it began. * * @tparam D Type containing `name` member used to identify the `domain` to * which the range belongs. Else, `domain::global` to indicate that the global * NVTX domain should be used. * @param r Handle to a range started by a prior call to `start_range_in`. * * @warning The domain type specified as template parameter to this function * must be the same that was specified on the associated `start_range_in` call. */ template inline void end_range_in(range_handle r) noexcept { #ifndef NVTX_DISABLE nvtxDomainRangeEnd(domain::get(), r.get_value()); #else (void)r; #endif } /** * @brief Manually end the range associated with the handle `r` in the global * domain. * * Explicitly ends the NVTX range indicated by the handle `r` returned from a * prior call to `start_range`. The range may end on a different thread from * where it began. * * @param r Handle to a range started by a prior call to `start_range`. * * @warning The domain type specified as template parameter to this function * must be the same that was specified on the associated `start_range` call. */ inline void end_range(range_handle r) noexcept { #ifndef NVTX_DISABLE end_range_in(r); #else (void)r; #endif } /** * @brief A RAII object for creating a NVTX range within a domain that can * be created and destroyed on different threads. * * When constructed, begins a NVTX range in the specified domain. Upon * destruction, ends the NVTX range. * * Similar to `nvtx3::scoped_range_in`, with a few key differences: * - `unique_range` objects can be destroyed in an order whereas `scoped_range` objects must be * destroyed in exact reverse creation order * - `unique_range` can start and end on different threads * - `unique_range` is moveable * - `unique_range` objects can be constructed as heap objects * * There is extra overhead associated with `unique_range` constructs and therefore use of * `nvtx3::scoped_range_in` should be preferred. * * @tparam D Type containing `name` member used to identify the `domain` * to which the `unique_range_in` belongs. Else, `domain::global` to * indicate that the global NVTX domain should be used. 
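 *
 * A minimal usage sketch (the `job` type below is illustrative, not part of
 * this header); because the range is tied to an object that may be moved or
 * destroyed on another thread, `unique_range_in` is used instead of
 * `scoped_range_in`:
 * \code{.cpp}
 * struct job {
 *   nvtx3::unique_range range{"job lifetime"};  // ends when the job is destroyed
 * };
 * \endcode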
*/ template class unique_range_in { public: /** * @brief Construct a new unique_range_in object with the specified event attributes * * Example: * \code{cpp} * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; * nvtx3::unique_range_in range{attr}; // Creates a range with message contents * // "msg" and green color * \endcode * * @param[in] attr `event_attributes` that describes the desired attributes * of the range. */ explicit unique_range_in(event_attributes const& attr) noexcept : handle_{start_range_in(attr)} { } /** * @brief Constructs a `unique_range_in` from the constructor arguments * of an `event_attributes`. * * Forwards the arguments `args...` to construct an * `event_attributes` object. The `event_attributes` object is then * associated with the `unique_range_in`. * * For more detail, see `event_attributes` documentation. * * Example: * \code{.cpp} * // Creates a range with message "message" and green color * nvtx3::unique_range_in<> r{"message", nvtx3::rgb{127,255,0}}; * \endcode * * @param[in] args Variadic parameter pack of arguments to construct an `event_attributes` * associated with this range. */ template explicit unique_range_in(Args const&... args) noexcept : unique_range_in{event_attributes{args...}} { } /** * @brief Default constructor creates a `unique_range_in` with no * message, color, payload, nor category. * */ constexpr unique_range_in() noexcept : unique_range_in{event_attributes{}} {} /** * @brief Destroy the `unique_range_in` ending the range. * */ ~unique_range_in() noexcept = default; /** * @brief Move constructor allows taking ownership of the NVTX range from * another `unique_range_in`. * * @param other The range to take ownership of */ unique_range_in(unique_range_in&& other) noexcept = default; /** * @brief Move assignment operator allows taking ownership of an NVTX range * from another `unique_range_in`. * * @param other The range to take ownership of */ unique_range_in& operator=(unique_range_in&& other) noexcept = default; /// Copy construction is not allowed to prevent multiple objects from owning /// the same range handle unique_range_in(unique_range_in const&) = delete; /// Copy assignment is not allowed to prevent multiple objects from owning the /// same range handle unique_range_in& operator=(unique_range_in const&) = delete; private: struct end_range_handle { using pointer = range_handle; /// Override the pointer type of the unique_ptr void operator()(range_handle h) const noexcept { end_range_in(h); } }; /// Range handle used to correlate the start/end of the range std::unique_ptr handle_; }; /** * @brief Alias for a `unique_range_in` in the global NVTX domain. * */ using unique_range = unique_range_in; /** * @brief Annotates an instantaneous point in time with a "marker", using the * attributes specified by `attr`. * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::event_attributes attr{"operation failed!", nvtx3::rgb{255,0,0}}; * nvtx3::mark_in(attr); * } * \endcode * * Note that nvtx3::mark_in is a function, not a class like scoped_range_in. * * @tparam D Type containing `name` member used to identify the `domain` * to which the `unique_range_in` belongs. Else, `domain::global` to * indicate that the global NVTX domain should be used. * @param[in] attr `event_attributes` that describes the desired attributes * of the mark. 
*/ template inline void mark_in(event_attributes const& attr) noexcept { #ifndef NVTX_DISABLE nvtxDomainMarkEx(domain::get(), attr.get()); #else (void)(attr); #endif } /** * @brief Annotates an instantaneous point in time with a "marker", using the * arguments to construct an `event_attributes`. * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::mark_in("operation failed!", nvtx3::rgb{255,0,0}); * } * \endcode * * Note that nvtx3::mark_in is a function, not a class like scoped_range_in. * * Forwards the arguments `args...` to construct an `event_attributes` object. * The attributes are then associated with the marker. For more detail, see * the `event_attributes` documentation. * * @tparam D Type containing `name` member used to identify the `domain` * to which the `unique_range_in` belongs. Else `domain::global` to * indicate that the global NVTX domain should be used. * @param[in] args Variadic parameter pack of arguments to construct an `event_attributes` * associated with this range. * */ template inline void mark_in(Args const&... args) noexcept { #ifndef NVTX_DISABLE mark_in(event_attributes{args...}); #endif } /** * @brief Annotates an instantaneous point in time with a "marker", using the * attributes specified by `attr`, in the global domain. * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::event_attributes attr{"operation failed!", nvtx3::rgb{255,0,0}}; * nvtx3::mark(attr); * } * \endcode * * Note that nvtx3::mark is a function, not a class like scoped_range. * * @param[in] attr `event_attributes` that describes the desired attributes * of the mark. */ inline void mark(event_attributes const& attr) noexcept { #ifndef NVTX_DISABLE mark_in(attr); #endif } /** * @brief Annotates an instantaneous point in time with a "marker", using the * arguments to construct an `event_attributes`, in the global domain. * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::mark("operation failed!", nvtx3::rgb{255,0,0}); * } * \endcode * * Note that nvtx3::mark is a function, not a class like scoped_range. * * Forwards the arguments `args...` to construct an `event_attributes` object. * The attributes are then associated with the marker. For more detail, see * the `event_attributes` documentation. * * @param[in] args Variadic parameter pack of arguments to construct an * `event_attributes` associated with this range. * */ template inline void mark(Args const&... args) noexcept { #ifndef NVTX_DISABLE mark_in(args...); #endif } } // namespace NVTX3_VERSION_NAMESPACE } // namespace nvtx3 #ifndef NVTX_DISABLE /** * @brief Convenience macro for generating a range in the specified `domain` * from the lifetime of a function * * This macro is useful for generating an NVTX range in `domain` from * the entry point of a function to its exit. It is intended to be the first * line of the function. * * Constructs a static `registered_string_in` using the name of the immediately * enclosing function returned by `__func__` and constructs a * `nvtx3::scoped_range` using the registered function name as the range's * message. 
* * Example: * \code{.cpp} * struct my_domain{static constexpr char const* name{"my_domain"};}; * * void foo(...) { * NVTX3_FUNC_RANGE_IN(my_domain); // Range begins on entry to foo() * // do stuff * ... * } // Range ends on return from foo() * \endcode * * @param[in] D Type containing `name` member used to identify the * `domain` to which the `registered_string_in` belongs. Else, * `domain::global` to indicate that the global NVTX domain should be used. */ #define NVTX3_V1_FUNC_RANGE_IN(D) \ static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ static ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; /** * @brief Convenience macro for generating a range in the specified `domain` * from the lifetime of a function if the given boolean expression evaluates * to true. * * Similar to `NVTX3_V1_FUNC_RANGE_IN(D)`, the only difference being that * `NVTX3_V1_FUNC_RANGE_IF_IN(D, C)` only generates a range if the given boolean * expression evaluates to true. * * @param[in] D Type containing `name` member used to identify the * `domain` to which the `registered_string_in` belongs. Else, * `domain::global` to indicate that the global NVTX domain should be used. * * @param[in] C Boolean expression used to determine if a range should be * generated. */ #define NVTX3_V1_FUNC_RANGE_IF_IN(D, C) \ ::nvtx3::v1::detail::optional_scoped_range_in optional_nvtx3_range__; \ if (C) { \ static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ static ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ optional_nvtx3_range__.begin(nvtx3_func_attr__); \ } #else #define NVTX3_V1_FUNC_RANGE_IN(D) #define NVTX3_V1_FUNC_RANGE_IF_IN(D, C) #endif // NVTX_DISABLE /** * @brief Convenience macro for generating a range in the global domain from the * lifetime of a function. * * This macro is useful for generating an NVTX range in the global domain from * the entry point of a function to its exit. It is intended to be the first * line of the function. * * Constructs a static `registered_string_in` using the name of the immediately * enclosing function returned by `__func__` and constructs a * `nvtx3::scoped_range` using the registered function name as the range's * message. * * Example: * \code{.cpp} * void foo(...) { * NVTX3_FUNC_RANGE(); // Range begins on entry to foo() * // do stuff * ... * } // Range ends on return from foo() * \endcode */ #define NVTX3_V1_FUNC_RANGE() NVTX3_V1_FUNC_RANGE_IN(::nvtx3::v1::domain::global) /** * @brief Convenience macro for generating a range in the global domain from the * lifetime of a function if the given boolean expression evaluates to true. * * Similar to `NVTX3_V1_FUNC_RANGE()`, the only difference being that * `NVTX3_V1_FUNC_RANGE_IF(C)` only generates a range if the given boolean * expression evaluates to true. * * @param[in] C Boolean expression used to determine if a range should be * generated. */ #define NVTX3_V1_FUNC_RANGE_IF(C) NVTX3_V1_FUNC_RANGE_IF_IN(::nvtx3::v1::domain::global, C) /* When inlining this version, versioned macros must have unversioned aliases. 
* For each NVTX3_Vx_ #define, make an NVTX3_ alias of it here.*/ #if defined(NVTX3_INLINE_THIS_VERSION) /* clang format off */ #define NVTX3_FUNC_RANGE NVTX3_V1_FUNC_RANGE #define NVTX3_FUNC_RANGE_IF NVTX3_V1_FUNC_RANGE_IF #define NVTX3_FUNC_RANGE_IN NVTX3_V1_FUNC_RANGE_IN #define NVTX3_FUNC_RANGE_IF_IN NVTX3_V1_FUNC_RANGE_IF_IN /* clang format on */ #endif #endif // NVTX3_CPP_DEFINITIONS_V1_0 /* Add functionality for new minor versions here, by copying the above section enclosed * in #ifndef NVTX3_CPP_DEFINITIONS_Vx_y, and incrementing the minor version. This code * is an example of how additions for version 1.2 would look, indented for clarity. Note * that the versioned symbols and macros are always provided, and the unversioned symbols * are only provided if NVTX3_INLINE_THIS_VERSION was defined at the top of this header. * * \code{.cpp} * #ifndef NVTX3_CPP_DEFINITIONS_V1_2 * #define NVTX3_CPP_DEFINITIONS_V1_2 * namespace nvtx3 { * NVTX3_INLINE_IF_REQUESTED namespace NVTX3_VERSION_NAMESPACE { * class new_class {}; * inline void new_function() {} * } * } * * // Macros must have the major version in their names: * #define NVTX3_V1_NEW_MACRO_A() ... * #define NVTX3_V1_NEW_MACRO_B() ... * * // If inlining, make aliases for the macros with the version number omitted * #if defined(NVTX3_INLINE_THIS_VERSION) * #define NVTX3_NEW_MACRO_A NVTX3_V1_NEW_MACRO_A * #define NVTX3_NEW_MACRO_B NVTX3_V1_NEW_MACRO_B * #endif * #endif // NVTX3_CPP_DEFINITIONS_V1_2 * \endcode */ /* Undefine all temporarily-defined unversioned macros, which would conflict with * subsequent includes of different versions of this header. */ #undef NVTX3_CPP_VERSION_MAJOR #undef NVTX3_CPP_VERSION_MINOR #undef NVTX3_CONCAT #undef NVTX3_NAMESPACE_FOR #undef NVTX3_VERSION_NAMESPACE #undef NVTX3_INLINE_IF_REQUESTED #undef NVTX3_CONSTEXPR_IF_CPP14 #if defined(NVTX3_INLINE_THIS_VERSION) #undef NVTX3_INLINE_THIS_VERSION #endif #if defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET_DEFINED_HERE) #undef NVTX3_USE_CHECKED_OVERLOADS_FOR_GET_DEFINED_HERE #undef NVTX3_USE_CHECKED_OVERLOADS_FOR_GET #endif #if defined(NVTX3_STATIC_ASSERT_DEFINED_HERE) #undef NVTX3_STATIC_ASSERT_DEFINED_HERE #undef NVTX3_STATIC_ASSERT #endif nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/000077500000000000000000000000001463451655400203105ustar00rootroot00000000000000nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h000066400000000000000000000023301463451655400244440ustar00rootroot00000000000000/* * Copyright 2023 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_EXT_HELPER_MACROS_H #define NVTX_EXT_HELPER_MACROS_H /* Combine tokens */ #define _NVTX_EXT_CONCAT(a, b) a##b #define NVTX_EXT_CONCAT(a, b) _NVTX_EXT_CONCAT(a, b) /* Resolves to the number of arguments passed. */ #define NVTX_EXT_NUM_ARGS(...) \ NVTX_EXT_SELECTA16(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, throwaway) #define NVTX_EXT_SELECTA16(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, ...) a16 /* Cast argument(s) to void to prevent unused variable warnings. */ #define _NVTX_EXT_VOIDIFY1(a1) (void)a1; #define _NVTX_EXT_VOIDIFY2(a1, a2) (void)a1; (void)a2; #define _NVTX_EXT_VOIDIFY3(a1, a2, a3) (void)a1; (void)a2; (void)a3; #define _NVTX_EXT_VOIDIFY4(a1, a2, a3, a4) (void)a1; (void)a2; (void)a3; (void)a4; /* Mark function arguments as unused. 
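   Illustrative expansion (not generated code):
     NVTX_EXT_HELPER_UNUSED_ARGS(a, b)
       -> _NVTX_EXT_VOIDIFY2(a, b)
       -> (void)a; (void)b;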
*/ #define NVTX_EXT_HELPER_UNUSED_ARGS(...) \ NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) #endif /* NVTX_EXT_HELPER_MACROS_H */nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h000066400000000000000000000043261463451655400227700ustar00rootroot00000000000000/* * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_EXT_IMPL_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). #endif #ifndef NVTX_EXT_IMPL_H #define NVTX_EXT_IMPL_H /* ---- Include required platform headers ---- */ #include #include #include #include #if defined(_WIN32) #include #else #include #if defined(__ANDROID__) #include #endif #if defined(__linux__) || defined(__CYGWIN__) #include #endif #include #include #include #include #include #include #endif /* ---- Define macros used in this file ---- */ #ifdef NVTX_DEBUG_PRINT #ifdef __ANDROID__ #include #define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__); #define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__); #else #include #define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__) #define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__) #endif #else /* !defined(NVTX_DEBUG_PRINT) */ #define NVTX_ERR(...) #define NVTX_INFO(...) #endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /* #ifdef __GNUC__ #pragma GCC visibility push(hidden) #endif */ #define NVTX_EXTENSION_FRESH 0 #define NVTX_EXTENSION_DISABLED 1 #define NVTX_EXTENSION_STARTING 2 #define NVTX_EXTENSION_LOADED 3 /* Function slots are local to each extension */ typedef struct nvtxExtGlobals1_t { NvtxExtInitializeInjectionFunc_t injectionFnPtr; } nvtxExtGlobals1_t; NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1) = { (NvtxExtInitializeInjectionFunc_t)0 }; #define NVTX_EXT_INIT_GUARD #include "nvtxExtInit.h" #undef NVTX_EXT_INIT_GUARD /* #ifdef __GNUC__ #pragma GCC visibility pop #endif */ #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ #endif /* NVTX_EXT_IMPL_H */nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h000066400000000000000000000124001463451655400251110ustar00rootroot00000000000000/* * Copyright 2023-2024 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_EXT_IMPL_COUNTERS_GUARD #error Never include this file directly -- it is automatically included by nvToolsExtCounters.h (except when NVTX_NO_IMPL is defined). #endif #define NVTX_EXT_IMPL_GUARD #include "nvtxExtImpl.h" #undef NVTX_EXT_IMPL_GUARD #ifndef NVTX_EXT_IMPL_COUNTERS_V1 #define NVTX_EXT_IMPL_COUNTERS_V1 #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /* Macros to create versioned symbols. 
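   Illustrative expansion: NVTX_EXT_COUNTERS_VERSIONED_ID(name) pastes the NVTX
   version and the counters compatibility id into the symbol, i.e. with
   NVTX_VERSION 3 it becomes name_v3_bpl<NVTX_EXT_COUNTERS_COMPATID>.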
*/ #define NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \ NAME##_v##VERSION##_bpl##COMPATID #define NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \ NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) #define NVTX_EXT_COUNTERS_VERSIONED_ID(NAME) \ NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COUNTERS_COMPATID) #ifdef NVTX_DISABLE #include "nvtxExtHelperMacros.h" #define NVTX_EXT_COUNTERS_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \ ret_val fn_name signature { \ NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ return ((ret_val)(intptr_t)-1); \ } #else /* NVTX_DISABLE */ /* * Function slots for the counters extension. First entry is the module state, * initialized to `0` (`NVTX_EXTENSION_FRESH`). */ #define NVTX_EXT_COUNTERS_SLOT_COUNT 63 NVTX_LINKONCE_DEFINE_GLOBAL intptr_t NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX_EXT_COUNTERS_SLOT_COUNT + 1] = {0}; /* Avoid warnings about missing prototype. */ NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)(void); NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)() { intptr_t* fnSlots = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots) + 1; nvtxExtModuleSegment_t segment = { 0, /* unused (only one segment) */ NVTX_EXT_COUNTERS_SLOT_COUNT, fnSlots }; nvtxExtModuleInfo_t module = { NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), NVTX_EXT_COUNTERS_MODULEID, NVTX_EXT_COUNTERS_COMPATID, 1, &segment, /* number of segments, segments */ NULL, /* no export function needed */ /* bake type sizes and alignment information into program binary */ NULL }; NVTX_INFO( "%s\n", __FUNCTION__ ); NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)); } #define NVTX_EXT_COUNTERS_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \ typedef ret_type (*fn_name##_impl_fntype)signature; \ NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ intptr_t slot = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ if (slot != NVTX_EXTENSION_DISABLED) { \ if (slot != NVTX_EXTENSION_FRESH) { \ return (*(fn_name##_impl_fntype)slot) arg_names; \ } else { \ NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)(); \ /* Re-read function slot after extension initialization. */ \ slot = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ return (*(fn_name##_impl_fntype)slot) arg_names; \ } \ } \ } \ NVTX_EXT_FN_RETURN_INVALID(ret_type) \ } #endif /*NVTX_DISABLE*/ /* Non-void functions. */ #define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); NVTX_EXT_COUNTERS_IMPL_FN_V1(nvtxCountersHandle_t, nvtxCountersRegister, (nvtxDomainHandle_t domain, const nvtxCountersAttr_t* attr), (domain, attr)) #undef NVTX_EXT_FN_RETURN_INVALID /* END: Non-void functions. */ /* void functions. 
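   Note: the block below temporarily defines NVTX_EXT_FN_RETURN_INVALID to
   expand to nothing and `return` to an empty token, so the same
   NVTX_EXT_COUNTERS_IMPL_FN_V1 body can be reused for void functions: its
   `return (*fn)(...);` statements become plain call statements. Both macros
   are #undef'd again immediately after the void function definitions.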
*/ #define NVTX_EXT_FN_RETURN_INVALID(rtype) #define return NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleInt64, (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, int64_t value), (domain, hCounter, value)) NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleFloat64, (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, double value), (domain, hCounter, value)) NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSample, (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, void* values, size_t size), (domain, hCounter, values, size)) NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleNoValue, (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, uint8_t reason), (domain, hCounter, reason)) NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatch, (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounters, const void* counters, size_t size), (domain, hCounters, counters, size)) NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatchEx, (nvtxDomainHandle_t domain, const nvtxCountersBatch_t* countersBatch), (domain, countersBatch)) #undef return #undef NVTX_EXT_FN_RETURN_INVALID /* END: void functions. */ /* Keep NVTX_EXT_COUNTERS_IMPL_FN_V1 defined for a future version of this extension. */ #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ #endif /* NVTX_EXT_IMPL_COUNTERS_V1 */nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h000066400000000000000000000050101463451655400251270ustar00rootroot00000000000000/* * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_EXT_IMPL_MEM_CUDART_GUARD #error Never include this file directly -- it is automatically included by nvToolsExtMemCudaRt.h (except when NVTX_NO_IMPL is defined). #endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ #ifdef NVTX_DISABLE #include "nvtxExtHelperMacros.h" #define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \ ret_val fn_name signature { \ NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ return ((ret_val)(intptr_t)-1); \ } #else /* NVTX_DISABLE */ #define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \ typedef ret_type ( * fn_name##_impl_fntype )signature; \ NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ if (slot != NVTX_EXTENSION_DISABLED) { \ if (slot != NVTX_EXTENSION_FRESH) { \ return (*(fn_name##_impl_fntype)slot) arg_names; \ } else { \ NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \ /* Re-read function slot after extension initialization. */ \ slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ return (*(fn_name##_impl_fntype)slot) arg_names; \ } \ } \ } \ NVTX_EXT_FN_RETURN_INVALID(ret_type) \ } #endif /*NVTX_DISABLE*/ /* Non-void functions. */ #define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetProcessWidePermissions, (nvtxDomainHandle_t domain), (domain)) NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetDeviceWidePermissions, (nvtxDomainHandle_t domain, int device), (domain, device)) #undef NVTX_EXT_FN_RETURN_INVALID /* END: Non-void functions. */ /* void functions. 
*/ #define NVTX_EXT_FN_RETURN_INVALID(rtype) #define return NVTX_EXT_FN_IMPL(void, nvtxMemCudaSetPeerAccess, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, int devicePeer, uint32_t flags), (domain, permissions, devicePeer, flags)) #undef return #undef NVTX_EXT_FN_RETURN_INVALID /* END: void functions. */ #undef NVTX_EXT_FN_IMPL #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h000066400000000000000000000121761463451655400240370ustar00rootroot00000000000000/* * Copyright 2009-2020,2023 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_EXT_IMPL_MEM_GUARD #error Never include this file directly -- it is automatically included by nvToolsExtMem.h (except when NVTX_NO_IMPL is defined). #endif #define NVTX_EXT_IMPL_GUARD #include "nvtxExtImpl.h" #undef NVTX_EXT_IMPL_GUARD #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ #define NVTXMEM_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) NAME##_v##VERSION##_mem##COMPATID #define NVTXMEM_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) NVTXMEM_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) #define NVTX_EXT_MEM_VERSIONED_ID(NAME) NVTXMEM_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_MEM) #ifdef NVTX_DISABLE #include "nvtxExtHelperMacros.h" #define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \ ret_val fn_name signature { \ NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ return ((ret_val)(intptr_t)-1); \ } #else /* NVTX_DISABLE */ /* * Function slots for the memory extension. First entry is the module * state, initialized to `0` (`NVTX_EXTENSION_FRESH`). */ NVTX_LINKONCE_DEFINE_GLOBAL intptr_t NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_MEM_FN_NUM + 2] = {0}; NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)() { intptr_t* fnSlots = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots) + 1; nvtxExtModuleSegment_t segment = { 0, /* unused (only one segment) */ NVTX3EXT_CBID_MEM_FN_NUM, fnSlots }; nvtxExtModuleInfo_t module = { NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), NVTX_EXT_MODULEID_MEM, NVTX_EXT_COMPATID_MEM, 1, &segment, NULL, /* no export function needed */ NULL }; NVTX_INFO( "%s\n", __FUNCTION__ ); NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)); } #define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \ typedef ret_type ( * fn_name##_impl_fntype )signature; \ NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ if (slot != NVTX_EXTENSION_DISABLED) { \ if (slot != NVTX_EXTENSION_FRESH) { \ return (*(fn_name##_impl_fntype)slot) arg_names; \ } else { \ NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \ /* Re-read function slot after extension initialization. */ \ slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ return (*(fn_name##_impl_fntype)slot) arg_names; \ } \ } \ } \ NVTX_EXT_FN_RETURN_INVALID(ret_type) \ } #endif /*NVTX_DISABLE*/ /* Non-void functions. 
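   Note: when no tool is attached (extension disabled), these stubs fall
   through to NVTX_EXT_FN_RETURN_INVALID and return the sentinel value
   ((rtype)(intptr_t)-1).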
*/ #define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); NVTX_EXT_FN_IMPL(nvtxMemHeapHandle_t, nvtxMemHeapRegister, (nvtxDomainHandle_t domain, nvtxMemHeapDesc_t const* desc), (domain, desc)) NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemPermissionsCreate, (nvtxDomainHandle_t domain, int32_t creationflags), (domain, creationflags)) #undef NVTX_EXT_FN_RETURN_INVALID /* END: Non-void functions. */ /* void functions. */ #define NVTX_EXT_FN_RETURN_INVALID(rtype) #define return NVTX_EXT_FN_IMPL(void, nvtxMemHeapUnregister, (nvtxDomainHandle_t domain, nvtxMemHeapHandle_t heap), (domain, heap)) NVTX_EXT_FN_IMPL(void, nvtxMemHeapReset, (nvtxDomainHandle_t domain, nvtxMemHeapHandle_t heap), (domain, heap)) NVTX_EXT_FN_IMPL(void, nvtxMemRegionsRegister, (nvtxDomainHandle_t domain, nvtxMemRegionsRegisterBatch_t const* desc), (domain, desc)) NVTX_EXT_FN_IMPL(void, nvtxMemRegionsResize, (nvtxDomainHandle_t domain,nvtxMemRegionsResizeBatch_t const* desc), (domain, desc)) NVTX_EXT_FN_IMPL(void, nvtxMemRegionsUnregister, (nvtxDomainHandle_t domain,nvtxMemRegionsUnregisterBatch_t const* desc), (domain, desc)) NVTX_EXT_FN_IMPL(void, nvtxMemRegionsName, (nvtxDomainHandle_t domain,nvtxMemRegionsNameBatch_t const* desc), (domain, desc)) NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsAssign, (nvtxDomainHandle_t domain,nvtxMemPermissionsAssignBatch_t const* desc), (domain, desc)) NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsDestroy, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions), (domain, permissions)) NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsReset, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions), (domain, permissions)) NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsBind, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, uint32_t bindScope, uint32_t bindFlags), (domain, permissions, bindScope, bindFlags)) NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsUnbind, (nvtxDomainHandle_t domain, uint32_t bindScope), (domain, bindScope)) #undef return #undef NVTX_EXT_FN_RETURN_INVALID /* END: void functions. */ #undef NVTX_EXT_FN_IMPL #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h000066400000000000000000000130111463451655400246770ustar00rootroot00000000000000/* * Copyright 2021-2023 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD #error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). #endif #define NVTX_EXT_IMPL_GUARD #include "nvtxExtImpl.h" #undef NVTX_EXT_IMPL_GUARD #ifndef NVTX_EXT_IMPL_PAYLOAD_V1 #define NVTX_EXT_IMPL_PAYLOAD_V1 #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /* Macros to create versioned symbols. 
*/ #define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \ NAME##_v##VERSION##_bpl##COMPATID #define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \ NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) #define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \ NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_PAYLOAD_COMPATID) #ifdef NVTX_DISABLE #include "nvtxExtHelperMacros.h" #define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \ ret_val fn_name signature { \ NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ return ((ret_val)(intptr_t)-1); \ } #else /* NVTX_DISABLE */ #include "nvtxExtPayloadTypeInfo.h" /* * Function slots for the payload extension. First entry is the module state, * initialized to `0` (`NVTX_EXTENSION_FRESH`). */ #define NVTX_EXT_PAYLOAD_SLOT_COUNT 63 NVTX_LINKONCE_DEFINE_GLOBAL intptr_t NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX_EXT_PAYLOAD_SLOT_COUNT + 1] = {0}; /* Avoid warnings about missing prototype. */ NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(void); NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)() { intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1; nvtxExtModuleSegment_t segment = { 0, /* unused (only one segment) */ NVTX_EXT_PAYLOAD_SLOT_COUNT, fnSlots }; nvtxExtModuleInfo_t module = { NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), NVTX_EXT_PAYLOAD_MODULEID, NVTX_EXT_PAYLOAD_COMPATID, 1, &segment, /* number of segments, segments */ NULL, /* no export function needed */ /* bake type sizes and alignment information into program binary */ &(NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)) }; NVTX_INFO( "%s\n", __FUNCTION__ ); NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)); } #define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \ typedef ret_type (*fn_name##_impl_fntype)signature; \ NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ if (slot != NVTX_EXTENSION_DISABLED) { \ if (slot != NVTX_EXTENSION_FRESH) { \ return (*(fn_name##_impl_fntype)slot) arg_names; \ } else { \ NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \ /* Re-read function slot after extension initialization. */ \ slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ return (*(fn_name##_impl_fntype)slot) arg_names; \ } \ } \ } \ NVTX_EXT_FN_RETURN_INVALID(ret_type) \ } #endif /*NVTX_DISABLE*/ /* Non-void functions. 
*/ #define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadSchemaRegister, (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), (domain, attr)) NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadEnumRegister, (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), (domain, attr)) NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePushPayload, (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count), (domain, payloadData, count)) NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePopPayload, (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count), (domain, payloadData, count)) NVTX_EXT_PAYLOAD_IMPL_FN_V1(nvtxRangeId_t, nvtxRangeStartPayload, (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count), (domain, payloadData, count)) NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint8_t, nvtxDomainIsEnabled, (nvtxDomainHandle_t domain), (domain)) NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxScopeRegister, (nvtxDomainHandle_t domain, const nvtxScopeAttr_t* attr), (domain, attr)) #undef NVTX_EXT_FN_RETURN_INVALID /* END: Non-void functions. */ /* void functions. */ #define NVTX_EXT_FN_RETURN_INVALID(rtype) #define return NVTX_EXT_PAYLOAD_IMPL_FN_V1(void, nvtxMarkPayload, (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count), (domain, payloadData, count)) NVTX_EXT_PAYLOAD_IMPL_FN_V1(void, nvtxRangeEndPayload, (nvtxDomainHandle_t domain, nvtxRangeId_t id, const nvtxPayloadData_t* payloadData, size_t count), (domain, id, payloadData, count)) #undef return #undef NVTX_EXT_FN_RETURN_INVALID /* END: void functions. */ /* Keep NVTX_EXT_PAYLOAD_IMPL_FN_V1 defined for a future version of this extension. */ #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ #endif /* NVTX_EXT_IMPL_PAYLOAD_V1 */ nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxExtInit.h000066400000000000000000000357321463451655400227770ustar00rootroot00000000000000/* * Copyright 2009-2023 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_EXT_INIT_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
#endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /* ---- Platform-independent helper definitions and functions ---- */ /* Prefer macros over inline functions to reduce symbol resolution at link time */ #if defined(_WIN32) #define NVTX_PATHCHAR wchar_t #define NVTX_STR(x) L##x #define NVTX_GETENV _wgetenv #define NVTX_BUFSIZE 16384 #define NVTX_DLLHANDLE HMODULE #define NVTX_DLLOPEN(x) LoadLibraryW(x) #define NVTX_DLLFUNC GetProcAddress #define NVTX_DLLCLOSE FreeLibrary #define NVTX_YIELD() SwitchToThread() #define NVTX_MEMBAR() MemoryBarrier() #define NVTX_ATOMIC_WRITE_32(address, value) InterlockedExchange((volatile LONG*)address, value) #define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand) #define NVTX_ATOMIC_WRITE_PTR(address, value) InterlockedExchangePointer((volatile PVOID*)address, (PVOID)value) #define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) old = (intptr_t)InterlockedCompareExchangePointer((volatile PVOID*)address, (PVOID)exchange, (PVOID)comparand) #elif defined(__GNUC__) #define NVTX_PATHCHAR char #define NVTX_STR(x) x #define NVTX_GETENV getenv #define NVTX_BUFSIZE 16384 #define NVTX_DLLHANDLE void* #define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY) #define NVTX_DLLFUNC dlsym #define NVTX_DLLCLOSE dlclose #define NVTX_YIELD() sched_yield() #define NVTX_MEMBAR() __sync_synchronize() /* Ensure full memory barrier for atomics, to match Windows functions. */ #define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value) #define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand) #define NVTX_ATOMIC_WRITE_PTR(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value) #define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand) #else #error The library does not support your configuration! #endif /* Define this to 1 for platforms that where pre-injected libraries can be discovered. */ #if defined(_WIN32) /* TODO */ #define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0 #else #define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0 #endif /* Define this to 1 for platforms that support environment variables. */ /* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */ /* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */ #define NVTX_SUPPORT_ENV_VARS 1 /* Define this to 1 for platforms that support dynamic/shared libraries */ #define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1 /* Injection libraries implementing InitializeInjectionNvtxExtension may be statically linked, * which will override any dynamic injection. This is useful for platforms, where dynamic * injection is not available. Since weak symbols, not explicitly marked extern, are * guaranteed to be initialized to zero, if no definitions are found by the linker, the * dynamic injection process proceeds normally, if pfnInitializeInjectionNvtx2 is 0. 
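 * An illustrative (hypothetical) static-injection setup, where MyToolInit is a
 * tool-provided implementation of the initialization entry point:
 *   NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr = &MyToolInit;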
*/ #if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__) #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1 /* To statically inject an NVTX library, define InitializeInjectionNvtxExtension_fnptr as a normal * symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension, which * does not need to be named "InitializeInjectionNvtxExtension" as it is necessary in a dynamic * injection library. */ __attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr; #else #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0 #endif /* This function tries to find or load an NVTX injection library and get the address of its * `InitializeInjectionExtension` function. If such a function pointer is found, it is called and * passed the address of this NVTX instance's `nvtxGetExportTable` function, so that the injection * can attach to this instance. * If the initialization fails for any reason, any dynamic library loaded will be freed, and all * NVTX implementation functions will be set to no-ops. If the initialization succeeds, NVTX * functions that are not attached to the tool will be set to no-ops. This is implemented as one * function instead of several small functions to minimize the number of weak symbols the linker * must resolve. The order of search is: * 1) Pre-injected library exporting InitializeInjectionNvtxExtension * 2) Loadable library exporting InitializeInjectionNvtxExtension * - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64) * - On Android, libNvtxInjection??.so within the package (?? is 32 or 64) * 3) Statically-linked injection library defining InitializeInjectionNvtx2_fnptr */ NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)( NvtxExtInitializeInjectionFunc_t* out_init_fnptr); NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)( NvtxExtInitializeInjectionFunc_t* out_init_fnptr) { const char* const initFuncName = "InitializeInjectionNvtxExtension"; NvtxExtInitializeInjectionFunc_t init_fnptr = (NvtxExtInitializeInjectionFunc_t)0; NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0; if (out_init_fnptr) { *out_init_fnptr = (NvtxExtInitializeInjectionFunc_t)0; } #if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY /* Use POSIX global symbol chain to query for init function from any module. */ init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(0, initFuncName); #endif #if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY /* Try discovering dynamic injection library to load */ if (!init_fnptr) { #if NVTX_SUPPORT_ENV_VARS /* If env var NVTX_INJECTION64_PATH is set, it should contain the path to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */ const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4) ? NVTX_STR("NVTX_INJECTION32_PATH") : NVTX_STR("NVTX_INJECTION64_PATH"); #endif /* NVTX_SUPPORT_ENV_VARS */ NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE]; const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0; /* Refer to this variable explicitly in case all references to it are #if'ed out. */ (void)injectionLibraryPathBuf; #if NVTX_SUPPORT_ENV_VARS /* Disable the warning for getenv & _wgetenv -- this usage is safe because these functions are not called again before using the returned value. 
*/ #if defined(_MSC_VER) #pragma warning( push ) #pragma warning( disable : 4996 ) #endif injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName); #if defined(_MSC_VER) #pragma warning( pop ) #endif #endif #if defined(__ANDROID__) if (!injectionLibraryPath) { const char *bits = (sizeof(void*) == 4) ? "32" : "64"; char cmdlineBuf[32]; char pkgName[PATH_MAX]; int count; int pid; FILE *fp; size_t bytesRead; size_t pos; pid = (int)getpid(); count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid); if (count <= 0 || count >= (int)sizeof(cmdlineBuf)) { NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid); return NVTX_ERR_INIT_ACCESS_LIBRARY; } fp = fopen(cmdlineBuf, "r"); if (!fp) { NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf); return NVTX_ERR_INIT_ACCESS_LIBRARY; } bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp); fclose(fp); if (bytesRead == 0) { NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf); return NVTX_ERR_INIT_ACCESS_LIBRARY; } pkgName[bytesRead] = 0; /* String can contain colon as a process separator. In this case the package name is before the colon. */ pos = 0; while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0') { ++pos; } pkgName[pos] = 0; count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits); if (count <= 0 || count >= NVTX_BUFSIZE) { NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits); return NVTX_ERR_INIT_ACCESS_LIBRARY; } /* On Android, verify path is accessible due to aggressive file access restrictions. */ /* For dlopen, if the filename contains a leading slash, then it is interpreted as a */ /* relative or absolute pathname; otherwise it will follow the rules in ld.so. */ if (injectionLibraryPathBuf[0] == '/') { #if (__ANDROID_API__ < 21) int access_err = access(injectionLibraryPathBuf, F_OK | R_OK); #else int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0); #endif if (access_err != 0) { NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf); return NVTX_ERR_INIT_ACCESS_LIBRARY; } } injectionLibraryPath = injectionLibraryPathBuf; } #endif /* At this point, `injectionLibraryPath` is specified if a dynamic injection library was specified by a tool. */ if (injectionLibraryPath) { /* Load the injection library */ injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath); if (!injectionLibraryHandle) { NVTX_ERR("Failed to load injection library\n"); return NVTX_ERR_INIT_LOAD_LIBRARY; } else { /* Attempt to get the injection library's entry-point. */ init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName); if (!init_fnptr) { NVTX_DLLCLOSE(injectionLibraryHandle); NVTX_ERR("Failed to get address of function %s from injection library\n", initFuncName); return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT; } } } } #endif #if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY if (!init_fnptr) { /* Check weakly-defined function pointer. A statically-linked injection can define this as a normal symbol and it will take precedence over a dynamic injection. */ if (InitializeInjectionNvtxExtension_fnptr) { init_fnptr = InitializeInjectionNvtxExtension_fnptr; } } #endif if (out_init_fnptr) { *out_init_fnptr = init_fnptr; } /* At this point, if `init_fnptr` is not set, no tool has specified an NVTX injection library. 
Non-success result is returned, so that all NVTX API functions will be set to no-ops. */ if (!init_fnptr) { return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE; } return NVTX_SUCCESS; } /* Avoid warnings about missing prototypes. */ NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) ( nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState); NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) ( nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState) { intptr_t old; NVTX_INFO( "%s\n", __FUNCTION__ ); if (*moduleState == NVTX_EXTENSION_LOADED) { NVTX_INFO("Module loaded\n"); return; } NVTX_ATOMIC_CAS_PTR( old, moduleState, NVTX_EXTENSION_STARTING, NVTX_EXTENSION_FRESH); if (old == NVTX_EXTENSION_FRESH) { NvtxExtInitializeInjectionFunc_t init_fnptr = NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr; int entryPointStatus = 0; int forceAllToNoops = 0; size_t s; /* Load and initialize injection library, which will assign the function pointers. */ if (init_fnptr == 0) { int result = 0; /* Try to load vanilla NVTX first. */ nvtxInitialize(0); result = NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(&init_fnptr); /* At this point `init_fnptr` will be either 0 or a real function. */ if (result == NVTX_SUCCESS) { NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr = init_fnptr; } else { NVTX_ERR("Failed to load injection library\n"); } } if (init_fnptr != 0) { /* Invoke injection library's initialization function. If it returns 0 (failure) and a dynamic injection was loaded, unload it. */ entryPointStatus = init_fnptr(moduleInfo); if (entryPointStatus == 0) { NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n"); } } /* Clean up any functions that are still uninitialized so that they are skipped. Set all to null if injection init function failed as well. */ forceAllToNoops = (init_fnptr == 0) || (entryPointStatus == 0); for (s = 0; s < moduleInfo->segmentsCount; ++s) { nvtxExtModuleSegment_t* segment = moduleInfo->segments + s; size_t i; for (i = 0; i < segment->slotCount; ++i) { if (forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH)) { segment->functionSlots[i] = NVTX_EXTENSION_DISABLED; } } } NVTX_MEMBAR(); /* Signal that initialization has finished and the assigned function pointers will be used. */ NVTX_ATOMIC_WRITE_PTR(moduleState, NVTX_EXTENSION_LOADED); } else /* Spin-wait until initialization has finished. */ { NVTX_MEMBAR(); while (*moduleState != NVTX_EXTENSION_LOADED) { NVTX_YIELD(); NVTX_MEMBAR(); } } } #ifdef __cplusplus } #endif /* __cplusplus */ nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h000066400000000000000000000403661463451655400263210ustar00rootroot00000000000000/* * Copyright 2023 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H #define NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H /* General helper macros */ #include "nvtxExtHelperMacros.h" /* Get variable name with line number (almost unique per file). */ #define _NVTX_PAYLOAD_DATA_VAR NVTX_EXT_CONCAT(nvtxDFDB,__LINE__) /* Create real arguments from just pasting tokens next to each other. */ #define _NVTX_PAYLOAD_PASS_THROUGH(...) __VA_ARGS__ /* Avoid prefixing `NVTX_PAYLOAD_ENTRY_` for nested payloads. 
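   Expansion sketch: a nested-payload entry type expands to `THROWAWAY id`, so
   when the schema-entry macros below paste the NVTX_PAYLOAD_ENTRY_ prefix onto
   it, the result is NVTX_PAYLOAD_ENTRY_THROWAWAY id; since
   NVTX_PAYLOAD_ENTRY_THROWAWAY is empty, only the raw schema id remains.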
*/ #define NVTX_PAYLOAD_ENTRY_THROWAWAY #define _NVTX_PAYLOAD_NESTED(id) THROWAWAY id /* * Create the NVTX binary payloads schema attributes. * * @param struct_id The name of the struct. * @param schema_name The name of the schema. * @param schema_flags Additional schema flags * @param mask_add Fields to be added to the mask. * @param num_entries The number schema entries. */ #define NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, mask_add, num_entries) \ nvtxPayloadSchemaAttr_t struct_id##Attr = { \ /*.fieldMask = */NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | mask_add \ NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | \ NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | \ NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE, \ /*.name = */schema_name, \ /*.type = */NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, \ /*.flags = */schema_flags, \ /*.entries = */struct_id##Schema, /*.numEntries = */num_entries, \ /*.payloadStaticSize = */sizeof(struct_id), \ /*.packAlign = */0, /*.schemaId = */schema_id}; /*****************************************************************/ /*** Helper for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]` ***/ /* First part of schema entry for different number of arguments. */ #define _NVTX_PAYLOAD_SCHEMA_EF2(member, etype) \ 0, NVTX_PAYLOAD_ENTRY_##etype, NULL, NULL, 0, #define _NVTX_PAYLOAD_SCHEMA_EF3(member, etype, name) \ 0, NVTX_PAYLOAD_ENTRY_##etype, name, NULL, 0, #define _NVTX_PAYLOAD_SCHEMA_EF4(member, etype, name, desc) \ 0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, 0, #define _NVTX_PAYLOAD_SCHEMA_EF5(member, etype, name, desc, arraylen) \ 0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen, #define _NVTX_PAYLOAD_SCHEMA_EF6(member, etype, name, desc, arraylen, flags) \ NVTX_PAYLOAD_ENTRY_FLAG_##flags, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen, #define _NVTX_PAYLOAD_SCHEMA_ENTRY_FRONT(...) \ NVTX_EXT_CONCAT(_NVTX_PAYLOAD_SCHEMA_EF, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) /* Second part of schema entry (append struct member). (At least two arguments are passed (`member` and `etype`). */ #define _NVTX_PAYLOAD_SCHEMA_ENTRY_END(member, ...) member /* Resolve to schema entry. `entry` is `(ctype, name, ...)`. */ #define _NVTX_PAYLOAD_SCHEMA_ENTRY(struct_id, entry) \ {_NVTX_PAYLOAD_SCHEMA_ENTRY_FRONT entry \ offsetof(struct_id, _NVTX_PAYLOAD_SCHEMA_ENTRY_END entry)}, /* Handle up to 16 schema entries. */ #define _NVTX_PAYLOAD_SME1(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) #define _NVTX_PAYLOAD_SME2(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME1(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME3(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME2(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME4(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME3(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME5(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME4(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME6(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME5(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME7(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME6(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME8(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME7(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME9(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME8(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME10(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME9(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME11(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME10(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME12(s,e1,...) 
_NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME11(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME13(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME12(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME14(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME13(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME15(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME14(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SME16(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME15(s,__VA_ARGS__) #define _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, ...) \ nvtxPayloadSchemaEntry_t struct_id##Schema[] = { \ NVTX_EXT_CONCAT(_NVTX_PAYLOAD_SME, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(struct_id, __VA_ARGS__) \ {0, 0} \ }; /* * Handle optional parameters for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]`. */ #define _NVTX_DEFINE_S4S_6(struct_id, schema_name, prefix, schema_flags, schema_id, entries) \ prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, \ NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID |,\ NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) #define _NVTX_DEFINE_S4S_5(struct_id, schema_name, prefix, schema_flags, entries) \ prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, 0, \ NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS |, \ NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) #define _NVTX_DEFINE_S4S_4(struct_id, schema_name, prefix, entries) \ prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, \ NVTX_PAYLOAD_SCHEMA_ATTR_NAME |, \ NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) #define _NVTX_DEFINE_S4S_3(struct_id, schema_name, entries) \ _NVTX_DEFINE_S4S_4(struct_id, schema_name, /*prefix*/, entries) #define _NVTX_DEFINE_S4S_2(struct_id, entries) \ _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, NULL, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, ,\ NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) #define _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, ...) \ NVTX_EXT_CONCAT(_NVTX_DEFINE_S4S_, \ NVTX_EXT_NUM_ARGS(struct_id, __VA_ARGS__))(struct_id, __VA_ARGS__) /*** END: Helper for `NVTX_PAYLOAD_STATIC_SCHEMA_{DEFINE,SETUP}` ***/ /******************************************************************/ /*** Helper for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]` ***/ /* Extract struct member for fixed-size arrays. */ #define _NVTX_PAYLOAD_STRUCT_ARR_MEM1(name) name #define _NVTX_PAYLOAD_STRUCT_ARR_MEM2(name, count) name[count] /* Extract type and member name and handle special case of fixed-size array. 
*/ #define _NVTX_PAYLOAD_STRUCT_E2(type, member) type member; #define _NVTX_PAYLOAD_STRUCT_E3(type, member, etype) type member; #define _NVTX_PAYLOAD_STRUCT_E4(type, member, etype, name) type member; #define _NVTX_PAYLOAD_STRUCT_E5(type, member, etype, name, desc) type member; #define _NVTX_PAYLOAD_STRUCT_E6(type, member, etype, name, desc, arraylen) \ type NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT_ARR_MEM, NVTX_EXT_NUM_ARGS member) member; #define _NVTX_PAYLOAD_STRUCT_E7(type, member, etype, name, desc, arraylen, flags) \ _NVTX_PAYLOAD_STRUCT_E6(type, member, etype, name, desc, arraylen) /* Handle different number of arguments per struct entry. */ #define _NVTX_PAYLOAD_STRUCT_ENTRY_(...) \ NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT_E, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) /* Handle up to 16 struct members. */ #define _NVTX_PAYLOAD_STRUCT_ENTRY(entry) _NVTX_PAYLOAD_STRUCT_ENTRY_ entry #define _NVTX_PAYLOAD_STRUCT1(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) #define _NVTX_PAYLOAD_STRUCT2(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT1(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT3(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT2(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT4(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT3(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT5(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT4(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT6(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT5(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT7(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT6(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT8(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT7(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT9(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT8(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT10(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT9(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT11(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT10(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT12(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT11(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT13(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT12(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT14(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT13(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT15(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT14(__VA_ARGS__) #define _NVTX_PAYLOAD_STRUCT16(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT15(__VA_ARGS__) /* Generate the typedef. */ #define _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, ...) \ typedef struct { \ NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) \ } struct_id; /* Generate first part of the schema entry. */ #define _NVTX_PAYLOAD_INIT_SCHEMA_N3(type, memberId, etype) \ 0, NVTX_PAYLOAD_ENTRY_##etype, NULL, NULL, 0, #define _NVTX_PAYLOAD_INIT_SCHEMA_N4(type, memberId, etype, name) \ 0, NVTX_PAYLOAD_ENTRY_##etype, name, NULL, 0, #define _NVTX_PAYLOAD_INIT_SCHEMA_N5(type, memberId, etype, name, desc) \ 0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, 0, #define _NVTX_PAYLOAD_INIT_SCHEMA_N6(type, memberId, etype, name, desc, arraylen) \ 0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen, #define _NVTX_PAYLOAD_INIT_SCHEMA_N7(type, memberId, etype, name, desc, arraylen, flags) \ NVTX_PAYLOAD_ENTRY_FLAG_##flags, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen, #define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_FRONT(...) 
\ NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SCHEMA_N, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) #define _NVTX_PAYLOAD_ARRAY_MEMBER1(name) name #define _NVTX_PAYLOAD_ARRAY_MEMBER2(name, count) name /* Resolve to last part of schema entry (append struct member). */ #define _NVTX_PAYLOAD_INIT_SCHEMA_NX3(type, memberId, ...) memberId #define _NVTX_PAYLOAD_INIT_SCHEMA_NX4(type, memberId, ...) memberId #define _NVTX_PAYLOAD_INIT_SCHEMA_NX5(type, memberId, ...) memberId #define _NVTX_PAYLOAD_INIT_SCHEMA_NX6(type, memberId, ...) \ NVTX_EXT_CONCAT(_NVTX_PAYLOAD_ARRAY_MEMBER, NVTX_EXT_NUM_ARGS memberId) memberId #define _NVTX_PAYLOAD_INIT_SCHEMA_NX7(type, memberId, ...) \ _NVTX_PAYLOAD_INIT_SCHEMA_NX6(type, memberId, __VA_ARGS__) #define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_END(...) \ NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SCHEMA_NX, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) /* Resolve to schema entry. `entry` is `(ctype, name, ...)`. */ #define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(struct_id, entry) \ {_NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_FRONT entry \ offsetof(struct_id, _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_END entry)}, /* Handle up to 16 schema entries. */ #define _NVTX_PAYLOAD_INIT_SME1(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) #define _NVTX_PAYLOAD_INIT_SME2(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME1(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME3(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME2(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME4(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME3(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME5(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME4(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME6(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME5(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME7(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME6(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME8(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME7(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME9(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME8(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME10(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME9(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME11(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME10(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME12(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME11(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME13(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME12(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME14(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME13(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME15(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME14(s, __VA_ARGS__) #define _NVTX_PAYLOAD_INIT_SME16(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME15(s, __VA_ARGS__) #define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, ...) \ nvtxPayloadSchemaEntry_t struct_id##Schema[] = { \ NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SME, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(struct_id, __VA_ARGS__) \ {0, 0} \ }; /* * Handle optional parameters for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]`. 
*/ #define _NVTX_DEFINE_SWS_6(struct_id, schema_name, prefix, schema_flags, schema_id, entries) \ _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, \ NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS | \ NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID |, \ NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) #define _NVTX_DEFINE_SWS_5(struct_id, schema_name, prefix, schema_flags, entries) \ _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, 0, \ NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS |, \ NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) #define _NVTX_DEFINE_SWS_4(struct_id, schema_name, prefix, entries) \ _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, \ NVTX_PAYLOAD_SCHEMA_ATTR_NAME |, \ NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) #define _NVTX_DEFINE_SWS_3(struct_id, schema_name, entries) \ _NVTX_DEFINE_SWS_4(struct_id, schema_name, /* no prefix */, entries) #define _NVTX_DEFINE_SWS_2(struct_id, entries) \ _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, NULL, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, , \ NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) #define _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, ...) \ NVTX_EXT_CONCAT(_NVTX_DEFINE_SWS_, \ NVTX_EXT_NUM_ARGS(struct_id, __VA_ARGS__))(struct_id, __VA_ARGS__) /*** END: Helper for `NVTX_PAYLOAD_STATIC_SCHEMA_{INIT,CREATE}` */ #endif /* NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H */nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h000066400000000000000000000140501463451655400251310ustar00rootroot00000000000000/* * Copyright 2021-2023 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD #error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). #endif typedef void* nvtx_payload_pointer_type; #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) #include #include #endif /* `alignof` is available as of C11 or C++11. */ #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L) #define nvtx_alignof(type) alignof(type) #define nvtx_alignof2(type,tname) alignof(type) #else /* (__STDC_VERSION__ >= 201112L) || (__cplusplus >= 201103L) */ /* Create helper structs to determine type alignment. 
*/ #define MKTYPEDEF(type) typedef struct {char c; type d;} _nvtx_##type #define MKTYPEDEF2(type,tname) typedef struct {char c; type d;} _nvtx_##tname MKTYPEDEF(char); MKTYPEDEF2(unsigned char, uchar); MKTYPEDEF(short); MKTYPEDEF2(unsigned short, ushort); MKTYPEDEF(int); MKTYPEDEF2(unsigned int, uint); MKTYPEDEF(long); MKTYPEDEF2(unsigned long, ulong); MKTYPEDEF2(long long, longlong); MKTYPEDEF2(unsigned long long, ulonglong); MKTYPEDEF(int8_t); MKTYPEDEF(uint8_t); MKTYPEDEF(int16_t); MKTYPEDEF(uint16_t); MKTYPEDEF(int32_t); MKTYPEDEF(uint32_t); MKTYPEDEF(int64_t); MKTYPEDEF(uint64_t); MKTYPEDEF(float); MKTYPEDEF(double); MKTYPEDEF2(long double, longdouble); MKTYPEDEF(size_t); MKTYPEDEF(nvtx_payload_pointer_type); MKTYPEDEF(wchar_t); /* `char8_t` is available as of C++20 or C23 */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L) MKTYPEDEF(char8_t); #endif /* `char16_t` and `char32_t` are available as of C++11 or C11 */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L) MKTYPEDEF(char16_t); MKTYPEDEF(char32_t); #endif /* C requires to include stddef.h to use `offsetof` */ #ifndef __cplusplus #include #endif #define nvtx_alignof(tname) offsetof(_nvtx_##tname, d) #define nvtx_alignof2(type, tname) offsetof(_nvtx_##tname, d) #endif /* __STDC_VERSION__ >= 201112L */ #undef MKTYPEDEF #undef MKTYPEDEF2 /* * Helper array to get the alignment for each predefined C/C++ language type. * The order of entries must match the values in`enum nvtxPayloadSchemaEntryType`. * * In C++, `const` variables use internal linkage by default, but we need it to * be public (extern) since weak declarations must be public. */ NVTX_LINKONCE_DEFINE_GLOBAL #ifdef __cplusplus extern #endif const nvtxPayloadEntryTypeInfo_t NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] = { /* The first entry contains this array's length and the size of each entry in this array. 
*/ {NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE, sizeof(nvtxPayloadEntryTypeInfo_t)}, /*** C integer types ***/ /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR */ {sizeof(char), nvtx_alignof(char)}, /* NVTX_PAYLOAD_ENTRY_TYPE_UCHAR */ {sizeof(unsigned char), nvtx_alignof2(unsigned char, uchar)}, /* NVTX_PAYLOAD_ENTRY_TYPE_SHORT */ {sizeof(short), nvtx_alignof(short)}, /* NVTX_PAYLOAD_ENTRY_TYPE_USHORT */ {sizeof(unsigned short), nvtx_alignof2(unsigned short, ushort)}, /* NVTX_PAYLOAD_ENTRY_TYPE_INT */ {sizeof(int), nvtx_alignof(int)}, /* NVTX_PAYLOAD_ENTRY_TYPE_UINT */ {sizeof(unsigned int), nvtx_alignof2(unsigned int, uint)}, /* NVTX_PAYLOAD_ENTRY_TYPE_LONG */ {sizeof(long), nvtx_alignof(long)}, /* NVTX_PAYLOAD_ENTRY_TYPE_ULONG */ {sizeof(unsigned long), nvtx_alignof2(unsigned long, ulong)}, /* NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG */ {sizeof(long long), nvtx_alignof2(long long, longlong)}, /* NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG */ {sizeof(unsigned long long), nvtx_alignof2(unsigned long long,ulonglong)}, /*** Integer types with explicit size ***/ /* NVTX_PAYLOAD_ENTRY_TYPE_INT8 */ {sizeof(int8_t), nvtx_alignof(int8_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_UINT8 */ {sizeof(uint8_t), nvtx_alignof(uint8_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_INT16 */ {sizeof(int16_t), nvtx_alignof(int16_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_UINT16 */ {sizeof(uint16_t), nvtx_alignof(uint16_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_INT32 */ {sizeof(int32_t), nvtx_alignof(int32_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_UINT32 */ {sizeof(uint32_t), nvtx_alignof(uint32_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_INT64 */ {sizeof(int64_t), nvtx_alignof(int64_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_UINT64 */ {sizeof(uint64_t), nvtx_alignof(uint64_t)}, /*** C floating point types ***/ /* NVTX_PAYLOAD_ENTRY_TYPE_FLOAT */ {sizeof(float), nvtx_alignof(float)}, /* NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE */ {sizeof(double), nvtx_alignof(double)}, /* NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE */ {sizeof(long double), nvtx_alignof2(long double, longdouble)}, /* NVTX_PAYLOAD_ENTRY_TYPE_SIZE */ {sizeof(size_t), nvtx_alignof(size_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(nvtx_payload_pointer_type), nvtx_alignof(nvtx_payload_pointer_type)}, /*** Special character types ***/ /* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)}, #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L) /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {sizeof(char8_t), nvtx_alignof(char8_t)}, #else /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {0, 0}, #endif #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L) /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)} #else /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {0, 0}, /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {0, 0} #endif }; #undef nvtx_alignof #undef nvtx_alignof2nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h000066400000000000000000000024711463451655400231720ustar00rootroot00000000000000/* * Copyright 2021 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ /* This header defines types which are used by the internal implementation * of NVTX and callback subscribers. 
API clients do not use these types, * so they are defined here instead of in nvToolsExt.h to clarify they are * not part of the NVTX client API. */ #ifndef NVTXEXTTYPES_H #define NVTXEXTTYPES_H #ifndef NVTX_EXT_TYPES_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h. #endif typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId); typedef struct nvtxExtModuleSegment_t { size_t segmentId; size_t slotCount; intptr_t* functionSlots; } nvtxExtModuleSegment_t; typedef struct nvtxExtModuleInfo_t { uint16_t nvtxVer; uint16_t structSize; uint16_t moduleId; uint16_t compatId; size_t segmentsCount; nvtxExtModuleSegment_t* segments; NvtxExtGetExportFunction_t getExportFunction; const void* extInfo; } nvtxExtModuleInfo_t; typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo); #endif /* NVTXEXTTYPES_H */nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxImpl.h000066400000000000000000000521411463451655400223050ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). #endif #include #include #include #include /* ---- Include required platform headers ---- */ #if defined(_WIN32) #include #else #include #if defined(__ANDROID__) #include #endif #if defined(__linux__) || defined(__CYGWIN__) #include #endif #include #include #include #include #include #include #endif /* ---- Define macros used in this file ---- */ #define NVTX_INIT_STATE_FRESH 0 #define NVTX_INIT_STATE_STARTED 1 #define NVTX_INIT_STATE_COMPLETE 2 #ifdef NVTX_DEBUG_PRINT #ifdef __ANDROID__ #include #define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__); #define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__); #else #include #define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__) #define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__) #endif #else /* !defined(NVTX_DEBUG_PRINT) */ #define NVTX_ERR(...) #define NVTX_INFO(...) 
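/* Illustrative note: with NVTX_DEBUG_PRINT undefined (the default), NVTX_ERR and
 * NVTX_INFO above expand to nothing, so the diagnostic strings in this file have
 * no runtime cost. Building with -DNVTX_DEBUG_PRINT routes them to stderr (or to
 * the Android log) as set up in the branch above. */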
#endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ #ifdef __GNUC__ #pragma GCC visibility push(hidden) #endif /* ---- Forward declare all functions referenced in globals ---- */ NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)( NvtxCallbackModule module, NvtxFunctionTable* out_table, unsigned int* out_size); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)( uint32_t version); NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)( uint32_t exportTableId); #include "nvtxInitDecls.h" /* ---- Define all globals ---- */ typedef struct nvtxGlobals_t { volatile unsigned int initState; NvtxExportTableCallbacks etblCallbacks; NvtxExportTableVersionInfo etblVersionInfo; /* Implementation function pointers */ nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr; nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr; nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr; nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr; nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr; nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr; nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr; nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr; nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr; nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr; nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr; nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr; nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr; nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr; nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr; nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr; nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr; nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr; nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr; nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr; nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr; nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr; nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr; nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr; nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr; nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr; nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr; nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr; nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr; nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr; nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr; nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr; nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr; nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr; nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr; nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr; nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr; nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr; nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr; nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr; nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr; nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr; 
nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr; nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr; nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr; nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr; nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr; nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr; nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr; nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr; nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr; nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr; nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr; nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr; nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr; nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr; nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr; nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr; nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr; nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr; nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr; nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr; nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr; nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr; /* Tables of function pointers -- Extra null added to the end to ensure * a crash instead of silent corruption if a tool reads off the end. */ NvtxFunctionPointer* functionTable_CORE [NVTX_CBID_CORE_SIZE + 1]; NvtxFunctionPointer* functionTable_CUDA [NVTX_CBID_CUDA_SIZE + 1]; NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1]; NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1]; NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE + 1]; NvtxFunctionPointer* functionTable_SYNC [NVTX_CBID_SYNC_SIZE + 1]; } nvtxGlobals_t; NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) = { NVTX_INIT_STATE_FRESH, { sizeof(NvtxExportTableCallbacks), NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable) }, { sizeof(NvtxExportTableVersionInfo), NVTX_VERSION, 0, NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion) }, /* Implementation function pointers */ NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init), 
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init), /* Tables of function pointers */ { 0, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr, 
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr, 0 }, { 0, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr, 0 }, { 0, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr, 0 }, { 0, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr, 0 }, { 0, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr, 
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr, 0 }, { 0, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr, 0 } }; /* ---- Define static inline implementations of core API functions ---- */ #include "nvtxImplCore.h" /* ---- Define implementations of export table functions ---- */ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)( NvtxCallbackModule module, NvtxFunctionTable* out_table, unsigned int* out_size) { unsigned int bytes = 0; NvtxFunctionTable table = (NvtxFunctionTable)0; switch (module) { case NVTX_CB_MODULE_CORE: table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE; bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE); break; case NVTX_CB_MODULE_CUDA: table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA; bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA); break; case NVTX_CB_MODULE_OPENCL: table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL; bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL); break; case NVTX_CB_MODULE_CUDART: table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART; bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART); break; case NVTX_CB_MODULE_CORE2: table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2; bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2); break; case NVTX_CB_MODULE_SYNC: table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC; bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC); break; default: return 0; } if (out_size) *out_size = (bytes / (unsigned int)sizeof(NvtxFunctionPointer*)) - 1; if (out_table) *out_table = table; return 1; } 
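/* Illustrative sketch: nvtxEtiGetModuleFunctionTable() above is what a tool's
 * injection library uses -- via the NVTX_ETID_CALLBACKS export table returned by
 * nvtxGetExportTable() below -- to obtain a module's table of function-pointer
 * slots and patch them. The tool side looks roughly like this, where `myMarkA`
 * is a hypothetical handler and the exact struct-member and callback-id names
 * come from the NVTX injection protocol headers (treat them as assumptions):
 *
 *   NvtxFunctionTable table = 0; unsigned int size = 0;
 *   etbl->GetModuleFunctionTable(NVTX_CB_MODULE_CORE, &table, &size);
 *   if (table && size > NVTX_CBID_CORE_MarkA)
 *       *table[NVTX_CBID_CORE_MarkA] = (NvtxFunctionPointer)myMarkA;
 *
 * Once a slot is overwritten, the static inline wrappers in nvtxImplCore.h call
 * straight into the tool. */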
NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId) { switch (exportTableId) { case NVTX_ETID_CALLBACKS: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks; case NVTX_ETID_VERSIONINFO: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo; default: return 0; } } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version) { /* Reserved for custom implementations to resolve problems with tools */ (void)version; } /* ---- Define implementations of init versions of all API functions ---- */ #include "nvtxInitDefs.h" /* ---- Define implementations of initialization functions ---- */ #include "nvtxInit.h" #ifdef __GNUC__ #pragma GCC visibility pop #endif #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxImplCore.h000066400000000000000000000234331463451655400231200ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib) { #ifndef NVTX_DISABLE nvtxMarkEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr; if(local!=0) (*local)(eventAttrib); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message) { #ifndef NVTX_DISABLE nvtxMarkA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr; if(local!=0) (*local)(message); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message) { #ifndef NVTX_DISABLE nvtxMarkW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr; if(local!=0) (*local)(message); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib) { #ifndef NVTX_DISABLE nvtxRangeStartEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr; if(local!=0) return (*local)(eventAttrib); else #endif /*NVTX_DISABLE*/ return (nvtxRangeId_t)0; } NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message) { #ifndef NVTX_DISABLE nvtxRangeStartA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr; if(local!=0) return (*local)(message); else #endif /*NVTX_DISABLE*/ return (nvtxRangeId_t)0; } NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message) { #ifndef NVTX_DISABLE nvtxRangeStartW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr; if(local!=0) return (*local)(message); else #endif /*NVTX_DISABLE*/ return (nvtxRangeId_t)0; } NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id) { #ifndef NVTX_DISABLE nvtxRangeEnd_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr; if(local!=0) (*local)(id); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib) { #ifndef NVTX_DISABLE nvtxRangePushEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr; if(local!=0) return (*local)(eventAttrib); else #endif /*NVTX_DISABLE*/ return (int)NVTX_NO_PUSH_POP_TRACKING; } NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message) { #ifndef NVTX_DISABLE 
nvtxRangePushA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr; if(local!=0) return (*local)(message); else #endif /*NVTX_DISABLE*/ return (int)NVTX_NO_PUSH_POP_TRACKING; } NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message) { #ifndef NVTX_DISABLE nvtxRangePushW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr; if(local!=0) return (*local)(message); else #endif /*NVTX_DISABLE*/ return (int)NVTX_NO_PUSH_POP_TRACKING; } NVTX_DECLSPEC int NVTX_API nvtxRangePop(void) { #ifndef NVTX_DISABLE nvtxRangePop_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr; if(local!=0) return (*local)(); else #endif /*NVTX_DISABLE*/ return (int)NVTX_NO_PUSH_POP_TRACKING; } NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name) { #ifndef NVTX_DISABLE nvtxNameCategoryA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr; if(local!=0) (*local)(category, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCategoryW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr; if(local!=0) (*local)(category, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name) { #ifndef NVTX_DISABLE nvtxNameOsThreadA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr; if(local!=0) (*local)(threadId, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameOsThreadW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr; if(local!=0) (*local)(threadId, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib) { #ifndef NVTX_DISABLE nvtxDomainMarkEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr; if(local!=0) (*local)(domain, eventAttrib); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib) { #ifndef NVTX_DISABLE nvtxDomainRangeStartEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr; if(local!=0) return (*local)(domain, eventAttrib); else #endif /*NVTX_DISABLE*/ return (nvtxRangeId_t)0; } NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id) { #ifndef NVTX_DISABLE nvtxDomainRangeEnd_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr; if(local!=0) (*local)(domain, id); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib) { #ifndef NVTX_DISABLE nvtxDomainRangePushEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr; if(local!=0) return (*local)(domain, eventAttrib); else #endif /*NVTX_DISABLE*/ return (int)NVTX_NO_PUSH_POP_TRACKING; } NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain) { #ifndef NVTX_DISABLE nvtxDomainRangePop_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr; if(local!=0) return (*local)(domain); else #endif /*NVTX_DISABLE*/ return 
(int)NVTX_NO_PUSH_POP_TRACKING; } NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs) { #ifndef NVTX_DISABLE nvtxDomainResourceCreate_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr; if(local!=0) return (*local)(domain, attribs); else #endif /*NVTX_DISABLE*/ return (nvtxResourceHandle_t)0; } NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource) { #ifndef NVTX_DISABLE nvtxDomainResourceDestroy_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr; if(local!=0) (*local)(resource); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name) { #ifndef NVTX_DISABLE nvtxDomainNameCategoryA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr; if(local!=0) (*local)(domain, category, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxDomainNameCategoryW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr; if(local!=0) (*local)(domain, category, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string) { #ifndef NVTX_DISABLE nvtxDomainRegisterStringA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr; if(local!=0) return (*local)(domain, string); else #endif /*NVTX_DISABLE*/ return (nvtxStringHandle_t)0; } NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string) { #ifndef NVTX_DISABLE nvtxDomainRegisterStringW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr; if(local!=0) return (*local)(domain, string); else #endif /*NVTX_DISABLE*/ return (nvtxStringHandle_t)0; } NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* message) { #ifndef NVTX_DISABLE nvtxDomainCreateA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr; if(local!=0) return (*local)(message); else #endif /*NVTX_DISABLE*/ return (nvtxDomainHandle_t)0; } NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* message) { #ifndef NVTX_DISABLE nvtxDomainCreateW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr; if(local!=0) return (*local)(message); else #endif /*NVTX_DISABLE*/ return (nvtxDomainHandle_t)0; } NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain) { #ifndef NVTX_DISABLE nvtxDomainDestroy_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr; if(local!=0) (*local)(domain); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved) { #ifndef NVTX_DISABLE nvtxInitialize_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr; if(local!=0) (*local)(reserved); #endif /*NVTX_DISABLE*/ } nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h000066400000000000000000000060401463451655400240150ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. 
* See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD_CUDART #error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined). #endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name); typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name); typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name); typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name); typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name); typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name); NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name) { #ifndef NVTX_DISABLE nvtxNameCudaDeviceA_impl_fntype local = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr; if(local!=0) (*local)(device, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCudaDeviceW_impl_fntype local = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr; if(local!=0) (*local)(device, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name) { #ifndef NVTX_DISABLE nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr; if(local!=0) (*local)(stream, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr; if(local!=0) (*local)(stream, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name) { #ifndef NVTX_DISABLE nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr; if(local!=0) (*local)(event, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr; if(local!=0) (*local)(event, name); #endif /*NVTX_DISABLE*/ } #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h000066400000000000000000000074531463451655400235200ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD_CUDA #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). 
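/* Illustrative note: the functions defined below are thin trampolines through
 * the nvtxGlobals function-pointer slots. For example, an application may call
 *
 *   nvtxNameCuStreamA(stream, "H2D copy");
 *
 * after creating a CUstream so that an attached tool can display a readable
 * name; when no tool is attached, the slot resolves to a no-op and the call
 * returns immediately. */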
#endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name); typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name); typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name); typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name); typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name); typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name); typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name); typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name); NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name) { #ifndef NVTX_DISABLE nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr; if(local!=0) (*local)(device, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr; if(local!=0) (*local)(device, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name) { #ifndef NVTX_DISABLE nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr; if(local!=0) (*local)(context, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr; if(local!=0) (*local)(context, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name) { #ifndef NVTX_DISABLE nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr; if(local!=0) (*local)(stream, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr; if(local!=0) (*local)(stream, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name) { #ifndef NVTX_DISABLE nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr; if(local!=0) (*local)(event, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr; if(local!=0) (*local)(event, name); #endif /*NVTX_DISABLE*/ } #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h000066400000000000000000000147201463451655400237570ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 
* * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD_OPENCL #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). #endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ typedef void (NVTX_API * nvtxNameClDeviceA_impl_fntype)(cl_device_id device, const char* name); typedef void (NVTX_API * nvtxNameClDeviceW_impl_fntype)(cl_device_id device, const wchar_t* name); typedef void (NVTX_API * nvtxNameClContextA_impl_fntype)(cl_context context, const char* name); typedef void (NVTX_API * nvtxNameClContextW_impl_fntype)(cl_context context, const wchar_t* name); typedef void (NVTX_API * nvtxNameClCommandQueueA_impl_fntype)(cl_command_queue command_queue, const char* name); typedef void (NVTX_API * nvtxNameClCommandQueueW_impl_fntype)(cl_command_queue command_queue, const wchar_t* name); typedef void (NVTX_API * nvtxNameClMemObjectA_impl_fntype)(cl_mem memobj, const char* name); typedef void (NVTX_API * nvtxNameClMemObjectW_impl_fntype)(cl_mem memobj, const wchar_t* name); typedef void (NVTX_API * nvtxNameClSamplerA_impl_fntype)(cl_sampler sampler, const char* name); typedef void (NVTX_API * nvtxNameClSamplerW_impl_fntype)(cl_sampler sampler, const wchar_t* name); typedef void (NVTX_API * nvtxNameClProgramA_impl_fntype)(cl_program program, const char* name); typedef void (NVTX_API * nvtxNameClProgramW_impl_fntype)(cl_program program, const wchar_t* name); typedef void (NVTX_API * nvtxNameClEventA_impl_fntype)(cl_event evnt, const char* name); typedef void (NVTX_API * nvtxNameClEventW_impl_fntype)(cl_event evnt, const wchar_t* name); NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name) { #ifndef NVTX_DISABLE nvtxNameClDeviceA_impl_fntype local = (nvtxNameClDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr; if(local!=0) (*local)(device, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameClDeviceW_impl_fntype local = (nvtxNameClDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr; if(local!=0) (*local)(device, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name) { #ifndef NVTX_DISABLE nvtxNameClContextA_impl_fntype local = (nvtxNameClContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr; if(local!=0) (*local)(context, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameClContextW_impl_fntype local = (nvtxNameClContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr; if(local!=0) (*local)(context, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name) { #ifndef NVTX_DISABLE nvtxNameClCommandQueueA_impl_fntype local = (nvtxNameClCommandQueueA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr; if(local!=0) (*local)(command_queue, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name) { #ifndef NVTX_DISABLE 
nvtxNameClCommandQueueW_impl_fntype local = (nvtxNameClCommandQueueW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr; if(local!=0) (*local)(command_queue, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name) { #ifndef NVTX_DISABLE nvtxNameClMemObjectA_impl_fntype local = (nvtxNameClMemObjectA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr; if(local!=0) (*local)(memobj, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameClMemObjectW_impl_fntype local = (nvtxNameClMemObjectW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr; if(local!=0) (*local)(memobj, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name) { #ifndef NVTX_DISABLE nvtxNameClSamplerA_impl_fntype local = (nvtxNameClSamplerA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr; if(local!=0) (*local)(sampler, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameClSamplerW_impl_fntype local = (nvtxNameClSamplerW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr; if(local!=0) (*local)(sampler, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name) { #ifndef NVTX_DISABLE nvtxNameClProgramA_impl_fntype local = (nvtxNameClProgramA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr; if(local!=0) (*local)(program, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameClProgramW_impl_fntype local = (nvtxNameClProgramW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr; if(local!=0) (*local)(program, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name) { #ifndef NVTX_DISABLE nvtxNameClEventA_impl_fntype local = (nvtxNameClEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr; if(local!=0) (*local)(evnt, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameClEventW_impl_fntype local = (nvtxNameClEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr; if(local!=0) (*local)(evnt, name); #endif /*NVTX_DISABLE*/ } #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h000066400000000000000000000064331463451655400235550ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD_SYNC #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). 
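/* Editorial note: from an application's point of view, the OpenCL naming
 * wrappers above are what a profiler user calls to attach human-readable names
 * to OpenCL objects. The short usage sketch below assumes the public
 * nvToolsExtOpenCL.h and CL/cl.h headers are on the include path (neither is
 * part of this file) and that the caller already created the objects. */
#include <CL/cl.h>
#include <nvToolsExtOpenCL.h>

static void demo_label_cl_objects(cl_device_id dev, cl_context ctx, cl_command_queue queue)
{
    /* Each call is a silent no-op unless a tool attached an implementation. */
    nvtxNameClDeviceA(dev, "primary OpenCL device");
    nvtxNameClContextA(ctx, "render context");
    nvtxNameClCommandQueueA(queue, "upload queue");
}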
#endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs); typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle); NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs) { #ifndef NVTX_DISABLE nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr; if(local!=0) return (*local)(domain, attribs); else #endif /*NVTX_DISABLE*/ return (nvtxSyncUser_t)0; } NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle) { #ifndef NVTX_DISABLE nvtxDomainSyncUserDestroy_impl_fntype local = (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr; if(local!=0) (*local)(handle); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle) { #ifndef NVTX_DISABLE nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr; if(local!=0) (*local)(handle); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle) { #ifndef NVTX_DISABLE nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr; if(local!=0) (*local)(handle); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle) { #ifndef NVTX_DISABLE nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr; if(local!=0) (*local)(handle); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle) { #ifndef NVTX_DISABLE nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr; if(local!=0) (*local)(handle); #endif /*NVTX_DISABLE*/ } #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxInit.h000066400000000000000000000313621463451655400223110ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
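/* Editorial note: the nvtxDomainSyncUser* dispatchers defined just above let an
 * application annotate a user-built synchronization object so tools can follow
 * its acquire/release activity. The sketch below shows a typical call sequence;
 * the attribute field names (version, size, messageType, message.ascii) and the
 * NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE macro are assumed from the public
 * nvToolsExtSync.h header and are not defined in this file. */
#include <string.h>
#include <nvToolsExtSync.h>

static nvtxSyncUser_t demo_register_lock(nvtxDomainHandle_t domain)
{
    nvtxSyncUserAttributes_t attribs;
    memset(&attribs, 0, sizeof(attribs));
    attribs.version = NVTX_VERSION;                    /* assumed field */
    attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;   /* assumed macro */
    attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;     /* assumed field */
    attribs.message.ascii = "demo mutex";              /* assumed field */
    /* Returns (nvtxSyncUser_t)0 when no tool is attached -- see above. */
    return nvtxDomainSyncUserCreate(domain, &attribs);
}

static void demo_lock_unlock(nvtxSyncUser_t sync)
{
    nvtxDomainSyncUserAcquireStart(sync);
    /* ... attempt to take the real lock here ... */
    nvtxDomainSyncUserAcquireSuccess(sync);  /* or AcquireFailed after a failed try-lock */
    /* ... critical section ... */
    nvtxDomainSyncUserReleasing(sync);       /* just before the real unlock */
}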
#endif /* ---- Platform-independent helper definitions and functions ---- */ /* Prefer macros over inline functions to reduce symbol resolution at link time */ #if defined(_WIN32) #define NVTX_PATHCHAR wchar_t #define NVTX_STR(x) L##x #define NVTX_GETENV _wgetenv #define NVTX_BUFSIZE 16384 #define NVTX_DLLHANDLE HMODULE #define NVTX_DLLOPEN(x) LoadLibraryW(x) #define NVTX_DLLFUNC GetProcAddress #define NVTX_DLLCLOSE FreeLibrary #define NVTX_YIELD() SwitchToThread() #define NVTX_MEMBAR() MemoryBarrier() #define NVTX_ATOMIC_WRITE_32(address, value) InterlockedExchange((volatile LONG*)address, value) #define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand) #elif defined(__GNUC__) #define NVTX_PATHCHAR char #define NVTX_STR(x) x #define NVTX_GETENV getenv #define NVTX_BUFSIZE 16384 #define NVTX_DLLHANDLE void* #define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY) #define NVTX_DLLFUNC dlsym #define NVTX_DLLCLOSE dlclose #define NVTX_YIELD() sched_yield() #define NVTX_MEMBAR() __sync_synchronize() /* Ensure full memory barrier for atomics, to match Windows functions */ #define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value) #define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand) #else #error The library does not support your configuration! #endif /* Define this to 1 for platforms that where pre-injected libraries can be discovered. */ #if defined(_WIN32) /* TODO */ #define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0 #else #define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0 #endif /* Define this to 1 for platforms that support environment variables */ /* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */ /* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */ #define NVTX_SUPPORT_ENV_VARS 1 /* Define this to 1 for platforms that support dynamic/shared libraries */ #define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1 /* Injection libraries implementing InitializeInjectionNvtx2 may be statically linked, * and this will override any dynamic injection. Useful for platforms where dynamic * injection is not available. Since weak symbols not explicitly marked extern are * guaranteed to be initialized to zero if no definitions are found by the linker, the * dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */ #if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__) #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1 /* To statically inject an NVTX library, define InitializeInjectionNvtx2_fnptr as a normal * symbol (not weak) pointing to the implementation of InitializeInjectionNvtx2 (which * does not need to be named "InitializeInjectionNvtx2" as is necessary in a dynamic * injection library. */ __attribute__((weak)) NvtxInitializeInjectionNvtxFunc_t InitializeInjectionNvtx2_fnptr; #else #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0 #endif /* This function tries to find or load an NVTX injection library and get the * address of its InitializeInjection2 function. If such a function pointer * is found, it is called, and passed the address of this NVTX instance's * nvtxGetExportTable function, so the injection can attach to this instance. * If the initialization fails for any reason, any dynamic library loaded will * be freed, and all NVTX implementation functions will be set to no-ops. 
If * initialization succeeds, NVTX functions not attached to the tool will be set * to no-ops. This is implemented as one function instead of several small * functions to minimize the number of weak symbols the linker must resolve. * Order of search is: * - Pre-injected library exporting InitializeInjectionNvtx2 * - Loadable library exporting InitializeInjectionNvtx2 * - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64) * - On Android, libNvtxInjection??.so within the package (?? is 32 or 64) * - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr */ NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void); NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void) { const char* const initFuncName = "InitializeInjectionNvtx2"; NvtxInitializeInjectionNvtxFunc_t init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)0; NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0; int entryPointStatus = 0; #if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY /* Use POSIX global symbol chain to query for init function from any module */ init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(0, initFuncName); #endif #if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY /* Try discovering dynamic injection library to load */ if (!init_fnptr) { #if NVTX_SUPPORT_ENV_VARS /* If env var NVTX_INJECTION64_PATH is set, it should contain the path * to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */ const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4) ? NVTX_STR("NVTX_INJECTION32_PATH") : NVTX_STR("NVTX_INJECTION64_PATH"); #endif /* NVTX_SUPPORT_ENV_VARS */ NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE]; const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0; /* Refer to this variable explicitly in case all references to it are #if'ed out */ (void)injectionLibraryPathBuf; #if NVTX_SUPPORT_ENV_VARS /* Disable the warning for getenv & _wgetenv -- this usage is safe because * these functions are not called again before using the returned value. */ #if defined(_MSC_VER) #pragma warning( push ) #pragma warning( disable : 4996 ) #endif injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName); #if defined(_MSC_VER) #pragma warning( pop ) #endif #endif #if defined(__ANDROID__) if (!injectionLibraryPath) { const char *bits = (sizeof(void*) == 4) ? "32" : "64"; char cmdlineBuf[32]; char pkgName[PATH_MAX]; int count; int pid; FILE *fp; size_t bytesRead; size_t pos; pid = (int)getpid(); count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid); if (count <= 0 || count >= (int)sizeof(cmdlineBuf)) { NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid); return NVTX_ERR_INIT_ACCESS_LIBRARY; } fp = fopen(cmdlineBuf, "r"); if (!fp) { NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf); return NVTX_ERR_INIT_ACCESS_LIBRARY; } bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp); fclose(fp); if (bytesRead == 0) { NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf); return NVTX_ERR_INIT_ACCESS_LIBRARY; } pkgName[bytesRead] = 0; /* String can contain colon as a process separator. In this case the package name is before the colon. 
*/ pos = 0; while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0') { ++pos; } pkgName[pos] = 0; count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits); if (count <= 0 || count >= NVTX_BUFSIZE) { NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits); return NVTX_ERR_INIT_ACCESS_LIBRARY; } /* On Android, verify path is accessible due to aggressive file access restrictions. */ /* For dlopen, if the filename contains a leading slash, then it is interpreted as a */ /* relative or absolute pathname; otherwise it will follow the rules in ld.so. */ if (injectionLibraryPathBuf[0] == '/') { #if (__ANDROID_API__ < 21) int access_err = access(injectionLibraryPathBuf, F_OK | R_OK); #else int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0); #endif if (access_err != 0) { NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf); return NVTX_ERR_INIT_ACCESS_LIBRARY; } } injectionLibraryPath = injectionLibraryPathBuf; } #endif /* At this point, injectionLibraryPath is specified if a dynamic * injection library was specified by a tool. */ if (injectionLibraryPath) { /* Load the injection library */ injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath); if (!injectionLibraryHandle) { NVTX_ERR("Failed to load injection library\n"); return NVTX_ERR_INIT_LOAD_LIBRARY; } else { /* Attempt to get the injection library's entry-point */ init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName); if (!init_fnptr) { NVTX_DLLCLOSE(injectionLibraryHandle); NVTX_ERR("Failed to get address of function InitializeInjectionNvtx2 from injection library\n"); return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT; } } } } #endif #if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY if (!init_fnptr) { /* Check weakly-defined function pointer. A statically-linked injection can define this as * a normal symbol and it will take precedence over a dynamic injection. */ if (InitializeInjectionNvtx2_fnptr) { init_fnptr = InitializeInjectionNvtx2_fnptr; } } #endif /* At this point, if init_fnptr is not set, then no tool has specified * an NVTX injection library -- return non-success result so all NVTX * API functions will be set to no-ops. */ if (!init_fnptr) { return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE; } /* Invoke injection library's initialization function. If it returns * 0 (failure) and a dynamic injection was loaded, unload it. 
*/ entryPointStatus = init_fnptr(NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)); if (entryPointStatus == 0) { NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n"); if (injectionLibraryHandle) { NVTX_DLLCLOSE(injectionLibraryHandle); } return NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT; } return NVTX_SUCCESS; } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void) { unsigned int old; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState == NVTX_INIT_STATE_COMPLETE) { return; } NVTX_ATOMIC_CAS_32( old, &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState, NVTX_INIT_STATE_STARTED, NVTX_INIT_STATE_FRESH); if (old == NVTX_INIT_STATE_FRESH) { int result; int forceAllToNoops; /* Load & initialize injection library -- it will assign the function pointers */ result = NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(); /* Set all pointers not assigned by the injection to null */ forceAllToNoops = result != NVTX_SUCCESS; /* Set all to null if injection init failed */ NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(forceAllToNoops); /* Signal that initialization has finished, so now the assigned function pointers will be used */ NVTX_ATOMIC_WRITE_32( &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState, NVTX_INIT_STATE_COMPLETE); } else /* Spin-wait until initialization has finished */ { NVTX_MEMBAR(); while (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState != NVTX_INIT_STATE_COMPLETE) { NVTX_YIELD(); NVTX_MEMBAR(); } } } nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxInitDecls.h000066400000000000000000000226201463451655400232610ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
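/* Editorial note: the loader above spells out the contract an NVTX injection
 * library must meet -- export a function named InitializeInjectionNvtx2, return
 * non-zero from it on success, and (optionally) be located through the
 * NVTX_INJECTION64_PATH / NVTX_INJECTION32_PATH environment variables. A
 * minimal skeleton is sketched below; the NvtxGetExportTableFunc_t type is
 * assumed to come from the public nvToolsExt.h headers, and all file and
 * message names are illustrative only. */

/* demo_injection.c -- build as a shared library, for example:
 *   cc -shared -fPIC -o libdemo_nvtx_injection.so demo_injection.c
 * then run the application with:
 *   export NVTX_INJECTION64_PATH=/path/to/libdemo_nvtx_injection.so */
#include <stdio.h>
#include <nvToolsExt.h>

int InitializeInjectionNvtx2(NvtxGetExportTableFunc_t getExportTable)
{
    /* A real tool would call getExportTable() to obtain the NVTX export tables
     * and install its own implementations into the nvtxGlobals function
     * pointers; this demo only reports that it was loaded. */
    (void)getExportTable;
    fprintf(stderr, "demo NVTX injection attached\n");
    return 1;  /* non-zero = success; returning 0 makes NVTX unload the library */
}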
#endif NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void 
NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API 
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle); nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxInitDefs.h000066400000000000000000001055471463451655400231220ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
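/* Editorial note: the *_impl_init definitions that follow are lazy-init
 * trampolines. Each public NVTX entry point initially dispatches to its
 * *_impl_init twin, which runs nvtxInitOnce() (loading the injection library
 * at most once) and then re-invokes the public entry point, by then routed to
 * either the tool's implementation or a no-op. The standalone sketch below
 * shows this self-initializing function-pointer idiom with hypothetical names
 * (do_work, do_work_bootstrap); unlike NVTX it makes no attempt to be
 * thread-safe. */
#include <stdio.h>

static void do_work_real(int x);
static void do_work_bootstrap(int x);

/* The public entry point dispatches through this pointer, which starts out
 * aimed at the bootstrap trampoline. */
static void (*do_work_ptr)(int) = do_work_bootstrap;

static void do_work(int x) { do_work_ptr(x); }

static void do_work_bootstrap(int x)
{
    /* one-time setup would happen here (cf. nvtxInitOnce) ... */
    do_work_ptr = do_work_real;  /* retarget the dispatch pointer ... */
    do_work(x);                  /* ... then replay the original call */
}

static void do_work_real(int x) { printf("work(%d)\n", x); }

int main(void)
{
    do_work(1);  /* first call initializes, then does the work */
    do_work(2);  /* later calls go straight to do_work_real */
    return 0;
}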
#endif NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxMarkEx(eventAttrib); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxMarkA(message); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxMarkW(message); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangeStartEx(eventAttrib); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangeStartA(message); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangeStartW(message); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxRangeEnd(id); } NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangePushEx(eventAttrib); } NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangePushA(message); } NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangePushW(message); } NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangePop(); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxNameCategoryA(category, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxNameCategoryW(category, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxNameOsThreadA(threadId, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxNameOsThreadW(threadId, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxDomainMarkEx(domain, eventAttrib); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return 
nvtxDomainRangeStartEx(domain, eventAttrib); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxDomainRangeEnd(domain, id); } NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainRangePushEx(domain, eventAttrib); } NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainRangePop(domain); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainResourceCreate(domain, attribs); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxDomainResourceDestroy(resource); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxDomainNameCategoryA(domain, category, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxDomainNameCategoryW(domain, category, name); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainRegisterStringA(domain, string); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainRegisterStringW(domain, string); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainCreateA(message); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainCreateW(message); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxDomainDestroy(domain); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxInitialize(reserved); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name){ nvtxNameCuDeviceA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr; if (local) local(device, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API 
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name){ nvtxNameCuDeviceW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr; if (local) local(device, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name){ nvtxNameCuContextA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr; if (local) local(context, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name){ nvtxNameCuContextW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr; if (local) local(context, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name){ nvtxNameCuStreamA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr; if (local) local(stream, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name){ nvtxNameCuStreamW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr; if (local) local(stream, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name){ nvtxNameCuEventA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr; if (local) local(event, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name){ nvtxNameCuEventW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr; if (local) local(event, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name){ nvtxNameCudaDeviceA_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr; if (local) local(device, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name){ nvtxNameCudaDeviceW_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr; if (local) local(device, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name){ nvtxNameCudaStreamA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr; if (local) local(stream, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name){ nvtxNameCudaStreamW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); 
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr; if (local) local(stream, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name){ nvtxNameCudaEventA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr; if (local) local(event, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name){ nvtxNameCudaEventW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr; if (local) local(event, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name){ nvtxNameClDeviceA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr; if (local) local(device, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name){ nvtxNameClDeviceW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr; if (local) local(device, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name){ nvtxNameClContextA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr; if (local) local(context, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name){ nvtxNameClContextW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr; if (local) local(context, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name){ nvtxNameClCommandQueueA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr; if (local) local(command_queue, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name){ nvtxNameClCommandQueueW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr; if (local) local(command_queue, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name){ nvtxNameClMemObjectA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr; if (local) local(memobj, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name){ nvtxNameClMemObjectW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = 
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr; if (local) local(memobj, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name){ nvtxNameClSamplerA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr; if (local) local(sampler, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name){ nvtxNameClSamplerW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr; if (local) local(sampler, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name){ nvtxNameClProgramA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr; if (local) local(program, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name){ nvtxNameClProgramW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr; if (local) local(program, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name){ nvtxNameClEventA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr; if (local) local(evnt, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name){ nvtxNameClEventW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr; if (local) local(evnt, name); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs){ nvtxDomainSyncUserCreate_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr; if (local) { return local(domain, attribs); } return (nvtxSyncUser_t)0; } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle){ nvtxDomainSyncUserDestroy_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr; if (local) local(handle); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle){ nvtxDomainSyncUserAcquireStart_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr; if (local) local(handle); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle){ nvtxDomainSyncUserAcquireFailed_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = 
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr; if (local) local(handle); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle){ nvtxDomainSyncUserAcquireSuccess_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr; if (local) local(handle); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle){ nvtxDomainSyncUserReleasing_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr; if (local) local(handle); } NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops); NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops) { if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init) || forceAllToNoops) 
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr == 
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init) || forceAllToNoops) 
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init) || forceAllToNoops) 
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr = NULL; } nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h000066400000000000000000000100251463451655400231010ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef __NVTX_LINKONCE_H__ #define __NVTX_LINKONCE_H__ /* This header defines macros to permit making definitions of global variables * and functions in C/C++ header files which may be included multiple times in * a translation unit or linkage unit. It allows authoring header-only libraries * which can be used by multiple other header-only libraries (either as the same * copy or multiple copies), and does not require any build changes, such as * adding another .c file, linking a static library, or deploying a dynamic * library. Globals defined with these macros have the property that they have * the same address, pointing to a single instance, for the entire linkage unit. * It is expected but not guaranteed that each linkage unit will have a separate * instance. * * In some situations it is desirable to declare a variable without initializing * it, refer to it in code or other variables' initializers, and then initialize * it later. Similarly, functions can be prototyped, have their address taken, * and then have their body defined later. In such cases, use the FWDDECL macros * when forward-declaring LINKONCE global variables without initializers and * function prototypes, and then use the DEFINE macros when later defining them. * Although in many cases the FWDDECL macro is equivalent to the DEFINE macro, * following this pattern makes code maximally portable. 
*/ #if defined(__MINGW32__) /* MinGW */ #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) #if defined(__cplusplus) #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline NVTX_LINKONCE_WEAK #else #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK #endif #elif defined(_MSC_VER) /* MSVC */ #if defined(__cplusplus) #define NVTX_LINKONCE_DEFINE_GLOBAL extern "C" __declspec(selectany) #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline #else #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) #define NVTX_LINKONCE_DEFINE_FUNCTION __inline #endif #elif defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin */ #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) #if defined(__cplusplus) #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_WEAK #else #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK #endif #elif defined(__CYGWIN__) /* Assume GCC or compatible */ #define NVTX_LINKONCE_WEAK __attribute__((weak)) #if defined(__cplusplus) #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline #else #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK #endif #else /* All others: Assume GCC, clang, or compatible */ #define NVTX_LINKONCE_WEAK __attribute__((weak)) #define NVTX_LINKONCE_HIDDEN __attribute__((visibility("hidden"))) #if defined(__cplusplus) #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_HIDDEN inline #else #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK #endif #endif #define NVTX_LINKONCE_FWDDECL_GLOBAL NVTX_LINKONCE_DEFINE_GLOBAL extern #define NVTX_LINKONCE_FWDDECL_FUNCTION NVTX_LINKONCE_DEFINE_FUNCTION #endif /* __NVTX_LINKONCE_H__ */ nccl-2.22.3-1/src/include/nvtx3/nvtxDetail/nvtxTypes.h000066400000000000000000000366251463451655400225210ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ /* This header defines types which are used by the internal implementation * of NVTX and callback subscribers. API clients do not use these types, * so they are defined here instead of in nvToolsExt.h to clarify they are * not part of the NVTX client API. */ #ifndef NVTX_IMPL_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h. #endif /* ------ Dependency-free types binary-compatible with real types ------- */ /* In order to avoid having the NVTX core API headers depend on non-NVTX * headers like cuda.h, NVTX defines binary-compatible types to use for * safely making the initialization versions of all NVTX functions without * needing to have definitions for the real types. 
*/ typedef int nvtx_CUdevice; typedef void* nvtx_CUcontext; typedef void* nvtx_CUstream; typedef void* nvtx_CUevent; typedef void* nvtx_cudaStream_t; typedef void* nvtx_cudaEvent_t; typedef void* nvtx_cl_platform_id; typedef void* nvtx_cl_device_id; typedef void* nvtx_cl_context; typedef void* nvtx_cl_command_queue; typedef void* nvtx_cl_mem; typedef void* nvtx_cl_program; typedef void* nvtx_cl_kernel; typedef void* nvtx_cl_event; typedef void* nvtx_cl_sampler; typedef struct nvtxSyncUser* nvtxSyncUser_t; struct nvtxSyncUserAttributes_v0; typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t; /* --------- Types for function pointers (with fake API types) ---------- */ typedef void (NVTX_API * nvtxMarkEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib); typedef void (NVTX_API * nvtxMarkA_impl_fntype)(const char* message); typedef void (NVTX_API * nvtxMarkW_impl_fntype)(const wchar_t* message); typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib); typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartA_impl_fntype)(const char* message); typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartW_impl_fntype)(const wchar_t* message); typedef void (NVTX_API * nvtxRangeEnd_impl_fntype)(nvtxRangeId_t id); typedef int (NVTX_API * nvtxRangePushEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib); typedef int (NVTX_API * nvtxRangePushA_impl_fntype)(const char* message); typedef int (NVTX_API * nvtxRangePushW_impl_fntype)(const wchar_t* message); typedef int (NVTX_API * nvtxRangePop_impl_fntype)(void); typedef void (NVTX_API * nvtxNameCategoryA_impl_fntype)(uint32_t category, const char* name); typedef void (NVTX_API * nvtxNameCategoryW_impl_fntype)(uint32_t category, const wchar_t* name); typedef void (NVTX_API * nvtxNameOsThreadA_impl_fntype)(uint32_t threadId, const char* name); typedef void (NVTX_API * nvtxNameOsThreadW_impl_fntype)(uint32_t threadId, const wchar_t* name); /* Real impl types are defined in nvtxImplCuda_v3.h, where CUDA headers are included */ typedef void (NVTX_API * nvtxNameCuDeviceA_fakeimpl_fntype)(nvtx_CUdevice device, const char* name); typedef void (NVTX_API * nvtxNameCuDeviceW_fakeimpl_fntype)(nvtx_CUdevice device, const wchar_t* name); typedef void (NVTX_API * nvtxNameCuContextA_fakeimpl_fntype)(nvtx_CUcontext context, const char* name); typedef void (NVTX_API * nvtxNameCuContextW_fakeimpl_fntype)(nvtx_CUcontext context, const wchar_t* name); typedef void (NVTX_API * nvtxNameCuStreamA_fakeimpl_fntype)(nvtx_CUstream stream, const char* name); typedef void (NVTX_API * nvtxNameCuStreamW_fakeimpl_fntype)(nvtx_CUstream stream, const wchar_t* name); typedef void (NVTX_API * nvtxNameCuEventA_fakeimpl_fntype)(nvtx_CUevent event, const char* name); typedef void (NVTX_API * nvtxNameCuEventW_fakeimpl_fntype)(nvtx_CUevent event, const wchar_t* name); /* Real impl types are defined in nvtxImplOpenCL_v3.h, where OPENCL headers are included */ typedef void (NVTX_API * nvtxNameClDeviceA_fakeimpl_fntype)(nvtx_cl_device_id device, const char* name); typedef void (NVTX_API * nvtxNameClDeviceW_fakeimpl_fntype)(nvtx_cl_device_id device, const wchar_t* name); typedef void (NVTX_API * nvtxNameClContextA_fakeimpl_fntype)(nvtx_cl_context context, const char* name); typedef void (NVTX_API * nvtxNameClContextW_fakeimpl_fntype)(nvtx_cl_context context, const wchar_t* name); typedef void (NVTX_API * nvtxNameClCommandQueueA_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const char* name); typedef void (NVTX_API * 
nvtxNameClCommandQueueW_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const wchar_t* name); typedef void (NVTX_API * nvtxNameClMemObjectA_fakeimpl_fntype)(nvtx_cl_mem memobj, const char* name); typedef void (NVTX_API * nvtxNameClMemObjectW_fakeimpl_fntype)(nvtx_cl_mem memobj, const wchar_t* name); typedef void (NVTX_API * nvtxNameClSamplerA_fakeimpl_fntype)(nvtx_cl_sampler sampler, const char* name); typedef void (NVTX_API * nvtxNameClSamplerW_fakeimpl_fntype)(nvtx_cl_sampler sampler, const wchar_t* name); typedef void (NVTX_API * nvtxNameClProgramA_fakeimpl_fntype)(nvtx_cl_program program, const char* name); typedef void (NVTX_API * nvtxNameClProgramW_fakeimpl_fntype)(nvtx_cl_program program, const wchar_t* name); typedef void (NVTX_API * nvtxNameClEventA_fakeimpl_fntype)(nvtx_cl_event evnt, const char* name); typedef void (NVTX_API * nvtxNameClEventW_fakeimpl_fntype)(nvtx_cl_event evnt, const wchar_t* name); /* Real impl types are defined in nvtxImplCudaRt_v3.h, where CUDART headers are included */ typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name); typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name); typedef void (NVTX_API * nvtxNameCudaStreamA_fakeimpl_fntype)(nvtx_cudaStream_t stream, const char* name); typedef void (NVTX_API * nvtxNameCudaStreamW_fakeimpl_fntype)(nvtx_cudaStream_t stream, const wchar_t* name); typedef void (NVTX_API * nvtxNameCudaEventA_fakeimpl_fntype)(nvtx_cudaEvent_t event, const char* name); typedef void (NVTX_API * nvtxNameCudaEventW_fakeimpl_fntype)(nvtx_cudaEvent_t event, const wchar_t* name); typedef void (NVTX_API * nvtxDomainMarkEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); typedef nvtxRangeId_t (NVTX_API * nvtxDomainRangeStartEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); typedef void (NVTX_API * nvtxDomainRangeEnd_impl_fntype)(nvtxDomainHandle_t domain, nvtxRangeId_t id); typedef int (NVTX_API * nvtxDomainRangePushEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); typedef int (NVTX_API * nvtxDomainRangePop_impl_fntype)(nvtxDomainHandle_t domain); typedef nvtxResourceHandle_t (NVTX_API * nvtxDomainResourceCreate_impl_fntype)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs); typedef void (NVTX_API * nvtxDomainResourceDestroy_impl_fntype)(nvtxResourceHandle_t resource); typedef void (NVTX_API * nvtxDomainNameCategoryA_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const char* name); typedef void (NVTX_API * nvtxDomainNameCategoryW_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name); typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringA_impl_fntype)(nvtxDomainHandle_t domain, const char* string); typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringW_impl_fntype)(nvtxDomainHandle_t domain, const wchar_t* string); typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateA_impl_fntype)(const char* message); typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateW_impl_fntype)(const wchar_t* message); typedef void (NVTX_API * nvtxDomainDestroy_impl_fntype)(nvtxDomainHandle_t domain); typedef void (NVTX_API * nvtxInitialize_impl_fntype)(const void* reserved); typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs); typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle); typedef void 
(NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle); /* ---------------- Types for callback subscription --------------------- */ typedef const void *(NVTX_API * NvtxGetExportTableFunc_t)(uint32_t exportTableId); typedef int (NVTX_API * NvtxInitializeInjectionNvtxFunc_t)(NvtxGetExportTableFunc_t exportTable); typedef enum NvtxCallbackModule { NVTX_CB_MODULE_INVALID = 0, NVTX_CB_MODULE_CORE = 1, NVTX_CB_MODULE_CUDA = 2, NVTX_CB_MODULE_OPENCL = 3, NVTX_CB_MODULE_CUDART = 4, NVTX_CB_MODULE_CORE2 = 5, NVTX_CB_MODULE_SYNC = 6, /* --- New constants must only be added directly above this line --- */ NVTX_CB_MODULE_SIZE, NVTX_CB_MODULE_FORCE_INT = 0x7fffffff } NvtxCallbackModule; typedef enum NvtxCallbackIdCore { NVTX_CBID_CORE_INVALID = 0, NVTX_CBID_CORE_MarkEx = 1, NVTX_CBID_CORE_MarkA = 2, NVTX_CBID_CORE_MarkW = 3, NVTX_CBID_CORE_RangeStartEx = 4, NVTX_CBID_CORE_RangeStartA = 5, NVTX_CBID_CORE_RangeStartW = 6, NVTX_CBID_CORE_RangeEnd = 7, NVTX_CBID_CORE_RangePushEx = 8, NVTX_CBID_CORE_RangePushA = 9, NVTX_CBID_CORE_RangePushW = 10, NVTX_CBID_CORE_RangePop = 11, NVTX_CBID_CORE_NameCategoryA = 12, NVTX_CBID_CORE_NameCategoryW = 13, NVTX_CBID_CORE_NameOsThreadA = 14, NVTX_CBID_CORE_NameOsThreadW = 15, /* --- New constants must only be added directly above this line --- */ NVTX_CBID_CORE_SIZE, NVTX_CBID_CORE_FORCE_INT = 0x7fffffff } NvtxCallbackIdCore; typedef enum NvtxCallbackIdCore2 { NVTX_CBID_CORE2_INVALID = 0, NVTX_CBID_CORE2_DomainMarkEx = 1, NVTX_CBID_CORE2_DomainRangeStartEx = 2, NVTX_CBID_CORE2_DomainRangeEnd = 3, NVTX_CBID_CORE2_DomainRangePushEx = 4, NVTX_CBID_CORE2_DomainRangePop = 5, NVTX_CBID_CORE2_DomainResourceCreate = 6, NVTX_CBID_CORE2_DomainResourceDestroy = 7, NVTX_CBID_CORE2_DomainNameCategoryA = 8, NVTX_CBID_CORE2_DomainNameCategoryW = 9, NVTX_CBID_CORE2_DomainRegisterStringA = 10, NVTX_CBID_CORE2_DomainRegisterStringW = 11, NVTX_CBID_CORE2_DomainCreateA = 12, NVTX_CBID_CORE2_DomainCreateW = 13, NVTX_CBID_CORE2_DomainDestroy = 14, NVTX_CBID_CORE2_Initialize = 15, /* --- New constants must only be added directly above this line --- */ NVTX_CBID_CORE2_SIZE, NVTX_CBID_CORE2_FORCE_INT = 0x7fffffff } NvtxCallbackIdCore2; typedef enum NvtxCallbackIdCuda { NVTX_CBID_CUDA_INVALID = 0, NVTX_CBID_CUDA_NameCuDeviceA = 1, NVTX_CBID_CUDA_NameCuDeviceW = 2, NVTX_CBID_CUDA_NameCuContextA = 3, NVTX_CBID_CUDA_NameCuContextW = 4, NVTX_CBID_CUDA_NameCuStreamA = 5, NVTX_CBID_CUDA_NameCuStreamW = 6, NVTX_CBID_CUDA_NameCuEventA = 7, NVTX_CBID_CUDA_NameCuEventW = 8, /* --- New constants must only be added directly above this line --- */ NVTX_CBID_CUDA_SIZE, NVTX_CBID_CUDA_FORCE_INT = 0x7fffffff } NvtxCallbackIdCuda; typedef enum NvtxCallbackIdCudaRt { NVTX_CBID_CUDART_INVALID = 0, NVTX_CBID_CUDART_NameCudaDeviceA = 1, NVTX_CBID_CUDART_NameCudaDeviceW = 2, NVTX_CBID_CUDART_NameCudaStreamA = 3, NVTX_CBID_CUDART_NameCudaStreamW = 4, NVTX_CBID_CUDART_NameCudaEventA = 5, NVTX_CBID_CUDART_NameCudaEventW = 6, /* --- New constants must only be added directly above this line --- */ NVTX_CBID_CUDART_SIZE, NVTX_CBID_CUDART_FORCE_INT = 0x7fffffff } NvtxCallbackIdCudaRt; typedef enum NvtxCallbackIdOpenCL { NVTX_CBID_OPENCL_INVALID = 0, NVTX_CBID_OPENCL_NameClDeviceA = 1, NVTX_CBID_OPENCL_NameClDeviceW = 2, 
NVTX_CBID_OPENCL_NameClContextA = 3, NVTX_CBID_OPENCL_NameClContextW = 4, NVTX_CBID_OPENCL_NameClCommandQueueA = 5, NVTX_CBID_OPENCL_NameClCommandQueueW = 6, NVTX_CBID_OPENCL_NameClMemObjectA = 7, NVTX_CBID_OPENCL_NameClMemObjectW = 8, NVTX_CBID_OPENCL_NameClSamplerA = 9, NVTX_CBID_OPENCL_NameClSamplerW = 10, NVTX_CBID_OPENCL_NameClProgramA = 11, NVTX_CBID_OPENCL_NameClProgramW = 12, NVTX_CBID_OPENCL_NameClEventA = 13, NVTX_CBID_OPENCL_NameClEventW = 14, /* --- New constants must only be added directly above this line --- */ NVTX_CBID_OPENCL_SIZE, NVTX_CBID_OPENCL_FORCE_INT = 0x7fffffff } NvtxCallbackIdOpenCL; typedef enum NvtxCallbackIdSync { NVTX_CBID_SYNC_INVALID = 0, NVTX_CBID_SYNC_DomainSyncUserCreate = 1, NVTX_CBID_SYNC_DomainSyncUserDestroy = 2, NVTX_CBID_SYNC_DomainSyncUserAcquireStart = 3, NVTX_CBID_SYNC_DomainSyncUserAcquireFailed = 4, NVTX_CBID_SYNC_DomainSyncUserAcquireSuccess = 5, NVTX_CBID_SYNC_DomainSyncUserReleasing = 6, /* --- New constants must only be added directly above this line --- */ NVTX_CBID_SYNC_SIZE, NVTX_CBID_SYNC_FORCE_INT = 0x7fffffff } NvtxCallbackIdSync; /* IDs for NVTX Export Tables */ typedef enum NvtxExportTableID { NVTX_ETID_INVALID = 0, NVTX_ETID_CALLBACKS = 1, NVTX_ETID_RESERVED0 = 2, NVTX_ETID_VERSIONINFO = 3, /* --- New constants must only be added directly above this line --- */ NVTX_ETID_SIZE, NVTX_ETID_FORCE_INT = 0x7fffffff } NvtxExportTableID; typedef void (* NvtxFunctionPointer)(void); /* generic uncallable function pointer, must be casted to appropriate function type */ typedef NvtxFunctionPointer** NvtxFunctionTable; /* double pointer because array(1) of pointers(2) to function pointers */ typedef struct NvtxExportTableCallbacks { size_t struct_size; /* returns an array of pointer to function pointers*/ int (NVTX_API *GetModuleFunctionTable)( NvtxCallbackModule module, NvtxFunctionTable* out_table, unsigned int* out_size); } NvtxExportTableCallbacks; typedef struct NvtxExportTableVersionInfo { /* sizeof(NvtxExportTableVersionInfo) */ size_t struct_size; /* The API version comes from the NVTX library linked to the app. The * injection library is can use this info to make some assumptions */ uint32_t version; /* Reserved for alignment, do not use */ uint32_t reserved0; /* This must be set by tools when attaching to provide applications * the ability to, in emergency situations, detect problematic tools * versions and modify the NVTX source to prevent attaching anything * that causes trouble in the app. Currently, this value is ignored. */ void (NVTX_API *SetInjectionNvtxVersion)( uint32_t version); } NvtxExportTableVersionInfo; nccl-2.22.3-1/src/include/p2p.h000066400000000000000000000025451463451655400157640ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include #ifndef NCCL_P2P_H_ #define NCCL_P2P_H_ #include #include #include "core.h" #if CUDART_VERSION < 12030 // MNNVL: FABRIC handle support lifted from CUDA 12.3 #define CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED ((CUdevice_attribute)128) #define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL) #define CU_IPC_HANDLE_SIZE 64 typedef struct CUmemFabricHandle_st { unsigned char data[CU_IPC_HANDLE_SIZE]; } CUmemFabricHandle_v1; typedef CUmemFabricHandle_v1 CUmemFabricHandle; #endif typedef union { uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support CUmemFabricHandle handle; } ncclCuDesc; typedef union { // Legacy CUDA IPC cudaIpcMemHandle_t devIpc; // cuMem API support ncclCuDesc cuDesc; } ncclIpcDesc; ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr); ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc); ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); #endif nccl-2.22.3-1/src/include/param.h000066400000000000000000000017771463451655400163710ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_PARAM_H_ #define NCCL_PARAM_H_ #include const char* userHomeDir(); void setEnvFile(const char* fileName); void initEnv(); const char *ncclGetEnv(const char *name); void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); #define NCCL_PARAM(name, env, deftVal) \ int64_t ncclParam##name() { \ constexpr int64_t uninitialized = INT64_MIN; \ static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \ static int64_t cache = uninitialized; \ if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \ } \ return cache; \ } #endif nccl-2.22.3-1/src/include/profiler.h000066400000000000000000000016131463451655400171000ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_PROFILER_H_ #define NCCL_PROFILER_H_ #include "proxy.h" enum ncclProxyProfileState { ncclProxyProfileBegin = 0, ncclProxyProfileSendGPUWait = 1, ncclProxyProfileSendWait = 2, ncclProxyProfileRecvWait = 1, ncclProxyProfileRecvFlushWait = 2, ncclProxyProfileRecvGPUWait = 3, ncclProxyProfileEnd = 4, ncclProxyProfileSleep = 8, ncclProxyProfileWakeup = 9, ncclProxyProfileIdle = 16, ncclProxyProfileActive = 17, ncclProxyProfileAppend = 24, ncclProxyProfileAppendEnd = 25 }; ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state); void ncclProfilingDump(); #endif nccl-2.22.3-1/src/include/proxy.h000066400000000000000000000227351463451655400164470ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_PROXY_H_ #define NCCL_PROXY_H_ #include "device.h" #include "info.h" #include "socket.h" #include "ipcsocket.h" #include "nccl_net.h" #include #include "shm.h" #include "p2p.h" typedef enum : uint8_t { ncclPatternRing, ncclPatternRingTwice, ncclPatternPipelineFrom, ncclPatternPipelineTo, ncclPatternTreeUp, ncclPatternTreeDown, ncclPatternTreeUpDown, ncclPatternCollnetChain, ncclPatternCollnetDirect, ncclPatternNvls, ncclPatternNvlsTree, ncclPatternSend, ncclPatternRecv } ncclPattern_t; enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; struct ncclProxyArgs; typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*); #define NCCL_PROXY_MAX_SUBS MAXCHANNELS static_assert(2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH <= MAXCHANNELS, "Not enough sub space for max work elements"); union ncclProxyOpSpecifics { struct { size_t sizePerRank; int nNodes, node; } collnetDirect; }; struct ncclProxyOp { struct ncclProxyConnection* connection; ssize_t nbytes; uint64_t opCount; int root; int next; int nsteps; int chunkSize; uint8_t sliceSteps; uint8_t chunkSteps; uint8_t channelId; uint8_t /*ncclDataType_t*/ dtype; uint8_t /*ncclDevRedOp_t*/ redOp; uint8_t /*ncclFunc_t*/ coll; uint8_t /*ncclPattern_t*/ pattern; uint8_t protocol; uint8_t reg; // collnet buffer reg handles void* sendMhandle; void* recvMhandle; uint8_t* sendbuff; uint8_t* recvbuff; union ncclProxyOpSpecifics specifics; struct ncclProxyOp *enqNext; }; struct ncclProxySubArgs { struct ncclProxyConnection* connection; int reg; // p2p mhandle void* mhandle; // collnet handles void* sendMhandle; void* recvMhandle; uint8_t* sendbuff; uint8_t* recvbuff; size_t offset; int channelId; int nsteps; ssize_t nbytes; int peer; int groupSize; // Number of consecutive sub operations sharing the same recvComm uint64_t base; uint64_t posted; uint64_t received; uint64_t flushed; uint64_t transmitted; uint64_t done; uint64_t end; void* requests[NCCL_STEPS]; void* profilingEvents[NCCL_STEPS]; void* recvRequestsCache[NCCL_STEPS]; int recvRequestsSubCount; }; struct ncclProxyArgs { struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS]; proxyProgressFunc_t progress; int nsubs; int done; uint64_t opCount; int sliceSteps; int chunkSteps; int chunkSize; size_t totalSendSize; size_t totalRecvSize; size_t sendSizePerRound; size_t recvSizePerRound; uint8_t /*ncclDataType_t*/ dtype; uint8_t /*ncclDevRedOp_t*/ redOp; uint8_t /*ncclPattern_t*/ pattern; uint8_t /*ncclFunc_t*/ coll; uint8_t protocol; int state; char* sharedBuff[NCCL_STEPS]; int sharedSize[NCCL_STEPS]; int idle; // Element linking struct ncclProxyArgs* next; struct ncclProxyArgs* nextPeer; struct ncclProxyArgs** proxyAppendPtr; union ncclProxyOpSpecifics specifics; }; #define NCCL_MAX_NETDEVS 128 // ProxyOps are used to communicate between main thread and service thread // Make sure we have enough to store two full rounds of operations on all channels. // Otherwise we'd be unable to post half of them to free new elements. Each // p2p work contains a send and recv proxy op hence the 2x before it. 
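// Illustrative arithmetic (not from the original source; the actual constants are build-dependent):
// if MAXCHANNELS were 32 and NCCL_MAX_DEV_WORK_P2P_PER_BATCH were 8, the define below would
// reserve 2*32*2*8 = 1024 proxy operations per local peer.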
#define MAX_OPS_PER_PEER (2*MAXCHANNELS*2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH) struct ncclProxyOpsPool { struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS]; volatile int nextOps; volatile int nextOpsEnd; volatile int freeOps[NCCL_MAX_LOCAL_RANKS]; pthread_mutex_t mutex; pthread_cond_t cond; }; struct ncclProxyOps { ncclProxyOpsPool* pool; ncclShmHandle_t handle; int count; int freeOp; int nextOps; int nextOpsEnd; }; struct ncclProxySharedP2p { int refcount; int size; char* cudaBuff; char* hostBuff; // CUDA IPC ncclIpcDesc ipcDesc; struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv }; struct ncclProxyPeer { struct ncclProxySharedP2p send; struct ncclProxySharedP2p recv; }; struct ncclSharedNetComms { void* sendComm[MAXCHANNELS]; void* recvComm[MAXCHANNELS]; int sendRefCount[MAXCHANNELS]; int recvRefCount[MAXCHANNELS]; }; struct ncclProxyPool; struct ncclProxyProgressState { // Used by main threads to send work to progress thread struct ncclProxyOpsPool* opsPool; ncclShmHandle_t handle; char opsPoolShmSuffix[6]; pthread_t thread; volatile int stop; struct ncclProxyPeer** localPeers; struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS]; struct ncclProxyArgs* active; struct ncclProxyArgs* pool; struct ncclProxyPool* pools; int nextOps; }; // Expected proxy response fifo struct ncclExpectedProxyResponse { void* opId; int respSize; bool done; void* respBuff; ncclResult_t res; struct ncclExpectedProxyResponse* next; }; struct ncclProxyAsyncOp { int type; struct ncclProxyConnection* connection; int reqSize, respSize; char *reqBuff, *respBuff; void* opId; ncclProxyAsyncOp* next; }; struct ncclProxyLocalPeer { struct ncclSocket sock; int tpRank; int tpLocalRank; ncclProxyAsyncOp* asyncOps; int asyncOpCounter; }; // Common response header for all proxyOps // We pack this into a struct to reduce the number of blocking send and recv calls struct ncclProxyRpcResponseHeader { void* opId; ncclResult_t res; int respSize; }; // UDS support struct ncclIpcHdr { int type; int rank; int reqSize; int respSize; void *opId; uint64_t data[16]; // 128-bytes }; struct ncclProxyState { int refCount; int tpRank; int tpnRanks; int tpLocalnRanks; int cudaDev; int p2pnChannels; int p2pChunkSize; int nChannels; int buffSizes[NCCL_NUM_PROTOCOLS]; bool allocP2pNetLLBuffers; bool dmaBufSupport; ncclNet_t* ncclNet; ncclCollNet_t* ncclCollNet; uint32_t* abortFlag; // Service threads pthread_t thread; pthread_t threadUDS; struct ncclSocket* listenSock; struct ncclIpcSocket ipcSock; int stop; CUcontext cudaCtx; ncclResult_t asyncResult; // Used by main thread union ncclSocketAddress* peerAddresses; struct ncclSocket* peerSocks; struct ncclProxyOps* proxyOps; void** sharedDevMems; struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS) uint64_t *peerAddressesUDS; // cuMem API support (UDS) // Progress thread struct ncclProxyProgressState progressState; // Queue of expected responses from the proxy struct ncclExpectedProxyResponse* expectedResponses; }; enum proxyConnectState { connUninitialized = 0, connInitialized = 1, connSharedInitialized = 2, connSetupDone = 3, connConnected = 4, numConnStates = 5 }; struct ncclProxyConnection { int send, transport, shared; int tpLocalRank, sameProcess; struct ncclSocket* sock; struct ncclTransportComm* tcomm; struct ncclProxyArgs *proxyAppend; struct ncclProxyArgs **proxyAppendPtr; void* transportResources; ncclNetDeviceHandle_t* netDeviceHandle; void* mhandles[NCCL_NUM_PROTOCOLS]; proxyConnectState state; struct ncclCollNetSharedRes* collNet; int 
needsProxyProgress; }; typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); enum proxyMode { proxyRing = 0, proxyFrom = 1, proxyTo = 2 }; ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire); ncclResult_t ncclProxyStart(struct ncclComm* comm); ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS); ncclResult_t ncclProxyCreate(struct ncclComm* comm); ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn); enum ncclProxyMsgType { ncclProxyMsgInit = 1, ncclProxyMsgSharedInit = 2, ncclProxyMsgSetup = 3, ncclProxyMsgConnect = 4, ncclProxyMsgStart = 5, ncclProxyMsgClose = 6, ncclProxyMsgAbort = 7, ncclProxyMsgStop = 8, ncclProxyMsgGetFd = 9, // cuMem API support (UDS) ncclProxyMsgRegister = 10, ncclProxyMsgDeregister = 11 }; // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types // Call this function on the client, supplying a locally unique opId. Then, poll on the return value of // ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId); // This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize); ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId); // UDS support ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int rank, void *handle, int* convertedFd); ncclResult_t ncclProxyStop(struct ncclComm* comm); ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm); ncclResult_t ncclProxyDestroy(struct ncclComm* comm); #endif nccl-2.22.3-1/src/include/register.h000066400000000000000000000021121463451655400170750ustar00rootroot00000000000000#ifndef NCCL_REGISTER_H_ #define NCCL_REGISTER_H_ #include "device.h" #include #include enum { NET_REG_COMPLETE = 0x01, NVLS_REG_COMPLETE = 0x02, NVLS_REG_POSSIBLE = 0x04, NVLS_REG_NO_SUPPORT = 0x08, COLLNET_REG_COMPLETE = 0x10 }; struct ncclReg { // common attributes size_t pages; int refs; uintptr_t addr; uint32_t state; // net reg int nDevs; int devs[MAXCHANNELS]; void** handles; // nvls reg uintptr_t baseAddr; size_t baseSize; CUdeviceptr regAddr; size_t regSize; int dev; CUmemGenericAllocationHandle mcHandle; uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */ // collnet reg void* collnetHandle; struct ncclProxyConnector* proxyconn; }; struct ncclRegCache { struct ncclReg **slots; int capacity, population; uintptr_t pageSize; void* sComms[MAXCHANNELS]; void* rComms[MAXCHANNELS]; }; ncclResult_t ncclRegCleanup(struct ncclComm* comm); ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg); #endif nccl-2.22.3-1/src/include/shm.h000066400000000000000000000015261463451655400160500ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_SHM_H_ #define NCCL_SHM_H_ #include "nccl.h" typedef void* ncclShmHandle_t; ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); ncclResult_t ncclShmClose(ncclShmHandle_t handle); ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); struct ncclShmemCollBuff { volatile size_t *cnt[2]; volatile void *ptr[2]; int round; size_t maxTypeSize; }; ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize); #endif nccl-2.22.3-1/src/include/socket.h000066400000000000000000000100421463451655400165420ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_SOCKET_H_ #define NCCL_SOCKET_H_ #include "nccl.h" #include #include #include #include #include #include #define MAX_IFS 16 #define MAX_IF_NAME_SIZE 16 #define SLEEP_INT 1000 // connection retry sleep interval in usec #define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) #define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL /* Common socket address storage structure for IPv4/IPv6 */ union ncclSocketAddress { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; enum ncclSocketState { ncclSocketStateNone = 0, ncclSocketStateInitialized = 1, ncclSocketStateAccepting = 2, ncclSocketStateAccepted = 3, ncclSocketStateConnecting = 4, ncclSocketStateConnectPolling = 5, ncclSocketStateConnected = 6, ncclSocketStateReady = 7, ncclSocketStateClosed = 8, ncclSocketStateError = 9, ncclSocketStateNum = 10 }; enum ncclSocketType { ncclSocketTypeUnknown = 0, ncclSocketTypeBootstrap = 1, ncclSocketTypeProxy = 2, ncclSocketTypeNetSocket = 3, ncclSocketTypeNetIb = 4 }; struct ncclSocket { int fd; int acceptFd; int timedOutRetries; int refusedRetries; union ncclSocketAddress addr; volatile uint32_t* abortFlag; int asyncFlag; enum ncclSocketState state; int salen; uint64_t magic; enum ncclSocketType type; }; const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); // Initialize a socket ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call ncclResult_t ncclSocketListen(struct ncclSocket* sock); ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr); // Connect to sock->addr. sock->fd is set after a successful call. 
ncclResult_t ncclSocketConnect(struct ncclSocket* sock); // Return socket connection state. ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running); // Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock); ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd); ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock); #define NCCL_SOCKET_SEND 0 #define NCCL_SOCKET_RECV 1 ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize); ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); ncclResult_t ncclSocketClose(struct ncclSocket* sock); #endif nccl-2.22.3-1/src/include/strongstream.h000066400000000000000000000117301463451655400200070ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_STRONGSTREAM_H_ #define NCCL_STRONGSTREAM_H_ #include "nccl.h" #include "checks.h" #include /* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes * easily. */ struct ncclCudaGraph { #if CUDART_VERSION >= 11030 cudaGraph_t graph; unsigned long long graphId; #endif }; inline struct ncclCudaGraph ncclCudaGraphNone() { struct ncclCudaGraph tmp; #if CUDART_VERSION >= 11030 tmp.graph = nullptr; tmp.graphId = ULLONG_MAX; #endif return tmp; } inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) { #if CUDART_VERSION >= 11030 return graph.graph != nullptr; #else return false; #endif } inline bool ncclCudaGraphSame(struct ncclCudaGraph a, struct ncclCudaGraph b) { #if CUDART_VERSION >= 11030 return a.graphId == b.graphId; #else return true; #endif } ncclResult_t ncclCudaGetCapturingGraph(struct ncclCudaGraph* graph, cudaStream_t stream); ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg); /* ncclStrongStream: An abstraction over CUDA streams that do not lose their * identity while being captured. Regular streams have the deficiency that the * captured form of a stream in one graph launch has no relation to the * uncaptured stream or to the captured form in other graph launches. This makes * streams unfit for serializing access to a persistent resource. * Strong streams have been introduced to address this need. * * - All updates to a strong stream must be enclosed by an Acquire/Release pair. * * - The Acquire, Release, and all updates take an ncclCudaGraph parameter * indicating the currently capturing graph (or none). This parameter must be * the same for the entire sequence of {Acquire; ...; Release}. * * - An {Acquire; ...; Release} sequence must not be concurrent with any * other operations against the strong stream including graph launches which * reference this stream. */
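/* Illustrative usage sketch (not part of the original header): the declarations that
 * follow are meant to be used as an {Acquire; ...; Release} sequence keyed by the
 * currently capturing graph. The names `ss`, `userStream`, `fn`, and `fnArg` are
 * assumed caller-supplied values, not identifiers from this header.
 *
 *   struct ncclCudaGraph graph;
 *   NCCLCHECK(ncclCudaGetCapturingGraph(&graph, userStream)); // graph capturing userStream, or "none"
 *   NCCLCHECK(ncclStrongStreamAcquire(graph, &ss));
 *   NCCLCHECK(ncclStrongStreamLaunchHost(graph, &ss, fn, fnArg));
 *   NCCLCHECK(ncclStrongStreamRelease(graph, &ss));
 *
 * The same `graph` value must be passed to every call in the sequence.
 */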
struct ncclStrongStream; ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss); ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss); // Acquire-fence the strong stream. ncclResult_t ncclStrongStreamAcquire( struct ncclCudaGraph graph, struct ncclStrongStream* ss ); // Acquire-fence the strong stream assuming no graph is capturing. This permits // the caller to enqueue directly to the `ss->cudaStream` member using native CUDA // calls. The strong stream must still be released via: // ncclStrongStreamRelease(ncclCudaGraphNone(), ss); ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss); // Release-fence of the strong stream. ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss); // Add a host launch to the stream. ncclResult_t ncclStrongStreamLaunchHost( struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg ); // Add a kernel launch to the stream. ncclResult_t ncclStrongStreamLaunchKernel( struct ncclCudaGraph graph, struct ncclStrongStream* ss, void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes ); // Cause `a` to wait for the current state of `b`. Both `a` and `b` must be acquired. // `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus // we want to fast-forward `a` to be a clone of `b`. Knowing this permits the // implementation to induce fewer graph dependencies. ncclResult_t ncclStrongStreamWaitStream( struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false ); // `b` must be capturing within `graph`. ncclResult_t ncclStrongStreamWaitStream( struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false ); // `a` must be capturing within `graph`. ncclResult_t ncclStrongStreamWaitStream( struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false ); // Synchronization does not need the strong stream to be acquired. ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss); //////////////////////////////////////////////////////////////////////////////// struct ncclStrongStreamGraph; // internal to ncclStrongStream struct ncclStrongStream { // Used when not graph capturing. cudaStream_t cudaStream; #if CUDART_VERSION >= 11030 // The event used to establish order between graphs and streams. During acquire // this event is waited on, during release it is recorded to. cudaEvent_t serialEvent; // Whether this stream has ever appeared in a graph capture. bool everCaptured; // Tracks whether serialEvent needs to be recorded to upon Release(). bool serialEventNeedsRecord; struct ncclStrongStreamGraph* graphHead; #else cudaEvent_t scratchEvent; #endif }; #endif nccl-2.22.3-1/src/include/timer.h000066400000000000000000000031131463451655400163730ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_TIMER_H_ #define NCCL_TIMER_H_ #if ENABLE_TIMER #include #include #include static double freq = -1; static void calibrate() { struct timeval tv; gettimeofday(&tv, NULL); uint64_t timeCycles = __rdtsc(); double time = - tv.tv_sec*1E6 - tv.tv_usec; uint64_t total = 0ULL; for (int i=0; i<10000; i++) total += __rdtsc(); gettimeofday(&tv, NULL); timeCycles = __rdtsc() - timeCycles; time += tv.tv_sec*1E6 + tv.tv_usec; freq = timeCycles/time; } static inline double gettime() { if (freq == -1) calibrate(); return __rdtsc()/freq; } static uint64_t counts[8]; static double times[8]; static double startTimes[8]; #define TIME_START(index) do { \ counts[index]++; \ startTimes[index] = gettime(); \ } while (0); #define TIME_STOP(index) do { \ times[index] += gettime() - startTimes[index]; \ } while (0); #define TIME_CANCEL(index) do { \ counts[index]--; \ } while (0); #define TIME_PRINT(name) do { \ printf("%s stats", name); \ for (int i=0; i<8; i++) { \ if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ counts[i] = 0; \ } \ printf("\n"); \ } while (0); #else #define TIME_START(index) while(0); #define TIME_STOP(index) while(0); #define TIME_CANCEL(index) while(0); #define TIME_PRINT(name) #endif #endif nccl-2.22.3-1/src/include/transport.h000066400000000000000000000146221463451655400173160ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_TRANSPORT_H_ #define NCCL_TRANSPORT_H_ #include "device.h" #include "graph.h" #include "nvmlwrap.h" #include "core.h" #define NTRANSPORTS 4 #define TRANSPORT_UNDEFINED -1 #define TRANSPORT_P2P 0 #define TRANSPORT_SHM 1 #define TRANSPORT_NET 2 #define TRANSPORT_COLLNET 3 #include "proxy.h" #include "comm.h" extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; extern struct ncclTransport netTransport; extern struct ncclTransport collNetTransport; extern struct ncclTransport* ncclTransports[]; // Forward declarations struct ncclRing; struct ncclConnector; struct ncclComm; struct ncclPeerInfo { int rank; int cudaDev; int nvmlDev; int gdrSupport; uint64_t hostHash; uint64_t pidHash; dev_t shmDev; int64_t busId; struct ncclComm* comm; int cudaCompCap; // MNNVL support nvmlGpuFabricInfoV_t fabricInfo; int cuMemSupport; }; #define CONNECT_SIZE 128 struct ncclConnect { char data[CONNECT_SIZE]; }; #if CUDART_VERSION >= 12010 #define NVLS_HANDLE_SIZE 64 struct ncclNvlsSharedRes { int refCount; bool inited; CUmulticastObjectProp bufProp; CUmulticastObjectProp signalProp; CUmemAccessDesc accessDesc; int dev; size_t buffSize; size_t creditSize; CUmemGenericAllocationHandle mcBuffHandle; // Multicast handle for NVLS buffer CUmemGenericAllocationHandle mcCreditHandle; // Multicast handle for NVLS credit buffer char* mcBuff; // Multicast NVLS buffer address char* mcCredit; // Multicast NVLS credit address CUmemGenericAllocationHandle ucBuffHandle; // Unicast Handle for NVLS buffer CUmemGenericAllocationHandle ucCreditHandle; // Unicast Handle for NVLS credit buffer char* ucBuff; // Unicast NVLS buffer address char* ucCredit; // Unicast NVLS credit address int nChannels; struct ncclShmemCollBuff nvlsShmem; void *nvlsShmemHandle; }; #endif /* 
CUDART_VERSION >= 12010 */ struct ncclCollNetSharedRes { int refCount; int size; char* cudaBuff; char* hostBuff; struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS]; void* resources; int nChannels; size_t buffSize; int intraHighestTransportType; }; struct ncclTransportComm { ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex); ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*); ncclResult_t (*free)(struct ncclConnector*); ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels); ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState); ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); ncclResult_t (*proxyRegister)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); ncclResult_t (*proxyDeregister)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done); }; struct ncclTransport { const char name[8]; ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); struct ncclTransportComm send; struct ncclTransportComm recv; }; ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); ncclResult_t ncclNvlsInit(struct ncclComm* comm); ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm); ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm); ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); ncclResult_t ncclNvlsFree(struct ncclComm* comm); enum { collNetRecv=0, collNetSend=1 }; int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect); ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm); ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufUsed, 
void** outHandle); ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle); ncclResult_t ncclTransportRingConnect(struct ncclComm* comm); ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm); ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]); ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm); ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm); #endif nccl-2.22.3-1/src/include/trees.h000066400000000000000000000010761463451655400164030ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_TREES_H_ #define NCCL_TREES_H_ ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType); ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1); #endif nccl-2.22.3-1/src/include/tuner.h000066400000000000000000000015231463451655400164130ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_INT_TUNER_H_ #define NCCL_INT_TUNER_H_ #include "nccl_tuner.h" #include "comm.h" // Tuning plugin to override NCCL's default algorithm/protocol tuning. // Attempts to load the NCCL tuner from an environment variable. // Returns ncclSuccess if the correct tuner symbol has been found and // successfully loaded. Otherwise returns an error and also logs the error. ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm); // Cleans up NCCL tuner plugin. ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm); #endif
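/* Illustrative sketch (not part of the original headers): one way a caller might guard use of
 * the optional tuner plugin declared in tuner.h above. The placement inside communicator
 * init/teardown is an assumption for illustration, not taken from the NCCL sources.
 *
 *   if (ncclTunerPluginLoad(comm) != ncclSuccess) {
 *     // No usable tuner plugin was found; the error has already been logged and
 *     // NCCL's default algorithm/protocol tuning presumably remains in effect.
 *   }
 *   // ... use the communicator ...
 *   ncclTunerPluginUnload(comm); // release the plugin when tearing the communicator down
 */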
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_UTILS_H_ #define NCCL_UTILS_H_ #include "nccl.h" #include "alloc.h" #include "bitops.h" #include "checks.h" #include #include #include #include #include #include int ncclCudaCompCap(); // PCI Bus ID <-> int64 conversion functions ncclResult_t int64ToBusId(int64_t id, char* busId); ncclResult_t busIdToInt64(const char* busId, int64_t* id); ncclResult_t getBusId(int cudaDev, int64_t *busId); ncclResult_t getHostName(char* hostname, int maxlen, const char delim); uint64_t getHash(const char* string, int n); uint64_t getHostHash(); uint64_t getPidHash(); ncclResult_t getRandomData(void* buffer, size_t bytes); struct netIf { char prefix[64]; int port; }; int parseStringList(const char* string, struct netIf* ifList, int maxList); bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); static long log2i(long n) { return log2Down(n); } inline uint64_t clockNano() { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec; } /* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else * return -1 */ inline ncclResult_t getRandomData(void* buffer, size_t bytes) { ncclResult_t ret = ncclSuccess; if (bytes > 0) { const size_t one = 1UL; FILE* fp = fopen("/dev/urandom", "r"); if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one) ret = ncclSystemError; if (fp) fclose(fp); } return ret; } //////////////////////////////////////////////////////////////////////////////// template inline void ncclAtomicRefCountIncrement(Int* refs) { __atomic_fetch_add(refs, 1, __ATOMIC_RELAXED); } template inline Int ncclAtomicRefCountDecrement(Int* refs) { return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL); } //////////////////////////////////////////////////////////////////////////////// /* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that * granularity of LIFO is not per object, instead frames containing many objects * are pushed and popped. Therefor deallocation is extremely cheap since its * done at the frame granularity. * * The initial state of the stack is with one frame, the "nil" frame, which * cannot be popped. Therefor objects allocated in the nil frame cannot be * deallocated sooner than stack destruction. */ struct ncclMemoryStack; void ncclMemoryStackConstruct(struct ncclMemoryStack* me); void ncclMemoryStackDestruct(struct ncclMemoryStack* me); void ncclMemoryStackPush(struct ncclMemoryStack* me); void ncclMemoryStackPop(struct ncclMemoryStack* me); void* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t size, size_t align); template T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1); template inline Header* ncclMemoryStackAllocInlineArray(struct ncclMemoryStack* me, size_t nElt); //////////////////////////////////////////////////////////////////////////////// /* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for * a pool instance to ever hold objects whose type have differing * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by * a backing `ncclMemoryStack` passed during Alloc(). If memory * backing any currently held object is deallocated then it is an error to do * anything other than reconstruct it, after which it is a valid empty pool. 
 */
struct ncclMemoryPool;

// Equivalent to zero-initialization
void ncclMemoryPoolConstruct(struct ncclMemoryPool* me);
template<typename T>
T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing);
template<typename T>
void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj);
void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from);

////////////////////////////////////////////////////////////////////////////////
/* ncclIntruQueue: A singly-linked list queue where the per-object next pointer
 * field is given via the `next` template argument.
 *
 * Example:
 *   struct Foo {
 *     struct Foo *next1, *next2; // can be a member of two lists at once
 *   };
 *   ncclIntruQueue<Foo, &Foo::next1> list1;
 *   ncclIntruQueue<Foo, &Foo::next2> list2;
 */
template<typename T, T *T::*next>
struct ncclIntruQueue;

template<typename T, T *T::*next>
void ncclIntruQueueConstruct(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
bool ncclIntruQueueEmpty(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x);
template<typename T, T *T::*next>
void ncclIntruQueueEnqueueFront(ncclIntruQueue<T,next> *me, T *x);
template<typename T, T *T::*next>
T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
void ncclIntruQueueTransfer(ncclIntruQueue<T,next> *dst, ncclIntruQueue<T,next> *src);

////////////////////////////////////////////////////////////////////////////////
/* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
 * and "cond" fields are part of the public interface.
 */
struct ncclThreadSignal {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
};

// returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}
constexpr ncclThreadSignal ncclThreadSignalStaticInitializer();

void ncclThreadSignalConstruct(struct ncclThreadSignal* me);
void ncclThreadSignalDestruct(struct ncclThreadSignal* me);

// A convenience instance per-thread.
extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance;

////////////////////////////////////////////////////////////////////////////////

template<typename T, T *T::*next>
struct ncclIntruQueueMpsc;

template<typename T, T *T::*next>
void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T,next>* me);
template<typename T, T *T::*next>
bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T,next>* me);
// Enqueue element. Returns true if queue is not abandoned. Even if the queue is
// abandoned the element is still enqueued, so the caller needs to make arrangements
// for the queue to be tended.
template<typename T, T *T::*next>
bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc<T,next>* me, T* x);
// Dequeue all elements at a glance. If there aren't any and `waitSome` is
// true then this call will wait until it can return a non-empty list.
template<typename T, T *T::*next>
T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc<T,next>* me, bool waitSome);
// Dequeue all elements and set queue to abandoned state.
template T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc* me); //////////////////////////////////////////////////////////////////////////////// struct ncclMemoryStack { struct Hunk { struct Hunk* above; // reverse stack pointer size_t size; // size of this allocation (including this header struct) }; struct Unhunk { // proxy header for objects allocated out-of-hunk struct Unhunk* next; void* obj; }; struct Frame { struct Hunk* hunk; // top of non-empty hunks uintptr_t bumper, end; // points into top hunk struct Unhunk* unhunks; struct Frame* below; }; static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align); static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align); struct Hunk stub; struct Frame topFrame; }; inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) { me->stub.above = nullptr; me->stub.size = 0; me->topFrame.hunk = &me->stub; me->topFrame.bumper = 0; me->topFrame.end = 0; me->topFrame.unhunks = nullptr; me->topFrame.below = nullptr; } inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) { uintptr_t o = (me->topFrame.bumper + align-1) & -uintptr_t(align); void* obj; if (__builtin_expect(o + size <= me->topFrame.end, true)) { me->topFrame.bumper = o + size; obj = reinterpret_cast(o); } else { obj = allocateSpilled(me, size, align); } return obj; } inline void* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t size, size_t align) { void *obj = ncclMemoryStack::allocate(me, size, align); memset(obj, 0, size); return obj; } template inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) { void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T)); memset(obj, 0, n*sizeof(T)); return (T*)obj; } template inline Header* ncclMemoryStackAllocInlineArray(struct ncclMemoryStack* me, size_t nElt) { size_t size = sizeof(Header); size = (size + alignof(Element)-1) & -alignof(Element); size += nElt*sizeof(Element); size_t align = alignof(Header) < alignof(Element) ? alignof(Element) : alignof(Header); void *obj = ncclMemoryStack::allocate(me, size, align); memset(obj, 0, size); return (Header*)obj; } inline void ncclMemoryStackPush(struct ncclMemoryStack* me) { using Frame = ncclMemoryStack::Frame; Frame tmp = me->topFrame; Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame)); *snapshot = tmp; // C++ struct assignment me->topFrame.unhunks = nullptr; me->topFrame.below = snapshot; } inline void ncclMemoryStackPop(struct ncclMemoryStack* me) { ncclMemoryStack::Unhunk* un = me->topFrame.unhunks; while (un != nullptr) { free(un->obj); un = un->next; } me->topFrame = *me->topFrame.below; // C++ struct assignment } //////////////////////////////////////////////////////////////////////////////// struct ncclMemoryPool { struct Cell { Cell *next; }; struct Cell* head; struct Cell* tail; // meaningful only when head != nullptr }; inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { me->head = nullptr; } template inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) { using Cell = ncclMemoryPool::Cell; Cell* cell; if (__builtin_expect(me->head != nullptr, true)) { cell = me->head; me->head = cell->next; } else { // Use the internal allocate() since it doesn't memset to 0 yet. 
size_t cellSize = std::max(sizeof(Cell), sizeof(T)); size_t cellAlign = std::max(alignof(Cell), alignof(T)); cell = (Cell*)ncclMemoryStack::allocate(backing, cellSize, cellAlign); } memset(cell, 0, sizeof(T)); return reinterpret_cast(cell); } template inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) { using Cell = ncclMemoryPool::Cell; Cell* cell = reinterpret_cast(obj); cell->next = me->head; if (me->head == nullptr) me->tail = cell; me->head = cell; } inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) { if (from->head != nullptr) { from->tail->next = me->head; if (me->head == nullptr) me->tail = from->tail; me->head = from->head; from->head = nullptr; } } //////////////////////////////////////////////////////////////////////////////// template struct ncclIntruQueue { T *head, *tail; }; template inline void ncclIntruQueueConstruct(ncclIntruQueue *me) { me->head = nullptr; me->tail = nullptr; } template inline bool ncclIntruQueueEmpty(ncclIntruQueue *me) { return me->head == nullptr; } template inline T* ncclIntruQueueHead(ncclIntruQueue *me) { return me->head; } template inline T* ncclIntruQueueTail(ncclIntruQueue *me) { return me->tail; } template inline void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x) { x->*next = nullptr; (me->head ? me->tail->*next : me->head) = x; me->tail = x; } template inline void ncclIntruQueueEnqueueFront(ncclIntruQueue *me, T *x) { if (me->head == nullptr) me->tail = x; x->*next = me->head; me->head = x; } template inline T* ncclIntruQueueDequeue(ncclIntruQueue *me) { T *ans = me->head; me->head = ans->*next; if (me->head == nullptr) me->tail = nullptr; return ans; } template inline bool ncclIntruQueueDelete(ncclIntruQueue *me, T *x) { T *prev = nullptr; T *cur = me->head; bool found = false; while (cur) { if (cur == x) { found = true; break; } prev = cur; cur = cur->*next; } if (found) { if (prev == nullptr) me->head = cur->*next; else prev->*next = cur->*next; if (cur == me->tail) me->tail = prev; } return found; } template inline T* ncclIntruQueueTryDequeue(ncclIntruQueue *me) { T *ans = me->head; if (ans != nullptr) { me->head = ans->*next; if (me->head == nullptr) me->tail = nullptr; } return ans; } template void ncclIntruQueueTransfer(ncclIntruQueue *dst, ncclIntruQueue *src) { (dst->tail ? 
dst->tail->next : dst->head) = src->head; if (src->tail) dst->tail = src->tail; src->head = nullptr; src->tail = nullptr; } //////////////////////////////////////////////////////////////////////////////// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; } inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) { pthread_mutex_init(&me->mutex, nullptr); pthread_cond_init(&me->cond, nullptr); } inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) { pthread_mutex_destroy(&me->mutex); pthread_cond_destroy(&me->cond); } //////////////////////////////////////////////////////////////////////////////// template struct ncclIntruQueueMpsc { T* head; uintptr_t tail; struct ncclThreadSignal* waiting; }; template void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me) { me->head = nullptr; me->tail = 0x0; me->waiting = nullptr; } template bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me) { return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2; } template bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc* me, T* x) { __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED); uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast(x), __ATOMIC_ACQ_REL); T* prev = reinterpret_cast(utail); T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next); __atomic_store_n(prevNext, x, __ATOMIC_RELAXED); if (utail == 0x1) { // waiting __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting // This lock/unlock is essential to ensure we don't race ahead of the consumer // and signal the cond before they begin waiting on it. struct ncclThreadSignal* waiting = me->waiting; pthread_mutex_lock(&waiting->mutex); pthread_mutex_unlock(&waiting->mutex); pthread_cond_broadcast(&waiting->cond); } return utail != 0x2; // not abandoned } template T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc* me, bool waitSome) { T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); if (head == nullptr) { if (!waitSome) return nullptr; uint64_t t0 = clockNano(); bool sleeping = false; do { if (clockNano()-t0 >= 10*1000) { // spin for first 10us struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance; pthread_mutex_lock(&waitSignal->mutex); uintptr_t expected = sleeping ? 0x1 : 0x0; uintptr_t desired = 0x1; me->waiting = waitSignal; // release done by successful compare exchange if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) { sleeping = true; pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex); } pthread_mutex_unlock(&waitSignal->mutex); } head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); } while (head == nullptr); } __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL); T* tail = utail <= 0x2 ? 
nullptr : reinterpret_cast(utail); T *x = head; while (x != tail) { T *x1; int spins = 0; while (true) { x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); if (x1 != nullptr) break; if (++spins == 1024) { spins = 1024-1; sched_yield(); } } x = x1; } return head; } template T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc* me) { uintptr_t expected = 0x0; if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) { return nullptr; } else { int spins = 0; T* head; while (true) { head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); if (head != nullptr) break; if (++spins == 1024) { spins = 1024-1; sched_yield(); } } __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL); T* tail = utail <= 0x2 ? nullptr : reinterpret_cast(utail); T *x = head; while (x != tail) { T *x1; spins = 0; while (true) { x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); if (x1 != nullptr) break; if (++spins == 1024) { spins = 1024-1; sched_yield(); } } x = x1; } return head; } } #endif nccl-2.22.3-1/src/init.cc000066400000000000000000002603251463451655400147430ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #include "channel.h" #include "nvmlwrap.h" #include "gdrwrap.h" #include "bootstrap.h" #include "transport.h" #include "group.h" #include "net.h" #include "coll_net.h" #include "enqueue.h" #include "graph.h" #include "argcheck.h" #include "tuner.h" #include #include #include #include #include #include #include #include #include "param.h" #define STR2(v) #v #define STR(v) STR2(v) #if CUDART_VERSION >= 9020 #define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream #else #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream #endif const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" }; const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT); NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1); static ncclResult_t commReclaim(ncclComm_t comm); static uint64_t hashUniqueId(ncclUniqueId const &id) { char const *bytes = (char const*)&id; uint64_t h = 0xdeadbeef; for(int i=0; i < (int)sizeof(ncclUniqueId); i++) { h ^= h >> 32; h *= 0x8db3db47fa2994ad; h += bytes[i]; } return h; } // GDRCOPY support: Off by default NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0); // GDRCOPY support gdr_t ncclGdrCopy = NULL; ncclResult_t initGdrCopy() { if (ncclParamGdrCopyEnable() == 1) { ncclGdrCopy = ncclGdrInit(); } return ncclSuccess; } static ncclResult_t initResult = ncclSuccess; static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; static void initOnceFunc() { initEnv(); initGdrCopy(); // Always initialize bootstrap network NCCLCHECKGOTO(bootstrapNetInit(), initResult, exit); initNvtxRegisteredEnums(); exit:; } static ncclResult_t ncclInit() { pthread_once(&initOnceControl, 
initOnceFunc); return initResult; } NCCL_API(ncclResult_t, ncclGetVersion, int* version); ncclResult_t ncclGetVersion(int* version) { if (version == NULL) return ncclInvalidArgument; *version = NCCL_VERSION_CODE; return ncclSuccess; } NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out); ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { NCCLCHECK(ncclInit()); NCCLCHECK(PtrCheck(out, "GetUniqueId", "out")); ncclResult_t res = bootstrapGetUniqueId((struct ncclBootstrapHandle*)out); TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out)); return res; } // Prevent compiler from optimizing out these operations #ifdef __clang__ #define NCCL_NO_OPTIMIZE __attribute__((optnone)) #else #define NCCL_NO_OPTIMIZE __attribute__((optimize("O0"))) #endif void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) { // Important that this does not trash intraComm0. comm->rank = comm->cudaDev = comm->busId = comm->nRanks = -1; comm->startMagic = comm->endMagic = 0; } #undef NCCL_NO_OPTIMIZE static ncclResult_t ncclDestructorFnFree(struct ncclDestructor* dtor) { free(dtor->obj); return ncclSuccess; } void ncclCommPushFree(struct ncclComm* comm, void* obj) { struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); dtor->fn = ncclDestructorFnFree; dtor->obj = obj; dtor->next = comm->destructorHead; comm->destructorHead = dtor; } static ncclResult_t ncclDestructorFnCudaFree(struct ncclDestructor* dtor) { NCCLCHECK(ncclCudaFree(dtor->obj)); return ncclSuccess; } void ncclCommPushCudaFree(struct ncclComm* comm, void* obj) { struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); dtor->fn = ncclDestructorFnCudaFree; dtor->obj = obj; dtor->next = comm->destructorHead; comm->destructorHead = dtor; } static ncclResult_t ncclDestructorFnCudaHostFree(struct ncclDestructor* dtor) { CUDACHECK(cudaFreeHost(dtor->obj)); return ncclSuccess; } void ncclCommPushCudaHostFree(struct ncclComm* comm, void* obj) { struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); dtor->fn = ncclDestructorFnCudaHostFree; dtor->obj = obj; dtor->next = comm->destructorHead; comm->destructorHead = dtor; } static ncclResult_t ncclDestructorFnCudaGdrFree(struct ncclDestructor* dtor) { NCCLCHECK(ncclGdrCudaFree(dtor->obj)); return ncclSuccess; } void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle) { struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); dtor->fn = ncclDestructorFnCudaGdrFree; dtor->obj = handle; dtor->next = comm->destructorHead; comm->destructorHead = dtor; } static ncclResult_t commFree(ncclComm_t comm) { int abort = 0; /* commFree() should not involve any sync among ranks. */ if (comm == NULL) return ncclSuccess; /* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will * free all intra-process communicators; therefore, we only need to focus on local * resource cleanup in commFree(). 
*/ if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) { pthread_join(comm->proxyState->thread, nullptr); if (comm->proxyState->threadUDS) { // UDS support pthread_join(comm->proxyState->threadUDS, nullptr);; } } delete[] comm->userRedOps; free(comm->connectSend); free(comm->connectRecv); free(comm->peerInfo); if (comm->topo) ncclTopoFree(comm->topo); if (comm->nodeRanks) { for (int n=0; nnNodes; n++) free(comm->nodeRanks[n].localRankToRank); free(comm->nodeRanks); } free(comm->rankToNode); free(comm->rankToLocalRank); free(comm->collNetHeads); if (comm->bootstrap) NCCLCHECK(bootstrapClose(comm->bootstrap)); for (int channel=0; channelchannels+channel, comm->nRanks, 1, comm->localRanks)); if (comm->sharedRes) { if (ncclAtomicRefCountDecrement(&comm->sharedRes->refCount) == 0) { for (int c=0; csharedRes->peers[c]) free(comm->sharedRes->peers[c]); if (comm->sharedRes->devPeers[c]) ncclCudaFree(comm->sharedRes->devPeers[c]); } free(comm->sharedRes->tpRankToLocalRank); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream)); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream)); NCCLCHECK(ncclProxyDestroy(comm)); free(comm->sharedRes); } } if (comm->nvlsSupport) NCCLCHECK(ncclNvlsFree(comm)); struct ncclDestructor* dtor = comm->destructorHead; while (dtor != nullptr) { NCCLCHECK(dtor->fn(dtor)); dtor = dtor->next; } ncclMemoryStackDestruct(&comm->memScoped); ncclMemoryStackDestruct(&comm->memPermanent); abort = *comm->abortFlag; if (ncclAtomicRefCountDecrement(comm->abortFlagRefCount) == 0) { free(comm->abortFlag); NCCLCHECK(ncclCudaHostFree((void*)comm->abortFlagDev)); free(comm->abortFlagRefCount); } free((void*)comm->config.netName); free(comm->topParentRanks); free(comm->topParentLocalRanks); NCCLCHECK(ncclRegCleanup(comm)); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy"); commPoison(comm); // poison comm before free to avoid comm reuse. 
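/* The ncclCommPush*Free helpers above register cleanup callbacks on a head-pushed
 * list (comm->destructorHead), and the destructor loop earlier in this function runs
 * them in reverse registration order. A minimal usage sketch, assuming `comm` is a
 * live communicator and `state` is heap memory it should own (hypothetical names):
 *
 *   struct myState* state = (struct myState*)malloc(sizeof(struct myState));
 *   ncclCommPushFree(comm, state);   // free(state) will run when the comm is freed
 *
 * The CUDA variants (ncclCommPushCudaFree, ncclCommPushCudaHostFree,
 * ncclCommPushCudaGdrFree) follow the same pattern but release device, pinned host,
 * or GDR-mapped memory respectively.
 */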
NCCLCHECK(ncclNetFinalize(comm)); NCCLCHECK(ncclNetPluginUnload(comm)); free(comm); return ncclSuccess; } NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0); // GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); #define NCCL_WORK_FIFO_BYTES_DEFAULT (1<<20) NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", NCCL_WORK_FIFO_BYTES_DEFAULT); NCCL_PARAM(WorkArgsBytes, "WORK_ARGS_BYTES", INT64_MAX); enum ncclLaunchMode ncclParamLaunchMode; NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 1); // Detect DMA-BUF support static ncclResult_t dmaBufSupported(struct ncclComm* comm) { if (ncclParamDmaBufEnable() == 0 || comm->ncclNet->regMrDmaBuf == NULL || ncclCudaLibraryInit() != ncclSuccess) return ncclInternalError; #if CUDA_VERSION >= 11070 int flag = 0; CUdevice dev; int cudaDriverVersion; CUDACHECK(cudaDriverGetVersion(&cudaDriverVersion)); if (CUPFN(cuDeviceGet) == NULL || cudaDriverVersion < 11070) return ncclInternalError; CUCHECK(cuDeviceGet(&dev, comm->cudaDev)); // Query device to see if DMA-BUF support is available (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev)); if (flag == 0) return ncclInternalError; INFO(NCCL_INIT, "DMA-BUF is available on GPU device %d", comm->cudaDev); return ncclSuccess; #endif return ncclInternalError; } ncclResult_t ncclCommEnsureReady(ncclComm_t comm) { /* comm must be ready, or error will be reported */ ncclResult_t ret = ncclSuccess; if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { ncclGroupJobAbort(comm->groupJob); } else { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); if (ret != ncclSuccess) { /* if ret is not ncclInProgress, we just keep it. */ WARN("Attempt to use communicator before the previous operation returned ncclSuccess"); if (ret == ncclInProgress) ret = ncclInvalidArgument; goto exit; } /* if there is linked group job, we should complete it. */ if (comm->groupJob) { NCCLCHECK(ncclGroupJobComplete(comm->groupJob)); comm->groupJob = NULL; } } exit: return ret; } static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) { if (ndev < 1) { WARN("invalid device count (%d) requested", ndev); return ncclInvalidArgument; } if (rank >= ndev || rank < 0) { WARN("rank %d exceeds ndev=%d", rank, ndev); return ncclInvalidArgument; } ncclMemoryStackConstruct(&comm->memPermanent); ncclMemoryStackConstruct(&comm->memScoped); comm->destructorHead = nullptr; comm->rank = rank; comm->nRanks = ndev; NCCLCHECK(ncclNetPluginLoad(comm)); NCCLCHECK(ncclNetInit(comm)); INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name); if (parent && parent->config.splitShare) { if (parent->ncclNet != comm->ncclNet) { WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name); return ncclInvalidUsage; } } // Try to create a CUDA object right away. If there is something wrong with // the device we're on (failure cause #1) , better know it early. 
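/* dmaBufSupported() above gates DMA-BUF usage on three things: the NCCL_DMABUF_ENABLE
 * parameter, a network plugin that implements regMrDmaBuf, and a CUDA driver (11.7 or
 * newer) whose device reports CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED. A minimal
 * standalone sketch of the same device query, assuming the driver API has already been
 * initialized with cuInit() and `cudaDev` is the runtime device index:
 *
 *   #include <cuda.h>
 *   int dmaBufFlag = 0;
 *   CUdevice dev;
 *   if (cuDeviceGet(&dev, cudaDev) == CUDA_SUCCESS &&
 *       cuDeviceGetAttribute(&dmaBufFlag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev) == CUDA_SUCCESS &&
 *       dmaBufFlag != 0) {
 *     // GPU memory on this device can be exported as a dma-buf fd for NIC registration.
 *   }
 */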
CUDACHECK(cudaGetDevice(&comm->cudaDev)); NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); nvmlDevice_t nvmlDev; char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; NCCLCHECK(int64ToBusId(comm->busId, busId)); NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev)); NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&comm->nvmlDev)); comm->compCap = ncclCudaCompCap(); TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap); comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false; comm->collNetSupport = 0; memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix)); ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan); ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp); comm->groupNext = reinterpret_cast(0x1); comm->preconnectNext = reinterpret_cast(0x1); static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels"); static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels"); NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks)); NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks)); // Mark channels as non initialized. for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1; if (parent == NULL || !parent->config.splitShare) { struct ncclSharedResources* sharedRes = NULL; NCCLCHECK(ncclCalloc(&sharedRes, 1)); /* most of attributes are assigned later in initTransportsRank(). */ sharedRes->owner = comm; sharedRes->tpNRanks = comm->nRanks; NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream)); comm->sharedRes = sharedRes; sharedRes->refCount = 1; } else { comm->sharedRes = parent->sharedRes; ncclAtomicRefCountIncrement(&parent->sharedRes->refCount); } if (comm->topParentRanks == NULL) { NCCLCHECK(ncclCalloc(&comm->topParentRanks, comm->nRanks)); for (int i = 0; i < comm->nRanks; ++i) comm->topParentRanks[i] = i; } ncclIntruQueueMpscConstruct(&comm->callbackQueue); comm->regCache.pageSize = sysconf(_SC_PAGESIZE); return ncclSuccess; } static ncclResult_t devCommSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; int nRanks = comm->nRanks; struct ncclDevCommAndChannels tmpCommAndChans; struct ncclDevCommAndChannels *devCommAndChans = NULL; struct ncclNvmlCCStatus ccStatus; NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail); NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); ncclCommPushCudaFree(comm, devCommAndChans); comm->devComm = &devCommAndChans->comm; tmpCommAndChans.comm.rank = comm->rank; tmpCommAndChans.comm.nRanks = nRanks; tmpCommAndChans.comm.node = comm->node; tmpCommAndChans.comm.nNodes = comm->nNodes; tmpCommAndChans.comm.abortFlag = comm->abortFlagDev; for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) { tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p]; } tmpCommAndChans.comm.p2pChunkSize = comm->p2pChunkSize; tmpCommAndChans.comm.channels = &devCommAndChans->channels[0]; comm->workArgsBytes = std::min(ncclParamWorkArgsBytes(), ncclMaxKernelArgsSize(comm->cudaArch)); memset(&ccStatus, 0, sizeof(ccStatus)); if (ncclNvmlGetCCStatus(&ccStatus) == ncclSuccess && ccStatus.CCEnabled) { comm->workFifoBytes = 0; if 
(ccStatus.multiGpuCCEnabled == false && comm->rank == 0) { WARN("CC On, Multi-GPU CC Off (No inter-GPU communication protection)"); } } else { comm->workFifoBytes = ncclParamWorkFifoBytes(); if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) { WARN("NCCL_WORK_FIFO_BYTES=%d is being ignored because it is not a power of 2.", comm->workFifoBytes); comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; } comm->workFifoBytes = std::min(comm->workFifoBytes, 1u<<30); } if (comm->rank == 0) { INFO(NCCL_INIT, "CC %s, Multi-GPU CC %s, workFifoBytes %d", ccStatus.CCEnabled ? "On" : "Off", ccStatus.multiGpuCCEnabled ? "On" : "Off", comm->workFifoBytes); } if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { // The workFifoBuf lives in GDR mapped CUDA memory. NCCLCHECKGOTO(ncclGdrCudaCalloc(&comm->workFifoBuf, &comm->workFifoBufDev, comm->workFifoBytes, &comm->workFifoBufGdrHandle), ret, fail); ncclCommPushCudaGdrFree(comm, comm->workFifoBufGdrHandle); } else { // The workFifoBuf lives in cudaHost memory. comm->workFifoBufGdrHandle = nullptr; NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoBuf, comm->workFifoBytes), ret, fail); ncclCommPushCudaHostFree(comm, comm->workFifoBuf); comm->workFifoBufDev = comm->workFifoBuf; } NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoConsumed, MAXCHANNELS), ret, fail); ncclCommPushCudaHostFree(comm, comm->workFifoConsumed); comm->workFifoProduced = 0; comm->workFifoConsumedLeast = 0; tmpCommAndChans.comm.workConsumed = comm->workFifoConsumed; if (comm->collNetDenseToUserRank != nullptr) { NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); ncclCommPushCudaFree(comm, tmpCommAndChans.comm.collNetDenseToUserRank); NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); } for (int c=0; c < MAXCHANNELS; c++) { tmpCommAndChans.channels[c].peers = comm->channels[c].devPeers; tmpCommAndChans.channels[c].ring = comm->channels[c].ring; tmpCommAndChans.channels[c].ring.userRanks = comm->channels[c].devRingUserRanks; tmpCommAndChans.channels[c].tree = comm->channels[c].tree; tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain; tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect; tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls; if (comm->channels[c].ring.userRanks != nullptr) { NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); } } NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); exit: NCCLCHECK(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream)); NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream)); return ret; fail: goto exit; } // Pre-process the string so that running "strings" on the lib can quickly reveal the version. #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." 
STR(CUDA_MINOR) static void showVersion() { if (ncclDebugLevel == NCCL_LOG_VERSION || ncclDebugLevel == NCCL_LOG_WARN) { VERSION("%s", VERSION_STRING); } else { INFO(NCCL_ALL,"%s", VERSION_STRING); } } static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { info->rank = comm->rank; info->cudaDev = comm->cudaDev; info->nvmlDev = comm->nvmlDev; info->hostHash=getHostHash()+commHash; info->pidHash=getPidHash()+commHash; info->cuMemSupport = ncclCuMemEnable(); // Get the device MAJOR:MINOR of /dev/shm so we can use that // information to decide whether we can use SHM for inter-process // communication in a container environment struct stat statbuf; SYSCHECK(stat("/dev/shm", &statbuf), "stat"); info->shmDev = statbuf.st_dev; info->busId = comm->busId; NCCLCHECK(ncclGpuGdrSupport(comm, &info->gdrSupport)); info->comm = comm; info->cudaCompCap = comm->minCompCap = comm->maxCompCap = comm->compCap; // MNNVL support { // MNNVL: Request the fabric UUID and partition info char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; nvmlDevice_t nvmlDev; NCCLCHECK(int64ToBusId(info->busId, busId)); NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev)); info->fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED; (void) ncclNvmlDeviceGetGpuFabricInfoV(nvmlDev, &info->fabricInfo); if (info->fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) { INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x", info->busId, ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1], info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask); } } return ncclSuccess; } static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) { TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); NCCLCHECK(initChannel(comm, channelId)); struct ncclRing* ring = &comm->channels[channelId].ring; // Find our ring-distance from rank zero and reorganize ranks to start with rank. int ixZero=0, ixRank=0; for (int i=0; i < nranks; i++) { if (ringRanks[i] == 0) ixZero = i; if (ringRanks[i] == rank) ixRank = i; } ring->index = (ixRank-ixZero + nranks)%nranks; for (int i=0; iuserRanks[i] = ringRanks[(i+ixRank)%nranks]; } return ncclSuccess; } #define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine)) #define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t)) #define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */ NCCL_PARAM(BuffSize, "BUFFSIZE", -2); NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2); NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2); NCCL_PARAM(P2pNetChunkSize, "P2P_NET_CHUNKSIZE", (1 << 17)); /* 128 kB */ NCCL_PARAM(P2pPciChunkSize, "P2P_PCI_CHUNKSIZE", (1 << 17)); /* 128 kB */ NCCL_PARAM(P2pNvlChunkSize, "P2P_NVL_CHUNKSIZE", (1 << 19)); /* 512 kB */ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { int cpuArch, cpuVendor, cpuModel; NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() }; int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE }; for (int p=0; pbuffSizes[p] = envs[p] != -2 ? 
envs[p] : defaults[p]; } if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize(); else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize(); else comm->p2pChunkSize = ncclParamP2pPciChunkSize(); // Make sure P2P chunksize is not larger than coll chunksize. if (comm->p2pChunkSize * NCCL_STEPS > comm->buffSizes[NCCL_PROTO_SIMPLE]) comm->p2pChunkSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; if (comm->sharedRes->owner != comm) { /* make sure split comm p2pChunkSize won't exceed shared p2pChunkSize. */ comm->p2pChunkSize = std::min(comm->p2pChunkSize, comm->sharedRes->tpP2pChunkSize); } else { comm->sharedRes->tpP2pChunkSize = comm->p2pChunkSize; } INFO(NCCL_INIT, "P2P Chunksize set to %d", comm->p2pChunkSize); return ncclSuccess; } NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0); NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2); NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1); NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0); // MNNVL: Flag to indicate whether to enable Multi-Node NVLink NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2); #if CUDART_VERSION >= 11030 #include #include "cudawrap.h" // Determine if MNNVL support is available static int checkMNNVL(struct ncclComm* comm) { ncclResult_t ret = ncclSuccess; // MNNVL requires cuMem to be enabled if (!ncclCuMemEnable()) return 0; // MNNVL also requires FABRIC handle support int cudaDev; int flag = 0; CUdevice currentDev; CUDACHECK(cudaGetDevice(&cudaDev)); CUCHECK(cuDeviceGet(¤tDev, cudaDev)); // Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));; if (!flag) return 0; // Check that all ranks have initialized the fabric fully for (int i = 0; i < comm->nRanks; i++) { if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return 0; } // Determine our MNNVL domain/clique NCCLCHECKGOTO(ncclCalloc(&comm->clique.ranks, comm->nRanks), ret, fail); comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId; for (int i = 0; i < comm->nRanks; i++) { nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo; nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo; // Check if the cluster UUID and cliqueId match // A zero UUID means we don't have MNNVL fabric info - disable MNNVL if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) goto fail; if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { if (i == comm->rank) { comm->cliqueRank = comm->clique.size; } comm->clique.ranks[comm->clique.size++] = i; } } // Determine whether to enable MNNVL or not comm->MNNVL = ncclParamMNNVLEnable() == 2 ? 
comm->clique.size > 1 : ncclParamMNNVLEnable(); INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d ", comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank); if (comm->MNNVL) { // Force the CUMEM handle type to be FABRIC for MNNVL ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC; } return comm->MNNVL; fail: if (comm->clique.ranks) free(comm->clique.ranks); return 0; } #else static int checkMNNVL(struct ncclComm* comm) { return 0; } #endif #define TIMER_INIT_TOTAL 0 #define TIMER_INIT_KERNELS 1 #define TIMER_INIT_BOOTSTRAP 2 #define TIMER_INIT_ALLGATHER 3 #define TIMER_INIT_TOPO 4 #define TIMER_INIT_GRAPHS 5 #define TIMER_INIT_CONNECT 6 #define TIMERS_INIT_COUNT 7 static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) { // We use 2 AllGathers // 1. { peerInfo, comm, compCap} // 2. { nChannels, graphInfo, topoRanks } ncclResult_t ret = ncclSuccess; int rank = comm->rank; int nranks = comm->nRanks; int nNodes = 1; cpu_set_t affinitySave; struct ncclTopoGraph* ringGraph = &comm->graphs[NCCL_ALGO_RING]; struct ncclTopoGraph* treeGraph = &comm->graphs[NCCL_ALGO_TREE]; struct ncclTopoGraph* collNetChainGraph = &comm->graphs[NCCL_ALGO_COLLNET_CHAIN]; struct ncclTopoGraph* collNetDirectGraph = &comm->graphs[NCCL_ALGO_COLLNET_DIRECT]; struct ncclTopoGraph* nvlsGraph = &comm->graphs[NCCL_ALGO_NVLS]; struct ncclTopoGraph* graphs[] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph }; struct graphInfo { int pattern; int nChannels; int sameChannels; float bwIntra; float bwInter; int typeIntra; int typeInter; int crossNic; }; struct allGatherInfo { struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS]; struct ncclTopoRanks topoRanks; int cpuArch; int cpuVendor; }; int nChannelsOrig; struct allGatherInfo *allGather3Data = NULL; struct ncclTopoRanks** allTopoRanks = NULL; int *nodesFirstRank = NULL, *nodesTreePatterns = NULL; int *rings = NULL; int* nvbPeers = NULL; struct ncclProxyConnector proxyConn; int* pxnPeers = NULL; int *topParentLocalRanks = NULL; int tpProxyRank; timers[TIMER_INIT_ALLGATHER] = clockNano(); // AllGather1 - begin NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail); comm->cuMemSupport = 1; for (int i = 0; i < nranks; i++) { if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++; if (!comm->peerInfo[i].cuMemSupport) comm->cuMemSupport = 0; if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) { WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId); ret = ncclInvalidUsage; goto fail; } } // AllGather1 - end timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER]; // MNNVL support if (nNodes > 1 && !checkMNNVL(comm) && ncclParamMNNVLEnable() == 1) { // Return an error if the user specifically requested MNNVL support WARN("MNNVL is not supported on this system"); ret = ncclSystemError; goto fail; } do { // Compute intra-process ranks int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0; for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[i].cudaCompCap); for (int i = 0; i < nranks; i++) 
comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[i].cudaCompCap); comm->nvlsRegSupport = 1; for (int i = 0; i < nranks; i++) { if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) { // Rank is in same process if (intraProcRanks == 0) intraProcRank0 = i; if (i == rank) intraProcRank = intraProcRanks; intraProcRanks++; if (intraProcRank0 == rank && rank != i) { comm->peerInfo[i].comm->intraNext = comm->intraNext; comm->intraNext = comm->peerInfo[i].comm; } } if (comm->nvlsRegSupport) { for (int j = i + 1; j < nranks; j++) { if (comm->peerInfo[i].hostHash == comm->peerInfo[j].hostHash && comm->peerInfo[i].pidHash == comm->peerInfo[j].pidHash) { comm->nvlsRegSupport = 0; break; } } } } // Buffer Registration is not supported with MNNVL if (comm->MNNVL) comm->nvlsRegSupport = 0; TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0); if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) { WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0); ret = ncclInternalError; goto fail; } struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm; assert(intraProcRank==0 ? comm==comm0 : true); comm->intraComm0 = comm0; comm->intraRank = intraProcRank; comm->intraRanks = intraProcRanks; comm->intraBarrierPhase = 0; comm->intraBarrierCounter = 0; comm->intraBarrierGate = 0; } while(0); timers[TIMER_INIT_TOPO] = clockNano(); // Topo detection / System graph creation NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail); // Compute paths between GPUs and NICs NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail); // Remove inaccessible GPUs and unused NICs NCCLCHECKGOTO(ncclTopoTrimSystem(comm->topo, comm), ret, fail); // Recompute paths after trimming NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail); // Init search NCCLCHECKGOTO(ncclTopoSearchInit(comm->topo), ret, fail); // Decide on comm's CPU architecture. NCCLCHECKGOTO(ncclTopoComputeCommCPU(comm), ret, fail); // Print final topology NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail); timers[TIMER_INIT_TOPO] = clockNano() - timers[TIMER_INIT_TOPO]; // Set Affinity to a CPU local the our GPU, so that all memory we allocate // on the host is local. 
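/* The affinity handling that follows saves the caller's CPU mask, pins the thread to
 * CPUs close to the GPU while host-side buffers are allocated, and restores the
 * original mask at the exit label. A minimal sketch of that save/set/restore pattern,
 * assuming `localMask` is a cpu_set_t describing the GPU-local CPUs (hypothetical name):
 *
 *   cpu_set_t save;
 *   sched_getaffinity(0, sizeof(cpu_set_t), &save);        // remember caller's mask
 *   if (CPU_COUNT(&localMask)) sched_setaffinity(0, sizeof(cpu_set_t), &localMask);
 *   // ... allocate host buffers: they land on the GPU-local NUMA node ...
 *   sched_setaffinity(0, sizeof(cpu_set_t), &save);        // restore caller's mask
 */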
NCCLCHECKGOTO(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity), ret, fail); if (CPU_COUNT(&comm->cpuAffinity)) { sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); } // Determine local CollNet support if (collNetSupport(comm)) { const char *collNetEnable = ncclGetEnv("NCCL_COLLNET_ENABLE"); if (collNetEnable != NULL) { INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable); if (strcmp(collNetEnable, "1") == 0) { comm->collNetSupport = 1; } } } // Determine local Nvls support NCCLCHECK(ncclNvlsInit(comm)); timers[TIMER_INIT_GRAPHS] = clockNano(); // Get rings and trees memset(ringGraph, 0, sizeof(struct ncclTopoGraph)); ringGraph->id = 0; ringGraph->pattern = NCCL_TOPO_PATTERN_RING; ringGraph->minChannels = 1; ringGraph->maxChannels = MAXCHANNELS/2; NCCLCHECKGOTO(ncclTopoCompute(comm->topo, ringGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, ringGraph), ret, fail); memset(treeGraph, 0, sizeof(struct ncclTopoGraph)); treeGraph->id = 1; treeGraph->pattern = NCCL_TOPO_PATTERN_BALANCED_TREE; treeGraph->minChannels = ringGraph->nChannels; treeGraph->maxChannels = ringGraph->nChannels; NCCLCHECKGOTO(ncclTopoCompute(comm->topo, treeGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, treeGraph), ret, fail); memset(collNetChainGraph, 0, sizeof(struct ncclTopoGraph)); collNetChainGraph->id = 2; collNetChainGraph->pattern = NCCL_TOPO_PATTERN_TREE; collNetChainGraph->collNet = 1; collNetChainGraph->minChannels = ringGraph->nChannels; collNetChainGraph->maxChannels = ringGraph->nChannels; memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph)); collNetDirectGraph->id = 2; collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT; collNetDirectGraph->collNet = 1; collNetDirectGraph->minChannels = 1; collNetDirectGraph->maxChannels = MAXCHANNELS; if (comm->collNetSupport) { NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetChainGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetChainGraph), ret, fail); NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetDirectGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetDirectGraph), ret, fail); } memset(nvlsGraph, 0, sizeof(struct ncclTopoGraph)); nvlsGraph->id = 3; nvlsGraph->pattern = NCCL_TOPO_PATTERN_NVLS; nvlsGraph->minChannels = 1; nvlsGraph->maxChannels = MAXCHANNELS; if (comm->nvlsSupport) { NCCLCHECKGOTO(ncclTopoCompute(comm->topo, nvlsGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, nvlsGraph), ret, fail); } timers[TIMER_INIT_GRAPHS] = clockNano() - timers[TIMER_INIT_GRAPHS]; // Initialize num P2P LL buffers for this communicator comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1; if (comm->rank == ncclParamGraphDumpFileRank()) { struct ncclTopoGraph* dumpGraphs[5] = { ringGraph, treeGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph }; NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 5, dumpGraphs), ret, fail); } // Because timers[[TIMER_INIT_ALLGATHER] already contains the timing of the first allgather, // we temporarily store the start time of the subsequent one in an as-of-yet unused CONNECT timer. 
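/* Each TIMER_INIT_* slot is used as "start timestamp, later overwritten with the
 * elapsed time". A minimal sketch of that convention and of the borrow trick described
 * above, assuming clockNano() returns a monotonic nanosecond timestamp:
 *
 *   timers[TIMER_INIT_TOPO] = clockNano();                              // record start
 *   // ... do the work ...
 *   timers[TIMER_INIT_TOPO] = clockNano() - timers[TIMER_INIT_TOPO];    // now holds elapsed ns
 *
 *   // Second allgather: park its start in the still-unused CONNECT slot,
 *   // then accumulate into the ALLGATHER total once it completes.
 *   timers[TIMER_INIT_CONNECT] = clockNano();
 *   // ... allgather ...
 *   timers[TIMER_INIT_ALLGATHER] += clockNano() - timers[TIMER_INIT_CONNECT];
 */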
timers[TIMER_INIT_CONNECT] = clockNano(); // AllGather3 - begin NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail); for (int a=0; apattern; allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels; allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels; allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra; allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter; allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra; allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter; allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic; } allGather3Data[rank].cpuArch = comm->cpuArch; allGather3Data[rank].cpuVendor = comm->cpuVendor; comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels); NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail); // Determine nNodes, firstRanks, ... NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail); NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail); NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail); for (int r=0; rnNodes && nodesFirstRank[node] != firstRank; node++); if (node == comm->nNodes) { comm->nNodes++; nodesFirstRank[node] = firstRank; // Record tree pattern of each node as they can be different depending on sm arch nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern; } comm->rankToNode[r] = node; if (comm->cpuArch != allGather3Data[r].cpuArch && comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) { comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED; } if (comm->cpuVendor != allGather3Data[r].cpuVendor && comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) { comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED; } } // Alert the user to the presence of mixed CPUs. In the past this has caused // locks in some collective routines. This may help debug issues in the future. 
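/* The loop above groups ranks into nodes by the "first rank" each one advertised in
 * the allgather: every rank reporting the same first rank lands on the same node, and
 * a new node index is allocated the first time an unseen first rank appears. Worked
 * example (values illustrative): firstRank = {0,0,0,0, 4,4,4,4} across 8 ranks gives
 * nNodes = 2, nodesFirstRank = {0,4}, rankToNode = {0,0,0,0, 1,1,1,1}. The CPU
 * arch/vendor folding in the same loop collapses comm->cpuArch / comm->cpuVendor to
 * the corresponding *_MIXED value as soon as any rank disagrees, which is what
 * triggers the rank-0 INFO messages below.
 */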
if (rank==0) { if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) { INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected."); } if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) { INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected."); } } // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail); NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail); for (int r=0; rnRanks; r++) { int node = comm->rankToNode[r]; comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks; comm->nodeRanks[node].localRanks++; } // Allocate ranks arrays for each node for (int n=0; nnNodes; n++) { NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail); comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks); comm->nodeRanks[n].localRanks = 0; } // And fill the ranks arrays for (int r=0; rnRanks; r++) { int node = comm->rankToNode[r]; comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r; } comm->node = comm->rankToNode[rank]; comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank; comm->localRank = comm->rankToLocalRank[rank]; comm->localRanks = comm->nodeRanks[comm->node].localRanks; TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d", rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]); if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) { WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d", rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]); ret = ncclInternalError; goto fail; } INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d", comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL); nChannelsOrig = comm->nChannels; NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail); for (int i=0; inChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels); graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels); graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra); graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter); graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra); graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter); graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic); } } if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0; if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0; comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels); if (comm->nChannels < nChannelsOrig) { // We started duplicating channels during Preset(), so we need to move the // duplicated channels since we have removed some. 
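/* Worked example of the compaction below (counts illustrative): if Preset() laid out
 * nChannelsOrig = 4 originals followed by their duplicates at indices 4..7, and the
 * allgather reduced the agreed channel count to nChannels = 3, the surviving
 * duplicates must move from indices 4..6 down to 3..5 so they sit immediately after
 * the surviving originals; that is what the memcpy loop below does, one ncclChannel
 * at a time.
 */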
for (int i=0; inChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel)); } // Determine CollNet support after all-gather now that we know nNodes and each node localRanks if (comm->collNetSupport == 1) { int collNetNodeThreshold = ncclParamCollNetNodeThreshold(); if (comm->nNodes < collNetNodeThreshold) { INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold); comm->collNetSupport = 0; } comm->collNetRegSupport = true; for (int n=0; nnNodes; n++) { if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) { WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1); comm->collNetSupport = 0; break; } if (comm->nodeRanks[n].localRanks > 1) { // As long as there is more than 1 rank on any node, we need to disable collnet reg comm->collNetRegSupport = false; } } } NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail); // AllGather3 - end timers[TIMER_INIT_ALLGATHER] += clockNano() - timers[TIMER_INIT_CONNECT]; TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels); char line[1024]; line[0]='\0'; for (int c=0; cnChannels; c++) { struct ncclTree* tree = &comm->channels[c].tree; snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d", c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up); INFO(NCCL_GRAPH, "Ring %02d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next); } line[1023] = '\0'; INFO(NCCL_INIT, "Trees%s", line); NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail); // Compute nChannels per peer for p2p NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail); /* until now, all info of comm should be known. We can initialize shared resources and * map localRanks to top parent local ranks. NOTE: this shareRes init must be put before * all proxy operations. */ if (comm->sharedRes->owner == comm) { comm->sharedRes->tpNLocalRanks = comm->localRanks; comm->sharedRes->magic = comm->magic; comm->sharedRes->tpNChannels = comm->nChannels; comm->sharedRes->tpP2pNChannels = comm->p2pnChannels; memcpy(comm->sharedRes->tpRankToLocalRank, comm->rankToLocalRank, sizeof(int) * comm->nRanks); } NCCLCHECKGOTO(ncclCalloc(&topParentLocalRanks, comm->localRanks), ret, fail); for (int i = 0; i < comm->localRanks; ++i) { int tpRank = comm->topParentRanks[comm->localRankToRank[i]]; topParentLocalRanks[i] = comm->sharedRes->tpRankToLocalRank[tpRank]; } comm->topParentLocalRanks = topParentLocalRanks; // Launch proxy service thread, after this, the proxy calls can be used. 
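/* Note on the branch that follows: a child communicator created with
 * config.splitShare set does not launch its own proxy service; it attaches to the
 * parent's proxyState and only bumps its reference count, so the proxy thread can be
 * torn down once the last communicator sharing it goes away. The refcount helpers
 * from utils.h make this explicit; a minimal sketch (values illustrative):
 *
 *   int refs = 1;                          // owner holds one reference
 *   ncclAtomicRefCountIncrement(&refs);    // a split child attaches: refs == 2
 *   if (ncclAtomicRefCountDecrement(&refs) == 0) {
 *     // last user detached: safe to stop the proxy thread and free the state
 *   }
 */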
if (parent && parent->config.splitShare) { comm->proxyState = parent->sharedRes->proxyState; ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount); } else { NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail); } timers[TIMER_INIT_CONNECT] = clockNano(); do { // Build p2p schedule int node = comm->node; int nNodes = comm->nNodes; int nRanks = comm->nRanks; int local = comm->localRank; int nLocals = comm->maxLocalRanks; struct ncclNodeRanks* nodeRanks = comm->nodeRanks; bool flat = false; for (int node = 0; node < nNodes; node++) { if (nodeRanks[node].localRanks != nLocals) { flat = true; nNodes = 1; node = 0; nLocals = nRanks; local = rank; break; } } int nNodesPow2 = pow2Up(nNodes); int nLocalsPow2 = pow2Up(nLocals); comm->p2pSchedule = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); comm->planner.peers = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); uint32_t nodeRound = 0; uint32_t nodeDelta = 0; int round = 0; // When enumerating peer deltas we use the quadratic formula (x*x+x)/2 mod N. // Since that formula only produces valid permutations when N is a pow of 2, // we let N = pow2Up(n) and filter out results greater-eq to n. // Example sequence for 16 ranks: 0, 1, 3, 6, 10, 15, 5, 12, 4, 13, 7, 2, 14, 11, 9, 8 do { if (nodeDelta < nNodes) { // Filter nonsensical node deltas int sendNode = (node + nodeDelta) % nNodes; int recvNode = (node - nodeDelta + nNodes) % nNodes; uint32_t localRound = 0; uint32_t localDelta = 0; do { if (localDelta < nLocals) { // Filter nonsensical node-local deltas int sendLocal = (local + localDelta) % nLocals; int recvLocal = (local - localDelta + nLocals) % nLocals; comm->p2pSchedule[round].sendRank = flat ? sendLocal : nodeRanks[sendNode].localRankToRank[sendLocal]; comm->p2pSchedule[round].recvRank = flat ? 
recvLocal : nodeRanks[recvNode].localRankToRank[recvLocal]; round += 1; } localRound += 1; localDelta = (localDelta + localRound) & (nLocalsPow2 - 1); // Quadratic update } while (localRound != nLocalsPow2); } nodeRound += 1; nodeDelta = (nodeDelta + nodeRound) & (nNodesPow2 - 1); // Quadratic update } while (nodeRound != nNodesPow2); if (round != nRanks) { WARN("P2p schedule creation has bugs."); ret = ncclInternalError; goto fail; } } while (0); comm->runtimeConn = comm->cuMemSupport && ncclParamRuntimeConnect(); if (comm->runtimeConn) { for (int c=0; cnChannels; c++) { NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail); } // Setup NVLS NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); // Check if we can setup CollNet if (comm->collNetSupport > 0) ncclCollNetSetup(comm, parent, graphs); } else { for (int c=0; cnChannels; c++) { NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail); } NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail); // Connect Trees NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail); // Setup NVLS NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail); // And NVLS trees if needed NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail); // Check if we can setup CollNet if (comm->collNetSupport > 0) { ncclCollNetSetup(comm, parent, graphs); NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail); NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail); } // Connect to local net proxy tpProxyRank = comm->topParentRanks[comm->rank]; NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); // Then to remote ones when using PXN if (ncclPxnDisable(comm) == 0) { int nranks; NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail); for (int r=0; rtopParentRanks[pxnPeers[r]]; NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); } } if (ncclParamNvbPreconnect()) { // Connect p2p when using NVB path int nvbNpeers; NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail); for (int r=0; rp2pSchedule[sendRound].sendRank != peer) sendRound++; while (comm->p2pSchedule[recvRound].recvRank != peer) recvRound++; uint8_t sendBase = ncclP2pChannelBaseForRound(comm, sendRound); uint8_t recvBase = ncclP2pChannelBaseForRound(comm, recvRound); for (int c=0; cp2pnChannelsPerPeer; c++) { int channelId; channelId = ncclP2pChannelForPart(comm->p2pnChannels, sendBase, c); if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { comm->connectSend[peer] |= (1UL<p2pnChannels, recvBase, c); if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { comm->connectRecv[peer] |= (1UL<nChannels); // Compute time models for algorithm and protocol combinations NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail); INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); if (comm->intraRank == 0) { // Load ncclParamLaunchMode const char* str = ncclGetEnv("NCCL_LAUNCH_MODE"); enum 
ncclLaunchMode mode, modeOld; if (str && strcasecmp(str, "GROUP") == 0) { mode = ncclLaunchModeGroup; } else { mode = ncclLaunchModeParallel; } // In theory we could be racing with other communicators not associated with // this one if the user is connecting to multiple ncclUniqueId's concurrently. modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED); if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') { INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP"); } } // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock. NCCLCHECKGOTO(devCommSetup(comm), ret, fail); timers[TIMER_INIT_CONNECT] = clockNano() - timers[TIMER_INIT_CONNECT]; /* Local intra-node barrier */ NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); // We should have allocated all buffers, collective fifos, ... we can // restore the affinity. TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); exit: if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be * properly cleaned up. */ if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm); free(allTopoRanks); free(nodesTreePatterns); free(nodesFirstRank); free(allGather3Data); free(rings); free(nvbPeers); free(pxnPeers); return ret; fail: goto exit; } NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0); NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", NCCL_CONFIG_UNDEF_INT); // Match config max/minCTAs NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT); NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT); #define NCCL_MAX_CGA_CLUSTER_SIZE 8 struct ncclCommInitRankAsyncJob { struct ncclAsyncJob base; struct ncclComm* comm; struct ncclComm** newcomm; int cudaDev; // For ncclCommInitRank int nranks, myrank; ncclUniqueId commId; // for ncclCommSplit struct ncclComm* parent; int color, key; }; struct ncclCommFinalizeAsyncJob { struct ncclAsyncJob base; ncclComm_t comm; }; NCCL_PARAM(CommSplitShareResources, "COMM_SPLIT_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT); static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) { int* colors = NULL; int* keys = NULL; int nRanks = 0, myRank = 0; ncclResult_t ret = ncclSuccess; NCCLCHECKGOTO(ncclCalloc(&colors, parent->nRanks), ret, fail); NCCLCHECKGOTO(ncclCalloc(&keys, parent->nRanks), ret, fail); // Compute nRanks, my rank and the ranks (of the original comm) before and after me colors[parent->rank] = color; keys[parent->rank] = key; NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, colors, sizeof(int)), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, keys, sizeof(int)), ret, fail); // Negative color does not create a new comm. Return now. 
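  // Illustrative example (hypothetical values) of how the grouping below behaves:
  // with parent ranks {0,1,2,3}, colors {7,7,NCCL_SPLIT_NOCOLOR,7} and keys {1,0,0,0},
  // the ranks with color 7 form a 3-rank child comm ordered by key (ties broken by
  // parent rank order), i.e. parentRanksRet = {1,3,0}: parent rank 1 becomes child
  // rank 0, parent rank 3 child rank 1, and parent rank 0 child rank 2. Parent rank 2
  // passed NCCL_SPLIT_NOCOLOR and therefore does not join any child communicator.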
if (color == NCCL_SPLIT_NOCOLOR) goto exit; memset(parentRanksRet, 0xff, sizeof(int) * parent->nRanks); for (int i = 0; i < parent->nRanks; i++) { if (colors[i] != color) continue; // Find where to insert this rank int insert = 0; while (insert < nRanks && keys[parentRanksRet[insert]] <= keys[i]) insert++; // Shift ranks by one after insert for (int r = nRanks; r > insert; r--) parentRanksRet[r] = parentRanksRet[r - 1]; // Insert our rank parentRanksRet[insert] = i; nRanks++; } for (int i = 0; i < nRanks; i++) { if (parentRanksRet[i] == parent->rank) myRank = i; } *nRanksRet = nRanks; *myRankRet = myRank; exit: free(colors); free(keys); return ret; fail: goto exit; } static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; ncclComm_t comm = job->comm; ncclResult_t res = ncclSuccess; int archMajor, archMinor; size_t maxLocalSizeBytes = 0; int cudaDev = job->cudaDev; int* parentRanks = NULL; int cudaArch; uint64_t timers[TIMERS_INIT_COUNT]; timers[TIMER_INIT_TOTAL] = clockNano(); CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail); cudaArch = 100*archMajor + 10*archMinor; timers[TIMER_INIT_KERNELS] = clockNano(); NCCLCHECK(ncclInitKernelsForDevice(cudaArch, &maxLocalSizeBytes)); // Set the maximum kernel stack size of all kernels to avoid // a CUDA memory reconfig on load (c.f. NVSHMEM issue) if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) { TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zu", maxLocalSizeBytes); CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes)); } timers[TIMER_INIT_KERNELS] = clockNano() - timers[TIMER_INIT_KERNELS]; timers[TIMER_INIT_BOOTSTRAP] = clockNano(); if (job->parent) { NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail); NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail); // Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now. 
if (job->color == NCCL_SPLIT_NOCOLOR) goto exit; snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color); NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail); } else { NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail); } timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP]; comm->cudaArch = cudaArch; comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); if (job->parent) { INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init START", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId)); } else { INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId)); } NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail); NCCLCHECKGOTO(ncclTunerPluginLoad(comm), res, fail); if (comm->tuner) { NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog, &comm->tunerContext)); } // update communicator state comm->initState = ncclSuccess; timers[TIMER_INIT_TOTAL] = clockNano() - timers[TIMER_INIT_TOTAL]; // Trace this call for replay tool if (job->parent) { /* unlink child abort flag. */ __atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE); TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks); } else { TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, (unsigned long long)hashUniqueId(job->commId), comm->rank, comm->cudaDev); } if (job->parent) { INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId)); } else { INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId)); } INFO(NCCL_INIT|NCCL_PROFILE,"Init timings: rank %d nranks %d total %.2f (kernels %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, connections %.2f, rest %.2f)", comm->rank, comm->nRanks, timers[TIMER_INIT_TOTAL]/1e9, timers[TIMER_INIT_KERNELS]/1e9, timers[TIMER_INIT_BOOTSTRAP]/1e9, timers[TIMER_INIT_ALLGATHER]/1e9, timers[TIMER_INIT_TOPO]/1e9, timers[TIMER_INIT_GRAPHS]/1e9, timers[TIMER_INIT_CONNECT]/1e9, (timers[TIMER_INIT_TOTAL]-timers[TIMER_INIT_KERNELS]-timers[TIMER_INIT_BOOTSTRAP]-timers[TIMER_INIT_ALLGATHER]-timers[TIMER_INIT_TOPO]-timers[TIMER_INIT_GRAPHS]-timers[TIMER_INIT_CONNECT])/1e9); exit: if (job->newcomm) { /* assign it to user pointer. 
*/ __atomic_store_n(job->newcomm, comm, __ATOMIC_RELEASE); } free(parentRanks); return res; fail: comm->initState = res; goto exit; } #define NCCL_CONFIG_DEFAULT(config, field, undef, defvalue, fieldStr, format) \ if (config->field == undef) { \ config->field = defvalue; \ } else { \ INFO(NCCL_ENV, "Comm config " fieldStr " set to " format, config->field); \ } static ncclResult_t envConfigOverride(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; const char* tmpNetName = comm->config.netName; const char* envNetName; int blockingEnv; int cgaClusterSizeEnv; int minCTAsEnv; int maxCTAsEnv; int splitShareEnv; /* override configuration from env variable. */ blockingEnv = ncclParamCommBlocking(); if (blockingEnv == 0 || blockingEnv == 1) comm->config.blocking = blockingEnv; cgaClusterSizeEnv = ncclParamCGAClusterSize(); if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) { comm->config.cgaClusterSize = cgaClusterSizeEnv; } else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) { WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE); comm->config.cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE; } minCTAsEnv = ncclParamMinCTAs(); if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) { comm->config.minCTAs = minCTAsEnv; } maxCTAsEnv = ncclParamMaxCTAs(); if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) { comm->config.maxCTAs = maxCTAsEnv; } envNetName = ncclGetEnv("NCCL_NET"); if (envNetName) tmpNetName = envNetName; if (tmpNetName != NULL) { int netNameLen = strlen(tmpNetName) + 1; comm->config.netName = (char*)malloc(netNameLen); memcpy((void*)comm->config.netName, tmpNetName, netNameLen); } else { comm->config.netName = NULL; } splitShareEnv = ncclParamCommSplitShareResources(); if (splitShareEnv != NCCL_CONFIG_UNDEF_INT) { comm->config.splitShare = splitShareEnv; } /* cap channels if needed */ if (comm->config.minCTAs > MAXCHANNELS) { WARN("minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS); comm->config.minCTAs = MAXCHANNELS; } if (comm->config.maxCTAs > MAXCHANNELS) { WARN("maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS); comm->config.maxCTAs = MAXCHANNELS; } if (comm->config.minCTAs > comm->config.maxCTAs) { WARN("minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs); comm->config.minCTAs = comm->config.maxCTAs; } if (comm->config.splitShare != 1 && comm->config.splitShare != 0) { WARN("splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare); comm->config.splitShare = 0; } return ret; } static ncclResult_t copyCommConfig(ncclComm_t childComm, ncclComm_t parnet) { memcpy(&childComm->config, &parnet->config, sizeof(ncclConfig_t)); NCCLCHECK(envConfigOverride(childComm)); return ncclSuccess; } static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { ncclResult_t ret = ncclSuccess; /* config must not be NULL in this function */ ncclConfig_t defaultConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t *internalConfigPtr; size_t realSize; internalConfig.magic = 0; internalConfigPtr = &internalConfig; if (config) { memcpy((void*)&realSize, (void*)config, sizeof(size_t)); realSize = realSize > sizeof(ncclConfig_t) ? 
sizeof(ncclConfig_t) : realSize; memcpy((void*)internalConfigPtr, (void*)config, realSize); if (internalConfigPtr->magic != 0xcafebeef) { WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER"); ret = ncclInvalidArgument; goto fail; } /* check version. */ if (internalConfigPtr->version < NCCL_VERSION(2, 14, 0)) { internalConfigPtr->blocking = defaultConfig.blocking; } if (internalConfigPtr->version < NCCL_VERSION(2, 17, 0)) { internalConfigPtr->cgaClusterSize = defaultConfig.cgaClusterSize; internalConfigPtr->minCTAs = defaultConfig.minCTAs; internalConfigPtr->maxCTAs = defaultConfig.maxCTAs; internalConfigPtr->netName = defaultConfig.netName; } } /* check input config attributes, -1 means user-undefined and we should use default value from NCCL. */ if (internalConfigPtr->blocking != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->blocking != 0 && internalConfigPtr->blocking != 1) { WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking); ret = ncclInvalidArgument; goto fail; } if (internalConfigPtr->cgaClusterSize != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->cgaClusterSize < 0) { WARN("Invalid config cgaClusterSize attribute value %d", internalConfigPtr->cgaClusterSize); ret = ncclInvalidArgument; goto fail; } if ((internalConfigPtr->minCTAs != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->minCTAs <= 0) || (internalConfigPtr->maxCTAs != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->maxCTAs <= 0) || (internalConfigPtr->minCTAs > internalConfigPtr->maxCTAs)) { WARN("Invalid config min/max channels attribute value %d/%d", internalConfigPtr->minCTAs, internalConfigPtr->maxCTAs); ret = ncclInvalidArgument; goto fail; } if (internalConfigPtr->splitShare != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->splitShare != 0 && internalConfigPtr->splitShare != 1) { WARN("Invalid config splitShare attribute value %d", internalConfigPtr->splitShare); ret = ncclInvalidArgument; goto fail; } /* default config value can be tuned on different platform. 
*/ NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, minCTAs, NCCL_CONFIG_UNDEF_INT, 1, "Min CTAs", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s"); NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d"); /* assign config to communicator */ comm->config.blocking = internalConfigPtr->blocking; comm->config.cgaClusterSize = internalConfigPtr->cgaClusterSize; comm->config.minCTAs = internalConfigPtr->minCTAs; comm->config.maxCTAs = internalConfigPtr->maxCTAs; comm->config.netName = internalConfigPtr->netName; comm->config.splitShare = internalConfigPtr->splitShare; NCCLCHECKGOTO(envConfigOverride(comm), ret, fail); exit: return ret; fail: goto exit; } static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config) { ncclResult_t res = ncclSuccess; ncclComm_t comm = NULL; struct ncclCommInitRankAsyncJob *job = NULL; const char* env = ncclGetEnv("NCCL_COMM_ID"); if (env && myrank == 0) { INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env); NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&commId, true), res, fail); } NCCLCHECKGOTO(ncclInit(), res, fail); if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) { static pthread_once_t once = PTHREAD_ONCE_INIT; pthread_once(&once, showVersion); } // Make sure the CUDA runtime is initialized. CUDACHECKGOTO(cudaFree(NULL), res, fail); NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, fail); NCCLCHECKGOTO(PtrCheck(config, "CommInitRank", "config"), res, fail); if (nranks < 1 || myrank < 0 || myrank >= nranks) { WARN("Invalid rank requested : %d/%d", myrank, nranks); res = ncclInvalidArgument; goto fail; } NCCLCHECKGOTO(ncclCalloc(&comm, 1), res, fail); NCCLCHECKGOTO(ncclCalloc(&comm->abortFlag, 1), res, fail); NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->abortFlagDev, 1), res, fail); NCCLCHECKGOTO(ncclCalloc(&comm->abortFlagRefCount, 1), res, fail); comm->startMagic = comm->endMagic = NCCL_MAGIC; // Used to detect comm corruption. *comm->abortFlagRefCount = 1; NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail); /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ comm->initState = ncclInternalError; *newcomm = comm; NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->comm = comm; job->nranks = nranks; job->commId = commId; // C++ struct assignment job->myrank = myrank; job->cudaDev = cudaDev; NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail); exit: return ncclGroupErrCheck(res); fail: if (comm) { free(comm->abortFlag); if (comm->abortFlagDev) ncclCudaHostFree((void*)comm->abortFlagDev); free(comm->abortFlagRefCount); free(comm); } if (newcomm) *newcomm = NULL; goto exit; } struct NvtxParamsCommInitRank { int rank; int nranks; int cudaDev; }; constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"}, {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. 
of ranks", nullptr, 0, offsetof(NvtxParamsCommInitRank, nranks)}, {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommInitRank, cudaDev)}, }; NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { // Load the CUDA driver and dlsym hooks (can fail on old drivers) (void)ncclCudaLibraryInit(); int cudaDev; ncclConfig_t config = NCCL_CONFIG_INITIALIZER; CUDACHECK(cudaGetDevice(&cudaDev)); NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload) NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config)); return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { ncclResult_t ret = ncclSuccess; int totalnDev; int *gpuFlags = NULL; ncclConfig_t config = NCCL_CONFIG_INITIALIZER; constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"} }; NVTX3_FUNC_WITH_PARAMS(CommInitAll, CommInitAllSchema, ndev) // Load the CUDA driver and dlsym hooks (can fail on old drivers) (void)ncclCudaLibraryInit(); NCCLCHECKGOTO(PtrCheck(comms, "CommInitAll", "comms"), ret, fail); if (ndev < 0) { WARN("Invalid device count requested : %d", ndev); ret = ncclInvalidArgument; goto fail; } CUDACHECKGOTO(cudaGetDeviceCount(&totalnDev), ret, fail); if (devlist) { NCCLCHECKGOTO(ncclCalloc(&gpuFlags, totalnDev), ret, fail); for (int i = 0; i < ndev; ++i) { /* invalid device check. */ if (devlist[i] < 0 || devlist[i] >= totalnDev) { ret = ncclUnhandledCudaError; goto fail; } /* duplicate device check. 
*/ if (gpuFlags[devlist[i]] != 0) { ret = ncclInvalidUsage; goto fail; } gpuFlags[devlist[i]] = 1; } free(gpuFlags); gpuFlags = nullptr; } ncclUniqueId uniqueId; NCCLCHECKGOTO(ncclGetUniqueId(&uniqueId), ret, fail); NCCLCHECKGOTO(ncclGroupStart(), ret, fail); for (int i=0; i= ncclNumResults || comm == NULL) { WARN("ncclCommSetAsyncError: error comm %p sets state %d", comm, nextState); return ncclInvalidArgument; } __atomic_store_n(&comm->asyncResult, nextState, __ATOMIC_RELEASE); return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config); ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) { NVTX3_FUNC_RANGE_IN(nccl_domain); int cudaDev; ncclResult_t ret = ncclSuccess; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t *internalConfigPtr = NULL; NCCLCHECK(ncclGroupStartInternal()); (void)ncclCudaLibraryInit(); CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, fail); if (config == NULL) internalConfigPtr = &internalConfig; else internalConfigPtr = config; NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail); exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); return ret; fail: if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret); goto exit; } static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_; ncclComm_t comm = job->comm; int savedDevice; int commDevice = comm->cudaDev; ncclResult_t ret = ncclSuccess; CUDACHECKGOTO(cudaGetDevice(&savedDevice), ret, fail); if (savedDevice != commDevice) { CUDACHECKGOTO(cudaSetDevice(commDevice), ret, fail); } TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult); if (comm->initState == ncclSuccess) { NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->hostStream), ret, fail); NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), ret, fail); NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); // And keep polling until all graphs referencing us die. 
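    // Illustrative sketch (hypothetical caller code): a communicator whose collectives
    // were captured into a CUDA graph keeps persistentRefs non-zero until the captured
    // work is released, so the application is expected to drop its graph instances first:
    //
    //   cudaGraphExecDestroy(graphExec);  // releases the captured launches referencing comm
    //   ncclCommDestroy(comm);            // the polling loop below can then terminate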
while (comm->persistentRefs != 0) { NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); } } if ((ret = ncclProxyStop(comm)) != ncclSuccess) { WARN("ncclProxyStop: comm %p (rank = %d) destroys proxy resource error %d", comm, comm->rank, ret); } if (savedDevice != commDevice) { CUDACHECKGOTO(cudaSetDevice(savedDevice), ret, fail); } exit: return ret; fail: goto exit; } static ncclResult_t commCleanup(ncclComm_t comm) { int savedDevice; int commDevice = comm->cudaDev; CUDACHECK(cudaGetDevice(&savedDevice)); if (savedDevice != commDevice) { CUDACHECK(cudaSetDevice(commDevice)); } if (comm->tuner != NULL) { NCCLCHECK(comm->tuner->destroy(comm->tunerContext)); NCCLCHECK(ncclTunerPluginUnload(comm)); } NCCLCHECK(commFree(comm)); if (savedDevice != commDevice) { CUDACHECK(cudaSetDevice(savedDevice)); } return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm); ncclResult_t ncclCommFinalize(ncclComm_t comm) { NVTX3_FUNC_RANGE_IN(nccl_domain); ncclResult_t ret = ncclSuccess; struct ncclCommFinalizeAsyncJob *job = NULL; NCCLCHECK(ncclGroupStartInternal()); if (comm == NULL) goto exit; /* wait comm ready before finalize. */ NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail); /* prevent double finalize. */ if (comm->finalizeCalled) { ret = ncclInvalidArgument; goto fail; } comm->finalizeCalled = true; /* launch async thread to finalize comm. */ NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); job->comm = comm; NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commDestroySync, NULL, free, comm), ret, fail); exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)) }; return ret; fail: if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret); goto exit; } static ncclResult_t commReclaim(struct ncclAsyncJob* job_) { struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_; ncclComm_t comm = job->comm; ncclResult_t ret = ncclSuccess; if (comm->intraComm0 != NULL) { int curRankCnt; int curRank; /* Debug info */ int intraRanks = comm->intraRanks; ncclComm_t intracomm0 = comm->intraComm0; int *finalizeRankCnt = &intracomm0->finalizeRankCnt; assert(intracomm0 != NULL && finalizeRankCnt != NULL); curRankCnt = __atomic_add_fetch(finalizeRankCnt, 1, __ATOMIC_ACQ_REL); if (curRankCnt == intraRanks) { ncclComm_t curIntraComm; ncclComm_t nextIntraComm = intracomm0; /* this is the last call to ncclCommDestroy/Abort, we need to make sure all comms * in the process have been finalized before we free local resources. */ while (nextIntraComm) { curIntraComm = nextIntraComm; curRank = curIntraComm->rank; nextIntraComm = nextIntraComm->intraNext; if (curIntraComm->finalizeCalled == false) { struct ncclCommFinalizeAsyncJob job; job.comm = curIntraComm; /* every comm aborts, commDestroySync should not be blocked. */ if ((ret = commDestroySync((struct ncclAsyncJob*) &job)) != ncclSuccess) WARN("commReclaim: comm %p (rank = %d) in commDestroySync, error %d", curIntraComm, curRank, ret); } } /* free local resources. 
*/ nextIntraComm = intracomm0; while (nextIntraComm) { curIntraComm = nextIntraComm; curRank = curIntraComm->rank; nextIntraComm = nextIntraComm->intraNext; if ((ret = commCleanup(curIntraComm)) != ncclSuccess) { WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", curIntraComm, curRank, ret); } } } } return ret; } NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); ncclResult_t ncclCommDestroy(ncclComm_t comm) { if (comm == NULL) { NVTX3_FUNC_RANGE_IN(nccl_domain); return ncclSuccess; } int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; struct ncclCommFinalizeAsyncJob *job = NULL; ncclResult_t res = ncclSuccess; NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload) TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) { WARN("comm %p has already been destroyed", comm); return ncclInvalidArgument; } comm->destroyFlag = 1; /* init thread must be joined before we destroy the comm. */ NCCLCHECK(ncclCommEnsureReady(comm)); NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->comm = comm; NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail); exit: return res; fail: free(job); goto exit; } NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); ncclResult_t ncclCommAbort(ncclComm_t comm) { if (comm == NULL) { NVTX3_FUNC_RANGE_IN(nccl_domain); return ncclSuccess; } int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; struct ncclCommFinalizeAsyncJob *job = NULL; ncclResult_t res = ncclSuccess; NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload) TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); // Ask anything that might still be running on the device to quit if (comm->childAbortFlag != nullptr) { __atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE); __atomic_store_n(comm->childAbortFlagDev, 1, __ATOMIC_RELEASE); } __atomic_store_n(comm->abortFlag, 1, __ATOMIC_RELEASE); __atomic_store_n(comm->abortFlagDev, 1, __ATOMIC_RELEASE); comm->destroyFlag = 1; /* init thread must be joined before we destroy the comm, * and we should ignore the init error here. */ ncclCommEnsureReady(comm); NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->comm = comm; NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail); exit: return ncclSuccess; fail: free(job); goto exit; } NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) { struct ncclCommInitRankAsyncJob *job = NULL; struct ncclComm* childComm = NCCL_COMM_NULL; ncclResult_t res = ncclSuccess; NCCLCHECK(ncclGroupStartInternal()); NCCLCHECKGOTO(CommCheck(comm, "CommSplit", "comm"), res, fail); NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail); NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail); /* *newcomm should be NCCL_COMM_NULL until comm split fully complete. 
*/ *newcomm = NCCL_COMM_NULL; if (color == NCCL_SPLIT_NOCOLOR) { INFO(NCCL_INIT, "Rank %d has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator", comm->rank); } else { NCCLCHECKGOTO(ncclCalloc(&childComm, 1), res, fail); childComm->startMagic = childComm->endMagic = NCCL_MAGIC; if (comm->config.splitShare) { childComm->abortFlag = comm->abortFlag; childComm->abortFlagDev = comm->abortFlagDev; childComm->abortFlagRefCount = comm->abortFlagRefCount; comm->childAbortFlag = NULL; ncclAtomicRefCountIncrement(comm->abortFlagRefCount); } else { NCCLCHECKGOTO(ncclCalloc(&childComm->abortFlag, 1), res, fail); NCCLCHECKGOTO(ncclCudaHostCalloc(&childComm->abortFlagDev, 1), res, fail); NCCLCHECKGOTO(ncclCalloc(&childComm->abortFlagRefCount, 1), res, fail); /* temporarily used to abort everything during child comm init. */ comm->childAbortFlag = childComm->abortFlag; comm->childAbortFlagDev = childComm->abortFlagDev; *childComm->abortFlagRefCount = 1; } if (config == NULL) { NCCLCHECKGOTO(copyCommConfig(childComm, comm), res, fail); } else { NCCLCHECKGOTO(parseCommConfig(childComm, config), res, fail); } /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ childComm->initState = ncclInternalError; } NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->comm = childComm; job->newcomm = newcomm; job->parent = comm; job->color = color; job->key = key; job->cudaDev = comm->cudaDev; NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail); exit: ncclGroupErrCheck(res); NCCLCHECK(ncclGroupEndInternal()); return res; fail: if (childComm) { if (comm && !comm->config.splitShare) { free(childComm->abortFlag); if (childComm->abortFlagDev) ncclCudaHostFree(childComm->abortFlagDev); free(childComm->abortFlagRefCount); } free(childComm); } if (newcomm) *newcomm = NULL; goto exit; } NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); const char* ncclGetErrorString(ncclResult_t code) { switch (code) { case ncclSuccess : return "no error"; case ncclUnhandledCudaError : return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)"; case ncclSystemError : return "unhandled system error (run with NCCL_DEBUG=INFO for details)"; case ncclInternalError : return "internal error - please report this issue to the NCCL developers"; case ncclInvalidArgument : return "invalid argument (run with NCCL_DEBUG=WARN for details)"; case ncclInvalidUsage : return "invalid usage (run with NCCL_DEBUG=WARN for details)"; case ncclRemoteError : return "remote process exited or there was a network error"; case ncclInProgress : return "NCCL operation in progress"; default : return "unknown result code"; } } /* Returns a human-readable message of the last error that occurred. 
* comm is currently unused and can be set to NULL */ NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm); const char* ncclGetLastError(ncclComm_t comm) { return ncclLastError; } NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError); ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { NCCLCHECK(CommCheck(comm, "ncclGetAsyncError", "comm")); NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError")); *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE); if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE); return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECK(CommCheck(comm, "CommCount", "comm")); NCCLCHECK(PtrCheck(count, "CommCount", "count")); /* init thread must be joined before we access the attributes of comm. */ NCCLCHECK(ncclCommEnsureReady(comm)); *count = comm->nRanks; return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid); ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECK(CommCheck(comm, "CommCuDevice", "comm")); NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid")); NCCLCHECK(ncclCommEnsureReady(comm)); *devid = comm->cudaDev; return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank); ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECK(CommCheck(comm, "CommUserRank", "comm")); NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank")); NCCLCHECK(ncclCommEnsureReady(comm)); *rank = comm->rank; return ncclSuccess; } NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size); ncclResult_t ncclMemAlloc(void **ptr, size_t size) { NVTX3_FUNC_RANGE_IN(nccl_domain); ncclResult_t ret = ncclSuccess; #if CUDART_VERSION >= 12010 size_t memGran = 0; size_t mcGran = 0; CUdevice currentDev; CUmemAllocationProp memprop = {}; CUmulticastObjectProp mcprop = {}; CUmemAccessDesc accessDesc = {}; CUmemGenericAllocationHandle handle; int cudaDev; int flag = 0; int dcnt; int mcSupport = 0; if (ptr == NULL || size == 0) goto fallback; if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; CUDACHECK(cudaGetDevice(&cudaDev)); CUCHECK(cuDeviceGet(¤tDev, cudaDev)); if (CUPFN(cuMulticastCreate) != NULL) CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev)); if (mcSupport) { memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; memprop.requestedHandleTypes = ncclCuMemHandleType; memprop.location.id = currentDev; // Query device to see if RDMA support is available CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev)); if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); /* mc property */ CUDACHECK(cudaGetDeviceCount(&dcnt)); mcprop.size = size; /* device cnt is a dummy value right now, it might affect mc granularity in the future. 
*/ mcprop.numDevices = dcnt; mcprop.handleTypes = ncclCuMemHandleType; mcprop.flags = 0; CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); /* only size needs to be aligned to mcGran */ ALIGN_SIZE(size, mcGran); /* Allocate the physical memory on the device */ CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); /* Reserve a virtual address range */ CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0)); /* Map the virtual address range to the physical allocation */ CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); /* Now allow RW access to the newly mapped memory */ for (int i = 0; i < dcnt; ++i) { int p2p = 0; if (i == cudaDev || ((cudaDeviceCanAccessPeer(&p2p, cudaDev, i) == cudaSuccess) && p2p)) { accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = i; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); } } goto exit; } fallback: #endif CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail); exit: return ret; fail: goto exit; } NCCL_API(ncclResult_t, ncclMemFree, void *ptr); ncclResult_t ncclMemFree(void *ptr) { NVTX3_FUNC_RANGE_IN(nccl_domain); ncclResult_t ret = ncclSuccess; int saveDevice; CUDACHECK(cudaGetDevice(&saveDevice)); #if CUDART_VERSION >= 12010 CUdevice ptrDev = 0; int mcSupport = 0; if (ptr == NULL) goto fallback; if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail); if (CUPFN(cuMulticastCreate) != NULL) CUCHECKGOTO(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, ptrDev), ret, fail); CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail); if (mcSupport) { NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail); goto exit; } fallback: #endif CUDACHECKGOTO(cudaFree(ptr), ret, fail); exit: cudaSetDevice(saveDevice); return ret; fail: goto exit; } nccl-2.22.3-1/src/init_nvtx.cc000066400000000000000000000015511463451655400160140ustar00rootroot00000000000000#include "nccl.h" #include "nvtx.h" static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { {"Sum", ncclSum, 0}, {"Product", ncclProd, 0}, {"Max", ncclMax, 0}, {"Min", ncclMin, 0}, {"Avg", ncclAvg, 0} }; // Must be called before the first call to any reduction operation. void initNvtxRegisteredEnums() { // Register schemas and strings constexpr const nvtxPayloadEnumAttr_t eAttr { .fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID, .name = NULL, .entries = NvtxEnumRedSchema, .numEntries = std::extent::value, .sizeOfEnum = sizeof(ncclRedOp_t), .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP, .extension = nullptr }; nvtxPayloadEnumRegister(nvtx3::domain::get(), &eAttr); } nccl-2.22.3-1/src/misc/000077500000000000000000000000001463451655400144145ustar00rootroot00000000000000nccl-2.22.3-1/src/misc/argcheck.cc000066400000000000000000000061471463451655400165020ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "argcheck.h"
#include "comm.h"

ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
  cudaPointerAttributes attr;
  cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
  if (err != cudaSuccess || attr.devicePointer == NULL) {
    WARN("%s : %s %p is not a valid pointer", opname, ptrname, pointer);
    return ncclInvalidArgument;
  }
#if CUDART_VERSION >= 10000
  if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
#else
  if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
#endif
    WARN("%s : %s allocated on device %d mismatches with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
    return ncclInvalidArgument;
  }
  return ncclSuccess;
}

ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
  if (ptr == NULL) {
    WARN("%s : %s argument is NULL", opname, ptrname);
    return ncclInvalidArgument;
  }
  return ncclSuccess;
}

ncclResult_t CommCheck(struct ncclComm* comm, const char* opname, const char* ptrname) {
  NCCLCHECK(PtrCheck(comm, opname, ptrname));
  if (comm->startMagic != NCCL_MAGIC || comm->endMagic != NCCL_MAGIC) {
    WARN("Error: corrupted comm object detected");
    return ncclInvalidArgument;
  }
  return ncclSuccess;
}

ncclResult_t ArgsCheck(struct ncclInfo* info) {
  // First, the easy ones
  if (info->root < 0 || info->root >= info->comm->nRanks) {
    WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks);
    return ncclInvalidArgument;
  }
  if (info->datatype < 0 || info->datatype >= ncclNumTypes) {
    WARN("%s : invalid type %d", info->opName, info->datatype);
    return ncclInvalidArgument;
  }
  if (info->op < 0 || ncclMaxRedOp < info->op) {
    WARN("%s : invalid reduction operation %d", info->opName, info->op);
    return ncclInvalidArgument;
  }
  int opIx = int(ncclUserRedOpMangle(info->comm, info->op)) - int(ncclNumOps);
  if (ncclNumOps <= info->op &&
      (info->comm->userRedOpCapacity <= opIx || info->comm->userRedOps[opIx].freeNext != -1)) {
    WARN("%s : reduction operation %d unknown to this communicator", info->opName, info->op);
    return ncclInvalidArgument;
  }

  if (info->comm->checkPointers) {
    if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv)) {
      if (info->count > 0) NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName));
    } else {
      // Check CUDA device pointers
      if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) {
        NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
      }
      if (info->coll != ncclFuncReduce || info->comm->rank == info->root) {
        NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
      }
    }
  }
  return ncclSuccess;
}
nccl-2.22.3-1/src/misc/cudawrap.cc000066400000000000000000000162341463451655400165370ustar00rootroot00000000000000/*************************************************************************
 * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
* * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #include "debug.h" #include "param.h" #include "cudawrap.h" // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2); // Handle type used for cuMemCreate() CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; static int ncclCuMemSupported = 0; // Determine whether CUMEM & VMM RDMA is supported on this platform int ncclIsCuMemSupported() { #if CUDART_VERSION < 11030 return 0; #else CUdevice currentDev; int cudaDev; int cudaDriverVersion; int flag = 0; ncclResult_t ret = ncclSuccess; CUDACHECKGOTO(cudaDriverGetVersion(&cudaDriverVersion), ret, error); if (cudaDriverVersion < 12000) return 0; // Need CUDA_VISIBLE_DEVICES support CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, error); if (CUPFN(cuMemCreate) == NULL) return 0; CUCHECKGOTO(cuDeviceGet(¤tDev, cudaDev), ret, error); // Query device to see if CUMEM VMM support is available CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, currentDev), ret, error); if (!flag) return 0; // Query device to see if CUMEM RDMA support is available CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev), ret, error); if (!flag) return 0; error: return (ret == ncclSuccess); #endif } int ncclCuMemEnable() { // NCCL_CUMEM_ENABLE=-2 means auto-detect CUMEM support int param = ncclParamCuMemEnable(); return param >= 0 ? param : (param == -2 && ncclCuMemSupported); } #define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr #if CUDART_VERSION >= 11030 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ DECLARE_CUDA_PFN(cuDeviceGet); DECLARE_CUDA_PFN(cuDeviceGetAttribute); DECLARE_CUDA_PFN(cuGetErrorString); DECLARE_CUDA_PFN(cuGetErrorName); /* enqueue.cc */ DECLARE_CUDA_PFN(cuMemGetAddressRange); DECLARE_CUDA_PFN(cuLaunchKernel); #if CUDA_VERSION >= 11080 DECLARE_CUDA_PFN(cuLaunchKernelEx); #endif /* proxy.cc */ DECLARE_CUDA_PFN(cuCtxCreate); DECLARE_CUDA_PFN(cuCtxDestroy); DECLARE_CUDA_PFN(cuCtxGetCurrent); DECLARE_CUDA_PFN(cuCtxSetCurrent); DECLARE_CUDA_PFN(cuCtxGetDevice); /* cuMem API support */ DECLARE_CUDA_PFN(cuMemAddressReserve); DECLARE_CUDA_PFN(cuMemAddressFree); DECLARE_CUDA_PFN(cuMemCreate); DECLARE_CUDA_PFN(cuMemGetAllocationGranularity); DECLARE_CUDA_PFN(cuMemExportToShareableHandle); DECLARE_CUDA_PFN(cuMemImportFromShareableHandle); DECLARE_CUDA_PFN(cuMemMap); DECLARE_CUDA_PFN(cuMemRelease); DECLARE_CUDA_PFN(cuMemRetainAllocationHandle); DECLARE_CUDA_PFN(cuMemSetAccess); DECLARE_CUDA_PFN(cuMemUnmap); /* ncclMemAlloc/Free */ DECLARE_CUDA_PFN(cuPointerGetAttribute); #if CUDA_VERSION >= 11070 /* transport/collNet.cc/net.cc*/ DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support #endif #if CUDA_VERSION >= 12010 /* NVSwitch Multicast support */ DECLARE_CUDA_PFN(cuMulticastAddDevice); DECLARE_CUDA_PFN(cuMulticastBindMem); DECLARE_CUDA_PFN(cuMulticastBindAddr); DECLARE_CUDA_PFN(cuMulticastCreate); DECLARE_CUDA_PFN(cuMulticastGetGranularity); DECLARE_CUDA_PFN(cuMulticastUnbind); #endif #endif #define CUDA_DRIVER_MIN_VERSION 11030 int ncclCudaDriverVersionCache = -1; bool ncclCudaLaunchBlocking = false; #if CUDART_VERSION >= 11030 #if CUDART_VERSION >= 12000 #define LOAD_SYM(symbol, ignore) do { \ cudaDriverEntryPointQueryResult driverStatus; \ res = cudaGetDriverEntryPoint(#symbol, (void **) 
(&pfn_##symbol), cudaEnableDefault, &driverStatus); \ if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \ if (!ignore) { \ WARN("Retrieve %s failed with %d status %d", #symbol, res, driverStatus); \ return ncclSystemError; } \ } } while(0) #else #define LOAD_SYM(symbol, ignore) do { \ res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault); \ if (res != cudaSuccess) { \ if (!ignore) { \ WARN("Retrieve %s failed with %d", #symbol, res); \ return ncclSystemError; } \ } } while(0) #endif /* Load the CUDA symbols */ static ncclResult_t cudaPfnFuncLoader(void) { cudaError_t res; LOAD_SYM(cuGetErrorString, 0); LOAD_SYM(cuGetErrorName, 0); LOAD_SYM(cuDeviceGet, 0); LOAD_SYM(cuDeviceGetAttribute, 0); LOAD_SYM(cuMemGetAddressRange, 1); LOAD_SYM(cuCtxCreate, 1); LOAD_SYM(cuCtxDestroy, 1); LOAD_SYM(cuCtxGetCurrent, 1); LOAD_SYM(cuCtxSetCurrent, 1); LOAD_SYM(cuCtxGetDevice, 1); LOAD_SYM(cuLaunchKernel, 1); #if CUDA_VERSION >= 11080 LOAD_SYM(cuLaunchKernelEx, 1); #endif /* cuMem API support */ LOAD_SYM(cuMemAddressReserve, 1); LOAD_SYM(cuMemAddressFree, 1); LOAD_SYM(cuMemCreate, 1); LOAD_SYM(cuMemGetAllocationGranularity, 1); LOAD_SYM(cuMemExportToShareableHandle, 1); LOAD_SYM(cuMemImportFromShareableHandle, 1); LOAD_SYM(cuMemMap, 1); LOAD_SYM(cuMemRelease, 1); LOAD_SYM(cuMemRetainAllocationHandle, 1); LOAD_SYM(cuMemSetAccess, 1); LOAD_SYM(cuMemUnmap, 1); /* ncclMemAlloc/Free */ LOAD_SYM(cuPointerGetAttribute, 1); #if CUDA_VERSION >= 11070 LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support #endif #if CUDA_VERSION >= 12010 /* NVSwitch Multicast support */ LOAD_SYM(cuMulticastAddDevice, 1); LOAD_SYM(cuMulticastBindMem, 1); LOAD_SYM(cuMulticastBindAddr, 1); LOAD_SYM(cuMulticastCreate, 1); LOAD_SYM(cuMulticastGetGranularity, 1); LOAD_SYM(cuMulticastUnbind, 1); #endif return ncclSuccess; } #endif static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; static ncclResult_t initResult; static void initOnceFunc() { do { const char* val = ncclGetEnv("CUDA_LAUNCH_BLOCKING"); ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0); } while (0); ncclResult_t ret = ncclSuccess; int cudaDev; int driverVersion; CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, error); // Initialize the driver CUDACHECKGOTO(cudaDriverGetVersion(&driverVersion), ret, error); INFO(NCCL_INIT, "cudaDriverVersion %d", driverVersion); if (driverVersion < CUDA_DRIVER_MIN_VERSION) { // WARN("CUDA Driver version found is %d. Minimum requirement is %d", driverVersion, CUDA_DRIVER_MIN_VERSION); // Silently ignore version check mismatch for backwards compatibility goto error; } #if CUDART_VERSION >= 11030 if (cudaPfnFuncLoader()) { WARN("CUDA some PFN functions not found in the library"); goto error; } #endif // Determine whether we support the cuMem APIs or not ncclCuMemSupported = ncclIsCuMemSupported(); initResult = ret; return; error: initResult = ncclSystemError; return; } ncclResult_t ncclCudaLibraryInit() { pthread_once(&initOnceControl, initOnceFunc); return initResult; } nccl-2.22.3-1/src/misc/gdrwrap.cc000066400000000000000000000203331463451655400163720ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "gdrwrap.h" #ifndef GDR_DIRECT #include "core.h" /* Function pointers assigned from dlopen() */ static gdr_t (*gdr_internal_open)(void); static int (*gdr_internal_close)(gdr_t g); static int (*gdr_internal_pin_buffer)(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle); static int (*gdr_internal_unpin_buffer)(gdr_t g, gdr_mh_t handle); static int (*gdr_internal_get_info)(gdr_t g, gdr_mh_t handle, gdr_info_t *info); static int (*gdr_internal_map)(gdr_t g, gdr_mh_t handle, void **va, size_t size); static int (*gdr_internal_unmap)(gdr_t g, gdr_mh_t handle, void *va, size_t size); static void (*gdr_internal_runtime_get_version)(int *major, int *minor); static void (*gdr_internal_driver_get_version)(gdr_t g, int *major, int *minor); static int (*gdr_internal_copy_to_mapping)(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size); static int (*gdr_internal_copy_from_mapping)(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size); // Used to make the GDR library calls thread safe pthread_mutex_t gdrLock = PTHREAD_MUTEX_INITIALIZER; #define GDRAPI_LIBNAME "libgdrapi.so" #define LOAD_SYM(handle, symbol, funcptr) do { \ cast = (void**)&funcptr; \ tmp = dlsym(handle, symbol); \ if (tmp == NULL) { \ WARN("dlsym failed on %s - %s", symbol, dlerror());\ goto teardown; \ } \ *cast = tmp; \ } while (0) #define LOAD_SYM_OPTIONAL(handle, symbol, funcptr) do {\ cast = (void**)&funcptr; \ tmp = dlsym(handle, symbol); \ if (tmp == NULL) { \ INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \ } \ *cast = tmp; \ } while (0) static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; static ncclResult_t initResult; static void initOnceFunc(void) { static void* gdrhandle = NULL; void* tmp; void** cast; gdrhandle=dlopen(GDRAPI_LIBNAME, RTLD_NOW); if (!gdrhandle) { WARN("Failed to open %s", GDRAPI_LIBNAME); goto teardown; } /* Load the function pointers from the DL library image */ LOAD_SYM(gdrhandle, "gdr_open", gdr_internal_open); LOAD_SYM(gdrhandle, "gdr_close", gdr_internal_close); LOAD_SYM(gdrhandle, "gdr_pin_buffer", gdr_internal_pin_buffer); LOAD_SYM(gdrhandle, "gdr_unpin_buffer", gdr_internal_unpin_buffer); LOAD_SYM(gdrhandle, "gdr_get_info", gdr_internal_get_info); LOAD_SYM(gdrhandle, "gdr_map", gdr_internal_map); LOAD_SYM(gdrhandle, "gdr_unmap", gdr_internal_unmap); LOAD_SYM(gdrhandle, "gdr_runtime_get_version", gdr_internal_runtime_get_version); LOAD_SYM(gdrhandle, "gdr_driver_get_version", gdr_internal_driver_get_version); LOAD_SYM(gdrhandle, "gdr_copy_to_mapping", gdr_internal_copy_to_mapping); LOAD_SYM(gdrhandle, "gdr_copy_from_mapping", gdr_internal_copy_from_mapping); initResult = ncclSuccess; return; teardown: gdr_internal_open = NULL; gdr_internal_close = NULL; gdr_internal_pin_buffer = NULL; gdr_internal_unpin_buffer = NULL; gdr_internal_get_info = NULL; gdr_internal_map = NULL; gdr_internal_unmap = NULL; gdr_internal_runtime_get_version = NULL; gdr_internal_driver_get_version = NULL; gdr_internal_copy_to_mapping = NULL; gdr_internal_copy_from_mapping = NULL; if (gdrhandle != NULL) dlclose(gdrhandle); initResult = ncclSystemError; return; } ncclResult_t wrap_gdr_symbols(void) { pthread_once(&initOnceControl, initOnceFunc); return initResult; } gdr_t wrap_gdr_open(void) { if (gdr_internal_open == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return NULL; } return gdr_internal_open(); } 
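
// Illustrative call sequence (sketch only, error handling elided; buffer variables such
// as addr, size and hostSrc are hypothetical) showing how the wrappers in this file are
// typically chained together: resolve the symbols once, open the driver, then pin and
// map a GPU buffer so the CPU can copy to/from it through the mapping.
//
//   NCCLCHECK(wrap_gdr_symbols());
//   gdr_t g = wrap_gdr_open();
//   gdr_mh_t mh; gdr_info_t info; void* va;
//   NCCLCHECK(wrap_gdr_pin_buffer(g, addr, size, 0, 0, &mh));
//   NCCLCHECK(wrap_gdr_map(g, mh, &va, size));
//   NCCLCHECK(wrap_gdr_get_info(g, mh, &info));
//   NCCLCHECK(wrap_gdr_copy_to_mapping(mh, va, hostSrc, size));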
ncclResult_t wrap_gdr_close(gdr_t g) { if (gdr_internal_close == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret = gdr_internal_close(g); if (ret != 0) { WARN("gdr_close() failed: %d", ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) { if (gdr_internal_pin_buffer == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret; GDRLOCKCALL(gdr_internal_pin_buffer(g, addr, size, p2p_token, va_space, handle), ret); if (ret != 0) { WARN("gdr_pin_buffer(addr %lx, size %zu) failed: %d", addr, size, ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) { if (gdr_internal_unpin_buffer == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret; GDRLOCKCALL(gdr_internal_unpin_buffer(g, handle), ret); if (ret != 0) { WARN("gdr_unpin_buffer(handle %lx) failed: %d", handle.h, ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) { if (gdr_internal_get_info == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret; GDRLOCKCALL(gdr_internal_get_info(g, handle, info), ret); if (ret != 0) { WARN("gdr_get_info(handle %lx) failed: %d", handle.h, ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) { if (gdr_internal_map == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret; GDRLOCKCALL(gdr_internal_map(g, handle, va, size), ret); if (ret != 0) { WARN("gdr_map(handle %lx, size %zu) failed: %d", handle.h, size, ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) { if (gdr_internal_unmap == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret; GDRLOCKCALL(gdr_internal_unmap(g, handle, va, size), ret); if (ret != 0) { WARN("gdr_unmap(handle %lx, va %p, size %zu) failed: %d", handle.h, va, size, ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor) { if (gdr_internal_runtime_get_version == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } gdr_internal_runtime_get_version(major, minor); return ncclSuccess; } ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) { if (gdr_internal_driver_get_version == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } gdr_internal_driver_get_version(g, major, minor); return ncclSuccess; } ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) { if (gdr_internal_copy_to_mapping == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret; GDRLOCKCALL(gdr_internal_copy_to_mapping(handle, map_d_ptr, h_ptr, size), ret); if (ret != 0) { WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zu) failed: %d", handle.h, map_d_ptr, h_ptr, size, ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) { if (gdr_internal_copy_from_mapping == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; 
} int ret; GDRLOCKCALL(gdr_internal_copy_from_mapping(handle, h_ptr, map_d_ptr, size), ret); if (ret != 0) { WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zu) failed: %d", handle.h, h_ptr, map_d_ptr, size, ret); return ncclSystemError; } return ncclSuccess; } #endif /* !GDR_DIRECT */ nccl-2.22.3-1/src/misc/ibvsymbols.cc000066400000000000000000000166261463451655400171270ustar00rootroot00000000000000#include #include #include "ibvsymbols.h" #ifdef NCCL_BUILD_RDMA_CORE /* RDMA-core linking mode. Symbols are pointers to linked IB Verbs */ #define ASSIGN_SYM(container, symbol, name) container->name= &symbol; // Passthrough function for ibv_reg_mr macro in verbs.h struct ibv_mr* ibv_internal_reg_mr( struct ibv_pd* pd, void* addr, size_t length, int access) { return ibv_reg_mr(pd, addr, length, access); } // Passthrough function for ibv_internal_query_port macro in verbs.h int ibv_internal_query_port( struct ibv_context* context, uint8_t port_num, struct ibv_port_attr* port_attr) { return ibv_query_port(context, port_num, port_attr); } ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) { ASSIGN_SYM(ibvSymbols, ibv_get_device_list, ibv_internal_get_device_list); ASSIGN_SYM(ibvSymbols, ibv_free_device_list, ibv_internal_free_device_list); ASSIGN_SYM(ibvSymbols, ibv_get_device_name, ibv_internal_get_device_name); ASSIGN_SYM(ibvSymbols, ibv_open_device, ibv_internal_open_device); ASSIGN_SYM(ibvSymbols, ibv_close_device, ibv_internal_close_device); ASSIGN_SYM(ibvSymbols, ibv_get_async_event, ibv_internal_get_async_event); ASSIGN_SYM(ibvSymbols, ibv_ack_async_event, ibv_internal_ack_async_event); ASSIGN_SYM(ibvSymbols, ibv_query_device, ibv_internal_query_device); ASSIGN_SYM(ibvSymbols, ibv_query_gid, ibv_internal_query_gid); ASSIGN_SYM(ibvSymbols, ibv_query_qp, ibv_internal_query_qp); ASSIGN_SYM(ibvSymbols, ibv_alloc_pd, ibv_internal_alloc_pd); ASSIGN_SYM(ibvSymbols, ibv_dealloc_pd, ibv_internal_dealloc_pd); ASSIGN_SYM(ibvSymbols, ibv_reg_mr_iova2, ibv_internal_reg_mr_iova2); ASSIGN_SYM(ibvSymbols, ibv_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr); ASSIGN_SYM(ibvSymbols, ibv_dereg_mr, ibv_internal_dereg_mr); ASSIGN_SYM(ibvSymbols, ibv_create_cq, ibv_internal_create_cq); ASSIGN_SYM(ibvSymbols, ibv_destroy_cq, ibv_internal_destroy_cq); ASSIGN_SYM(ibvSymbols, ibv_create_qp, ibv_internal_create_qp); ASSIGN_SYM(ibvSymbols, ibv_modify_qp, ibv_internal_modify_qp); ASSIGN_SYM(ibvSymbols, ibv_destroy_qp, ibv_internal_destroy_qp); ASSIGN_SYM(ibvSymbols, ibv_fork_init, ibv_internal_fork_init); ASSIGN_SYM(ibvSymbols, ibv_event_type_str, ibv_internal_event_type_str); ASSIGN_SYM(ibvSymbols, ibv_query_ece, ibv_internal_query_ece); ASSIGN_SYM(ibvSymbols, ibv_set_ece, ibv_internal_set_ece); ibvSymbols->ibv_internal_reg_mr = &ibv_internal_reg_mr; ibvSymbols->ibv_internal_query_port = &ibv_internal_query_port; return ncclSuccess; } #else /* RDMA-core dynamic loading mode. Symbols are loaded from shared objects. 
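 *
 * In this mode libibverbs is opened at runtime with dlopen("libibverbs.so" or
 * "libibverbs.so.1") and the base entry points are resolved with dlvsym()
 * against the versioned ABI (IBVERBS_1.1). Newer verbs (ibv_reg_mr_iova2 from
 * IBVERBS_1.8, ibv_query_ece/ibv_set_ece from IBVERBS_1.10, ibv_reg_dmabuf_mr
 * from IBVERBS_1.12) are resolved optionally and may remain NULL on older
 * installations; their wrappers check for NULL before use.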
*/ #include #include "core.h" // IBVERBS Library versioning #define IBVERBS_VERSION "IBVERBS_1.1" ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) { static void* ibvhandle = NULL; void* tmp; void** cast; ibvhandle=dlopen("libibverbs.so", RTLD_NOW); if (!ibvhandle) { ibvhandle=dlopen("libibverbs.so.1", RTLD_NOW); if (!ibvhandle) { INFO(NCCL_INIT, "Failed to open libibverbs.so[.1]"); goto teardown; } } #define LOAD_SYM(handle, symbol, funcptr) do { \ cast = (void**)&funcptr; \ tmp = dlvsym(handle, symbol, IBVERBS_VERSION); \ if (tmp == NULL) { \ WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), IBVERBS_VERSION); \ goto teardown; \ } \ *cast = tmp; \ } while (0) // Attempt to load a specific symbol version - fail silently #define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \ cast = (void**)&funcptr; \ *cast = dlvsym(handle, symbol, version); \ } while (0) LOAD_SYM(ibvhandle, "ibv_get_device_list", ibvSymbols->ibv_internal_get_device_list); LOAD_SYM(ibvhandle, "ibv_free_device_list", ibvSymbols->ibv_internal_free_device_list); LOAD_SYM(ibvhandle, "ibv_get_device_name", ibvSymbols->ibv_internal_get_device_name); LOAD_SYM(ibvhandle, "ibv_open_device", ibvSymbols->ibv_internal_open_device); LOAD_SYM(ibvhandle, "ibv_close_device", ibvSymbols->ibv_internal_close_device); LOAD_SYM(ibvhandle, "ibv_get_async_event", ibvSymbols->ibv_internal_get_async_event); LOAD_SYM(ibvhandle, "ibv_ack_async_event", ibvSymbols->ibv_internal_ack_async_event); LOAD_SYM(ibvhandle, "ibv_query_device", ibvSymbols->ibv_internal_query_device); LOAD_SYM(ibvhandle, "ibv_query_port", ibvSymbols->ibv_internal_query_port); LOAD_SYM(ibvhandle, "ibv_query_gid", ibvSymbols->ibv_internal_query_gid); LOAD_SYM(ibvhandle, "ibv_query_qp", ibvSymbols->ibv_internal_query_qp); LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibvSymbols->ibv_internal_alloc_pd); LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibvSymbols->ibv_internal_dealloc_pd); LOAD_SYM(ibvhandle, "ibv_reg_mr", ibvSymbols->ibv_internal_reg_mr); // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8 LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibvSymbols->ibv_internal_reg_mr_iova2, "IBVERBS_1.8"); // Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12 LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibvSymbols->ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12"); LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibvSymbols->ibv_internal_dereg_mr); LOAD_SYM(ibvhandle, "ibv_create_cq", ibvSymbols->ibv_internal_create_cq); LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibvSymbols->ibv_internal_destroy_cq); LOAD_SYM(ibvhandle, "ibv_create_qp", ibvSymbols->ibv_internal_create_qp); LOAD_SYM(ibvhandle, "ibv_modify_qp", ibvSymbols->ibv_internal_modify_qp); LOAD_SYM(ibvhandle, "ibv_destroy_qp", ibvSymbols->ibv_internal_destroy_qp); LOAD_SYM(ibvhandle, "ibv_fork_init", ibvSymbols->ibv_internal_fork_init); LOAD_SYM(ibvhandle, "ibv_event_type_str", ibvSymbols->ibv_internal_event_type_str); LOAD_SYM_VERSION(ibvhandle, "ibv_query_ece", ibvSymbols->ibv_internal_query_ece, "IBVERBS_1.10"); LOAD_SYM_VERSION(ibvhandle, "ibv_set_ece", ibvSymbols->ibv_internal_set_ece, "IBVERBS_1.10"); return ncclSuccess; teardown: ibvSymbols->ibv_internal_get_device_list = NULL; ibvSymbols->ibv_internal_free_device_list = NULL; ibvSymbols->ibv_internal_get_device_name = NULL; ibvSymbols->ibv_internal_open_device = NULL; ibvSymbols->ibv_internal_close_device = NULL; ibvSymbols->ibv_internal_get_async_event = NULL; ibvSymbols->ibv_internal_ack_async_event = NULL; ibvSymbols->ibv_internal_query_device = NULL; 
ibvSymbols->ibv_internal_query_port = NULL; ibvSymbols->ibv_internal_query_gid = NULL; ibvSymbols->ibv_internal_query_qp = NULL; ibvSymbols->ibv_internal_alloc_pd = NULL; ibvSymbols->ibv_internal_dealloc_pd = NULL; ibvSymbols->ibv_internal_reg_mr = NULL; ibvSymbols->ibv_internal_reg_mr_iova2 = NULL; ibvSymbols->ibv_internal_reg_dmabuf_mr = NULL; ibvSymbols->ibv_internal_dereg_mr = NULL; ibvSymbols->ibv_internal_create_cq = NULL; ibvSymbols->ibv_internal_destroy_cq = NULL; ibvSymbols->ibv_internal_create_qp = NULL; ibvSymbols->ibv_internal_modify_qp = NULL; ibvSymbols->ibv_internal_destroy_qp = NULL; ibvSymbols->ibv_internal_fork_init = NULL; ibvSymbols->ibv_internal_event_type_str = NULL; ibvSymbols->ibv_internal_query_ece = NULL; ibvSymbols->ibv_internal_set_ece = NULL; if (ibvhandle != NULL) dlclose(ibvhandle); return ncclSystemError; } #endif nccl-2.22.3-1/src/misc/ibvwrap.cc000066400000000000000000000241311463451655400163760ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "ibvwrap.h" #include #include #include "ibvsymbols.h" static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; static ncclResult_t initResult; struct ncclIbvSymbols ibvSymbols; ncclResult_t wrap_ibv_symbols(void) { pthread_once(&initOnceControl, [](){ initResult = buildIbvSymbols(&ibvSymbols); }); return initResult; } /* CHECK_NOT_NULL: helper macro to check for NULL symbol */ #define CHECK_NOT_NULL(container, internal_name) \ if (container.internal_name == NULL) { \ WARN("lib wrapper not initialized."); \ return ncclInternalError; \ } #define IBV_PTR_CHECK_ERRNO(container, internal_name, call, retval, error_retval, name) \ CHECK_NOT_NULL(container, internal_name); \ retval = container.call; \ if (retval == error_retval) { \ WARN("Call to " name " failed with error %s", strerror(errno)); \ return ncclSystemError; \ } \ return ncclSuccess; #define IBV_PTR_CHECK(container, internal_name, call, retval, error_retval, name) \ CHECK_NOT_NULL(container, internal_name); \ retval = container.call; \ if (retval == error_retval) { \ WARN("Call to " name " failed"); \ return ncclSystemError; \ } \ return ncclSuccess; #define IBV_INT_CHECK_RET_ERRNO_OPTIONAL(container, internal_name, call, success_retval, name, supported) \ if (container.internal_name == NULL) { \ INFO(NCCL_NET, "Call to " name " skipped, internal_name doesn't exist"); \ *supported = 0; \ return ncclSuccess; \ } \ int ret = container.call; \ if (ret == ENOTSUP || ret == EOPNOTSUPP) { \ INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \ *supported = 0; \ return ncclSuccess; \ } else if (ret != success_retval) { \ WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \ *supported = 1; \ return ncclSystemError; \ } \ *supported = 1; \ return ncclSuccess; #define IBV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \ CHECK_NOT_NULL(container, internal_name); \ int ret = container.call; \ if (ret != success_retval) { \ WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \ return ncclSystemError; \ } \ return ncclSuccess; #define IBV_INT_CHECK(container, internal_name, call, error_retval, name) \ CHECK_NOT_NULL(container, internal_name); \ int ret = container.call; \ if (ret == error_retval) { \ WARN("Call to " name 
" failed"); \ return ncclSystemError; \ } \ return ncclSuccess; #define IBV_PASSTHRU(container, internal_name, call) \ CHECK_NOT_NULL(container, internal_name); \ container.call; \ return ncclSuccess; ncclResult_t wrap_ibv_fork_init() { IBV_INT_CHECK(ibvSymbols, ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init"); } ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices) { *ret = ibvSymbols.ibv_internal_get_device_list(num_devices); if (*ret == NULL) *num_devices = 0; return ncclSuccess; } ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list) { IBV_PASSTHRU(ibvSymbols, ibv_internal_free_device_list, ibv_internal_free_device_list(list)); } const char *wrap_ibv_get_device_name(struct ibv_device *device) { if (ibvSymbols.ibv_internal_get_device_name == NULL) { WARN("lib wrapper not initialized."); exit(-1); } return ibvSymbols.ibv_internal_get_device_name(device); } ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device) { /*returns 0 on success, -1 on failure*/ IBV_PTR_CHECK(ibvSymbols, ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device"); } ncclResult_t wrap_ibv_close_device(struct ibv_context *context) { /*returns 0 on success, -1 on failure*/ IBV_INT_CHECK(ibvSymbols, ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device"); } ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event) { /*returns 0 on success, and -1 on error*/ IBV_INT_CHECK(ibvSymbols, ibv_internal_get_async_event, ibv_internal_get_async_event(context, event), -1, "ibv_get_async_event"); } ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event) { IBV_PASSTHRU(ibvSymbols, ibv_internal_ack_async_event, ibv_internal_ack_async_event(event)); } ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device"); } ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port"); } ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) { IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid"); } ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) { IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_qp, ibv_internal_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp"); } ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) { IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd"); } ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd"); } ncclResult_t wrap_ibv_reg_mr(struct ibv_mr 
**ret, struct ibv_pd *pd, void *addr, size_t length, int access) { IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr"); } struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { if (ibvSymbols.ibv_internal_reg_mr == NULL) { WARN("lib wrapper not initialized."); return NULL; } return ibvSymbols.ibv_internal_reg_mr(pd, addr, length, access); } ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) { if (ibvSymbols.ibv_internal_reg_mr_iova2 == NULL) { return ncclInternalError; } if (ret == NULL) { return ncclSuccess; } // Assume dummy call IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2"); } /* DMA-BUF support */ ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr"); } struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { if (ibvSymbols.ibv_internal_reg_dmabuf_mr == NULL) { errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set return NULL; } return ibvSymbols.ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access); } ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr"); } ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) { IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq"); } ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) { IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_cq, ibv_internal_destroy_cq(cq), 0, "ibv_destroy_cq"); } ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) { IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp"); } ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp"); } ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp"); } ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibvSymbols, ibv_internal_query_ece, ibv_internal_query_ece(qp, ece), 0, "ibv_query_ece", supported); } ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure 
reason)*/ IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibvSymbols, ibv_internal_set_ece, ibv_internal_set_ece(qp, ece), 0, "ibv_set_ece", supported); } ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) { *ret = (char *) ibvSymbols.ibv_internal_event_type_str(event); return ncclSuccess; } nccl-2.22.3-1/src/misc/ipcsocket.cc000066400000000000000000000147231463451655400167160ustar00rootroot00000000000000/* * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. * * See COPYRIGHT for license information */ #include "ipcsocket.h" #include "utils.h" #include #include #include // Enable Linux abstract socket naming #define USE_ABSTRACT_SOCKET #define NCCL_IPC_SOCKNAME_STR "/tmp/nccl-socket-%d-%lx" /* * Create a Unix Domain Socket */ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) { int fd = -1; struct sockaddr_un cliaddr; char temp[NCCL_IPC_SOCKNAME_LEN] = ""; if (handle == NULL) { return ncclInternalError; } handle->fd = -1; handle->socketName[0] = '\0'; if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) { WARN("UDS: Socket creation error : %s (%d)", strerror(errno), errno); return ncclSystemError; } bzero(&cliaddr, sizeof(cliaddr)); cliaddr.sun_family = AF_UNIX; // Create unique name for the socket. int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash); if (len > (sizeof(cliaddr.sun_path) - 1)) { WARN("UDS: Cannot bind provided name to socket. Name too large"); return ncclInternalError; } #ifndef USE_ABSTRACT_SOCKET unlink(temp); #endif TRACE(NCCL_INIT, "UDS: Creating socket %s", temp); strncpy(cliaddr.sun_path, temp, len); #ifdef USE_ABSTRACT_SOCKET cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick #endif if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) { WARN("UDS: Binding to socket %s failed : %s (%d)", temp, strerror(errno), errno); close(fd); return ncclSystemError; } handle->fd = fd; strcpy(handle->socketName, temp); handle->abortFlag = abortFlag; // Mark socket as non-blocking if (handle->abortFlag) { int flags; EQCHECK(flags = fcntl(fd, F_GETFL), -1); SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); } return ncclSuccess; } ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd) { if (handle == NULL) { WARN("ncclSocketGetFd: pass NULL socket"); return ncclInvalidArgument; } if (fd) *fd = handle->fd; return ncclSuccess; } ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) { if (handle == NULL) { return ncclInternalError; } if (handle->fd <= 0) { return ncclSuccess; } #ifndef USE_ABSTRACT_SOCKET if (handle->socketName[0] != '\0') { unlink(handle->socketName); } #endif close(handle->fd); return ncclSuccess; } ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd) { struct msghdr msg = {0, 0, 0, 0, 0, 0, 0}; struct iovec iov[1]; // Union to guarantee alignment requirements for control array union { struct cmsghdr cm; char control[CMSG_SPACE(sizeof(int))]; } control_un; struct cmsghdr *cmptr; char dummy_buffer[1]; int ret; msg.msg_control = control_un.control; msg.msg_controllen = sizeof(control_un.control); if (hdr == NULL) { iov[0].iov_base = (void *)dummy_buffer; iov[0].iov_len = sizeof(dummy_buffer); } else { iov[0].iov_base = hdr; iov[0].iov_len = hdrLen; } msg.msg_iov = iov; msg.msg_iovlen = 1; while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) { if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { WARN("UDS: Receiving data over socket failed : %d", errno); return 
ncclSystemError; } if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError; } if (recvFd != NULL) { if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) { if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) { WARN("UDS: Receiving data over socket failed"); return ncclSystemError; } memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd)); } else { WARN("UDS: Receiving data over socket %s failed", handle->socketName); return ncclSystemError; } TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName); } return ncclSuccess; } ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) { return ncclIpcSocketRecvMsg(handle, NULL, 0, recvFd); } ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash) { struct msghdr msg = {0, 0, 0, 0, 0, 0, 0}; struct iovec iov[1]; char temp[NCCL_IPC_SOCKNAME_LEN]; union { struct cmsghdr cm; char control[CMSG_SPACE(sizeof(int))]; } control_un; struct cmsghdr *cmptr; char dummy_buffer[1]; struct sockaddr_un cliaddr; // Construct client address to send this shareable handle to bzero(&cliaddr, sizeof(cliaddr)); cliaddr.sun_family = AF_UNIX; int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash); if (len > (sizeof(cliaddr.sun_path) - 1)) { WARN("UDS: Cannot connect to provided name for socket. Name too large"); return ncclInternalError; } (void) strncpy(cliaddr.sun_path, temp, len); #ifdef USE_ABSTRACT_SOCKET cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick #endif TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp); if (sendFd != -1) { TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp); msg.msg_control = control_un.control; msg.msg_controllen = sizeof(control_un.control); cmptr = CMSG_FIRSTHDR(&msg); cmptr->cmsg_len = CMSG_LEN(sizeof(int)); cmptr->cmsg_level = SOL_SOCKET; cmptr->cmsg_type = SCM_RIGHTS; memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); } msg.msg_name = (void *)&cliaddr; msg.msg_namelen = sizeof(struct sockaddr_un); if (hdr == NULL) { iov[0].iov_base = (void *)dummy_buffer; iov[0].iov_len = sizeof(dummy_buffer); } else { iov[0].iov_base = hdr; iov[0].iov_len = hdrLen; } msg.msg_iov = iov; msg.msg_iovlen = 1; msg.msg_flags = 0; ssize_t sendResult; while ((sendResult = sendmsg(handle->fd, &msg, 0)) < 0) { if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno); return ncclSystemError; } if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError; } return ncclSuccess; } ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) { return ncclIpcSocketSendMsg(handle, NULL, 0, sendFd, rank, hash); } nccl-2.22.3-1/src/misc/nvmlwrap.cc000066400000000000000000000315231463451655400165750ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "nvmlwrap.h" #include "checks.h" #include "debug.h" #include #include #include int ncclNvmlDeviceCount = 0; ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; #if NCCL_NVML_DIRECT #define NCCL_NVML_FN(name, rettype, arglist) constexpr rettype(*pfn_##name)arglist = name; #else #include #define NCCL_NVML_FN(name, rettype, arglist) rettype(*pfn_##name)arglist = nullptr; #endif namespace { NCCL_NVML_FN(nvmlInit, nvmlReturn_t, ()) NCCL_NVML_FN(nvmlInit_v2, nvmlReturn_t, ()) NCCL_NVML_FN(nvmlShutdown, nvmlReturn_t, ()) NCCL_NVML_FN(nvmlDeviceGetCount, nvmlReturn_t, (unsigned int*)) NCCL_NVML_FN(nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*)) NCCL_NVML_FN(nvmlDeviceGetHandleByPciBusId, nvmlReturn_t, (const char* pciBusId, nvmlDevice_t* device)) NCCL_NVML_FN(nvmlDeviceGetHandleByIndex, nvmlReturn_t, (unsigned int index, nvmlDevice_t *device)) NCCL_NVML_FN(nvmlDeviceGetIndex, nvmlReturn_t, (nvmlDevice_t device, unsigned* index)) NCCL_NVML_FN(nvmlErrorString, char const*, (nvmlReturn_t r)) NCCL_NVML_FN(nvmlDeviceGetNvLinkState, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive)) NCCL_NVML_FN(nvmlDeviceGetNvLinkRemotePciInfo, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci)) NCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult)) NCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor)) NCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus)) NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values)) // MNNVL support NCCL_NVML_FN(nvmlDeviceGetGpuFabricInfoV, nvmlReturn_t, (nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo)) // CC support NCCL_NVML_FN(nvmlSystemGetConfComputeState, nvmlReturn_t, (nvmlConfComputeSystemState_t *state)); NCCL_NVML_FN(nvmlSystemGetConfComputeSettings, nvmlReturn_t, (nvmlSystemConfComputeSettings_t *setting)); std::mutex lock; // NVML has had some thread safety bugs bool initialized = false; thread_local bool threadInitialized = false; ncclResult_t initResult; union nvmlCCInfoInternal { nvmlConfComputeSystemState_t settingV12020; nvmlSystemConfComputeSettings_t settingV12040; }; } ncclResult_t ncclNvmlEnsureInitialized() { // Optimization to avoid repeatedly grabbing the lock when we only want to // read from the global tables. 
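  // Fast path: a thread_local flag lets fully-initialized threads return the
  // cached initResult without touching the mutex. The first call from a thread
  // takes the lock and, if the process-wide 'initialized' flag is still false,
  // resolves the NVML entry points (dlopen of libnvidia-ml.so.1 when NVML is
  // not linked directly) and fills the device and P2P status tables exactly
  // once for the process.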
if (threadInitialized) return initResult; threadInitialized = true; std::lock_guard locked(lock); if (initialized) return initResult; initialized = true; #if !NCCL_NVML_DIRECT if (pfn_nvmlInit == nullptr) { void *libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW); if (libhandle == nullptr) { WARN("Failed to open libnvidia-ml.so.1"); initResult = ncclSystemError; return initResult; } struct Symbol { void **ppfn; char const *name; }; std::initializer_list symbols = { {(void**)&pfn_nvmlInit, "nvmlInit"}, {(void**)&pfn_nvmlInit_v2, "nvmlInit_v2"}, {(void**)&pfn_nvmlShutdown, "nvmlShutdown"}, {(void**)&pfn_nvmlDeviceGetCount, "nvmlDeviceGetCount"}, {(void**)&pfn_nvmlDeviceGetCount_v2, "nvmlDeviceGetCount_v2"}, {(void**)&pfn_nvmlDeviceGetHandleByPciBusId, "nvmlDeviceGetHandleByPciBusId"}, {(void**)&pfn_nvmlDeviceGetHandleByIndex, "nvmlDeviceGetHandleByIndex"}, {(void**)&pfn_nvmlDeviceGetIndex, "nvmlDeviceGetIndex"}, {(void**)&pfn_nvmlErrorString, "nvmlErrorString"}, {(void**)&pfn_nvmlDeviceGetNvLinkState, "nvmlDeviceGetNvLinkState"}, {(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"}, {(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"}, {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"}, {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"}, {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}, // MNNVL support {(void**)&pfn_nvmlDeviceGetGpuFabricInfoV, "nvmlDeviceGetGpuFabricInfoV"}, // CC support {(void**)&pfn_nvmlSystemGetConfComputeState, "nvmlSystemGetConfComputeState"}, {(void**)&pfn_nvmlSystemGetConfComputeSettings, "nvmlSystemGetConfComputeSettings"} }; for(Symbol sym: symbols) { *sym.ppfn = dlsym(libhandle, sym.name); } } #endif #if NCCL_NVML_DIRECT bool have_v2 = true; #else bool have_v2 = pfn_nvmlInit_v2 != nullptr; // if this compare is done in the NCCL_NVML_DIRECT=1 case then GCC warns about it never being null #endif nvmlReturn_t res1 = (have_v2 ? pfn_nvmlInit_v2 : pfn_nvmlInit)(); if (res1 != NVML_SUCCESS) { WARN("nvmlInit%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1)); initResult = ncclSystemError; return initResult; } unsigned int ndev; res1 = (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount)(&ndev); if (res1 != NVML_SUCCESS) { WARN("nvmlDeviceGetCount%s() failed: %s", have_v2 ? 
"_v2" :"", pfn_nvmlErrorString(res1)); initResult = ncclSystemError; return initResult; } ncclNvmlDeviceCount = int(ndev); if (ncclNvmlMaxDevices < ncclNvmlDeviceCount) { WARN("nvmlDeviceGetCount() reported more devices (%d) than the internal maximum (ncclNvmlMaxDevices=%d)", ncclNvmlDeviceCount, ncclNvmlMaxDevices); initResult = ncclInternalError; return initResult; } for(int a=0; a < ncclNvmlDeviceCount; a++) { res1 = pfn_nvmlDeviceGetHandleByIndex(a, &ncclNvmlDevices[a].handle); if (res1 != NVML_SUCCESS) { WARN("nvmlDeviceGetHandleByIndex(%d) failed: %s", int(a), pfn_nvmlErrorString(res1)); initResult = ncclSystemError; return initResult; } res1 = pfn_nvmlDeviceGetCudaComputeCapability(ncclNvmlDevices[a].handle, &ncclNvmlDevices[a].computeCapabilityMajor, &ncclNvmlDevices[a].computeCapabilityMinor); if (res1 != NVML_SUCCESS) { WARN("nvmlDeviceGetCudaComputeCapability(%d) failed: %s", int(a), pfn_nvmlErrorString(res1)); initResult = ncclSystemError; return initResult; } } for(int a=0; a < ncclNvmlDeviceCount; a++) { for(int b=0; b < ncclNvmlDeviceCount; b++) { nvmlDevice_t da = ncclNvmlDevices[a].handle; nvmlDevice_t db = ncclNvmlDevices[b].handle; res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_READ, &ncclNvmlDevicePairs[a][b].p2pStatusRead); if (res1 != NVML_SUCCESS) { WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1)); initResult = ncclSystemError; return initResult; } res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_WRITE, &ncclNvmlDevicePairs[a][b].p2pStatusWrite); if (res1 != NVML_SUCCESS) { WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1)); initResult = ncclSystemError; return initResult; } } } initResult = ncclSuccess; return initResult; } #define NVMLCHECK(name, ...) do { \ nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \ if (e44241808 != NVML_SUCCESS) { \ WARN(#name "() failed: %s", pfn_nvmlErrorString(e44241808)); \ return ncclSystemError; \ } \ } while(0) #define NVMLTRY(name, ...) 
do { \ if (!NCCL_NVML_DIRECT && pfn_##name == nullptr) \ return ncclInternalError; /* missing symbol is not a warned error */ \ nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \ if (e44241808 != NVML_SUCCESS) { \ if (e44241808 != NVML_ERROR_NOT_SUPPORTED) \ INFO(NCCL_INIT, #name "() failed: %s", pfn_nvmlErrorString(e44241808)); \ return ncclSystemError; \ } \ } while(0) ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard locked(lock); NVMLCHECK(nvmlDeviceGetHandleByPciBusId, pciBusId, device); return ncclSuccess; } ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) { NCCLCHECK(ncclNvmlEnsureInitialized()); *device = ncclNvmlDevices[index].handle; return ncclSuccess; } ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { NCCLCHECK(ncclNvmlEnsureInitialized()); for (int d=0; d < ncclNvmlDeviceCount; d++) { if (ncclNvmlDevices[d].handle == device) { *index = d; return ncclSuccess; } } return ncclInvalidArgument; } ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard locked(lock); NVMLTRY(nvmlDeviceGetNvLinkState, device, link, isActive); return ncclSuccess; } ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard locked(lock); NVMLTRY(nvmlDeviceGetNvLinkRemotePciInfo, device, link, pci); return ncclSuccess; } ncclResult_t ncclNvmlDeviceGetNvLinkCapability( nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult ) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard locked(lock); NVMLTRY(nvmlDeviceGetNvLinkCapability, device, link, capability, capResult); return ncclSuccess; } ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { NCCLCHECK(ncclNvmlEnsureInitialized()); for(int d=0; d < ncclNvmlDeviceCount; d++) { if(device == ncclNvmlDevices[d].handle) { *major = ncclNvmlDevices[d].computeCapabilityMajor; *minor = ncclNvmlDevices[d].computeCapabilityMinor; return ncclSuccess; } } return ncclInvalidArgument; } ncclResult_t ncclNvmlDeviceGetP2PStatus( nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus ) { NCCLCHECK(ncclNvmlEnsureInitialized()); if (p2pIndex == NVML_P2P_CAPS_INDEX_READ || p2pIndex == NVML_P2P_CAPS_INDEX_WRITE) { int a = -1, b = -1; for(int d=0; d < ncclNvmlDeviceCount; d++) { if(device1 == ncclNvmlDevices[d].handle) a = d; if(device2 == ncclNvmlDevices[d].handle) b = d; } if (a == -1 || b == -1) return ncclInvalidArgument; if (p2pIndex == NVML_P2P_CAPS_INDEX_READ) *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusRead; else *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusWrite; } else { std::lock_guard locked(lock); NVMLCHECK(nvmlDeviceGetP2PStatus, device1, device2, p2pIndex, p2pStatus); } return ncclSuccess; } ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard locked(lock); NVMLTRY(nvmlDeviceGetFieldValues, device, valuesCount, values); return ncclSuccess; } // MNNVL support ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard 
locked(lock); gpuFabricInfo->version = nvmlGpuFabricInfo_v2; NVMLTRY(nvmlDeviceGetGpuFabricInfoV, device, gpuFabricInfo); return ncclSuccess; } ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard locked(lock); nvmlCCInfoInternal ccInfo; if (pfn_nvmlSystemGetConfComputeSettings != NULL) { ccInfo.settingV12040.version = nvmlSystemConfComputeSettings_v1; NVMLTRY(nvmlSystemGetConfComputeSettings, &ccInfo.settingV12040); if (ccInfo.settingV12040.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED) status->CCEnabled = true; else status->CCEnabled = false; if (ccInfo.settingV12040.multiGpuMode == NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE) status->multiGpuCCEnabled = true; else status->multiGpuCCEnabled = false; } else if (pfn_nvmlSystemGetConfComputeState != NULL) { NVMLTRY(nvmlSystemGetConfComputeState, &ccInfo.settingV12020); if (ccInfo.settingV12020.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED) status->CCEnabled = true; else status->CCEnabled = false; status->multiGpuCCEnabled = false; } else { status->CCEnabled = false; status->multiGpuCCEnabled = false; } return ncclSuccess; } nccl-2.22.3-1/src/misc/param.cc000066400000000000000000000046601463451655400160310ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "param.h" #include "debug.h" #include #include #include #include #include #include #include #include #include const char* userHomeDir() { struct passwd *pwUser = getpwuid(getuid()); return pwUser == NULL ? NULL : pwUser->pw_dir; } void setEnvFile(const char* fileName) { FILE * file = fopen(fileName, "r"); if (file == NULL) return; char *line = NULL; char envVar[1024]; char envValue[1024]; size_t n = 0; ssize_t read; while ((read = getline(&line, &n, file)) != -1) { if (line[read-1] == '\n') line[read-1] = '\0'; int s=0; // Env Var Size while (line[s] != '\0' && line[s] != '=') s++; if (line[s] == '\0') continue; strncpy(envVar, line, std::min(1023,s)); envVar[s] = '\0'; s++; strncpy(envValue, line+s, 1023); envValue[1023]='\0'; setenv(envVar, envValue, 0); //printf("%s : %s->%s\n", fileName, envVar, envValue); } if (line) free(line); fclose(file); } void initEnv() { char confFilePath[1024]; const char * userDir = userHomeDir(); if (userDir) { sprintf(confFilePath, "%s/.nccl.conf", userDir); setEnvFile(confFilePath); } sprintf(confFilePath, "/etc/nccl.conf"); setEnvFile(confFilePath); } void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&mutex); if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { const char* str = ncclGetEnv(env); int64_t value = deftVal; if (str && strlen(str) > 0) { errno = 0; value = strtoll(str, nullptr, 0); if (errno) { value = deftVal; INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal); } else { INFO(NCCL_ENV,"%s set by environment to %lld.", env, (long long)value); } } __atomic_store_n(cache, value, __ATOMIC_RELAXED); } pthread_mutex_unlock(&mutex); } const char *ncclGetEnv(const char *name) { static pthread_once_t once = PTHREAD_ONCE_INIT; pthread_once(&once, initEnv); return getenv(name); } 
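/* Usage sketch: how a caller is expected to use ncclLoadParam(). The parameter
 * name NCCL_FOO, the default value 2 and the helper ncclParamFoo() below are
 * hypothetical and only illustrate the cache/default/uninitialized contract;
 * the convenience wrapper NCCL actually uses is the NCCL_PARAM macro declared
 * in param.h, which expands to code along these lines.
 *
 *   static int64_t fooCache = INT64_MIN;   // sentinel: not loaded yet
 *   static int64_t ncclParamFoo(void) {
 *     // The first call reads NCCL_FOO from the environment (after initEnv()
 *     // has merged ~/.nccl.conf and /etc/nccl.conf via ncclGetEnv), falls
 *     // back to the default value 2 on parse errors or when unset, and
 *     // caches the result; later calls just return the cached value.
 *     ncclLoadParam("NCCL_FOO", 2, INT64_MIN, &fooCache);
 *     return fooCache;
 *   }
 */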
nccl-2.22.3-1/src/misc/profiler.cc000066400000000000000000000112301463451655400165420ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "profiler.h" //#define PROFILE_PROXY 1 #ifdef PROFILE_PROXY #include "timer.h" #include "alloc.h" static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" }; static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" }; static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" }; struct ncclProxyProfileEvent { double timestamp[6]; uint64_t opCount; int peer; int step; uint16_t channel; uint8_t type; // send / recv uint8_t opIndex; }; struct ncclProxyProfileEvent* profilingEvents = NULL; int profilingIndex = 0; double profilingStart = 0; #define MAX_EVENTS 200000 ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { if (profilingEvents == NULL) { NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS)); profilingStart = gettime(); } struct ncclProxyProfileEvent* event = NULL; if (state%8 == 0) { if (profilingIndex == MAX_EVENTS) return ncclSuccess; args->subs[sub].profilingEvents[step%NCCL_STEPS] = event = profilingEvents+profilingIndex++; if (state == ncclProxyProfileBegin) { // Proxy operation information event->opCount = args->opCount; event->channel = args->subs[sub].channelId; event->peer = args->subs[sub].peer; event->type = args->pattern; event->step = step; event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256; } else event->peer = -state; } else { event = (struct ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS]; if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL; if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount; } // Timestamp event->timestamp[state%8] = gettime()-profilingStart; return ncclSuccess; } void ncclProfilingDump() { static int dumpDone = 0; if (dumpDone) return; dumpDone = 1; const char* str = ncclGetEnv("NCCL_PROXY_PROFILE"); if (!str) { free(profilingEvents); return; } FILE* f = fopen(str, "w"); fprintf(f, "[\n"); for (int i=0; ipeer >= 0; const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? "Send" : "Recv") : profilingEventStr[-(e->peer/8)]; if (sendrecv) { int state = ncclProxyProfileBegin; const char** stateStr = e->type == ncclPatternSend ? 
profilingStateSendStr : profilingStateRecvStr; fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n", typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex); while (statetimestamp[state]) { const char* name = stateStr[state]; fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", name, i, e->channel, e->timestamp[state]); state++; while (e->timestamp[state] == 0) state++; fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", name, i, e->channel, e->timestamp[state]); } } fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]); } else { if (e->peer == -ncclProxyProfileAppend) { fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, \"args\": { \"added\": %ld } },\n", typeStr, i, e->timestamp[0], e->opCount); } else { fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", typeStr, i, e->timestamp[0]); } fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", typeStr, i, e->timestamp[1]); } } fprintf(f, "{} ]\n"); fclose(f); free(profilingEvents); } #else ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } void ncclProfilingDump() {} #endif nccl-2.22.3-1/src/misc/shmutils.cc000066400000000000000000000171011463451655400165730ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "shm.h" #include "comm.h" #include "checks.h" #include #include #include #include #include #include #include #include #include struct shmHandleInternal { int fd; char* shmPath; char* shmPtr; void* devShmPtr; size_t shmSize; size_t realShmSize; int* refcount; }; static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmSize, char* hptr, void* dptr, bool create, struct shmHandleInternal* handle) { handle->fd = fd; handle->shmPtr = hptr; handle->devShmPtr = dptr; handle->shmSize = shmSize; handle->realShmSize = realShmSize; handle->refcount = (hptr != NULL) ? (int*)(hptr + shmSize) : NULL; if (create) { int slen = strlen(shmPath); handle->shmPath = (char*)malloc(slen + 1); memcpy(handle->shmPath, shmPath, slen + 1); if (hptr) memset(hptr, 0, shmSize); } else { handle->shmPath = NULL; } return; } ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle) { int fd = -1; char* hptr = NULL; void* dptr = NULL; ncclResult_t ret = ncclSuccess; struct shmHandleInternal* tmphandle; bool create = refcount > 0 ? true : false; const size_t refSize = sizeof(int); /* extra sizeof(int) bytes for reference count */ const size_t realShmSize = shmSize + refSize; *handle = *shmPtr = NULL; /* assume shmPtr and handle always set correctly by users. 
*/ EQCHECKGOTO(tmphandle = (struct shmHandleInternal*)calloc(1, sizeof(struct shmHandleInternal)), NULL, ret, fail); if (create) { /* refcount > 0 means the caller tries to allocate a shared memory. This shared memory segment will have * refcount references; when the peer attaches, it should pass -1 to reduce one reference count. When it * goes down to 0, unlink should be called in order to delete shared memory file. */ if (shmPath[0] == '\0') { sprintf(shmPath, "/dev/shm/nccl-XXXXXX"); retry_mkstemp: fd = mkstemp(shmPath); if (fd < 0) { if (errno == EINTR) { INFO(NCCL_ALL, "mkstemp: Failed to create %s, error: %s (%d) - retrying", shmPath, strerror(errno), errno); goto retry_mkstemp; } WARN("Error: failed to create shared memory file %p, error %s (%d)", shmPath, strerror(errno), errno); ret = ncclSystemError; goto fail; } } else { SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail); } retry_fallocate: if (fallocate(fd, 0, 0, realShmSize) != 0) { if (errno == EINTR) { INFO(NCCL_ALL, "fallocate: Failed to extend %s to %ld bytes, error: %s (%d) - retrying", shmPath, realShmSize, strerror(errno), errno); goto retry_fallocate; } WARN("Error: failed to extend %s to %ld bytes, error: %s (%d)", shmPath, realShmSize, strerror(errno), errno); ret = ncclSystemError; goto fail; } INFO(NCCL_ALLOC, "Allocated %ld bytes of shared memory in %s", realShmSize, shmPath); } else { SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), ret, fail); } hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (hptr == MAP_FAILED) { WARN("Error: Could not map %s size %zu, error: %s (%d)", shmPath, realShmSize, strerror(errno), errno); ret = ncclSystemError; hptr = NULL; goto fail; } if (create) { *(int*)(hptr + shmSize) = refcount; } else { int remref = ncclAtomicRefCountDecrement((int*)(hptr + shmSize)); if (remref == 0) { /* the last peer has completed attachment, it should unlink the shm mem file. */ if (unlink(shmPath) != 0) { INFO(NCCL_ALLOC, "unlink shared memory %s failed, error: %s (%d)", shmPath, strerror(errno), errno); } } } if (devShmPtr) { CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterMapped), ret, fail); CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail); } shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle); exit: *shmPtr = hptr; if (devShmPtr) *devShmPtr = dptr; *handle = (ncclShmHandle_t)tmphandle; return ret; fail: WARN("Error while %s shared memory segment %s (size %ld), error: %s (%d)", create ? 
"creating" : "attaching to", shmPath, shmSize, strerror(errno), errno); if (tmphandle) { shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle); ncclShmClose((ncclShmHandle_t)tmphandle); tmphandle = NULL; } hptr = NULL; dptr = NULL; goto exit; } ncclResult_t ncclShmClose(ncclShmHandle_t handle) { ncclResult_t ret = ncclSuccess; struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle; if (tmphandle) { if (tmphandle->fd >= 0) { close(tmphandle->fd); if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) { if (unlink(tmphandle->shmPath) != 0) { WARN("unlink shared memory %s failed, error: %s (%d)", tmphandle->shmPath, strerror(errno), errno); ret = ncclSystemError; } } free(tmphandle->shmPath); } if (tmphandle->shmPtr) { if (tmphandle->devShmPtr) CUDACHECK(cudaHostUnregister(tmphandle->shmPtr)); if (munmap(tmphandle->shmPtr, tmphandle->realShmSize) != 0) { WARN("munmap of shared memory %p size %ld failed, error: %s (%d)", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno), errno); ret = ncclSystemError; } } free(tmphandle); } return ret; } ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) { ncclResult_t ret = ncclSuccess; struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle; if (tmphandle) { if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) { if (unlink(tmphandle->shmPath) != 0) { WARN("unlink shared memory %s failed, error: %s (%d)", tmphandle->shmPath, strerror(errno), errno); ret = ncclSystemError; } free(tmphandle->shmPath); tmphandle->shmPath = NULL; } } return ret; } ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize) { ncclResult_t ret = ncclSuccess; int curRound = shmem->round; size_t mycnt; if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL || shmem->maxTypeSize < typeSize) { ret = ncclInvalidArgument; goto exit; } memcpy((char*)shmem->ptr[curRound] + comm->localRank * typeSize, sendbuff, typeSize); /* sync among local ranks */ mycnt = __atomic_add_fetch(shmem->cnt[curRound], 1, __ATOMIC_ACQ_REL); if (mycnt == comm->localRanks) { *shmem->cnt[curRound ^ 1] = 0; /* prepare next round */ __atomic_store_n(shmem->cnt[curRound], comm->localRanks + 1, __ATOMIC_RELEASE); /* release everyone */ } else { uint64_t t0 = clockNano(); while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) { if (clockNano() - t0 >= 5 * 1000) sched_yield(); if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE) == 1) { ret = ncclInternalError; goto exit; } } } memcpy(recvbuff, (const void*)shmem->ptr[curRound], comm->localRanks * typeSize); shmem->round ^= 1; exit: return ret; } nccl-2.22.3-1/src/misc/socket.cc000066400000000000000000000760701463451655400162250ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "socket.h" #include "utils.h" #include #include #include #include #include "param.h" static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) { int bytes = 0; *closed = 0; char* data = (char*)ptr; char line[SOCKET_NAME_MAXLEN+1]; do { if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL); if (op == NCCL_SOCKET_RECV && bytes == 0) { *closed = 1; return ncclSuccess; } if (bytes == -1) { if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { WARN("socketProgressOpt: Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); return ncclRemoteError; } else { bytes = 0; } } (*offset) += bytes; if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) { INFO(NCCL_NET, "socketProgressOpt: abort called"); return ncclInternalError; } } while (bytes > 0 && (*offset) < size); return ncclSuccess; } static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { int closed; NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &closed)); if (closed) { char line[SOCKET_NAME_MAXLEN+1]; WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); return ncclRemoteError; } return ncclSuccess; } static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { while (*offset < size) NCCLCHECK(socketProgress(op, sock, ptr, size, offset)); return ncclSuccess; } /* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo() * * Output: "IPv4/IPv6 address" */ const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) { if (buf == NULL || addr == NULL) return NULL; struct sockaddr *saddr = &addr->sa; if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; } char host[NI_MAXHOST], service[NI_MAXSERV]; /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned. * (When not set, this will still happen in case the node's name cannot be determined.) */ int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0); (void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag); sprintf(buf, "%s<%s>", host, service); return buf; } static uint16_t socketToPort(union ncclSocketAddress *addr) { struct sockaddr *saddr = &addr->sa; return ntohs(saddr->sa_family == AF_INET ? 
addr->sin.sin_port : addr->sin6.sin6_port); } /* Allow the user to force the IPv4/IPv6 interface selection */ static int envSocketFamily(void) { int family = -1; // Family selection is not forced, will use first one found const char* env = ncclGetEnv("NCCL_SOCKET_FAMILY"); if (env == NULL) return family; INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env); if (strcmp(env, "AF_INET") == 0) family = AF_INET; // IPv4 else if (strcmp(env, "AF_INET6") == 0) family = AF_INET6; // IPv6 return family; } static int findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; #endif struct netIf userIfs[MAX_IFS]; bool searchNot = prefixList && prefixList[0] == '^'; if (searchNot) prefixList++; bool searchExact = prefixList && prefixList[0] == '='; if (searchExact) prefixList++; int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); int found = 0; struct ifaddrs *interfaces, *interface; getifaddrs(&interfaces); for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) { if (interface->ifa_addr == NULL) continue; /* We only support IPv4 & IPv6 */ int family = interface->ifa_addr->sa_family; if (family != AF_INET && family != AF_INET6) continue; TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, ncclSocketToString((union ncclSocketAddress *) interface->ifa_addr, line)); /* Allow the caller to force the socket family type */ if (sock_family != -1 && family != sock_family) continue; /* We also need to skip IPv6 loopback interfaces */ if (family == AF_INET6) { struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr); if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue; } // check against user specified interfaces if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) { continue; } // Check that this interface has not already been saved // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link bool duplicate = false; for (int i = 0; i < found; i++) { if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; } } if (!duplicate) { // Store the interface name strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize); // Store the IP address int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); memcpy(addrs+found, interface->ifa_addr, salen); found++; } } freeifaddrs(interfaces); return found; } static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) { /* Check family first */ int family = local_if.ifa_addr->sa_family; if (family != remote->sa.sa_family) { return false; } if (family == AF_INET) { struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr); struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask); struct sockaddr_in& remote_addr = remote->sin; struct in_addr local_subnet, remote_subnet; local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr; remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr; return (local_subnet.s_addr ^ remote_subnet.s_addr) ? 
false : true; } else if (family == AF_INET6) { struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr); struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask); struct sockaddr_in6& remote_addr = remote->sin6; struct in6_addr& local_in6 = local_addr->sin6_addr; struct in6_addr& mask_in6 = mask->sin6_addr; struct in6_addr& remote_in6 = remote_addr.sin6_addr; bool same = true; int len = 16; //IPv6 address is 16 unsigned char for (int c = 0; c < len; c++) { //Network byte order is big-endian char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c]; char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c]; if (c1 ^ c2) { same = false; break; } } // At last, we need to compare scope id // Two Link-type addresses can have the same subnet address even though they are not in the same scope // For Global type, this field is 0, so a comparison wouldn't matter same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id); return same; } else { WARN("Net : Unsupported address family type"); return false; } } int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; #endif char line_a[SOCKET_NAME_MAXLEN+1]; int found = 0; struct ifaddrs *interfaces, *interface; getifaddrs(&interfaces); for (interface = interfaces; interface && !found; interface = interface->ifa_next) { if (interface->ifa_addr == NULL) continue; /* We only support IPv4 & IPv6 */ int family = interface->ifa_addr->sa_family; if (family != AF_INET && family != AF_INET6) continue; // check against user specified interfaces if (!matchSubnet(*interface, remoteAddr)) { continue; } // Store the local IP address int salen = (family == AF_INET) ? 
sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); memcpy(localAddrs+found, interface->ifa_addr, salen); // Store the interface name strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, ncclSocketToString(localAddrs+found, line), ncclSocketToString(remoteAddr, line_a)); found++; if (found == maxIfs) break; } if (found == 0) { WARN("Net : No interface found in the same subnet as remote address %s", ncclSocketToString(remoteAddr, line_a)); } freeifaddrs(interfaces); return found; } ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) { if (!(ip_port_pair && strlen(ip_port_pair) > 1)) { WARN("Net : string is null"); return ncclInvalidArgument; } bool ipv6 = ip_port_pair[0] == '['; /* Construct the sockaddress structure */ if (!ipv6) { struct netIf ni; // parse : string, expect one pair if (parseStringList(ip_port_pair, &ni, 1) != 1) { WARN("Net : No valid : pair found"); return ncclInvalidArgument; } struct addrinfo hints, *p; int rv; memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) { WARN("Net : error encountered when getting address info : %s", gai_strerror(rv)); return ncclInvalidArgument; } // use the first if (p->ai_family == AF_INET) { struct sockaddr_in& sin = ua->sin; memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in)); sin.sin_family = AF_INET; // IPv4 //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address sin.sin_port = htons(ni.port); // port } else if (p->ai_family == AF_INET6) { struct sockaddr_in6& sin6 = ua->sin6; memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6)); sin6.sin6_family = AF_INET6; // IPv6 sin6.sin6_port = htons(ni.port); // port sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete sin6.sin6_scope_id = 0; // should be global scope, set to 0 } else { WARN("Net : unsupported IP family"); return ncclInvalidArgument; } freeaddrinfo(p); // all done with this structure } else { int i, j = -1, len = strlen(ip_port_pair); for (i = 1; i < len; i++) { if (ip_port_pair[i] == '%') j = i; if (ip_port_pair[i] == ']') break; } if (i == len) { WARN("Net : No valid [IPv6]:port pair found"); return ncclInvalidArgument; } bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ]; memset(ip_str, '\0', sizeof(ip_str)); memset(port_str, '\0', sizeof(port_str)); memset(if_name, '\0', sizeof(if_name)); strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1); strncpy(port_str, ip_port_pair+i+2, len-i-1); int port = atoi(port_str); if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name struct sockaddr_in6& sin6 = ua->sin6; sin6.sin6_family = AF_INET6; // IPv6 inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address sin6.sin6_port = htons(port); // port sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete sin6.sin6_scope_id = global_scope ? 
0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope } return ncclSuccess; } int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) { static int shownIfName = 0; int nIfs = 0; // Allow user to force the INET socket family selection int sock_family = envSocketFamily(); // User specified interface const char* env = ncclGetEnv("NCCL_SOCKET_IFNAME"); if (env && strlen(env) > 1) { INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env); // Specified by user : find or fail if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env); nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); } else { // Try to automatically pick the right one // Start with IB nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); // else see if we can get some hint from COMM ID if (nIfs == 0) { const char* commId = ncclGetEnv("NCCL_COMM_ID"); if (commId && strlen(commId) > 1) { INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId); // Try to find interface that is in the same subnet as the IP in comm id union ncclSocketAddress idAddr; ncclSocketGetAddrFromString(&idAddr, commId); nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs); } } // Then look for anything else (but not docker or lo) if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); // Finally look for docker, then lo. if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); } return nIfs; } ncclResult_t ncclSocketListen(struct ncclSocket* sock) { if (sock == NULL) { WARN("ncclSocketListen: pass NULL socket"); return ncclInvalidArgument; } if (sock->fd == -1) { WARN("ncclSocketListen: file descriptor is -1"); return ncclInvalidArgument; } if (socketToPort(&sock->addr)) { // Port is forced by env. Make sure we get the port. 
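// SO_REUSEADDR (and SO_REUSEPORT where available) only matters when a specific
// port was requested: it lets the listen socket rebind that port immediately,
// e.g. while a previous socket on the same port is still in TIME_WAIT.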
int opt = 1; #if defined(SO_REUSEPORT) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt"); #else SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt"); #endif } // addr port should be 0 (Any port) SYSCHECK(bind(sock->fd, &sock->addr.sa, sock->salen), "bind"); /* Get the assigned Port */ socklen_t size = sock->salen; SYSCHECK(getsockname(sock->fd, &sock->addr.sa, &size), "getsockname"); #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", ncclSocketToString(&sock->addr, line)); #endif /* Put the socket in listen mode * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn */ SYSCHECK(listen(sock->fd, 16384), "listen"); sock->state = ncclSocketStateReady; return ncclSuccess; } ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr) { if (sock == NULL) { WARN("ncclSocketGetAddr: pass NULL socket"); return ncclInvalidArgument; } if (sock->state != ncclSocketStateReady) return ncclInternalError; memcpy(addr, &sock->addr, sizeof(union ncclSocketAddress)); return ncclSuccess; } static ncclResult_t socketTryAccept(struct ncclSocket* sock) { socklen_t socklen = sizeof(union ncclSocketAddress); sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen); if (sock->fd != -1) { sock->state = ncclSocketStateAccepted; } else if (errno != EAGAIN && errno != EWOULDBLOCK) { WARN("socketTryAccept: Accept failed: %s", strerror(errno)); return ncclSystemError; } return ncclSuccess; } static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) { uint64_t magic; enum ncclSocketType type; int received = 0; const int one = 1; SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); if (received == 0) return ncclSuccess; NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); if (magic != sock->magic) { WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic); close(sock->fd); sock->fd = -1; // Ignore spurious connection and accept again sock->state = ncclSocketStateAccepting; return ncclSuccess; } else { received = 0; NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received)); if (type != sock->type) { WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type); sock->state = ncclSocketStateError; close(sock->fd); sock->fd = -1; return ncclInternalError; } else { sock->state = ncclSocketStateReady; } } return ncclSuccess; } static ncclResult_t socketStartConnect(struct ncclSocket* sock) { /* blocking/non-blocking connect() is determined by asyncFlag. 
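 * With a non-blocking socket, connect() usually returns EINPROGRESS and the
 * state moves to ncclSocketStateConnectPolling so completion is detected via
 * poll() in socketPollConnect(). ECONNREFUSED and ETIMEDOUT are retried (after
 * a short usleep) up to RETRY_REFUSED_TIMES / RETRY_TIMEDOUT_TIMES attempts.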
*/ int ret = connect(sock->fd, &sock->addr.sa, sock->salen); if (ret == 0) { sock->state = ncclSocketStateConnected; return ncclSuccess; } else if (errno == EINPROGRESS) { sock->state = ncclSocketStateConnectPolling; return ncclSuccess; } else if (errno == ECONNREFUSED) { if (++sock->refusedRetries == RETRY_REFUSED_TIMES) { sock->state = ncclSocketStateError; WARN("socketStartConnect: exceeded retries (%d)", sock->refusedRetries); return ncclRemoteError; } usleep(SLEEP_INT); if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); return ncclSuccess; } else if (errno == ETIMEDOUT) { if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) { sock->state = ncclSocketStateError; WARN("socketStartConnect: exceeded timeouts (%d)", sock->timedOutRetries); return ncclRemoteError; } usleep(SLEEP_INT); return ncclSuccess; } else { char line[SOCKET_NAME_MAXLEN+1]; sock->state = ncclSocketStateError; WARN("socketStartConnect: Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); return ncclSystemError; } } static ncclResult_t socketPollConnect(struct ncclSocket* sock) { struct pollfd pfd; int timeout = 1, ret; socklen_t rlen = sizeof(int); memset(&pfd, 0, sizeof(struct pollfd)); pfd.fd = sock->fd; pfd.events = POLLOUT; ret = poll(&pfd, 1, timeout); if (ret == 0 || (ret < 0 && errno == EINTR)) { return ncclSuccess; } else if (ret < 0) { WARN("socketPollConnect poll() failed with error %s", strerror(errno)); return ncclRemoteError; } else { EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0); } /* check socket status */ SYSCHECK(getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt"); if (ret == 0) { sock->state = ncclSocketStateConnected; } else if (ret == ECONNREFUSED) { if (++sock->refusedRetries == RETRY_REFUSED_TIMES) { sock->state = ncclSocketStateError; WARN("socketPollConnect: exceeded retries (%d)", sock->refusedRetries); return ncclRemoteError; } if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); usleep(SLEEP_INT); sock->state = ncclSocketStateConnecting; } else if (ret == ETIMEDOUT) { if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) { sock->state = ncclSocketStateError; WARN("socketPollConnect: exceeded timeouts (%d)", sock->timedOutRetries); return ncclRemoteError; } usleep(SLEEP_INT); sock->state = ncclSocketStateConnecting; } else if (ret != EINPROGRESS) { sock->state = ncclSocketStateError; char line[SOCKET_NAME_MAXLEN+1]; WARN("socketPollConnect: Connect to %s returned %d(%s) errno %d(%s)", ncclSocketToString(&sock->addr, line), ret, strerror(ret), errno, strerror(errno)); return ncclSystemError; } return ncclSuccess; } ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) { if (sock == NULL) { WARN("ncclSocketPollConnect: pass NULL socket"); return ncclInvalidArgument; } NCCLCHECK(socketPollConnect(sock)); return ncclSuccess; } static ncclResult_t socketFinalizeConnect(struct ncclSocket* sock) { int sent = 0; NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); if (sent == 0) return ncclSuccess; NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); sent = 0; NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent)); sock->state = ncclSocketStateReady; return ncclSuccess; } static ncclResult_t socketProgressState(struct ncclSocket* sock) { if (sock->state == ncclSocketStateAccepting) { 
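    // Accept path: Accepting -> Accepted -> Ready; connect path (below):
    // Connecting -> ConnectPolling -> Connected -> Ready.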
NCCLCHECK(socketTryAccept(sock)); } if (sock->state == ncclSocketStateAccepted) { NCCLCHECK(socketFinalizeAccept(sock)); } if (sock->state == ncclSocketStateConnecting) { NCCLCHECK(socketStartConnect(sock)); } if (sock->state == ncclSocketStateConnectPolling) { NCCLCHECK(socketPollConnect(sock)); } if (sock->state == ncclSocketStateConnected) { NCCLCHECK(socketFinalizeConnect(sock)); } return ncclSuccess; } ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running) { if (sock == NULL) { *running = 0; return ncclSuccess; } if (sock->state == ncclSocketStateError || sock->state == ncclSocketStateClosed) { WARN("ncclSocketReady: unexpected socket state %d", sock->state); return ncclRemoteError; } *running = (sock->state == ncclSocketStateReady) ? 1 : 0; if (*running == 0) { NCCLCHECK(socketProgressState(sock)); *running = (sock->state == ncclSocketStateReady) ? 1 : 0; } return ncclSuccess; } ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; #endif const int one = 1; if (sock == NULL) { WARN("ncclSocketConnect: pass NULL socket"); return ncclInvalidArgument; } if (sock->fd == -1) { WARN("ncclSocketConnect: file descriptor is -1"); return ncclInvalidArgument; } if (sock->state != ncclSocketStateInitialized) { WARN("ncclSocketConnect: wrong socket state %d", sock->state); if (sock->state == ncclSocketStateError) return ncclRemoteError; return ncclInternalError; } TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line)); SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); sock->state = ncclSocketStateConnecting; do { NCCLCHECK(socketProgressState(sock)); } while (sock->asyncFlag == 0 && (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) == 0) && (sock->state == ncclSocketStateConnecting || sock->state == ncclSocketStateConnectPolling || sock->state == ncclSocketStateConnected)); if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError; switch (sock->state) { case ncclSocketStateConnecting: case ncclSocketStateConnectPolling: case ncclSocketStateConnected: case ncclSocketStateReady: return ncclSuccess; case ncclSocketStateError: return ncclSystemError; default: WARN("ncclSocketConnect: wrong socket state %d", sock->state); return ncclInternalError; } } ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSock) { ncclResult_t ret = ncclSuccess; if (listenSock == NULL || sock == NULL) { WARN("ncclSocketAccept: pass NULL socket"); ret = ncclInvalidArgument; goto exit; } if (listenSock->state != ncclSocketStateReady) { WARN("ncclSocketAccept: wrong socket state %d", listenSock->state); if (listenSock->state == ncclSocketStateError) ret = ncclSystemError; else ret = ncclInternalError; goto exit; } if (sock->acceptFd == -1) { memcpy(sock, listenSock, sizeof(struct ncclSocket)); sock->acceptFd = listenSock->fd; sock->state = ncclSocketStateAccepting; } do { NCCLCHECKGOTO(socketProgressState(sock), ret, exit); } while (sock->asyncFlag == 0 && (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) == 0) && (sock->state == ncclSocketStateAccepting || sock->state == ncclSocketStateAccepted)); if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError; switch (sock->state) { case ncclSocketStateAccepting: case ncclSocketStateAccepted: case ncclSocketStateReady: ret = ncclSuccess; break; case 
ncclSocketStateError: ret = ncclSystemError; break; default: WARN("ncclSocketAccept: wrong socket state %d", sock->state); ret = ncclInternalError; break; } exit: return ret; } ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag) { ncclResult_t ret = ncclSuccess; if (sock == NULL) goto exit; sock->timedOutRetries = 0; sock->refusedRetries = 0; sock->abortFlag = abortFlag; sock->asyncFlag = asyncFlag; sock->state = ncclSocketStateInitialized; sock->magic = magic; sock->type = type; sock->fd = -1; sock->acceptFd = -1; if (addr) { /* IPv4/IPv6 support */ int family; memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress)); family = sock->addr.sa.sa_family; if (family != AF_INET && family != AF_INET6) { char line[SOCKET_NAME_MAXLEN+1]; WARN("ncclSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)", ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6); ret = ncclInternalError; goto fail; } sock->salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); /* Connect to a hostname / port */ sock->fd = socket(family, SOCK_STREAM, 0); if (sock->fd == -1) { WARN("ncclSocketInit: Socket creation failed : %s", strerror(errno)); ret = ncclSystemError; goto fail; } } else { memset(&sock->addr, 0, sizeof(union ncclSocketAddress)); } /* Set socket as non-blocking if async or if we need to be able to abort */ if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) { int flags; EQCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), -1, ret, fail); SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), ret, fail); } exit: return ret; fail: goto exit; } ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { if (sock == NULL) { WARN("ncclSocketProgress: pass NULL socket"); return ncclInvalidArgument; } NCCLCHECK(socketProgress(op, sock, ptr, size, offset)); return ncclSuccess; } ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { if (sock == NULL) { WARN("ncclSocketWait: pass NULL socket"); return ncclInvalidArgument; } NCCLCHECK(socketWait(op, sock, ptr, size, offset)); return ncclSuccess; } ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) { int offset = 0; if (sock == NULL) { WARN("ncclSocketSend: pass NULL socket"); return ncclInvalidArgument; } if (sock->state != ncclSocketStateReady) { WARN("ncclSocketSend: socket state (%d) is not ready", sock->state); return ncclInternalError; } NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset)); return ncclSuccess; } ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) { int offset = 0; if (sock == NULL) { WARN("ncclSocketRecv: pass NULL socket"); return ncclInvalidArgument; } if (sock->state != ncclSocketStateReady) { WARN("ncclSocketRecv: socket state (%d) is not ready", sock->state); return ncclInternalError; } NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset)); return ncclSuccess; } ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize) { int sendOffset = 0, recvOffset = 0; if (sendSock == NULL || recvSock == NULL) { WARN("ncclSocketSendRecv: invalid socket %p/%p", sendSock, recvSock); return ncclInternalError; } if (sendSock->state != ncclSocketStateReady || recvSock->state != ncclSocketStateReady) { 
WARN("ncclSocketSendRecv: socket state (%d/%d) is not ready", sendSock->state, recvSock->state); return ncclInternalError; } while (sendOffset < sendSize || recvOffset < recvSize) { if (sendOffset < sendSize) NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sendSock, sendPtr, sendSize, &sendOffset)); if (recvOffset < recvSize) NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, recvSock, recvPtr, recvSize, &recvOffset)); } return ncclSuccess; } // Receive or detect connection closed ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking) { int offset = 0; if (sock == NULL) { WARN("ncclSocketTryRecv: pass NULL socket"); return ncclInvalidArgument; } *closed = 0; // Block until connection closes or nbytes received if (blocking) { while (offset < size) { NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed)); if (*closed) return ncclSuccess; } } else { NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed)); if (*closed) return ncclSuccess; // If any bytes were received, block waiting for the rest if (offset > 0) { while (offset < size) { NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed)); if (*closed) return ncclSuccess; } // No bytes were received, return ncclInProgress } else { return ncclInProgress; } } return ncclSuccess; } ncclResult_t ncclSocketClose(struct ncclSocket* sock) { if (sock != NULL) { if (sock->fd >= 0) { /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful * connection close here. */ shutdown(sock->fd, SHUT_RDWR); close(sock->fd); } sock->state = ncclSocketStateClosed; sock->fd = -1; } return ncclSuccess; } ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd) { if (sock == NULL) { WARN("ncclSocketGetFd: pass NULL socket"); return ncclInvalidArgument; } if (fd) *fd = sock->fd; return ncclSuccess; } ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock) { if (sock == NULL) { WARN("ncclSocketGetFd: pass NULL socket"); return ncclInvalidArgument; } sock->fd = fd; return ncclSuccess; } nccl-2.22.3-1/src/misc/strongstream.cc000066400000000000000000000341101463451655400174520ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "strongstream.h" #include "cudawrap.h" #include "checks.h" #include "param.h" // Tracks the chain of graph nodes for a given graph captured identified by // its graph id. This state has to live for as long as captured work is being // submitted. CUDA doesn't have mechanism to inform us when the user ends capture // so the best we can do is get notified when the graph is destroyed. struct ncclStrongStreamGraph { struct ncclStrongStreamGraph* next; // Atomically exchanged to false by both the main thread or the graph destructor // callback. The last to arrive deletes the node. bool alive; unsigned long long graphId; // For each graph we track the "tip" of the chain of graph nodes. 
A linear // chain would always have just one node at its tip, but since we have to merge // in chains from other streams (via ncclStrongStreamWaitStream) some spots // in the chain can be wider than a single node and thus need a list, so we // maintain a dynamically sized array of tip nodes. int tipCount, tipCapacity; cudaGraphNode_t* tipNodes; }; static void ncclStrongStreamGraphDelete(struct ncclStrongStreamGraph* g) { free(g->tipNodes); free(g); } //////////////////////////////////////////////////////////////////////////////// ncclResult_t ncclCudaGetCapturingGraph( struct ncclCudaGraph* graph, cudaStream_t stream ) { #if CUDART_VERSION >= 10000 // cudaStreamGetCaptureInfo int driver; NCCLCHECK(ncclCudaDriverVersion(&driver)); if (CUDART_VERSION < 11030 || driver < 11030) { cudaStreamCaptureStatus status; unsigned long long gid; CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, &gid)); #if CUDART_VERSION >= 11030 graph->graph = nullptr; graph->graphId = ULLONG_MAX; #endif if (status != cudaStreamCaptureStatusNone) { WARN("NCCL cannot be captured in a graph if either it wasn't built with CUDA runtime >= 11.3 or if the installed CUDA driver < R465."); return ncclInvalidUsage; } } else { #if CUDART_VERSION >= 11030 cudaStreamCaptureStatus status; unsigned long long gid; CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr)); if (status != cudaStreamCaptureStatusActive) { graph->graph = nullptr; gid = ULLONG_MAX; } graph->graphId = gid; #endif } #endif return ncclSuccess; } ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg) { #if CUDART_VERSION >= 11030 cudaUserObject_t object; CUDACHECK(cudaUserObjectCreate( &object, arg, fn, /*initialRefcount=*/1, cudaUserObjectNoDestructorSync )); // Hand over ownership to CUDA Graph CUDACHECK(cudaGraphRetainUserObject(graph.graph, object, 1, cudaGraphUserObjectMove)); return ncclSuccess; #else return ncclInvalidUsage; #endif } //////////////////////////////////////////////////////////////////////////////// ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) { CUDACHECK(cudaStreamCreateWithFlags(&ss->cudaStream, cudaStreamNonBlocking)); #if CUDART_VERSION >= 11030 CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming)); ss->everCaptured = false; ss->serialEventNeedsRecord = false; ss->graphHead = nullptr; #else CUDACHECK(cudaEventCreateWithFlags(&ss->scratchEvent, cudaEventDisableTiming)); #endif return ncclSuccess; } static void graphDestructor(void* arg) { struct ncclStrongStreamGraph* g = (struct ncclStrongStreamGraph*)arg; if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { // Last to arrive deletes list node. ncclStrongStreamGraphDelete(g); } } ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss) { CUDACHECK(cudaStreamDestroy(ss->cudaStream)); #if CUDART_VERSION >= 11030 CUDACHECK(cudaEventDestroy(ss->serialEvent)); // Delete list of per-graph chains. struct ncclStrongStreamGraph* g = ss->graphHead; while (g != nullptr) { struct ncclStrongStreamGraph* next = g->next; if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { // Last to arrive deletes list node. 
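      // (The competing arrival is graphDestructor(), registered below in
      // ncclStrongStreamAcquire as a CUDA user-object destructor on the captured graph.)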
ncclStrongStreamGraphDelete(g); } g = next; } #else CUDACHECK(cudaEventDestroy(ss->scratchEvent)); #endif return ncclSuccess; } NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 1) static void ensureTips(struct ncclStrongStreamGraph* g, int n) { if (g->tipCapacity < n) { g->tipNodes = (cudaGraphNode_t*)realloc(g->tipNodes, n*sizeof(cudaGraphNode_t)); g->tipCapacity = n; } } ncclResult_t ncclStrongStreamAcquire( struct ncclCudaGraph graph, struct ncclStrongStream* ss ) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); if (graph.graph == nullptr) { if (mixing && ss->everCaptured) { CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); ss->serialEventNeedsRecord = false; } } else { ss->everCaptured = true; // Find the current graph in our list of graphs if it exists. struct ncclStrongStreamGraph** pg = &ss->graphHead; struct ncclStrongStreamGraph* g; while (*pg != nullptr) { g = *pg; if (g->graphId == graph.graphId) { // Move to front of list so that operations after acquire don't have to search the list. *pg = g->next; g->next = ss->graphHead; ss->graphHead = g; return ncclSuccess; } else if (false == __atomic_load_n(&g->alive, __ATOMIC_ACQUIRE)) { // Unrelated graph that has been destroyed. Remove and delete. *pg = g->next; ncclStrongStreamGraphDelete(g); } else { pg = &g->next; } } // This is a new graph so add to the list. g = (struct ncclStrongStreamGraph*)malloc(sizeof(struct ncclStrongStreamGraph)); g->graphId = graph.graphId; g->tipNodes = nullptr; g->tipCapacity = 0; g->tipCount = 0; g->next = ss->graphHead; ss->graphHead = g; g->alive = true; NCCLCHECK(ncclCudaGraphAddDestructor(graph, graphDestructor, (void*)g)); if (mixing && ss->serialEventNeedsRecord) { // Can only be here if previous release was for uncaptured work that // elided updating the event because no capture had yet occurred. CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); } ss->serialEventNeedsRecord = false; // First node in the chain must be a wait on the serialEvent. if (mixing) { ensureTips(g, 1); CUDACHECK(cudaGraphAddEventWaitNode(&g->tipNodes[0], graph.graph, nullptr, 0, ss->serialEvent)); g->tipCount = 1; } else { g->tipCount = 0; } } #endif return ncclSuccess; } ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); if (mixing && ss->everCaptured) { CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); } ss->serialEventNeedsRecord = true; // Assume the caller is going to add work to stream. 
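  // The cudaEventRecord itself is deferred: ncclStrongStreamRelease records
  // serialEvent only if this stream has ever been captured, and the capturing
  // path of ncclStrongStreamAcquire catches up when the record was elided.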
#endif return ncclSuccess; } static ncclResult_t checkGraphId(struct ncclStrongStreamGraph* g, unsigned long long id) { if (g == nullptr || g->graphId != id) { WARN("Expected graph id=%llu was not at head of strong stream's internal list.", id); return ncclInternalError; } return ncclSuccess; } ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); if (mixing && ss->serialEventNeedsRecord) { if (graph.graph == nullptr) { if (ss->everCaptured) { CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); ss->serialEventNeedsRecord = false; } } else { struct ncclStrongStreamGraph* g = ss->graphHead; NCCLCHECK(checkGraphId(g, graph.graphId)); ensureTips(g, 1); CUDACHECK(cudaGraphAddEventRecordNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, ss->serialEvent)); g->tipCount = 1; ss->serialEventNeedsRecord = false; } } #endif return ncclSuccess; } ncclResult_t ncclStrongStreamLaunchHost( struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg ) { #if CUDART_VERSION >= 11030 if (graph.graph == nullptr) { CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); } else { cudaHostNodeParams p; p.fn = fn; p.userData = arg; struct ncclStrongStreamGraph* g = ss->graphHead; NCCLCHECK(checkGraphId(g, graph.graphId)); ensureTips(g, 1); CUDACHECK(cudaGraphAddHostNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); g->tipCount = 1; } ss->serialEventNeedsRecord = true; #else CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); #endif return ncclSuccess; } ncclResult_t ncclStrongStreamLaunchKernel( struct ncclCudaGraph graph, struct ncclStrongStream* ss, void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes ) { #if CUDART_VERSION >= 11030 if (graph.graph == nullptr) { CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); } else { cudaKernelNodeParams p; p.func = fn; p.gridDim = grid; p.blockDim = block; p.kernelParams = args; p.sharedMemBytes = sharedMemBytes; p.extra = nullptr; struct ncclStrongStreamGraph* g = ss->graphHead; NCCLCHECK(checkGraphId(g, graph.graphId)); ensureTips(g, 1); CUDACHECK(cudaGraphAddKernelNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); g->tipCount = 1; } ss->serialEventNeedsRecord = true; #else CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); #endif return ncclSuccess; } // Merge node list `b` into list `a` but don't add duplicates. 
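// The merged tips are later used as the dependency array when the next node is
// appended to the graph (see the cudaGraphAdd*Node calls above), which is why
// the list is kept free of duplicates.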
static void mergeTips(struct ncclStrongStreamGraph* a, cudaGraphNode_t const* bNodes, int bn) { int an = a->tipCount; ensureTips(a, an + bn); for (int bi=0; bi < bn; bi++) { for (int ai=0; ai < an; ai++) { if (a->tipNodes[ai] == bNodes[bi]) goto next_b; } a->tipNodes[a->tipCount++] = bNodes[bi]; next_b:; } } ncclResult_t ncclStrongStreamWaitStream( struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a ) { #if CUDART_VERSION >= 11030 if (graph.graph == nullptr) { if (b->serialEventNeedsRecord) { b->serialEventNeedsRecord = false; CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); } CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->serialEvent, 0)); } else { struct ncclStrongStreamGraph* ag = a->graphHead; NCCLCHECK(checkGraphId(ag, graph.graphId)); struct ncclStrongStreamGraph* bg = b->graphHead; NCCLCHECK(checkGraphId(bg, graph.graphId)); if (b_subsumes_a) ag->tipCount = 0; mergeTips(ag, bg->tipNodes, bg->tipCount); } a->serialEventNeedsRecord = true; #else CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->scratchEvent, 0)); #endif return ncclSuccess; } ncclResult_t ncclStrongStreamWaitStream( struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a ) { #if CUDART_VERSION >= 11030 if (graph.graph == nullptr) { // It is ok to use a->serialEvent to record b since we'll be setting // a->serialEventNeedsRecord so the event won't be considered accurate // until re-recorded. CUDACHECK(cudaEventRecord(a->serialEvent, b)); CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->serialEvent, 0)); } else { cudaStreamCaptureStatus status; unsigned long long bGraphId; cudaGraphNode_t const* bNodes; size_t bCount = 0; CUDACHECK(cudaStreamGetCaptureInfo_v2(b, &status, &bGraphId, nullptr, &bNodes, &bCount)); if (status != cudaStreamCaptureStatusActive || graph.graphId != bGraphId) { WARN("Stream is not being captured by the expected graph."); return ncclInvalidUsage; } struct ncclStrongStreamGraph* ag = a->graphHead; NCCLCHECK(checkGraphId(ag, graph.graphId)); if (b_subsumes_a) ag->tipCount = 0; mergeTips(ag, bNodes, bCount); } a->serialEventNeedsRecord = true; #else CUDACHECK(cudaEventRecord(a->scratchEvent, b)); CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->scratchEvent, 0)); #endif return ncclSuccess; } ncclResult_t ncclStrongStreamWaitStream( struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a ) { #if CUDART_VERSION >= 11030 if (graph.graph == nullptr) { if (b->serialEventNeedsRecord) { b->serialEventNeedsRecord = false; CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); } CUDACHECK(cudaStreamWaitEvent(a, b->serialEvent, 0)); } else { struct ncclStrongStreamGraph* bg = b->graphHead; NCCLCHECK(checkGraphId(bg, graph.graphId)); CUDACHECK(cudaStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount, b_subsumes_a ? 
cudaStreamSetCaptureDependencies : cudaStreamAddCaptureDependencies )); } #else CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); CUDACHECK(cudaStreamWaitEvent(a, b->scratchEvent, 0)); #endif return ncclSuccess; } ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) { #if CUDART_VERSION >= 11030 CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); ss->serialEventNeedsRecord = false; #endif CUDACHECK(cudaStreamSynchronize(ss->cudaStream)); return ncclSuccess; } nccl-2.22.3-1/src/misc/tuner.cc000066400000000000000000000221631463451655400160640ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. * * See LICENSE.txt for license information ************************************************************************/ #include #include #include #include "checks.h" #include "debug.h" #include "tuner.h" pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; static int tunerPluginRefCount; static void* tunerPluginLib = nullptr; static ncclTuner_v3_t* tunerSymbol = nullptr; static ncclTuner_v2_t* ncclTuner_v2 = nullptr; static ncclTuner_v3_t ncclTuner_v2_as_v3; static int hasNvlsSupport(float** collCostTable) { // Requirements for support of different algorithms: // // - NVLS intra-node: nvlsSupport // - NVLS intra+inter-node: collNetSupport // - NVLSTree intra-node: always disabled // - NVLSTree inter-node: nvlsSupport // - Collnet* inter-node: collNetSupport // // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1 float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0; } static int hasCollNetSupport(float** collCostTable) { float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 
0 : 1; } static ncclResult_t ncclTuner_v2_as_v3_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int* nChannels) { int algorithm = NCCL_ALGO_UNDEF; int protocol = NCCL_PROTO_UNDEF; int nvlsSupport = hasNvlsSupport(collCostTable); int collNetSupport = hasCollNetSupport(collCostTable); NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels)); // set time to 0 below to make sure this algorithm/protocol is selected later on if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) { float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0; } return ncclSuccess; } static ncclResult_t ncclTuner_v2_as_v3_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context)); ncclTuner_v2_as_v3.name = ncclTuner_v2->name; ncclTuner_v2_as_v3.getCollInfo = ncclTuner_v2_as_v3_getCollInfo; ncclTuner_v2_as_v3.destroy = ncclTuner_v2->destroy; return ncclSuccess; } #define MAX_STR_LEN 255 static void* tryOpenLib(const char* name, int* err, char* errStr) { *err = 0; if (nullptr == name || strlen(name) == 0) { return nullptr; } if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { name = nullptr; } void *handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); if (nullptr == handle) { strncpy(errStr, dlerror(), MAX_STR_LEN); errStr[MAX_STR_LEN] = '\0'; if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { *err = ENOENT; } } return handle; } static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { if (openErr == ENOENT) { snprintf(nameList, *nameListLen, " %s", name); nameList += strlen(name) + 1; *nameListLen -= strlen(name) + 1; return nameList; } INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: %s", openErrStr); return nameList; } static void* openTunerPluginLib(char* couldNotFindNames, int len) { int openErr; void *pluginLib; char tunerPluginLibName[PATH_MAX]; char openErrStr[MAX_STR_LEN + 1] = { 0 }; const char *envTunerPluginName = getenv("NCCL_TUNER_PLUGIN"); if (envTunerPluginName && strlen(envTunerPluginName)) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: NCCL_TUNER_PLUGIN set to %s", envTunerPluginName); snprintf(tunerPluginLibName, PATH_MAX, "%s", envTunerPluginName); pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); return pluginLib; } couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner-%s.so", envTunerPluginName); pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); return pluginLib; } couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); } else { snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner.so"); pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { return pluginLib; } couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, 
tunerPluginLibName); } const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); if (envNetPluginName && strlen(envNetPluginName)) { // Users are allowed to pack tuner into the net plugin snprintf(tunerPluginLibName, PATH_MAX, "%s", envNetPluginName); pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); return pluginLib; } couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); return pluginLib; } couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); } else { snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net.so"); pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { return pluginLib; } couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); } tunerPluginLibName[0] = '\0'; return nullptr; } enum { tunerPluginLoadFailed = -1, tunerPluginLoadReady = 0, tunerPluginLoadSuccess = 1, }; #define MAX_PLUGIN_LOAD 4 static int status = tunerPluginLoadReady; ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { // Initialize to nullptr by default if plugin tuner cannot be loaded. char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; comm->tuner = nullptr; if (tunerPluginLoadFailed == status) { return ncclSuccess; } pthread_mutex_lock(&tunerPluginLock); if (tunerPluginLoadFailed == status) { goto exit; } if (tunerPluginLoadSuccess == status) { comm->tuner = tunerSymbol; ++tunerPluginRefCount; goto exit; } tunerPluginLib = openTunerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); if (nullptr == tunerPluginLib) { if (strlen(couldNotFindNames)) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Could not find:%s. 
Using internal tuner plugin.", couldNotFindNames); } else { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin."); } goto fail; } tunerSymbol = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3"); if (tunerSymbol == nullptr) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2"); if (ncclTuner_v2 == nullptr) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); dlclose(tunerPluginLib); goto fail; } else { ncclTuner_v2_as_v3.init = ncclTuner_v2_as_v3_init; ncclTuner_v2_as_v3.name = ncclTuner_v2->name; tunerSymbol = &ncclTuner_v2_as_v3; } } INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", tunerSymbol->name); comm->tuner = tunerSymbol; ++tunerPluginRefCount; status = tunerPluginLoadSuccess; comm->tunerPluginLoaded = 1; exit: pthread_mutex_unlock(&tunerPluginLock); return ncclSuccess; fail: tunerPluginLib = nullptr; status = tunerPluginLoadFailed; goto exit; } ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { pthread_mutex_lock(&tunerPluginLock); if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); dlclose(tunerPluginLib); tunerPluginLib = nullptr; tunerSymbol = nullptr; comm->tuner = nullptr; status = tunerPluginLoadReady; comm->tunerPluginLoaded = 0; } pthread_mutex_unlock(&tunerPluginLock); return ncclSuccess; } nccl-2.22.3-1/src/misc/utils.cc000066400000000000000000000227121463451655400160670ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "utils.h" #include "core.h" #include "nvmlwrap.h" #include // Get current Compute Capability int ncclCudaCompCap() { int cudaDev; if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0; int ccMajor, ccMinor; if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0; if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0; return ccMajor*10+ccMinor; } ncclResult_t int64ToBusId(int64_t id, char* busId) { sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf)); return ncclSuccess; } ncclResult_t busIdToInt64(const char* busId, int64_t* id) { char hexStr[17]; // Longest possible int64 hex string + null terminator. int hexOffset = 0; for (int i = 0; hexOffset < sizeof(hexStr) - 1; i++) { char c = busId[i]; if (c == '.' || c == ':') continue; if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) { hexStr[hexOffset++] = busId[i]; } else break; } hexStr[hexOffset] = '\0'; *id = strtol(hexStr, NULL, 16); return ncclSuccess; } // Convert a logical cudaDev index to the NVML device minor number ncclResult_t getBusId(int cudaDev, int64_t *busId) { // On most systems, the PCI bus ID comes back as in the 0000:00:00.0 // format. Still need to allocate proper space in case PCI domain goes // higher. 
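  // e.g. busIdToInt64("0000:3b:00.0") simply concatenates the hex digits and
  // yields 0x3b000, matching the (domain<<20 | bus<<12 | device<<4 | function)
  // layout that int64ToBusId prints back out.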
char busIdStr[] = "00000000:00:00.0"; CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev)); NCCLCHECK(busIdToInt64(busIdStr, busId)); return ncclSuccess; } ncclResult_t getHostName(char* hostname, int maxlen, const char delim) { if (gethostname(hostname, maxlen) != 0) { strncpy(hostname, "unknown", maxlen); return ncclSystemError; } int i = 0; while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++; hostname[i] = '\0'; return ncclSuccess; } uint64_t getHash(const char* string, int n) { // Based on DJB2a, result = result * 33 ^ char uint64_t result = 5381; for (int c = 0; c < n; c++) { result = ((result << 5) + result) ^ string[c]; } return result; } /* Generate a hash of the unique identifying string for this host * that will be unique for both bare-metal and container instances * Equivalent of a hash of; * * $(hostname)$(cat /proc/sys/kernel/random/boot_id) * * This string can be overridden by using the NCCL_HOSTID env var. */ #define HOSTID_FILE "/proc/sys/kernel/random/boot_id" uint64_t getHostHash(void) { char hostHash[1024]; const char *hostId; // Fall back is the full hostname if something fails (void) getHostName(hostHash, sizeof(hostHash), '\0'); int offset = strlen(hostHash); if ((hostId = ncclGetEnv("NCCL_HOSTID")) != NULL) { INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId); strncpy(hostHash, hostId, sizeof(hostHash)-1); hostHash[sizeof(hostHash)-1] = '\0'; } else { FILE *file = fopen(HOSTID_FILE, "r"); if (file != NULL) { char *p; if (fscanf(file, "%ms", &p) == 1) { strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); free(p); } } fclose(file); } // Make sure the string is terminated hostHash[sizeof(hostHash)-1]='\0'; TRACE(NCCL_INIT,"unique hostname '%s'", hostHash); return getHash(hostHash, strlen(hostHash)); } /* Generate a hash of the unique identifying string for this process * that will be unique for both bare-metal and container instances * Equivalent of a hash of; * * $$ $(readlink /proc/self/ns/pid) */ uint64_t getPidHash(void) { char pname[1024]; // Start off with our pid ($$) sprintf(pname, "%ld", (long) getpid()); int plen = strlen(pname); int len = readlink("/proc/self/ns/pid", pname+plen, sizeof(pname)-1-plen); if (len < 0) len = 0; pname[plen+len]='\0'; TRACE(NCCL_INIT,"unique PID '%s'", pname); return getHash(pname, strlen(pname)); } int parseStringList(const char* string, struct netIf* ifList, int maxList) { if (!string) return 0; const char* ptr = string; int ifNum = 0; int ifC = 0; char c; do { c = *ptr; if (c == ':') { if (ifC > 0) { ifList[ifNum].prefix[ifC] = '\0'; ifList[ifNum].port = atoi(ptr+1); ifNum++; ifC = 0; } while (c != ',' && c != '\0') c = *(++ptr); } else if (c == ',' || c == '\0') { if (ifC > 0) { ifList[ifNum].prefix[ifC] = '\0'; ifList[ifNum].port = -1; ifNum++; ifC = 0; } } else { ifList[ifNum].prefix[ifC] = c; ifC++; } ptr++; } while (ifNum < maxList && c); return ifNum; } static bool matchIf(const char* string, const char* ref, bool matchExact) { // Make sure to include '\0' in the exact case int matchLen = matchExact ? 
strlen(string) + 1 : strlen(ref); return strncmp(string, ref, matchLen) == 0; } static bool matchPort(const int port1, const int port2) { if (port1 == -1) return true; if (port2 == -1) return true; if (port1 == port2) return true; return false; } bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) { // Make an exception for the case where no user list is defined if (listSize == 0) return true; for (int i=0; ihunks` points to the top of the stack non-empty hunks. Hunks above // this (reachable via `->above`) are empty. struct Hunk* top = me->topFrame.hunk; size_t mallocSize = 0; // If we have lots of space left in hunk but that wasn't enough then we'll // allocate the object unhunked. if (me->topFrame.end - me->topFrame.bumper >= 8<<10) goto unhunked; // If we have another hunk (which must be empty) waiting above this one and // the object fits then use that. if (top && top->above) { struct Hunk* top1 = top->above; uintptr_t uobj = (reinterpret_cast(top1) + sizeof(struct Hunk) + align-1) & -uintptr_t(align); if (uobj + size <= reinterpret_cast(top1) + top1->size) { me->topFrame.hunk = top1; me->topFrame.bumper = uobj + size; me->topFrame.end = reinterpret_cast(top1) + top1->size; return reinterpret_cast(uobj); } } { // If the next hunk we're going to allocate wouldn't be big enough but the // Unhunk proxy fits in the current hunk then go allocate as unhunked. size_t nextSize = (top ? top->size : 0) + (64<<10); constexpr size_t maxAlign = 64; if (nextSize < sizeof(struct Hunk) + maxAlign + size) { uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk)); if (uproxy + sizeof(struct Unhunk) <= me->topFrame.end) goto unhunked; } // At this point we must need another hunk, either to fit the object // itself or its Unhunk proxy. mallocSize = nextSize; INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize); struct Hunk *top1 = (struct Hunk*)malloc(mallocSize); if (top1 == nullptr) goto malloc_exhausted; top1->size = nextSize; top1->above = nullptr; if (top) top->above = top1; top = top1; me->topFrame.hunk = top; me->topFrame.end = reinterpret_cast(top) + nextSize; me->topFrame.bumper = reinterpret_cast(top) + sizeof(struct Hunk); } { // Try to fit object in the new top hunk. uintptr_t uobj = (me->topFrame.bumper + align-1) & -uintptr_t(align); if (uobj + size <= me->topFrame.end) { me->topFrame.bumper = uobj + size; return reinterpret_cast(uobj); } } unhunked: { // We need to allocate the object out-of-band and put an Unhunk proxy in-band // to keep track of it. uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk)); Unhunk* proxy = reinterpret_cast(uproxy); me->topFrame.bumper = uproxy + sizeof(Unhunk); proxy->next = me->topFrame.unhunks; me->topFrame.unhunks = proxy; mallocSize = size; proxy->obj = malloc(mallocSize); INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize); if (proxy->obj == nullptr) goto malloc_exhausted; return proxy->obj; } malloc_exhausted: WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize); abort(); } void ncclMemoryStackDestruct(struct ncclMemoryStack* me) { // Free unhunks first because both the frames and unhunk proxies lie within the hunks. 
struct ncclMemoryStack::Frame* f = &me->topFrame; while (f != nullptr) { struct ncclMemoryStack::Unhunk* u = f->unhunks; while (u != nullptr) { free(u->obj); u = u->next; } f = f->below; } // Free hunks struct ncclMemoryStack::Hunk* h = me->stub.above; while (h != nullptr) { struct ncclMemoryStack::Hunk *h1 = h->above; free(h); h = h1; } } nccl-2.22.3-1/src/nccl.h.in000066400000000000000000000463731463451655400151730ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_H_ #define NCCL_H_ #include #include #if CUDART_VERSION >= 11000 #include #endif #define NCCL_MAJOR ${nccl:Major} #define NCCL_MINOR ${nccl:Minor} #define NCCL_PATCH ${nccl:Patch} #define NCCL_SUFFIX "${nccl:Suffix}" #define NCCL_VERSION_CODE ${nccl:Version} #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) #ifdef __cplusplus extern "C" { #endif #include /* Opaque handle to communicator */ typedef struct ncclComm* ncclComm_t; #define NCCL_COMM_NULL NULL #define NCCL_UNIQUE_ID_BYTES 128 typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; /* Error type */ typedef enum { ncclSuccess = 0, ncclUnhandledCudaError = 1, ncclSystemError = 2, ncclInternalError = 3, ncclInvalidArgument = 4, ncclInvalidUsage = 5, ncclRemoteError = 6, ncclInProgress = 7, ncclNumResults = 8 } ncclResult_t; #define NCCL_CONFIG_UNDEF_INT INT_MIN #define NCCL_CONFIG_UNDEF_PTR NULL #define NCCL_SPLIT_NOCOLOR -1 #define NCCL_UNDEF_FLOAT -1.0f /* Communicator configuration. Users can assign value to attributes to specify the * behavior of a communicator. */ typedef struct ncclConfig_v21700 { /* attributes that users should never touch. */ size_t size; unsigned int magic; unsigned int version; /* attributes that users are able to customize. */ int blocking; int cgaClusterSize; int minCTAs; int maxCTAs; const char *netName; int splitShare; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. * Not initialized config will result in NCCL error. */ #define NCCL_CONFIG_INITIALIZER { \ sizeof(ncclConfig_t), /* size */ \ 0xcafebeef, /* magic */ \ NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ NCCL_CONFIG_UNDEF_INT, /* blocking */ \ NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ NCCL_CONFIG_UNDEF_PTR, /* netName */ \ NCCL_CONFIG_UNDEF_INT /* splitShare */ \ } /* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */ typedef struct ncclSimInfo_v22200 { size_t size; unsigned int magic; unsigned int version; float estimatedTime; } ncclSimInfo_t; /* NCCL_SIM_INFO_INITIALIZER must be assigned to initialize simInfo structure when it is created. * Not initialized simInfo will result in NCCL error. */ #define NCCL_SIM_INFO_INITIALIZER { \ sizeof(ncclSimInfo_t), /* size */ \ 0x74685283, /* magic */ \ NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ NCCL_UNDEF_FLOAT /* estimated time */ \ } /* NCCL malloc and free function for all types of NCCL optimizations * (e.g. user buffer registration). The actual allocated size might * be larger than requested due to granularity requirement. 
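 *
 * Illustrative usage sketch (error handling omitted; comm, count and handle
 * stand in for the caller's own variables):
 *   void* buff = NULL; void* handle = NULL;
 *   ncclMemAlloc(&buff, count * sizeof(float));
 *   ncclCommRegister(comm, buff, count * sizeof(float), &handle);
 *   ... launch collectives using buff on comm ...
 *   ncclCommDeregister(comm, handle);
 *   ncclMemFree(buff);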
*/ ncclResult_t ncclMemAlloc(void** ptr, size_t size); ncclResult_t pncclMemAlloc(void** ptr, size_t size); ncclResult_t ncclMemFree(void *ptr); ncclResult_t pncclMemFree(void *ptr); /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. * This integer is coded with the MAJOR, MINOR and PATCH level of the * NCCL library */ ncclResult_t ncclGetVersion(int *version); ncclResult_t pncclGetVersion(int *version); /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be * called once and the Id should be distributed to all ranks in the * communicator before calling ncclCommInitRank. */ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); /* Create a new communicator (multi thread/process version) with a configuration * set by users. */ ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); /* Creates a new communicator (multi thread/process version). * rank must be between 0 and nranks-1 and unique within a communicator clique. * Each rank is associated to a CUDA device, which has to be set before calling * ncclCommInitRank. * ncclCommInitRank implicitly syncronizes with other ranks, so it must be * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); /* Creates a clique of communicators (single process version). * This is a convenience function to create a single-process communicator clique. * Returns an array of ndev newly initialized communicators in comm. * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). * If devlist is NULL, the first ndev CUDA devices are used. * Order of devlist defines user-order of processors within the communicator. */ ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); /* Finalize a communicator. ncclCommFinalize flushes all issued communications, * and marks communicator state as ncclInProgress. The state will change to ncclSuccess * when the communicator is globally quiescent and related resources are freed; then, * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator * itself) without blocking. */ ncclResult_t ncclCommFinalize(ncclComm_t comm); ncclResult_t pncclCommFinalize(ncclComm_t comm); /* Frees local resources associated with communicator object. */ ncclResult_t ncclCommDestroy(ncclComm_t comm); ncclResult_t pncclCommDestroy(ncclComm_t comm); /* Frees resources associated with communicator object and aborts any operations * that might still be running on the device. */ ncclResult_t ncclCommAbort(ncclComm_t comm); ncclResult_t pncclCommAbort(ncclComm_t comm); /* Creates one or more communicators from an existing one. * Ranks with the same color will end up in the same communicator. * Within the new communicator, key will be used to order ranks. * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group * and will therefore return a NULL communicator. 
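 * For example (illustrative only), passing color = nodeId and key = localRank
 * groups ranks by node and orders each new communicator by local rank, while
 * ranks that pass NCCL_SPLIT_NOCOLOR receive *newcomm == NULL.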
* If config is NULL, the new communicator will inherit the original communicator's * configuration*/ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); /* Returns a string for each error code. */ const char* ncclGetErrorString(ncclResult_t result); const char* pncclGetErrorString(ncclResult_t result); /* Returns a human-readable message of the last error that occurred. */ const char* ncclGetLastError(ncclComm_t comm); const char* pncclGetLastError(ncclComm_t comm); /* Checks whether the comm has encountered any asynchronous errors */ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); /* Gets the number of ranks in the communicator clique. */ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); /* Returns the cuda device number associated with the communicator. */ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); /* Returns the user-ordered "rank" associated with the communicator. */ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); /* Register CUDA buffer for zero-copy operation */ ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); /* Deregister CUDA buffer */ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle); ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle); /* Reduction operation selector */ typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; typedef enum { ncclSum = 0, ncclProd = 1, ncclMax = 2, ncclMin = 3, ncclAvg = 4, /* ncclNumOps: The number of built-in ncclRedOp_t values. Also * serves as the least possible value for dynamic ncclRedOp_t's * as constructed by ncclRedOpCreate*** functions. */ ncclNumOps = 5, /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. * It is defined to be the largest signed value (since compilers * are permitted to use signed enums) that won't grow * sizeof(ncclRedOp_t) when compared to previous NCCL versions to * maintain ABI compatibility. */ ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) } ncclRedOp_t; /* Data types */ typedef enum { ncclInt8 = 0, ncclChar = 0, ncclUint8 = 1, ncclInt32 = 2, ncclInt = 2, ncclUint32 = 3, ncclInt64 = 4, ncclUint64 = 5, ncclFloat16 = 6, ncclHalf = 6, ncclFloat32 = 7, ncclFloat = 7, ncclFloat64 = 8, ncclDouble = 8, #if defined(__CUDA_BF16_TYPES_EXIST__) ncclBfloat16 = 9, ncclNumTypes = 10 #else ncclNumTypes = 9 #endif } ncclDataType_t; /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ typedef enum { /* ncclScalarDevice: The scalar is in device-visible memory and will be * dereferenced while the collective is running. */ ncclScalarDevice = 0, /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be * dereferenced before the ncclRedOpCreate***() function returns. 
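*
* For instance (an illustrative sketch; "comm" and "nranks" are assumed to exist,
* and ncclRedOpCreatePreMulSum is declared further below), a scalar on the host
* stack may be used in this mode because it is read before the call returns:
*
*   float scale = 1.0f / nranks;
*   ncclRedOp_t op;
*   ncclRedOpCreatePreMulSum(&op, &scale, ncclFloat32, ncclScalarHostImmediate, comm);
*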
*/ ncclScalarHostImmediate = 1 } ncclScalarResidence_t; /* * ncclRedOpCreatePreMulSum * * Creates a new reduction operator which pre-multiplies input values by a given * scalar locally before reducing them with peer values via summation. For use * only with collectives launched against *comm* and *datatype*. The * *residence* argument indicates how/when the memory pointed to by *scalar* * will be dereferenced. Upon return, the newly created operator's handle * is stored in *op*. */ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); /* * ncclRedOpDestroy * * Destroys the reduction operator *op*. The operator must have been created by * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be * destroyed as soon as the last NCCL function which is given that operator returns. */ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); /* * Collective communication operations * * Collective communication operations must be called separately for each * communicator in a communicator clique. * * They return when operations have been enqueued on the CUDA stream. * * Since they may perform inter-CPU synchronization, each call has to be done * from a different thread or process, or need to use Group Semantics (see * below). */ /* * Reduce * * Reduces data arrays of length count in sendbuff into recvbuff using op * operation. * recvbuff may be NULL on all calls except for root device. * root is the rank (not the CUDA device) where data will reside after the * operation is complete. * * In-place operation will happen if sendbuff == recvbuff. */ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); /* * (deprecated) Broadcast (in-place) * * Copies count values from root to all other devices. * root is the rank (not the CUDA device) where data resides before the * operation is started. * * This operation is implicitely in place. */ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); /* * Broadcast * * Copies count values from root to all other devices. * root is the rank (not the CUDA device) where data resides before the * operation is started. * * In-place operation will happen if sendbuff == recvbuff. */ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); /* * All-Reduce * * Reduces data arrays of length count in sendbuff using op operation, and * leaves identical copies of result on each recvbuff. * * In-place operation will happen if sendbuff == recvbuff. 
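*
* A minimal call sketch (illustrative only; error checking is omitted, and
* sendbuff/recvbuff are assumed to be device buffers holding count floats):
*
*   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat32, ncclSum, comm, stream);
*   cudaStreamSynchronize(stream); // wait for the result on the stream
*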
*/ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); /* * Reduce-Scatter * * Reduces data in sendbuff using op operation and leaves reduced result * scattered over the devices so that recvbuff on rank i will contain the i-th * block of the result. * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff * should have a size of at least nranks*recvcount elements. * * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. */ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); /* * All-Gather * * Each device gathers sendcount values from other GPUs into recvbuff, * receiving data from rank i at offset i*sendcount. * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff * should have a size of at least nranks*sendcount elements. * * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. */ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); /* * Send * * Send data from sendbuff to rank peer. * * Rank peer needs to call ncclRecv with the same datatype and the same count from this * rank. * * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ * ncclGroupEnd section. */ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); /* * Receive * * Receive data from rank peer into recvbuff. * * Rank peer needs to call ncclSend with the same datatype and the same count to this * rank. * * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ * ncclGroupEnd section. */ ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); /* * Group semantics * * When managing multiple GPUs from a single thread, and since NCCL collective * calls may perform inter-CPU synchronization, we need to "group" calls for * different ranks/devices into a single call. * * Grouping NCCL calls as being part of the same collective operation is done * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all * collective calls until the ncclGroupEnd call, which will wait for all calls * to be complete. 
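*
* For example (an illustrative sketch; comms[i], streams[i] and the device
* buffers are assumed to exist, error checking is omitted), a single thread
* driving ngpus GPUs can fuse one call per device into a single operation:
*
*   ncclGroupStart();
*   for (int i = 0; i < ngpus; i++)
*     ncclAllReduce(sendbuffs[i], recvbuffs[i], count, ncclFloat32, ncclSum, comms[i], streams[i]);
*   ncclGroupEnd();
*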
Note that for collective communication, ncclGroupEnd only * guarantees that the operations are enqueued on the streams, not that * the operation is effectively done. * * Both collective communication and ncclCommInitRank can be used in conjunction * of ncclGroupStart/ncclGroupEnd, but not together. * * Group semantics also allow to fuse multiple operations on the same device * to improve performance (for aggregated collective calls), or to permit * concurrent progress of multiple send/receive operations. */ /* * Group Start * * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into * a single NCCL operation. Nothing will be started on the CUDA stream until * ncclGroupEnd. */ ncclResult_t ncclGroupStart(); ncclResult_t pncclGroupStart(); /* * Group End * * End a group call. Start a fused NCCL operation consisting of all calls since * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations * need to be called after ncclGroupEnd. */ ncclResult_t ncclGroupEnd(); ncclResult_t pncclGroupEnd(); /* * Group Simulate End * * Simulate a ncclGroupEnd() call and return NCCL's simulation info in a struct. */ ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo); ncclResult_t pncclGroupSimulateEnd(ncclSimInfo_t* simInfo); #ifdef __cplusplus } // end extern "C" #endif #endif // end include guard nccl-2.22.3-1/src/nccl.pc.in000077500000000000000000000004341463451655400153350ustar00rootroot00000000000000prefix=${nccl:Prefix} exec_prefix=${prefix} libdir=${exec_prefix}/lib includedir=${prefix}/include Name: nccl Description: Optimized primitives for collective multi-GPU communication Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch} Libs: -L${libdir} -lnccl Cflags: -I${includedir} nccl-2.22.3-1/src/net.cc000066400000000000000000000664631463451655400145750ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "net.h" #include "bootstrap.h" #include "checks.h" #include #include #include //#include //#include //#include static ncclNet_v8_t ncclNet_v5_as_v8; static ncclNet_v8_t ncclNet_v6_as_v8; static ncclNet_v8_t ncclNet_v7_as_v8; static ncclNet_v5_t *ncclNet_v5; static ncclNet_v6_t *ncclNet_v6; static ncclNet_v7_t *ncclNet_v7; static ncclCollNet_v8_t ncclCollNet_v5_as_v8; static ncclCollNet_v8_t ncclCollNet_v6_as_v8; static ncclCollNet_v8_t ncclCollNet_v7_as_v8; static ncclCollNet_v5_t *ncclCollNet_v5; static ncclCollNet_v6_t *ncclCollNet_v6; static ncclCollNet_v7_t *ncclCollNet_v7; static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { ncclNetProperties_v7_t p7; ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7); if (ans != ncclSuccess) return ans; props->name = p7.name; props->pciPath = p7.pciPath; props->guid = p7.guid; props->ptrSupport = p7.ptrSupport; props->regIsGlobal = 0; props->speed = p7.speed; props->port = p7.port; props->maxComms = p7.maxComms; props->maxRecvs = p7.maxRecvs; props->latency = p7.latency; props->netDeviceType = p7.netDeviceType; props->netDeviceVersion = p7.netDeviceVersion; return ncclSuccess; } static ncclResult_t ncclNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); } static ncclResult_t ncclNet_v7_as_v8_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v7->init(logfn)); ncclNet_v7_as_v8.name = ncclNet_v7->name; ncclNet_v7_as_v8.devices = ncclNet_v7->devices; ncclNet_v7_as_v8.getProperties = ncclNet_v7_as_v8_getProperties; // ncclNet_v5->getProperties; ncclNet_v7_as_v8.listen = ncclNet_v7->listen; ncclNet_v7_as_v8.connect = ncclNet_v7->connect; ncclNet_v7_as_v8.accept = ncclNet_v7->accept; ncclNet_v7_as_v8.regMr = ncclNet_v7_as_v8_regMr; ncclNet_v7_as_v8.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; ncclNet_v7_as_v8.deregMr = ncclNet_v7->deregMr; ncclNet_v7_as_v8.isend = ncclNet_v7->isend; ncclNet_v7_as_v8.irecv = ncclNet_v7->irecv; ncclNet_v7_as_v8.iflush = ncclNet_v7->iflush; ncclNet_v7_as_v8.test = ncclNet_v7->test; ncclNet_v7_as_v8.closeSend = ncclNet_v7->closeSend; ncclNet_v7_as_v8.closeRecv = ncclNet_v7->closeRecv; ncclNet_v7_as_v8.closeListen = ncclNet_v7->closeListen; ncclNet_v7_as_v8.getDeviceMr = ncclNet_v7->getDeviceMr; ncclNet_v7_as_v8.irecvConsumed = ncclNet_v7->irecvConsumed; return ncclSuccess; } static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; props->name = p6.name; props->pciPath = p6.pciPath; props->guid = p6.guid; props->ptrSupport = p6.ptrSupport; props->regIsGlobal = 0; props->speed = p6.speed; props->port = p6.port; props->maxComms = p6.maxComms; props->maxRecvs = p6.maxRecvs; props->latency = p6.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; return ncclSuccess; } static ncclResult_t ncclNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); } static ncclResult_t ncclNet_v6_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { return 
ncclNet_v6->connect(dev, handle, sendComm); } static ncclResult_t ncclNet_v6_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { return ncclNet_v6->accept(listenComm, recvComm); } static ncclResult_t ncclNet_v6_as_v8_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v6->init(logfn)); ncclNet_v6_as_v8.name = ncclNet_v6->name; ncclNet_v6_as_v8.devices = ncclNet_v6->devices; ncclNet_v6_as_v8.getProperties = ncclNet_v6_as_v8_getProperties; // ncclNet_v5->getProperties; ncclNet_v6_as_v8.listen = ncclNet_v6->listen; ncclNet_v6_as_v8.connect = ncclNet_v6_as_v8_connect; ncclNet_v6_as_v8.accept = ncclNet_v6_as_v8_accept; ncclNet_v6_as_v8.regMr = ncclNet_v6_as_v8_regMr; ncclNet_v6_as_v8.regMrDmaBuf = ncclNet_v6->regMrDmaBuf; ncclNet_v6_as_v8.deregMr = ncclNet_v6->deregMr; ncclNet_v6_as_v8.isend = ncclNet_v6->isend; ncclNet_v6_as_v8.irecv = ncclNet_v6->irecv; ncclNet_v6_as_v8.iflush = ncclNet_v6->iflush; ncclNet_v6_as_v8.test = ncclNet_v6->test; ncclNet_v6_as_v8.closeSend = ncclNet_v6->closeSend; ncclNet_v6_as_v8.closeRecv = ncclNet_v6->closeRecv; ncclNet_v6_as_v8.closeListen = ncclNet_v6->closeListen; ncclNet_v6_as_v8.getDeviceMr = NULL; ncclNet_v6_as_v8.irecvConsumed = NULL; return ncclSuccess; } static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; props->name = p6.name; props->pciPath = p6.pciPath; props->guid = p6.guid; props->ptrSupport = p6.ptrSupport; props->regIsGlobal = 0; props->speed = p6.speed; props->port = p6.port; props->maxComms = p6.maxComms; props->maxRecvs = p6.maxRecvs; props->latency = p6.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; return ncclSuccess; } static ncclResult_t ncclNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle); } static ncclResult_t ncclNet_v5_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { return ncclNet_v5->connect(dev, handle, sendComm); } static ncclResult_t ncclNet_v5_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { return ncclNet_v5->accept(listenComm, recvComm); } // We use a wrapper around the v5 init to copy over the struct contents // post-init since they may not be initialized before hand. 
static ncclResult_t ncclNet_v5_as_v8_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v5->init(logfn)); ncclNet_v5_as_v8.name = ncclNet_v5->name; ncclNet_v5_as_v8.devices = ncclNet_v5->devices; ncclNet_v5_as_v8.getProperties = ncclNet_v5_as_v8_getProperties; ncclNet_v5_as_v8.listen = ncclNet_v5->listen; ncclNet_v5_as_v8.connect = ncclNet_v5_as_v8_connect; ncclNet_v5_as_v8.accept = ncclNet_v5_as_v8_accept; ncclNet_v5_as_v8.regMr = ncclNet_v5_as_v8_regMr; ncclNet_v5_as_v8.regMrDmaBuf = NULL; ncclNet_v5_as_v8.deregMr = ncclNet_v5->deregMr; ncclNet_v5_as_v8.isend = ncclNet_v5->isend; ncclNet_v5_as_v8.irecv = ncclNet_v5->irecv; ncclNet_v5_as_v8.iflush = ncclNet_v5->iflush; ncclNet_v5_as_v8.test = ncclNet_v5->test; ncclNet_v5_as_v8.closeSend = ncclNet_v5->closeSend; ncclNet_v5_as_v8.closeRecv = ncclNet_v5->closeRecv; ncclNet_v5_as_v8.closeListen = ncclNet_v5->closeListen; ncclNet_v5_as_v8.getDeviceMr = NULL; ncclNet_v5_as_v8.irecvConsumed = NULL; return ncclSuccess; } static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; props->name = p6.name; props->pciPath = p6.pciPath; props->guid = p6.guid; props->ptrSupport = p6.ptrSupport; props->regIsGlobal = 0; props->speed = p6.speed; props->port = p6.port; props->maxComms = p6.maxComms; props->maxRecvs = p6.maxRecvs; props->latency = p6.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; return ncclSuccess; } static ncclResult_t ncclCollNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle); } // We use a wrapper around the v5 init to copy over the struct contents // post-init since they may not be initialized before hand. 
static ncclResult_t ncclCollNet_v5_as_v8_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v5->init(logfn)); ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name; ncclCollNet_v5_as_v8.devices = ncclCollNet_v5->devices; ncclCollNet_v5_as_v8.getProperties = ncclCollNet_v5_as_v8_getProperties; ncclCollNet_v5_as_v8.listen = ncclCollNet_v5->listen; ncclCollNet_v5_as_v8.connect = ncclCollNet_v5->connect; ncclCollNet_v5_as_v8.reduceSupport = ncclCollNet_v5->reduceSupport; ncclCollNet_v5_as_v8.regMr = ncclCollNet_v5_as_v8_regMr; ncclCollNet_v5_as_v8.regMrDmaBuf = NULL; ncclCollNet_v5_as_v8.deregMr = ncclCollNet_v5->deregMr; ncclCollNet_v5_as_v8.iallreduce = ncclCollNet_v5->iallreduce; ncclCollNet_v5_as_v8.iallgather = nullptr; ncclCollNet_v5_as_v8.ireducescatter = nullptr; ncclCollNet_v5_as_v8.iflush = ncclCollNet_v5->iflush; ncclCollNet_v5_as_v8.test = ncclCollNet_v5->test; ncclCollNet_v5_as_v8.closeColl = ncclCollNet_v5->closeColl; ncclCollNet_v5_as_v8.closeListen = ncclCollNet_v5->closeListen; return ncclSuccess; } static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; props->name = p6.name; props->pciPath = p6.pciPath; props->guid = p6.guid; props->ptrSupport = p6.ptrSupport; props->regIsGlobal = 0; props->speed = p6.speed; props->port = p6.port; props->maxComms = p6.maxComms; props->maxRecvs = p6.maxRecvs; props->latency = p6.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; return ncclSuccess; } static ncclResult_t ncclCollNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle); } // We use a wrapper around the v6 init to copy over the struct contents // post-init since they may not be initialized before hand. 
static ncclResult_t ncclCollNet_v6_as_v8_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v6->init(logfn)); ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name; ncclCollNet_v6_as_v8.devices = ncclCollNet_v6->devices; ncclCollNet_v6_as_v8.getProperties = ncclCollNet_v6_as_v8_getProperties; ncclCollNet_v6_as_v8.listen = ncclCollNet_v6->listen; ncclCollNet_v6_as_v8.connect = ncclCollNet_v6->connect; ncclCollNet_v6_as_v8.reduceSupport = ncclCollNet_v6->reduceSupport; ncclCollNet_v6_as_v8.regMr = ncclCollNet_v6_as_v8_regMr; ncclCollNet_v6_as_v8.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf; ncclCollNet_v6_as_v8.deregMr = ncclCollNet_v6->deregMr; ncclCollNet_v6_as_v8.iallreduce = ncclCollNet_v6->iallreduce; ncclCollNet_v6_as_v8.iallgather = nullptr; ncclCollNet_v6_as_v8.ireducescatter = nullptr; ncclCollNet_v6_as_v8.iflush = ncclCollNet_v6->iflush; ncclCollNet_v6_as_v8.test = ncclCollNet_v6->test; ncclCollNet_v6_as_v8.closeColl = ncclCollNet_v6->closeColl; ncclCollNet_v6_as_v8.closeListen = ncclCollNet_v6->closeListen; return ncclSuccess; } static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { ncclNetProperties_v7_t p7; ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); if (ans != ncclSuccess) return ans; props->name = p7.name; props->pciPath = p7.pciPath; props->guid = p7.guid; props->ptrSupport = p7.ptrSupport; props->regIsGlobal = 0; props->speed = p7.speed; props->port = p7.port; props->maxComms = p7.maxComms; props->maxRecvs = p7.maxRecvs; props->latency = p7.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; return ncclSuccess; } static ncclResult_t ncclCollNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); } // We use a wrapper around the v7 init to copy over the struct contents // post-init since they may not be initialized before hand. 
static ncclResult_t ncclCollNet_v7_as_v8_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v7->init(logfn)); ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name; ncclCollNet_v7_as_v8.devices = ncclCollNet_v7->devices; ncclCollNet_v7_as_v8.getProperties = ncclCollNet_v7_as_v8_getProperties; ncclCollNet_v7_as_v8.listen = ncclCollNet_v7->listen; ncclCollNet_v7_as_v8.connect = ncclCollNet_v7->connect; ncclCollNet_v7_as_v8.reduceSupport = ncclCollNet_v7->reduceSupport; ncclCollNet_v7_as_v8.regMr = ncclCollNet_v7_as_v8_regMr; ncclCollNet_v7_as_v8.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; ncclCollNet_v7_as_v8.deregMr = ncclCollNet_v7->deregMr; ncclCollNet_v7_as_v8.iallreduce = ncclCollNet_v7->iallreduce; ncclCollNet_v7_as_v8.iallgather = nullptr; ncclCollNet_v7_as_v8.ireducescatter = nullptr; ncclCollNet_v7_as_v8.iflush = ncclCollNet_v7->iflush; ncclCollNet_v7_as_v8.test = ncclCollNet_v7->test; ncclCollNet_v7_as_v8.closeColl = ncclCollNet_v7->closeColl; ncclCollNet_v7_as_v8.closeListen = ncclCollNet_v7->closeListen; return ncclSuccess; } static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; ncclNet_t* ncclNets[3] = { nullptr, &ncclNetIb, &ncclNetSocket }; ncclCollNet_t* ncclCollNets[3] = { nullptr, nullptr, nullptr }; enum ncclNetState { ncclNetStateInit = 0, ncclNetStateEnabled = 1, ncclNetStateDisabled = 2 }; enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; #define MAX_STR_LEN 255 static void* tryOpenLib(char* name, int* err, char* errStr) { *err = 0; if (nullptr == name || strlen(name) == 0) { return nullptr; } if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { name = nullptr; } void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); if (nullptr == handle) { strncpy(errStr, dlerror(), MAX_STR_LEN); errStr[MAX_STR_LEN] = '\0'; if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { *err = ENOENT; } } return handle; } static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { if (openErr == ENOENT) { snprintf(nameList, *nameListLen, " %s", name); nameList += strlen(name) + 1; *nameListLen -= strlen(name) + 1; return nameList; } INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: %s", openErrStr); return nameList; } static void* openNetPluginLib(char* couldNotFindNames, int len) { int openErr; void *pluginLib; char netPluginLibName[PATH_MAX]; char openErrStr[MAX_STR_LEN + 1] = { 0 }; const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); if (envNetPluginName && strlen(envNetPluginName)) { snprintf(netPluginLibName, PATH_MAX, "%s", envNetPluginName); pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); return pluginLib; } couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); snprintf(netPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); return pluginLib; } couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); } else { snprintf(netPluginLibName, PATH_MAX, "libnccl-net.so"); pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); if (pluginLib) { return pluginLib; } couldNotFindNames = 
tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); } return nullptr; } static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; static int netPluginRefCount; static void* netPluginLib; enum { netPluginLoadFailed = -1, netPluginLoadReady = 0, netPluginLoadSuccess = 1, }; static int netPluginStatus = netPluginLoadReady; #define MAX_PLUGIN_LOAD 2 ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; if (netPluginLoadFailed == netPluginStatus) { return ncclSuccess; } pthread_mutex_lock(&netPluginLock); if (netPluginLoadSuccess == netPluginStatus) { ++netPluginRefCount; goto exit; } netPluginLib = openNetPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); if (netPluginLib == nullptr) { if (strlen(couldNotFindNames)) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Could not find:%s. Using internal network plugin.", couldNotFindNames); } else { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Using internal network plugin."); } goto fail; } ncclNets[0] = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8"); if (ncclNets[0] == nullptr) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol."); // Try v7 plugin ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7"); if (ncclNet_v7 == nullptr) { // Try v6 plugin ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); if (ncclNet_v6 == nullptr) { // Try v5 plugin ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); if (ncclNet_v5 == nullptr) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported."); goto fail; } else { ncclNets[0] = &ncclNet_v5_as_v8; ncclNet_v5_as_v8.init = ncclNet_v5_as_v8_init; // Set the name right away to allow for NCCL_NET=... to work ncclNet_v5_as_v8.name = ncclNet_v5->name; INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); } } else { ncclNets[0] = &ncclNet_v6_as_v8; ncclNet_v6_as_v8.init = ncclNet_v6_as_v8_init; // Set the name right away to allow for NCCL_NET=... to work ncclNet_v6_as_v8.name = ncclNet_v6->name; INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name); } } else { ncclNets[0] = &ncclNet_v7_as_v8; ncclNet_v7_as_v8.init = ncclNet_v7_as_v8_init; // Set the name right away to allow for NCCL_NET=... to work ncclNet_v7_as_v8.name = ncclNet_v7->name; INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name); } } // Check for CollNet ncclCollNets[0] = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8"); if (ncclCollNets[0] == nullptr) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol."); ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7"); if (ncclCollNet_v7 == nullptr) { ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); if (ncclCollNet_v6 == nullptr) { ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); if (ncclCollNet_v5 == nullptr) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). 
ncclCollNetPlugin symbols v4 and lower are not supported."); } else { ncclCollNets[0] = &ncclCollNet_v5_as_v8; ncclCollNet_v5_as_v8.init = ncclCollNet_v5_as_v8_init; ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name; INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name); } } else { ncclCollNets[0] = &ncclCollNet_v6_as_v8; ncclCollNet_v6_as_v8.init = ncclCollNet_v6_as_v8_init; ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name; INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name); } } else { ncclCollNets[0] = &ncclCollNet_v7_as_v8; ncclCollNet_v7_as_v8.init = ncclCollNet_v7_as_v8_init; ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name; INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name); } } ++netPluginRefCount; netPluginStatus = netPluginLoadSuccess; comm->netPluginLoaded = 1; exit: pthread_mutex_unlock(&netPluginLock); return ncclSuccess; fail: if (netPluginLib) dlclose(netPluginLib); netPluginStatus = netPluginLoadFailed; goto exit; } ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { pthread_mutex_lock(&netPluginLock); if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { if (ncclNets[0]) { INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); } if (ncclCollNets[0]) { INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); } dlclose(netPluginLib); netPluginLib = nullptr; ncclNets[0] = nullptr; ncclCollNets[0] = nullptr; netPluginStatus = netPluginLoadReady; comm->netPluginLoaded = 0; } pthread_mutex_unlock(&netPluginLock); return ncclSuccess; } ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { ncclNetProperties_t props; NCCLCHECK(net->getProperties(dev, &props)); ncclNetDeviceType type = props.netDeviceType; if (type) switch (type) { case NCCL_NET_DEVICE_UNPACK: if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) { INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d", props.netDeviceVersion); return ncclSuccess; } else { WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it", props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION); return ncclInternalError; } default: WARN("Unknown device code index"); return ncclInternalError; } return ncclSuccess; } static ncclResult_t netGetState(int i, enum ncclNetState* state) { pthread_mutex_lock(&netLock); if (ncclNetStates[i] == ncclNetStateInit) { int ndev; if (ncclNets[i]->init(ncclDebugLog) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; else ncclNetStates[i] = ncclNetStateEnabled; } *state = ncclNetStates[i]; pthread_mutex_unlock(&netLock); return ncclSuccess; } static ncclResult_t collNetGetState(int i, enum ncclNetState* state) { pthread_mutex_lock(&netLock); if (ncclCollNetStates[i] == ncclNetStateInit) { int ndev; if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; else ncclCollNetStates[i] = ncclNetStateEnabled; } *state = ncclCollNetStates[i]; pthread_mutex_unlock(&netLock); return ncclSuccess; } ncclResult_t ncclNetInit(struct ncclComm* comm) { // Initialize main communication network const char* netName; bool ok = false; netName = comm->config.netName; for (int i=0; 
i<3; i++) { if (ncclNets[i] == nullptr) continue; enum ncclNetState state; NCCLCHECK(netGetState(i, &state)); if (state != ncclNetStateEnabled) continue; if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) { // Mismatched device plugin version continue; } comm->ncclNet = ncclNets[i]; ok = true; if (ncclCollNets[i]) { NCCLCHECK(collNetGetState(i, &state)); if (state == ncclNetStateEnabled) { comm->ncclCollNet = ncclCollNets[i]; } } break; } if (!ok) { WARN("Error: network %s not found.", netName ? netName : ""); return ncclInvalidUsage; } return ncclSuccess; } ncclResult_t ncclNetFinalize(struct ncclComm* comm) { comm->ncclNet = nullptr; comm->ncclCollNet = nullptr; return ncclSuccess; } ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { constexpr int GPU_BUF_SIZE = 2*1024*1024; #if CUDART_VERSION >= 11030 // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute int driverVersion; CUDACHECK(cudaDriverGetVersion(&driverVersion)); if (driverVersion >= 11030) { int cudaDev, attr = 0; CUDACHECK(cudaGetDevice(&cudaDev)); CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev)); *gdrSupport = attr; return ncclSuccess; } #endif static int gdrSupportMatrix[32] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; if (gdrSupportMatrix[comm->cudaDev] == -1) { int netDevs; NCCLCHECK(comm->ncclNet->devices(&netDevs)); gdrSupportMatrix[comm->cudaDev] = 0; for (int dev=0; devncclNet->getProperties(dev, &props)); if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; // Allocate memory on the GPU and try to register it on the NIC. void *lComm = NULL, *sComm = NULL, *rComm = NULL; ncclNetHandle_t handle; char* gpuPtr = NULL; void* mHandle = NULL; ncclResult_t ret; ncclDebugNoWarn = NCCL_NET; NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1); bool connected; connected = false; while (!connected) { // If we're aborting now, skip to cleanup if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { goto cleanup2; } if (sComm == NULL) NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm, NULL), ret, cleanup2); if (rComm == NULL) NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2); connected = (rComm != NULL) && (sComm != NULL); } NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2); if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle)); NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle)); gdrSupportMatrix[comm->cudaDev] = 1; } ncclDebugNoWarn = 0; NCCLCHECK(ncclCudaFree(gpuPtr)); cleanup2: if (rComm != NULL) NCCLCHECK(comm->ncclNet->closeRecv(rComm)); if (sComm != NULL) NCCLCHECK(comm->ncclNet->closeSend(sComm)); NCCLCHECK(comm->ncclNet->closeListen(lComm)); cleanup1: break; } } *gdrSupport = gdrSupportMatrix[comm->cudaDev]; return ncclSuccess; } int ncclNetVersion(struct ncclComm* comm) { return (comm->ncclNet == &ncclNet_v5_as_v8) ? 5 : (comm->ncclNet == &ncclNet_v6_as_v8) ? 6 : (comm->ncclNet == &ncclNet_v7_as_v8) ? 
7 : 8; } nccl-2.22.3-1/src/proxy.cc000066400000000000000000002004411463451655400151520ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "info.h" #include "collectives.h" #include "socket.h" #include "shm.h" #include "profiler.h" #define ENABLE_TIMER 0 #include "timer.h" #include "transport.h" #include #include #include #include enum { proxyRecv=0, proxySend=1 }; static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) { if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true; /* In chains, one rank does not need a proxy. Let's figure out which one it is */ /* Which index in the reorganized rings should we compare root against */ const int myrank = 0, nextrank = 1, prevrank = nranks-1; int index = pattern == ncclPatternPipelineFrom ? /* no recv / no send if root = */ /* bcast */ (type == proxyRecv ? myrank : nextrank ): /* reduce */ (type == proxyRecv ? prevrank : myrank ); int rank = ring->userRanks[index]; return (root != rank); } #define PROXYARGS_ALLOCATE_SIZE NCCL_MAX_OPS struct ncclProxyPool { struct ncclProxyPool *next; struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; }; static void expectedProxyResponseFree(struct ncclProxyState* state) { struct ncclExpectedProxyResponse* elem = state->expectedResponses; struct ncclExpectedProxyResponse* prev = NULL; while (elem) { prev = elem; elem = elem->next; free(prev->respBuff); free(prev); } } static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize, ncclResult_t res) { struct ncclExpectedProxyResponse* elem = state->expectedResponses; while (elem) { if (elem->opId == opId) { if (respSize != elem->respSize) { WARN("Mismatched response size for opId=%p", opId); return ncclInternalError; } if (elem->done) { WARN("Storing response for already completed opId=%p", opId); return ncclInternalError; } memcpy(elem->respBuff, respBuff, respSize); free(respBuff); elem->done = true; elem->res = res; return ncclSuccess; } elem = elem->next; } WARN("Proxy response for opId=%p doesn't match any expected response", opId); return ncclInternalError; } static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, void* opId, int respSize) { struct ncclExpectedProxyResponse* ex; NCCLCHECK(ncclCalloc(&ex, 1)); ex->opId = opId; // Pre-alloc response buffer ex->respBuff = malloc(respSize); ex->respSize = respSize; ex->res = ncclInternalError; ex->done = false; // Enqueue struct ncclExpectedProxyResponse* list = state->expectedResponses; if (list == NULL) { state->expectedResponses = ex; return ncclSuccess; } while (list->next) list = list->next; list->next = ex; return ncclSuccess; } static ncclResult_t expectedProxyResponseDequeue(struct ncclProxyState* state, void* opId, void* respBuff, int* found) { struct ncclExpectedProxyResponse* elem = state->expectedResponses; struct ncclExpectedProxyResponse* prev = NULL; *found = 0; while (elem) { if ((elem->opId == opId) && elem->done) { if (prev == NULL) { state->expectedResponses = elem->next; } else { prev->next = elem->next; } memcpy(respBuff, elem->respBuff, elem->respSize); ncclResult_t res = elem->res; free(elem->respBuff); free(elem); *found = 1; return res; } prev = elem; elem = elem->next; } return ncclSuccess; } 
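// Illustrative note (not part of the original implementation): the expected-response
// helpers above act as a simple opId-keyed mailbox for asynchronous proxy requests.
// A typical round trip, sketched with hypothetical "opId"/"respBuff" variables, is:
//
//   expectedProxyResponseEnqueue(state, opId, respSize);              // register interest, pre-allocate the reply buffer
//   expectedProxyResponseStore(state, opId, respBuff, respSize, res); // invoked when the reply arrives; marks the entry done
//   int found = 0;
//   expectedProxyResponseDequeue(state, opId, respBuff, &found);      // copies the reply out once the entry is done
//
// expectedProxyResponseRemove() below simply drops an entry that is no longer expected.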
static ncclResult_t expectedProxyResponseRemove(struct ncclProxyState* state, void* opId) { struct ncclExpectedProxyResponse* elem = state->expectedResponses; struct ncclExpectedProxyResponse* prev = NULL; while (elem) { if (elem->opId == opId) { if (prev == NULL) { state->expectedResponses = elem->next; } else { prev->next = elem->next; } free(elem->respBuff); free(elem); return ncclSuccess; } prev = elem; elem = elem->next; } WARN("Couldn't find opId=%p", opId); return ncclInternalError; } static ncclResult_t asyncProxyOpEnqueue(struct ncclProxyLocalPeer* peer, ncclProxyAsyncOp* op) { ncclProxyAsyncOp* list = peer->asyncOps; if (list == NULL) { peer->asyncOps = op; return ncclSuccess; } while (list->next) list = list->next; list->next = op; return ncclSuccess; } static ncclResult_t asyncProxyOpDequeue(struct ncclProxyLocalPeer* peer, ncclProxyAsyncOp* op) { struct ncclProxyAsyncOp* elem = peer->asyncOps; struct ncclProxyAsyncOp* prev = NULL; while (elem) { if (elem->opId == op->opId) { if (prev == NULL) { peer->asyncOps = elem->next; } else { prev->next = elem->next; } if (elem->reqBuff) { free(elem->reqBuff); } if (elem->respBuff) { free(elem->respBuff); } free(elem); return ncclSuccess; } prev = elem; elem = elem->next; } if (op) { WARN("Attempting to dequeue nonexistent async opId=%p", op->opId); } else { WARN("Attempting to dequeue null operation"); } return ncclInternalError; } static ncclResult_t allocateArgs(struct ncclProxyProgressState* state, struct ncclProxyArgs** argsptr) { struct ncclProxyArgs* elem; if (state->pool == NULL) { // Allocate a new pool of elements. Make sure we allocate the memory close // to the network thread struct ncclProxyPool* newPool; NCCLCHECK(ncclCalloc(&newPool, 1)); struct ncclProxyArgs* newElems = newPool->elems; // Chain newly allocated elements for (int i=0; ipool = newElems; // Save the pool memory block for later resource release newPool->next = state->pools; state->pools = newPool; } elem = state->pool; state->pool = state->pool->next; elem->next = elem->nextPeer = NULL; *argsptr = elem; return ncclSuccess; } //#define DEBUG_PROXY 1 #ifdef DEBUG_PROXY #define DEBUG_PROXY_PRINT printf #else #define DEBUG_PROXY_PRINT(...) #endif #define OP_INDEX(op) ((op) ? (op)-state->pools->elems : -1) #define OP_SEEN 0x100000 ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState* state, int* poolIndex, int* opIndex) { struct ncclProxyPool* pool = state->pools; int p = 0; while (pool) { uint64_t o = op-pool->elems; if (o < PROXYARGS_ALLOCATE_SIZE) { *opIndex = o; *poolIndex = p; return ncclSuccess; } pool = pool->next; p++; } WARN("Could not find pool of op %p", op); return ncclInternalError; } ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex) { printf("[%d-%d|%ld| %s", poolIndex, opIndex, op->opCount, op->pattern == ncclPatternSend ? "Send" : op->pattern == ncclPatternRecv ? 
"Recv" : "Coll"); for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = op->subs+s; if (op->state == ncclProxyOpProgress) { char status = ' '; if (op->pattern == ncclPatternRecv) { if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init else if (sub->received < sub->posted) status = 'R'; // Receiving else if (sub->received < sub->transmitted) status = 'R'; // Receiving else if (sub->transmitted < sub->received) status = 'F'; // Flushing else if (sub->done < sub->transmitted) status = 'G'; // Waiting on GPU else status = 'D'; // Done } else if (op->pattern == ncclPatternSend) { if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init else if (sub->transmitted < sub->posted) status = 'G'; // Waiting on GPU else if (sub->done < sub->transmitted) status = 'S'; // Sending else status = 'D'; // Done } printf(" %d%c/%d", sub->peer, status, sub->channelId); } else { printf(" %d/%d", sub->peer, sub->channelId); } } printf("]"); return ncclSuccess; } ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) { struct ncclProxyArgs* op = state->active; int poolIndex, opIndex; printf("ACTIVE OPS\n"); while (op) { NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex)); if (op->state & OP_SEEN) { WARN("List loop at element %d-%d", poolIndex, opIndex); } NCCLCHECK(printProxyOp(op, poolIndex, opIndex)); op->state |= OP_SEEN; printf("\n"); struct ncclProxyArgs* nextOp = op->nextPeer; while (nextOp) { NCCLCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex)); if (nextOp->state & OP_SEEN) { WARN("List loop at element %d-%d", poolIndex, opIndex); } printf("| `-> "); NCCLCHECK(printProxyOp(nextOp, poolIndex, opIndex)); nextOp->state |= OP_SEEN; printf("\n"); if (nextOp->next) { WARN("Inactive op has next set!"); } nextOp = nextOp->nextPeer; } if (op->nextPeer == NULL) printf("|\n"); op = op->next; printf("v\n"); } printf("[X]\n"); # if 0 printf("FREE OPS\n"); op = state->pool; while (op) { NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex)); if (op->state & OP_SEEN) { WARN("List loop at element %d-%d", poolIndex, opIndex); } NCCLCHECK(printProxyOp(op, poolIndex, opIndex)); op->state |= OP_SEEN; printf("->"); op = op->next; } printf("[X]\n"); #else op = state->pool; while (op) { NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex)); if (op->state & OP_SEEN) { WARN("List loop at element %d-%d", poolIndex, opIndex); } op->state |= OP_SEEN; op = op->next; } #endif struct ncclProxyPool* pool = state->pools; poolIndex = 0; while (pool) { struct ncclProxyArgs* elem = pool->elems; for (int e=0; estate & OP_SEEN) == 0) { printf("Elem %d-%d is not in any list:\n", poolIndex, e); NCCLCHECK(printProxyOp(elem, poolIndex, e)); printf("\n"); } else { elem->state -= OP_SEEN; } } pool = pool->next; poolIndex++; } return ncclSuccess; } static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyArgs* args, int subIndex) { struct ncclProxySubArgs* sub = args->subs+subIndex; if (subIndex >= NCCL_PROXY_MAX_SUBS) { WARN("Proxy append out of bounds"); return ncclInternalError; } //memset(sub, 0, sizeof(struct ncclProxySubArgs)); sub->connection = op->connection; sub->channelId = op->channelId; sub->nsteps = op->nsteps; sub->nbytes = op->nbytes; sub->offset = 0; sub->peer = op->root; sub->reg = op->reg; sub->sendMhandle = op->sendMhandle; sub->recvMhandle = op->recvMhandle; sub->sendbuff = op->sendbuff; sub->recvbuff = op->recvbuff; args->nsubs = subIndex+1; if (subIndex) { if ((args->sliceSteps != op->sliceSteps) || 
(args->chunkSteps != op->chunkSteps) || (args->protocol != op->protocol) || (args->dtype != op->dtype) || (args->redOp != op->redOp) || (args->coll != op->coll)) { WARN("Proxy append mismatch"); return ncclInternalError; } if (args->state != ncclProxyOpReady) { WARN("Proxy append on running operation"); return ncclInternalError; } return ncclSuccess; } //memset(&args->progress, 0, sizeof(struct ncclProxyArgs)-offsetof(struct ncclProxyArgs, progress)); args->done = 0; args->opCount = op->opCount; args->sliceSteps = op->sliceSteps; args->chunkSteps = op->chunkSteps; args->chunkSize = op->chunkSize; args->dtype = op->dtype; args->redOp = op->redOp; args->pattern = op->pattern; args->protocol = op->protocol; args->coll = op->coll; args->specifics = op->specifics; args->state = ncclProxyOpReady; args->progress = op->connection->tcomm->proxyProgress; args->proxyAppendPtr = op->connection->proxyAppendPtr; return ncclSuccess; } static ncclResult_t ProxyAppend(struct ncclProxyProgressState* state, struct ncclProxyOp* op) { struct ncclProxyConnection* connection = op->connection; int shared = connection->shared; struct ncclProxyArgs* args = *connection->proxyAppendPtr; if (args) { if (shared && args->opCount == op->opCount) { NCCLCHECK(ncclProxyOpToArgs(op, args, args->nsubs)); DEBUG_PROXY_PRINT("Insert (%d/%5ld/%5ld) as group with %5ld\n", shared, args->opCount, op->opCount, OP_INDEX(args)); } else { struct ncclProxyArgs* prevArgs = args; NCCLCHECK(allocateArgs(state, &args)); NCCLCHECK(ncclProxyOpToArgs(op, args, 0)); prevArgs->nextPeer = args; DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, prevArgs->opCount, args->opCount, OP_INDEX(prevArgs)); *(args->proxyAppendPtr) = args; } } else { // Nothing running for that peer. 
Add to the list NCCLCHECK(allocateArgs(state, &args)); NCCLCHECK(ncclProxyOpToArgs(op, args, 0)); if (state->active == NULL) { // Create the list DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element\n", OP_INDEX(args), shared, args->opCount); state->active = args; } else { // Append element at the end of the list struct ncclProxyArgs* last = state->active; while (last->next) last = last->next; last->next = args; DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args), shared, args->opCount); } *(args->proxyAppendPtr) = args; } return ncclSuccess; } ncclResult_t ncclProxyPost(struct ncclProxyOpsPool* pool, int nextOps, int nextOpsEnd) { pthread_mutex_lock(&pool->mutex); if (pool->nextOps == -1) { pool->nextOps = nextOps; pthread_cond_signal(&pool->cond); } else { pool->ops[pool->nextOpsEnd].next = nextOps; } pool->nextOpsEnd = nextOpsEnd; pthread_mutex_unlock(&pool->mutex); return ncclSuccess; } static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, struct ncclProxyOp* proxyOp) { int tpLocalRank = comm->topParentLocalRanks[comm->localRank]; struct ncclProxyOps* proxyOps = comm->proxyState->proxyOps; if (proxyOps == NULL) return ncclInternalError; proxyOps += proxyConn->tpLocalRank; struct ncclProxyOpsPool* pool = proxyOps->pool; TIME_START(0); int opIndex = proxyOps->freeOp; struct ncclProxyOp* op; if (opIndex != -1) { op = pool->ops+opIndex; proxyOps->freeOp = op->next; } else { int freeOp; while ((freeOp = pool->freeOps[tpLocalRank]) == -1) sched_yield(); int freeOpNew; while ((freeOpNew = __sync_val_compare_and_swap(pool->freeOps+tpLocalRank, freeOp, -1)) != freeOp) freeOp = freeOpNew; opIndex = freeOp; op = pool->ops+opIndex; proxyOps->freeOp = op->next; } if (op->next != -1) __builtin_prefetch(pool->ops+op->next); // Prefetch next free op memcpy(op, proxyOp, sizeof(struct ncclProxyOp)); op->next = -1; op->connection = proxyConn->connection; if (proxyOps->nextOps == -1) { proxyOps->nextOps = proxyOps->nextOpsEnd = opIndex; } else { pool->ops[proxyOps->nextOpsEnd].next = opIndex; proxyOps->nextOpsEnd = opIndex; } if (++proxyOps->count == MAX_OPS_PER_PEER) { // Post what we have so far to free some ops in the pool // Do not post last operations as we could have more coming with the same opCount, and posting // them in different batches would break proxyArgs aggregation with subs. uint64_t lastOpCount = pool->ops[proxyOps->nextOpsEnd].opCount; int lastOp = -1; int toSend = 0; int ops = 0; for (int op= proxyOps->nextOps; op != proxyOps->nextOpsEnd; op=pool->ops[op].next) { ops++; if (pool->ops[op].opCount != lastOpCount) { lastOp = op; toSend = ops; } } if (lastOp == -1) { WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount); return ncclInternalError; } // Cut chain at lastOp int nextOps = proxyOps->nextOps; proxyOps->nextOps = pool->ops[lastOp].next; pool->ops[lastOp].next = -1; NCCLCHECK(ncclProxyPost(proxyOps->pool, nextOps, lastOp)); proxyOps->count -= toSend; } TIME_STOP(0); return ncclSuccess; } static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) { if (peer < 0) return ncclSuccess; struct ncclChannelPeer* peerComm = channel->peers[peer]; struct ncclConnector* connector = type == proxyRecv ? 
peerComm->recv+connIndex : peerComm->send+connIndex; if (connector->transportComm == NULL) { WARN("Rank %d has no transport for %s peer %d on channel %d/%d", comm->rank, type == proxyRecv ? "recv" : "send", peer, channel->id, connIndex); return ncclInternalError; } if (connector->proxyConn.proxyProgress == NULL) return ncclSuccess; if (justInquire) *justInquire = true; else { NCCLCHECK(ncclLocalOpAppend(comm, &connector->proxyConn, op)); } return ncclSuccess; } // justInquire != nullptr means don't actually do anything, just assertain need of // ncclProxySaveOp for this op. ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) { struct ncclChannel* channel = &comm->channels[op->channelId]; if (justInquire) *justInquire = false; switch (op->pattern) { case ncclPatternRing: case ncclPatternRingTwice: case ncclPatternPipelineFrom: case ncclPatternPipelineTo: { struct ncclRing* ring = &channel->ring; if (NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) { NCCLCHECK(SaveProxy(comm, channel, proxyRecv, ring->prev, op, 0, justInquire)); } if (NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) { NCCLCHECK(SaveProxy(comm, channel, proxySend, ring->next, op, 0, justInquire)); } } break; case ncclPatternTreeUp: case ncclPatternTreeDown: case ncclPatternTreeUpDown: { if (op->pattern != ncclPatternTreeDown) { // Tree up struct ncclTree* tree = &channel->tree; for (int i=0; idown[i], op, 0, justInquire)); } NCCLCHECK(SaveProxy(comm, channel, proxySend, tree->up, op, 0, justInquire)); } if (op->pattern != ncclPatternTreeUp) { // Tree down struct ncclTree* tree = &channel->tree; for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) { NCCLCHECK(SaveProxy(comm, channel, proxySend, tree->down[i], op, 0, justInquire)); } NCCLCHECK(SaveProxy(comm, channel, proxyRecv, tree->up, op, 0, justInquire)); } } break; case ncclPatternCollnetChain: { NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->collnetChain.up, op, 1, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetChain.up, op, 0, justInquire)); } break; case ncclPatternCollnetDirect: { NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->collnetDirect.out, op, 1, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetDirect.out, op, 0, justInquire)); } break; case ncclPatternNvls: { NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.out, op, 1, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.out, op, 0, justInquire)); } break; case ncclPatternNvlsTree: { NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[1], op, 0, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[2], op, 0, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeUp, op, 0, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[1], op, 0, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[2], op, 0, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeUp, op, 0, justInquire)); } break; case ncclPatternSend: case ncclPatternRecv: { if (op->root == comm->rank) return ncclSuccess; NCCLCHECK(SaveProxy(comm, channel, op->pattern == ncclPatternSend ? 
proxySend : proxyRecv, op->root, op, 1, justInquire)); } break; } return ncclSuccess; } static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) { struct ncclProxyArgs* freeOp = *opPtr; struct ncclProxyArgs* next = freeOp->next; DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(next)); *opPtr = next; if (freeOp->nextPeer) { // replace op by nextPeer struct ncclProxyArgs* nextPeer = freeOp->nextPeer; if (*prevOpPtr) { (*prevOpPtr)->next = nextPeer; } else { state->active = nextPeer; } nextPeer->next = next; *(prevOpPtr) = nextPeer; } else { *(freeOp->proxyAppendPtr) = NULL; if (*prevOpPtr) { (*prevOpPtr)->next = next; } else { state->active = next; } } freeOp->next = state->pool; state->pool = freeOp; DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr)); #ifdef DEBUG_PROXY NCCLCHECK(dumpProxyState(state)); #endif return ncclSuccess; } static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) { struct ncclProxyArgs* prevOp = NULL; struct ncclProxyArgs* op = opStart; while (op) { if (op->state == ncclProxyOpNone) return ncclInternalError; TIME_START(0); TIME_START(1); NCCLCHECK(op->progress(proxyState, op)); if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); } *idle &= op->idle; if (op->state == ncclProxyOpNone) { TIME_START(2); NCCLCHECK(removeOp(state, &op, &prevOp)); TIME_STOP(2); } else { prevOp = op; op = op->next; } } return ncclSuccess; } NCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16); static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int* added) { struct ncclProxyProgressState* state = &proxyState->progressState; if (state->opsPool == NULL) return ncclInternalError; struct ncclProxyOpsPool* pool = state->opsPool; struct ncclProxyArgs profArgs; // Only used for profiling purposes if (state->nextOps != -1) goto process_nextops; // If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock // to be available. Exit, continue progress, and come back later. if (state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0)) return ncclSuccess; if (state->active == NULL) { pthread_mutex_lock(&pool->mutex); while (pool->nextOps == -1 && !state->stop) { struct ncclProxyArgs profArgs; // Only used for profiling purposes ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep); pthread_cond_wait(&pool->cond, &pool->mutex); ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup); } if (state->stop) { // We might have been woken up to stop. 
pthread_mutex_unlock(&pool->mutex); return ncclSuccess; } } state->nextOps = pool->nextOps; pool->nextOps = pool->nextOpsEnd = -1; pthread_mutex_unlock(&pool->mutex); if (state->nextOps == -1) return ncclInternalError; process_nextops: ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppend); TIME_START(2); int freeOp[NCCL_MAX_LOCAL_RANKS]; int freeOpEnd[NCCL_MAX_LOCAL_RANKS]; for (int i = 0; i < proxyState->tpLocalnRanks; i++) freeOp[i] = -1; uint64_t lastOpCount = 0; int lastPeer = -1; int count = 0; for (int opIndex = state->nextOps; opIndex != -1;) { struct ncclProxyOp* peerOp = pool->ops+opIndex; int peer = opIndex / MAX_OPS_PER_PEER; if ((lastOpCount && peerOp->opCount != lastOpCount) || ((lastPeer != -1) && peer != lastPeer)) count++; if (count == ncclParamProxyAppendBatchSize()+1) break; lastOpCount = peerOp->opCount; lastPeer = peer; if (peerOp->connection == NULL) return ncclInternalError; if (peerOp->next != -1) __builtin_prefetch(pool->ops+peerOp->next); NCCLCHECK(ProxyAppend(state, peerOp)); (*added)++; int lastOpIndex = opIndex; opIndex = peerOp->next; // Return op to peer pool if (freeOp[peer] == -1) { freeOpEnd[peer] = lastOpIndex; } else { peerOp->next = freeOp[peer]; } freeOp[peer] = lastOpIndex; state->nextOps = opIndex; } for (int i = 0; i < proxyState->tpLocalnRanks; i++) { if (freeOp[i] == -1) continue; int newFree = freeOp[i]; int oldFree = pool->freeOps[i]; pool->ops[freeOpEnd[i]].next = oldFree; if (oldFree == -1) { // Nothing for the main thread to consume, we can set it. pool->freeOps[i] = newFree; } else { // The main thread may recycle free ops at any time, replace the freeOps value atomically and check it worked. int swap = __sync_val_compare_and_swap(pool->freeOps+i, oldFree, newFree); if (swap != oldFree) { if (swap != -1) return ncclInternalError; // Ops were recycled while we were trying to swap, just set the value directly now. 
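// The block above hands a batch of ops back to a per-peer free list that the owning rank may
// concurrently empty (by swapping the head to -1). A minimal sketch of that single-producer,
// single-consumer hand-off using GCC atomic builtins; sketchNode and pushBatch are hypothetical
// names, and the sketch assumes the consumer only ever replaces the head with -1.
#if 0 // illustrative sketch only
struct sketchNode { int next; };

// head: shared list head (index into nodes[], -1 == empty).
// first/last: a batch already linked first -> ... -> last, to be published as the new head.
static void pushBatch(int* head, struct sketchNode* nodes, int first, int last) {
  int oldHead = __atomic_load_n(head, __ATOMIC_ACQUIRE);
  nodes[last].next = oldHead;
  if (oldHead == -1) {
    // The list was empty; the consumer will not touch it until we publish, so a plain store works.
    __atomic_store_n(head, first, __ATOMIC_RELEASE);
  } else if (!__atomic_compare_exchange_n(head, &oldHead, first, /*weak=*/0,
                                          __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) {
    // The consumer recycled the list (set it to -1) between our load and the CAS;
    // terminate our batch and publish it as the new list.
    nodes[last].next = -1;
    __atomic_store_n(head, first, __ATOMIC_RELEASE);
  }
}
#endif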
pool->ops[freeOpEnd[i]].next = -1; pool->freeOps[i] = newFree; } } } profArgs.opCount = *added; ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppendEnd); TIME_STOP(2); return ncclSuccess; } #include static ncclProxyProgressState* ncclLastProxyState; void ncclDumpProxyState(int signal) { dumpProxyState(ncclLastProxyState); } NCCL_PARAM(CreateThreadContext, "CREATE_THREAD_CONTEXT", 0); static int setProxyThreadContext(struct ncclProxyState* proxyState) { #if CUDART_VERSION >= 11030 static int createThreadContext = -1; if (createThreadContext == -1) { createThreadContext = ncclParamCreateThreadContext(); if (createThreadContext) { if (CUPFN(cuCtxCreate) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) { WARN("Unable to create thread context due to old driver, disabling."); createThreadContext = 0; } } } if (createThreadContext) { if (proxyState->cudaCtx == NULL) { if (CUPFN(cuCtxCreate(&proxyState->cudaCtx, NULL, 0, CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) { WARN("Failed to create CUDA context on device %d", proxyState->cudaDev); createThreadContext = 0; } } else { if (CUPFN(cuCtxSetCurrent(proxyState->cudaCtx)) != CUDA_SUCCESS) { WARN("Failed to set CUDA context on device %d", proxyState->cudaDev); return 0; } return 1; } } #endif return 0; } // Set to SIGUSR1 or SIGUSR2 to help debug proxy state during hangs NCCL_PARAM(ProxyDumpSignal, "PROXY_DUMP_SIGNAL", -1); NCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8); void* ncclProxyProgress(void *proxyState_) { struct ncclProxyState* proxyState = (struct ncclProxyState*)proxyState_; if (setProxyThreadContext(proxyState)) { INFO(NCCL_INIT, "[Proxy Progress] Created CUDA context on device %d", proxyState->cudaDev); } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev); } // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); struct ncclProxyProgressState* state = &proxyState->progressState; state->nextOps = -1; const int sig = ncclParamProxyDumpSignal(); if (sig != -1) signal(sig, ncclDumpProxyState); ncclLastProxyState = state; char threadName[NCCL_THREAD_NAMELEN]; snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", proxyState->cudaDev); nvtxNameOsThreadA(syscall(SYS_gettid), threadName); int lastIdle = 0; /* Too frequent call of ncclProxyGetPostedOps() will result in perf regression for small message * communication. proxyOpAppendCounter is a counter that helps us decide if we need to append proxy ops. * After each progress, proxyOpAppendCounter will increase by 1 and compare with environment variable * ncclParamProgressAppendOpFreq(). If they are equal, we will append proxy ops. This will decrease the * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. 
*/ int proxyOpAppendCounter = 0; struct ncclProxyArgs profArgs; // Only used for profiling purposes while ((state->stop == 0 || (state->stop == 1 && state->active)) && __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) == 0) { int idle = 1; ncclResult_t ret = progressOps(proxyState, state, state->active, &idle); if (ret != ncclSuccess) { __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE); INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret); continue; } if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle); if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive); if (idle || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) { int added = 0; proxyOpAppendCounter = 0; TIME_START(3); if (state->stop == 0) ret = ncclProxyGetPostedOps(proxyState, &added); if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); } if (ret != ncclSuccess) { __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE); INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret); } if (added == 0) { sched_yield(); // No request progressed. Let others run. } } lastIdle = idle; } return NULL; } ncclResult_t ncclProxyStart(struct ncclComm* comm) { struct ncclProxyOps* proxyOps = comm->proxyState->proxyOps; if (proxyOps == NULL) return ncclSuccess; TIME_START(1); for (int r = 0; r < comm->sharedRes->tpNLocalRanks; r++) { struct ncclProxyOps* ops = proxyOps + r; if (ops->pool == NULL || ops->nextOps == -1) continue; NCCLCHECK(ncclProxyPost(ops->pool, ops->nextOps, ops->nextOpsEnd)); ops->nextOps = ops->nextOpsEnd = -1; ops->count = 0; } comm->opCount++; TIME_STOP(1); return ncclSuccess; } static ncclResult_t ncclProxyProgressCreate(struct ncclProxyState* proxyState) { struct ncclProxyProgressState* state = &proxyState->progressState; if (!state->thread) { pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState); ncclSetThreadName(state->thread, "NCCL Progress%2d", proxyState->tpLocalnRanks); } return ncclSuccess; } ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) { struct ncclProxyProgressState* state = &proxyState->progressState; // Request the proxy to stop and then wake it if (state->opsPool) { pthread_mutex_lock(&state->opsPool->mutex); state->stop = 1; pthread_cond_signal(&state->opsPool->cond); pthread_mutex_unlock(&state->opsPool->mutex); pthread_join(state->thread, NULL); } // Free off any memory allocated for the proxy arg pools while (state->pools != NULL) { struct ncclProxyPool *next = state->pools->next; free(state->pools); state->pools = next; } ncclProfilingDump(); TIME_PRINT("Proxy"); return ncclSuccess; } #define NCCL_PROXY_CONN_POOL_SIZE_POW2 7 #define NCCL_PROXY_CONN_POOL_SIZE (1<<(NCCL_PROXY_CONN_POOL_SIZE_POW2)) #define NCCL_PROXY_CONN_POOL_MASK ((NCCL_PROXY_CONN_POOL_SIZE)-1) struct ncclProxyConnectionPool { struct ncclProxyConnection** pools; int banks; int offset; }; static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) { if (pool->offset == NCCL_PROXY_CONN_POOL_SIZE) { NCCLCHECK(ncclRealloc(&pool->pools, pool->banks, pool->banks+1)); NCCLCHECK(ncclCalloc(pool->pools+pool->banks, NCCL_PROXY_CONN_POOL_SIZE)); pool->banks++; pool->offset = 0; } *id = ((pool->banks-1) << NCCL_PROXY_CONN_POOL_SIZE_POW2) + pool->offset; pool->offset++; return ncclSuccess; } static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool, int id, struct ncclProxyConnection** conn) { int bank = 
id>>NCCL_PROXY_CONN_POOL_SIZE_POW2; int offset = id&NCCL_PROXY_CONN_POOL_MASK; if ((pool->pools == NULL) || (bank > pool->banks) || (pool->pools[bank] == NULL)) return ncclInternalError; *conn = pool->pools[bank]+offset; return ncclSuccess; } static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { if (connection->send) { if (ncclTransports[connection->transport]->send.proxyFree) { NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, proxyState)); } } else { if (ncclTransports[connection->transport]->recv.proxyFree) { NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, proxyState)); } } return ncclSuccess; } static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* pool, struct ncclProxyState* proxyState) { for (int b=0; bbanks; b++) { int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE; for (int i=0; ipools[b]+i; if (connection->state != connUninitialized) { NCCLCHECK(proxyFree(connection, proxyState)); } } free(pool->pools[b]); } free(pool->pools); return ncclSuccess; } #include "transport.h" struct ncclProxyInitReq { int transport; int send; int tpLocalRank; int tpRank; int sameProcess; }; struct ncclProxyInitResp { ncclProxyConnection* connection; char devShmPath[6]; // "XXXXXX" - May or may not be set }; ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int tpProxyRank, struct ncclProxyConnector* proxyConn) { struct ncclSocket* sock; int ready, proxyRank = -1; struct ncclProxyState* sharedProxyState = comm->proxyState; // Keep one connection per local rank for (int i = 0; i < comm->localRanks; ++i) { /* find the proxy rank in comm. */ if (comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) { proxyRank = comm->localRankToRank[i]; break; } } proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0; // Keep one connection per local rank proxyConn->connection = NULL; proxyConn->tpRank = tpProxyRank; if (sharedProxyState->peerSocks == NULL) { NCCLCHECK(ncclCalloc(&sharedProxyState->peerSocks, comm->sharedRes->tpNLocalRanks)); NCCLCHECK(ncclCalloc(&sharedProxyState->proxyOps, comm->sharedRes->tpNLocalRanks)); NCCLCHECK(ncclCalloc(&sharedProxyState->sharedDevMems, comm->sharedRes->tpNLocalRanks)); for (int i = 0; i < comm->sharedRes->tpNLocalRanks; ++i) { NCCLCHECK(ncclSocketSetFd(-1, &sharedProxyState->peerSocks[i])); } } proxyConn->tpLocalRank = comm->sharedRes->tpRankToLocalRank[proxyConn->tpRank]; sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank; NCCLCHECK(ncclSocketReady(sock, &ready)); if (!ready) { NCCLCHECK(ncclSocketInit(sock, sharedProxyState->peerAddresses+proxyConn->tpRank, comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag)); NCCLCHECK(ncclSocketConnect(sock)); } struct ncclProxyInitReq req = {0}; req.transport = transport; req.send = send; req.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; req.tpRank = comm->topParentRanks[comm->rank]; req.sameProcess = proxyConn->sameProcess; struct ncclProxyInitResp resp = {0}; // This usually sends proxyConn->connection to identify which connection this is. // However, this is part of the response and therefore is ignored NCCLCHECK(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgInit, &req, sizeof(req), &resp, sizeof(resp))); proxyConn->connection = resp.connection; // If we need proxy progress, map progress ops struct ncclTransportComm* tcomm = send ? 
&ncclTransports[transport]->send : &ncclTransports[transport]->recv; if (tcomm->proxyProgress) { char poolPath[] = "/dev/shm/nccl-XXXXXX"; strncpy(poolPath+sizeof("/dev/shm/nccl-")-1, resp.devShmPath, sizeof("XXXXXX")-1); struct ncclProxyOps* proxyOps = sharedProxyState->proxyOps + proxyConn->tpLocalRank; if (proxyOps->pool == NULL) { NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle)); proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1; } } INFO(NCCL_NET|NCCL_PROXY, "Connected to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection); return ncclSuccess; } // UDS support ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, int tpRank, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int *respFd) { ncclResult_t res = ncclSuccess; struct ncclIpcSocket ipcSock = { 0 }; void *opId; NCCLCHECK(getRandomData(&opId, sizeof(opId))); int rank = comm->topParentLocalRanks[comm->localRank]; struct ncclProxyState* sharedProxyState = comm->proxyState; uint64_t pidHash = sharedProxyState->peerAddressesUDS[tpRank]; INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %p opId %p", comm, rank, tpRank, pidHash, reqSize, respSize, respFd, opId); // cuMem: Create a UDS socket to receive the response NCCLCHECK(ncclIpcSocketInit(&ipcSock, rank, (uint64_t)opId, comm->abortFlag)); ncclIpcHdr hdr; hdr.type = type; hdr.rank = rank; hdr.reqSize = reqSize; hdr.respSize = respSize; hdr.opId = opId; assert(reqSize <= sizeof(hdr.data)); memcpy(&hdr.data, reqBuff, reqSize); NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), -1, tpRank, pidHash), res, error); NCCLCHECKGOTO(ncclIpcSocketRecvMsg(&ipcSock, respBuff, respSize, respFd), res, error); NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), res, error); INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %d opId %p - DONE", comm, rank, tpRank, pidHash, reqSize, respSize, (respFd ? 
*respFd : -1), opId); return res; error: NCCLCHECK(ncclIpcSocketClose(&ipcSock)); WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", tpRank, pidHash, res); return res; } // cuMem API support // The request/response is sent out-of-band using ncclIpcSocket for this specific command ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int tpRank, void *handle, int* convertedFd) { ncclResult_t ret = ncclSuccess; // Request the allocation of a UDS fd for the handle NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, tpRank, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, convertedFd), ret, error); // We have now received the converted fd over UDS INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d", *(uint64_t*)handle, tpRank, *convertedFd); return ret; error: WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", tpRank, *(uint64_t*)handle, ret); return ret; } const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" }; ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) { struct ncclSocket* sock; ncclResult_t ret = ncclSuccess; struct ncclProxyState* sharedProxyState = comm->proxyState; if (sharedProxyState->peerSocks == NULL) return ncclInternalError; sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank; if (sock == NULL) return ncclInternalError; NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error); if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error); // Send opId to proxy NCCLCHECKGOTO(ncclSocketSend(sock, &opId, sizeof(opId)), ret, error); // Add proxyOp to expected response queue NCCLCHECK(expectedProxyResponseEnqueue(sharedProxyState, opId, respSize)); return ncclSuccess; error: return ret; } ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) { struct ncclProxyState* sharedProxyState = comm->proxyState; // Receive the connection pointer from the Proxy if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { WARN("Comm %p is in abort state", comm); return ncclInternalError; } if (sharedProxyState->peerSocks == NULL) return ncclInternalError; // Check response queue int found = 0; ncclResult_t res = expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found); if (found == 0) { // Attempt to read in a new response header from the proxy thread struct ncclSocket* sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank; ncclProxyRpcResponseHeader resp = {0}; int offset = 0; if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &resp, sizeof(resp), &offset)) { WARN("Socket recv failed while polling for opId=%p", opId); return ncclInternalError; } if (offset == 0) { return ncclInProgress; // If we've returned a partial response, block to receive the rest of it } else if (offset < sizeof(resp)) { while (offset < sizeof(resp)) NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &resp, sizeof(resp), &offset)); } INFO(NCCL_PROXY, "ncclPollProxyResponse Received new opId=%p", resp.opId); // If there's a respSize to recv if (resp.respSize > 0) { if (resp.opId != 
opId) { // Unexpected response, need to buffer the socket data respBuff = malloc(resp.respSize); } assert(respBuff != NULL); NCCLCHECK(ncclSocketRecv(sock, respBuff, resp.respSize)); } if (resp.opId == opId) { INFO(NCCL_PROXY, "resp.opId=%p matches expected opId=%p", resp.opId, opId); NCCLCHECK(expectedProxyResponseRemove(sharedProxyState, resp.opId)); return resp.res; } else { INFO(NCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", resp.opId, respBuff, resp.respSize); // Store the result and mark response as completed NCCLCHECK(expectedProxyResponseStore(sharedProxyState, resp.opId, respBuff, resp.respSize, resp.res)); return ncclInProgress; } } else { INFO(NCCL_PROXY, "ncclPollProxyResponse Dequeued cached opId=%p", opId); } return res; } ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) { // Alloc some memory to act as a handle ncclResult_t res = ncclSuccess; void* opId = malloc(1); NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, type, reqBuff, reqSize, respSize, opId), res, fail); do { res = ncclPollProxyResponse(comm, proxyConn, respBuff, opId); } while (res == ncclInProgress); exit: free(opId); return res; fail: goto exit; } static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) { struct ncclProxyProgressState* state = &proxyState->progressState; if (state->opsPool == NULL) { int size = sizeof(struct ncclProxyOpsPool); struct ncclProxyOpsPool* pool = NULL; char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; shmPath[0] = '\0'; NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks, &state->handle)); // Init pool pool->nextOps = -1; for (int r = 0; r < proxyState->tpLocalnRanks; r++) { pool->freeOps[r] = r*MAX_OPS_PER_PEER; for (int i=0; iops[r*MAX_OPS_PER_PEER+i].next = r*MAX_OPS_PER_PEER+i+1; pool->ops[(r+1)*MAX_OPS_PER_PEER-1].next = -1; } // Setup mutex/cond to work inter-process pthread_mutexattr_t mutexAttr; pthread_mutexattr_init(&mutexAttr); pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED); pthread_mutex_init(&pool->mutex, &mutexAttr); pthread_condattr_t condAttr; pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED); pthread_cond_init(&pool->cond, &condAttr); state->opsPool = pool; memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1); // All ops structures are created, we can start the progress thread NCCLCHECK(ncclProxyProgressCreate(proxyState)); } return ncclSuccess; } static void proxyOpsFree(struct ncclProxyState* proxyState) { struct ncclProxyProgressState* state = &proxyState->progressState; if (ncclShmClose(state->handle) != ncclSuccess) { WARN("[Service thread] shm close failed"); } } ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) { struct ncclProxyProgressState* state = &comm->proxyState->progressState; if (state->opsPool == NULL) return ncclSuccess; if (ncclShmUnlink(state->handle) != ncclSuccess) { WARN("[Service thread] proxy ops shm unlink failed"); } return ncclSuccess; } static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclProxyState* proxyState, ncclProxyInitReq* req, ncclProxyInitResp* resp, struct ncclProxyConnection** connection) { int id; NCCLCHECK(ncclProxyNewConnection(connectionPool, &id)); NCCLCHECK(ncclProxyGetConnection(connectionPool, id, connection)); (*connection)->sock = &peer->sock; (*connection)->transport = req->transport; (*connection)->send = 
req->send; (*connection)->tpLocalRank = req->tpLocalRank; (*connection)->sameProcess = req->sameProcess; peer->tpLocalRank = req->tpLocalRank; peer->tpRank = req->tpRank; resp->connection = *connection; (*connection)->tcomm = (*connection)->send ? &ncclTransports[(*connection)->transport]->send : &ncclTransports[(*connection)->transport]->recv; // If we need proxy progress, let's allocate ops and start the thread if ((*connection)->tcomm->proxyProgress) { NCCLCHECK(proxyProgressInit(proxyState)); struct ncclProxyProgressState* state = &proxyState->progressState; strncpy(resp->devShmPath, state->opsPoolShmSuffix, sizeof(resp->devShmPath)); } INFO(NCCL_NET|NCCL_PROXY, "New proxy %s connection %d from local rank %d, transport %d", (*connection)->send ? "send":"recv", id, (*connection)->tpLocalRank, (*connection)->transport); __atomic_store_n(&(*connection)->state, connInitialized, __ATOMIC_RELEASE); return ncclSuccess; } // cuMem API support static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void *opId, uint64_t handle) { #if CUDART_VERSION >= 11030 // cuMem API support ncclResult_t ret = ncclSuccess; struct ncclIpcSocket ipcSock = { 0 }; uint64_t hash = (uint64_t) opId; INFO(NCCL_PROXY, "UDS proxyGetFd received handle 0x%lx peer %d opId %lx", handle, rank, hash); CUmemAllocationHandleType type = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; int fd = -1; CUCHECK(cuMemExportToShareableHandle(&fd, handle, type, 0)); // Send back the converted fd using UDS NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, error); NCCLCHECKGOTO(ncclIpcSocketSendFd(&ipcSock, fd, rank, hash), ret, error); error: NCCLCHECK(ncclIpcSocketClose(&ipcSock)); // We can now safely close the exported fd (void) close(fd); return ret; #else return ncclInternalError; #endif } static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclProxyState* proxyState, int* asyncOpCount, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool) { int done = 1; ncclResult_t res = ncclInternalError; if (op->type == ncclProxyMsgSetup) { TRACE(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId); res = op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done); } else if (op->type == ncclProxyMsgConnect) { TRACE(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff); res = op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done); } else if (op->type == ncclProxyMsgSharedInit) { int nChannels = (int) *op->reqBuff; TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels); if (op->connection->tcomm->proxySharedInit) res = op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels); __atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE); } else if (op->type == ncclProxyMsgInit) { TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff); res = proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection); } else if (op->type == ncclProxyMsgRegister) { TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgRegister opId=%p op.reqBuff=%p, op->reqSize=%d, op->respSize=%d", op->opId, op->reqBuff, op->reqSize, op->respSize); res = 
op->connection->tcomm->proxyRegister(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done); } else if (op->type == ncclProxyMsgDeregister) { TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgDeregister opId=%p op.reqBuff=%p, op->reqSize=%d, op->respSize=%d", op->opId, op->reqBuff, op->reqSize, op->respSize); res = op->connection->tcomm->proxyDeregister(op->connection, proxyState, op->reqBuff, op->reqSize, &done); } else return ncclInternalError; if (done) { INFO(NCCL_PROXY, "proxyProgressAsync opId=%p op.type=%d op.reqBuff=%p op.respSize=%d done", op->opId, op->type, op->reqBuff, op->respSize); if (op->type == ncclProxyMsgSetup) __atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE); else if (op->type == ncclProxyMsgConnect) __atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE); /* if setup or connect is done, we should not return any error at this point since * ncclSocketSend might already send the respBuff to the requester. If we still choose * to abort and close the connection, it can cause segfault if the requester is using * the respBuff. */ ncclProxyRpcResponseHeader resp = {op->opId, res, op->respSize}; // Send the opId for referencing async operation NCCLCHECK(ncclSocketSend(op->connection->sock, &resp, sizeof(resp))); if (op->respSize) { // Send the response NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize)); } asyncProxyOpDequeue(peer, op); (*asyncOpCount)--; return ncclSuccess; } else if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) { return ncclInternalError; } return ncclInProgress; } static ncclResult_t proxyServiceInitOp(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclProxyState* proxyState, int* asyncOpCount) { struct ncclSocket* sock = &peer->sock; struct ncclProxyAsyncOp* asyncOp; NCCLCHECK(ncclCalloc(&asyncOp, 1)); asyncOp->type = type; NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*))); NCCLCHECK(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int))); NCCLCHECK(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int))); if (asyncOp->reqSize) { NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize)); NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize)); } // Store opId for completion response NCCLCHECK(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId))); if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize)); asyncProxyOpEnqueue(peer, asyncOp); (*asyncOpCount)++; NCCLCHECK(proxyProgressAsync(asyncOp, proxyState, asyncOpCount, peer, connectionPool)); return ncclSuccess; } #include static bool proxyMatchOpType(int type) { switch (type) { case ncclProxyMsgInit: case ncclProxyMsgSharedInit: case ncclProxyMsgSetup: case ncclProxyMsgConnect: case ncclProxyMsgGetFd: case ncclProxyMsgRegister: case ncclProxyMsgDeregister: return true; default: return false; } } void* ncclProxyService(void* _args) { struct ncclProxyState* proxyState = (struct ncclProxyState*) _args; // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); if (setProxyThreadContext(proxyState)) { INFO(NCCL_INIT, "[Proxy Service] Created CUDA context on device %d", proxyState->cudaDev); } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { WARN("[Proxy Service] Failed to set CUDA device %d", proxyState->cudaDev); } // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); // Prepare poll 
descriptor struct ncclProxyConnectionPool connectionPool; connectionPool.pools = NULL; connectionPool.banks = 0; connectionPool.offset = NCCL_PROXY_CONN_POOL_SIZE; struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1]; struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS]; memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS); for (int s=0; slistenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) { WARN("[Proxy Service] Get listenSock fd fails"); return NULL; }; pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN; int maxnpeers = 0; int npeers = 0; int stop = 0; int asyncOpCount = 0; while (stop == 0 || (stop == 1 && npeers > 0)) { /* Even if local comm aborts, we cannot let proxy thread exit if we still have peer * connections. Need to wait until all other related comms call abort and safely exit * together, or we could face segmentation fault. */ if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) stop = 1; /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */ int ret; do { ret = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500); } while (ret < 0 && errno == EINTR); if (ret < 0) { WARN("[Proxy Service] Poll failed: %s", strerror(errno)); return NULL; } if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) { int s = 0; while (s < NCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0) s++; if (s == NCCL_MAX_LOCAL_RANKS) { WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS); return NULL; } if (maxnpeers < s+1) maxnpeers = s+1; if (ncclSocketInit(&peers[s].sock) != ncclSuccess) { WARN("[Service thread] Initialize peers[%d].sock fails", s); return NULL; } if (ncclSocketAccept(&peers[s].sock, proxyState->listenSock) != ncclSuccess) { WARN("[Service thread] Accept failed %s", strerror(errno)); } else { if (ncclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != ncclSuccess) { WARN("[Service thread] Get peers[%d].sock fd fails", s); return NULL; } npeers++; peers[s].tpLocalRank = -1; } } for (int s=0; ssock; int closeConn = 0; int type = 0; ncclResult_t res = ncclSuccess; if (pollfds[s].fd == -1) continue; // Progress all ops for this ncclProxyLocalPeer ncclProxyAsyncOp* op = peer->asyncOps; while (op != nullptr) { ncclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */ type = op->type; res = proxyProgressAsync(op, proxyState, &asyncOpCount, peer, &connectionPool); if (res == ncclSuccess || res == ncclInProgress) { op = opnext; } else { // Res is a bad result closeConn = 1; WARN("[Service thread] Error encountered progressing operation=%s, res=%d, closing connection", ncclProxyMsgTypeStr[type], res); break; } } // Check for additional ops coming in if (pollfds[s].revents & POLLIN) { int closed; res = ncclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/); if (res != ncclSuccess && res != ncclInProgress) { WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed); closeConn = 1; } else if (closed) { INFO(NCCL_INIT|NCCL_NET|NCCL_PROXY, "[Service thread] Connection closed by localRank %d", peer->tpLocalRank); closeConn = 1; } else if (res == ncclSuccess) { // We received something from the sock if (type == ncclProxyMsgStop) { stop = 1; closeConn = 1; } else if (type == ncclProxyMsgClose) { closeConn = 1; } else if (proxyMatchOpType(type)) { res = proxyServiceInitOp(type, peers+s, &connectionPool, proxyState, &asyncOpCount); } else { WARN("[Service thread] Unknown command %d from localRank %d", type, peer->tpLocalRank); 
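// The service loop above multiplexes one listening socket plus up to NCCL_MAX_LOCAL_RANKS peer
// sockets with poll(), using a finite timeout so the abort flag is re-checked regularly. A small
// stand-alone sketch of that structure; sketchServe and SKETCH_MAX_PEERS are hypothetical names
// and the one-byte "command" framing is only illustrative.
#if 0 // illustrative sketch only
#include <poll.h>
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>

#define SKETCH_MAX_PEERS 16

static void sketchServe(int listenFd, volatile int* abortFlag) {
  struct pollfd fds[SKETCH_MAX_PEERS+1];
  for (int s = 0; s < SKETCH_MAX_PEERS; s++) { fds[s].fd = -1; fds[s].events = POLLIN|POLLHUP; }
  fds[SKETCH_MAX_PEERS].fd = listenFd;
  fds[SKETCH_MAX_PEERS].events = POLLIN;
  while (!*abortFlag) {
    int ret;
    // Bounded timeout: never block forever, so the abort flag is observed within ~500ms.
    do { ret = poll(fds, SKETCH_MAX_PEERS+1, 500); } while (ret < 0 && errno == EINTR);
    if (ret < 0) break;
    if (fds[SKETCH_MAX_PEERS].revents & POLLIN) {
      // New peer: place it in the first free slot.
      for (int s = 0; s < SKETCH_MAX_PEERS; s++) {
        if (fds[s].fd == -1) { fds[s].fd = accept(listenFd, NULL, NULL); break; }
      }
    }
    for (int s = 0; s < SKETCH_MAX_PEERS; s++) {
      if (fds[s].fd >= 0 && (fds[s].revents & (POLLIN|POLLHUP))) {
        char type;
        ssize_t n = read(fds[s].fd, &type, 1);            // next command byte from this peer
        if (n <= 0) { close(fds[s].fd); fds[s].fd = -1; }  // peer closed or errored
        // else: dispatch on 'type', much like the loop above dispatches on ncclProxyMsg*.
      }
    }
  }
}
#endif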
closeConn = 1; } INFO(NCCL_PROXY, "Received and initiated operation=%s res=%d", ncclProxyMsgTypeStr[type], res); } } else if (pollfds[s].revents & POLLHUP) { closeConn = 1; } if (res != ncclSuccess && res != ncclInProgress) { WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res); closeConn = 1; } if (closeConn) { ncclSocketClose(sock); if (op != nullptr) { asyncProxyOpDequeue(peer, op); asyncOpCount--; } pollfds[s].fd = -1; npeers--; } } } // Wait for all operations to complete and stop progress thread before freeing any resource if (ncclProxyProgressDestroy(proxyState) != ncclSuccess) { WARN("[Proxy Service] proxyDestroy failed"); } for (int s=0; slistenSock); free(proxyState->listenSock); proxyOpsFree(proxyState); return NULL; } // Process a request on the UDS socket static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd) { ncclIpcHdr hdr; NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), NULL)); if (hdr.type == ncclProxyMsgGetFd) { // cuMem API support uint64_t handle = *(uint64_t*)hdr.data; INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", hdr.rank, hdr.opId, handle); return proxyGetFd(proxyState, hdr.rank, hdr.opId, handle); } return ncclInternalError; } // UDS fd handle support void* ncclProxyServiceUDS(void* _args) { struct ncclProxyState* proxyState = (struct ncclProxyState*) _args; struct pollfd pollfds[1]; if (setProxyThreadContext(proxyState)) { INFO(NCCL_INIT, "[Proxy Service UDS] Created CUDA context on device %d", proxyState->cudaDev); } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { WARN("[Proxy Service UDS] Failed to set CUDA device %d", proxyState->cudaDev); } if (ncclIpcSocketGetFd(&proxyState->ipcSock, &pollfds[0].fd) != ncclSuccess) { WARN("[Proxy Service UDS] Get listenSock fd fails"); return NULL; }; pollfds[0].events = POLLIN|POLLHUP; while (1) { /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */ int ret; do { ret = poll(pollfds, 1, 500); } while (ret < 0 && errno == EINTR); if (ret < 0) { WARN("[Proxy Service UDS] Poll failed: %s", strerror(errno)); return NULL; } // Check for stop/abort if (proxyState->stop || *proxyState->abortFlag) break; if (pollfds[0].revents) { // A request was seen on the UDS fd proxyUDSRecvReq(proxyState, pollfds[0].fd); } } ncclIpcSocketClose(&proxyState->ipcSock); INFO(NCCL_PROXY, "[Proxy Service UDS] exit: stop %d abortFlag %d", proxyState->stop, *proxyState->abortFlag); return NULL; } ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS) { assert(comm->sharedRes->proxyState == NULL); NCCLCHECK(ncclCalloc(&comm->sharedRes->proxyState, 1)); comm->proxyState = comm->sharedRes->proxyState; comm->proxyState->refCount = 1; comm->proxyState->listenSock = sock; comm->proxyState->peerAddresses = peerAddresses; comm->proxyState->peerAddressesUDS = peerAddressesUDS; // UDS support NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, peerAddressesUDS[comm->rank], comm->abortFlag)); return ncclSuccess; } ncclResult_t ncclProxyCreate(struct ncclComm* comm) { /* proxyState is shared among parent comm and split comms. comm->proxyState->thread is * pthread_join()'d by commFree() in init.cc when the refCount reduces down to 0. 
*/ struct ncclProxyState* proxyState = comm->proxyState; if (proxyState->refCount == 1) { /* we have to make sure all following fields in comm have been initialized. */ proxyState->tpRank = comm->rank; proxyState->tpnRanks = comm->nRanks; proxyState->tpLocalnRanks = comm->localRanks; proxyState->cudaDev = comm->cudaDev; proxyState->abortFlag = comm->abortFlag; proxyState->p2pnChannels = comm->p2pnChannels; proxyState->p2pChunkSize = comm->p2pChunkSize; proxyState->nChannels = comm->nChannels; proxyState->allocP2pNetLLBuffers = comm->allocP2pNetLLBuffers; proxyState->dmaBufSupport = comm->dmaBufSupport; proxyState->ncclNet = comm->ncclNet; proxyState->ncclCollNet = comm->ncclCollNet; memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes)); pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState); ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev); // UDS support INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d", comm, comm->rank); pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState); ncclSetThreadName(comm->proxyState->threadUDS, "NCCL UDS Service %2d", comm->cudaDev); } return ncclSuccess; } ncclResult_t ncclProxyStop(struct ncclComm* comm) { if (comm->proxyState) { struct ncclProxyState* sharedProxyState = comm->proxyState; if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) { if (comm->proxyState->threadUDS) { // UDS support comm->proxyState->stop = 1; } if (sharedProxyState->peerAddresses) { struct ncclSocket sock; int type = ncclProxyMsgStop; ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag); if (ncclSocketConnect(&sock) == ncclSuccess) { ncclSocketSend(&sock, &type, sizeof(int)); } ncclSocketClose(&sock); } if (sharedProxyState->peerSocks) { int tplocalRanks = comm->sharedRes->tpNLocalRanks; for (int i = 0; i < tplocalRanks; i++) { int fd; NCCLCHECK(ncclSocketGetFd(sharedProxyState->peerSocks + i, &fd)); if (fd >= 0) { if (sharedProxyState->proxyOps[i].pool) { NCCLCHECK(ncclShmClose(sharedProxyState->proxyOps[i].handle)); } if (sharedProxyState->sharedDevMems[i]) { if (!ncclCuMemEnable()) { CUDACHECK(cudaIpcCloseMemHandle(sharedProxyState->sharedDevMems[i])); } } int type = ncclProxyMsgClose; ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)); NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i)); } } } } } return ncclSuccess; } ncclResult_t ncclProxyDestroy(struct ncclComm* comm) { struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState; assert(sharedProxyState->refCount == 0); free(sharedProxyState->peerAddresses); free(sharedProxyState->peerAddressesUDS); free(sharedProxyState->peerSocks); free(sharedProxyState->proxyOps); free(sharedProxyState->sharedDevMems); expectedProxyResponseFree(sharedProxyState); free(sharedProxyState); return ncclSuccess; } nccl-2.22.3-1/src/register.cc000066400000000000000000000166411463451655400156240ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
*
 * See LICENSE.txt for license information
 ************************************************************************/

#include "argcheck.h" // Need some checks here since we access comm
#include "nccl.h"
#include "comm.h"
#include "net.h"
#include "register.h"
#include "transport.h"

ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) {
  struct ncclRegCache* cache = &comm->regCache;
  ncclDebugNoWarn = NCCL_NET;
  for (int d=0; d<reg->nDevs; d++) {
    if (reg->handles[d] != NULL) NCCLCHECK(comm->ncclNet->deregMr(cache->sComms[reg->devs[d]], reg->handles[d]));
  }
  reg->nDevs = 0;
  free(reg->handles);
  reg->handles = NULL;
  ncclDebugNoWarn = 0;
  return ncclSuccess;
}

ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, struct ncclReg* reg) {
  struct ncclRegCache* cache = &comm->regCache;
  int netCount;
  NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
  if (netCount == 0) return ncclSuccess;

  ncclResult_t ret = ncclSuccess;
  // Find local devices for p2p operations
  for (int c=0; c<comm->p2pnChannels; c++) {
    int dev;
    if (ncclTopoGetLocalNet(comm->topo, comm->rank, c, NULL, &dev) != ncclSuccess) goto end; // No local net
    ncclNetProperties_t props;
    NCCLCHECKGOTO(comm->ncclNet->getProperties(dev, &props), ret, end);
    if (props.regIsGlobal == 0) { // We need to be sure all NICs support global registration.
      reg->nDevs = 0;
      break;
    }
    int found = 0;
    for (int d=0; d<reg->nDevs; d++) if (reg->devs[d] == dev) found = 1;
    if (!found) reg->devs[reg->nDevs++] = dev;
  }

  NCCLCHECKGOTO(ncclCalloc(&reg->handles, reg->nDevs), ret, end);

  ncclDebugNoWarn = NCCL_NET;
  for (int d=0; d<reg->nDevs; d++) {
    int dev = reg->devs[d];
    reg->handles[d] = NULL;
    if (cache->sComms[dev] == NULL) {
      // Create a loopback network comm object for that device to register the buffers.
      void *lComm = NULL;
      ncclNetHandle_t netHandle;
      bool connected = false;
      NCCLCHECKGOTO(comm->ncclNet->listen(dev, &netHandle, &lComm), ret, end);
      while (!connected) {
        if (*comm->abortFlag) {
          goto end;
        }
        if (cache->sComms[dev] == NULL)
          NCCLCHECKGOTO(comm->ncclNet->connect(dev, &netHandle, cache->sComms+dev, NULL), ret, end);
        if (cache->rComms[dev] == NULL)
          NCCLCHECKGOTO(comm->ncclNet->accept(lComm, cache->rComms+dev, NULL), ret, end);
        connected = (cache->rComms[dev] != NULL) && (cache->sComms[dev] != NULL);
      }
      NCCLCHECK(comm->ncclNet->closeListen(lComm));
    }
    if (comm->ncclNet->regMr(cache->sComms[dev], addr, size, NCCL_PTR_CUDA, reg->handles+d) != ncclSuccess) {
      reg->handles[d] = NULL;
      NCCLCHECK(ncclNetDeregister(comm, reg));
      reg->nDevs = 0;
      goto end;
    }
  }
end:
  INFO(NCCL_INIT, "Register ptr %p size %ld on %d net devices", addr, size, reg->nDevs);
  ncclDebugNoWarn = 0;
  if (ret != ncclSuccess) NCCLCHECK(ncclNetDeregister(comm, reg));
  return ret;
}

ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) {
  struct ncclRegCache* cache = &comm->regCache;
  uintptr_t pageSize = cache->pageSize;
  uintptr_t addr = (uintptr_t)data & -pageSize;
  size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;

  *reg = NULL;
  for (int slot=0; /*true*/; slot++) {
    if (slot == cache->population || addr < cache->slots[slot]->addr) return ncclSuccess;
    if ((addr >= cache->slots[slot]->addr) &&
        ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) {
      *reg = cache->slots[slot];
      return ncclSuccess;
    }
  }
}

NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);

ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, void** handle) {
  if (!ncclParamLocalRegister()) return ncclSuccess;
  struct ncclRegCache* cache =
&comm->regCache; uintptr_t pageSize = cache->pageSize; uintptr_t addr = (uintptr_t)data & -pageSize; size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; for (int slot=0; /*true*/; slot++) { if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) { if (cache->population == cache->capacity) { // must grow cache cache->capacity = cache->capacity < 32 ? 32 : 2*cache->capacity; NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity)); } memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*)); NCCLCHECK(ncclCalloc(cache->slots+slot, 1)); struct ncclReg* regSlot = cache->slots[slot]; regSlot->addr = addr; regSlot->pages = pages; regSlot->refs = 1; NCCLCHECK(ncclNetRegister(comm, (void*)addr, pages*pageSize, regSlot)); regSlot->state |= NET_REG_COMPLETE; cache->population += 1; *handle = regSlot; return ncclSuccess; } else if ((addr >= cache->slots[slot]->addr) && ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) { cache->slots[slot]->refs++; *handle = cache->slots[slot]; return ncclSuccess; } } } ncclResult_t ncclRegCleanup(struct ncclComm* comm) { struct ncclRegCache* cache = &comm->regCache; for (int i=0; ipopulation; i++) { INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)cache->slots[i]->addr, cache->slots[i]->pages); NCCLCHECK(ncclNetDeregister(comm, cache->slots[i])); if (cache->slots[i]->state & NVLS_REG_COMPLETE) NCCLCHECK(ncclNvlsDeregBuffer(&cache->slots[i]->mcHandle, cache->slots[i]->regAddr, cache->slots[i]->dev, cache->slots[i]->regSize)); free(cache->slots[i]); } free(cache->slots); for (int d=0; dsComms[d]) NCCLCHECK(comm->ncclNet->closeSend(cache->sComms[d])); if (cache->rComms[d]) NCCLCHECK(comm->ncclNet->closeRecv(cache->rComms[d])); } return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle); ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm")); if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister")); NCCLCHECK(ncclRegister(comm, buff, size, handle)); return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle); ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) { NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm")); struct ncclReg* reg = (struct ncclReg*)handle; struct ncclRegCache* cache = &comm->regCache; int slot; for (slot=0; slotpopulation && cache->slots[slot] != reg; slot++); if (slot == cache->population) { WARN("Deregister: Could not find handle"); return ncclInvalidUsage; } if (--reg->refs) return ncclSuccess; NCCLCHECK(ncclNetDeregister(comm, reg)); if (reg->state & NVLS_REG_COMPLETE) { NCCLCHECK(ncclNvlsDeregBuffer(®->mcHandle, reg->regAddr, reg->dev, reg->regSize)); reg->regAddr = (CUdeviceptr)NULL; } if (reg->state & COLLNET_REG_COMPLETE) { NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->proxyconn, reg->collnetHandle)); } free(reg); memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*)); cache->population -= 1; return ncclSuccess; } nccl-2.22.3-1/src/transport.cc000066400000000000000000000434571463451655400160410ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "info.h" #include "bootstrap.h" #define ENABLE_TIMER 0 #include "timer.h" #include "transport.h" struct ncclTransport* ncclTransports[NTRANSPORTS] = { &p2pTransport, &shmTransport, &netTransport, &collNetTransport }; template static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex, int* transportType) { struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank; struct ncclPeerInfo* peerInfo = comm->peerInfo+peer; struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer]->send + connIndex : comm->channels[channelId].peers[peer]->recv + connIndex; for (int t=0; tsend : &transport->recv; int ret = 0; NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo)); if (ret) { connector->transportComm = transportComm; NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex)); if (transportType) *transportType = t; return ncclSuccess; } } WARN("No transport found for rank %d[%lx] -> rank %d[%lx]", myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); return ncclSystemError; } ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) { TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); struct ncclChannel* channel = &comm->channels[channelId]; uint64_t mask = 1UL << channel->id; for (int i=0; i= comm->nRanks || peer == comm->rank || channel->peers[peer]->recv[connIndex].connected) continue; comm->connectRecv[peer] |= mask; } for (int i=0; i= comm->nRanks || peer == comm->rank || channel->peers[peer]->send[connIndex].connected) continue; comm->connectSend[peer] |= mask; } return ncclSuccess; } void dumpData(struct ncclConnect* data, int ndata) { for (int n=0; n ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) { // Stream used during transport setup; need for P2P pre-connect + CUDA Graph ncclResult_t ret = ncclSuccess; int highestType = TRANSPORT_UNDEFINED; // track highest transport type struct ncclConnect** data; // Store intermediate send/recvData structs for connect struct ncclConnect** recvData; // Points to entries inside data for given recv connection within a channel struct ncclConnect** sendData; // Points to entries inside data for given send connection within a channel int done = 0; int maxPeers = ncclParamConnectRoundMaxPeers(); NCCLCHECK(ncclCalloc(&data, maxPeers)); NCCLCHECK(ncclCalloc(&recvData, maxPeers)); NCCLCHECK(ncclCalloc(&sendData, maxPeers)); struct timeval timeStart, timeLast; gettimeofday(&timeStart, NULL); timeLast = timeStart; // struct copy bool timeReported = false; NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); // First time initialization for (int i=1; inRanks; i++) { int bootstrapTag = (i<<8) + (graph ? 
graph->id+1 : 0); int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; int sendPeer = (comm->rank + i) % comm->nRanks; uint64_t recvMask = comm->connectRecv[recvPeer]; uint64_t sendMask = comm->connectSend[sendPeer]; // Data[i] contains all ncclConnect information for all send and receive connections with a given send and recv peer // This data is packed in the array based on the number of sendChannels and recvChannels connected with these peers // The first N entries contain recvData, connection information for recv connections // The next M entries contain sendData, connection information for send connections // It's not guaranteed that each entry of data has the same number of total or send/recv specific connections int p = i-(done+1); if (recvMask || sendMask) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS)); recvData[p] = data[p]; int sendChannels = 0, recvChannels = 0; int type; TIME_START(0); for (int c=0; c(comm, graph, recvData[p]+recvChannels++, c, recvPeer, connIndex, &type), ret, fail); if (type > highestType) highestType = type; } } TIME_STOP(0); TIME_START(1); sendData[p] = recvData[p]+recvChannels; for (int c=0; c(comm, graph, sendData[p]+sendChannels++, c, sendPeer, connIndex, &type), ret, fail); if (type > highestType) highestType = type; } } TIME_STOP(1); TIME_START(2); if (sendPeer == recvPeer) { if (recvChannels+sendChannels) { NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data[p], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail); NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data[p], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail); sendData[p] = data[p]; recvData[p] = data[p]+sendChannels; } } else { if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData[p], sizeof(struct ncclConnect)*recvChannels), ret, fail); if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData[p], sizeof(struct ncclConnect)*sendChannels), ret, fail); if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData[p], sizeof(struct ncclConnect)*sendChannels), ret, fail); if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData[p], sizeof(struct ncclConnect)*recvChannels), ret, fail); } TIME_STOP(2); if (i-done == maxPeers || i == comm->nRanks-1) { // Loop until all channels with all ranks have been connected bool allChannelsConnected; allChannelsConnected = false; while (!allChannelsConnected) { allChannelsConnected = true; for (int j=done+1; j<=i; j++) { int recvPeer = (comm->rank - j + comm->nRanks) % comm->nRanks; int sendPeer = (comm->rank + j) % comm->nRanks; uint64_t recvMask = comm->connectRecv[recvPeer]; uint64_t sendMask = comm->connectSend[sendPeer]; int p = j-(done+1); int sendDataOffset = 0; int recvDataOffset = 0; for (int c=0; cchannels[c].peers[sendPeer]->send + connIndex; // This connector hasn't completed connection yet if (conn->connected == 0) { NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[p] + sendDataOffset++, 1, comm->rank, conn), ret, fail); if (ret == ncclSuccess) { conn->connected = 1; /* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. 
*/ CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } } } TIME_STOP(3); // Start with recv channels TIME_START(4); if (recvMask & (1UL<channels[c].peers[recvPeer]->recv + connIndex; // This connector hasn't completed connection yet if (conn->connected == 0) { NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[p] + recvDataOffset++, 1, comm->rank, conn), ret, fail); if (ret == ncclSuccess) { conn->connected = 1; /* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */ CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } } } TIME_STOP(4); } if (sendMask || recvMask) { free(data[p]); data[p] = NULL; } } if (ncclParamReportConnectProgress() && comm->rank == 0) { struct timeval now; gettimeofday(&now, NULL); if (((now.tv_sec - timeLast.tv_sec)*1.0 + (now.tv_usec-timeLast.tv_usec)*1e-6) > 1) { float elapsed = (now.tv_sec - timeStart.tv_sec)*1.0 + (now.tv_usec-timeStart.tv_usec)*1e-6; float remaining = elapsed*(comm->nRanks-done)/done; printf("%sP2p connect: %g%% Elapsed %d:%02d Remaining %d:%02d ", timeReported ? "\r" : "", done*100.0/comm->nRanks, ((int)elapsed)/60, ((int)elapsed)%60, ((int)remaining)/60, ((int)remaining)%60); fflush(stdout); timeReported = true; timeLast = now; // struct copy; } } } done = i; } } { struct timeval now; gettimeofday(&now, NULL); float elapsed = (now.tv_sec - timeStart.tv_sec)*1.0 + (now.tv_usec-timeStart.tv_usec)*1e-6; if (elapsed > 1.0) INFO(NCCL_PROFILE, "timings: rank %d nranks %d P2p connect done in %.2f", comm->rank, comm->nRanks, elapsed); if (timeReported) { printf("\rP2p connect done in %d:%02d \n", ((int)elapsed)/60, ((int)elapsed)%60); fflush(stdout); } } /* We need to sync ranks here since some ranks might run too fast after connection setup * and start to destroy the connection after returning from this function; however, the * others might still be trying to connect and import the buffer. No sync can lead to invalid * shmem/cuda buffer. In addition, we also clear all connect masks and free each connectInfo array */ for (int i = 1; i < comm->nRanks; i++) { int bootstrapTag = (i << 8) + (1 << 7) + (graph ? 
graph->id + 1 : 0); int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; int sendPeer = (comm->rank + i) % comm->nRanks; int flag = 0; if (recvPeer != sendPeer) { if (comm->connectSend[sendPeer] != 0UL) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); if (comm->connectRecv[recvPeer] != 0UL) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); if (comm->connectSend[sendPeer] != 0UL) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); if (comm->connectRecv[recvPeer] != 0UL) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); } else { if (comm->connectSend[sendPeer] != 0UL || comm->connectRecv[recvPeer] != 0UL) { NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); } } comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL; } free(data); free(sendData); free(recvData); if (highestTransportType != NULL) *highestTransportType = highestType; TIME_PRINT("P2P Setup/Connect"); exit: NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream)); NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream)); return ret; fail: goto exit; } extern struct ncclTransport collNetTransport; // All ranks must participate in collNetSetup call // We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) { int fail = 1; int rank = comm->rank; int nranks = comm->nRanks; int nMasters = comm->nNodes; int isMaster = (rank == masterRank) ? 1 : 0; // check if we can connect to collnet, whose root is the nranks-th rank struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks; peerInfo->rank = nranks; if (isMaster && type == collNetSend) { TRACE(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, comm->node, nMasters, masterPeer); } // select struct ncclChannelPeer* root = channel->peers[nranks]; // connector index: 0 for recv, 1 for send struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type; struct ncclTransportComm* transportComm = (type == collNetRecv) ? 
&(collNetTransport.recv) : &(collNetTransport.send);
  conn->transportComm = transportComm;
  // setup
  struct ncclConnect myConnect;
  if (isMaster) {
    NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
  }
  // prepare connect handles
  ncclResult_t res;
  struct {
    int isMaster;
    ncclConnect connect;
  } *allConnects = NULL;
  ncclConnect *masterConnects = NULL;
  NCCLCHECK(ncclCalloc(&masterConnects, nMasters));
  if (type == collNetRecv) {  // recv side: AllGather
    // all ranks must participate
    NCCLCHECK(ncclCalloc(&allConnects, nranks));
    allConnects[rank].isMaster = isMaster;
    memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect));
    NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup);
    // consolidate
    int c = 0;
    for (int r = 0; r < nranks; r++) {
      if (allConnects[r].isMaster) {
        memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect));
        c++;
      }
    }
  } else { // send side : copy in connect info received from peer recv master
    if (isMaster) memcpy(masterConnects+comm->node, connect, sizeof(struct ncclConnect));
  }
  // connect
  if (isMaster) {
    NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, comm->node, conn), res, cleanup);
    struct ncclDevChannelPeer* devRoot;
    CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup);
    struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv + type : devRoot->send + type;
    CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup);
  }
  if (isMaster && type == collNetRecv) {
    memcpy(connect, masterConnects+comm->node, sizeof(struct ncclConnect));
    TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, comm->node, nMasters, masterPeer);
  }
  fail = 0;
cleanup:
  if (allConnects != NULL) free(allConnects);
  if (masterConnects != NULL) free(masterConnects);
  return fail;
}

ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) {
  // AllGather collNet setup results
  int allGatherFailures[NCCL_MAX_LOCAL_RANKS] = {0};
  allGatherFailures[comm->localRank] = collNetSetupFail;
  NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, allGatherFailures, sizeof(int)));
  for (int i=0; i<comm->localRanks; i++) {
    if (allGatherFailures[i] != 0) {
      collNetSetupFail = 1;
      break;
    }
  }
  if (collNetSetupFail) {
    if (comm->localRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
    return ncclSystemError;
  }
  return ncclSuccess;
}

ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
  // Free collNet resources
  for (int r=0; r<comm->nChannels; r++) {
    struct ncclChannel* channel = comm->channels+r;
    struct ncclChannelPeer* peer = channel->peers[comm->nRanks];
    if (peer) {
      if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) {
        for (int b=0; b<NCCL_MAX_CONNS; b++) {
          struct ncclConnector* send = peer->send + b;
          if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
          send->transportResources = NULL; // avoid double free
        }
        for (int b=0; b<NCCL_MAX_CONNS; b++) {
          struct ncclConnector* recv = peer->recv + b;
          if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
          recv->transportResources = NULL; // avoid double free
        }
      }
    }
  }
  return ncclSuccess;
}
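/*
 * Illustrative sketch (not part of the original file): the intended call sequence for the three
 * helpers above, following the way ncclCollNetSetup() in coll_net.cc (further below) drives them.
 * "heads"/"nHeads" stand in for the per-channel head list (headsUnique/nHeadsUnique there); error
 * handling is elided.
 *
 *   ncclConnect connect;
 *   int fail = 0;
 *   for (int h = 0; h < nHeads; h++) {
 *     fail |= ncclTransportCollNetSetup(comm, graph, channel, heads[h], heads[h], h, collNetRecv, &connect);
 *     if (!fail) fail |= ncclTransportCollNetSetup(comm, graph, channel, heads[h], heads[h], h, collNetSend, &connect);
 *   }
 *   if (ncclTransportCollNetCheck(comm, fail) != ncclSuccess) ncclTransportCollNetFree(comm);
 */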
nccl-2.22.3-1/src/transport/000077500000000000000000000000001463451655400155155ustar00rootroot00000000000000nccl-2.22.3-1/src/transport/coll_net.cc000066400000000000000000001764051463451655400176400ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "coll_net.h" #include "graph.h" #include "proxy.h" #include "gdrwrap.h" #include "transport.h" #include "assert.h" #include "bootstrap.h" #include "channel.h" int64_t ncclParamGdrCopySyncEnable(); int64_t ncclParamGdrCopyFlushEnable(); struct collNetRecvConnectInfo { int rank; int nranks; collNetHandle_t collNetHandle; }; struct collNetSendConnectInfo { void* mhandles[NCCL_NUM_PROTOCOLS]; void* reqFifo; }; #define COLLNET_GROUP_NSUBS 8 #define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS) #define NCCL_NET_MAP_HOSTMEM 0 #define NCCL_NET_MAP_DEVMEM 1 #define NCCL_NET_MAP_SHARED_HOSTMEM 2 #define NCCL_NET_MAP_SHARED_DEVMEM 3 #define NCCL_NET_MAP_GDCMEM 4 #define NCCL_NET_MAP_MEMS 5 #define NCCL_NET_MAP_MASK_DEVMEM 0x40000000 #define NCCL_NET_MAP_MASK_SHARED 0x80000000 #define NCCL_NET_MAP_MASK_USED 0x20000000 #define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff #define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \ ((mapStruct)->offsets.offsetName >> 30) #define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \ (((mapStruct)->offsets.offsetName >> 29) == 0) #define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \ (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? NULL : \ (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET)) #define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \ (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0) #define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { \ int bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; \ if ((shared) == 0) { \ if (dev) { \ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; \ (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += memSize; \ } else { \ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; \ (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += memSize; \ } \ } else { \ (mapStruct)->offsets.offsetName = bank; \ } \ } while (0); struct connectMapMem{ char* gpuPtr; char* cpuPtr; int size; }; struct connectMap { int shared; // First 3 bits of offsets determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem. struct connectMapMem mems[NCCL_NET_MAP_MEMS]; // Offsets. 3 MSBs indicate mem bank, 111 indicates NULL. 
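/* Worked example (illustrative, derived from the NCCL_NET_MAP_* macros above): an offset word of
 * 0x60000040 has NCCL_NET_MAP_MASK_USED (bit 29) and NCCL_NET_MAP_MASK_DEVMEM (bit 30) set, so
 * NCCL_NET_MAP_OFFSET_BANK() yields 1 (NCCL_NET_MAP_DEVMEM), NCCL_NET_MAP_OFFSET_NULL() is false,
 * and the byte offset within that bank is 0x60000040 & NCCL_NET_MAP_MASK_OFFSET = 0x40. */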
struct { uint32_t sendMem; uint32_t recvMem; uint32_t buffs[NCCL_NUM_PROTOCOLS]; } offsets; }; struct reqSlot { bool turnIsSendNotRecv; int size; }; struct sendResources { struct connectMap map; void* collNetComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; int rank; int nranks; int netDev; int useGdr; int useDmaBuf; uint64_t* gdcSync; void* gdrDesc; void* sendMhandles[NCCL_NUM_PROTOCOLS]; void* recvMhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; struct reqSlot (*reqFifo)[NCCL_STEPS]; int collNetRank; }; struct recvResources { struct connectMap map; void* collNetComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; int rank; int nranks; int netDev; int useGdr; int useDmaBuf; int needFlush; uint64_t* gdcSync; uint64_t* gdcFlush; void* gdrDesc; void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS]; int collNetRank; }; static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { // This transport cannot be used for p2p *ret = 0; return ncclSuccess; } struct setupReq { int netDev; int useGdr; int needFlush; struct ncclCollNetSharedRes* collNet; }; /* Setup send connector, and return connect information for others in the coll * communicator to connect to me */ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct setupReq req = { 0 }; int proxyRank, tpProxyRank; int64_t netId; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; tpProxyRank = comm->topParentRanks[myInfo->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn)); ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount); req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? "/GDRDMA" : ""); return ncclSuccess; } static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct setupReq req = { 0 }; int proxyRank, tpProxyRank; int64_t netId; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr)); recv->conn.flags |= req.useGdr ? 
NCCL_DIRECT_NIC : 0; // Determine whether we need to flush the GDR buffer on recv or not if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; tpProxyRank = comm->topParentRanks[myInfo->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn)); struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount); req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? "/GDRDMA" : ""); return ncclSuccess; } static ncclResult_t collNetDumpMap(struct connectMap* map) { printf("Dump map\n"); struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM; printf("Mem 0: Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_DEVMEM; printf("Mem 1: Vid mem CPU (%x B) %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM; printf("Mem 2: Shared Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM; printf("Mem 3: Shared Vid (%x B) mem CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0, NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET, NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem)); printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 1 : 0, NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET, NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem)); for (int p=0; p Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p, map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 1 : 0, NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET, NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p])); } printf("End of dump\n"); return ncclSuccess; } struct collNetConnectArgs { int rank; int nranks; struct ncclConnect* connectInfos; }; static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) { // We're on the same process as the proxy. We can pass a pointer to a struct. struct collNetConnectArgs args = { rank, nranks, connectInfos }; struct connectMap* map; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); // If collnet connect failed, propagate error to fallback on regular p2p if (map == NULL) return ncclSystemError; //NCCLCHECK(collNetDumpMap(map)); struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; send->conn.head = gdcMem ? 
(uint64_t*)gdcMem : &sendMem->head; struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); send->conn.tail = &recvMem->tail; send->conn.connFifo = recvMem->connFifo; for (int i=0; iconn.connFifo[i].size = -1; send->conn.connFifo[i].mode = NCCL_MODE_OFFSET; } for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); send->proxyConn.proxyProgress = sendProxyProgress; return ncclSuccess; } static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) { // We're on the same process as the proxy. We can pass a pointer to a struct. struct collNetConnectArgs args = { rank, nranks, connectInfos }; struct connectMap* map; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); // If collnet connect failed, propagate error to fallback on regular p2p if (map == NULL) return ncclSystemError; //NCCLCHECK(collNetDumpMap(map)); struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); recv->conn.head = &sendMem->head; struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail; recv->conn.connFifo = recvMem->connFifo; for (int i=0; iconn.connFifo[i].mode = NCCL_MODE_OFFSET; } for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); } recv->proxyConn.proxyProgress = recvProxyProgress; return ncclSuccess; } static ncclResult_t sendFree(struct ncclConnector* send) { return ncclSuccess; } static ncclResult_t recvFree(struct ncclConnector* recv) { return ncclSuccess; } static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct setupReq* req = (struct setupReq*)reqBuff; if (reqSize != sizeof(struct setupReq)) return ncclInternalError; struct sendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; connection->shared = 1; resources->netDev = req->netDev; resources->useGdr = req->useGdr; ncclNetProperties_t props; NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props)); connection->collNet = req->collNet; /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); return ncclSuccess; } struct sharedResources { void* collNetListenComms[MAXCHANNELS]; void* collNetComms[MAXCHANNELS]; int commRefCount[NCCL_MAX_NETDEVS]; }; static ncclResult_t sharedListen(struct ncclProxyState* proxyState, int netDev, struct ncclCollNetSharedRes* collNet, void* collNetHandle) { struct sharedResources* resources = (struct sharedResources*)collNet->resources; if (resources == NULL) { NCCLCHECK(ncclCalloc(&resources, 1)); collNet->resources = resources; } if (resources->collNetComms[netDev] == NULL) NCCLCHECK(proxyState->ncclCollNet->listen(netDev, collNetHandle, resources->collNetListenComms + netDev)); return ncclSuccess; } static ncclResult_t sharedConnect(struct ncclProxyState* proxyState, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclCollNetSharedRes* collNet, void** collNetComm) { struct 
sharedResources* resources = (struct sharedResources*)collNet->resources;
  if (resources->collNetComms[netDev] == NULL) {
    // Connect to coll comm
    collNetHandle_t** handlePtrs = NULL;
    NCCLCHECK(ncclCalloc(&handlePtrs, nranks));
    for (int i = 0; i < nranks; i++) {
      struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i);
      handlePtrs[i] = &(info->collNetHandle);
    }
    ncclResult_t ret = proxyState->ncclCollNet->connect((void**)handlePtrs, nranks, rank, resources->collNetListenComms[netDev], resources->collNetComms+netDev);
    free(handlePtrs);
    if (ret == ncclSuccess) {
      // Close listen comm
      NCCLCHECK(proxyState->ncclCollNet->closeListen(resources->collNetListenComms[netDev]));
    } else {
      resources->collNetListenComms[netDev] = NULL;
    }
  }
  *collNetComm = resources->collNetComms[netDev];
  if (*collNetComm) resources->commRefCount[netDev]++;
  return ncclSuccess;
}

static ncclResult_t sharedFree(struct ncclProxyState* proxyState, struct ncclCollNetSharedRes* collNet, int netDev) {
  struct sharedResources* resources = (struct sharedResources*)collNet->resources;
  resources->commRefCount[netDev]--;
  if (resources->commRefCount[netDev] == 0) {
    NCCLCHECK(proxyState->ncclCollNet->closeColl(resources->collNetComms[netDev]));
  }
  for (int n=0; n<NCCL_MAX_NETDEVS; n++) if (resources->commRefCount[n]) return ncclSuccess;
  collNet->resources = NULL;
  free(resources);
  return ncclSuccess;
}

static ncclResult_t sharedBuffersInit(struct ncclCollNetSharedRes* collNet, int cuda, char** gpuPtr, char** cpuPtr, int* size) {
  if (collNet->size == 0) {
    collNet->size = 2 * collNet->nChannels * collNet->buffSize;
  }
  *size = collNet->size;
  if (cuda && collNet->cudaBuff == NULL) {
    NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size));
    cudaMemset(collNet->cudaBuff, 0x33, *size/2);
    cudaMemset((char*)collNet->cudaBuff + *size/2, 0x66, *size/2);
  }
  if (!cuda && collNet->hostBuff == NULL) {
    NCCLCHECK(ncclCudaHostCalloc(&collNet->hostBuff, *size));
  }
  *gpuPtr = *cpuPtr = cuda ? collNet->cudaBuff : collNet->hostBuff;
  return ncclSuccess;
}

static ncclResult_t sharedBuffersGet(struct ncclCollNetSharedRes* collNet, int type, int slot, int channel, int* offset) {
  // Use different pools for different channels and also separate send/recv.
  int slotSize = collNet->buffSize / NCCL_STEPS;
  int globalSlot = (type * NCCL_STEPS + slot) * collNet->nChannels + channel;
  *offset = slotSize * globalSlot;
  return ncclSuccess;
}

static ncclResult_t sharedBuffersDestroy(struct ncclCollNetSharedRes* collNet) {
  if (collNet->size == 0) return ncclSuccess;
  NCCLCHECK(ncclCudaFree(collNet->cudaBuff));
  NCCLCHECK(ncclCudaHostFree(collNet->hostBuff));
  // This will be called multiple times, with multiple channels and send/recv. Make sure we only do it once.
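/* Layout note (illustrative, assuming NCCL_STEPS == 8): the pool freed here was sized by
 * sharedBuffersInit() above as 2 * nChannels * buffSize bytes, i.e. one slot of buffSize/NCCL_STEPS
 * bytes for each (send/recv, step, channel) triple. sharedBuffersGet() indexes it as
 * globalSlot = (type*NCCL_STEPS + slot)*nChannels + channel; e.g. with nChannels == 2, the recv
 * (type 1) slot 3 of channel 1 gets globalSlot 23 and offset 23 * buffSize/8. */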
collNet->size = 0; return ncclSuccess; } static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct setupReq* req = (struct setupReq*)reqBuff; if (reqSize != sizeof (struct setupReq)) return ncclInternalError; struct recvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; connection->shared = 1; resources->netDev = req->netDev; resources->useGdr = req->useGdr; resources->needFlush = req->needFlush; ncclNetProperties_t props; NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props)); connection->collNet = req->collNet; /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); collNetHandle_t* netHandle = (collNetHandle_t*) respBuff; if (respSize != sizeof(collNetHandle_t)) return ncclInternalError; NCCLCHECK(sharedListen(proxyState, req->netDev, req->collNet, netHandle)); return ncclSuccess; } static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank); struct sendResources* resources = (struct sendResources*)(connection->transportResources); // Get info from recv side resources->collNetRank = args->rank; resources->reqFifo = (struct reqSlot (*)[NCCL_STEPS])(info->reqFifo); for (int p=0; precvMhandles[p] = info->mhandles[p]; NCCLCHECK(sharedConnect(proxyState, resources->netDev, args->connectInfos, args->nranks, args->rank, connection->collNet, &resources->collNetComm)); // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller. if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; } if (resources->collNetComm == NULL) { *((struct connectMap**)respBuff) = NULL; return ncclSuccess; } connection->proxyAppendPtr = connection->collNet->proxyAppend + 2 * resources->netDev; struct connectMap* map = &resources->map; NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) { uint64_t *cpuPtr, *gpuPtr; NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc)); resources->gdcSync = cpuPtr; struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; gdcMem->cpuPtr = (char*)cpuPtr; gdcMem->gpuPtr = (char*)gpuPtr; gdcMem->size = sizeof(uint64_t); // sendMem->head } resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); // Don't give credits yet in shared mode. (resources->gdcSync ? 
*resources->gdcSync : resources->sendMem->head) = -NCCL_STEPS; // Allocate & Register shared buffers for the Simple protocol int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &resources->sendMhandles[NCCL_PROTO_SIMPLE])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif { NCCLCHECK(proxyState->ncclCollNet->regMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_SIMPLE])); } *((struct connectMap**)respBuff) = &resources->map; return ncclSuccess; } static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; struct recvResources* resources = (struct recvResources*)(connection->transportResources); struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank); resources->collNetRank = args->rank; NCCLCHECK(sharedConnect(proxyState, resources->netDev, args->connectInfos, args->nranks, args->rank, connection->collNet, &resources->collNetComm)); // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller. 
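// Note on the fallback path (explanatory sketch): when sharedConnect() above could not create a
// collnet communicator, this proxy call replies with a NULL connectMap*; sendConnect() and
// recvConnect() on the rank side turn that NULL into ncclSystemError, and the caller then falls
// back to the regular point-to-point transports instead of CollNet.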
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; } if (resources->collNetComm == NULL) { *((struct connectMap**)respBuff) = NULL; return ncclSuccess; } connection->proxyAppendPtr = connection->collNet->proxyAppend + 2 * resources->netDev + 1; struct connectMap* map = &resources->map; NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; if (ncclGdrCopy) { uint64_t *cpuPtr, *gpuPtr; NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc)); if (ncclParamGdrCopySyncEnable()) { resources->gdcSync = cpuPtr; struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; gdcMem->cpuPtr = (char*)cpuPtr; gdcMem->gpuPtr = (char*)gpuPtr; gdcMem->size = sizeof(uint64_t); } if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1; } resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); // Allocate & Register shared buffers for the Simple protocol int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &resources->mhandles[NCCL_PROTO_SIMPLE])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif { NCCLCHECK(proxyState->ncclCollNet->regMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, resources->useGdr ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_SIMPLE])); } // Pass info to send side info->reqFifo = resources->reqFifo; for (int p=0; pmhandles[p] = resources->mhandles[p]; if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; } *((struct connectMap**)respBuff) = &resources->map; return ncclSuccess; } static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct sendResources* resources = (struct sendResources*)(connection->transportResources); if (resources) { for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) { if (resources->sendMhandles[p]) { NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, resources->sendMhandles[p])); } } struct connectMapMem* mems = resources->map.mems; NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); NCCLCHECK(sharedBuffersDestroy(connection->collNet)); NCCLCHECK(sharedFree(proxyState, connection->collNet, resources->netDev)); if (ncclAtomicRefCountDecrement(&connection->collNet->refCount) == 0) free(connection->collNet); free(connection->transportResources); } return ncclSuccess; } static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct recvResources* resources = (struct recvResources*)(connection->transportResources); if (resources) { for (int p=0; pmhandles[p]) { NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); NCCLCHECK(sharedBuffersDestroy(connection->collNet)); NCCLCHECK(sharedFree(proxyState, connection->collNet, resources->netDev)); if (ncclAtomicRefCountDecrement(&connection->collNet->refCount) == 0) free(connection->collNet); free(connection->transportResources); } return ncclSuccess; } static size_t calcAlgoOffset(struct ncclProxyArgs* args, int isAllNotOne, int sub, uint64_t step) { int chunkSize = args->chunkSize; int nNodes = args->specifics.collnetDirect.nNodes; int node = args->specifics.collnetDirect.node; size_t sizePerRank = args->specifics.collnetDirect.sizePerRank; size_t offset = (step*(args->nsubs) + sub)*chunkSize; if (isAllNotOne) { offset = std::min(offset, nNodes*sizePerRank); } else { offset = std::max(offset, (node+0)*sizePerRank); offset = std::min(offset, (node+1)*sizePerRank); } return offset; } static int calcRegionOffset( struct ncclProxyArgs* args, int isRecvNotSend, int sub, uint64_t step, int side // 0=begin, 1=end ) { struct ncclCollNetSharedRes* collNet = args->subs[0].connection->collNet; int slotSize = collNet->buffSize/NCCL_STEPS; int chunkSize = args->chunkSize; int base = isRecvNotSend*NCCL_STEPS + (step%NCCL_STEPS); base *= collNet->nChannels*slotSize; if (args->coll == ncclFuncAllReduce) { return base + (sub+side)*chunkSize; } else { int isAllNotOne = isRecvNotSend ^ (args->coll == ncclFuncReduceScatter); int sub0 = sub - (sub%COLLNET_GROUP_NSUBS); size_t off = sub0*slotSize; off += calcAlgoOffset(args, isAllNotOne, sub+side, step) - calcAlgoOffset(args, isAllNotOne, sub0, step); return base + off; } } #define 
LAST_OF_GROUP(args, s) \ ((s)%COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || (s) == (args)->nsubs-1) static constexpr int calcStepsPerGroup(int nGroups) { //return NCCL_STEPS/nGroups; return NCCL_STEPS; } static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->transmitted = sub->done = 0; resources->step = sub->base + sub->nsteps; } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = NCCL_PROTO_SIMPLE; int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS); for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); void* sendMhandle = resources->sendMhandles[p]; void* recvMhandle = resources->recvMhandles[p]; char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]); auto reqFifo = resources->reqFifo; int group = s/COLLNET_GROUP_NSUBS; int groupStart = s - (s%COLLNET_GROUP_NSUBS); if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) { int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; if (sub->reg == 0) { resources->recvMem->connFifo[buffSlot].offset = calcRegionOffset(args, 0, s, sub->posted, 0); __sync_synchronize(); } volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS)); sub->posted += args->sliceSteps; *sendHead = sub->base + sub->posted - NCCL_STEPS; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } if (sub->received < sub->posted && sub->received < sub->done + calcStepsPerGroup(nGroups)) { int buffSlot = (sub->base+sub->received)%NCCL_STEPS; volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; volatile uint64_t* recvTail = &resources->recvMem->tail; if ((connFifo[buffSlot].size != -1 || sub->reg) && ((*recvTail > (sub->base+sub->received)))) { if (args->coll != ncclFuncAllReduce && sub->reg == 0) { int sendBeg = calcRegionOffset(args, 0, s, sub->received, 0); int sendEnd = calcRegionOffset(args, 0, s, sub->received, 1); if (sendEnd-sendBeg != connFifo[buffSlot].size) { WARN("CollNet sizes: want=%d got=%ld", sendEnd-sendBeg, connFifo[buffSlot].size); return ncclInternalError; } } connFifo[buffSlot].size = -1; sub->received += args->sliceSteps; args->idle = 0; } } // Enforce collective ordering of collnet ops. bool ordered = s==0 ? 
args->subs[args->nsubs-1].transmitted == sub->transmitted : sub->transmitted < (sub-1)->transmitted; if (ordered && (sub->transmitted < sub->received)) { if (LAST_OF_GROUP(args, s)) { int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; if (!reqFifo[group][buffSlot].turnIsSendNotRecv) continue; ssize_t sizePerRank = 0; size_t allBeg = calcAlgoOffset(args, 1, groupStart, sub->transmitted); size_t allEnd = calcAlgoOffset(args, 1, s+1, sub->transmitted); int sendBeg = calcRegionOffset(args, 0, groupStart, sub->transmitted, 0); int sendEnd = calcRegionOffset(args, 0, s, sub->transmitted, 1); int recvBeg = calcRegionOffset(args, 1, groupStart, sub->transmitted, 0); int recvEnd = calcRegionOffset(args, 1, s, sub->transmitted, 1); reqFifo[group][buffSlot].size = recvEnd - recvBeg; size_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype); if (sendBeg==sendEnd && recvBeg==recvEnd && sub->reg == 0) { sub->requests[buffSlot] = nullptr; // trivally finished request } else { if (args->coll == ncclFuncAllReduce) { if (sub->reg) { size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE); int count = (int)(nBytes / eltSize); NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sub->sendbuff, sub->recvbuff, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->sendMhandle, sub->recvMhandle, sub->requests + buffSlot)); if (sub->requests[buffSlot]) { sub->nbytes -= nBytes; sub->sendbuff += nBytes; sub->recvbuff += nBytes; } } else { int count = (sendEnd - sendBeg) / eltSize; NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region + sendBeg, region + recvBeg, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests + buffSlot)); } } else { sizePerRank = args->specifics.collnetDirect.sizePerRank; if (args->coll == ncclFuncAllGather) { ncclNetSGE_v8_t recvParts; if (sub->reg) { size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE); void *sendbuff; recvParts.mhandle = sub->recvMhandle; recvParts.address = sub->recvbuff; recvParts.size = nBytes; if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) { sendbuff = sub->sendbuff + sub->offset % sizePerRank; } else { sendbuff = sub->sendbuff; } NCCLCHECK(proxyState->ncclCollNet->iallgather( resources->collNetComm, sendbuff, 1, &recvParts, sizePerRank, sub->offset, nBytes, sub->sendMhandle, sub->requests + buffSlot)); if (sub->requests[buffSlot]) { sub->recvbuff += nBytes; sub->nbytes -= nBytes; sub->offset += nBytes; } } else { recvParts.mhandle = recvMhandle; recvParts.address = region + recvBeg; recvParts.size = allEnd - allBeg; NCCLCHECK(proxyState->ncclCollNet->iallgather( resources->collNetComm, region + sendBeg, 1, &recvParts, sizePerRank, allBeg, allEnd - allBeg, sendMhandle, sub->requests + buffSlot)); } } else { ncclNetSGE_v8_t sendParts; if (sub->reg) { size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE); void *recvbuff; sendParts.mhandle = sub->sendMhandle; sendParts.address = sub->sendbuff; sendParts.size = nBytes; if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) { recvbuff = sub->recvbuff + sub->offset % sizePerRank; } else { recvbuff = sub->recvbuff; } NCCLCHECK(proxyState->ncclCollNet->ireducescatter( resources->collNetComm, 1, &sendParts, recvbuff, sizePerRank, sub->offset, nBytes, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->recvMhandle, sub->requests + buffSlot)); if (sub->requests[buffSlot]) { sub->sendbuff += nBytes; sub->nbytes -= nBytes; sub->offset += nBytes; } } else { 
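// Explanatory note (illustrative, not original code): this is the non-registered ReduceScatter
// path. Input data is staged in the shared region at region + sendBeg and the reduced output is
// delivered at region + recvBeg, with offsets computed by calcRegionOffset()/calcAlgoOffset()
// above; the registered branch above instead passes the user buffers (sub->sendbuff/sub->recvbuff)
// directly to the collnet plugin.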
sendParts.mhandle = sendMhandle; sendParts.address = region + sendBeg; sendParts.size = allEnd - allBeg; NCCLCHECK(proxyState->ncclCollNet->ireducescatter( resources->collNetComm, 1, &sendParts, region + recvBeg, sizePerRank, allBeg, allEnd - allBeg, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, recvMhandle, sub->requests + buffSlot)); } } } if (sub->requests[buffSlot] == nullptr) continue; if (args->coll == ncclFuncAllReduce) { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallreduce posted, size %d req %p", (long)sub->transmitted, group, buffSlot, int(sendEnd-sendBeg), sub->requests[buffSlot]); } else if (args->coll == ncclFuncAllGather) { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallgather posted sendSize=%ld recvOffset=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(sizePerRank), long(allBeg), long(allEnd-allBeg), sub->requests[buffSlot]); } else { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Ireducescatter posted sendOffset=%ld sendSize=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(allBeg), long(allEnd-allBeg), long(sizePerRank), sub->requests[buffSlot]); } } } sub->transmitted += args->sliceSteps; args->idle = 0; continue; } // Check whether the network has completed some send operations. if (LAST_OF_GROUP(args, s) && sub->done < sub->transmitted) { int done, size; int buffSlot = (sub->base+sub->done)%NCCL_STEPS; done = 1; if (sub->requests[buffSlot]) NCCLCHECK(proxyState->ncclCollNet->test((void*)(sub->requests[buffSlot]), &done, &size)); if (done) { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] request %p done, size %d", (long)sub->done, group, buffSlot, sub->requests[buffSlot], size); sub->requests[buffSlot] = nullptr; reqFifo[group][buffSlot].turnIsSendNotRecv = false; // Notify recvProxy for (int i=groupStart; i<=s; i++) args->subs[i].done += args->sliceSteps; args->idle = 0; int allDone = 1; for (int i=0; insubs; i++) { if (args->subs[i].done < args->subs[i].nsteps) { allDone = 0; break; } } if (allDone) { args->state = ncclProxyOpNone; TRACE(NCCL_NET, "sendProxy [%ld/%d] stopped", (long)sub->done, s); } } } } } return ncclSuccess; } static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0; resources->step = sub->base + sub->nsteps; memset(sub->requests, 0, sizeof(sub->requests)); } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = NCCL_PROTO_SIMPLE; int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS); for (int s=0; snsubs; s++) { int group = s/COLLNET_GROUP_NSUBS; int groupStart = s - (s%COLLNET_GROUP_NSUBS); struct ncclProxySubArgs* sub = args->subs+s; struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); void* mhandle = resources->mhandles[p]; auto reqFifo = resources->reqFifo; char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); // Enforce sync between operations of the same group. 
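// Recap of the reqFifo handshake with sendProxyProgress() (explanatory sketch): the recv proxy
// posts a slot by setting reqFifo[group][slot].turnIsSendNotRecv = true; the send proxy waits for
// that flag, records the expected output size in .size, issues the collnet operation, and clears
// the flag once the network reports completion -- which is the condition the "received" check
// below waits for.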
if (LAST_OF_GROUP(args, s) && (sub->posted < sub->done + calcStepsPerGroup(nGroups)) && (sub->posted < sub->nsteps)) { int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; reqFifo[group][buffSlot].turnIsSendNotRecv = true; TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] posted buffer", (long)sub->posted, group, buffSlot); sub->posted += args->sliceSteps; args->idle = 0; continue; } if (LAST_OF_GROUP(args, s) && (sub->received < sub->posted)) { int buffSlot = (sub->base+sub->received)%NCCL_STEPS; if (!reqFifo[group][buffSlot].turnIsSendNotRecv) { // Buffer is cleared : coll is complete int recvBeg = calcRegionOffset(args, 1, groupStart, sub->received, 0); int recvEnd = calcRegionOffset(args, 1, s, sub->received, 1); int totalSize = recvEnd - recvBeg; TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %d chunkSize=%d", (long)sub->received, group, buffSlot, totalSize, args->chunkSize); sub->received += args->sliceSteps; if ((reqFifo[group][buffSlot].size > 0 || sub->reg) && resources->useGdr && resources->needFlush) { // GDRCOPY support if (resources->gdcFlush) { #if defined (__x86_64__) // Force a PCI-E read from GPU memory asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax"); #else WARN("NET: GDR Flush only supported on x86_64"); return ncclInternalError; #endif } else { if (sub->reg) { size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE); size_t offset = 0; if (args->coll == ncclFuncReduceScatter) { size_t sizePerRank = args->specifics.collnetDirect.sizePerRank; int node = args->specifics.collnetDirect.node; int startNode = sub->offset / sizePerRank; int lastNode = (sub->offset + nBytes) / sizePerRank; if (startNode == node) { offset = sub->offset % sizePerRank; nBytes = std::min(sizePerRank - offset, nBytes); } else if (startNode < node && node < lastNode) { nBytes = sizePerRank; } else if (node == lastNode) { nBytes = (sub->offset + nBytes) % sizePerRank; } else { // no need to flush nBytes = 0; } } NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, sub->recvbuff + offset, nBytes, sub->recvMhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot]) { sub->nbytes -= nBytes; sub->offset += nBytes; if (args->coll == ncclFuncAllGather || args->coll == ncclFuncAllReduce) { sub->recvbuff += nBytes; } } } else { NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region+recvBeg, totalSize, mhandle, sub->requests+buffSlot)); } } } args->idle = 0; continue; } } if (LAST_OF_GROUP(args, s) && (sub->flushed < sub->received)) { // Progress flush operations int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS; int done = 1; if (sub->requests[buffSlot]) NCCLCHECK(proxyState->ncclCollNet->test(sub->requests[buffSlot], &done, NULL)); if (done) { sub->requests[buffSlot] = nullptr; TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] flushed", (long)sub->flushed, group, buffSlot); for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps; args->idle = 0; //continue; } } if (sub->transmitted < sub->flushed) { if (sub->reg == 0) { int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS; volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; connFifo[buffSlot].offset = calcRegionOffset(args, 1, s, sub->transmitted, 0); __sync_synchronize(); } volatile uint64_t* recvTail = resources->gdcSync ? 
resources->gdcSync : &resources->recvMem->tail; *recvTail = sub->base + sub->flushed; if (resources->gdcSync) wc_store_fence(); // Flush out WC write sub->transmitted += args->sliceSteps; args->idle = 0; continue; } // Enforce sync here to make sure the last sub doesn't increase "done" before all others in the group have // reached the same point, otherwise we would start posting buffers to the send proxy before we're done // processing all the shared buffer. bool groupSync = s==0 ? args->subs[args->nsubs-1].done == sub->done : (sub-1)->done > sub->done; volatile uint64_t* sendHead = &resources->sendMem->head; if (groupSync && sub->done < sub->transmitted && (sub->base+sub->done) < *sendHead) { sub->done += args->sliceSteps; args->idle = 0; if (sub->done == sub->nsteps && s == args->nsubs-1) { args->state = ncclProxyOpNone; TRACE(NCCL_NET, "recvProxy [%ld/%d] stopped", (long)sub->done, s); } } } } return ncclSuccess; } struct collnetRegInfo { uintptr_t buffer; size_t size; }; ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) { ncclResult_t ret = ncclSuccess; struct ncclReg *regRecord = NULL; *outRegBufFlag = 0; *outHandle = NULL; if (comm && userbuff && buffSize > 0) { NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); if (regRecord) { if (regRecord->state & COLLNET_REG_COMPLETE) { // reuse previous registration *outRegBufFlag = 2; *outHandle = regRecord->collnetHandle; goto exit; } else { /* start register collnet buffer */ struct collnetRegInfo info = {regRecord->addr, regRecord->pages * comm->regCache.pageSize}; void* handle = NULL; struct ncclProxyConnector* proxyconn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn; NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); if (handle) { regRecord->state |= COLLNET_REG_COMPLETE; regRecord->proxyconn = proxyconn; *outHandle = regRecord->collnetHandle = handle; *outRegBufFlag = 1; } } } } exit: return ret; fail: *outRegBufFlag = 0; *outHandle = NULL; goto exit; } struct ncclCollnetCleanupCallback { struct ncclCommCallback base; struct ncclProxyConnector* proxyConn; void* buffer; size_t size; void* mhandle; }; static ncclResult_t cleanupCollnet(struct ncclComm* comm, struct ncclCommCallback* cb) { struct ncclCollnetCleanupCallback* obj = (struct ncclCollnetCleanupCallback*)cb; NCCLCHECK(ncclCollnetDeregBuffer(comm, obj->proxyConn, obj->mhandle)); INFO(NCCL_REG, "rank %d - deregistered collnet buffer handle %p, size %ld, buff %p", comm->rank, obj->mhandle, obj->size, obj->buffer); free(obj); return ncclSuccess; } ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts) { ncclResult_t ret = ncclSuccess; void* handle = NULL; struct ncclRegCache* cache = &comm->regCache; uintptr_t pageSize = cache->pageSize; uintptr_t addr = (uintptr_t)userbuff & -pageSize; size_t size = DIVUP((uintptr_t)userbuff - addr + buffSize, pageSize) * pageSize; collnetRegInfo info = {addr, size}; struct ncclCollnetCleanupCallback* record = NULL; struct ncclProxyConnector* proxyConn = (type == collNetRecv) ? 
&comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn; *outRegBufFlag = 0; NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback)); record->base.fn = cleanupCollnet; record->proxyConn = proxyConn; record->buffer = (void*)userbuff; record->size = buffSize; *outHandle = record->mhandle = handle; *outRegBufFlag = 1; ncclIntruQueueEnqueue(cleanupQueue, &record->base); *nCleanupQueueElts += 1; exit: return ret; fail: *outRegBufFlag = 0; *outHandle = NULL; goto exit; } ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle) { NCCLCHECK(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgDeregister, &handle, sizeof(void*), NULL, 0)); return ncclSuccess; } static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { void* handle; struct collnetRegInfo* info = (struct collnetRegInfo*)reqBuff; struct sendResources* resources = (struct sendResources*)(connection->transportResources); assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); if (proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle) != ncclSuccess) handle = NULL; memcpy(respBuff, (void*)&handle, sizeof(void*)); *done = 1; return ncclSuccess; } static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { void* handle; struct collnetRegInfo* info = (struct collnetRegInfo*)reqBuff; struct recvResources* resources = (struct recvResources*)(connection->transportResources); assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); if (proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle) != ncclSuccess) handle = NULL; memcpy(respBuff, (void*)&handle, sizeof(void*)); *done = 1; return ncclSuccess; } static ncclResult_t sendProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) { void* handle; struct sendResources* resources = (struct sendResources*)(connection->transportResources); assert(reqSize == sizeof(void*)); memcpy(&handle, reqBuff, sizeof(void*)); NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, handle)); *done = 1; return ncclSuccess; } static ncclResult_t recvProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) { void* handle; struct recvResources* resources = (struct recvResources*)(connection->transportResources); assert(reqSize == sizeof(void*)); memcpy(&handle, reqBuff, sizeof(void*)); NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, handle)); *done = 1; return ncclSuccess; } struct ncclTransport collNetTransport = { "COL", canConnect, { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, 
recvProxyDeregBuffer } }; ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; char line[1024]; if (comm->collNetSupport == 0) goto exit; // Connect Collnet + chain for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_CHAIN], 0), ret, fail); for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_CHAIN], 1), ret, fail); line[0] = '\0'; for (int c = 0; c < comm->nChannels; c++) { struct ncclTree* chain = &comm->channels[c].collnetChain; snprintf(line + strlen(line), 1023 - strlen(line), " [%d] %d->%d->%d", c, chain->down[0], comm->rank, chain->up); } line[1023] = '\0'; INFO(NCCL_INIT, "Connected Collnet Chains %s", line); exit: return ret; fail: goto exit; } ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; int highestTransportType0 = TRANSPORT_UNDEFINED, highestTransportType1 = TRANSPORT_UNDEFINED; if (comm->collNetSupport == 0) goto exit; // Connect intra-node CollNet + Direct for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channelRecv = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 0, &highestTransportType0), ret, fail); for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channelSend = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1, &highestTransportType1), ret, fail); // Exchange highest intra-node transport type among ranks // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer if (highestTransportType0 != TRANSPORT_UNDEFINED && highestTransportType1 != TRANSPORT_UNDEFINED) { int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_UNDEFINED }; comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? 
highestTransportType0 : highestTransportType1; NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail); for (int i = 0; i < comm->localRanks; i++) { if (highestTypes[i] > comm->intraHighestTransportType) comm->intraHighestTransportType = highestTypes[i]; } if (comm->collNetSharedRes->intraHighestTransportType < comm->intraHighestTransportType) comm->collNetSharedRes->intraHighestTransportType = comm->intraHighestTransportType; } else if (comm->intraHighestTransportType == TRANSPORT_UNDEFINED) { // reuse previous shared intraHighestTransportType comm->intraHighestTransportType = comm->collNetSharedRes->intraHighestTransportType; } INFO(NCCL_INIT, "rank %d Connected CollNet", comm->rank); exit: return ret; fail: goto exit; } static ncclResult_t collNetInitRailRankMap(ncclComm_t comm) { int rank = comm->rank; uint64_t nonHeadMask = (1ull << comm->localRanks) - 1; comm->collNetDenseToUserRank = ncclMemoryStackAlloc(&comm->memPermanent, comm->nRanks); comm->collNetUserToDenseRank = ncclMemoryStackAlloc(&comm->memPermanent, comm->nRanks); // initialize collNetUserToDenseRank[rank] comm->collNetUserToDenseRank[rank] = -1; for (int h = 0; h < comm->collNetHeadsNum; h++) { nonHeadMask ^= 1ull << comm->rankToLocalRank[comm->collNetHeads[h]]; if (comm->collNetHeads[h] == rank) { comm->collNetUserToDenseRank[rank] = h; break; } } if (comm->collNetUserToDenseRank[rank] == -1) { comm->collNetUserToDenseRank[rank] = __builtin_popcountll(nonHeadMask & ((1ull << comm->localRank) - 1)); } comm->collNetUserToDenseRank[rank] += comm->node * comm->localRanks; NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->collNetUserToDenseRank, sizeof(int))); for (int r = 0; r < comm->nRanks; r++) { comm->collNetDenseToUserRank[comm->collNetUserToDenseRank[r]] = r; } return ncclSuccess; } ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]) { ncclResult_t ret = ncclSuccess; int rank = comm->rank; int collNetSetupFail = 0; // Find all head ranks int nHeadsUnique = 0; int* headsUnique = NULL; bool share; struct ncclTopoGraph* directGraph = graphs[NCCL_ALGO_COLLNET_DIRECT]; struct collnetShareInfo { int headPosition; int isMaster; }; struct collnetShareInfo* infos = NULL; NCCLCHECKGOTO(ncclCalloc(&headsUnique, directGraph->nChannels), ret, fail); { uint64_t mask = 0; // Head GPU index is always 0 for (int c = 0; c < directGraph->nChannels; c++) { int head = directGraph->intra[c * comm->localRanks + 0]; assert(comm->rankToNode[head] == comm->node); uint64_t mask0 = mask; mask |= 1ull<rankToLocalRank[head]; if (mask != mask0) headsUnique[nHeadsUnique++] = head; } } comm->collNetHeads = headsUnique; comm->collNetHeadsNum = nHeadsUnique; if (parent && parent->collNetSupport && parent->nNodes == comm->nNodes) { if (!parent->config.splitShare) { collNetSetupFail = 1; goto fail; } NCCLCHECKGOTO(ncclCalloc(&infos, comm->nRanks), ret, fail); /* check whether child can share collnet resources of parent. Since parent builds each collnet communicator * based on heads with the same head position in each node, as long as the collnet heads of child comm * can match parent's heads, we can let child communicator share parent's collnet resources. */ for (int h = 0; h < nHeadsUnique; ++h) { int prev = INT_MIN; struct collnetShareInfo* myinfo; share = true; myinfo = infos + comm->rank; memset(myinfo, 0, sizeof(struct collnetShareInfo)); /* find the child head position in parent collnet heads. 
*/ if (headsUnique[h] == comm->rank) { myinfo->headPosition = -1; myinfo->isMaster = 1; for (int th = 0; th < parent->collNetHeadsNum; ++th) if (parent->topParentRanks[parent->collNetHeads[th]] == comm->topParentRanks[comm->rank]) { myinfo->headPosition = th; break; } } NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, infos, sizeof(struct collnetShareInfo)), ret, fail); for (int i = 0; i < comm->nRanks; ++i) { if (infos[i].isMaster) { if (prev == INT_MIN) prev = infos[i].headPosition; if (infos[i].headPosition == -1 || prev != infos[i].headPosition) { share = false; break; } } } if (share) { if (myinfo->isMaster) { comm->collNetSharedRes = parent->collNetSharedRes; for (int c = 0; c < comm->nChannels; ++c) NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail); } NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail); } else { /* TODO: CX-6 and CX-7 both do not support multiple sharp resources per process, if child comm cannot * share the sharp resource from parent, we cannot use sharp in this case. This restriction might be * lifted by sharp plugin/IB hardware in the future. */ collNetSetupFail = 1; if (comm->rank == 0) { WARN("Child comms (nRanks %d) fails to share parent comms (nRanks %d) sharp resources", comm->nRanks, parent->nRanks); } goto fail; } } share = true; } else { /* this allocated buffer will be freed on proxy side */ NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1)); comm->collNetSharedRes->nChannels = comm->nChannels; comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE]; NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail); for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail); for (int h = 0; h < nHeadsUnique; h++) { const int head = headsUnique[h]; ncclConnect connect; collNetSetupFail |= ncclTransportCollNetSetup(comm, directGraph, channel, head, head, h, collNetRecv, &connect); if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, directGraph, channel, head, head, h, collNetSend, &connect); } // Verify CollNet setup across ranks after trying the first channel if (c == 0) { NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); } } share = false; } if (share) { memcpy(comm->collNetSupportMatrix, parent->collNetSupportMatrix, sizeof(comm->collNetSupportMatrix)); } else { do { /* Initialize all entries in collNetSupportMatrix[redop][type]. Since some ranks don't connect to sharp we enable a (redop,type) if any rank claims support. */ uint8_t(*matrix)[4][ncclNumTypes]; bool isHead = false; matrix = nullptr; NCCLCHECKGOTO(ncclCalloc(&matrix, comm->nRanks), ret, matrix_end); for (int h = 0; h < nHeadsUnique; h++) isHead |= (headsUnique[h] == comm->rank); if (isHead) { for (int ty=0; ty < ncclNumTypes; ty++) { for (int op=0; op < 4; op++) { int support = 0; NCCLCHECKGOTO(collNetReduceSupport(comm, (ncclDataType_t)ty, (ncclRedOp_t)op, &support), ret, matrix_end); // bit 0 = not supported, bit 1 = supported matrix[rank][op][ty] = 1<<(support ? 
1 : 0); } } } NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, matrix, sizeof(*matrix)), ret, matrix_end); for (int ty=0; ty < ncclNumTypes; ty++) { for (int op=0; op < 4; op++) { uint8_t accum = 0; for (int r=0; r < comm->nRanks; r++) accum |= matrix[r][op][ty]; // We support (redop, type) if some rank supports it and no rank doesn't support it comm->collNetSupportMatrix[op][ty] = (accum == (1<<1)); } } matrix_end: free(matrix); if (ret != ncclSuccess) goto fail; } while (0); } // Verify CollNet setup across ranks after trying all channels NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); exit: free(infos); return ret; fail: ncclTransportCollNetFree(comm); comm->collNetSupport = 0; goto exit; } nccl-2.22.3-1/src/transport/generic.cc000066400000000000000000000024131463451655400174400ustar00rootroot00000000000000#include "comm.h" #include "transport.h" ncclResult_t ncclTransportRingConnect(struct ncclComm* comm) { ncclResult_t ret = ncclSuccess; if (comm && comm->nRanks > 1) { for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_RING], 0), ret, fail); INFO(NCCL_INIT, "Connected all rings"); } exit: return ret; fail: goto exit; } ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm) { ncclResult_t ret = ncclSuccess; if (comm && comm->nRanks > 1) { // Connect Trees for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail); NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail); INFO(NCCL_INIT, "Connected all trees"); } exit: return ret; fail: goto exit; } nccl-2.22.3-1/src/transport/net.cc000066400000000000000000002016101463451655400166120ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "net.h" #include "graph.h" #include "proxy.h" #include "collectives.h" #include "gdrwrap.h" #include "shm.h" #include "p2p.h" #include "profiler.h" #include "transport.h" static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); #define NCCL_NET_MAP_HOSTMEM 0 #define NCCL_NET_MAP_DEVMEM 1 #define NCCL_NET_MAP_SHARED_HOSTMEM 2 #define NCCL_NET_MAP_SHARED_DEVMEM 3 #define NCCL_NET_MAP_GDCMEM 4 #define NCCL_NET_MAP_MEMS 5 #define NCCL_NET_MAP_MASK_DEVMEM 0x40000000 #define NCCL_NET_MAP_MASK_SHARED 0x80000000 #define NCCL_NET_MAP_MASK_USED 0x20000000 #define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff #define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \ ((mapStruct)->offsets.offsetName >> 30) #define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \ (((mapStruct)->offsets.offsetName >> 29) == 0) #define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \ (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? 
NULL : \ (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET)) #define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \ (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0) #define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { \ int bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; \ if ((shared) == 0) { \ if (dev) { \ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; \ (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += memSize; \ } else { \ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; \ (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += memSize; \ } \ } else { \ (mapStruct)->offsets.offsetName = bank; \ } \ } while (0); struct connectMapMem{ char* gpuPtr; char* cpuPtr; int size; ncclIpcDesc ipcDesc; char shmPath[PATH_MAX]; ncclShmHandle_t attachHandle; ncclShmHandle_t createHandle; }; struct connectMap { int sameProcess; int shared; int cudaDev; // First 3 bits of offsets determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem. struct connectMapMem mems[NCCL_NET_MAP_MEMS]; // Offsets. 3 MSBs indicate mem bank, 111 indicates NULL. struct { uint32_t sendMem; uint32_t recvMem; uint32_t buffs[NCCL_NUM_PROTOCOLS]; } offsets; }; struct sendNetResources { struct connectMap map; void* netSendComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; int tpRank; int tpLocalRank; int tpRemoteRank; int netDev; int useGdr; int useDmaBuf; int maxRecvs; uint64_t* gdcSync; void* gdrDesc; int shared; int channelId; int connIndex; char* buffers[NCCL_NUM_PROTOCOLS]; int buffSizes[NCCL_NUM_PROTOCOLS]; void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; uint64_t llLastCleaning; int netDeviceVersion; ncclNetDeviceType netDeviceType; ncclNetDeviceHandle_t* netDeviceHandle; }; struct recvNetResources { struct connectMap map; void* netListenComm; void* netRecvComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; int tpRank; int tpLocalRank; int tpRemoteRank; int tpRemoteProxyRank; int netDev; int useGdr; int useDmaBuf; int needFlush; int maxRecvs; uint64_t* gdcSync; uint64_t* gdcFlush; void* gdrDesc; int shared; int channelId; int connIndex; char* buffers[NCCL_NUM_PROTOCOLS]; int buffSizes[NCCL_NUM_PROTOCOLS]; void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; uint64_t llLastCleaning; int netDeviceVersion; ncclNetDeviceType netDeviceType; ncclNetDeviceHandle_t* netDeviceHandle; }; /* Determine if two peers can communicate with NET */ static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 1; if (info1->hostHash == info2->hostHash) { // If on the same host, check intra-node net is not disabled. 
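    // *ret was preset to 1 above, i.e. NET is assumed usable between any pair of peers;
    // only when both peers report the same hostHash do we consult the topology, and
    // ncclTopoCheckNet() then decides via *ret whether intra-node NET between the two
    // busIds is still allowed.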
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, ret)); } return ncclSuccess; } NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2); NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1); struct setupReq { int tpRank; int tpLocalRank; int tpRemoteRank; int shared; int netDev; int useGdr; int needFlush; int channelId; int connIndex; }; // Forward declaration static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); /* Determine if we will use this transport for this peer and return connect * information for this peer */ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct setupReq req = { 0 }; int tpProxyRank; send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1; req.channelId = channelId; req.connIndex = connIndex; int proxyRank; int64_t netId; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; tpProxyRank = comm->topParentRanks[proxyRank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn)); req.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; req.tpRank = comm->topParentRanks[myInfo->rank]; req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); if (proxyRank == myInfo->rank) { INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } else { INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } *((int*)connectInfo) = tpProxyRank; return ncclSuccess; } // GDRCOPY support: TAIL_ENABLE When enabled locates the RX proxy tail in CUDA memory NCCL_PARAM(GdrCopySyncEnable, "GDRCOPY_SYNC_ENABLE", 1); // GDRCOPY support: FLUSH_ENABLE When enabled uses a PCI-E read to flush GDRDMA buffers NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0); /* Setup recv connector */ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct setupReq req = { 0 }; recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? 
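      /* -2 is the "unset" default of NCCL_NET_SHARED_BUFFERS (see the NCCL_PARAM above):
         shared buffers default to on (1) for p2p-style connections (graph == NULL) and
         are never used for ring/tree connections that come with a graph. */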
ncclParamNetSharedBuffers() : 1; req.channelId = channelId; req.connIndex = connIndex; // Use myInfo->rank as the receiver uses its own NIC int proxyRank, tpProxyRank; int64_t netId; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr)); // Determine whether we need to flush the GDR buffer on recv or not if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); // We don't support PXN on receive yet tpProxyRank = comm->topParentRanks[myInfo->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn)); req.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; req.tpRank = comm->topParentRanks[myInfo->rank]; req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); return ncclSuccess; } static ncclResult_t netMapShm(struct connectMapMem* mem) { mem->cpuPtr = NULL; mem->gpuPtr = NULL; NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, -1, &mem->attachHandle)); return ncclSuccess; } static ncclResult_t netCreateShm(struct connectMapMem* mem) { mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1, &mem->createHandle)); return ncclSuccess; } static ncclResult_t netDumpMap(struct connectMap* map) { printf("Dump map same process %d shared %d\n", map->sameProcess, map->shared); struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM; printf("Mem 0: Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_DEVMEM; printf("Mem 1: Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM; printf("Mem 2: Shared Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM; printf("Mem 3: Shared Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0, NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET, NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem)); printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 1 : 0, NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET, NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem)); for (int p=0; p Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p, map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 
1 : 0, NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET, NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p])); } printf("End of dump\n"); return ncclSuccess; } struct netSendConnectArgs { ncclNetHandle_t handle; }; struct netRecvConnectArgs { int proxyRank; }; static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { struct connectMap* map = (connectMap*) send->transportResources; void* opId; // map isn't allocated thus this op hasn't been submitted yet if (!map) { // Setup device pointers NCCLCHECK(ncclCalloc(&map, 1)); send->transportResources = map; opId = send; INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId); netSendConnectArgs args = {0}; memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t)); NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId)); } else { opId = send; } ncclResult_t ret; ret = ncclPollProxyResponse(comm, &send->proxyConn, map, opId); if (ret != ncclSuccess) { if (ret != ncclInProgress) { free(map); send->transportResources = NULL; } return ret; } INFO(NCCL_PROXY, "sendConnect ncclPollProxyResponse opId=%p", opId); if (map->sameProcess && !ncclCuMemEnable()) { if (map->cudaDev != comm->cudaDev) { // Enable P2P access for Legacy IPC cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0); if (err == cudaErrorPeerAccessAlreadyEnabled) { cudaGetLastError(); } else if (err != cudaSuccess) { WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err)); return ncclInternalError; } } } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) { if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr = NULL; NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank, map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = NULL; } if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) { void** sharedDevMemPtr = comm->proxyState->sharedDevMems + send->proxyConn.tpLocalRank; if (*sharedDevMemPtr == NULL) { map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = NULL; NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size, &map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc, sharedDevMemPtr)); } map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = (char*)(*sharedDevMemPtr); map->mems[NCCL_NET_MAP_SHARED_DEVMEM].cpuPtr = NULL; } } //NCCLCHECK(netDumpMap(map)); struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; send->conn.head = gdcMem ? (uint64_t*)gdcMem : &sendMem->head; struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); send->conn.tail = &recvMem->tail; send->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; send->conn.connFifo = recvMem->connFifo; // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree for (int i=0; iconn.connFifo[i].offset = -1; recvMem->connFifo[i].mode = map->shared ? 
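    /* Shared-buffer connections pass per-slot offsets into the shared pool through the
       fifo (NCCL_MODE_OFFSET); dedicated ring/tree buffers keep the fixed
       slot-per-step layout (NCCL_MODE_NORMAL). */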
NCCL_MODE_OFFSET : NCCL_MODE_NORMAL; } for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); if (send->proxyConn.sameProcess) { if (send->proxyConn.connection->netDeviceHandle) { send->conn.netDeviceHandle = *send->proxyConn.connection->netDeviceHandle; for (int p=0; pconn.mhandles[p] = send->proxyConn.connection->mhandles[p]; } if (send->proxyConn.connection->needsProxyProgress) { send->proxyConn.proxyProgress = sendProxyProgress; } else { send->proxyConn.proxyProgress = NULL; } } else { send->proxyConn.proxyProgress = sendProxyProgress; } return ncclSuccess; } // Forward declare static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); /* Connect to this peer */ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { struct connectMap* map = (connectMap*) recv->transportResources; void* opId; if (!map) { NCCLCHECK(ncclCalloc(&map, 1)); recv->transportResources = map; // Use recv connector as unique identifier opId = recv; INFO(NCCL_PROXY, "recvConnect ncclProxyCallAsync opId=%p &recv->proxyConn=%p connectInfo=%p", opId, &recv->proxyConn, connectInfo); netRecvConnectArgs args = {0}; args.proxyRank = *((int*)connectInfo); NCCLCHECK(ncclProxyCallAsync(comm, &recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(netRecvConnectArgs), sizeof(struct connectMap), opId)); } else { opId = recv; } ncclResult_t ret; NCCLCHECK(ret = ncclPollProxyResponse(comm, &recv->proxyConn, map, opId)); if (ret != ncclSuccess) { if (ret != ncclInProgress) { free(map); recv->transportResources = NULL; } return ret; } INFO(NCCL_PROXY, "recvConnect ncclPollProxyResponse opId=%p", opId); //NCCLCHECK(netDumpMap(map)); struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); recv->conn.head = &sendMem->head; struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail; recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; recv->conn.connFifo = recvMem->connFifo; // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree for (int i=0; iconnFifo[i].mode = map->shared ? 
NCCL_MODE_OFFSET : NCCL_MODE_NORMAL; } for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); if (recv->proxyConn.sameProcess) { if (recv->proxyConn.connection->netDeviceHandle) { recv->conn.netDeviceHandle = *recv->proxyConn.connection->netDeviceHandle; for (int p=0; pconn.mhandles[p] = recv->proxyConn.connection->mhandles[p]; } if (recv->proxyConn.connection->needsProxyProgress) { recv->proxyConn.proxyProgress = recvProxyProgress; } else { recv->proxyConn.proxyProgress = NULL; } } else { recv->proxyConn.proxyProgress = recvProxyProgress; } return ncclSuccess; } static ncclResult_t sendFree(struct ncclConnector* send) { struct connectMap* map = (struct connectMap*)(send->transportResources); if (map) { int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); if (map->sameProcess && map->cudaDev == cudaDev) { // Our own GPU, so it wasn't mapped in free(map); return ncclSuccess; } if (!map->sameProcess || ncclCuMemEnable()) { if (!map->sameProcess) NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { if (ncclCuMemEnable()) { // cuMem API support NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc)); NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } else { // Legacy CUDA IPC support CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } } } free(map); } return ncclSuccess; } static ncclResult_t recvFree(struct ncclConnector* recv) { if (recv->transportResources) free(recv->transportResources); return ncclSuccess; } #define NCCL_SHARED_STEPS 16 static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int cuda, int tpLocalRank, int type, int sameProcess, int nChannels, char** gpuPtr, char** cpuPtr, int* size, ncclIpcDesc *ipcDesc) { if (cuda == 0 && sameProcess == 0) { WARN("PXN should not use host buffers for data"); return ncclInternalError; } struct ncclProxyProgressState* progressState = &proxyState->progressState; if (progressState->localPeers == NULL) { NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks)); } struct ncclProxyPeer** localPeers = progressState->localPeers; if (localPeers[tpLocalRank] == NULL) { NCCLCHECK(ncclCalloc(localPeers + tpLocalRank, 1)); } struct ncclProxyPeer* peer = localPeers[tpLocalRank]; struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv; state->refcount++; if (state->size == 0) { state->size = nChannels * NCCL_SHARED_STEPS * proxyState->p2pChunkSize; } if (size) *size = state->size; if (cuda && state->cudaBuff == NULL) { if (sameProcess == 0 || ncclCuMemEnable()) { NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff)); } else { NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size)); } } if (!cuda && state->hostBuff == NULL) { NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size)); } if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff; if (gpuPtr) *gpuPtr = sameProcess ? *cpuPtr : NULL; if (ipcDesc) memcpy(ipcDesc, &state->ipcDesc, sizeof(state->ipcDesc)); return ncclSuccess; } static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset, int* size) { // Use different pools for different channels and also separate send/recv. 
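  // Pool layout: nChannels * NCCL_SHARED_STEPS chunks of proxyState->p2pChunkSize bytes
  // each (sized in sharedNetBuffersInit above), one chunk per (channel, slot) pair.
  // For example, assuming a hypothetical p2pChunkSize of 512K, channel 2 / slot 5 maps
  // to globalSlot = 2*16 + 5 = 37 and offset = 37 * 512K.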
int globalSlot = (channel*NCCL_SHARED_STEPS)+slot; *offset = proxyState->p2pChunkSize * globalSlot; if (size) *size = proxyState->p2pChunkSize; return ncclSuccess; } static ncclResult_t sharedNetBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) { if (proxyState->progressState.localPeers == NULL) NCCLCHECK(ncclInternalError); struct ncclProxyPeer* peer = proxyState->progressState.localPeers[tpLocalRank]; if (peer == NULL) NCCLCHECK(ncclInternalError;) struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv; if (state->size == 0) NCCLCHECK(ncclInternalError); if (ncclAtomicRefCountDecrement(&state->refcount) == 0) { if (state->cudaBuff) { if (!connection->sameProcess || ncclCuMemEnable()) { NCCLCHECK(ncclP2pFreeShareableBuffer(&state->ipcDesc)); } NCCLCHECK(ncclCudaFree(state->cudaBuff)); } if (state->hostBuff) NCCLCHECK(ncclCudaHostFree(state->hostBuff)); } if (peer->send.refcount || peer->recv.refcount) return ncclSuccess; free(peer); proxyState->progressState.localPeers[tpLocalRank] = NULL; for (int r = 0; r < proxyState->tpLocalnRanks; r++) { if (proxyState->progressState.localPeers[r]) return ncclSuccess; } // All peers are freed, free array free(proxyState->progressState.localPeers); proxyState->progressState.localPeers = NULL; return ncclSuccess; } static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels) { NCCLCHECK(sharedNetBuffersInit(proxyState, 1, connection->tpLocalRank, 0, connection->sameProcess, nChannels, NULL, NULL, NULL, NULL)); return ncclSuccess; } static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct setupReq* req = (struct setupReq*) reqBuff; if (reqSize != sizeof(struct setupReq)) return ncclInternalError; struct sendNetResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; resources->tpRank = req->tpRank; resources->tpLocalRank = req->tpLocalRank; resources->tpRemoteRank = req->tpRemoteRank; resources->netDev = req->netDev; resources->shared = connection->shared = req->shared; resources->useGdr = req->useGdr; resources->channelId = req->channelId; resources->connIndex = req->connIndex; ncclNetProperties_t props; NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props)); /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; resources->netDeviceVersion = props.netDeviceVersion; resources->netDeviceType = props.netDeviceType; resources->netDeviceVersion = props.netDeviceVersion; resources->netDeviceType = props.netDeviceType; // We don't return any data if (respSize != 0) return ncclInternalError; *done = 1; return ncclSuccess; } static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct setupReq* req = (struct setupReq*) reqBuff; if (reqSize != sizeof(struct setupReq)) return ncclInternalError; struct recvNetResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; resources->tpRank = req->tpRank; resources->tpLocalRank = req->tpLocalRank; resources->tpRemoteRank = req->tpRemoteRank; resources->netDev = req->netDev; resources->shared = 
connection->shared = req->shared; resources->useGdr = req->useGdr; resources->needFlush = req->needFlush; resources->channelId = req->channelId; resources->connIndex = req->connIndex; ncclNetProperties_t props; NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props)); /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; resources->netDeviceVersion = props.netDeviceVersion; resources->netDeviceType = props.netDeviceType; if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm)); *done = 1; return ncclSuccess; } // This function embeds plugin-specific rules given the current versions static ncclResult_t ncclNetGetDeviceHandle(ncclNetDeviceType type, int version, bool isRecv, ncclNetDeviceHandle_t** handle) { bool needsDeviceHandle = false; if (type == NCCL_NET_DEVICE_UNPACK) { if (version == NCCL_NET_DEVICE_UNPACK_VERSION && isRecv) { needsDeviceHandle = true; } } // Don't re-alloc netDeviceHandles if (needsDeviceHandle && (*handle == NULL)) { NCCLCHECK(ncclCalloc(handle, 1)); (*handle)->netDeviceType = type; (*handle)->netDeviceVersion = version; } else if (!needsDeviceHandle) { *handle = NULL; } return ncclSuccess; } static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); if (reqSize != sizeof(netSendConnectArgs)) return ncclInternalError; ncclResult_t ret = ncclSuccess; netSendConnectArgs* req = (netSendConnectArgs*) reqBuff; NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, false /*isRecv*/, &resources->netDeviceHandle)); if (resources->shared) { // Shared buffers struct ncclProxyProgressState* progressState = &proxyState->progressState; if (progressState->localPeers == NULL) { NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks)); } struct ncclProxyPeer** localPeers = progressState->localPeers; if (localPeers[resources->tpLocalRank] == NULL) { NCCLCHECK(ncclCalloc(localPeers + resources->tpLocalRank, 1)); } connection->proxyAppendPtr = localPeers[resources->tpLocalRank]->send.proxyAppend + resources->channelId; if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { // Connect or reuse connection for a netdev/remote rank. 
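      /* Only attempted when the plugin advertises multi-receive (maxRecvs > 1) and
         NCCL_NET_SHARED_COMMS is enabled (default 1, see ncclParamNetSharedComms above):
         connections targeting the same remote rank over the same netDev then share one
         send comm per channel out of progressState->netComms, and sendRefCount ensures
         it is closed exactly once. */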
if (progressState->netComms[resources->netDev] == NULL) { NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank; if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); } } else { // Connect to remote peer ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); connection->proxyAppendPtr = &connection->proxyAppend; } NCCLCHECK(ret); if (resources->netSendComm == NULL) { *done = 0; return ncclInProgress; } *done = 1; if (resources->netDeviceHandle) { connection->netDeviceHandle = resources->netDeviceHandle; connection->needsProxyProgress = connection->netDeviceHandle->needsProxyProgress; } else { connection->needsProxyProgress = 1; } // Create structures struct connectMap* map = &resources->map; map->sameProcess = connection->sameProcess; map->shared = resources->shared; CUDACHECK(cudaGetDevice(&map->cudaDev)); if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p for (int p=0; puseGdr, proxyState->buffSizes[p], buffs[p]); resources->buffSizes[p] = proxyState->buffSizes[p]; } } else { // Get shared buffers int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedNetBuffersInit( proxyState, resources->useGdr, resources->tpLocalRank, 0, map->sameProcess, proxyState->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipcDesc)); resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; if (proxyState->allocP2pNetLLBuffers) { NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*p == NCCL_PROTO_LL*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL]; } NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { if (resources->shared == 0) { if (!map->sameProcess || ncclCuMemEnable()) { ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN); NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } else { NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size)); } map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr; } } if (map->sameProcess) { NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; } else { NCCLCHECK(netCreateShm(map->mems+NCCL_NET_MAP_HOSTMEM)); } if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) { uint64_t *cpuPtr, *gpuPtr; NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, 
&resources->gdrDesc)); resources->gdcSync = cpuPtr; struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; gdcMem->cpuPtr = (char*)cpuPtr; gdcMem->gpuPtr = (char*)gpuPtr; gdcMem->size = sizeof(uint64_t); // sendMem->head } resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); // Don't give credits yet in shared mode. (resources->gdcSync ? *resources->gdcSync : resources->sendMem->head) = (map->shared ? -NCCL_STEPS : 0); for (int i=0; irecvMem->connFifo[i].size = -1; for (int p=0; pbuffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); if (resources->buffers[p]) { #if CUDA_VERSION >= 11070 /* DMA-BUF support */ int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif { NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); } // Copy the mhandle dptr, if implemented if (resources->netDeviceHandle && proxyState->ncclNet->getDeviceMr) NCCLCHECK(proxyState->ncclNet->getDeviceMr(resources->netSendComm, resources->mhandles[p], &connection->mhandles[p])); } } //NCCLCHECK(netDumpMap(map)); if (respSize != sizeof(struct connectMap)) return ncclInternalError; memcpy(respBuff, map, sizeof(struct connectMap)); return ncclSuccess; } static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(netRecvConnectArgs)) return ncclInternalError; struct recvNetResources* resources = (struct recvNetResources*)(connection->transportResources); netRecvConnectArgs* req = (netRecvConnectArgs*) reqBuff; resources->tpRemoteProxyRank = req->proxyRank; ncclResult_t ret = ncclSuccess; NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, true /*isRecv*/, &resources->netDeviceHandle)); // Finish connection establishment from remote peer if (resources->shared) { // Shared buffers struct ncclProxyProgressState* progressState = &proxyState->progressState; if (progressState->localPeers == NULL) { NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks)); } struct ncclProxyPeer** localPeers = progressState->localPeers; if (localPeers[resources->tpLocalRank] == NULL) { NCCLCHECK(ncclCalloc(localPeers + resources->tpLocalRank, 1)); } connection->proxyAppendPtr = localPeers[resources->tpLocalRank]->recv.proxyAppend + resources->channelId; if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { // Connect or reuse connection for a netdev/remote rank. 
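      /* Mirror of the send-side sharing above, but keyed on the remote proxy rank and
         established with accept() on the listen comm rather than connect();
         recvRefCount plays the same role as sendRefCount at teardown. */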
if (progressState->netComms[resources->netDev] == NULL) { NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteProxyRank; if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId, &resources->netDeviceHandle); resources->netRecvComm = comms->recvComm[resources->channelId]; if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; } else { ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle); } } else { // Connect to remote peer ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle); connection->proxyAppendPtr = &connection->proxyAppend; } NCCLCHECK(ret); if (resources->netRecvComm == NULL) { *done = 0; return ncclInProgress; } *done = 1; if (resources->netDeviceHandle) { connection->netDeviceHandle = resources->netDeviceHandle; connection->needsProxyProgress = connection->netDeviceHandle->needsProxyProgress; } else { connection->needsProxyProgress = 1; } NCCLCHECK(proxyState->ncclNet->closeListen(resources->netListenComm)); // Create structures struct connectMap* map = &resources->map; map->sameProcess = connection->sameProcess; if (map->sameProcess == 0) return ncclInternalError; // We don't support remote proxy for recv map->shared = resources->shared; if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p for (int p=0; puseGdr, proxyState->buffSizes[p], buffs[p]); resources->buffSizes[p] = proxyState->buffSizes[p]; } } else { // Get shared buffers int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedNetBuffersInit( proxyState, resources->useGdr, resources->tpLocalRank, 1, 1, proxyState->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL)); resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); if (proxyState->allocP2pNetLLBuffers) { NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL]; } if (map->mems[NCCL_NET_MAP_DEVMEM].size) { if (resources->shared == 0) { if (ncclCuMemEnable()) { NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } else { NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size)); } map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr; } } NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; if (ncclGdrCopy && map->sameProcess) { uint64_t *cpuPtr, *gpuPtr; NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc)); if (ncclParamGdrCopySyncEnable()) { resources->gdcSync = cpuPtr; struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; gdcMem->cpuPtr = (char*)cpuPtr; gdcMem->gpuPtr = (char*)gpuPtr; gdcMem->size = sizeof(uint64_t); } if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1; } resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); for (int p=0; pbuffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); if (resources->buffers[p]) { #if CUDA_VERSION >= 11070 /* DMA-BUF support */ int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif { NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); } // Copy the mhandle dptr if (resources->netDeviceType != NCCL_NET_DEVICE_HOST && proxyState->ncclNet->getDeviceMr) NCCLCHECK(proxyState->ncclNet->getDeviceMr(resources->netRecvComm, resources->mhandles[p], &connection->mhandles[p])); } } //NCCLCHECK(netDumpMap(map)); if (respSize != sizeof(struct connectMap)) return ncclInternalError; memcpy(respBuff, map, sizeof(struct connectMap)); return ncclSuccess; } static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); if (connection->state == connSharedInitialized) { // NVB Preconnect NCCLCHECK(sharedNetBuffersDestroy(proxyState, connection->tpLocalRank, 0, connection)); return ncclSuccess; } if (connection->state == connConnected) { for (int p=0; pbuffers[p]) { NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; if (resources->map.sameProcess) { NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); } else { NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].createHandle)); } NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); if (!resources->map.sameProcess || ncclCuMemEnable()) { // cuMem API support if (mems[NCCL_NET_MAP_DEVMEM].size) { NCCLCHECK(ncclP2pFreeShareableBuffer(&mems[NCCL_NET_MAP_DEVMEM].ipcDesc)); } } if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); if (resources->shared) { NCCLCHECK(sharedNetBuffersDestroy(proxyState, resources->tpLocalRank, 0, connection)); if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev]+resources->tpRemoteRank; comms->sendRefCount[resources->channelId]--; if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeSend(comms->sendComm[resources->channelId])); } else { NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm)); } } else { NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm)); } } if (resources) free(resources); return ncclSuccess; } static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct recvNetResources* resources = (struct recvNetResources*)(connection->transportResources); if (connection->state == connSharedInitialized) { // NVB Preconnect NCCLCHECK(sharedNetBuffersDestroy(proxyState, connection->tpLocalRank, 1, connection)); return ncclSuccess; } if (connection->state == connConnected) { for (int p=0; pbuffers[p]) { NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); if (!resources->map.sameProcess || ncclCuMemEnable()) { // cuMem API support if (mems[NCCL_NET_MAP_DEVMEM].size) { NCCLCHECK(ncclP2pFreeShareableBuffer(&mems[NCCL_NET_MAP_DEVMEM].ipcDesc)); } } if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); if (resources->shared) { NCCLCHECK(sharedNetBuffersDestroy(proxyState, resources->tpLocalRank, 1, connection)); if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev] + 
resources->tpRemoteProxyRank; comms->recvRefCount[resources->channelId]--; if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeRecv(comms->recvComm[resources->channelId])); } else { NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm)); } } else { NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm)); } } if (resources) free(resources); return ncclSuccess; } static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps"); #define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); // Set step base for next op resources->step = sub->base + sub->nsteps; sub->posted = sub->transmitted = sub->done = 0; for (uint64_t step=0; stepnsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin); if (sub->reg && sub->nbytes > 0) { NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle)); } else { sub->mhandle = resources->mhandles[args->protocol]; } } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs); for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; if (sub->done == sub->nsteps) continue; struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources); volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); // Post buffers to the GPU if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) { int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; if (resources->shared) { if (!sub->reg) { int sharedBuffSlot = sub->posted%maxDepth; int offset; NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset, NULL)); resources->recvMem->connFifo[buffSlot].offset = offset; __sync_synchronize(); } volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; sub->posted += args->sliceSteps; // Only post one credit for registered buffer if (sub->reg == 0 || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } else sub->posted += args->sliceSteps; for (uint64_t step=sub->posted-args->sliceSteps; stepposted; step++) { ncclProfilingRecord(args, s, step, ncclProxyProfileSendGPUWait); } args->idle = 0; continue; } // Check whether we received data from the GPU and send it to the network if (sub->transmitted < sub->posted && sub->transmitted < sub->done + NCCL_STEPS) { int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; volatile uint64_t* recvTail = &resources->recvMem->tail; uint64_t tail = sub->base + (sub->reg ? 
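          /* Registered sends get a single tail bump from the GPU, so compare against
             sub->base only; normal sends wait for the GPU tail to pass base + transmitted. */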
0 : sub->transmitted); if ((sub->reg || connFifo[buffSlot].size != -1) && ((*recvTail > tail) || p == NCCL_PROTO_LL)) { // We have something to receive, let's check if it's completely ready. int size = sub->reg ? std::min(MAX_NET_SIZE, sub->nbytes) : connFifo[buffSlot].size; bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared; char* buff = shared ? localBuff+connFifo[buffSlot].offset : localBuff+buffSlot*stepSize; int ready = 1; if (p == NCCL_PROTO_LL128) { ready = resources->useGdr; if (!ready) { // When data is in sysmem, we need to wait until all flags are correct since the GPU only // called threadfence() uint64_t flag = sub->base+sub->transmitted+1; int nFifoLines = DIVUP(connFifo[buffSlot].size, sizeof(uint64_t)*NCCL_LL128_LINEELEMS); volatile uint64_t* lines = (volatile uint64_t*)buff; ready = 1; for (int i=0; ibase+sub->transmitted+1); int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff; for (int i=0; ishared) { buff = sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset; } if (ready) { // Data is ready, try to send. NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->mhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p, size %d, proto %d, myRank %d, channelId %d", sub->transmitted, buffSlot, sub->requests[buffSlot], size, p, proxyState->tpRank, sub->channelId); sub->transmitted += args->sliceSteps; for (uint64_t step=sub->transmitted-args->sliceSteps; steptransmitted; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileSendWait); args->idle = 0; continue; } } } } // Check whether the network has completed some send operations. if (sub->done < sub->transmitted) { int done; int size; int buffSlot = (sub->base+sub->done)%NCCL_STEPS; NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, &size)); if (done) { if (sub->reg) { if (size < sub->nbytes) { sub->recvbuff += size; sub->nbytes -= size; // Do one more step (at least) sub->nsteps++; } else { // Signal the GPU the send is complete and it can return. connFifo[sub->base%NCCL_STEPS].size = -1; } } // Make sure size is reset to -1 before we update the head. if (sub->reg == 0) connFifo[buffSlot].size = -1; __sync_synchronize(); TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]); sub->done += args->sliceSteps; for (uint64_t step=sub->done-args->sliceSteps; stepdone; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd); if (resources->shared == 0) { volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; if (sub->reg) { // We may have added more net steps, but reg operations only have a single step w.r.t. the GPU. if (sub->done == sub->nsteps) *sendHead = sub->base + args->sliceSteps; } else { *sendHead = sub->base + sub->done; } if (resources->gdcSync) wc_store_fence(); // Flush out WC write } args->idle = 0; if (sub->done == sub->nsteps) { if (sub->reg && sub->nbytes > 0) { NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, sub->mhandle)); } args->done++; } } } } if (args->done == args->nsubs) { args->state = ncclProxyOpNone; } } return ncclSuccess; } static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { // Initialize subs and group them by same recvComm. 
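    /* Subs that resolve to the same netRecvComm are swapped to be adjacent and grouped
       up to the plugin's maxRecvs; each group later posts one multi-receive (a single
       irecv covering subCount buffers), and the loops below advance by
       args->subs[s].groupSize. */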
void* recvComm; int groupSize = 0; int maxRecvs = 1; for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; if (groupSize == maxRecvs) { groupSize = 0; } else if (s>0) { // Find next sub with the same recvComm int next; for (next=s; nextnsubs; next++) { struct recvNetResources* nextRes = (struct recvNetResources*) (args->subs[next].connection->transportResources); if (nextRes->netRecvComm == recvComm) break; } if (next == args->nsubs) { // Not found groupSize = 0; } else if (s != next) { // We found a sub later with the same recvComm ; swap subs struct ncclProxySubArgs temp; memcpy(&temp, sub, sizeof(struct ncclProxySubArgs)); memcpy(sub, args->subs+next, sizeof(struct ncclProxySubArgs)); memcpy(args->subs+next, &temp, sizeof(struct ncclProxySubArgs)); } } groupSize++; struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); maxRecvs = resources->maxRecvs; recvComm = resources->netRecvComm; // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); // Set step base for next op resources->step = sub->base + sub->nsteps; sub->posted = sub->received = sub->transmitted = sub->done = 0; for (int i=0; insteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin); if (sub->reg && sub->nbytes > 0) { // Register buffer NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle)); } else { sub->mhandle = resources->mhandles[args->protocol]; } } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs); for (int s=0; snsubs; s+=args->subs[s].groupSize) { struct ncclProxySubArgs* subGroup = args->subs+s; int subCount = 0; void* ptrs[NCCL_PROXY_MAX_SUBS]; int sizes[NCCL_PROXY_MAX_SUBS]; int tags[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; if (sub->posted < sub->nsteps) { if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; } struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); if (sub->reg) maxDepth = 1; int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; if (p == NCCL_PROTO_SIMPLE && resources->shared) { if (sub->reg) { // Wait until CUDA kernel has started before we access the user buffer directly. 
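            /* The slot's size stays at -1 until the kernel marks the user buffer as
               live; only then is it safe to post the receive directly into sub->recvbuff
               (clamped to MAX_NET_SIZE per network step), bypassing the staging buffer. */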
if (connFifo[sub->base%NCCL_STEPS].size == -1) continue; ptrs[subCount] = sub->recvbuff; sizes[subCount] = std::min(MAX_NET_SIZE, sub->nbytes); } else { int sharedBuffSlot = sub->posted%maxDepth; int offset; NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset, sizes+subCount)); connFifo[buffSlot].offset = offset; ptrs[subCount] = localBuff+offset; } } else { ptrs[subCount] = localBuff+buffSlot*stepSize; sizes[subCount] = stepSize*args->sliceSteps; } if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; tags[subCount] = resources->tpRemoteRank; mhandles[subCount] = sub->mhandle; subCount++; } } if (subCount) { uint64_t step = subGroup->posted; struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources); void** requestPtr = subGroup->requests+(step%NCCL_STEPS); NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); if (*requestPtr) { subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr; subGroup->recvRequestsSubCount = subCount; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup+i; sub->posted += args->sliceSteps; for (uint64_t step=sub->posted-args->sliceSteps; stepposted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait); } args->idle = 0; } } } if (args->idle == 0) return ncclSuccess; for (int s=0; snsubs; s+=args->subs[s].groupSize) { struct ncclProxySubArgs* subGroup = args->subs+s; if (subGroup->posted > subGroup->received) { uint64_t step = subGroup->received; int done; void* ptrs[NCCL_PROXY_MAX_SUBS]; int sizes[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; incclNet->test(subGroup->requests[step%NCCL_STEPS], &done, sizes)); if (done) { int needFlush = 0; int totalSize = 0; int subIndex = 0; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; if (sub->received < sub->nsteps) { int size = sizes[subIndex++]; if (sub->reg) { if (size < sub->nbytes) { sub->recvbuff += size; sub->nbytes -= size; // Do one more step (at least) sub->nsteps++; } else { // Reset connFifo size indicating the GPU was ready to receive. // There is a __sync_synchronize() later to ensure it is reset before it is set again by the GPU. 
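              /* Final chunk of this registered receive: put the slot back to -1 so the
                 next operation restarts the handshake from "not ready". Partial chunks
                 instead advance recvbuff/nbytes and add an extra step in the branch
                 just above. */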
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; connFifo[sub->base%NCCL_STEPS].size = -1; } } } sub->received += args->sliceSteps; for (uint64_t step=sub->received-args->sliceSteps; stepreceived; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait); if (step < sub->nsteps) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); if (resources->useGdr) needFlush |= resources->needFlush; } } subGroup->requests[step%NCCL_STEPS] = NULL; if (totalSize > 0 && p == NCCL_PROTO_SIMPLE && needFlush) { // GDRCOPY support struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources); if (resources->gdcFlush) { #if defined (__x86_64__) // Force a PCI-E read from GPU memory asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax"); #else WARN("NET: GDR Flush only supported on x86_64"); return ncclInternalError; #endif } else { int subCount = 0; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; if (step < sub->nsteps) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSlot = (sub->base+sub->received-args->sliceSteps)%NCCL_STEPS; ptrs[subCount] = resources->shared ? (sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset) : localBuff+buffSlot*stepSize; mhandles[subCount] = sub->mhandle; subCount++; } } struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources); NCCLCHECK(proxyState->ncclNet->iflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); } } args->idle = 0; } } } if (args->idle == 0) return ncclSuccess; for (int s=0; snsubs; s+=args->subs[s].groupSize) { struct ncclProxySubArgs* subGroup = args->subs+s; if (subGroup->received > subGroup->transmitted) { uint64_t step = subGroup->transmitted; int done = 1; void* request = subGroup->requests[step%NCCL_STEPS]; if (request) NCCLCHECK(proxyState->ncclNet->test(request, &done, NULL)); if (done) { for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; sub->transmitted += args->sliceSteps; for (uint64_t step=sub->transmitted-args->sliceSteps; steptransmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait); if (step < sub->nsteps) { __sync_synchronize(); struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail; if (sub->reg) { // We may have added more net steps, but reg operations only have a single step w.r.t. the GPU. 
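// [Illustrative note, editorial - not NCCL code] About the GDRCOPY flush a few lines above:
// once the NIC has written received data into GPU memory (GDR), the proxy forces a PCIe read of
// GPU memory before telling the GPU the data is visible; a read cannot complete ahead of the
// posted writes it follows, so it acts as a flush. The inline asm above does exactly that while
// guaranteeing the load is not optimized away. Equivalent idea in plain C, assuming a
// GDRCopy-style CPU mapping of GPU memory:
#if 0  // sketch only, never compiled
#include <stdint.h>
static inline void pcieReadFlush(const volatile uint32_t* gdrMappedWord) {
  (void)*gdrMappedWord;   // volatile load over PCIe; when it returns, prior writes are placed
}
#endif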
if (sub->transmitted == sub->nsteps) *recvTail = sub->base + args->sliceSteps; } else *recvTail = sub->base + sub->transmitted; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } } args->idle = 0; } } } if (args->idle == 0) return ncclSuccess; for (int s=0; snsubs; s+=args->subs[s].groupSize) { struct ncclProxySubArgs* subGroup = args->subs+s; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; if (sub->done == sub->nsteps) continue; if (sub->transmitted > sub->done) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); volatile uint64_t* sendHead = &resources->sendMem->head; uint64_t done = sub->reg ? sub->base + sub->nsteps : *sendHead; while (done > sub->base + sub->done && // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted. sub->transmitted > sub->done) { if (subGroup->recvRequestsCache[sub->done%NCCL_STEPS]) { // the multirecv requests are only cached in the first sub. if (proxyState->ncclNet->irecvConsumed) NCCLCHECK(proxyState->ncclNet->irecvConsumed(resources->netRecvComm, subGroup->recvRequestsSubCount, subGroup->recvRequestsCache[sub->done%NCCL_STEPS])); subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL; } sub->done += args->sliceSteps; for (uint64_t step=sub->done-args->sliceSteps; stepdone; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd); args->idle = 0; if (sub->done == sub->nsteps) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); if (sub->reg && sub->nbytes > 0) { NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, sub->mhandle)); } args->done++; break; } } } } } if (args->done == args->nsubs) { args->state = ncclProxyOpNone; } } return ncclSuccess; } struct ncclTransport netTransport = { "NET", canConnect, { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, NULL }, { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, NULL } }; nccl-2.22.3-1/src/transport/net_ib.cc000066400000000000000000002363361463451655400173010ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #include "core.h" #include "socket.h" #include "net.h" #include "graph.h" #include "utils.h" #include "param.h" #include #include #include #include #include #include #include #include #define ENABLE_TIMER 0 #include "timer.h" #include "ibvwrap.h" #define MAXNAMESIZE 64 static char ncclIbIfName[MAX_IF_NAME_SIZE+1]; static union ncclSocketAddress ncclIbIfAddr; struct ncclIbMr { uintptr_t addr; size_t pages; int refs; ibv_mr *mr; }; struct ncclIbMrCache { struct ncclIbMr *slots; int capacity, population; }; static int ncclNMergedIbDevs = -1; #define NCCL_IB_MAX_DEVS_PER_NIC 2 #define MAX_MERGED_DEV_NAME (MAXNAMESIZE*NCCL_IB_MAX_DEVS_PER_NIC)+NCCL_IB_MAX_DEVS_PER_NIC struct alignas(64) ncclIbMergedDev { int ndevs; int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs int speed; char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+' }; static int ncclNIbDevs = -1; struct alignas(64) ncclIbDev { pthread_mutex_t lock; int device; uint64_t guid; uint8_t portNum; uint8_t link; int speed; ibv_context* context; int pdRefs; ibv_pd* pd; char devName[MAXNAMESIZE]; char* pciPath; int realPort; int maxQp; struct ncclIbMrCache mrCache; int ar; // ADAPTIVE_ROUTING struct ibv_port_attr portAttr; }; #define MAX_IB_DEVS 32 struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_DEVS]; struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; static int ncclIbRelaxedOrderingEnabled = 0; NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1); NCCL_PARAM(IbRoutableFlidIbGidIndex, "IB_ROUTABLE_FLID_GID_INDEX", 1); NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2); NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18); NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7); NCCL_PARAM(IbPkey, "IB_PKEY", 0); NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0); NCCL_PARAM(IbSl, "IB_SL", 0); NCCL_PARAM(IbTc, "IB_TC", 0); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2); NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", 0); pthread_t ncclIbAsyncThread; static void* ncclIbAsyncThreadMain(void* args) { struct ncclIbDev* dev = (struct ncclIbDev*)args; while (1) { struct ibv_async_event event; if (ncclSuccess != wrap_ibv_get_async_event(dev->context, &event)) { break; } char *str; if (ncclSuccess != wrap_ibv_event_type_str(&str, event.event_type)) { break; } if (event.event_type != IBV_EVENT_COMM_EST) WARN("NET/IB : %s:%d Got async event : %s", dev->devName, dev->portNum, str); if (ncclSuccess != wrap_ibv_ack_async_event(&event)) { break; } } return NULL; } static sa_family_t envIbAddrFamily(void) { sa_family_t family = AF_INET; const char* env = ncclGetEnv("NCCL_IB_ADDR_FAMILY"); if (env == NULL || strlen(env) == 0) { return family; } INFO(NCCL_ENV, "NCCL_IB_ADDR_FAMILY set by environment to %s", env); if (strcmp(env, "AF_INET") == 0) { family = AF_INET; } else if (strcmp(env, "AF_INET6") == 0) { family = AF_INET6; } return family; } static void* envIbAddrRange(sa_family_t af, int* mask) { *mask = 0; static struct in_addr addr; static struct in6_addr addr6; void *ret = (af == AF_INET) ? 
(void *)&addr : (void *)&addr6; const char* env = ncclGetEnv("NCCL_IB_ADDR_RANGE"); if (NULL == env || strlen(env) == 0) { return NULL; } INFO(NCCL_ENV, "NCCL_IB_ADDR_RANGE set by environment to %s", env); char addrString[128] = { 0 }; snprintf(addrString, 128, "%s", env); char *addrStrPtr = addrString; char *maskStrPtr = strstr(addrString, "/") + 1; if (NULL == maskStrPtr) { return NULL; } *(maskStrPtr - 1) = '\0'; if (inet_pton(af, addrStrPtr, ret) == 0) { WARN("NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? "AF_INET" : "AF_INET6"); return NULL; } *mask = (int)strtol(maskStrPtr, NULL, 10); if (af == AF_INET && *mask > 32) { WARN("NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6"); *mask = 0; ret = NULL; } else if (af == AF_INET6 && *mask > 128) { WARN("NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6"); *mask = 0; ret = NULL; } return ret; } static sa_family_t getGidAddrFamily(union ibv_gid* gid) { const struct in6_addr *a = (struct in6_addr *)gid->raw; bool isIpV4Mapped = ((a->s6_addr32[0] | a->s6_addr32[1]) | (a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL; bool isIpV4MappedMulticast = (a->s6_addr32[0] == htonl(0xff0e0000) && ((a->s6_addr32[1] | (a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL)); return (isIpV4Mapped || isIpV4MappedMulticast) ? AF_INET : AF_INET6; } static bool matchGidAddrPrefix(sa_family_t af, void* prefix, int prefixlen, union ibv_gid* gid) { struct in_addr *base = NULL; struct in6_addr *base6 = NULL; struct in6_addr *addr6 = NULL;; if (af == AF_INET) { base = (struct in_addr *)prefix; } else { base6 = (struct in6_addr *)prefix; } addr6 = (struct in6_addr *)gid->raw; #define NETMASK(bits) (htonl(0xffffffff ^ ((1 << (32 - bits)) - 1))) int i = 0; while (prefixlen > 0 && i < 4) { if (af == AF_INET) { int mask = NETMASK(prefixlen); if ((base->s_addr & mask) ^ (addr6->s6_addr32[3] & mask)) { break; } prefixlen = 0; break; } else { if (prefixlen >= 32) { if (base6->s6_addr32[i] ^ addr6->s6_addr32[i]) { break; } prefixlen -= 32; ++i; } else { int mask = NETMASK(prefixlen); if ((base6->s6_addr32[i] & mask) ^ (addr6->s6_addr32[i] & mask)) { break; } prefixlen = 0; } } } return (prefixlen == 0) ? 
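// [Worked example, editorial] For an IPv4-mapped GID and NCCL_IB_ADDR_RANGE=192.168.1.0/24:
//   NETMASK(24) = htonl(0xffffffff ^ ((1 << 8) - 1)) = htonl(0xffffff00)
//   base = 192.168.1.0, candidate GID carries 192.168.1.37 in s6_addr32[3]
//   (base & mask) == (candidate & mask) == 192.168.1.0  => prefix matches, prefixlen drops to 0
// A candidate such as 192.168.2.5 differs inside the masked bits, so the loop breaks with
// prefixlen still non-zero and the match is rejected.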
true : false; } static bool configuredGid(union ibv_gid* gid) { const struct in6_addr *a = (struct in6_addr *)gid->raw; int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]); if (((a->s6_addr32[0] | trailer) == 0UL) || ((a->s6_addr32[0] == htonl(0xfe800000)) && (trailer == 0UL))) { return false; } return true; } static bool linkLocalGid(union ibv_gid* gid) { const struct in6_addr *a = (struct in6_addr *)gid->raw; if (a->s6_addr32[0] == htonl(0xfe800000) && a->s6_addr32[1] == 0UL) { return true; } return false; } static bool validGid(union ibv_gid* gid) { return (configuredGid(gid) && !linkLocalGid(gid)); } static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum, int gidIndex, int* version) { char gidRoceVerStr[16] = { 0 }; char roceTypePath[PATH_MAX] = { 0 }; sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d", deviceName, portNum, gidIndex); int fd = open(roceTypePath, O_RDONLY); if (fd == -1) { return ncclSystemError; } int ret = read(fd, gidRoceVerStr, 15); close(fd); if (ret == -1) { return ncclSystemError; } if (strlen(gidRoceVerStr)) { if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0 || strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) { *version = 1; } else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) { *version = 2; } } return ncclSuccess; } static ncclResult_t ncclUpdateGidIndex(struct ibv_context* context, uint8_t portNum, sa_family_t af, void* prefix, int prefixlen, int roceVer, int gidIndexCandidate, int* gidIndex) { union ibv_gid gid, gidCandidate; NCCLCHECK(wrap_ibv_query_gid(context, portNum, *gidIndex, &gid)); NCCLCHECK(wrap_ibv_query_gid(context, portNum, gidIndexCandidate, &gidCandidate)); sa_family_t usrFam = af; sa_family_t gidFam = getGidAddrFamily(&gid); sa_family_t gidCandidateFam = getGidAddrFamily(&gidCandidate); bool gidCandidateMatchSubnet = matchGidAddrPrefix(usrFam, prefix, prefixlen, &gidCandidate); if (gidCandidateFam != gidFam && gidCandidateFam == usrFam && gidCandidateMatchSubnet) { *gidIndex = gidIndexCandidate; } else { if (gidCandidateFam != usrFam || !validGid(&gidCandidate) || !gidCandidateMatchSubnet) { return ncclSuccess; } int usrRoceVer = roceVer; int gidRoceVerNum, gidRoceVerNumCandidate; const char* deviceName = wrap_ibv_get_device_name(context->device); NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, *gidIndex, &gidRoceVerNum)); NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, gidIndexCandidate, &gidRoceVerNumCandidate)); if ((gidRoceVerNum != gidRoceVerNumCandidate || !validGid(&gid)) && gidRoceVerNumCandidate == usrRoceVer) { *gidIndex = gidIndexCandidate; } } return ncclSuccess; } // GID Format // global: | 64b - subnet-prefix | 64b - EUI | // raw : | 10b fixed | 22b 0 | 16b FLID | 16b subnet-prefix | 64b - EUI | static uint16_t ncclIbExtractLocalSubnetPrefix(uint64_t subnet_prefix) { return (be64toh(subnet_prefix) & 0xffff); } static int ncclIbExtractFlid (union ibv_gid *gid) { return ntohs(*((uint16_t*)((uintptr_t)(gid->raw) + 4))); } static ncclResult_t ncclIbGetGidIndex(struct ibv_context *context, uint8_t portNum, struct ibv_port_attr* portAttr, int *gidIndex) { int gidTblLen = portAttr->gid_tbl_len; //for IB, choose GID Index that will have routable FLID if present if (portAttr->link_layer == IBV_LINK_LAYER_INFINIBAND) { union ibv_gid gid; int routableGidIndex = ncclParamIbRoutableFlidIbGidIndex(); if (routableGidIndex < gidTblLen) { NCCLCHECK(wrap_ibv_query_gid(context, portNum, routableGidIndex, &gid)); if 
(ncclIbExtractFlid(&gid) != 0) { *gidIndex = routableGidIndex; return ncclSuccess; } } *gidIndex = 0; return ncclSuccess; } //for ROCE *gidIndex = ncclParamIbGidIndex(); if (*gidIndex >= 0) { return ncclSuccess; } sa_family_t userAddrFamily = envIbAddrFamily(); int userRoceVersion = ncclParamIbRoceVersionNum(); int prefixlen; void *prefix = envIbAddrRange(userAddrFamily, &prefixlen); *gidIndex = 0; for (int gidIndexNext = 1; gidIndexNext < gidTblLen; ++gidIndexNext) { NCCLCHECK(ncclUpdateGidIndex(context, portNum, userAddrFamily, prefix, prefixlen, userRoceVersion, gidIndexNext, gidIndex)); } return ncclSuccess; } NCCL_PARAM(IbDisable, "IB_DISABLE", 0); NCCL_PARAM(IbMergeVfs, "IB_MERGE_VFS", 1); NCCL_PARAM(IbMergeNics, "IB_MERGE_NICS", 1); static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) { char devicePath[PATH_MAX]; snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName); char* p = realpath(devicePath, NULL); if (p == NULL) { WARN("Could not find real path of %s (%s)", devName, devicePath); } else { // Merge multi-port NICs into the same PCI device p[strlen(p)-1] = '0'; // Also merge virtual functions (VF) into the same device if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0'; // And keep the real port aside (the ibv port is always 1 on recent cards) *realPort = 0; for (int d=0; dndevs > 1) { // Print out merged dev info snprintf(line+strlen(line), 2047-strlen(line), " [%d]={", d); for (int i = 0; i < mergedDev->ndevs; i++) { int ibDev = mergedDev->devs[i]; snprintf(line+strlen(line), 2047-strlen(line), "[%d] %s:%d/%s%s", ibDev, ncclIbDevs[ibDev].devName, ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE", // Insert comma to delineate i == (mergedDev->ndevs - 1) ? "" : ", "); } snprintf(line+strlen(line), 2047-strlen(line), "}"); } else { int ibDev = mergedDev->devs[0]; snprintf(line+strlen(line), 2047-strlen(line), " [%d]%s:%d/%s", ibDev, ncclIbDevs[ibDev].devName, ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); } } line[2047] = '\0'; char addrline[SOCKET_NAME_MAXLEN+1]; INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "", ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline)); } pthread_mutex_unlock(&ncclIbLock); } return ncclSuccess; fail: pthread_mutex_unlock(&ncclIbLock); return ret; } ncclResult_t ncclIbDevices(int* ndev) { *ndev = ncclNMergedIbDevs; return ncclSuccess; } // Detect whether GDR can work on a given NIC with the current CUDA device // Returns : // ncclSuccess : GDR works // ncclSystemError : no module or module loaded but not supported by GPU ncclResult_t ncclIbGdrSupport() { static int moduleLoaded = -1; if (moduleLoaded == -1) { // Check for the nv_peer_mem module being loaded moduleLoaded = ((access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) && // Also support the new nv_mem_nc module (access("/sys/kernel/mm/memory_peers/nv_mem_nc/version", F_OK) == -1)) ? 
0 : 1; } if (moduleLoaded == 0) return ncclSystemError; return ncclSuccess; } // Detect whether DMA-BUF support is present in the kernel // Returns : // ncclSuccess : DMA-BUF support is available // ncclSystemError : DMA-BUF is not supported by the kernel ncclResult_t ncclIbDmaBufSupport(int dev) { static int dmaBufSupported = -1; if (dmaBufSupported == -1) { ncclResult_t res; struct ibv_pd* pd; struct ibv_context* ctx; struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + dev; // Test each dev for (int i = 0; i < mergedDev->ndevs; i++) { int ibDev = mergedDev->devs[i]; ctx = ncclIbDevs[ibDev].context; NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure); // Test kernel DMA-BUF support with a dummy call (fd=-1) (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/); // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise) dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 1 : 0; NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure); } } if (dmaBufSupported == 0) return ncclSystemError; return ncclSuccess; failure: dmaBufSupported = 0; return ncclSystemError; } #define NCCL_NET_IB_MAX_RECVS 8 ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs+dev; props->name = mergedDev->devName; props->speed = mergedDev->speed; // Take the rest of the properties from an arbitrary sub-device (should be the same) struct ncclIbDev* ibDev = ncclIbDevs + mergedDev->devs[0]; props->pciPath = ibDev->pciPath; props->guid = ibDev->guid; props->ptrSupport = NCCL_PTR_HOST; if (ncclIbGdrSupport() == ncclSuccess) { props->ptrSupport |= NCCL_PTR_CUDA; // GDR support via nv_peermem } props->regIsGlobal = 1; if (ncclIbDmaBufSupport(dev) == ncclSuccess) { props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF } props->latency = 0; // Not set props->port = ibDev->portNum + ibDev->realPort; props->maxComms = ibDev->maxQp; props->maxRecvs = NCCL_NET_IB_MAX_RECVS; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; return ncclSuccess; } // We need to support NCCL_NET_MAX_REQUESTS for each concurrent receive #define MAX_REQUESTS (NCCL_NET_MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS) static_assert(MAX_REQUESTS <= 256, "request id are encoded in wr_id and we need up to 8 requests ids per completion"); #define NCCL_IB_MAX_QPS 128 // Per-QP connection metatdata struct ncclIbQpInfo { uint32_t qpn; // Fields needed for ece (enhanced connection establishment) struct ibv_ece ece; int ece_supported; int devIndex; }; // Per-Dev connection metadata struct ncclIbDevInfo { uint32_t lid; uint8_t ib_port; enum ibv_mtu mtu; uint8_t link_layer; // For RoCE and IB Rounter union ibv_gid gid; // FIFO RDMA info uint32_t fifoRkey; //remote dev info union ibv_gid remoteGid; }; // Struct containing everything needed to establish connections struct ncclIbConnectionMetadata { struct ncclIbQpInfo qpInfo[NCCL_IB_MAX_QPS]; struct ncclIbDevInfo devs[NCCL_IB_MAX_DEVS_PER_NIC]; char devName[MAX_MERGED_DEV_NAME]; uint64_t fifoAddr; int ndevs; }; enum ncclIbCommState { ncclIbCommStateStart = 0, ncclIbCommStateConnect = 1, ncclIbCommStateAccept = 3, ncclIbCommStateSend = 4, ncclIbCommStateRecv = 5, ncclIbCommStateConnecting = 6, ncclIbCommStateConnected = 7, ncclIbCommStatePendingReady = 8, }; struct ncclIbCommStage { enum ncclIbCommState state; int offset; void* buffer; void* comm; }; struct ncclIbHandle { union 
ncclSocketAddress connectAddr; // Filled by the target uint64_t magic; // random number to help debugging struct ncclIbCommStage stage; // Used by the other side when connecting }; // Retain local RoCE address for error logging struct ncclIbGidInfo { uint8_t link_layer; union ibv_gid localGid; int32_t localGidIndex; }; #define NCCL_NET_IB_REQ_UNUSED 0 #define NCCL_NET_IB_REQ_SEND 1 #define NCCL_NET_IB_REQ_RECV 2 #define NCCL_NET_IB_REQ_FLUSH 3 const char* reqTypeStr[] = { "Unused", "Send", "Recv", "Flush" }; struct ncclIbRequest { struct ncclIbNetCommBase* base; int type; struct ncclSocket* sock; int events[NCCL_IB_MAX_DEVS_PER_NIC]; struct ncclIbNetCommDevBase* devBases[NCCL_IB_MAX_DEVS_PER_NIC]; int nreqs; union { struct { int size; void* data; uint32_t lkeys[NCCL_IB_MAX_DEVS_PER_NIC]; int offset; } send; struct { int* sizes; } recv; }; }; struct ncclIbNetCommDevBase { int ibDevN; struct ibv_pd* pd; struct ibv_cq* cq; uint64_t pad[2]; struct ncclIbGidInfo gidInfo; }; struct ncclIbListenComm { int dev; struct ncclSocket sock; struct ncclIbCommStage stage; }; struct ncclIbSendFifo { uint64_t addr; int size; uint32_t rkeys[NCCL_IB_MAX_DEVS_PER_NIC]; uint32_t nreqs; uint32_t tag; uint64_t idx; char padding[24]; }; struct ncclIbQp { struct ibv_qp* qp; int devIndex; int remDevIdx; }; struct ncclIbRemSizesFifo { int elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; uint64_t fifoTail; uint64_t addr; uint32_t rkeys[NCCL_IB_MAX_DEVS_PER_NIC]; uint32_t flags; struct ibv_mr* mrs[NCCL_IB_MAX_DEVS_PER_NIC]; struct ibv_sge sge; }; // A per-dev struct for netIbSendComm struct alignas(8) ncclIbSendCommDev { struct ncclIbNetCommDevBase base; struct ibv_mr* fifoMr; }; // Wrapper to track an MR per-device, if needed struct ncclIbMrHandle { ibv_mr* mrs[NCCL_IB_MAX_DEVS_PER_NIC]; }; struct alignas(32) ncclIbNetCommBase { int ndevs; bool isSend; struct ncclIbRequest reqs[MAX_REQUESTS]; struct ncclIbQp qps[NCCL_IB_MAX_QPS]; int nqps; int qpIndex; int devIndex; struct ncclSocket sock; int ready; // Track necessary remDevInfo here int nRemDevs; struct ncclIbDevInfo remDevs[NCCL_IB_MAX_DEVS_PER_NIC]; }; struct ncclIbSendComm { struct ncclIbNetCommBase base; struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; // Each dev correlates to a mergedIbDev struct ncclIbSendCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC]; struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS]; struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1]; struct ncclIbRemSizesFifo remSizesFifo; uint64_t fifoHead; int ar; // Use adaptive routing when all merged devices have it enabled }; // The SendFifo needs to be 32-byte aligned and each element needs // to be a 32-byte multiple, so that an entry does not get split and // written out of order when IB Relaxed Ordering is enabled static_assert((sizeof(struct ncclIbNetCommBase) % 32) == 0, "ncclIbNetCommBase size must be 32-byte multiple to ensure fifo is at proper offset"); static_assert((offsetof(struct ncclIbSendComm, fifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned"); static_assert((sizeof(struct ncclIbSendFifo) % 32) == 0, "ncclIbSendFifo element size must be 32-byte multiples"); static_assert((offsetof(struct ncclIbSendComm, sges) % 32) == 0, "sges must be 32-byte aligned"); static_assert((offsetof(struct ncclIbSendComm, wrs) % 32) == 0, "wrs must be 32-byte aligned"); struct ncclIbGpuFlush { struct ibv_mr* hostMr; struct ibv_sge sge; struct ncclIbQp qp; }; struct ncclIbRemFifo { struct ncclIbSendFifo 
elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; uint64_t fifoTail; uint64_t addr; uint32_t flags; }; struct alignas(16) ncclIbRecvCommDev { struct ncclIbNetCommDevBase base; struct ncclIbGpuFlush gpuFlush; uint32_t fifoRkey; struct ibv_mr* fifoMr; struct ibv_sge fifoSge; struct ibv_mr* sizesFifoMr; }; struct ncclIbRecvComm { struct ncclIbNetCommBase base; struct ncclIbRecvCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC]; struct ncclIbRemFifo remFifo; int sizesFifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; int gpuFlushHostMem; int flushEnabled; }; static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbRecvComm fifo must be 32-byte aligned"); NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1); static void ncclIbAddEvent(struct ncclIbRequest* req, int devIndex, struct ncclIbNetCommDevBase* base) { req->events[devIndex]++; req->devBases[devIndex] = base; } ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base) { base->ibDevN = ibDevN; ncclIbDev* ibDev = ncclIbDevs + ibDevN; pthread_mutex_lock(&ibDev->lock); if (0 == ibDev->pdRefs++) { ncclResult_t res; NCCLCHECKGOTO(wrap_ibv_alloc_pd(&ibDev->pd, ibDev->context), res, failure); if (0) { failure: pthread_mutex_unlock(&ibDev->lock); return res; } } base->pd = ibDev->pd; pthread_mutex_unlock(&ibDev->lock); // Recv requests can generate 2 completions (one for the post FIFO, one for the Recv). NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0)); return ncclSuccess; } ncclResult_t ncclIbDestroyBase(struct ncclIbNetCommDevBase* base) { ncclResult_t res; NCCLCHECK(wrap_ibv_destroy_cq(base->cq)); pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock); if (0 == --ncclIbDevs[base->ibDevN].pdRefs) { NCCLCHECKGOTO(wrap_ibv_dealloc_pd(ncclIbDevs[base->ibDevN].pd), res, returning); } res = ncclSuccess; returning: pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock); return res; } ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, struct ncclIbQp* qp) { struct ibv_qp_init_attr qpInitAttr; memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr)); qpInitAttr.send_cq = base->cq; qpInitAttr.recv_cq = base->cq; qpInitAttr.qp_type = IBV_QPT_RC; // We might send 2 messages per send (RDMA and RDMA_WITH_IMM) qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS; qpInitAttr.cap.max_recv_wr = MAX_REQUESTS; qpInitAttr.cap.max_send_sge = 1; qpInitAttr.cap.max_recv_sge = 1; qpInitAttr.cap.max_inline_data = ncclParamIbUseInline() ? 
sizeof(struct ncclIbSendFifo) : 0; NCCLCHECK(wrap_ibv_create_qp(&qp->qp, base->pd, &qpInitAttr)); struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_INIT; qpAttr.pkey_index = ncclParamIbPkey(); qpAttr.port_num = ib_port; qpAttr.qp_access_flags = access_flags; NCCLCHECK(wrap_ibv_modify_qp(qp->qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)); return ncclSuccess; } ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool override_tc) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTR; qpAttr.path_mtu = info->mtu; qpAttr.dest_qp_num = dest_qp_num; qpAttr.rq_psn = 0; qpAttr.max_dest_rd_atomic = 1; qpAttr.min_rnr_timer = 12; if (info->link_layer == IBV_LINK_LAYER_ETHERNET) { qpAttr.ah_attr.is_global = 1; qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->gid.global.subnet_prefix; qpAttr.ah_attr.grh.dgid.global.interface_id = info->gid.global.interface_id; qpAttr.ah_attr.grh.flow_label = 0; qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex; qpAttr.ah_attr.grh.hop_limit = 255; if(ncclParamIbFifoTc() && override_tc) { qpAttr.ah_attr.grh.traffic_class = ncclParamIbFifoTc(); } else { qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc(); } } else { //pick lid if subnet prefixs are same, FLID if they are not if (ncclIbExtractLocalSubnetPrefix(sGidInfo->localGid.global.subnet_prefix) == ncclIbExtractLocalSubnetPrefix(info->gid.global.subnet_prefix)) { qpAttr.ah_attr.is_global = 0; qpAttr.ah_attr.dlid = info->lid; } else { uint16_t flid = ncclIbExtractFlid(&info->gid); if (flid == 0) { WARN("Warning: remote FLID configured as zero even when endpoints are on different subnets, using dlid as fallback"); qpAttr.ah_attr.dlid = info->lid; } else { qpAttr.ah_attr.dlid = ncclIbExtractFlid(&info->gid); } qpAttr.ah_attr.is_global = 1; qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->gid.global.subnet_prefix; qpAttr.ah_attr.grh.dgid.global.interface_id = info->gid.global.interface_id; qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex; qpAttr.ah_attr.grh.hop_limit = 255; } } qpAttr.ah_attr.sl = ncclParamIbSl(); qpAttr.ah_attr.src_path_bits = 0; qpAttr.ah_attr.port_num = info->ib_port; NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)); return ncclSuccess; } ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTS; qpAttr.timeout = ncclParamIbTimeout(); qpAttr.retry_cnt = ncclParamIbRetryCnt(); qpAttr.rnr_retry = 7; qpAttr.sq_psn = 0; qpAttr.max_rd_atomic = 1; NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)); return ncclSuccess; } ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { struct ncclIbListenComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large"); memset(handle, 0, sizeof(struct ncclIbHandle)); comm->dev = dev; handle->magic = NCCL_SOCKET_MAGIC; NCCLCHECK(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1)); NCCLCHECK(ncclSocketListen(&comm->sock)); 
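// [Illustrative note, editorial - not NCCL code] ncclIbCreateQp/ncclIbRtrQp/ncclIbRtsQp above
// implement the canonical reliable-connection bring-up: RESET -> INIT -> RTR (needs the peer's
// QPN and addressing) -> RTS. The same sequence in plain libibverbs, without NCCL's wrap_*
// helpers, assuming <infiniband/verbs.h> and <string.h>; the LID-routed (InfiniBand) path only,
// and the same defaults as above (timeout 18, retry_cnt 7, rnr_retry 7, min_rnr_timer 12):
#if 0  // sketch only, never compiled
static int bringUpRcQp(struct ibv_qp* qp, uint8_t port, uint32_t remoteQpn,
                       uint16_t remoteLid, enum ibv_mtu mtu) {
  struct ibv_qp_attr a;

  memset(&a, 0, sizeof(a));
  a.qp_state = IBV_QPS_INIT;
  a.pkey_index = 0;
  a.port_num = port;
  a.qp_access_flags = IBV_ACCESS_REMOTE_WRITE;
  if (ibv_modify_qp(qp, &a, IBV_QP_STATE|IBV_QP_PKEY_INDEX|IBV_QP_PORT|IBV_QP_ACCESS_FLAGS)) return -1;

  memset(&a, 0, sizeof(a));
  a.qp_state = IBV_QPS_RTR;
  a.path_mtu = mtu;
  a.dest_qp_num = remoteQpn;          // from the peer's metadata
  a.rq_psn = 0;
  a.max_dest_rd_atomic = 1;
  a.min_rnr_timer = 12;
  a.ah_attr.is_global = 0;            // LID routing; RoCE would fill the GRH fields instead
  a.ah_attr.dlid = remoteLid;
  a.ah_attr.port_num = port;
  if (ibv_modify_qp(qp, &a, IBV_QP_STATE|IBV_QP_AV|IBV_QP_PATH_MTU|IBV_QP_DEST_QPN|
                            IBV_QP_RQ_PSN|IBV_QP_MAX_DEST_RD_ATOMIC|IBV_QP_MIN_RNR_TIMER)) return -1;

  memset(&a, 0, sizeof(a));
  a.qp_state = IBV_QPS_RTS;
  a.timeout = 18; a.retry_cnt = 7; a.rnr_retry = 7;
  a.sq_psn = 0; a.max_rd_atomic = 1;
  return ibv_modify_qp(qp, &a, IBV_QP_STATE|IBV_QP_TIMEOUT|IBV_QP_RETRY_CNT|
                               IBV_QP_RNR_RETRY|IBV_QP_SQ_PSN|IBV_QP_MAX_QP_RD_ATOMIC);
}
#endif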
NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr)); *listenComm = comm; return ncclSuccess; } ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; struct ncclIbCommStage* stage = &handle->stage; struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm; int ready; *sendComm = NULL; if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; if (stage->state == ncclIbCommStateSend) goto ib_send; if (stage->state == ncclIbCommStateConnecting) goto ib_connect; if (stage->state == ncclIbCommStateConnected) goto ib_send_ready; if (stage->state != ncclIbCommStateStart) { WARN("Error: trying to connect already connected sendComm"); return ncclInternalError; } NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm))); NCCLCHECK(ncclSocketInit(&comm->base.sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1)); stage->comm = comm; stage->state = ncclIbCommStateConnect; NCCLCHECK(ncclSocketConnect(&comm->base.sock)); ib_connect_check: /* since ncclSocketConnect is async, we must check if connection is complete */ NCCLCHECK(ncclSocketReady(&comm->base.sock, &ready)); if (!ready) return ncclSuccess; // IB Setup struct ncclIbMergedDev* mergedDev; mergedDev = ncclIbMergedDevs + dev; comm->base.ndevs = mergedDev->ndevs; comm->base.nqps = ncclParamIbQpsPerConn() * comm->base.ndevs; // We must have at least 1 qp per-device comm->base.isSend = true; // Init PD, Ctx for each IB device comm->ar = 1; // Set to 1 for logic for (int i = 0; i < mergedDev->ndevs; i++) { int ibDevN = mergedDev->devs[i]; NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base)); comm->ar = comm->ar && ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING - if all merged devs have it enabled } struct ncclIbConnectionMetadata meta; meta.ndevs = comm->base.ndevs; // Alternate QPs between devices int devIndex; devIndex = 0; for (int q = 0; q < comm->base.nqps; q++) { ncclIbSendCommDev* commDev = comm->devs + devIndex; ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, comm->base.qps+q)); comm->base.qps[q].devIndex = devIndex; meta.qpInfo[q].qpn = comm->base.qps[q].qp->qp_num; meta.qpInfo[q].devIndex = comm->base.qps[q].devIndex; // Query ece capabilities (enhanced connection establishment) NCCLCHECK(wrap_ibv_query_ece(comm->base.qps[q].qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); devIndex = (devIndex + 1) % comm->base.ndevs; } for (int i = 0; i < comm->base.ndevs; i++) { ncclIbSendCommDev* commDev = comm->devs + i; ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; // Write to the metadata struct via this pointer ncclIbDevInfo* devInfo = meta.devs + i; devInfo->ib_port = ibDev->portNum; devInfo->mtu = ibDev->portAttr.active_mtu; devInfo->lid = ibDev->portAttr.lid; // Prepare my fifo NCCLCHECK(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); devInfo->fifoRkey = commDev->fifoMr->rkey; // Pack local GID info devInfo->link_layer = commDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer; NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &commDev->base.gidInfo.localGidIndex)); NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, 
&commDev->base.gidInfo.localGid)); devInfo->gid.global.subnet_prefix = commDev->base.gidInfo.localGid.global.subnet_prefix; devInfo->gid.global.interface_id = commDev->base.gidInfo.localGid.global.interface_id; // info logging if (devInfo->link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB for (int q = 0; q < comm->base.nqps; q++) { // Print just the QPs for this dev if (comm->base.qps[q].devIndex == i) INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d subnet-prefix %lu FLID %d fifoRkey=0x%x fifoLkey=0x%x", comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid, devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey); } } else { // RoCE for (int q = 0; q < comm->base.nqps; q++) { // Print just the QPs for this dev if (comm->base.qps[q].devIndex == i) INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x", comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, (int64_t)commDev->base.gidInfo.localGidIndex, devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); } } } meta.fifoAddr = (uint64_t)comm->fifo; strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME); stage->state = ncclIbCommStateSend; stage->offset = 0; NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(meta))); memcpy(stage->buffer, &meta, sizeof(meta)); ib_send: NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(meta), &stage->offset)); if (stage->offset != sizeof(meta)) return ncclSuccess; stage->state = ncclIbCommStateConnecting; stage->offset = 0; // Clear the staging buffer for re-use memset(stage->buffer, 0, sizeof(meta)); ib_connect: struct ncclIbConnectionMetadata remMeta; NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclIbConnectionMetadata), &stage->offset)); if (stage->offset != sizeof(remMeta)) return ncclSuccess; memcpy(&remMeta, stage->buffer, sizeof(ncclIbConnectionMetadata)); comm->base.nRemDevs = remMeta.ndevs; if (comm->base.nRemDevs != comm->base.ndevs) { mergedDev = ncclIbMergedDevs + dev; WARN("NET/IB : Local mergedDev=%s has a different number of devices=%d as remoteDev=%s nRemDevs=%d", mergedDev->devName, comm->base.ndevs, remMeta.devName, comm->base.nRemDevs); } int link_layer; link_layer = remMeta.devs[0].link_layer; for (int i = 1; i < remMeta.ndevs; i++) { if (remMeta.devs[i].link_layer != link_layer) { WARN("NET/IB : Can't merge net devices with different link_layer. i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d", i, remMeta.ndevs, link_layer, remMeta.devs[i].link_layer); return ncclInternalError; } } // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc. 
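// [Illustrative note, editorial - not NCCL code] The exchange that feeds the loop below boils
// down to trading one descriptor per side over the bootstrap TCP socket before any RDMA can
// happen. A hypothetical condensed version of ncclIbConnectionMetadata, listing only the fields
// that actually appear in this file:
#if 0  // sketch only, never compiled
struct ExchangeBlob {
  uint32_t qpn;         // queue pair number the peer plugs into its RTR transition
  uint16_t lid;         // IB: local identifier; RoCE relies on the GID instead
  union ibv_gid gid;    // global routing information (RoCE / inter-subnet IB)
  uint32_t fifoRkey;    // rkey so the peer can RDMA-write into our FIFO
  uint64_t fifoAddr;    // virtual address of that FIFO
};
// send(sock, &mine); recv(sock, &theirs); then move every QP to RTR/RTS using `theirs`.
#endif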
for (int i = 0; i < remMeta.ndevs; i++) { comm->base.remDevs[i] = remMeta.devs[i]; comm->base.remDevs[i].remoteGid.global.interface_id = comm->base.remDevs[i].gid.global.interface_id; comm->base.remDevs[i].remoteGid.global.subnet_prefix = comm->base.remDevs[i].gid.global.subnet_prefix; // Retain remote sizes fifo info and prepare RDMA ops comm->remSizesFifo.rkeys[i] = remMeta.devs[i].fifoRkey; comm->remSizesFifo.addr = remMeta.fifoAddr; } for (int i=0; i < comm->base.ndevs; i++) { NCCLCHECK(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ)); } comm->base.nRemDevs = remMeta.ndevs; for (int q = 0; q < comm->base.nqps; q++) { struct ncclIbQpInfo* remQpInfo = remMeta.qpInfo + q; struct ncclIbDevInfo* remDevInfo = remMeta.devs + remQpInfo->devIndex; // Assign per-QP remDev comm->base.qps[q].remDevIdx = remQpInfo->devIndex; int devIndex = comm->base.qps[q].devIndex; ncclIbSendCommDev* commDev = comm->devs + devIndex; struct ibv_qp* qp = comm->base.qps[q].qp; if (remQpInfo->ece_supported) NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported)); NCCLCHECK(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false)); NCCLCHECK(ncclIbRtsQp(qp)); } if (link_layer == IBV_LINK_LAYER_ETHERNET ) { // RoCE for (int q = 0; q < comm->base.nqps; q++) { struct ncclIbQp* qp = comm->base.qps + q; int ibDevN = comm->devs[qp->devIndex].base.ibDevN; struct ncclIbDev* ibDev = ncclIbDevs + ibDevN; INFO(NCCL_NET,"NET/IB: IbDev %d Port %d qpn %d set_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}", ibDevN, ibDev->portNum, remMeta.qpInfo[q].qpn, remMeta.qpInfo[q].ece_supported, remMeta.qpInfo[q].ece.vendor_id, remMeta.qpInfo[q].ece.options, remMeta.qpInfo[q].ece.comp_mask); } } comm->base.ready = 1; stage->state = ncclIbCommStateConnected; stage->offset = 0; ib_send_ready: NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, &comm->base.ready, sizeof(int), &stage->offset)); if (stage->offset != sizeof(int)) return ncclSuccess; free(stage->buffer); stage->state = ncclIbCommStateStart; *sendComm = comm; return ncclSuccess; } NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0); ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm; struct ncclIbCommStage* stage = &lComm->stage; struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm; int ready; *recvComm = NULL; if (stage->state == ncclIbCommStateAccept) goto ib_accept_check; if (stage->state == ncclIbCommStateRecv) goto ib_recv; if (stage->state == ncclIbCommStateSend) goto ib_send; if (stage->state == ncclIbCommStatePendingReady) goto ib_recv_ready; if (stage->state != ncclIbCommStateStart) { WARN("Listencomm in unknown state %d", stage->state); return ncclInternalError; } NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm))); stage->comm = rComm; stage->state = ncclIbCommStateAccept; NCCLCHECK(ncclSocketInit(&rComm->base.sock)); NCCLCHECK(ncclSocketAccept(&rComm->base.sock, &lComm->sock)); ib_accept_check: NCCLCHECK(ncclSocketReady(&rComm->base.sock, &ready)); if (!ready) return ncclSuccess; struct ncclIbConnectionMetadata remMeta; stage->state = ncclIbCommStateRecv; stage->offset = 0; NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta))); ib_recv: 
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset)); if (stage->offset != sizeof(remMeta)) return ncclSuccess; /* copy back the received info */ memcpy(&remMeta, stage->buffer, sizeof(struct ncclIbConnectionMetadata)); // IB setup // Pre-declare variables because of goto struct ncclIbMergedDev* mergedDev; struct ncclIbDev* ibDev; int ibDevN; struct ncclIbRecvCommDev* rCommDev; struct ncclIbDevInfo* remDevInfo; struct ncclIbQp* qp; mergedDev = ncclIbMergedDevs + lComm->dev; rComm->base.ndevs = mergedDev->ndevs; rComm->base.nqps = ncclParamIbQpsPerConn() * rComm->base.ndevs; // We must have at least 1 qp per-device rComm->base.isSend = false; rComm->base.nRemDevs = remMeta.ndevs; if (rComm->base.nRemDevs != rComm->base.ndevs) { WARN("NET/IB : Local mergedDev %s has a different number of devices=%d as remote %s %d", mergedDev->devName, rComm->base.ndevs, remMeta.devName, rComm->base.nRemDevs); } // Metadata to send back to requestor (sender) struct ncclIbConnectionMetadata meta; for (int i = 0; i < rComm->base.ndevs; i++) { rCommDev = rComm->devs + i; ibDevN = mergedDev->devs[i]; NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base)); ibDev = ncclIbDevs + ibDevN; NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex)); NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid)); } // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc. for (int i = 0; i < remMeta.ndevs; i++) { rComm->base.remDevs[i] = remMeta.devs[i]; rComm->base.remDevs[i].remoteGid.global.interface_id = rComm->base.remDevs[i].gid.global.interface_id; rComm->base.remDevs[i].remoteGid.global.subnet_prefix = rComm->base.remDevs[i].gid.global.subnet_prefix; } // Stripe QP creation across merged devs // Make sure to get correct remote peer dev and QP info int remDevIndex; int devIndex; devIndex = 0; for (int q = 0; q < rComm->base.nqps; q++) { remDevIndex = remMeta.qpInfo[q].devIndex; remDevInfo = remMeta.devs + remDevIndex; qp = rComm->base.qps+q; rCommDev = rComm->devs + devIndex; qp->remDevIdx = remDevIndex; // Local ibDevN ibDevN = rComm->devs[devIndex].base.ibDevN; ibDev = ncclIbDevs + ibDevN; NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, qp)); qp->devIndex = devIndex; devIndex = (devIndex + 1) % rComm->base.ndevs; // Set the ece (enhanced connection establishment) on this QP before RTR if (remMeta.qpInfo[q].ece_supported) { NCCLCHECK(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); // Query the reduced ece for this QP (matching enhancements between the requestor and the responder) // Store this in our own qpInfo for returning to the requestor if (meta.qpInfo[q].ece_supported) NCCLCHECK(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); } bool override_tc = (q == 0) ? true : false; NCCLCHECK(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc)); NCCLCHECK(ncclIbRtsQp(qp->qp)); } rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess) && (ncclParamIbGdrFlushDisable() == 0)) ? 
1 : 0; for (int i = 0; i < mergedDev->ndevs; i++) { rCommDev = rComm->devs + i; ibDevN = rCommDev->base.ibDevN; ibDev = ncclIbDevs + ibDevN; // Retain remote fifo info and prepare my RDMA ops rCommDev->fifoRkey = remMeta.devs[i].fifoRkey; rComm->remFifo.addr = remMeta.fifoAddr; NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ)); rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey; if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE; // Allocate Flush dummy buffer for GPU Direct RDMA if (rComm->flushEnabled) { NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE)); rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem; rCommDev->gpuFlush.sge.length = 1; rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey; NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rCommDev->gpuFlush.qp)); struct ncclIbDevInfo devInfo; devInfo.lid = ibDev->portAttr.lid; devInfo.link_layer = ibDev->portAttr.link_layer; devInfo.ib_port = ibDev->portNum; devInfo.gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; devInfo.gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; devInfo.mtu = ibDev->portAttr.active_mtu; NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false)); NCCLCHECK(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp)); } // Fill Handle meta.devs[i].lid = ibDev->portAttr.lid; meta.devs[i].link_layer = rCommDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer; meta.devs[i].ib_port = ibDev->portNum; meta.devs[i].gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; meta.devs[i].gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; // Adjust the MTU remMeta.devs[i].mtu = (enum ibv_mtu) std::min(remMeta.devs[i].mtu, ibDev->portAttr.active_mtu); meta.devs[i].mtu = remMeta.devs[i].mtu; // Prepare sizes fifo NCCLCHECK(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); meta.devs[i].fifoRkey = rComm->devs[i].sizesFifoMr->rkey; } meta.fifoAddr = (uint64_t)rComm->sizesFifo; for (int q = 0; q < rComm->base.nqps; q++) { meta.qpInfo[q].qpn = rComm->base.qps[q].qp->qp_num; meta.qpInfo[q].devIndex = rComm->base.qps[q].devIndex; } meta.ndevs = rComm->base.ndevs; strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME); stage->state = ncclIbCommStateSend; stage->offset = 0; if (stage->buffer) free(stage->buffer); NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbConnectionMetadata))); memcpy(stage->buffer, &meta, sizeof(struct ncclIbConnectionMetadata)); ib_send: NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(struct ncclIbConnectionMetadata), &stage->offset)); if (stage->offset < sizeof(struct ncclIbConnectionMetadata)) return ncclSuccess; stage->offset = 0; stage->state = ncclIbCommStatePendingReady; ib_recv_ready: NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, &rComm->base.ready, sizeof(int), &stage->offset)); if (stage->offset != sizeof(int)) return ncclSuccess; free(stage->buffer); 
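// [Illustrative note, editorial - not NCCL code] The gpuFlush QP created above is a loopback QP:
// it is moved to RTR against its own qp_num, and its SGE points at a one-int host bounce buffer.
// Consistent with the LOCAL_WRITE|REMOTE_READ access flags requested for it, the usual flush
// technique (implemented further down in this file) is a tiny RDMA_READ of the GPU buffer over
// that loopback QP, which forces the NIC to place any in-flight GDR writes first. Hedged sketch:
#if 0  // sketch only, never compiled
static int postLoopbackFlush(struct ibv_qp* qp, struct ibv_sge* hostSge,
                             uint64_t gpuAddr, uint32_t gpuRkey) {
  struct ibv_send_wr wr, *bad;
  memset(&wr, 0, sizeof(wr));
  wr.opcode = IBV_WR_RDMA_READ;          // read GPU memory back through the NIC
  wr.wr.rdma.remote_addr = gpuAddr;      // "remote" is really our own GPU buffer
  wr.wr.rdma.rkey = gpuRkey;
  wr.sg_list = hostSge;                  // lands in the small host bounce buffer
  wr.num_sge = 1;
  wr.send_flags = IBV_SEND_SIGNALED;     // poll the CQ to know the flush completed
  return ibv_post_send(qp, &wr, &bad);
}
#endif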
*recvComm = rComm; /* reset lComm stage */ stage->state = ncclIbCommStateStart; stage->offset = 0; stage->comm = NULL; stage->buffer = NULL; return ncclSuccess; } ncclResult_t ncclIbGetRequest(struct ncclIbNetCommBase* base, struct ncclIbRequest** req) { for (int i=0; ireqs+i; if (r->type == NCCL_NET_IB_REQ_UNUSED) { r->base = base; r->sock = NULL; r->devBases[0] = NULL; r->devBases[1] = NULL; r->events[0] = r->events[1] = 0; *req = r; return ncclSuccess; } } WARN("NET/IB : unable to allocate requests"); *req = NULL; return ncclInternalError; } ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) { r->type = NCCL_NET_IB_REQ_UNUSED; return ncclSuccess; } ncclResult_t ncclIbTest(void* request, int* done, int* size); ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, size_t size, int type, uint64_t offset, int fd, ibv_mr** mhandle) { static __thread uintptr_t pageSize = 0; if (pageSize == 0) pageSize = sysconf(_SC_PAGESIZE); struct ncclIbMrCache* cache = &ncclIbDevs[base->ibDevN].mrCache; uintptr_t addr = (uintptr_t)data & -pageSize; size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; ncclResult_t res; pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock); for (int slot=0; /*true*/; slot++) { if (slot == cache->population || addr < cache->slots[slot].addr) { // didn't find in cache if (cache->population == cache->capacity) { // must grow cache cache->capacity = cache->capacity < 32 ? 32 : 2*cache->capacity; NCCLCHECKGOTO(ncclRealloc(&cache->slots, cache->population, cache->capacity), res, returning); } // Deregister / register struct ibv_mr* mr; unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ; if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING; if (fd != -1) { /* DMA-BUF support */ NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags), res, returning); } else { if (ncclIbRelaxedOrderingEnabled) { // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, base->pd, (void*)addr, pages*pageSize, addr, flags), res, returning); } else { NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, base->pd, (void*)addr, pages*pageSize, flags), res, returning); } } TRACE(NCCL_INIT|NCCL_NET,"regAddr=0x%lx size=%lld rkey=0x%x lkey=0x%x fd=%d", (unsigned long)addr, (long long)pages*pageSize, mr->rkey, mr->lkey, fd); if (slot != cache->population) memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclIbMr)); cache->slots[slot].addr = addr; cache->slots[slot].pages = pages; cache->slots[slot].refs = 1; cache->slots[slot].mr = mr; cache->population += 1; *mhandle = mr; res = ncclSuccess; goto returning; } else if ((addr >= cache->slots[slot].addr) && ((addr-cache->slots[slot].addr)/pageSize+pages) <= cache->slots[slot].pages) { cache->slots[slot].refs += 1; *mhandle = cache->slots[slot].mr; res = ncclSuccess; goto returning; } } returning: pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock); return res; } struct ncclIbNetCommDevBase* ncclIbGetNetCommDevBase(ncclIbNetCommBase* base, int devIndex) { if (base->isSend) { struct ncclIbSendComm* sComm = (struct ncclIbSendComm*) base; return &sComm->devs[devIndex].base; } else { struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*) base; return &rComm->devs[devIndex].base; } } /* DMA-BUF support */ ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { assert(size > 0); struct 
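// [Worked example, editorial] Registration in ncclIbRegMrDmaBufInternal above is rounded to
// whole pages and cached with a refcount:
//   pageSize = 4096, data = 0x1234F00, size = 768 (0x300)
//   addr  = data & -pageSize                       = 0x1234000
//   pages = (data + size - addr + 4095) / 4096
//         = (0x1234F00 + 0x300 - 0x1234000 + 0xFFF) / 0x1000 = 2
// i.e. a 768-byte range that straddles a page boundary registers two full pages, and any later
// registration that falls inside those pages only bumps the cache slot's refcount instead of
// calling ibv_reg_mr again.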
ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm; struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle)); for (int i = 0; i < base->ndevs; i++) { // Each ncclIbNetCommDevBase is at different offset in send and recv netComms struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i); NCCLCHECK(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i)); } *mhandle = (void*) mhandleWrapper; return ncclSuccess; } ncclResult_t ncclIbRegMr(void* comm, void* data, size_t size, int type, void** mhandle) { return ncclIbRegMrDmaBuf(comm, data, size, type, 0ULL, -1, mhandle); } ncclResult_t ncclIbDeregMrInternal(ncclIbNetCommDevBase* base, ibv_mr* mhandle) { struct ncclIbMrCache* cache = &ncclIbDevs[base->ibDevN].mrCache; ncclResult_t res; pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock); for (int i=0; i < cache->population; i++) { if (mhandle == cache->slots[i].mr) { if (0 == --cache->slots[i].refs) { memmove(&cache->slots[i], &cache->slots[--cache->population], sizeof(struct ncclIbMr)); if (cache->population == 0) { free(cache->slots); cache->slots = NULL; cache->capacity = 0; } NCCLCHECKGOTO(wrap_ibv_dereg_mr(mhandle), res, returning); } res = ncclSuccess; goto returning; } } WARN("NET/IB: could not find mr %p inside cache of %d entries", mhandle, cache->population); res = ncclInternalError; returning: pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock); return res; } ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle; struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm; for (int i = 0; i < base->ndevs; i++) { // Each ncclIbNetCommDevBase is at different offset in send and recv netComms struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i); NCCLCHECK(ncclIbDeregMrInternal(devComm, mhandleWrapper->mrs[i])); } free(mhandleWrapper); return ncclSuccess; } NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0); ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { struct ncclIbRequest** reqs = comm->fifoReqs[slot]; volatile struct ncclIbSendFifo* slots = comm->fifo[slot]; int nreqs = slots[0].nreqs; if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; uint64_t wr_id = 0ULL; for (int r=0; rwrs+r; memset(wr, 0, sizeof(struct ibv_send_wr)); struct ibv_sge* sge = comm->sges+r; sge->addr=(uintptr_t)reqs[r]->send.data; wr->opcode = IBV_WR_RDMA_WRITE; wr->send_flags = 0; wr->wr.rdma.remote_addr = slots[r].addr; wr->next = wr + 1; wr_id += (reqs[r] - comm->base.reqs) << (r*8); } // Write size as immediate data. In the case of multi-send, only write // 0 or 1 as size to indicate whether there was data sent or received. uint32_t immData = 0; if (nreqs == 1) { immData = reqs[0]->send.size; } else { int* sizes = comm->remSizesFifo.elems[slot]; for (int r=0; rsend.size; comm->remSizesFifo.sge.addr = (uint64_t)sizes; comm->remSizesFifo.sge.length = nreqs*sizeof(int); } struct ibv_send_wr* lastWr = comm->wrs+nreqs-1; if (nreqs > 1 || (comm->ar && reqs[0]->send.size > ncclParamIbArThreshold())) { // When using ADAPTIVE_ROUTING, send the bulk of the data first as an // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote // completion. 
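// [Illustrative note, editorial - not NCCL code] Sketch of the two-WR pattern described in the
// comment above: chain a large unsignaled RDMA_WRITE carrying the payload with a zero-byte
// RDMA_WRITE_WITH_IMM whose immediate value tells the receiver how much data arrived. With
// adaptive routing the bulk write may be spread across paths; the IMM write is what generates
// the receive-side completion once everything is placed. Assumes <infiniband/verbs.h>.
#if 0  // sketch only, never compiled
static int postDataThenImm(struct ibv_qp* qp, struct ibv_sge* payload,
                           uint64_t raddr, uint32_t rkey, uint32_t immSize) {
  struct ibv_send_wr data, notify, *bad;
  memset(&data, 0, sizeof(data)); memset(&notify, 0, sizeof(notify));

  data.opcode = IBV_WR_RDMA_WRITE;            // bulk payload, no completion on the peer
  data.sg_list = payload; data.num_sge = 1;
  data.wr.rdma.remote_addr = raddr; data.wr.rdma.rkey = rkey;
  data.next = &notify;

  notify.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; // 0-byte write: only the immediate matters
  notify.imm_data = immSize;                  // receiver learns the size from the IMM
  notify.wr.rdma.remote_addr = raddr; notify.wr.rdma.rkey = rkey;
  notify.send_flags = IBV_SEND_SIGNALED;      // one local completion for the whole chain
  return ibv_post_send(qp, &data, &bad);
}
#endif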
lastWr++; memset(lastWr, 0, sizeof(struct ibv_send_wr)); if (nreqs > 1) { // Write remote sizes Fifo lastWr->wr.rdma.remote_addr = comm->remSizesFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(int); lastWr->num_sge = 1; lastWr->sg_list = &comm->remSizesFifo.sge; } } lastWr->wr_id = wr_id; lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM; lastWr->imm_data = immData; lastWr->next = NULL; lastWr->send_flags = IBV_SEND_SIGNALED; // Multi-QP: make sure IB writes are multiples of 128B so that LL and LL128 protocols still work const int align = 128; int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs; for (int i = 0; i < nqps; i++) { int qpIndex = comm->base.qpIndex; ncclIbQp* qp = comm->base.qps + qpIndex; int devIndex = qp->devIndex; for (int r=0; rdevs[devIndex].base); // Select proper rkey (needed even for 0-size send) comm->wrs[r].wr.rdma.rkey = slots[r].rkeys[qp->remDevIdx]; int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align; int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize); if (length <= 0) { comm->wrs[r].sg_list = NULL; comm->wrs[r].num_sge = 0; } else { // Select proper lkey comm->sges[r].lkey = reqs[r]->send.lkeys[devIndex]; comm->sges[r].length = length; comm->wrs[r].sg_list = comm->sges+r; comm->wrs[r].num_sge = 1; } } if (nreqs > 1) { // Also make sure lastWr writes remote sizes using the right lkey comm->remSizesFifo.sge.lkey = comm->remSizesFifo.mrs[devIndex]->lkey; lastWr->wr.rdma.rkey = comm->remSizesFifo.rkeys[devIndex]; } struct ibv_send_wr* bad_wr; NCCLCHECK(wrap_ibv_post_send(qp->qp, comm->wrs, &bad_wr)); for (int r=0; rsend.size, nqps), align) * align; reqs[r]->send.offset += chunkSize; comm->sges[r].addr += chunkSize; comm->wrs[r].wr.rdma.remote_addr += chunkSize; } // Select the next qpIndex comm->base.qpIndex = (comm->base.qpIndex+1) % comm->base.nqps; } return ncclSuccess; } ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle; // Wait for the receiver to have posted the corresponding receive int nreqs = 0; volatile struct ncclIbSendFifo* slots; int slot = (comm->fifoHead) % MAX_REQUESTS; struct ncclIbRequest** reqs = comm->fifoReqs[slot]; slots = comm->fifo[slot]; uint64_t idx = comm->fifoHead+1; if (slots[0].idx != idx) { *request = NULL; return ncclSuccess; } nreqs = slots[0].nreqs; // Wait until all data has arrived for (int r=1; r slots[r].size) size = slots[r].size; // Sanity checks if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkeys[0] == 0) { char line[SOCKET_NAME_MAXLEN + 1]; union ncclSocketAddress addr; ncclSocketGetAddr(&comm->base.sock, &addr); WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkeys[0]=%x", r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkeys[0]); return ncclInternalError; } struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(&comm->base, &req)); req->type = NCCL_NET_IB_REQ_SEND; req->sock = &comm->base.sock; req->base = &comm->base; req->nreqs = nreqs; req->send.size = size; req->send.data = data; req->send.offset = 0; // Populate events int nEvents = ncclParamIbSplitDataOnQps() ? 
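// [Worked example, editorial] The per-QP chunking in ncclIbMultiSend above,
// chunkSize = DIVUP(DIVUP(size, nqps), align) * align with align = 128, keeps every slice
// except possibly the last one a multiple of 128 bytes so LL/LL128 flag placement survives:
//   size = 1000 bytes, nqps = 4
//   DIVUP(1000, 4) = 250; DIVUP(250, 128) = 2; chunkSize = 256
//   QP passes post from offsets 0 / 256 / 512 / 768 with lengths 256, 256, 256, 232
// which covers all 1000 bytes while only the final slice is unaligned.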
comm->base.nqps : comm->base.ndevs; int qpIndex = comm->base.qpIndex; // Count down while (nEvents > 0) { ncclIbQp* qp = comm->base.qps + qpIndex; int devIndex = qp->devIndex; ncclIbAddEvent(req, devIndex, &comm->devs[devIndex].base); // Track the valid lkey for this RDMA_Write req->send.lkeys[devIndex] = mhandleWrapper->mrs[devIndex]->lkey; nEvents--; // Don't update comm->base.qpIndex yet, we need to run through this same set of QPs inside ncclIbMultiSend() qpIndex = (qpIndex+1)%comm->base.nqps; } // Store all lkeys for (int i = 0; i < comm->base.ndevs; i++) { req->send.lkeys[i] = mhandleWrapper->mrs[i]->lkey; } *request = reqs[r] = req; // If this is a multi-recv, send only when all requests have matched. for (int r=0; rnreqs, as well as other fields to help debugging and sanity checks memset((void*)slots, 0, sizeof(struct ncclIbSendFifo)); memset(reqs, 0, NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbRequest*)); comm->fifoHead++; TIME_STOP(0); return ncclSuccess; } *request = NULL; return ncclSuccess; } ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) { struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); int slot = comm->remFifo.fifoTail%MAX_REQUESTS; req->recv.sizes = comm->sizesFifo[slot]; for (int i=0; irecv.sizes[i] = 0; struct ncclIbSendFifo* localElem = comm->remFifo.elems[slot]; // Select the next devIndex (local) and QP to use for posting this CTS message // Since QPs are initialized by striping across devIndex, we can simply assign this to the same value ncclIbQp* ctsQp = comm->base.qps + comm->base.devIndex; comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.ndevs; for (int i=0; ibase.ndevs; j++) localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey; localElem[i].nreqs = n; localElem[i].size = sizes[i]; // Sanity/Debugging localElem[i].tag = tags[i]; localElem[i].idx = comm->remFifo.fifoTail+1; } wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo); // Lookup the correct fifoRkey wr.wr.rdma.rkey = comm->base.remDevs[ctsQp->remDevIdx].fifoRkey; // Set the correct sge properties comm->devs[ctsQp->devIndex].fifoSge.addr = (uint64_t)localElem; comm->devs[ctsQp->devIndex].fifoSge.length = n*sizeof(struct ncclIbSendFifo); wr.sg_list = &comm->devs[ctsQp->devIndex].fifoSge; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_WRITE; wr.send_flags = comm->remFifo.flags; // IBV_SEND_INLINE // We need to occasionally post a request with the IBV_SEND_SIGNALED flag, otherwise // the send queue will never empty. // // From https://www.rdmamojo.com/2014/06/30/working-unsignaled-completions/ // "How to use Unsignaled Completion?" / "Gotchas and Pitfalls" // All posted Send Requested, Signaled and Unsignaled, are considered outstanding until // a Work Completion that they, or Send Requests that were posted after them, was polled // from the Completion Queue associated with the Send Queue. This means if one works with // a Queue Pair that was configured to work with Unsignaled Completions, he must make // sure that occasionally (before the Send Queue is full with outstanding Send Requests) // a Send Request that generate Work Completion will be posted. 
// // Not following this rule may lead to a case that the Send Queue is full with Send // Requests that won't generate Work Completion: // // - The Send Queue is full, so no new Send Requests can be posted to it // - The Send Queue can't be emptied, since no Work Completion can be generated anymore // (the reason is that no Work Completion, that can generate Work Completion that // polling it will empty the Send Queue, can be posted) // - The status of all posted Send Request is considered unknown // // slot == devIndex - When writing to fifo slot N, and this QP lives on device index N, it should send signalled. // This works out that each fifo posting QP gets drained if (slot == ctsQp->devIndex) { wr.send_flags |= IBV_SEND_SIGNALED; wr.wr_id = req - comm->base.reqs; ncclIbAddEvent(req, ctsQp->devIndex, &comm->devs[ctsQp->devIndex].base); } struct ibv_send_wr* bad_wr; NCCLCHECK(wrap_ibv_post_send(ctsQp->qp, &wr, &bad_wr)); comm->remFifo.fifoTail++; return ncclSuccess; } ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(&comm->base, &req)); req->type = NCCL_NET_IB_REQ_RECV; req->sock = &comm->base.sock; req->nreqs = n; for (int i = 0; i < comm->base.ndevs; i++) { req->devBases[i] = &comm->devs[i].base; } struct ibv_recv_wr wr; memset(&wr, 0, sizeof(wr)); wr.wr_id = req - comm->base.reqs; wr.sg_list = NULL; wr.num_sge = 0; TIME_START(1); // Select either all QPs, or one qp per-device const int nqps = ncclParamIbSplitDataOnQps() ? 
comm->base.nqps : comm->base.ndevs; // Post recvs struct ibv_recv_wr* bad_wr; for (int i = 0; i < nqps; i++) { struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex; ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base); NCCLCHECK(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr)); comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps; } TIME_STOP(1); // Post to FIFO to notify sender TIME_START(2); NCCLCHECK(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req)); TIME_STOP(2); *request = req; return ncclSuccess; } ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; int last = -1; for (int i=0; iflushEnabled == 0 || last == -1) return ncclSuccess; // Only flush once using the last non-zero receive struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(&comm->base, &req)); req->type = NCCL_NET_IB_REQ_FLUSH; req->sock = &comm->base.sock; struct ncclIbMrHandle* mhandle = (struct ncclIbMrHandle*) mhandles[last]; // We don't know which devIndex the recv was on, so we flush on all devices for (int i = 0; i < comm->base.ndevs; i++) { struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); wr.wr_id = req - comm->base.reqs; wr.wr.rdma.remote_addr = (uint64_t)data[last]; wr.wr.rdma.rkey = mhandle->mrs[i]->rkey; wr.sg_list = &comm->devs[i].gpuFlush.sge; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_READ; wr.send_flags = IBV_SEND_SIGNALED; TIME_START(4); struct ibv_send_wr* bad_wr; NCCLCHECK(wrap_ibv_post_send(comm->devs[i].gpuFlush.qp.qp, &wr, &bad_wr)); TIME_STOP(4); ncclIbAddEvent(req, i, &comm->devs[i].base); } *request = req; return ncclSuccess; } ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { struct ncclIbRequest *r = (struct ncclIbRequest*)request; *done = 0; while (1) { if (r->events[0] == 0 && r->events[1] == 0) { TRACE(NCCL_NET, "r=%p done", r); *done = 1; if (sizes && r->type == NCCL_NET_IB_REQ_RECV) { for (int i=0; inreqs; i++) sizes[i] = r->recv.sizes[i]; } if (sizes && r->type == NCCL_NET_IB_REQ_SEND) { sizes[0] = r->send.size; } NCCLCHECK(ncclIbFreeRequest(r)); return ncclSuccess; } int totalWrDone = 0; int wrDone = 0; struct ibv_wc wcs[4]; for (int i = 0; i < NCCL_IB_MAX_DEVS_PER_NIC; i++) { TIME_START(3); // If we expect any completions from this device's CQ if (r->events[i]) { NCCLCHECK(wrap_ibv_poll_cq(r->devBases[i]->cq, 4, wcs, &wrDone)); totalWrDone += wrDone; if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); } if (wrDone == 0) continue; for (int w=0; wstatus != IBV_WC_SUCCESS) { union ncclSocketAddress addr; ncclSocketGetAddr(r->sock, &addr); char localGidString[INET6_ADDRSTRLEN] = ""; char remoteGidString[INET6_ADDRSTRLEN] = ""; const char* localGidStr = NULL, *remoteGidStr = NULL; if (r->devBases[i]->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) { localGidStr = inet_ntop(AF_INET6, &r->devBases[i]->gidInfo.localGid, localGidString, sizeof(localGidString)); remoteGidStr = inet_ntop(AF_INET6, &r->base->remDevs[i].remoteGid, remoteGidString, sizeof(remoteGidString)); } char line[SOCKET_NAME_MAXLEN+1]; char *hcaName = r->devBases[i]->pd->context->device->name; WARN("NET/IB: Got completion from peer %s with status=%d opcode=%d len=%d vendor err %d (%s)%s%s%s%s hca %s", ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type], localGidStr ? " localGid ":"", localGidString, remoteGidStr ? 
" remoteGids":"", remoteGidString, hcaName); return ncclRemoteError; } union ncclSocketAddress addr; ncclSocketGetAddr(r->sock, &addr); struct ncclIbRequest* req = r->base->reqs+(wc->wr_id & 0xff); #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%ld r=%p type=%d events={%d,%d}, i=%d", ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i); #endif if (req->type == NCCL_NET_IB_REQ_SEND) { for (int j = 0; j < req->nreqs; j++) { struct ncclIbRequest* sendReq = r->base->reqs+((wc->wr_id >> (j*8)) & 0xff); if ((sendReq->events[i] <= 0)) { WARN("NET/IB: sendReq(%p)->events={%d,%d}, i=%d, j=%d <= 0", sendReq, sendReq->events[0], sendReq->events[1], i, j); return ncclInternalError; } sendReq->events[i]--; } } else { if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { if (req->type != NCCL_NET_IB_REQ_RECV) { WARN("NET/IB: wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM and req->type=%d", req->type); return ncclInternalError; } if (req->nreqs == 1) { req->recv.sizes[0] = wc->imm_data; } } req->events[i]--; } } } } // If no CQEs found on any device, return and come back later if (totalWrDone == 0) return ncclSuccess; } } ncclResult_t ncclIbCloseSend(void* sendComm) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm) { NCCLCHECK(ncclSocketClose(&comm->base.sock)); for (int q = 0; q < comm->base.nqps; q++) if (comm->base.qps[q].qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->base.qps[q].qp)); for (int i = 0; i < comm->base.ndevs; i++) { struct ncclIbSendCommDev* commDev = comm->devs + i; if (commDev->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->fifoMr)); if (comm->remSizesFifo.mrs[i] != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remSizesFifo.mrs[i])); NCCLCHECK(ncclIbDestroyBase(&commDev->base)); } free(comm); } TIME_PRINT("IB"); return ncclSuccess; } ncclResult_t ncclIbCloseRecv(void* recvComm) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm) { NCCLCHECK(ncclSocketClose(&comm->base.sock)); for (int q = 0; q < comm->base.nqps; q++) if (comm->base.qps[q].qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->base.qps[q].qp)); for (int i = 0; i < comm->base.ndevs; i++) { struct ncclIbRecvCommDev* commDev = comm->devs + i; if (comm->flushEnabled) { if (commDev->gpuFlush.qp.qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(commDev->gpuFlush.qp.qp)); if (commDev->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->gpuFlush.hostMr)); } if (commDev->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->fifoMr)); if (commDev->sizesFifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->sizesFifoMr)); NCCLCHECK(ncclIbDestroyBase(&commDev->base)); } free(comm); } return ncclSuccess; } ncclResult_t ncclIbCloseListen(void* listenComm) { struct ncclIbListenComm* comm = (struct ncclIbListenComm*)listenComm; if (comm) { NCCLCHECK(ncclSocketClose(&comm->sock)); free(comm); } return ncclSuccess; } ncclNet_t ncclNetIb = { "IB", ncclIbInit, ncclIbDevices, ncclIbGetProperties, ncclIbListen, ncclIbConnect, ncclIbAccept, ncclIbRegMr, ncclIbRegMrDmaBuf, ncclIbDeregMr, ncclIbIsend, ncclIbIrecv, ncclIbIflush, ncclIbTest, ncclIbCloseSend, ncclIbCloseRecv, ncclIbCloseListen, NULL /* getDeviceMr */, NULL /* irecvConsumed */ }; nccl-2.22.3-1/src/transport/net_socket.cc000066400000000000000000000511161463451655400201660ustar00rootroot00000000000000/************************************************************************* * 
Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "core.h" #include "socket.h" #include "net.h" #include "param.h" #include #include #include #include #include /* Init functions */ static int ncclNetIfs = -1; struct ncclNetSocketDev { union ncclSocketAddress addr; char devName[MAX_IF_NAME_SIZE]; char* pciPath; }; static struct ncclNetSocketDev ncclNetSocketDevs[MAX_IFS]; pthread_mutex_t ncclNetSocketLock = PTHREAD_MUTEX_INITIALIZER; static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) { char devicePath[PATH_MAX]; snprintf(devicePath, PATH_MAX, "/sys/class/net/%s/device", devName); // May return NULL if the file doesn't exist. *pciPath = realpath(devicePath, NULL); return ncclSuccess; } ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) { if (ncclNetIfs == -1) { pthread_mutex_lock(&ncclNetSocketLock); if (ncclNetIfs == -1) { char names[MAX_IF_NAME_SIZE*MAX_IFS]; union ncclSocketAddress addrs[MAX_IFS]; ncclNetIfs = ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS); if (ncclNetIfs <= 0) { WARN("NET/Socket : no interface found"); return ncclInternalError; } else { #define MAX_LINE_LEN (2047) char line[MAX_LINE_LEN+1]; char addrline[SOCKET_NAME_MAXLEN+1]; line[0] = '\0'; addrline[SOCKET_NAME_MAXLEN] = '\0'; for (int i=0; i 0) { *speed = strtol(speedStr, NULL, 0); } close(fd); } if (*speed <= 0) { INFO(NCCL_NET, "Could not get speed from %s. Defaulting to 10 Gbps.", speedPath); *speed = 10000; } return ncclSuccess; } ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) { props->name = ncclNetSocketDevs[dev].devName; props->pciPath = ncclNetSocketDevs[dev].pciPath; props->guid = dev; props->ptrSupport = NCCL_PTR_HOST; props->regIsGlobal = 0; NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed)); props->latency = 0; // Not set props->port = 0; props->maxComms = 65536; props->maxRecvs = 1; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; return ncclSuccess; } /* Communication functions */ #define MAX_SOCKETS 64 #define MAX_THREADS 16 #define MAX_REQUESTS NCCL_NET_MAX_REQUESTS #define MIN_CHUNKSIZE (64*1024) NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2); NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2); enum ncclNetSocketCommState { ncclNetSocketCommStateStart = 0, ncclNetSocketCommStateConnect = 1, ncclNetSocketCommStateAccept = 3, ncclNetSocketCommStateSend = 4, ncclNetSocketCommStateRecv = 5, }; struct ncclNetSocketCommStage { enum ncclNetSocketCommState state; uint8_t iteration; struct ncclSocket* sock; struct ncclNetSocketComm* comm; }; struct ncclNetSocketHandle { union ncclSocketAddress connectAddr; uint64_t magic; // random number to help debugging int nSocks; int nThreads; struct ncclNetSocketCommStage stage; }; struct ncclNetSocketTask { int op; void* data; int size; struct ncclSocket* sock; int offset; int used; ncclResult_t result; }; struct ncclNetSocketRequest { int op; void* data; int size; struct ncclSocket* ctrlSock; int offset; int used; struct ncclNetSocketComm* comm; struct ncclNetSocketTask* tasks[MAX_SOCKETS]; int nSubs; }; struct ncclNetSocketTaskQueue { int next; int len; struct ncclNetSocketTask* tasks; }; struct ncclNetSocketThreadResources { struct ncclNetSocketTaskQueue threadTaskQueue; int stop; struct ncclNetSocketComm* comm; pthread_mutex_t threadLock; pthread_cond_t 
threadCond; }; struct ncclNetSocketListenComm { struct ncclSocket sock; struct ncclNetSocketCommStage stage; int nSocks; int nThreads; int dev; }; struct ncclNetSocketComm { struct ncclSocket ctrlSock; struct ncclSocket socks[MAX_SOCKETS]; int dev; int cudaDev; int nSocks; int nThreads; int nextSock; struct ncclNetSocketRequest requests[MAX_REQUESTS]; pthread_t helperThread[MAX_THREADS]; struct ncclNetSocketThreadResources threadResources[MAX_THREADS]; }; void* persistentSocketThread(void *args_) { struct ncclNetSocketThreadResources* resource = (struct ncclNetSocketThreadResources*)args_; struct ncclNetSocketComm* comm = resource->comm; struct ncclNetSocketTaskQueue* myQueue = &resource->threadTaskQueue; int nSocksPerThread = comm->nSocks / comm->nThreads; while (1) { int idle = 1; int mark = myQueue->next; // mark newest task seen for (int i=0; ilen; i+=nSocksPerThread) { int repeat; do { repeat = 0; for (int j=0; jtasks+i+j; if (r != NULL && r->used == 1 && r->offset < r->size) { r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset); if (r->result != ncclSuccess) { WARN("NET/Socket : socket progress error"); return NULL; } idle = 0; if (r->offset < r->size) repeat = 1; } } } while (repeat); } if (idle) { pthread_mutex_lock(&resource->threadLock); while (mark == myQueue->next && resource->stop == 0) { // no new tasks, wait pthread_cond_wait(&resource->threadCond, &resource->threadLock); } pthread_mutex_unlock(&resource->threadLock); } if (resource->stop) return NULL; } } ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) { int nSocksPerThread = ncclParamSocketNsocksPerThread(); int nThreads = ncclParamSocketNthreads(); if (nThreads > MAX_THREADS) { WARN("NET/Socket : NCCL_SOCKET_NTHREADS is greater than the maximum allowed, setting to %d", MAX_THREADS); nThreads = MAX_THREADS; } if (nThreads == -2 || nSocksPerThread == -2) { // Auto-detection int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads char vendorPath[PATH_MAX]; snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetSocketDevs[dev].devName); char* rPath = realpath(vendorPath, NULL); int fd = open(rPath, O_RDONLY); free(rPath); if (fd == -1) { // Could not find device vendor. This is handled silently so // we don't want to print an INFO error. 
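// ----------------------------------------------------------------------------
// Illustrative sketch (kept out of the build with #if 0): the vendor-based
// auto-detection performed by ncclNetSocketGetNsockNthread below. It reads the
// NIC's PCI vendor id from sysfs and picks a (threads, sockets-per-thread)
// pair known to work well on that platform: 0x1d0f (labelled AWS in the code
// below) -> 2 threads x 8 sockets, 0x1ae0 (labelled GCP) -> 4 threads x 1
// socket, anything else -> main thread only. The function name, buffer sizes
// and simplified error handling are assumptions of this sketch.
#if 0
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <unistd.h>

static void sketchAutoDetect(const char* ifName, int* nThreads, int* nSocksPerThread) {
  *nThreads = 0; *nSocksPerThread = 1;              // default: only the main thread
  char path[256];
  snprintf(path, sizeof(path), "/sys/class/net/%s/device/vendor", ifName);
  int fd = open(path, O_RDONLY);
  if (fd == -1) return;                             // no vendor file: keep defaults
  char vendor[7] = "0x0000";
  ssize_t len = read(fd, vendor, 6);                // vendor id as "0xNNNN"
  (void)len;
  close(fd);
  if (strcmp(vendor, "0x1d0f") == 0) { *nThreads = 2; *nSocksPerThread = 8; }       // AWS
  else if (strcmp(vendor, "0x1ae0") == 0) { *nThreads = 4; *nSocksPerThread = 1; }  // GCP
}
#endif
// ----------------------------------------------------------------------------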
TRACE(NCCL_NET, "Open of %s failed : %s", vendorPath, strerror(errno)); goto end; } char vendor[7]; strncpy(vendor, "0x0000", 7); int len; SYSCHECKVAL(read(fd, vendor, 6), "read", len); SYSCHECK(close(fd), "close"); if (strcmp(vendor, "0x1d0f") == 0) { // AWS autoNt = 2; autoNs = 8; } else if (strcmp(vendor, "0x1ae0") == 0) { // GCP autoNt = 4; autoNs = 1; } end: if (nThreads == -2) nThreads = autoNt; if (nSocksPerThread == -2) nSocksPerThread = autoNs; } int nSocks = nSocksPerThread * nThreads; if (nSocks > MAX_SOCKETS) { nSocksPerThread = MAX_SOCKETS/nThreads; WARN("NET/Socket : the total number of sockets is greater than the maximum allowed, setting NCCL_NSOCKS_PERTHREAD to %d", nSocksPerThread); nSocks = nSocksPerThread * nThreads; } *ns = nSocks; *nt = nThreads; if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread); return ncclSuccess; } ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev return ncclInternalError; } struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle; memset(handle, 0, sizeof(struct ncclNetSocketHandle)); static_assert(sizeof(struct ncclNetSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclNetSocketHandle size too large"); struct ncclNetSocketListenComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); handle->magic = NCCL_SOCKET_MAGIC; NCCLCHECK(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1)); NCCLCHECK(ncclSocketListen(&comm->sock)); NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr)); NCCLCHECK(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads)); handle->nSocks = comm->nSocks; handle->nThreads = comm->nThreads; comm->dev = dev; *listenComm = comm; return ncclSuccess; } ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev return ncclInternalError; } int ready; struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle; struct ncclNetSocketCommStage* stage = &handle->stage; struct ncclNetSocketComm* comm = stage->comm; uint8_t i = stage->iteration; struct ncclSocket* sock = stage->sock; *sendComm = NULL; if (stage->state == ncclNetSocketCommStateConnect) goto socket_connect_check; if (stage->state == ncclNetSocketCommStateSend) goto socket_send; NCCLCHECK(ncclCalloc(&comm, 1)); stage->comm = comm; comm->nSocks = handle->nSocks; comm->nThreads = handle->nThreads; comm->dev = dev; CUDACHECK(cudaGetDevice(&comm->cudaDev)); for (; inSocks+1; i++) { sock = (i == comm->nSocks) ? &comm->ctrlSock : comm->socks+i; NCCLCHECK(ncclSocketInit(sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetSocket, NULL, 1)); stage->sock = sock; stage->state = ncclNetSocketCommStateConnect; stage->iteration = i; NCCLCHECK(ncclSocketConnect(sock)); socket_connect_check: NCCLCHECK(ncclSocketReady(sock, &ready)); if (! 
ready) return ncclSuccess; stage->state = ncclNetSocketCommStateSend; socket_send: int done = 0; NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &i, sizeof(uint8_t), &done)); if (done == 0) return ncclSuccess; } *sendComm = comm; return ncclSuccess; } ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { struct ncclNetSocketListenComm* lComm = (struct ncclNetSocketListenComm*)listenComm; struct ncclNetSocketCommStage* stage = &lComm->stage; struct ncclNetSocketComm* rComm = stage->comm; uint8_t i = stage->iteration; struct ncclSocket* sock = stage->sock; int ready; *recvComm = NULL; if (stage->state == ncclNetSocketCommStateAccept) goto socket_accept_check; if (stage->state == ncclNetSocketCommStateRecv) goto socket_recv; NCCLCHECK(ncclCalloc(&rComm, 1)); stage->comm = rComm; rComm->nSocks = lComm->nSocks; rComm->nThreads = lComm->nThreads; rComm->dev = lComm->dev; CUDACHECK(cudaGetDevice(&rComm->cudaDev)); for (; inSocks+1; i++) { uint8_t sendSockIdx; NCCLCHECK(ncclCalloc(&sock, 1)); NCCLCHECK(ncclSocketInit(sock)); stage->sock = sock; stage->state = ncclNetSocketCommStateAccept; stage->iteration = i; NCCLCHECK(ncclSocketAccept(sock, &lComm->sock)); socket_accept_check: NCCLCHECK(ncclSocketReady(sock, &ready)); if (!ready) return ncclSuccess; stage->state = ncclNetSocketCommStateRecv; socket_recv: int done = 0; NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &sendSockIdx, sizeof(uint8_t), &done)); if (done == 0) return ncclSuccess; if (sendSockIdx == rComm->nSocks) memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket)); else memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket)); free(sock); } *recvComm = rComm; /* reset lComm state */ stage->state = ncclNetSocketCommStateStart; stage->iteration = 0; stage->sock = NULL; stage->comm = NULL; return ncclSuccess; } ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketRequest** req) { for (int i=0; irequests+i; if (r->used == 0) { r->op = op; r->data = data; r->size = size; r->ctrlSock = &comm->ctrlSock; r->used = 1; r->comm = comm; r->nSubs = 0; *req = r; return ncclSuccess; } } WARN("NET/Socket : unable to allocate requests"); return ncclInternalError; } ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketTask** req) { int tid = comm->nextSock % comm->nThreads; struct ncclNetSocketThreadResources* res = comm->threadResources+tid; struct ncclNetSocketTaskQueue* queue = &res->threadTaskQueue; // create helper threads and prepare per-thread task queue if (queue->tasks == NULL) { // each request can be divided up to nSocks tasks, and // these tasks are distributed to nThreads threads, // we need to make sure each thread queue has enough slots for MAX_REQUESTS queue->len = MAX_REQUESTS * DIVUP(comm->nSocks, comm->nThreads); NCCLCHECK(ncclCalloc(&queue->tasks, queue->len)); queue->next = 0; res->comm = comm; pthread_mutex_init(&res->threadLock, NULL); pthread_cond_init(&res->threadCond, NULL); pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res); ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 
'S' : 'R', comm->dev, tid, comm->cudaDev); } struct ncclNetSocketTask* r = queue->tasks+queue->next; if (r->used == 0) { r->op = op; r->data = data; r->size = size; r->sock = comm->socks + comm->nextSock; r->offset = 0; r->result = ncclSuccess; comm->nextSock = (comm->nextSock + 1) % comm->nSocks; r->used = 1; *req = r; pthread_mutex_lock(&res->threadLock); queue->next = (queue->next+1)%queue->len; pthread_cond_signal(&res->threadCond); pthread_mutex_unlock(&res->threadLock); return ncclSuccess; } WARN("NET/Socket : unable to allocate subtasks"); return ncclInternalError; } ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { *done = 0; struct ncclNetSocketRequest *r = (struct ncclNetSocketRequest*)request; if (r == NULL) { WARN("NET/Socket : test called with NULL request"); return ncclInternalError; } if (r->used == 1) { /* try to send/recv size */ int data = r->size; int offset = 0; NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, &data, sizeof(int), &offset)); if (offset == 0) return ncclSuccess; /* Not ready -- retry later */ // Not sure we could ever receive less than 4 bytes, but just in case ... if (offset < sizeof(int)) NCCLCHECK(ncclSocketWait(r->op, r->ctrlSock, &data, sizeof(int), &offset)); // Check size is less or equal to the size provided by the user if (r->op == NCCL_SOCKET_RECV && data > r->size) { char line[SOCKET_NAME_MAXLEN+1]; union ncclSocketAddress addr; ncclSocketGetAddr(r->ctrlSock, &addr); WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \ there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks", ncclSocketToString(&addr, line), data, r->size); return ncclInvalidUsage; } r->size = data; r->offset = 0; r->used = 2; // done exchanging size // divide into subtasks int chunkOffset = 0, i = 0; if (r->comm->nSocks > 0) { // each request can be divided up to nSocks tasks int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks)); while (chunkOffset < r->size) { int chunkSize = std::min(taskSize, r->size-chunkOffset); NCCLCHECK(ncclNetSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); chunkOffset += chunkSize; } } r->nSubs = i; } if (r->used == 2) { // already exchanged size if (r->nSubs > 0) { int nCompleted = 0; for (int i=0; inSubs; i++) { struct ncclNetSocketTask* sub = r->tasks[i]; if (sub->result != ncclSuccess) return sub->result; if (sub->offset == sub->size) nCompleted++; } if (nCompleted == r->nSubs) { if (size) *size = r->size; *done = 1; r->used = 0; for (int i=0; inSubs; i++) { struct ncclNetSocketTask* sub = r->tasks[i]; sub->used = 0; } } } else { // progress request using main thread if (r->offset < r->size) { NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset)); } if (r->offset == r->size) { if (size) *size = r->size; *done = 1; r->used = 0; } } } return ncclSuccess; } ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, void** mhandle) { return (type != NCCL_PTR_HOST) ? 
ncclInternalError : ncclSuccess; } ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm; NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclNetSocketRequest**)request)); return ncclSuccess; } ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm; if (n != 1) return ncclInternalError; NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclNetSocketRequest**)request)); return ncclSuccess; } ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { // We don't support CUDA pointers, so we don't need a flush operation return ncclInternalError; } ncclResult_t ncclNetSocketCloseListen(void* opaqueComm) { struct ncclNetSocketListenComm* comm = (struct ncclNetSocketListenComm*)opaqueComm; if (comm) { int ready; NCCLCHECK(ncclSocketReady(&comm->sock, &ready)); if (ready) NCCLCHECK(ncclSocketClose(&comm->sock)); free(comm); } return ncclSuccess; } ncclResult_t ncclNetSocketClose(void* opaqueComm) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)opaqueComm; if (comm) { for (int i=0; inThreads; i++) { struct ncclNetSocketThreadResources* res = comm->threadResources+i; if (comm->helperThread[i]) { pthread_mutex_lock(&res->threadLock); res->stop = 1; pthread_cond_signal(&res->threadCond); pthread_mutex_unlock(&res->threadLock); pthread_join(comm->helperThread[i], NULL); } free(res->threadTaskQueue.tasks); } int ready; NCCLCHECK(ncclSocketReady(&comm->ctrlSock, &ready)); if (ready) NCCLCHECK(ncclSocketClose(&comm->ctrlSock)); for (int i=0; inSocks; i++) { NCCLCHECK(ncclSocketReady(&comm->socks[i], &ready)); if (ready) NCCLCHECK(ncclSocketClose(&comm->socks[i])); } free(comm); } return ncclSuccess; } ncclNet_t ncclNetSocket = { "Socket", ncclNetSocketInit, ncclNetSocketDevices, ncclNetSocketGetProperties, ncclNetSocketListen, ncclNetSocketConnect, ncclNetSocketAccept, ncclNetSocketRegMr, NULL, // No DMA-BUF support ncclNetSocketDeregMr, ncclNetSocketIsend, ncclNetSocketIrecv, ncclNetSocketIflush, ncclNetSocketTest, ncclNetSocketClose, ncclNetSocketClose, ncclNetSocketCloseListen, NULL /* getDeviceMr */, NULL /* irecvConsumed */ }; nccl-2.22.3-1/src/transport/nvls.cc000066400000000000000000001216701463451655400170150ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ // Implementation of the NVLink SHARP (NVLS) transport #include "comm.h" #include "graph.h" #include "utils.h" #include "proxy.h" #include "enqueue.h" #include "register.h" #include "transport.h" #if CUDART_VERSION >= 12010 struct graphRegData { uintptr_t offset; size_t size; }; struct localRegData { struct ncclReg reg; intptr_t offset; }; ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { // This transport cannot be used for p2p *ret = 0; return ncclSuccess; } ncclResult_t nvlsSendFree(struct ncclConnector* send) { return ncclSuccess; } ncclResult_t nvlsRecvFree(struct ncclConnector* recv) { return ncclSuccess; } struct ncclTransport nvlsTransport = { "NVLS", nvlsCanConnect, { NULL, NULL, nvlsSendFree, NULL, NULL, NULL, NULL, NULL }, { NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL } }; ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle) { CUmemAllocationHandleType type = ncclCuMemHandleType; size_t size = prop->size; // Create a Multicast group INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zu on rank %d", nranks, size, rank); CUCHECK(cuMulticastCreate(mcHandle, prop)); if (type == CU_MEM_HANDLE_TYPE_FABRIC) { // Get a handle to pass to other ranks CUCHECK(cuMemExportToShareableHandle(shareableHandle, *mcHandle, ncclCuMemHandleType, 0)); } else { memcpy(shareableHandle, mcHandle, sizeof(CUmemGenericAllocationHandle)); } INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zu on rank %d", *mcHandle, nranks, size, rank); return ncclSuccess; } ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) { CUmemAllocationHandleType type = ncclCuMemHandleType; INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank); // Import and map the remote memory descriptor to the local GPU if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // cuMem UDS support int fd = -1; TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle %p from rank %d", comm->localRank, shareableHandle, rank); int tpProxyRank = comm->topParentRanks[rank]; TRACE(NCCL_NVLS, "NVLS rank %d request conversion of handle 0x%lx from rank %d", comm->localRank, *(uint64_t*)shareableHandle, rank); NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpProxyRank, shareableHandle, &fd)); TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank); CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type)); (void) close(fd); } else { if (type == CU_MEM_HANDLE_TYPE_FABRIC) { CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type)); } else { memcpy(mcHandle, shareableHandle, sizeof(CUmemGenericAllocationHandle)); } } return ncclSuccess; } ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAllocationHandle* mcHandle) { int dev = comm->cudaDev; INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zu dev %d", *mcHandle, size, dev); // Unbind physical memory from group for the given device CUCHECK(cuMulticastUnbind(*mcHandle, dev, 0/*mcOffset*/, size)); return ncclSuccess; } ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, 
size_t size) { CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, size)); CUCHECK(cuMemUnmap(ptr, size)); CUCHECK(cuMemAddressFree(ptr, size)); CUCHECK(cuMemRelease(*mcHandler)); return ncclSuccess; } ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, CUmemGenericAllocationHandle* ucHandle, void* mcptr, CUmemGenericAllocationHandle* mcHandle) { INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", *ucHandle, ucptr, *mcHandle, mcptr); // Release the UC memory and mapping CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size)); CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size)); CUCHECK(cuMemRelease(*ucHandle)); // Release the MC memory and mapping CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size)); CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size)); CUCHECK(cuMemRelease(*mcHandle)); return ncclSuccess; } #include "bootstrap.h" #include "channel.h" #define NVLS_MEM_ALIGN_SIZE (1 << 21) NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2); NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16); NCCL_PARAM(NvlsChunkSize, "NVLS_CHUNKSIZE", 128*1024); ncclResult_t ncclNvlsInit(struct ncclComm* comm) { comm->nvlsSupport = 0; comm->nvlsChannels = 0; int gpuCount; NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount)); if (!ncclParamNvlsEnable() || ((!comm->MNNVL && gpuCount <= 2) || (comm->MNNVL && comm->clique.size <= 2))) return ncclSuccess; CUdevice dev; int driverVersion; if (CUPFN(cuDeviceGet) == NULL) return ncclSuccess; CUCHECK(cuCtxGetDevice(&dev)); CUDACHECK(cudaDriverGetVersion(&driverVersion)); if (ncclParamNvlsEnable() == 2) { // NVLS Multicast support requires CUDA12.1 UMD + KMD if (CUPFN(cuMulticastCreate) != NULL /*&& driverVersion >= 12010 */) { CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); } } else { comm->nvlsSupport = 1; } INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? 
"" : "not ", dev); if (comm->nvlsSupport == 1) comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (int)ncclParamNvlsChannels())); return ncclSuccess; } ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) { ncclResult_t ret = ncclSuccess; if (comm && comm->nvlsSupport && comm->nNodes > 1) { for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail); NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_NVLS], 0), ret, fail); INFO(NCCL_INIT, "Connected NVLS tree"); } exit: return ret; fail: goto exit; } static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularity_flags mcOption, const CUmemAccessDesc* desc, size_t* sizePtr, CUmemGenericAllocationHandle* ucHandle, CUmemGenericAllocationHandle* mcHandle, void** ucptr, void** mcptr) { char shareableHandle[NVLS_HANDLE_SIZE]; CUmulticastObjectProp mcprop; CUmemAllocationProp ucprop; ncclResult_t ret = ncclSuccess; size_t size = *sizePtr; size_t originSize = size; size_t ucgran, mcgran; memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); mcprop.numDevices = comm->localRanks; mcprop.handleTypes = ncclCuMemHandleType; mcprop.flags = 0; mcprop.size = size; CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, mcOption), ret, fail); ALIGN_SIZE(size, mcgran); *sizePtr = mcprop.size = size; if (comm->localRank == 0) { NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], mcHandle), ret, fail); } CUCHECKGOTO(cuMulticastAddDevice(*mcHandle, comm->cudaDev), ret, fail); memset(&ucprop, 0, sizeof(CUmemAllocationProp)); ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; ucprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; ucprop.location.id = comm->cudaDev; ucprop.requestedHandleTypes = ncclCuMemHandleType; CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); // Map a VA for UC memory CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, size, ucgran, 0U, 0), ret, fail); // Alloc local physical mem for this NVLS group CUCHECKGOTO(cuMemCreate(ucHandle, size, &ucprop, 0), ret, fail); CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, size, 0, *ucHandle, 0), ret, fail); CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, size, desc, 1), ret, fail); CUDACHECKGOTO(cudaMemset(*ucptr, 0, size), ret, fail); // Bind physical memory to the Multicast group // NB: It will block until all ranks have been added to the Group CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, size, 0/*flags*/), ret, fail); // Map mc virtual address CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, size, mcgran, 0U, 0), ret, fail); CUCHECKGOTO(cuMemMap((CUdeviceptr)*mcptr, size, 0, *mcHandle, 0), ret, fail); CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*mcptr, size, 
desc, 1), ret, fail); INFO(NCCL_NVLS, "NVLS rank %d (dev %d) alloc done, ucptr %p ucgran %ld mcptr %p mcgran %ld size %ld (%ld)", comm->rank, comm->cudaDev, *ucptr, ucgran, *mcptr, mcgran, size, originSize); exit: return ret; fail: goto exit; } ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { int nHeads = -1; int headRank = -1; ncclResult_t res = ncclSuccess; int nvlsStepSize = -1; size_t buffSize = 0; size_t nvlsPerRankSize = 0; size_t nvlsTotalSize = 0; struct ncclNvlsSharedRes* resources = NULL; int nChannels = -1; if (comm->nvlsSupport == 0 || comm->nvlsResources->inited) return ncclSuccess; // initialize after checking comm->nvlsSupport nHeads = comm->channels[0].nvls.nHeads; headRank = comm->channels[0].nvls.headRank; resources = comm->nvlsResources; nChannels = comm->nvlsResources->nChannels; nvlsStepSize = comm->nvlsChunkSize; buffSize = nvlsStepSize * NCCL_STEPS; nvlsPerRankSize = nChannels * 2 * buffSize; nvlsTotalSize = nvlsPerRankSize * nHeads; INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zu nvlsPerRankSize %zu nvlsTotalSize %zu", comm, headRank, nHeads, buffSize, nvlsPerRankSize, nvlsTotalSize); NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_RECOMMENDED, &resources->accessDesc, &nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff), res, fail); resources->buffSize = nvlsTotalSize; NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail); for (int h = 0; h < nHeads; h++) { int nvlsPeer = comm->nRanks + 1 + h; for (int c = 0; c < nChannels; c++) { struct ncclChannel* channel = comm->channels + c; struct ncclChannelPeer* peer = channel->peers[nvlsPeer]; // Reduce UC -> MC peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = resources->ucBuff + (h * 2 * nChannels + c) * buffSize; peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = resources->mcBuff + (h * 2 * nChannels + c) * buffSize; // Broadcast MC -> UC peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * buffSize; peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * buffSize; CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); } } NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail); NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail); // For now, the barrier is a must that guarantees all buffers are mc-mapped before accessing peer's buffer NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, 
fail); comm->nvlsResources->inited = true; exit: return res; fail: comm->nvlsResources->inited = false; goto exit; } ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { ncclResult_t res = ncclSuccess; size_t typeSize; char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; uintptr_t *nvlsShmem = NULL; bool nvlsShare = parent && parent->nvlsSupport && parent->config.splitShare; int nHeads = comm->channels[0].nvls.nHeads; if (comm->nvlsSupport == 0 || comm->nvlsChannels == 0) return ncclSuccess; if (nvlsShare && parent->channels[0].nvls.nHeads == nHeads) { for (int ch = 0; ch < nHeads; ++ch) { bool find = false; for (int h = 0; h < parent->channels[0].nvls.nHeads; ++h) { if (comm->nvlsHeads[ch] == parent->nvlsHeads[h]) { // find the head find = true; break; } } if (find == false) { nvlsShare = false; goto setup; } } nvlsShare = true; } else { nvlsShare = false; } setup: comm->nvlsChunkSize = ncclParamNvlsChunkSize(); if (nvlsShare) { /* reuse NVLS resources */ comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); for (int c = 0; c < comm->nChannels; c++) { NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, fail); } comm->nvlsResources = parent->nvlsResources; ncclAtomicRefCountIncrement(&parent->nvlsResources->refCount); } else { struct ncclNvlsSharedRes* resources = NULL; int nHeads = comm->channels[0].nvls.nHeads; int nChannels = comm->nChannels; size_t memSize = 16; size_t creditSize = nChannels * 2 * memSize * nHeads; int nvlsStepSize = comm->nvlsChunkSize; NCCLCHECKGOTO(ncclCalloc(&comm->nvlsResources, 1), res, fail); comm->nvlsResources->inited = false; comm->nvlsResources->refCount = 1; comm->nvlsResources->nChannels = comm->nvlsChannels; resources = comm->nvlsResources; if (parent && parent->nvlsSupport && parent->config.splitShare) { /* ranks on other nodes might share the NVLS resources, we need to cap nvlsChannels * to make sure nvlsChannels match for each rank. 
*/ comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); } comm->nvlsResources->nChannels = comm->nvlsChannels; for (int c = 0; c < comm->nChannels; c++) { NCCLCHECKGOTO(initNvlsChannel(comm, c, NULL, false), res, fail); } memset(&resources->accessDesc, 0, sizeof(resources->accessDesc)); resources->accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; resources->accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; resources->accessDesc.location.id = comm->cudaDev; resources->dev = comm->cudaDev; NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_MINIMUM, &resources->accessDesc, &creditSize, &resources->ucCreditHandle, &resources->mcCreditHandle, (void**)&resources->ucCredit, (void**)&resources->mcCredit), res, fail); resources->creditSize = creditSize; // Set up head and tail only for now NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail); for (int h = 0; h < nHeads; h++) { int nvlsPeer = comm->nRanks + 1 + h; for (int c = 0; c < nChannels; c++) { struct ncclChannel* channel = comm->channels + c; char* mem = NULL; struct ncclChannelPeer* peer = channel->peers[nvlsPeer]; // Reduce UC -> MC mem = resources->ucCredit + (h * 2 * nChannels + c) * memSize; peer->send[1].transportComm = &nvlsTransport.send; peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = NULL; peer->send[1].conn.head = (uint64_t*)mem; peer->send[1].conn.tail = (uint64_t*)(mem + memSize / 2); peer->send[1].conn.stepSize = nvlsStepSize; mem = resources->mcCredit + (h * 2 * nChannels + c) * memSize; peer->recv[0].transportComm = &nvlsTransport.recv; peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = NULL; peer->recv[0].conn.head = (uint64_t*)mem; peer->recv[0].conn.tail = (uint64_t*)(mem + memSize / 2); peer->recv[0].conn.stepSize = nvlsStepSize; peer->recv[0].conn.flags |= NCCL_NVLS_MIN_POLL; // Broadcast MC -> UC mem = resources->ucCredit + ((h * 2 + 1) * nChannels + c) * memSize; peer->recv[1].transportComm = &nvlsTransport.recv; peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = NULL; peer->recv[1].conn.head = (uint64_t*)mem; peer->recv[1].conn.tail = (uint64_t*)(mem + memSize / 2); peer->recv[1].conn.stepSize = nvlsStepSize; mem = resources->mcCredit + ((h * 2 + 1) * nChannels + c) * memSize; peer->send[0].transportComm = &nvlsTransport.send; peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = NULL; peer->send[0].conn.head = (uint64_t*)mem; peer->send[0].conn.tail = (uint64_t*)(mem + memSize / 2); peer->send[0].conn.stepSize = nvlsStepSize; peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL; CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); } } NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail); 
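// ----------------------------------------------------------------------------
// Illustrative sketch (kept out of the build with #if 0): the credit-slot
// layout wired up in the loop above. For every NVLS head h and channel c two
// 16-byte slots are carved out of one allocation: slot (h*2*nChannels + c) for
// the reduce path (UC -> MC) and slot ((h*2+1)*nChannels + c) for the
// broadcast path (MC -> UC); the first half of each slot holds the head
// counter and the second half the tail counter. The helper below only
// reproduces that index arithmetic; its names are local to the sketch.
#if 0
#include <cstddef>

struct CreditSlot { size_t headOff, tailOff; };

static CreditSlot sketchCreditSlot(int h, int c, int nChannels, bool broadcast) {
  const size_t memSize = 16;                               // per-slot size used above
  size_t slotIdx = ((size_t)h * 2 + (broadcast ? 1 : 0)) * nChannels + c;
  size_t base = slotIdx * memSize;
  return { base, base + memSize / 2 };                     // head in 1st half, tail in 2nd
}
// ucCredit and mcCredit are indexed with the same arithmetic; the data buffers
// in ncclNvlsBufferSetup() follow the identical pattern with buffSize in place
// of the 16-byte credit slot.
#endif
// ----------------------------------------------------------------------------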
NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail); } // MNNVL does not support NVLS buffer registration if (!comm->MNNVL && comm->nvlsResources->nvlsShmemHandle == NULL) { /* create shared memory for fast NVLS buffer registration */ typeSize = sizeof(struct localRegData) << 1; if (comm->localRank == 0) { shmPath[0] = '\0'; NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail); } /* need 2 pools and a shared counter for shmem-based collectives */ comm->nvlsResources->nvlsShmem.cnt[0] = (size_t*)nvlsShmem; comm->nvlsResources->nvlsShmem.ptr[0] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[0] + sizeof(size_t)); comm->nvlsResources->nvlsShmem.cnt[1] = (size_t*)((char*)comm->nvlsResources->nvlsShmem.ptr[0] + typeSize * comm->localRanks); comm->nvlsResources->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[1] + sizeof(size_t)); comm->nvlsResources->nvlsShmem.round = 0; comm->nvlsResources->nvlsShmem.maxTypeSize = typeSize; } exit: return res; fail: comm->nvlsSupport = 0; goto exit; } ncclResult_t ncclNvlsFree(struct ncclComm* comm) { struct ncclNvlsSharedRes* resources = (struct ncclNvlsSharedRes*)comm->nvlsResources; if (resources == NULL) return ncclSuccess; if (ncclAtomicRefCountDecrement(&resources->refCount) == 0) { if (!comm->MNNVL && resources->nvlsShmemHandle) NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle)); if (resources->ucCredit && resources->mcCredit) { NCCLCHECK(nvlsGroupUnbind(comm, resources->creditSize, &resources->mcCreditHandle)); NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditSize, resources->ucCredit, &resources->ucCreditHandle, resources->mcCredit, &resources->mcCreditHandle)); } if (comm->nvlsResources->inited) { NCCLCHECK(nvlsGroupUnbind(comm, resources->buffSize, &resources->mcBuffHandle)); NCCLCHECK(nvlsGroupUnmapMem(comm, resources->buffSize, resources->ucBuff, &resources->ucBuffHandle, resources->mcBuff, &resources->mcBuffHandle)); } free(resources); comm->nvlsResources = NULL; } return ncclSuccess; } ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, bool *regUsed) { ncclResult_t ret = ncclSuccess; struct ncclReg *regRecord = NULL; CUdeviceptr regPtr = 0; CUmulticastObjectProp mcprop; CUmemAllocationProp ucprop; char shareableHandle[NVLS_HANDLE_SIZE]; CUmemGenericAllocationHandle mcHandle; size_t minSize = SIZE_MAX; bool localRegBufUsed = false; struct localRegData* regData = NULL; cudaPointerAttributes attr; size_t ucgran, mcgran; NCCLCHECKGOTO(ncclCalloc(®Data, comm->localRanks), ret, fail); if (userBuff) { NCCLCHECKGOTO(ncclRegFind(comm, (void*)userBuff, buffSize, ®Record), ret, fail); if (regRecord) { CUDACHECK(cudaPointerGetAttributes(&attr, (void*)regRecord->addr)); if (attr.type == cudaMemoryTypeDevice) { size_t regSize = regRecord->pages * comm->regCache.pageSize; memset(&mcprop, 0, 
sizeof(CUmulticastObjectProp)); mcprop.numDevices = comm->localRanks; mcprop.handleTypes = ncclCuMemHandleType; mcprop.flags = 0; mcprop.size = regSize; CUCHECK(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); memset(&ucprop, 0, sizeof(CUmemAllocationProp)); ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; ucprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; ucprop.location.id = comm->cudaDev; ucprop.requestedHandleTypes = ncclCuMemHandleType; CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); CUCHECK(cuMemGetAddressRange((CUdeviceptr*)®Record->baseAddr, ®Record->baseSize, (CUdeviceptr)regRecord->addr)); if (regSize % mcgran == 0) { regRecord->regSize = regSize; } else { regRecord->regSize = regRecord->baseSize - (regRecord->addr - regRecord->baseAddr); } if (regRecord->addr % ucgran == 0 && regRecord->regSize % mcgran == 0) { regRecord->state |= NVLS_REG_POSSIBLE; memcpy(®Data[comm->localRank].reg, regRecord, sizeof(struct ncclReg)); regData[comm->localRank].offset = userBuff - regRecord->addr; } } if ((regRecord->state & NVLS_REG_POSSIBLE) == 0) { regRecord->state |= NVLS_REG_NO_SUPPORT; } } } NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank, regData, sizeof(struct localRegData)), ret, fail); for (int i = 0; i < comm->localRanks; ++i) { if ((regData[i].reg.state & NVLS_REG_POSSIBLE) == 0) { goto fail; } /* get minimal reg size of nvls buffers */ if (minSize > regData[i].reg.regSize) minSize = regData[i].reg.regSize; } /* start registration */ mcprop.size = minSize; CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); if (comm->localRank == 0) { NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &mcHandle), ret, fail); } CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail); CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail); // Create a VA for the NVLS CUCHECKGOTO(cuMemAddressReserve(®Ptr, minSize, mcgran, 0U, 0), ret, fail); // Map the VA locally CUCHECKGOTO(cuMemMap(regPtr, minSize, 0, mcHandle, 0), ret, fail); CUCHECKGOTO(cuMemSetAccess(regPtr, minSize, &comm->nvlsResources->accessDesc, 1), ret, fail); regRecord->regAddr = regPtr; regRecord->regSize = minSize; regRecord->dev = comm->nvlsResources->dev; regRecord->mcHandle = mcHandle; regRecord->state |= NVLS_REG_COMPLETE; /* get all buffer addresses */ regRecord->caddrs[comm->localRank] = regRecord->addr; NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regRecord->caddrs + comm->localRank, regRecord->caddrs, sizeof(uintptr_t)), ret, fail); /* Although registration is done, we still need to check whether the offsets are same among ranks. 
*/ for (int i = 0; i < comm->localRanks - 1; ++i) { if (regData[i].offset != regData[i + 1].offset) { goto fail; } } localRegBufUsed = true; exit: if (localRegBufUsed) *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset; *regUsed = localRegBufUsed; free(regData); return ret; fail: localRegBufUsed = false; goto exit; } ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { ncclResult_t ret = ncclSuccess; bool localRegBufUsed = false; struct localRegData *regData = NULL; bool sendNeedReg = false, recvNeedReg = false; CUdeviceptr regSendPtr = 0; CUdeviceptr regRecvPtr = 0; struct ncclReg *sendRegRecord = NULL; struct ncclReg *recvRegRecord = NULL; *outRegBufUsed = false; NCCLCHECKGOTO(ncclCalloc(®Data, comm->localRanks * 2), ret, fail); if (sendbuff) { NCCLCHECKGOTO(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord), ret, fail); if (sendRegRecord) { memcpy(®Data[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg)); regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr; } } if (recvbuff) { NCCLCHECKGOTO(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord), ret, fail); if (recvRegRecord) { memcpy(®Data[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg)); regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr; } } NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank * 2, regData, sizeof(struct localRegData) * 2), ret, fail); /* first check whether all local ranks find their registered buffer */ for (int i = 0; i < comm->localRanks; ++i) { if ((regData[i * 2].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2].reg.caddrs[i] != regData[i * 2].reg.addr) { sendNeedReg = true; } if ((regData[i * 2 + 1].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2 + 1].reg.caddrs[i] != regData[i * 2 + 1].reg.addr) { recvNeedReg = true; } if ((regData[i * 2].reg.state & NVLS_REG_NO_SUPPORT) || (regData[i * 2 + 1].reg.state & NVLS_REG_NO_SUPPORT)) { goto fail; } } if (sendNeedReg == false) { for (int i = 0; i < comm->localRanks - 1; ++i) { if (regData[i * 2].offset != regData[(i + 1) * 2].offset) { /* offset are different, we cannot apply user buffer registration */ goto fail; } } /* reuse previous registered buffer if possible */ if (!sendNeedReg) regSendPtr = (CUdeviceptr)((uintptr_t)sendRegRecord->regAddr + regData[comm->localRank * 2].offset); } if (recvNeedReg == false) { for (int i = 0; i < comm->localRanks - 1; ++i) { if (regData[i * 2 + 1].offset != regData[(i + 1) * 2 + 1].offset) { goto fail; } } if (!recvNeedReg) regRecvPtr = (CUdeviceptr)((uintptr_t)recvRegRecord->regAddr + regData[comm->localRank * 2 + 1].offset); } if ((!sendNeedReg || sendbuff == NULL) && (!recvNeedReg || recvbuff == NULL)) { localRegBufUsed = true; INFO(NCCL_NVLS, "rank %d reuse local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr); goto exit; } /* Start Registration. Not found registered buffers, then check whether both send and recv buffer locate * in register request cache. 
*/ if (sendNeedReg && sendbuff) { tryRegisterBuffer(comm, (uintptr_t)sendbuff, sendbuffSize, &regSendPtr, &localRegBufUsed); if (localRegBufUsed == false) goto fail; } if (recvNeedReg && recvbuff) { tryRegisterBuffer(comm, (uintptr_t)recvbuff, recvbuffSize, &regRecvPtr, &localRegBufUsed); if (localRegBufUsed == false) goto fail; } INFO(NCCL_NVLS, "rank %d successfully local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr); exit: *outRegBufSend = (void*)regSendPtr; *outRegBufRecv = (void*)regRecvPtr; *outRegBufUsed = localRegBufUsed; free(regData); return ncclSuccess; fail: localRegBufUsed = false; goto exit; } struct ncclNvlsCleanupCallback { struct ncclCommCallback base; CUmemGenericAllocationHandle mcHandle; CUdeviceptr ptr; int dev; size_t size; }; static ncclResult_t cleanupNvls(struct ncclComm* comm, struct ncclCommCallback* cb) { struct ncclNvlsCleanupCallback* obj = (struct ncclNvlsCleanupCallback*)cb; NCCLCHECK(ncclNvlsDeregBuffer(&obj->mcHandle, obj->ptr, obj->dev, obj->size)); INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size); free(obj); return ncclSuccess; } ncclResult_t ncclNvlsGraphRegisterBuffer( struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueEltsAdded ) { ncclResult_t ret = ncclSuccess; bool localRegBufUsed = false; struct ncclNvlsCleanupCallback* sendRecord = NULL; struct ncclNvlsCleanupCallback* recvRecord = NULL; CUdeviceptr regSendPtr = 0; CUdeviceptr regRecvPtr = 0; CUmulticastObjectProp mcprop; CUmemAllocationProp ucprop; char shareableHandle[NVLS_HANDLE_SIZE]; CUmemGenericAllocationHandle sendMcHandle, recvMcHandle; size_t sendGran = 0, recvGran = 0; bool *regBufFlags = NULL; struct graphRegData *rdata = NULL; const void *baseSend = NULL; const void *baseRecv = NULL; size_t baseSendSize = 1; size_t baseRecvSize = 1; size_t ucgran; *outRegBufUsed = false; NCCLCHECKGOTO(ncclCalloc(&regBufFlags, comm->localRanks), ret, fail); NCCLCHECKGOTO(ncclCalloc(&rdata, comm->localRanks), ret, fail); if (sendbuffSize > 0 || recvbuffSize > 0) { /* retrieve base pointer and size */ if (CUPFN(cuMemGetAddressRange) == nullptr) goto fail; if (sendbuff != NULL) CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff), ret, fail); if (recvbuff != NULL) CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff), ret, fail); memset(&ucprop, 0, sizeof(CUmemAllocationProp)); ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; ucprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; ucprop.location.id = comm->cudaDev; ucprop.requestedHandleTypes = ncclCuMemHandleType; CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); localRegBufUsed = ((uint64_t)baseSend % ucgran != 0 || (uint64_t)baseRecv % ucgran != 0) ?
false : true; regBufFlags[comm->localRank] = localRegBufUsed; NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regBufFlags, sizeof(bool)), ret, fail); for (int i = 0; i < comm->localRanks; ++i) if (regBufFlags[i] == false) goto fail; memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); mcprop.numDevices = comm->localRanks; mcprop.handleTypes = ncclCuMemHandleType; mcprop.flags = 0; if (sendbuff != NULL) { mcprop.size = baseSendSize; CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); /* check send buffer offset and size */ rdata[comm->localRank].offset = (uintptr_t)sendbuff - (uintptr_t)baseSend; rdata[comm->localRank].size = baseSendSize; NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail); baseSendSize = rdata[0].size; for (int i = 1; i < comm->localRanks; ++i) { if (rdata[0].offset != rdata[i].offset) goto fail; if (baseSendSize > rdata[i].size) baseSendSize = rdata[i].size; } if (baseSendSize % sendGran != 0) goto fail; mcprop.size = baseSendSize; /* register sendbuff */ if (comm->localRank == 0) { NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &sendMcHandle, shareableHandle), ret, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &sendMcHandle), ret, fail); } CUCHECKGOTO(cuMulticastAddDevice(sendMcHandle, comm->nvlsResources->dev), ret, fail); CUCHECKGOTO(cuMulticastBindAddr(sendMcHandle, 0, (CUdeviceptr)baseSend, baseSendSize, 0), ret, fail); // Create a VA for the NVLS CUCHECKGOTO(cuMemAddressReserve(®SendPtr, baseSendSize, sendGran, 0U, 0), ret, fail); // Map the VA locally CUCHECKGOTO(cuMemMap(regSendPtr, baseSendSize, 0, sendMcHandle, 0), ret, fail); CUCHECKGOTO(cuMemSetAccess(regSendPtr, baseSendSize, &comm->nvlsResources->accessDesc, 1), ret, fail); sendRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback)); sendRecord->base.fn = cleanupNvls; sendRecord->mcHandle = sendMcHandle; sendRecord->ptr = regSendPtr; sendRecord->dev = comm->nvlsResources->dev; sendRecord->size = baseSendSize; } if (recvbuff != NULL) { mcprop.size = baseRecvSize; CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); rdata[comm->localRank].offset = (uintptr_t)recvbuff - (uintptr_t)baseRecv; rdata[comm->localRank].size = baseRecvSize; NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail); baseRecvSize = rdata[0].size; for (int i = 1; i < comm->localRanks; ++i) { if (rdata[0].offset != rdata[i].offset) goto fail; if (baseRecvSize > rdata[i].size) baseRecvSize = rdata[i].size; } if (baseRecvSize % recvGran != 0) goto fail; mcprop.size = baseRecvSize; if (comm->localRank == 0) { NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &recvMcHandle, shareableHandle), ret, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, 
comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &recvMcHandle), ret, fail); } CUCHECKGOTO(cuMulticastAddDevice(recvMcHandle, comm->nvlsResources->dev), ret, fail); CUCHECKGOTO(cuMulticastBindAddr(recvMcHandle, 0, (CUdeviceptr)baseRecv, baseRecvSize, 0), ret, fail); // Create a VA for the NVLS CUCHECKGOTO(cuMemAddressReserve(®RecvPtr, baseRecvSize, recvGran, 0U, 0), ret, fail); // Map the VA locally CUCHECKGOTO(cuMemMap(regRecvPtr, baseRecvSize, 0, recvMcHandle, 0), ret, fail); CUCHECKGOTO(cuMemSetAccess(regRecvPtr, baseRecvSize, &comm->nvlsResources->accessDesc, 1), ret, fail); recvRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback)); recvRecord->base.fn = cleanupNvls; recvRecord->mcHandle = recvMcHandle; recvRecord->ptr = regRecvPtr; recvRecord->dev = comm->nvlsResources->dev; recvRecord->size = baseRecvSize; } localRegBufUsed = true; } exit: if (localRegBufUsed == false) { if (sendRecord) { ncclNvlsDeregBuffer(&sendRecord->mcHandle, sendRecord->ptr, sendRecord->dev, sendRecord->size); free(sendRecord); } if (recvRecord) { ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size); free(recvRecord); } } else { if (sendRecord) { *outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend); ncclIntruQueueEnqueue(cleanupQueue, &sendRecord->base); *nCleanupQueueEltsAdded += 1; } if (recvRecord) { *outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv); ncclIntruQueueEnqueue(cleanupQueue, &recvRecord->base); *nCleanupQueueEltsAdded += 1; } INFO(NCCL_NVLS, "rank %d successfully graph-registered sendbuff %p, recvbuff %p, sendbuff size %ld (register size %ld, sendGran %ld), recvbuff size %ld (register size %ld, recvGran %ld), reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, baseSendSize, sendGran, recvbuffSize, baseRecvSize, recvGran, (void*)regSendPtr, (void*)regRecvPtr); } *outRegBufUsed = localRegBufUsed; free(regBufFlags); free(rdata); /* always return success. 
*/ return ncclSuccess; fail: localRegBufUsed = false; goto exit; } #else /* * Pre CUDA 12.1 stubs */ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { comm->nvlsChannels = 0; return ncclSuccess; } ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { return ncclSuccess; } ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { return ncclSuccess; } ncclResult_t ncclNvlsFree(struct ncclComm* comm) { return ncclSuccess; } ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) { return ncclSuccess; } ncclResult_t ncclNvlsGraphRegisterBuffer( struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueEltsAdded ) { *outRegBufUsed = false; return ncclSuccess; } ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { *outRegBufUsed = false; return ncclSuccess; } ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { return ncclSuccess; } #endif /* CUDA_VERSION >= 12010 */ nccl-2.22.3-1/src/transport/p2p.cc000066400000000000000000000744221463451655400165360ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "graph.h" #include "utils.h" #include "shm.h" #include "p2p.h" #include "transport.h" enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM }; struct ncclP2pBuff { void* directPtr; size_t size; ncclIpcDesc ipcDesc; }; struct p2pConnectInfo { int rank; int read; struct ncclP2pBuff p2pBuff; // Used by CE memcpy char shmName[7]; int shmSize; }; static_assert(sizeof(struct p2pConnectInfo) <= CONNECT_SIZE, "p2pConnectInfo is too large"); struct p2pShm { struct ncclSendMem sendMem; struct ncclRecvMem recvMem; }; struct p2pShmProxyInfo { // Shared memory between proxy and receiving GPU struct p2pShm* shm; struct p2pShm* devShm; char shmName[7]; int shmSize; ncclShmHandle_t handle; // Intermediate step for sender struct ncclRecvMem* ceRecvMem; char* ceDevBuff; // Receiver buffer char* recvFifo; // Used by CE memcpy progress only uint64_t step; cudaStream_t stream; cudaEvent_t events[NCCL_STEPS]; }; static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large"); struct p2pResources { enum p2pType type; union { struct ncclSendMem* sendDevMem; struct ncclRecvMem* recvDevMem; }; void* sendMemIpc; void* recvMemIpc; // CE memcpy support struct p2pShmProxyInfo proxyInfo; struct p2pShm* shm; struct p2pShm* devShm; int shmSize; ncclShmHandle_t handle; }; // cuMem API support struct p2pCuMemProxyInfo { struct ncclP2pBuff p2pBuff; }; #include /* Convert a PCI busId string into a local cudaDev device index (cf. 
CUDA_VISIBLE_DEVICES) */ static int busIdToCudaDev(int64_t busId) { int ndev; if (cudaGetDeviceCount(&ndev) != cudaSuccess) return -1; for (int i = 0; i < ndev; i++) { char devBusIdStr[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; if (cudaDeviceGetPCIBusId(devBusIdStr, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess) return -1; int64_t devBusId; NCCLCHECK(busIdToInt64(devBusIdStr, &devBusId)); if (busId == devBusId) return i; } // BusId was not found in our locally visible CUDA devices return -1; } // CE memcpy support NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0); static int useMemcpy = 0; static void initCeOperation(); extern int64_t ncclParamMNNVLEnable(); /* Determine if two peers can communicate through p2p */ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { initCeOperation(); // MNNVL support if (ncclParamMNNVLEnable() != 0 && info1->hostHash != info2->hostHash) { NCCLCHECK(ncclTopoCheckMNNVL(topo, info1, info2, ret)); if (*ret) return ncclSuccess; } // Rule out different nodes / isolated containers if (info1->hostHash != info2->hostHash || info1->shmDev != info2->shmDev) { *ret = 0; return ncclSuccess; } // Check topology / p2p level. int intermediateRank; NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank)); if (*ret == 0) return ncclSuccess; if (intermediateRank != -1) { if (useMemcpy) *ret = 0; return ncclSuccess; } // Check if NET would work better int useNet = 0; NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet)); if (useNet) { *ret = 0; return ncclSuccess; } // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) int cudaDev1 = busIdToCudaDev(info1->busId); int cudaDev2 = busIdToCudaDev(info2->busId); if (cudaDev1 == -1 || cudaDev2 == -1) { #if CUDART_VERSION >= 10010 // CUDA 10.1 and later can use P2P with invisible devices. return ncclSuccess; #else // Peer's CUDA device is not visible in this process : we can't communicate with it. 
*ret = 0; return ncclSuccess; #endif } // Check that CUDA can do P2P int p2p; if (cudaDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != cudaSuccess) { INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)", cudaDev1, info1->busId, cudaDev2, info2->busId); *ret = 0; return ncclSuccess; } // This will always fail when using NCCL_CUMEM_ENABLE=1 if (p2p != 0 && !ncclCuMemEnable()) { // Cached result of the legacyIPC detection static int legacyIPC = -1; if (legacyIPC >= 0) { *ret = legacyIPC; return ncclSuccess; } // Check that legacy IPC support is available (WSL WAR) char *dummy; cudaIpcMemHandle_t ipc; NCCLCHECK(ncclCudaMalloc(&dummy, CUDA_IPC_MIN)); if (cudaIpcGetMemHandle(&ipc, dummy) != cudaSuccess) { INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported"); *ret = 0; } NCCLCHECK(ncclCudaFree(dummy)); legacyIPC = *ret; return ncclSuccess; } if (p2p == 0) { INFO(NCCL_INIT|NCCL_P2P,"Could not enable P2P between dev %d(=%lx) and dev %d(=%lx)", cudaDev1, info1->busId, cudaDev2, info2->busId); *ret = 0; return ncclSuccess; } return ncclSuccess; } #define TRACE_DUMP_IPC(DEVIPC) \ do { \ unsigned long *devIpc = (unsigned long *) (DEVIPC); \ TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[0], devIpc[1], devIpc[2], devIpc[3]); \ TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \ } while (0) // cuMem API support ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) { if (ncclCuMemEnable()) { #if CUDART_VERSION >= 11030 CUmemAllocationHandleType type = ncclCuMemHandleType; // cuMem API support CUmemGenericAllocationHandle handle; NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size)); if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // Return the native cuMem handle for later Export/Import via UDS memcpy(&ipcDesc->cuDesc.data, &handle, sizeof(handle)); } else { CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0)); } #else return ncclInternalError; #endif } else { // Allocate a CUDA buffer and generate an IPC handle for it NCCLCHECK(ncclCudaCalloc((char **)ptr, size)); cudaError_t res = cudaIpcGetMemHandle(&ipcDesc->devIpc, *ptr); if (res != cudaSuccess) { WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res)); ncclCudaFree(*ptr); CUDACHECK(res); } } INFO(NCCL_P2P|NCCL_ALLOC, "Allocated shareable buffer %p size %zu ipcDesc %p", *ptr, size, ipcDesc); return ncclSuccess; } ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) { return ncclSuccess; } ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) { if (ncclCuMemEnable()) { #if CUDART_VERSION >= 11030 // cuMem API support CUdeviceptr dptr = 0; CUmemAllocationHandleType type = ncclCuMemHandleType; CUmemGenericAllocationHandle handle; ncclCuDesc *cuDesc = &ipcDesc->cuDesc; // Import and map the remote memory descriptor to the local GPU if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // UDS fd support int fd = -1; // Send cuMem handle to remote for conversion to an fd NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpPeer, &cuDesc->data, &fd)); INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, tpPeer); CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type)); (void) close(fd); } else { CUCHECK(cuMemImportFromShareableHandle(&handle, cuDesc, type)); } CUCHECK(cuMemAddressReserve(&dptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0)); 
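// The VA range reserved above is then backed by the imported allocation (cuMemMap below) and made readable/writable by the local GPU (cuMemSetAccess).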
CUCHECK(cuMemMap(dptr, size, /* offset */ 0, handle, /* flags */ 0)); TRACE(NCCL_P2P, "Imported shareable buffer size %zu handle 0x%llx dptr %p", size, handle, (void*)dptr); // Allow access by the local GPU CUmemAccessDesc accessDesc = {}; accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = comm->cudaDev; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1)); TRACE(NCCL_P2P, "Set Access for %p size %zu on dev %d", (void*)dptr, size, accessDesc.location.id); *devMemPtr = (void *)dptr; #else return ncclInternalError; #endif } else { // Legacy CUDA IPC CUDACHECK(cudaIpcOpenMemHandle(devMemPtr, ipcDesc->devIpc, cudaIpcMemLazyEnablePeerAccess)); } INFO(NCCL_P2P, "Imported shareable buffer device %d size %zu ptr %p", comm->cudaDev, size, *devMemPtr); return ncclSuccess; } // Setting this to non zero causes P2P to use Reads rather than Writes NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2); NCCL_PARAM(P2pDirectDisable, "P2P_DIRECT_DISABLE", 0); #define P2P_SAME_PID(MYINFO, PEERINFO) ((MYINFO->hostHash == PEERINFO->hostHash) && (MYINFO->pidHash == PEERINFO->pidHash)) static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) { int p2p; // Queries the topology to see if the GPUs are Ampere and // connected via NVLink, if so we enable P2P Read by default NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, read, intermediateRank)); int readEnable = ncclParamP2pReadEnable(); if (readEnable != -2) *read = readEnable; return ncclSuccess; } static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) { if (P2P_SAME_PID(myInfo, peerInfo)) { if (peerInfo->cudaDev != myInfo->cudaDev) { // Same PID different GPUs, enable P2P access // Legacy CUDA IPC cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0); if (err == cudaErrorPeerAccessAlreadyEnabled) { cudaGetLastError(); } else if (err != cudaSuccess) { WARN("failed to peer with device %d(=%lx): %d %s", peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err)); return ncclInternalError; } #if CUDART_VERSION >= 11030 // cuMem API support if (ncclCuMemEnable()) { // Allow direct access to the remote buffer from the local GPU CUmemAccessDesc accessDesc = {}; accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = myInfo->cudaDev; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; INFO(NCCL_P2P, "Set Access for buffer %p size %zu on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev); CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1)); } #endif } *devMem = p2pBuff->directPtr; *ipcPtr = NULL; } else { // Different PID NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem)); *ipcPtr = *devMem; } return ncclSuccess; } /* Send: Create and return connect structures for this peer to connect to me */ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct p2pResources* resources; int tpProxyRank; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; int useRead, 
intermediateRank; NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); if (useMemcpy) useRead = 0; static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; info->read = useRead; // For CollNet, use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0) if (graph && connIndex == 1) info->read = 0; const char* useReadStr = info->read ? "/read" : ""; int sendSize = sizeof(struct ncclSendMem); // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure if (info->read) sendSize += comm->buffSizes[NCCL_PROTO_SIMPLE]; ALIGN_SIZE(sendSize, CUDA_IPC_MIN); if (intermediateRank == -1) { info->rank = myInfo->rank; if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) { resources->type = P2P_DIRECT; send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr); } else { // cuMem API support if (ncclCuMemEnable()) { resources->type = P2P_CUMEM; const char *MNNVL = comm->MNNVL ? "MNNVL" : "CUMEM"; INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, MNNVL, useReadStr, useMemcpy ? "/CE" : "");; } else { // Legacy CUDA IPC resources->type = P2P_IPC; INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/IPC%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr, useMemcpy ? "/CE" : ""); } send->conn.flags |= info->read ? 
NCCL_IPC_READ : NCCL_IPC_WRITE; } } else { resources->type = P2P_INTERMEDIATE; info->rank = intermediateRank; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/indirect/%d[%d]%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, intermediateRank, comm->peerInfo[intermediateRank].nvmlDev, useReadStr); } tpProxyRank = comm->topParentRanks[info->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &send->proxyConn)); if (useMemcpy) { NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pShmProxyInfo))); info->shmSize = resources->proxyInfo.shmSize; memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName)); } else { NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); NCCLCHECK(p2pMap(comm, &send->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc)); } return ncclSuccess; } /* Create and return connect structures for this peer to connect to me */ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) { struct p2pResources* resources; int tpProxyRank; NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; int useRead, intermediateRank; NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; info->read = useRead; // For CollNet, use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0) if (graph && connIndex == 1) info->read = 0; int recvSize = sizeof(struct ncclRecvMem); // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info->read && p == NCCL_PROTO_SIMPLE)) recvSize += comm->buffSizes[p]; ALIGN_SIZE(recvSize, CUDA_IPC_MIN); if (intermediateRank == -1) { info->rank = myInfo->rank; if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) { resources->type = P2P_DIRECT; recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; } else { if (ncclCuMemEnable()) { // cuMem API support resources->type = P2P_CUMEM; TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/CUMEM", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); } else { // Legacy CUDA IPC resources->type = P2P_IPC; } recv->conn.flags |= info->read ?
NCCL_IPC_READ : NCCL_IPC_WRITE; } } else { resources->type = P2P_INTERMEDIATE; info->rank = intermediateRank; } tpProxyRank = comm->topParentRanks[info->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, tpProxyRank, &recv->proxyConn)); NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); NCCLCHECK(p2pMap(comm, &recv->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc)); return ncclSuccess; } /* Connect/Send to this peer */ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { struct p2pResources* resources = (struct p2pResources*)send->transportResources; struct ncclRecvMem* remDevMem = NULL; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; NCCLCHECK(p2pMap(comm, &send->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc)); char* buff = (char*)(remDevMem+1); for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { if (info->read && p == NCCL_PROTO_SIMPLE) { /* For P2P Read the SIMPLE buffer is local (ncclSendMem) */ if (resources->sendDevMem == NULL) return ncclInternalError; // We should not use read + memcpy send->conn.buffs[p] = (char*)(resources->sendDevMem+1); } else { send->conn.buffs[p] = buff; buff += comm->buffSizes[p]; } } send->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; if (useMemcpy) { send->conn.tail = &resources->proxyInfo.ceRecvMem->tail; send->conn.connFifo = resources->proxyInfo.ceRecvMem->connFifo; send->conn.head = &resources->proxyInfo.devShm->sendMem.head; // Send SIMPLE buff to proxy, and replace it by local buffer NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0)); send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff; } else { send->conn.tail = &remDevMem->tail; send->conn.head = &resources->sendDevMem->head; send->conn.ptrExchange = &resources->sendDevMem->ptrExchange; send->conn.redOpArgExchange = resources->sendDevMem->redOpArgExchange; } // We must assign the proxyConn's proxyProgress property for proper checking at enqueue-time send->proxyConn.proxyProgress = p2pTransport.send.proxyProgress; return ncclSuccess; } /* Connect/Recv from this peer */ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { struct p2pResources* resources = (struct p2pResources*)recv->transportResources; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; struct ncclSendMem* remDevMem = NULL; if (useMemcpy) { char shmPath[PATH_MAX]; sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName); TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); resources->shmSize = info->shmSize; // Attach to peer's SHM segment NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle)); recv->conn.tail = &resources->devShm->recvMem.tail; recv->conn.head = &resources->devShm->sendMem.head; } else { NCCLCHECK(p2pMap(comm, &recv->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc)); struct ncclRecvMem* devMem = resources->recvDevMem; recv->conn.tail = &devMem->tail; recv->conn.head = &remDevMem->head; recv->conn.ptrExchange = &remDevMem->ptrExchange;
recv->conn.redOpArgExchange = remDevMem->redOpArgExchange; } recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; char* buff = (char*)(resources->recvDevMem+1); for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { if (info->read && p == NCCL_PROTO_SIMPLE) { if (remDevMem == NULL) return ncclInternalError; // We should not use read + memcpy /* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */ recv->conn.buffs[p] = (char*)(remDevMem+1); } else { recv->conn.buffs[p] = buff; buff += comm->buffSizes[p]; } } return ncclSuccess; } ncclResult_t p2pSendFree(struct ncclConnector* send) { struct p2pResources* resources = (struct p2pResources*)send->transportResources; if (resources) { if (ncclCuMemEnable()) { // cuMem API support if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc)); if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc)); } else { if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); } free(resources); } return ncclSuccess; } ncclResult_t p2pRecvFree(struct ncclConnector* recv) { struct p2pResources* resources = (struct p2pResources*)recv->transportResources; if (resources) { if (ncclCuMemEnable()) { // cuMem API support if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc)); if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc)); } else { if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); if (useMemcpy) { NCCLCHECK(ncclShmClose(resources->handle)); } } free(resources); } return ncclSuccess; } static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (useMemcpy) { // CE memcpy support struct p2pShmProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); connection->transportResources = proxyInfo; NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE])); char shmPath[PATH_MAX]; shmPath[0] = '\0'; proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem); // Create a SHM segment for the peer to attach to NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle)); TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize); memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName)); NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError; memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo)); } else { if (reqSize != sizeof(int)) return ncclInternalError; int size = *((int*)reqBuff); if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError; struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff; NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr)); p2pBuff->size = size; if (ncclCuMemEnable()) { // cuMem API support struct p2pCuMemProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); memcpy(&proxyInfo->p2pBuff, p2pBuff, sizeof(*p2pBuff)); connection->transportResources = proxyInfo; } else { connection->transportResources = p2pBuff->directPtr; } } *done = 1; return ncclSuccess; } static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState*
proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(int)) return ncclInternalError; int size = *((int*)reqBuff); if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError; struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff; NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr)); p2pBuff->size = size; if (ncclCuMemEnable()) { // cuMem API support struct p2pCuMemProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); memcpy(&proxyInfo->p2pBuff, p2pBuff, sizeof(*p2pBuff)); connection->transportResources = proxyInfo; } else { connection->transportResources = p2pBuff->directPtr; } *done = 1; return ncclSuccess; } static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources; if (reqSize != sizeof(void*)) return ncclInternalError; proxyInfo->recvFifo = *((char**)reqBuff); CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking)); for (int i=0; i<NCCL_STEPS; i++) { CUDACHECK(cudaEventCreate(proxyInfo->events+i)); } connection->proxyAppendPtr = &connection->proxyAppend; return ncclSuccess; } static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { // CE memcpy support if (useMemcpy) { struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources; if (proxyInfo) { NCCLCHECK(ncclShmClose(proxyInfo->handle)); NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem)); NCCLCHECK(ncclCudaFree(proxyInfo->ceDevBuff)); CUDACHECK(cudaStreamDestroy(proxyInfo->stream)); for (int i=0; i<NCCL_STEPS; i++) { CUDACHECK(cudaEventDestroy(proxyInfo->events[i])); } free(proxyInfo); } } else { if (ncclCuMemEnable()) { // cuMem API support struct p2pCuMemProxyInfo *proxyInfo = (struct p2pCuMemProxyInfo *) connection->transportResources; if (proxyInfo) { struct ncclP2pBuff *p2pBuff = &proxyInfo->p2pBuff; ncclP2pFreeShareableBuffer(&p2pBuff->ipcDesc); ncclCudaFree(p2pBuff->directPtr); free(proxyInfo); } } else { // Do not check return code as CUDA may have already shut down ncclCudaFree(connection->transportResources); } } return ncclSuccess; } static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { if (ncclCuMemEnable()) { struct p2pCuMemProxyInfo *proxyInfo = (struct p2pCuMemProxyInfo *) connection->transportResources; if (proxyInfo) { struct ncclP2pBuff *p2pBuff = &proxyInfo->p2pBuff; ncclP2pFreeShareableBuffer(&p2pBuff->ipcDesc); ncclCudaFree(p2pBuff->directPtr); free(proxyInfo); } } else { // Do not check return code as CUDA may have already shut down ncclCudaFree(connection->transportResources); } return ncclSuccess; } // CE memcpy support static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; s<args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct p2pShmProxyInfo* resources = (struct p2pShmProxyInfo*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->transmitted = sub->done = 0; } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; int stepSize = proxyState->buffSizes[p] / NCCL_STEPS; for (int s=0; s<args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct
p2pShmProxyInfo* resources = (struct p2pShmProxyInfo*) (sub->connection->transportResources); if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy resources->step = sub->base + sub->nsteps; args->done++; continue; } if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) { int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; volatile struct ncclConnFifo* connFifo = resources->ceRecvMem->connFifo; volatile uint64_t* recvTail = &resources->ceRecvMem->tail; // Check GPU has sent everything if ((*recvTail > sub->base+sub->transmitted)) { int size = connFifo[buffSlot].size; CUDACHECK(cudaMemcpyAsync(resources->recvFifo+buffSlot*stepSize, resources->ceDevBuff+buffSlot*stepSize, size, cudaMemcpyDeviceToDevice, resources->stream)); CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream)); sub->transmitted += args->sliceSteps; } } if (sub->done < sub->transmitted) { int buffSlot = (sub->base+sub->done)%NCCL_STEPS; cudaError_t res = cudaEventQuery(resources->events[buffSlot]); if (res != cudaErrorNotReady) CUDACHECK(res); if (res == cudaSuccess) { sub->done += args->sliceSteps; // Notify SHM resources->shm->recvMem.tail = sub->base + sub->done; } if (sub->done == sub->nsteps) { resources->step = sub->base + sub->nsteps; args->done++; } } } if (args->done == args->nsubs) { args->state = ncclProxyOpNone; } } return ncclSuccess; } struct ncclTransport p2pTransport = { "P2P", p2pCanConnect, { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL, NULL }, { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL, NULL } }; static void initCeOperation() { static int init = 0; if (!init) { useMemcpy = ncclParamP2pUseCudaMemcpy(); if (useMemcpy) { p2pTransport.send.proxyConnect = p2pSendProxyConnect; p2pTransport.send.proxyProgress = p2pSendProxyProgress; } init = 1; } } nccl-2.22.3-1/src/transport/shm.cc000066400000000000000000000450331463451655400166200ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "shm.h" #include "transport.h" struct shmConnectInfo { char shmName[7]; int shmSize; }; static_assert(sizeof(shmConnectInfo) <= CONNECT_SIZE, "SHM Connect info is too large"); struct shmSendResources { int remShmSize; struct ncclRecvMem* remHostMem; struct ncclRecvMem* devRemHostMem; ncclShmHandle_t remHandle; int shmSize; struct ncclSendMem* hostMem; struct ncclSendMem* devHostMem; ncclShmHandle_t hostHandle; }; struct shmRecvResources { int remShmSize; struct ncclSendMem* remHostMem; struct ncclSendMem* devRemHostMem; ncclShmHandle_t remHandle; int shmSize; struct ncclRecvMem* hostMem; struct ncclRecvMem* devHostMem; ncclShmHandle_t hostHandle; }; #define SHM_SEND_SIDE 1 #define SHM_RECV_SIDE 2 NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0); NCCL_PARAM(ShmUseCudaMemcpy, "SHM_USE_CUDA_MEMCPY", 0); NCCL_PARAM(ShmMemcpyMode, "SHM_MEMCPY_MODE", SHM_SEND_SIDE); // 1 is sender-side, 2 is receiver-side, 3 is both static int useMemcpySend = 0; static int useMemcpyRecv = 0; NCCL_PARAM(ShmLocality, "SHM_LOCALITY", SHM_RECV_SIDE); // 1 is sender-size, 2 is receiver-size static int shmLocality = 0; static void initCeOperation(); /* Determine two peers can communicate with SHM */ static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 0; initCeOperation(); if (ncclParamShmDisable() == 1) return ncclSuccess; int useNet = 0; NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet)); if (useNet) return ncclSuccess; // Same host? TRACE(NCCL_INIT|NCCL_SHM, "peer1 hostHash %lx peer2 hostHash %lx", info1->hostHash, info2->hostHash); if (info1->hostHash != info2->hostHash) return ncclSuccess; // Common /dev/shm (between containers) ? 
TRACE(NCCL_INIT|NCCL_SHM, "peer1 shmDev %lx peer2 shmDev %lx", info1->shmDev, info2->shmDev); if (info1->shmDev != info2->shmDev) return ncclSuccess; *ret = 1; return ncclSuccess; } #define MAX_SHM_NAME_LEN 1024 /* Create and return connect structures for this peer to connect to me */ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct shmSendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big"); struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; char shmPath[PATH_MAX]; shmPath[0] = '\0'; int shmSize = sizeof(struct ncclSendMem); if (shmLocality == SHM_SEND_SIDE) { for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p]; } info->shmSize = resources->shmSize = shmSize; NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle)); TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize); memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName)); INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%d] -> %d[%d] via SHM/%s/%s", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useMemcpySend?"CE":"direct", useMemcpyRecv?"CE":"direct"); return ncclSuccess; } static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct shmRecvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big"); struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; char shmPath[PATH_MAX]; shmPath[0] = '\0'; int shmSize = sizeof(struct ncclRecvMem); if (shmLocality == SHM_RECV_SIDE) { for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p]; } info->shmSize = resources->shmSize = shmSize; NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle)); TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize); memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName)); return ncclSuccess; } struct shmProxyInfo { struct ncclRecvMem* ceRecvMem; char* devFifo; char* shmFifo; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; // used by progress only uint64_t step; cudaStream_t stream; cudaEvent_t events[NCCL_STEPS]; }; /* Connect to this peer */ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { // Setup device pointers struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; struct shmSendResources* resources = (struct shmSendResources*)send->transportResources; char shmPath[PATH_MAX]; sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName); resources->remShmSize = info->shmSize; TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle)); char* buff = shmLocality == SHM_SEND_SIDE ?
(char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1); for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { send->conn.buffs[p] = buff; buff += comm->buffSizes[p]; } send->conn.tail = &resources->devRemHostMem->tail; send->conn.head = &resources->devHostMem->head; send->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; if (useMemcpyRecv) { send->conn.connFifo = resources->devRemHostMem->connFifo; } if (useMemcpySend) { int tpProxyRank; tpProxyRank = comm->topParentRanks[comm->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, tpProxyRank, &send->proxyConn)); struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem }; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; send->conn.tail = &proxyInfo.ceRecvMem->tail; send->conn.connFifo = proxyInfo.ceRecvMem->connFifo; } // We must assign the proxyConn's proxyProgress property for proper checking at enqueue-time send->proxyConn.proxyProgress = shmTransport.send.proxyProgress; return ncclSuccess; } static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { // Setup device pointers struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; char shmPath[PATH_MAX]; sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName); resources->remShmSize = info->shmSize; TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle)); char* buff = shmLocality == SHM_RECV_SIDE ?
(char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1); for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { recv->conn.buffs[p] = buff; buff += comm->buffSizes[p]; } recv->conn.head = &resources->devRemHostMem->head; recv->conn.tail = &resources->devHostMem->tail; recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; if (useMemcpyRecv) { NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn)); struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem }; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo))); recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; recv->conn.tail = &proxyInfo.ceRecvMem->tail; } // We must assign the proxyConn's proxyProgress property for proper checking at enqueue-time recv->proxyConn.proxyProgress = shmTransport.recv.proxyProgress; return ncclSuccess; } static ncclResult_t shmSendFree(struct ncclConnector* send) { struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources; if (resources) { NCCLCHECK(ncclShmClose(resources->hostHandle)); NCCLCHECK(ncclShmClose(resources->remHandle)); free(resources); send->transportResources = NULL; } return ncclSuccess; } static ncclResult_t shmRecvFree(struct ncclConnector* recv) { struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; if (resources) { NCCLCHECK(ncclShmClose(resources->hostHandle)); NCCLCHECK(ncclShmClose(resources->remHandle)); free(resources); recv->transportResources = NULL; } return ncclSuccess; } static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct shmProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError; memcpy(proxyInfo, reqBuff, reqSize); NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE])); NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking)); for (int i=0; i<NCCL_STEPS; i++) { CUDACHECK(cudaEventCreate(proxyInfo->events+i)); } connection->proxyAppendPtr = &connection->proxyAppend; connection->transportResources = proxyInfo; if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError; memcpy(respBuff, proxyInfo, respSize); return ncclSuccess; } static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct shmProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError; memcpy(proxyInfo, reqBuff, reqSize); NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE])); NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking)); for (int i=0; i<NCCL_STEPS; i++) { CUDACHECK(cudaEventCreate(proxyInfo->events+i)); } connection->proxyAppendPtr = &connection->proxyAppend; connection->transportResources = proxyInfo; if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError; memcpy(respBuff, proxyInfo, respSize); return ncclSuccess; } static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct shmProxyInfo* resources = (struct
shmProxyInfo*)connection->transportResources; if (resources) { CUDACHECK(cudaStreamDestroy(resources->stream)); NCCLCHECK(ncclCudaFree(resources->devFifo)); NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); for (int i=0; i<NCCL_STEPS; i++) { CUDACHECK(cudaEventDestroy(resources->events[i])); } free(connection->transportResources); connection->transportResources = NULL; } return ncclSuccess; } static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources; if (resources) { CUDACHECK(cudaStreamDestroy(resources->stream)); NCCLCHECK(ncclCudaFree(resources->devFifo)); NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem)); for (int i=0; i<NCCL_STEPS; i++) { CUDACHECK(cudaEventDestroy(resources->events[i])); } free(connection->transportResources); connection->transportResources = NULL; } return ncclSuccess; } static ncclResult_t shmSendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; s<args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->transmitted = sub->done = 0; } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; int stepSize = proxyState->buffSizes[p] / NCCL_STEPS; for (int s=0; s<args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources); if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy resources->step = sub->base + sub->nsteps; args->done++; continue; } if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) { int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; volatile struct ncclConnFifo* connFifo = resources->ceRecvMem->connFifo; volatile uint64_t* recvTail = &resources->ceRecvMem->tail; // Check GPU has sent everything if ((*recvTail > sub->base+sub->transmitted)) { int size = connFifo[buffSlot].size; CUDACHECK(cudaMemcpyAsync(resources->shmFifo+buffSlot*stepSize, resources->devFifo+buffSlot*stepSize, size, cudaMemcpyDeviceToHost, resources->stream)); CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream)); resources->recvMem->connFifo[buffSlot].size = size; __sync_synchronize(); // make sure connFifo[].size is visible sub->transmitted += args->sliceSteps; } } if (sub->done < sub->transmitted) { int buffSlot = (sub->base+sub->done)%NCCL_STEPS; cudaError_t res = cudaEventQuery(resources->events[buffSlot]); if (res != cudaErrorNotReady) CUDACHECK(res); if (res == cudaSuccess) { sub->done += args->sliceSteps; // Notify SHM resources->recvMem->tail = sub->base + sub->done; } if (sub->done == sub->nsteps) { resources->step = sub->base + sub->nsteps; args->done++; } } } if (args->done == args->nsubs) { args->state = ncclProxyOpNone; } } return ncclSuccess; } static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; s<args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->transmitted = sub->done = 0; } args->state = ncclProxyOpProgress; } args->idle = 1;
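// Receive-side progress mirrors the send side: each ready slot is copied from the host SHM FIFO into the device FIFO with cudaMemcpyAsync, and the GPU-visible tail is advanced once the copy's event has completed.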
if (args->state == ncclProxyOpProgress) { int p = args->protocol; int stepSize = proxyState->buffSizes[p] / NCCL_STEPS; for (int s=0; s<args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources); if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy resources->step = sub->base + sub->nsteps; args->done++; continue; } if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) { int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; volatile struct ncclConnFifo* connFifo = resources->recvMem->connFifo; volatile uint64_t* recvTail = &resources->recvMem->tail; // Check data is ready in SHM if ((*recvTail > sub->base+sub->transmitted)) { int size = connFifo[buffSlot].size; CUDACHECK(cudaMemcpyAsync(resources->devFifo+buffSlot*stepSize, resources->shmFifo+buffSlot*stepSize, size, cudaMemcpyHostToDevice, resources->stream)); CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream)); sub->transmitted += args->sliceSteps; } } if (sub->done < sub->transmitted) { int buffSlot = (sub->base+sub->done)%NCCL_STEPS; cudaError_t res = cudaEventQuery(resources->events[buffSlot]); if (res != cudaErrorNotReady) CUDACHECK(res); if (res == cudaSuccess) { sub->done += args->sliceSteps; // Notify GPU resources->ceRecvMem->tail = sub->base + sub->done; } if (sub->done == sub->nsteps) { resources->step = sub->base + sub->nsteps; args->done++; } } } if (args->done == args->nsubs) { args->state = ncclProxyOpNone; } } return ncclSuccess; } struct ncclTransport shmTransport = { "SHM", shmCanConnect, { shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL, NULL }, { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL, NULL } }; static void initCeOperation() { static int init = 0; if (!init) { useMemcpySend = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 1); useMemcpyRecv = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 2); if (useMemcpySend) { shmTransport.send.proxyConnect = shmSendProxyConnect; shmTransport.send.proxyFree = shmSendProxyFree; shmTransport.send.proxyProgress = shmSendProxyProgress; } if (useMemcpyRecv) { shmTransport.recv.proxyConnect = shmRecvProxyConnect; shmTransport.recv.proxyFree = shmRecvProxyFree; shmTransport.recv.proxyProgress = shmRecvProxyProgress; } shmLocality = ncclParamShmLocality(); if (shmLocality != SHM_SEND_SIDE && shmLocality != SHM_RECV_SIDE) { WARN("Ignoring SHM locality, must be 1 (sender side) or 2 (receiver side, default)"); shmLocality = SHM_RECV_SIDE; } init = 1; } }
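The standalone sketch below is not part of the NCCL sources above; it only illustrates how the SHM copy-engine knobs read by initCeOperation() combine. It assumes the usual convention that NCCL_PARAM(Name, "SUFFIX", default) is read from an environment variable named NCCL_<SUFFIX>, and that the memcpy mode is a bitmask (bit 0 enables the sender-side CE copy, bit 1 the receiver-side copy).

/*
 * Illustrative sketch only: decodes the SHM copy-engine knobs the same way
 * initCeOperation() above does. Environment variable names assume the NCCL_
 * prefix added by NCCL_PARAM(); defaults mirror the NCCL_PARAM declarations.
 */
#include <cstdio>
#include <cstdlib>

static int envAsInt(const char* name, int defaultValue) {
  // Read an integer-valued environment variable, falling back to a default.
  const char* v = getenv(name);
  return v ? atoi(v) : defaultValue;
}

int main() {
  int useCudaMemcpy = envAsInt("NCCL_SHM_USE_CUDA_MEMCPY", 0); // default 0
  int memcpyMode    = envAsInt("NCCL_SHM_MEMCPY_MODE", 1);     // default 1 (sender side)
  int locality      = envAsInt("NCCL_SHM_LOCALITY", 2);        // default 2 (receiver side)

  // Same bitmask decoding as initCeOperation(): bit 0 = sender-side CE copy,
  // bit 1 = receiver-side CE copy (so a value of 3 enables both).
  bool useMemcpySend = useCudaMemcpy && (memcpyMode & 1);
  bool useMemcpyRecv = useCudaMemcpy && (memcpyMode & 2);

  // Invalid locality values fall back to the receiver side, as in the code above.
  if (locality != 1 && locality != 2) locality = 2;

  printf("CE copy on send side: %d, on recv side: %d, SHM buffers on %s side\n",
         (int)useMemcpySend, (int)useMemcpyRecv, locality == 1 ? "sender" : "receiver");
  return 0;
}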