/* rocprofiler-sdk/rccl/details/rccl.h — Rocprofiler SDK Developer API 0.5.0 (ROCm Profiling API and tools) */
1/*************************************************************************
2 * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
3 * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
4 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
5 *
6 * See LICENSE.txt for license information
7 ************************************************************************/
8
9#ifndef NCCL_H_
10#define NCCL_H_
11
12#include <hip/hip_fp16.h>
13#include <hip/hip_runtime.h>
14
15#define NCCL_MAJOR 2
16#define NCCL_MINOR 20
17#define NCCL_PATCH 5
18#define NCCL_SUFFIX ""
19
20#define NCCL_VERSION_CODE 22005
21#define NCCL_VERSION(X, Y, Z) \
22 (((X) <= 2 && (Y) <= 8) ? (X) *1000 + (Y) *100 + (Z) : (X) *10000 + (Y) *100 + (Z))
23
24#define RCCL_BFLOAT16 1
25#define RCCL_FLOAT8 1
26#define RCCL_GATHER_SCATTER 1
27#define RCCL_ALLTOALLV 1
28
29#ifdef __cplusplus
30extern "C" {
31#endif
32
33#include <limits.h>
34
/*! @brief Opaque handle to communicator
    @details A communicator contains information required to facilitate collective communications
    calls */
typedef const struct ncclComm* ncclComm_t;
#define NCCL_COMM_NULL NULL

#define NCCL_UNIQUE_ID_BYTES 128
/*! @brief Opaque unique id used to initialize communicators
    @details The ncclUniqueId must be passed to all participating ranks */
typedef struct
{
    char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array */
} ncclUniqueId;
/*! @defgroup rccl_result_code Result Codes
    @details The various result codes that RCCL API calls may return
    @{ */

/*! @brief Result type
    @details Return codes aside from ncclSuccess indicate that a call has failed */
typedef enum
{
    ncclSuccess            = 0, /*!< No error */
    ncclUnhandledCudaError = 1, /*!< Unhandled HIP error */
    ncclSystemError        = 2, /*!< Unhandled system error */
    ncclInternalError      = 3, /*!< Internal Error - Please report to RCCL developers */
    ncclInvalidArgument    = 4, /*!< Invalid argument */
    ncclInvalidUsage       = 5, /*!< Invalid usage */
    ncclRemoteError        = 6, /*!< Remote process exited or there was a network error */
    ncclInProgress         = 7, /*!< RCCL operation in progress */
    ncclNumResults         = 8  /*!< Number of result types */
} ncclResult_t;
/*! @} */
/* Sentinel values marking a ncclConfig_t field as "not set by the user" */
#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
/* Color passed to ncclCommSplit when the rank should not join any new communicator */
#define NCCL_SPLIT_NOCOLOR -1
/*! @defgroup rccl_config_type Communicator Configuration
    @details Structure that allows for customizing Communicator behavior via
    ncclCommInitRankConfig
    @{ */

/*! @brief Communicator configuration
    @details Users can assign value to attributes to specify the behavior of a communicator */
typedef struct ncclConfig_v21700
{
    /* attributes that users should never touch. */
    size_t       size;    /*!< Should not be touched */
    unsigned int magic;   /*!< Should not be touched */
    unsigned int version; /*!< Should not be touched */
    /* attributes that users are able to customize. */
    int         blocking;       /*!< Whether or not calls should block or not */
    int         cgaClusterSize; /*!< Cooperative group array cluster size */
    int         minCTAs;        /*!< Minimum number of cooperative thread arrays (blocks) */
    int         maxCTAs;        /*!< Maximum number of cooperative thread arrays (blocks) */
    const char* netName;        /*!< Force NCCL to use a specific network */
    int         splitShare;     /*!< Allow communicators to share resources */
} ncclConfig_t;
/* Config initializer must be assigned to initialize config structure when it is created.
 * Not initialized config will result in an error. */
#define NCCL_CONFIG_INITIALIZER \
    { \
        sizeof(ncclConfig_t), /* size */ \
        0xcafebeef, /* magic */ \
        NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
        NCCL_CONFIG_UNDEF_INT, /* blocking */ \
        NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
        NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
        NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
        NCCL_CONFIG_UNDEF_PTR, /* netName */ \
        NCCL_CONFIG_UNDEF_INT /* splitShare */ \
    }
/*! @} */
110
111/* NCCL malloc and free function for all types of NCCL optimizations
112 * (e.g. user buffer registration). The actual allocated size might
113 * be larger than requested due to granularity requirement. */
115ncclMemAlloc(void** ptr, size_t size);
117pncclMemAlloc(void** ptr, size_t size);
118
120ncclMemFree(void* ptr);
122pncclMemFree(void* ptr);
123
124/*! @defgroup rccl_api_version Version Information
125 @details API call that returns RCCL version
126 @{ */
127
128/*! @brief Return the RCCL_VERSION_CODE of RCCL in the supplied integer.
129 @details This integer is coded with the MAJOR, MINOR and PATCH level of RCCL.
130 @return Result code. See @ref rccl_result_code for more details.
131
132 @param[out] version Pointer to where version will be stored */
133
135ncclGetVersion(int* version);
136/*! @cond include_hidden */
138pncclGetVersion(int* version);
139/*! @endcond */
140/*! @} */
141
142/*! @defgroup rccl_api_communicator Communicator Initialization/Destruction
143 @details API calls that operate on communicators.
144 Communicators objects are used to launch collective communication
145 operations. Unique ranks between 0 and N-1 must be assigned to
146 each HIP device participating in the same Communicator.
147 Using the same HIP device for multiple ranks of the same Communicator
148 is not supported at this time.
149 @{ */
150
151/*! @brief Generates an ID for ncclCommInitRank.
152 @details Generates an ID to be used in ncclCommInitRank.
153 ncclGetUniqueId should be called once by a single rank and the
154 ID should be distributed to all ranks in the communicator before
155 using it as a parameter for ncclCommInitRank.
156 @return Result code. See @ref rccl_result_code for more details.
157
158 @param[out] uniqueId Pointer to where uniqueId will be stored */
161/*! @cond include_hidden */
163pncclGetUniqueId(ncclUniqueId* uniqueId);
164/*! @endcond */
165
166/*! @brief Create a new communicator with config.
167 @details Create a new communicator (multi thread/process version) with a configuration
168 set by users. See @ref rccl_config_type for more details.
169 Each rank is associated to a CUDA device, which has to be set before calling
170 ncclCommInitRank.
171 @return Result code. See @ref rccl_result_code for more details.
172
173 @param[out] comm Pointer to created communicator
174 @param[in] nranks Total number of ranks participating in this communicator
175 @param[in] commId UniqueId required for initialization
176 @param[in] rank Current rank to create communicator for. [0 to nranks-1]
177 @param[in] config Pointer to communicator configuration */
180 int nranks,
181 ncclUniqueId commId,
182 int rank,
183 ncclConfig_t* config);
184/*! @cond include_hidden */
186pncclCommInitRankConfig(ncclComm_t* comm,
187 int nranks,
188 ncclUniqueId commId,
189 int rank,
190 ncclConfig_t* config);
191/*! @endcond */
192
193/*! @brief Creates a new communicator (multi thread/process version).
194 @details Rank must be between 0 and nranks-1 and unique within a communicator clique.
195 Each rank is associated to a CUDA device, which has to be set before calling
196 ncclCommInitRank. ncclCommInitRank implicitly syncronizes with other ranks,
197 so it must be called by different threads/processes or use
198 ncclGroupStart/ncclGroupEnd.
199 @return Result code. See @ref rccl_result_code for more details.
200
201 @param[out] comm Pointer to created communicator
202 @param[in] nranks Total number of ranks participating in this communicator
203 @param[in] commId UniqueId required for initialization
204 @param[in] rank Current rank to create communicator for */
206ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
207/*! @cond include_hidden */
209pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
210/*! @endcond */
211
212/*! @brief Creates a clique of communicators (single process version).
213 @details This is a convenience function to create a single-process communicator clique.
214 Returns an array of ndev newly initialized communicators in comm.
215 comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
216 If devlist is NULL, the first ndev HIP devices are used.
217 Order of devlist defines user-order of processors within the communicator.
218 @return Result code. See @ref rccl_result_code for more details.
219
220 @param[out] comm Pointer to array of created communicators
221 @param[in] ndev Total number of ranks participating in this communicator
222 @param[in] devlist Array of GPU device indices to create for */
224ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
225/*! @cond include_hidden */
227pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
228/*! @endcond */
229
230/*! @brief Finalize a communicator.
231 @details ncclCommFinalize flushes all issued communications
232 and marks communicator state as ncclInProgress. The state will change to ncclSuccess
233 when the communicator is globally quiescent and related resources are freed; then,
234 calling ncclCommDestroy can locally free the rest of the resources (e.g.
235 communicator itself) without blocking.
236 @return Result code. See @ref rccl_result_code for more details.
237
238 @param[in] comm Communicator to finalize */
241/*! @cond include_hidden */
243pncclCommFinalize(ncclComm_t comm);
244/*! @endcond */
245
246/*! @brief Frees local resources associated with communicator object.
247 @details Destroy all local resources associated with the passed in communicator object
248 @return Result code. See @ref rccl_result_code for more details.
249
250 @param[in] comm Communicator to destroy */
253/*! @cond include_hidden */
255pncclCommDestroy(ncclComm_t comm);
256/*! @endcond */
257
258/*! @brief Abort any in-progress calls and destroy the communicator object.
259 @details Frees resources associated with communicator object and aborts any operations
260 that might still be running on the device.
261 @return Result code. See @ref rccl_result_code for more details.
262
263 @param[in] comm Communicator to abort and destroy */
266/*! @cond include_hidden */
268pncclCommAbort(ncclComm_t comm);
269/*! @endcond */
270
271/*! @brief Create one or more communicators from an existing one.
272 @details Creates one or more communicators from an existing one.
273 Ranks with the same color will end up in the same communicator.
274 Within the new communicator, key will be used to order ranks.
275 NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
276 and will therefore return a NULL communicator.
277 If config is NULL, the new communicator will inherit the original communicator's
278 configuration
279 @return Result code. See @ref rccl_result_code for more details.
280
281 @param[in] comm Original communicator object for this rank
282 @param[in] color Color to assign this rank
283 @param[in] key Key used to order ranks within the same new communicator
284 @param[out] newcomm Pointer to new communicator
285 @param[in] config Config file for new communicator. May be NULL to inherit from comm */
287ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config);
288/*! @cond include_hidden */
290pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config);
291/*! @endcond */
292/*! @} */
293
294/*! @defgroup rccl_api_errcheck Error Checking Calls
295 @details API calls that check for errors
296 @{ */
297
298/*! @brief Returns a string for each result code.
299 @details Returns a human-readable string describing the given result code.
300 @return String containing description of result code.
301
302 @param[in] result Result code to get description for */
303const char*
305/*! @cond include_hidden */
306const char*
307pncclGetErrorString(ncclResult_t result);
308/*! @endcond */
309
310/* Returns a human-readable message of the last error that occurred. */
311const char*
313/*! @cond include_hidden */
314const char*
315pncclGetLastError(ncclComm_t comm);
316/*! @endcond */
317
318/*! @brief Checks whether the comm has encountered any asynchronous errors
319 @details Query whether the provided communicator has encountered any asynchronous errors
320 @return Result code. See @ref rccl_result_code for more details.
321
322 @param[in] comm Communicator to query
323 @param[out] asyncError Pointer to where result code will be stored */
326/*! @cond include_hidden */
328pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
329/*! @endcond */
330/*! @} */
331
332/*! @defgroup rccl_api_comminfo Communicator Information
333 @details API calls that query communicator information
334 @{ */
335
336/*! @brief Gets the number of ranks in the communicator clique.
337 @details Returns the number of ranks in the communicator clique (as set during
338 initialization)
339 @return Result code. See @ref rccl_result_code for more details.
340
341 @param[in] comm Communicator to query
342 @param[out] count Pointer to where number of ranks will be stored */
344ncclCommCount(const ncclComm_t comm, int* count);
345/*! @cond include_hidden */
347pncclCommCount(const ncclComm_t comm, int* count);
348/*~ @endcond */
349
350/*! @brief Get the ROCm device index associated with a communicator
351 @details Returns the ROCm device number associated with the provided communicator.
352 @return Result code. See @ref rccl_result_code for more details.
353
354 @param[in] comm Communicator to query
355 @param[out] device Pointer to where the associated ROCm device index will be stored */
357ncclCommCuDevice(const ncclComm_t comm, int* device);
358/*! @cond include_hidden */
360pncclCommCuDevice(const ncclComm_t comm, int* device);
361/*! @endcond */
362
363/*! @brief Get the rank associated with a communicator
364 @details Returns the user-ordered "rank" associated with the provided communicator.
365 @return Result code. See @ref rccl_result_code for more details.
366
367 @param[in] comm Communicator to query
368 @param[out] rank Pointer to where the associated rank will be stored */
370ncclCommUserRank(const ncclComm_t comm, int* rank);
371/*! @cond include_hidden */
373pncclCommUserRank(const ncclComm_t comm, int* rank);
374/*! @endcond */
375/*! @} */
376
377/* Register CUDA buffer for zero-copy operation */
379ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
380/*! @cond include_hidden */
382pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
383/*! @endcond */
384
385/* Deregister CUDA buffer */
387ncclCommDeregister(const ncclComm_t comm, void* handle);
388/*! @cond include_hidden */
390pncclCommDeregister(const ncclComm_t comm, void* handle);
391/*! @endcond */
392
/*! @defgroup rccl_api_enumerations API Enumerations
    @details Enumerations used by collective communication calls
    @{ */

/*! @brief Dummy reduction enumeration
    @details Dummy reduction enumeration used to determine value for ncclMaxRedOp */
typedef enum
{
    ncclNumOps_dummy = 5 /*!< Dummy op count used only for sizing ncclRedOp_dummy_t */
} ncclRedOp_dummy_t;

/*! @brief Reduction operation selector
    @details Enumeration used to specify the various reduction operations
    ncclNumOps is the number of built-in ncclRedOp_t values and serves as
    the least possible value for dynamic ncclRedOp_t values constructed by
    ncclRedOpCreate functions.

    ncclMaxRedOp is the largest valid value for ncclRedOp_t and is defined
    to be the largest signed value (since compilers are permitted to use
    signed enums) that won't grow sizeof(ncclRedOp_t) when compared to previous
    RCCL versions to maintain ABI compatibility. */
typedef enum
{
    ncclSum    = 0, /*!< Sum */
    ncclProd   = 1, /*!< Product */
    ncclMax    = 2, /*!< Max */
    ncclMin    = 3, /*!< Min */
    ncclAvg    = 4, /*!< Average */
    ncclNumOps = 5, /*!< Number of built-in reduction ops */
    ncclMaxRedOp =
        0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t)) /*!< Largest value for ncclRedOp_t */
} ncclRedOp_t;
/*! @brief Data types
    @details Enumeration of the various supported datatype
    NOTE(review): the fp8 enumerators are gated on RCCL_FLOAT8 as in upstream
    rccl.h — confirm against the shipped header for this exact release. */
typedef enum
{
    ncclInt8     = 0, /*!< 8-bit signed integer */
    ncclChar     = 0, /*!< Alias of ncclInt8 */
    ncclUint8    = 1, /*!< 8-bit unsigned integer */
    ncclInt32    = 2, /*!< 32-bit signed integer */
    ncclInt      = 2, /*!< Alias of ncclInt32 */
    ncclUint32   = 3, /*!< 32-bit unsigned integer */
    ncclInt64    = 4, /*!< 64-bit signed integer */
    ncclUint64   = 5, /*!< 64-bit unsigned integer */
    ncclFloat16  = 6, /*!< 16-bit floating point */
    ncclHalf     = 6, /*!< Alias of ncclFloat16 */
    ncclFloat32  = 7, /*!< 32-bit floating point */
    ncclFloat    = 7, /*!< Alias of ncclFloat32 */
    ncclFloat64  = 8, /*!< 64-bit floating point */
    ncclDouble   = 8, /*!< Alias of ncclFloat64 */
    ncclBfloat16 = 9, /*!< bfloat16 floating point */
#if defined(RCCL_FLOAT8)
    ncclFp8E4M3  = 10, /*!< 8-bit floating point (e4m3) */
    ncclFp8E5M2  = 11, /*!< 8-bit floating point (e5m2) */
    ncclNumTypes = 12  /*!< Number of supported datatypes */
#else
    ncclNumTypes = 10 /*!< Number of supported datatypes */
#endif
} ncclDataType_t;
/*! @} */
455
/*! @defgroup rccl_api_custom_redop Custom Reduction Operator
    @details API calls relating to creation/destroying custom reduction operator
    that pre-multiplies local source arrays prior to reduction
    @{ */

/*! @brief Location and dereferencing logic for scalar arguments.
    @details Enumeration specifying memory location of the scalar argument.
    Based on where the value is stored, the argument will be dereferenced either
    while the collective is running (if in device memory), or before the
    ncclRedOpCreate() function returns (if in host memory). */
typedef enum
{
    ncclScalarDevice        = 0, /*!< Scalar is in device-visible memory */
    ncclScalarHostImmediate = 1  /*!< Scalar is in host-visible memory */
} ncclScalarResidence_t;
472/*! @brief Create a custom pre-multiplier reduction operator
473 @details Creates a new reduction operator which pre-multiplies input values by a given
474 scalar locally before reducing them with peer values via summation. For use
475 only with collectives launched against *comm* and *datatype*. The
476 *residence* argument indicates how/when the memory pointed to by *scalar*
477 will be dereferenced. Upon return, the newly created operator's handle
478 is stored in *op*.
479 @return Result code. See @ref rccl_result_code for more details.
480
481 @param[out] op Pointer to where newly created custom reduction operator is to be
482 stored
483 @param[in] scalar Pointer to scalar value.
484 @param[in] datatype Scalar value datatype
485 @param[in] residence Memory type of the scalar value
486 @param[in] comm Communicator to associate with this custom reduction operator */
489 void* scalar,
490 ncclDataType_t datatype,
491 ncclScalarResidence_t residence,
492 ncclComm_t comm);
493/*! @cond include_hidden */
495pncclRedOpCreatePreMulSum(ncclRedOp_t* op,
496 void* scalar,
497 ncclDataType_t datatype,
498 ncclScalarResidence_t residence,
499 ncclComm_t comm);
500/*! @endcond */
501
502/*! @brief Destroy custom reduction operator
503 @details Destroys the reduction operator *op*. The operator must have been created by
504 ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
505 destroyed as soon as the last RCCL function which is given that operator returns.
506 @return Result code. See @ref rccl_result_code for more details.
507
508 @param[in] op Custom reduction operator is to be destroyed
509 @param[in] comm Communicator associated with this reduction operator */
512/*! @cond include_hidden */
514pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
515/*! @endcond */
516/*! @} */
517
518/*! @defgroup rccl_collective_api Collective Communication Operations
519 @details Collective communication operations must be called separately for each
520 communicator in a communicator clique.
521
522 They return when operations have been enqueued on the HIP stream.
523 Since they may perform inter-CPU synchronization, each call has to be done
524 from a different thread or process, or need to use Group Semantics (see
525 below).
526 @{ */
527
528/*! @brief Reduce
529 @details Reduces data arrays of length *count* in *sendbuff* into *recvbuff* using *op*
530 operation.
531 *recvbuff* may be NULL on all calls except for root device.
532 *root* is the rank (not the HIP device) where data will reside after the
533 operation is complete.
534 In-place operation will happen if sendbuff == recvbuff.
535 @return Result code. See @ref rccl_result_code for more details.
536
537 @param[in] sendbuff Local device data buffer to be reduced
538 @param[out] recvbuff Data buffer where result is stored (only for *root* rank). May be
539 null for other ranks.
540 @param[in] count Number of elements in every send buffer
541 @param[in] datatype Data buffer element datatype
542 @param[in] op Reduction operator type
543 @param[in] root Rank where result data array will be stored
544 @param[in] comm Communicator group object to execute on
545 @param[in] stream HIP stream to execute collective on */
547ncclReduce(const void* sendbuff,
548 void* recvbuff,
549 size_t count,
550 ncclDataType_t datatype,
551 ncclRedOp_t op,
552 int root,
553 ncclComm_t comm,
554 hipStream_t stream);
555/*! @cond include_hidden */
557pncclReduce(const void* sendbuff,
558 void* recvbuff,
559 size_t count,
560 ncclDataType_t datatype,
561 ncclRedOp_t op,
562 int root,
563 ncclComm_t comm,
564 hipStream_t stream);
565/*! @endcond */
566
567/*! @brief (Deprecated) Broadcast (in-place)
568 @details Copies *count* values from *root* to all other devices.
569 root is the rank (not the CUDA device) where data resides before the
570 operation is started.
571 This operation is implicitly in-place.
572 @return Result code. See @ref rccl_result_code for more details.
573
574 @param[in,out] buff Input array on *root* to be copied to other ranks. Output array for
575 all ranks.
576 @param[in] count Number of elements in data buffer
577 @param[in] datatype Data buffer element datatype
578 @param[in] root Rank owning buffer to be copied to others
579 @param[in] comm Communicator group object to execute on
580 @param[in] stream HIP stream to execute collective on */
582ncclBcast(void* buff,
583 size_t count,
584 ncclDataType_t datatype,
585 int root,
586 ncclComm_t comm,
587 hipStream_t stream);
588/*! @cond include_hidden */
590pncclBcast(void* buff,
591 size_t count,
592 ncclDataType_t datatype,
593 int root,
594 ncclComm_t comm,
595 hipStream_t stream);
596/*! @endcond */
597
598/*! @brief Broadcast
599 @details Copies *count* values from *sendbuff* on *root* to *recvbuff* on all devices.
600 *root* is the rank (not the HIP device) where data resides before the operation is
601 started. *sendbuff* may be NULL on ranks other than *root*. In-place operation will happen if
602 *sendbuff* == *recvbuff*.
603 @return Result code. See @ref rccl_result_code for more details.
604
605 @param[in] sendbuff Data array to copy (if *root*). May be NULL for other ranks
606 @param[in] recvbuff Data array to store received array
607 @param[in] count Number of elements in data buffer
608 @param[in] datatype Data buffer element datatype
609 @param[in] root Rank of broadcast root
610 @param[in] comm Communicator group object to execute on
611 @param[in] stream HIP stream to execute collective on */
613ncclBroadcast(const void* sendbuff,
614 void* recvbuff,
615 size_t count,
616 ncclDataType_t datatype,
617 int root,
618 ncclComm_t comm,
619 hipStream_t stream);
620/*! @cond include_hidden */
622pncclBroadcast(const void* sendbuff,
623 void* recvbuff,
624 size_t count,
625 ncclDataType_t datatype,
626 int root,
627 ncclComm_t comm,
628 hipStream_t stream);
629/*! @endcond */
630
631/*! @brief All-Reduce
632 @details Reduces data arrays of length *count* in *sendbuff* using *op* operation, and
633 leaves identical copies of result on each *recvbuff*.
634 In-place operation will happen if sendbuff == recvbuff.
635 @return Result code. See @ref rccl_result_code for more details.
636
637 @param[in] sendbuff Input data array to reduce
638 @param[out] recvbuff Data array to store reduced result array
639 @param[in] count Number of elements in data buffer
640 @param[in] datatype Data buffer element datatype
641 @param[in] op Reduction operator
642 @param[in] comm Communicator group object to execute on
643 @param[in] stream HIP stream to execute collective on */
645ncclAllReduce(const void* sendbuff,
646 void* recvbuff,
647 size_t count,
648 ncclDataType_t datatype,
649 ncclRedOp_t op,
650 ncclComm_t comm,
651 hipStream_t stream);
652/*! @cond include_hidden */
654pncclAllReduce(const void* sendbuff,
655 void* recvbuff,
656 size_t count,
657 ncclDataType_t datatype,
658 ncclRedOp_t op,
659 ncclComm_t comm,
660 hipStream_t stream);
661/*! @endcond */
662
663/*! @brief Reduce-Scatter
664 @details Reduces data in *sendbuff* using *op* operation and leaves reduced result
665 scattered over the devices so that *recvbuff* on rank i will contain the i-th
666 block of the result.
667 Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff*
668 should have a size of at least nranks*recvcount elements.
669 In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
670 @return Result code. See @ref rccl_result_code for more details.
671
672 @param[in] sendbuff Input data array to reduce
673 @param[out] recvbuff Data array to store reduced result subarray
674 @param[in] recvcount Number of elements each rank receives
675 @param[in] datatype Data buffer element datatype
676 @param[in] op Reduction operator
677 @param[in] comm Communicator group object to execute on
678 @param[in] stream HIP stream to execute collective on */
680ncclReduceScatter(const void* sendbuff,
681 void* recvbuff,
682 size_t recvcount,
683 ncclDataType_t datatype,
684 ncclRedOp_t op,
685 ncclComm_t comm,
686 hipStream_t stream);
687/*! @cond include_hidden */
689pncclReduceScatter(const void* sendbuff,
690 void* recvbuff,
691 size_t recvcount,
692 ncclDataType_t datatype,
693 ncclRedOp_t op,
694 ncclComm_t comm,
695 hipStream_t stream);
696/*! @endcond */
697
698/*! @brief All-Gather
699 @details Each device gathers *sendcount* values from other GPUs into *recvbuff*,
700 receiving data from rank i at offset i*sendcount.
701 Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
702 should have a size of at least nranks*sendcount elements.
703 In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
704 @return Result code. See @ref rccl_result_code for more details.
705
706 @param[in] sendbuff Input data array to send
707 @param[out] recvbuff Data array to store the gathered result
708 @param[in] sendcount Number of elements each rank sends
709 @param[in] datatype Data buffer element datatype
710 @param[in] comm Communicator group object to execute on
711 @param[in] stream HIP stream to execute collective on */
713ncclAllGather(const void* sendbuff,
714 void* recvbuff,
715 size_t sendcount,
716 ncclDataType_t datatype,
717 ncclComm_t comm,
718 hipStream_t stream);
719/*! @cond include_hidden */
721pncclAllGather(const void* sendbuff,
722 void* recvbuff,
723 size_t sendcount,
724 ncclDataType_t datatype,
725 ncclComm_t comm,
726 hipStream_t stream);
727/*! @endcond */
728
729/*! @brief Send
730 @details Send data from *sendbuff* to rank *peer*.
731 Rank *peer* needs to call ncclRecv with the same *datatype* and the same *count*
732 as this rank.
733 This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
734 need to progress concurrently to complete, they must be fused within a
735 ncclGroupStart / ncclGroupEnd section.
736 @return Result code. See @ref rccl_result_code for more details.
737
738 @param[in] sendbuff Data array to send
739 @param[in] count Number of elements to send
740 @param[in] datatype Data buffer element datatype
741 @param[in] peer Peer rank to send to
742 @param[in] comm Communicator group object to execute on
743 @param[in] stream HIP stream to execute collective on */
745ncclSend(const void* sendbuff,
746 size_t count,
747 ncclDataType_t datatype,
748 int peer,
749 ncclComm_t comm,
750 hipStream_t stream);
751/*! @cond include_hidden */
753pncclSend(const void* sendbuff,
754 size_t count,
755 ncclDataType_t datatype,
756 int peer,
757 ncclComm_t comm,
758 hipStream_t stream);
759/*! @endcond */
760
761/*! @brief Receive
762 @details Receive data from rank *peer* into *recvbuff*.
763 Rank *peer* needs to call ncclSend with the same datatype and the same count
764 as this rank.
765 This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
766 need to progress concurrently to complete, they must be fused within a
767 ncclGroupStart/ ncclGroupEnd section.
768 @return Result code. See @ref rccl_result_code for more details.
769
770 @param[out] recvbuff Data array to receive
771 @param[in] count Number of elements to receive
772 @param[in] datatype Data buffer element datatype
773 @param[in] peer Peer rank to send to
774 @param[in] comm Communicator group object to execute on
775 @param[in] stream HIP stream to execute collective on */
777ncclRecv(void* recvbuff,
778 size_t count,
779 ncclDataType_t datatype,
780 int peer,
781 ncclComm_t comm,
782 hipStream_t stream);
783/*! @cond include_hidden */
785pncclRecv(void* recvbuff,
786 size_t count,
787 ncclDataType_t datatype,
788 int peer,
789 ncclComm_t comm,
790 hipStream_t stream);
791/*! @endcond */
792
793/*! @brief Gather
794 @details Root device gathers *sendcount* values from other GPUs into *recvbuff*,
795 receiving data from rank i at offset i*sendcount.
796 Assumes recvcount is equal to nranks*sendcount, which means that *recvbuff*
797 should have a size of at least nranks*sendcount elements.
798 In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
799 *recvbuff* may be NULL on ranks other than *root*.
800 @return Result code. See @ref rccl_result_code for more details.
801
802 @param[in] sendbuff Data array to send
803 @param[out] recvbuff Data array to receive into on *root*.
804 @param[in] sendcount Number of elements to send per rank
805 @param[in] datatype Data buffer element datatype
806 @param[in] root Rank that receives data from all other ranks
807 @param[in] comm Communicator group object to execute on
808 @param[in] stream HIP stream to execute collective on */
810ncclGather(const void* sendbuff,
811 void* recvbuff,
812 size_t sendcount,
813 ncclDataType_t datatype,
814 int root,
815 ncclComm_t comm,
816 hipStream_t stream);
817/*! @cond include_hidden */
819pncclGather(const void* sendbuff,
820 void* recvbuff,
821 size_t sendcount,
822 ncclDataType_t datatype,
823 int root,
824 ncclComm_t comm,
825 hipStream_t stream);
826/*! @endcond */
827
828/*! @brief Scatter
829 @details Scattered over the devices so that recvbuff on rank i will contain the i-th
830 block of the data on root.
831 Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff*
832 should have a size of at least nranks*recvcount elements.
833 In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
834 @return Result code. See @ref rccl_result_code for more details.
835
836 @param[in] sendbuff Data array to send (on *root* rank). May be NULL on other ranks.
837 @param[out] recvbuff Data array to receive partial subarray into
838 @param[in] recvcount Number of elements to receive per rank
839 @param[in] datatype Data buffer element datatype
840 @param[in] root Rank that scatters data to all other ranks
841 @param[in] comm Communicator group object to execute on
842 @param[in] stream HIP stream to execute collective on */
844ncclScatter(const void* sendbuff,
845 void* recvbuff,
846 size_t recvcount,
847 ncclDataType_t datatype,
848 int root,
849 ncclComm_t comm,
850 hipStream_t stream);
851/*! @cond include_hidden */
853pncclScatter(const void* sendbuff,
854 void* recvbuff,
855 size_t recvcount,
856 ncclDataType_t datatype,
857 int root,
858 ncclComm_t comm,
859 hipStream_t stream);
860/*! @endcond */
861
862/*! @brief All-To-All
863 @details Device (i) send (j)th block of data to device (j) and be placed as (i)th
864 block. Each block for sending/receiving has *count* elements, which means
865 that *recvbuff* and *sendbuff* should have a size of nranks*count elements.
866 In-place operation is NOT supported. It is the user's responsibility
867 to ensure that sendbuff and recvbuff are distinct.
868 @return Result code. See @ref rccl_result_code for more details.
869
870 @param[in] sendbuff Data array to send (contains blocks for each other rank)
871 @param[out] recvbuff Data array to receive (contains blocks from each other rank)
872 @param[in] count Number of elements to send between each pair of ranks
873 @param[in] datatype Data buffer element datatype
874 @param[in] comm Communicator group object to execute on
875 @param[in] stream HIP stream to execute collective on */
876ncclResult_t
877ncclAllToAll(const void* sendbuff,
878 void* recvbuff,
879 size_t count,
880 ncclDataType_t datatype,
881 ncclComm_t comm,
882 hipStream_t stream);
883/*! @cond include_hidden */
884ncclResult_t
885pncclAllToAll(const void* sendbuff,
886 void* recvbuff,
887 size_t count,
888 ncclDataType_t datatype,
889 ncclComm_t comm,
890 hipStream_t stream);
891/*! @endcond */
892
893/*! @brief All-To-Allv
894 @details Device (i) sends sendcounts[j] of data from offset sdispls[j]
895 to device (j). At the same time, device (i) receives recvcounts[j] of data
896 from device (j) to be placed at rdispls[j].
897 sendcounts, sdispls, recvcounts and rdispls are all measured in the units
898 of datatype, not bytes.
899 In-place operation will happen if sendbuff == recvbuff.
900 @return Result code. See @ref rccl_result_code for more details.
901
902 @param[in] sendbuff Data array to send (contains blocks for each other rank)
903 @param[in] sendcounts Array containing number of elements to send to each participating rank
904 @param[in] sdispls Array of offsets into *sendbuff* for each participating rank
905 @param[out] recvbuff Data array to receive (contains blocks from each other rank)
906 @param[in] recvcounts Array containing number of elements to receive from each participating
907 rank
908 @param[in] rdispls Array of offsets into *recvbuff* for each participating rank
909 @param[in] datatype Data buffer element datatype
910 @param[in] comm Communicator group object to execute on
911 @param[in] stream HIP stream to execute collective on */
912ncclResult_t
913ncclAllToAllv(const void* sendbuff,
914 const size_t sendcounts[],
915 const size_t sdispls[],
916 void* recvbuff,
917 const size_t recvcounts[],
918 const size_t rdispls[],
919 ncclDataType_t datatype,
920 ncclComm_t comm,
921 hipStream_t stream);
922/*! @cond include_hidden */
923ncclResult_t
924pncclAllToAllv(const void* sendbuff,
925 const size_t sendcounts[],
926 const size_t sdispls[],
927 void* recvbuff,
928 const size_t recvcounts[],
929 const size_t rdispls[],
930 ncclDataType_t datatype,
931 ncclComm_t comm,
932 hipStream_t stream);
933/*! @endcond */
934
935/*! @} */
936
937/*! @defgroup msccl_api MSCCL Algorithm
938 @details API calls relating to the optional MSCCL algorithm datapath
939 @{ */
940
941/*! @brief Opaque handle to MSCCL algorithm */
942typedef int mscclAlgoHandle_t;
943
944/*! @brief MSCCL Load Algorithm
945 @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return
946 its handle via mscclAlgoHandle. This API is expected to be called by MSCCL
947 scheduler instead of end users.
948 @return Result code. See @ref rccl_result_code for more details.
949
950 @param[in] mscclAlgoFilePath Path to MSCCL algorithm file
951 @param[out] mscclAlgoHandle Returned handle to MSCCL algorithm
952 @param[in] rank Current rank */
953ncclResult_t
954mscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank);
955/*! @cond include_hidden */
956ncclResult_t
957pmscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank);
958/*! @endcond */
959
960/*! @brief MSCCL Run Algorithm
961 @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter
962 list merges all possible parameters required by different operations as this
963 is a general-purpose API. This API is expected to be called by the MSCCL
964 scheduler instead of end users.
965 @return Result code. See @ref rccl_result_code for more details.
966
967 @param[in] sendBuff Data array to send
968 @param[in] sendCounts Array containing number of elements to send to each participating
969 rank
970 @param[in] sDisPls Array of offsets into *sendbuff* for each participating rank
971 @param[out] recvBuff Data array to receive
972 @param[in] recvCounts Array containing number of elements to receive from each
973 participating rank
974 @param[in] rDisPls Array of offsets into *recvbuff* for each participating rank
975 @param[in] count Number of elements
976 @param[in] dataType Data buffer element datatype
977 @param[in] root Root rank index
978 @param[in] peer Peer rank index
979 @param[in] op Reduction operator
980 @param[in] mscclAlgoHandle Handle to MSCCL algorithm
981 @param[in] comm Communicator group object to execute on
982 @param[in] stream HIP stream to execute collective on */
983ncclResult_t
984mscclRunAlgo(const void* sendBuff,
985 const size_t sendCounts[],
986 const size_t sDisPls[],
987 void* recvBuff,
988 const size_t recvCounts[],
989 const size_t rDisPls[],
990 size_t count,
991 ncclDataType_t dataType,
992 int root,
993 int peer,
994 ncclRedOp_t op,
995 mscclAlgoHandle_t mscclAlgoHandle,
996 ncclComm_t comm,
997 hipStream_t stream);
998/*! @cond include_hidden */
999ncclResult_t
1000pmscclRunAlgo(const void* sendBuff,
1001 const size_t sendCounts[],
1002 const size_t sDisPls[],
1003 void* recvBuff,
1004 const size_t recvCounts[],
1005 const size_t rDisPls[],
1006 size_t count,
1007 ncclDataType_t dataType,
1008 int root,
1009 int peer,
1010 ncclRedOp_t op,
1011 mscclAlgoHandle_t mscclAlgoHandle,
1012 ncclComm_t comm,
1013 hipStream_t stream);
1014/*! @endcond */
1015
1016/*! @brief MSCCL Unload Algorithm
1017 @deprecated This function has been removed from the public API.
1018 @details Unload an MSCCL algorithm previously loaded using its handle. This API
1019 is expected to be called by MSCCL scheduler instead of end users.
1020 @return Result code. See @ref rccl_result_code for more details.
1021
1022 @param[in] mscclAlgoHandle Handle to MSCCL algorithm to unload
1023*/
1024ncclResult_t
1025mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
1026/*! @cond include_hidden */
1027ncclResult_t
1028pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
1029/*! @endcond */
1030/*! @} */
1031
1032/*! @defgroup rccl_group_api Group semantics
1033 @details When managing multiple GPUs from a single thread, and since RCCL collective
1034 calls may perform inter-CPU synchronization, we need to "group" calls for
1035 different ranks/devices into a single call.
1036
1037 Grouping RCCL calls as being part of the same collective operation is done
1038 using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
1039 collective calls until the ncclGroupEnd call, which will wait for all calls
1040 to be complete. Note that for collective communication, ncclGroupEnd only
1041 guarantees that the operations are enqueued on the streams, not that
1042 the operation is effectively done.
1043
1044 Both collective communication and ncclCommInitRank can be used in conjunction
1045 with ncclGroupStart/ncclGroupEnd, but not together.
1046
1047 Group semantics also allow fusing multiple operations on the same device
1048 to improve performance (for aggregated collective calls), or to permit
1049 concurrent progress of multiple send/receive operations.
1050 @{ */
1051
1052/*! @brief Group Start
1053 @details Start a group call. All calls to RCCL until ncclGroupEnd will be fused into
1054 a single RCCL operation. Nothing will be started on the HIP stream until
1055 ncclGroupEnd.
1056 @return Result code. See @ref rccl_result_code for more details. */
1057ncclResult_t
1058ncclGroupStart();
1059/*! @cond include_hidden */
1060ncclResult_t
1061pncclGroupStart();
1062/*! @endcond */
1063
1064/*! @brief Group End
1065 @details End a group call. Start a fused RCCL operation consisting of all calls since
1066 ncclGroupStart. Operations on the HIP stream depending on the RCCL operations
1067 need to be called after ncclGroupEnd.
1068 @return Result code. See @ref rccl_result_code for more details. */
1069ncclResult_t
1070ncclGroupEnd();
1071/*! @cond include_hidden */
1072ncclResult_t
1073pncclGroupEnd();
1074/*! @endcond */
1075/*! @} */
1076
1077#ifdef __cplusplus
1078} // end extern "C"
1079#endif
1080
1081#endif // end include guard
ncclResult_t mscclRunAlgo(const void *sendBuff, const unsigned long sendCounts[], const unsigned long sDisPls[], void *recvBuff, const unsigned long recvCounts[], const unsigned long rDisPls[], unsigned long count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream)
MSCCL Run Algorithm.
int mscclAlgoHandle_t
Opaque handle to MSCCL algorithm.
Definition rccl.h:942
ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank)
MSCCL Load Algorithm.
ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle)
MSCCL Unload Algorithm.
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int *device)
Get the ROCm device index associated with a communicator.
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int *rank)
Get the rank associated with a communicator.
ncclResult_t ncclCommCount(const ncclComm_t comm, int *count)
Gets the number of ranks in the communicator clique.
ncclResult_t ncclCommInitRank(ncclComm_t *comm, int nranks, ncclUniqueId commId, int rank)
Creates a new communicator (multi thread/process version).
ncclResult_t ncclCommFinalize(ncclComm_t comm)
Finalize a communicator.
ncclResult_t ncclCommDestroy(ncclComm_t comm)
Frees local resources associated with communicator object.
ncclResult_t ncclGetUniqueId(ncclUniqueId *uniqueId)
Generates an ID for ncclCommInitRank.
ncclResult_t ncclCommAbort(ncclComm_t comm)
Abort any in-progress calls and destroy the communicator object.
ncclResult_t ncclCommInitRankConfig(ncclComm_t *comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t *config)
Create a new communicator with config.
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config)
Create one or more communicators from an existing one.
ncclResult_t ncclCommInitAll(ncclComm_t *comm, int ndev, const int *devlist)
Creates a clique of communicators (single process version).
ncclScalarResidence_t
Location and dereferencing logic for scalar arguments.
Definition rccl.h:467
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm)
Create a custom pre-multiplier reduction operator.
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm)
Destroy custom reduction operator.
@ ncclScalarHostImmediate
Definition rccl.h:469
@ ncclScalarDevice
Definition rccl.h:468
ncclRedOp_dummy_t
Dummy reduction enumeration.
Definition rccl.h:400
ncclRedOp_t
Reduction operation selector.
Definition rccl.h:415
ncclDataType_t
Data types.
Definition rccl.h:429
@ ncclNumOps_dummy
Definition rccl.h:401
@ ncclMin
Definition rccl.h:419
@ ncclNumOps
Definition rccl.h:421
@ ncclSum
Definition rccl.h:416
@ ncclMaxRedOp
Definition rccl.h:422
@ ncclProd
Definition rccl.h:417
@ ncclMax
Definition rccl.h:418
@ ncclAvg
Definition rccl.h:420
@ ncclFloat64
Definition rccl.h:442
@ ncclHalf
Definition rccl.h:439
@ ncclInt32
Definition rccl.h:433
@ ncclFp8E5M2
Definition rccl.h:447
@ ncclFp8E4M3
Definition rccl.h:446
@ ncclNumTypes
Definition rccl.h:448
@ ncclUint32
Definition rccl.h:435
@ ncclInt8
Definition rccl.h:430
@ ncclDouble
Definition rccl.h:443
@ ncclFloat32
Definition rccl.h:440
@ ncclBfloat16
Definition rccl.h:444
@ ncclFloat
Definition rccl.h:441
@ ncclChar
Definition rccl.h:431
@ ncclInt
Definition rccl.h:434
@ ncclUint64
Definition rccl.h:437
@ ncclUint8
Definition rccl.h:432
@ ncclInt64
Definition rccl.h:436
@ ncclFloat16
Definition rccl.h:438
const char * ncclGetErrorString(ncclResult_t result)
Returns a string for each result code.
const char * ncclGetLastError(ncclComm_t comm)
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError)
Checks whether the comm has encountered any asynchronous errors.
ncclResult_t ncclGetVersion(int *version)
Return the RCCL_VERSION_CODE of RCCL in the supplied integer.
ncclResult_t ncclBroadcast(const void *sendbuff, void *recvbuff, unsigned long count, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream)
Broadcast.
ncclResult_t ncclScatter(const void *sendbuff, void *recvbuff, unsigned long recvcount, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream)
Scatter.
ncclResult_t ncclBcast(void *buff, unsigned long count, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream)
(Deprecated) Broadcast (in-place)
ncclResult_t ncclAllGather(const void *sendbuff, void *recvbuff, unsigned long sendcount, ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream)
All-Gather.
ncclResult_t ncclAllToAll(const void *sendbuff, void *recvbuff, unsigned long count, ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream)
All-To-All.
ncclResult_t ncclRecv(void *recvbuff, unsigned long count, ncclDataType_t datatype, int peer, ncclComm_t comm, hipStream_t stream)
Receive.
ncclResult_t ncclReduceScatter(const void *sendbuff, void *recvbuff, unsigned long recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream)
Reduce-Scatter.
ncclResult_t ncclGather(const void *sendbuff, void *recvbuff, unsigned long sendcount, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream)
Gather.
ncclResult_t ncclAllReduce(const void *sendbuff, void *recvbuff, unsigned long count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream)
All-Reduce.
ncclResult_t ncclAllToAllv(const void *sendbuff, const unsigned long sendcounts[], const unsigned long sdispls[], void *recvbuff, const unsigned long recvcounts[], const unsigned long rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream)
All-To-Allv.
ncclResult_t ncclReduce(const void *sendbuff, void *recvbuff, unsigned long count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream)
Reduce.
ncclResult_t ncclSend(const void *sendbuff, unsigned long count, ncclDataType_t datatype, int peer, ncclComm_t comm, hipStream_t stream)
Send.
int minCTAs
Definition rccl.h:89
const char * netName
Definition rccl.h:91
unsigned int magic
Definition rccl.h:84
int blocking
Definition rccl.h:87
unsigned long size
Definition rccl.h:83
int maxCTAs
Definition rccl.h:90
int splitShare
Definition rccl.h:92
unsigned int version
Definition rccl.h:85
int cgaClusterSize
Definition rccl.h:88
Communicator configuration.
Definition rccl.h:81
ncclResult_t ncclGroupStart()
Group Start.
ncclResult_t ncclGroupEnd()
Group End.
ncclResult_t
Result type.
Definition rccl.h:56
@ ncclInvalidUsage
Definition rccl.h:62
@ ncclInProgress
Definition rccl.h:64
@ ncclNumResults
Definition rccl.h:65
@ ncclRemoteError
Definition rccl.h:63
@ ncclSystemError
Definition rccl.h:59
@ ncclInternalError
Definition rccl.h:60
@ ncclSuccess
Definition rccl.h:57
@ ncclInvalidArgument
Definition rccl.h:61
@ ncclUnhandledCudaError
Definition rccl.h:58
ncclResult_t ncclCommRegister(const ncclComm_t comm, void *buff, unsigned long size, void **handle)
ncclResult_t ncclMemFree(void *ptr)
const struct ncclComm * ncclComm_t
Opaque handle to communicator.
Definition rccl.h:38
#define NCCL_UNIQUE_ID_BYTES
Definition rccl.h:41
ncclResult_t ncclMemAlloc(void **ptr, unsigned long size)
ncclResult_t pncclMemFree(void *ptr)
ncclResult_t pncclMemAlloc(void **ptr, unsigned long size)
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void *handle)
Opaque unique id used to initialize communicators.
Definition rccl.h:45