Rocprofiler SDK Developer API 0.6.0 (ROCm Profiling API and tools)
rocprofiler-sdk/rccl/details/rccl.h Source File

rccl.h
1/*************************************************************************
2 * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
3 * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
4 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
5 *
6 * See LICENSE.txt for license information
7 ************************************************************************/
8
9#ifndef NCCL_H_
10#define NCCL_H_
11
12#include <hip/hip_fp16.h>
13#include <hip/hip_runtime.h>
14
15#define NCCL_MAJOR 2
16#define NCCL_MINOR 20
17#define NCCL_PATCH 5
18#define NCCL_SUFFIX ""
19
20#define NCCL_VERSION_CODE 22005
21#define NCCL_VERSION(X, Y, Z) \
22 (((X) <= 2 && (Y) <= 8) ? (X) *1000 + (Y) *100 + (Z) : (X) *10000 + (Y) *100 + (Z))
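/* Worked example of the encoding above: for this header, NCCL_VERSION(2, 20, 5)
 * takes the post-2.8 branch, giving 2*10000 + 20*100 + 5 = 22005, which matches
 * NCCL_VERSION_CODE. */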
23
24#define RCCL_BFLOAT16 1
25#define RCCL_FLOAT8 1
26#define RCCL_GATHER_SCATTER 1
27#define RCCL_ALLTOALLV 1
28
29#ifdef __cplusplus
30extern "C" {
31#endif
32
33#include <limits.h>
34
35/*! @brief Opaque handle to communicator
36 @details A communicator contains information required to facilitate collective communication
37 calls */
38typedef const struct ncclComm* ncclComm_t;
39#define NCCL_COMM_NULL NULL
40
41#define NCCL_UNIQUE_ID_BYTES 128
42/*! @brief Opaque unique id used to initialize communicators
43 @details The ncclUniqueId must be passed to all participating ranks */
44typedef struct
45{
46 char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array */
47} ncclUniqueId;
48
49/*! @defgroup rccl_result_code Result Codes
50 @brief The various result codes that RCCL API calls may return
51 @{ */
52
53/*! @brief Result type
54 @details Return codes aside from ncclSuccess indicate that a call has failed */
55typedef enum
56{
57 ncclSuccess = 0, /*!< No error */
58 ncclUnhandledCudaError = 1, /*!< Unhandled HIP error */
59 ncclSystemError = 2, /*!< Unhandled system error */
60 ncclInternalError = 3, /*!< Internal Error - Please report to RCCL developers */
61 ncclInvalidArgument = 4, /*!< Invalid argument */
62 ncclInvalidUsage = 5, /*!< Invalid usage */
63 ncclRemoteError = 6, /*!< Remote process exited or there was a network error */
64 ncclInProgress = 7, /*!< RCCL operation in progress */
65 ncclNumResults = 8 /*!< Number of result types */
66} ncclResult_t;
67/*! @} */
68
69#define NCCL_CONFIG_UNDEF_INT INT_MIN
70#define NCCL_CONFIG_UNDEF_PTR NULL
71#define NCCL_SPLIT_NOCOLOR -1
72
73/*! @defgroup rccl_config_type Communicator Configuration
74 @brief Structure that allows for customizing Communicator behavior via
75 ncclCommInitRankConfig
76 @{ */
77
78/**
79 * @defgroup Communicator configuration
80 * @brief Users can assign values to attributes to specify the behavior of a communicator.
81 */
82typedef struct ncclConfig_v21700
83{
84 /* attributes that users should never touch. */
85 size_t size; /*!< Should not be touched */
86 unsigned int magic; /*!< Should not be touched */
87 unsigned int version; /*!< Should not be touched */
88 /* attributes that users are able to customize. */
89 int blocking; /*!< Whether or not calls should block */
90 int cgaClusterSize; /*!< Cooperative group array cluster size */
91 int minCTAs; /*!< Minimum number of cooperative thread arrays (blocks) */
92 int maxCTAs; /*!< Maximum number of cooperative thread arrays (blocks) */
93 const char* netName; /*!< Force NCCL to use a specific network */
94 int splitShare; /*!< Allow communicators to share resources */
95} ncclConfig_t;
96
97/* The config initializer must be assigned to the config structure when it is created.
98 * An uninitialized config will result in an error. */
99#define NCCL_CONFIG_INITIALIZER \
100 { \
101 sizeof(ncclConfig_t), /* size */ \
102 0xcafebeef, /* magic */ \
103 NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
104 NCCL_CONFIG_UNDEF_INT, /* blocking */ \
105 NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
106 NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
107 NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
108 NCCL_CONFIG_UNDEF_PTR, /* netName */ \
109 NCCL_CONFIG_UNDEF_INT /* splitShare */ \
110 }
111/*! @} */
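/* Usage sketch (illustrative only): initializing a communicator with a customized,
 * non-blocking configuration. Assumes `id` came from ncclGetUniqueId on rank 0 and
 * was distributed out-of-band, and that `rank`/`nranks` come from the launcher.
 * @code{.cpp}
 * ncclConfig_t config = NCCL_CONFIG_INITIALIZER; // mandatory initializer
 * config.blocking     = 0;                       // may return ncclInProgress instead of blocking
 * config.minCTAs      = 4;                       // hint: use at least 4 blocks per collective
 *
 * ncclComm_t   comm = NCCL_COMM_NULL;
 * ncclResult_t rc   = ncclCommInitRankConfig(&comm, nranks, id, rank, &config);
 *
 * // With blocking == 0, initialization may still be in flight; poll for completion.
 * while (rc == ncclInProgress)
 *     ncclCommGetAsyncError(comm, &rc);
 * @endcode */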
112
113/* NCCL malloc and free functions for all types of NCCL optimizations
114 * (e.g. user buffer registration). The actual allocated size might
115 * be larger than requested due to granularity requirements. */
116ncclResult_t
117ncclMemAlloc(void** ptr, size_t size);
118ncclResult_t
119pncclMemAlloc(void** ptr, size_t size);
120
121ncclResult_t
122ncclMemFree(void* ptr);
123ncclResult_t
124pncclMemFree(void* ptr);
125
126/*! @defgroup rccl_api_version Version Information
127 @brief API call that returns RCCL version
128 @{ */
129
130/*! @brief Return the RCCL_VERSION_CODE of RCCL in the supplied integer.
131 @details This integer is coded with the MAJOR, MINOR and PATCH level of RCCL.
132 @return Result code. See @ref rccl_result_code for more details.
133
134 @param[out] version Pointer to where version will be stored */
135
136ncclResult_t
137ncclGetVersion(int* version);
138/*! @cond include_hidden */
139ncclResult_t
140pncclGetVersion(int* version);
141/*! @endcond */
142/*! @} */
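/* Usage sketch (illustrative only; assumes <stdio.h>): querying and decoding the
 * runtime version, using the post-2.8 encoding produced by NCCL_VERSION above.
 * @code{.cpp}
 * int version = 0;
 * if (ncclGetVersion(&version) == ncclSuccess)
 *     printf("RCCL %d.%d.%d\n", version / 10000, (version % 10000) / 100, version % 100);
 * @endcode */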
143
144/*! @defgroup rccl_api_communicator Communicator Initialization/Destruction
145 @brief API calls that operate on communicators.
146 Communicator objects are used to launch collective communication
147 operations. Unique ranks between 0 and N-1 must be assigned to
148 each HIP device participating in the same Communicator.
149 Using the same HIP device for multiple ranks of the same Communicator
150 is not supported at this time.
151 @{ */
152
153/*! @brief Generates an ID for ncclCommInitRank.
154 @details Generates an ID to be used in ncclCommInitRank.
155 ncclGetUniqueId should be called once by a single rank and the
156 ID should be distributed to all ranks in the communicator before
157 using it as a parameter for ncclCommInitRank.
158 @return Result code. See @ref rccl_result_code for more details.
159
160 @param[out] uniqueId Pointer to where uniqueId will be stored */
161ncclResult_t
162ncclGetUniqueId(ncclUniqueId* uniqueId);
163/*! @cond include_hidden */
164ncclResult_t
165pncclGetUniqueId(ncclUniqueId* uniqueId);
166/*! @endcond */
167
168/*! @brief Create a new communicator with config.
169 @details Create a new communicator (multi thread/process version) with a configuration
170 set by users. See @ref rccl_config_type for more details.
171 Each rank is associated to a CUDA device, which has to be set before calling
172 ncclCommInitRank.
173 @return Result code. See @ref rccl_result_code for more details.
174
175 @param[out] comm Pointer to created communicator
176 @param[in] nranks Total number of ranks participating in this communicator
177 @param[in] commId UniqueId required for initialization
178 @param[in] rank Current rank to create communicator for. [0 to nranks-1]
179 @param[in] config Pointer to communicator configuration */
180ncclResult_t
181ncclCommInitRankConfig(ncclComm_t* comm,
182 int nranks,
183 ncclUniqueId commId,
184 int rank,
185 ncclConfig_t* config);
186/*! @cond include_hidden */
187ncclResult_t
188pncclCommInitRankConfig(ncclComm_t* comm,
189 int nranks,
190 ncclUniqueId commId,
191 int rank,
192 ncclConfig_t* config);
193/*! @endcond */
194
195/*! @brief Creates a new communicator (multi thread/process version).
196 @details Rank must be between 0 and nranks-1 and unique within a communicator clique.
197 Each rank is associated to a CUDA device, which has to be set before calling
198 ncclCommInitRank. ncclCommInitRank implicitly synchronizes with other ranks,
199 so it must be called by different threads/processes or use
200 ncclGroupStart/ncclGroupEnd.
201 @return Result code. See @ref rccl_result_code for more details.
202
203 @param[out] comm Pointer to created communicator
204 @param[in] nranks Total number of ranks participating in this communicator
205 @param[in] commId UniqueId required for initialization
206 @param[in] rank Current rank to create communicator for */
207ncclResult_t
208ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
209/*! @cond include_hidden */
210ncclResult_t
211pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
212/*! @endcond */
213
214/*! @brief Creates a clique of communicators (single process version).
215 @details This is a convenience function to create a single-process communicator clique.
216 Returns an array of ndev newly initialized communicators in comm.
217 comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
218 If devlist is NULL, the first ndev HIP devices are used.
219 Order of devlist defines user-order of processors within the communicator.
220 @return Result code. See @ref rccl_result_code for more details.
221
222 @param[out] comm Pointer to array of created communicators
223 @param[in] ndev Total number of ranks participating in this communicator
224 @param[in] devlist Array of GPU device indices to create for */
225ncclResult_t
226ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
227/*! @cond include_hidden */
228ncclResult_t
229pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
230/*! @endcond */
231
232/*! @brief Finalize a communicator.
233 @details ncclCommFinalize flushes all issued communications
234 and marks communicator state as ncclInProgress. The state will change to ncclSuccess
235 when the communicator is globally quiescent and related resources are freed; then,
236 calling ncclCommDestroy can locally free the rest of the resources (e.g.
237 communicator itself) without blocking.
238 @return Result code. See @ref rccl_result_code for more details.
239
240 @param[in] comm Communicator to finalize */
241ncclResult_t
242ncclCommFinalize(ncclComm_t comm);
243/*! @cond include_hidden */
244ncclResult_t
245pncclCommFinalize(ncclComm_t comm);
246/*! @endcond */
247
248/*! @brief Frees local resources associated with communicator object.
249 @details Destroy all local resources associated with the passed in communicator object
250 @return Result code. See @ref rccl_result_code for more details.
251
252 @param[in] comm Communicator to destroy */
253ncclResult_t
254ncclCommDestroy(ncclComm_t comm);
255/*! @cond include_hidden */
256ncclResult_t
257pncclCommDestroy(ncclComm_t comm);
258/*! @endcond */
259
260/*! @brief Abort any in-progress calls and destroy the communicator object.
261 @details Frees resources associated with communicator object and aborts any operations
262 that might still be running on the device.
263 @return Result code. See @ref rccl_result_code for more details.
264
265 @param[in] comm Communicator to abort and destroy */
266ncclResult_t
267ncclCommAbort(ncclComm_t comm);
268/*! @cond include_hidden */
269ncclResult_t
270pncclCommAbort(ncclComm_t comm);
271/*! @endcond */
272
273/*! @brief Create one or more communicators from an existing one.
274 @details Creates one or more communicators from an existing one.
275 Ranks with the same color will end up in the same communicator.
276 Within the new communicator, key will be used to order ranks.
277 NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
278 and will therefore return a NULL communicator.
279 If config is NULL, the new communicator will inherit the original communicator's
280 configuration
281 @return Result code. See @ref rccl_result_code for more details.
282
283 @param[in] comm Original communicator object for this rank
284 @param[in] color Color to assign this rank
285 @param[in] key Key used to order ranks within the same new communicator
286 @param[out] newcomm Pointer to new communicator
287 @param[in] config Configuration for the new communicator. May be NULL to inherit from comm */
288ncclResult_t
289ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config);
290/*! @cond include_hidden */
291ncclResult_t
292pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config);
293/*! @endcond */
294/*! @} */
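/* Usage sketch (illustrative only; assumes <vector> and a single process driving all
 * visible HIP devices): create a clique with ncclCommInitAll, then tear it down.
 * @code{.cpp}
 * int ndev = 0;
 * hipGetDeviceCount(&ndev);
 *
 * std::vector<ncclComm_t> comms(ndev);
 * // NULL devlist selects the first ndev HIP devices, one rank per device
 * if (ncclCommInitAll(comms.data(), ndev, nullptr) != ncclSuccess)
 * {
 *     // handle initialization failure
 * }
 *
 * // ... launch collectives on each comms[i] ...
 *
 * for (int i = 0; i < ndev; ++i)
 * {
 *     ncclCommFinalize(comms[i]); // flush outstanding communication
 *     ncclCommDestroy(comms[i]);  // then release local resources
 * }
 * @endcode */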
295
296/*! @defgroup rccl_api_errcheck Error Checking Calls
297 @brief API calls that check for errors
298 @{ */
299
300/*! @brief Returns a string for each result code.
301 @details Returns a human-readable string describing the given result code.
302 @return String containing description of result code.
303
304 @param[in] result Result code to get description for */
305const char*
306ncclGetErrorString(ncclResult_t result);
307/*! @cond include_hidden */
308const char*
309pncclGetErrorString(ncclResult_t result);
310/*! @endcond */
311
312/* Returns a human-readable message of the last error that occurred. */
313const char*
314ncclGetLastError(ncclComm_t comm);
315/*! @cond include_hidden */
316const char*
317pncclGetLastError(ncclComm_t comm);
318/*! @endcond */
319
320/*! @brief Checks whether the comm has encountered any asynchronous errors
321 @details Query whether the provided communicator has encountered any asynchronous errors
322 @return Result code. See @ref rccl_result_code for more details.
323
324 @param[in] comm Communicator to query
325 @param[out] asyncError Pointer to where result code will be stored */
326ncclResult_t
327ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
328/*! @cond include_hidden */
329ncclResult_t
330pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
331/*! @endcond */
332/*! @} */
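/* Usage sketch (illustrative only; assumes <stdio.h> and an initialized `comm`):
 * polling a non-blocking communicator and reporting failures.
 * @code{.cpp}
 * ncclResult_t state = ncclSuccess;
 * do
 * {
 *     if (ncclCommGetAsyncError(comm, &state) != ncclSuccess)
 *         break; // the query itself failed
 * } while (state == ncclInProgress);
 *
 * if (state != ncclSuccess)
 *     fprintf(stderr, "RCCL error: %s (%s)\n", ncclGetErrorString(state), ncclGetLastError(comm));
 * @endcode */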
333
334/*! @defgroup rccl_api_comminfo Communicator Information
335 @brief API calls that query communicator information
336 @{ */
337
338/*! @brief Gets the number of ranks in the communicator clique.
339 @details Returns the number of ranks in the communicator clique (as set during
340 initialization)
341 @return Result code. See @ref rccl_result_code for more details.
342
343 @param[in] comm Communicator to query
344 @param[out] count Pointer to where number of ranks will be stored */
345ncclResult_t
346ncclCommCount(const ncclComm_t comm, int* count);
347/*! @cond include_hidden */
348ncclResult_t
349pncclCommCount(const ncclComm_t comm, int* count);
350/*! @endcond */
351
352/*! @brief Get the ROCm device index associated with a communicator
353 @details Returns the ROCm device number associated with the provided communicator.
354 @return Result code. See @ref rccl_result_code for more details.
355
356 @param[in] comm Communicator to query
357 @param[out] device Pointer to where the associated ROCm device index will be stored */
358ncclResult_t
359ncclCommCuDevice(const ncclComm_t comm, int* device);
360/*! @cond include_hidden */
361ncclResult_t
362pncclCommCuDevice(const ncclComm_t comm, int* device);
363/*! @endcond */
364
365/*! @brief Get the rank associated with a communicator
366 @details Returns the user-ordered "rank" associated with the provided communicator.
367 @return Result code. See @ref rccl_result_code for more details.
368
369 @param[in] comm Communicator to query
370 @param[out] rank Pointer to where the associated rank will be stored */
371ncclResult_t
372ncclCommUserRank(const ncclComm_t comm, int* rank);
373/*! @cond include_hidden */
374ncclResult_t
375pncclCommUserRank(const ncclComm_t comm, int* rank);
376/*! @endcond */
377/*! @} */
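/* Usage sketch (illustrative only; assumes an initialized `comm`): querying basic
 * communicator information.
 * @code{.cpp}
 * int nranks = 0, rank = 0, device = 0;
 * ncclCommCount(comm, &nranks);    // ranks in the clique
 * ncclCommUserRank(comm, &rank);   // this rank's index, 0..nranks-1
 * ncclCommCuDevice(comm, &device); // HIP device index backing this rank
 * @endcode */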
378
379/* Register CUDA buffer for zero-copy operation */
380ncclResult_t
381ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
382/*! @cond include_hidden */
383ncclResult_t
384pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
385/*! @endcond */
386
387/* Deregister CUDA buffer */
388ncclResult_t
389ncclCommDeregister(const ncclComm_t comm, void* handle);
390/*! @cond include_hidden */
391ncclResult_t
392pncclCommDeregister(const ncclComm_t comm, void* handle);
393/*! @endcond */
394
395/*! @defgroup rccl_api_enumerations API Enumerations
396 @brief Enumerations used by collective communication calls
397 @{ */
398
399/*! @brief Dummy reduction enumeration
400 @details Dummy reduction enumeration used to determine value for ncclMaxRedOp */
401typedef enum
402{
403 ncclNumOps_dummy = 5
404} ncclRedOp_dummy_t;
405
406/*! @brief Reduction operation selector
407 @details Enumeration used to specify the various reduction operations
408 ncclNumOps is the number of built-in ncclRedOp_t values and serves as
409 the least possible value for dynamic ncclRedOp_t values constructed by
410 ncclRedOpCreate functions.
411
412 ncclMaxRedOp is the largest valid value for ncclRedOp_t and is defined
413 to be the largest signed value (since compilers are permitted to use
414 signed enums) that won't grow sizeof(ncclRedOp_t) when compared to previous
415 RCCL versions to maintain ABI compatibility. */
416typedef enum
417{
418 ncclSum = 0, /*!< Sum */
419 ncclProd = 1, /*!< Product */
420 ncclMax = 2, /*!< Max */
421 ncclMin = 3, /*!< Min */
422 ncclAvg = 4, /*!< Average */
423 ncclNumOps = 5, /*!< Number of built-in reduction ops */
424 ncclMaxRedOp =
425 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t)) /*!< Largest value for ncclRedOp_t */
426} ncclRedOp_t;
427
428/*! @brief Data types
429 @details Enumeration of the various supported datatypes */
430typedef enum
431{
432 ncclInt8 = 0,
433 ncclChar = 0,
434 ncclUint8 = 1,
435 ncclInt32 = 2,
436 ncclInt = 2,
437 ncclUint32 = 3,
438 ncclInt64 = 4,
439 ncclUint64 = 5,
440 ncclFloat16 = 6,
441 ncclHalf = 6,
442 ncclFloat32 = 7,
443 ncclFloat = 7,
444 ncclFloat64 = 8,
445 ncclDouble = 8,
446 ncclBfloat16 = 9,
447#if defined(RCCL_FLOAT8)
448 ncclFp8E4M3 = 10,
449 ncclFp8E5M2 = 11,
450 ncclNumTypes = 12
451} ncclDataType_t;
452#else
453 ncclNumTypes = 10
454} ncclDataType_t;
455#endif
456/*! @} */
457
458/*! @defgroup rccl_api_custom_redop Custom Reduction Operator
459 @brief API calls relating to creation/destroying custom reduction operator
460 that pre-multiplies local source arrays prior to reduction
461 @{ */
462
463/*! @brief Location and dereferencing logic for scalar arguments.
464 @details Enumeration specifying memory location of the scalar argument.
465 Based on where the value is stored, the argument will be dereferenced either
466 while the collective is running (if in device memory), or before the
467 ncclRedOpCreate() function returns (if in host memory). */
468typedef enum
469{
470 ncclScalarDevice = 0, /*!< Scalar is in device-visible memory */
471 ncclScalarHostImmediate = 1 /*!< Scalar is in host-visible memory */
472} ncclScalarResidence_t;
473
474/*! @brief Create a custom pre-multiplier reduction operator
475 @details Creates a new reduction operator which pre-multiplies input values by a given
476 scalar locally before reducing them with peer values via summation. For use
477 only with collectives launched against *comm* and *datatype*. The
478 *residence* argument indicates how/when the memory pointed to by *scalar*
479 will be dereferenced. Upon return, the newly created operator's handle
480 is stored in *op*.
481 @return Result code. See @ref rccl_result_code for more details.
482
483 @param[out] op Pointer to where newly created custom reduction operator is to be
484 stored
485 @param[in] scalar Pointer to scalar value.
486 @param[in] datatype Scalar value datatype
487 @param[in] residence Memory type of the scalar value
488 @param[in] comm Communicator to associate with this custom reduction operator */
489ncclResult_t
490ncclRedOpCreatePreMulSum(ncclRedOp_t* op,
491 void* scalar,
492 ncclDataType_t datatype,
493 ncclScalarResidence_t residence,
494 ncclComm_t comm);
495/*! @cond include_hidden */
496ncclResult_t
497pncclRedOpCreatePreMulSum(ncclRedOp_t* op,
498 void* scalar,
499 ncclDataType_t datatype,
500 ncclScalarResidence_t residence,
501 ncclComm_t comm);
502/*! @endcond */
503
504/*! @brief Destroy custom reduction operator
505 @details Destroys the reduction operator *op*. The operator must have been created by
506 ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
507 destroyed as soon as the last RCCL function which is given that operator returns.
508 @return Result code. See @ref rccl_result_code for more details.
509
510 @param[in] op Custom reduction operator is to be destroyed
511 @param[in] comm Communicator associated with this reduction operator */
512ncclResult_t
513ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
514/*! @cond include_hidden */
515ncclResult_t
516pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
517/*! @endcond */
518/*! @} */
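/* Usage sketch (illustrative only; assumes an initialized `comm`, a `stream`, and
 * device buffers `d_in`/`d_out` holding `count` floats): an average built from a
 * host-immediate 1/nranks pre-multiplier followed by summation.
 * @code{.cpp}
 * int nranks = 0;
 * ncclCommCount(comm, &nranks);
 * float scale = 1.0f / nranks;
 *
 * ncclRedOp_t premulsum;
 * ncclRedOpCreatePreMulSum(&premulsum, &scale, ncclFloat32, ncclScalarHostImmediate, comm);
 *
 * ncclAllReduce(d_in, d_out, count, ncclFloat32, premulsum, comm, stream);
 * hipStreamSynchronize(stream);
 *
 * // Safe once the last RCCL call that used the operator has returned.
 * ncclRedOpDestroy(premulsum, comm);
 * @endcode */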
519
520/*! @defgroup rccl_collective_api Collective Communication Operations
521 @brief Collective communication operations must be called separately for each
522 communicator in a communicator clique.
523
524 They return when operations have been enqueued on the HIP stream.
525 Since they may perform inter-CPU synchronization, each call has to be done
526 from a different thread or process, or they need to use Group Semantics (see
527 below).
528 @{ */
529
530/*! @brief Reduce
531 @details Reduces data arrays of length *count* in *sendbuff* into *recvbuff* using *op*
532 operation.
533 *recvbuff* may be NULL on all calls except for root device.
534 *root* is the rank (not the HIP device) where data will reside after the
535 operation is complete.
536 In-place operation will happen if sendbuff == recvbuff.
537 @return Result code. See @ref rccl_result_code for more details.
538
539 @param[in] sendbuff Local device data buffer to be reduced
540 @param[out] recvbuff Data buffer where result is stored (only for *root* rank). May be
541 null for other ranks.
542 @param[in] count Number of elements in every send buffer
543 @param[in] datatype Data buffer element datatype
544 @param[in] op Reduction operator type
545 @param[in] root Rank where result data array will be stored
546 @param[in] comm Communicator group object to execute on
547 @param[in] stream HIP stream to execute collective on */
548ncclResult_t
549ncclReduce(const void* sendbuff,
550 void* recvbuff,
551 size_t count,
552 ncclDataType_t datatype,
553 ncclRedOp_t op,
554 int root,
555 ncclComm_t comm,
556 hipStream_t stream);
557/*! @cond include_hidden */
558ncclResult_t
559pncclReduce(const void* sendbuff,
560 void* recvbuff,
561 size_t count,
562 ncclDataType_t datatype,
563 ncclRedOp_t op,
564 int root,
565 ncclComm_t comm,
566 hipStream_t stream);
567/*! @endcond */
568
569/*! @brief (Deprecated) Broadcast (in-place)
570 @details Copies *count* values from *root* to all other devices.
571 root is the rank (not the CUDA device) where data resides before the
572 operation is started.
573 This operation is implicitly in-place.
574 @return Result code. See @ref rccl_result_code for more details.
575
576 @param[in,out] buff Input array on *root* to be copied to other ranks. Output array for
577 all ranks.
578 @param[in] count Number of elements in data buffer
579 @param[in] datatype Data buffer element datatype
580 @param[in] root Rank owning buffer to be copied to others
581 @param[in] comm Communicator group object to execute on
582 @param[in] stream HIP stream to execute collective on */
583ncclResult_t
584ncclBcast(void* buff,
585 size_t count,
586 ncclDataType_t datatype,
587 int root,
588 ncclComm_t comm,
589 hipStream_t stream);
590/*! @cond include_hidden */
591ncclResult_t
592pncclBcast(void* buff,
593 size_t count,
594 ncclDataType_t datatype,
595 int root,
596 ncclComm_t comm,
597 hipStream_t stream);
598/*! @endcond */
599
600/*! @brief Broadcast
601 @details Copies *count* values from *sendbuff* on *root* to *recvbuff* on all devices.
602 *root* is the rank (not the HIP device) where data resides before the operation is
603 started. *sendbuff* may be NULL on ranks other than *root*. In-place operation will happen if
604 *sendbuff* == *recvbuff*.
605 @return Result code. See @ref rccl_result_code for more details.
606
607 @param[in] sendbuff Data array to copy (if *root*). May be NULL for other ranks
608 @param[out] recvbuff Data array to store received array
609 @param[in] count Number of elements in data buffer
610 @param[in] datatype Data buffer element datatype
611 @param[in] root Rank of broadcast root
612 @param[in] comm Communicator group object to execute on
613 @param[in] stream HIP stream to execute collective on */
614ncclResult_t
615ncclBroadcast(const void* sendbuff,
616 void* recvbuff,
617 size_t count,
618 ncclDataType_t datatype,
619 int root,
620 ncclComm_t comm,
621 hipStream_t stream);
622/*! @cond include_hidden */
623ncclResult_t
624pncclBroadcast(const void* sendbuff,
625 void* recvbuff,
626 size_t count,
627 ncclDataType_t datatype,
628 int root,
629 ncclComm_t comm,
630 hipStream_t stream);
631/*! @endcond */
632
633/*! @brief All-Reduce
634 @details Reduces data arrays of length *count* in *sendbuff* using *op* operation, and
635 leaves identical copies of result on each *recvbuff*.
636 In-place operation will happen if sendbuff == recvbuff.
637 @return Result code. See @ref rccl_result_code for more details.
638
639 @param[in] sendbuff Input data array to reduce
640 @param[out] recvbuff Data array to store reduced result array
641 @param[in] count Number of elements in data buffer
642 @param[in] datatype Data buffer element datatype
643 @param[in] op Reduction operator
644 @param[in] comm Communicator group object to execute on
645 @param[in] stream HIP stream to execute collective on */
646ncclResult_t
647ncclAllReduce(const void* sendbuff,
648 void* recvbuff,
649 size_t count,
650 ncclDataType_t datatype,
651 ncclRedOp_t op,
652 ncclComm_t comm,
653 hipStream_t stream);
654/*! @cond include_hidden */
655ncclResult_t
656pncclAllReduce(const void* sendbuff,
657 void* recvbuff,
658 size_t count,
659 ncclDataType_t datatype,
660 ncclRedOp_t op,
661 ncclComm_t comm,
662 hipStream_t stream);
663/*! @endcond */
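/* Usage sketch (illustrative only; assumes an initialized `comm`, a `stream`, and an
 * element count `N`): a float sum all-reduce issued by one rank.
 * @code{.cpp}
 * float* d_send = nullptr;
 * float* d_recv = nullptr;
 * hipMalloc((void**) &d_send, N * sizeof(float));
 * hipMalloc((void**) &d_recv, N * sizeof(float));
 * // ... fill d_send with this rank's contribution ...
 *
 * ncclAllReduce(d_send, d_recv, N, ncclFloat32, ncclSum, comm, stream);
 *
 * // The call only enqueues work; synchronize before reading d_recv on the host.
 * hipStreamSynchronize(stream);
 * @endcode */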
664
665/*! @brief Reduce-Scatter
666 @details Reduces data in *sendbuff* using *op* operation and leaves reduced result
667 scattered over the devices so that *recvbuff* on rank i will contain the i-th
668 block of the result.
669 Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff*
670 should have a size of at least nranks*recvcount elements.
671 In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
672 @return Result code. See @ref rccl_result_code for more details.
673
674 @param[in] sendbuff Input data array to reduce
675 @param[out] recvbuff Data array to store reduced result subarray
676 @param[in] recvcount Number of elements each rank receives
677 @param[in] datatype Data buffer element datatype
678 @param[in] op Reduction operator
679 @param[in] comm Communicator group object to execute on
680 @param[in] stream HIP stream to execute collective on */
681ncclResult_t
682ncclReduceScatter(const void* sendbuff,
683 void* recvbuff,
684 size_t recvcount,
685 ncclDataType_t datatype,
686 ncclRedOp_t op,
687 ncclComm_t comm,
688 hipStream_t stream);
689/*! @cond include_hidden */
690ncclResult_t
691pncclReduceScatter(const void* sendbuff,
692 void* recvbuff,
693 size_t recvcount,
694 ncclDataType_t datatype,
695 ncclRedOp_t op,
696 ncclComm_t comm,
697 hipStream_t stream);
698/*! @endcond */
699
700/*! @brief All-Gather
701 @details Each device gathers *sendcount* values from other GPUs into *recvbuff*,
702 receiving data from rank i at offset i*sendcount.
703 Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
704 should have a size of at least nranks*sendcount elements.
705 In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
706 @return Result code. See @ref rccl_result_code for more details.
707
708 @param[in] sendbuff Input data array to send
709 @param[out] recvbuff Data array to store the gathered result
710 @param[in] sendcount Number of elements each rank sends
711 @param[in] datatype Data buffer element datatype
712 @param[in] comm Communicator group object to execute on
713 @param[in] stream HIP stream to execute collective on */
714ncclResult_t
715ncclAllGather(const void* sendbuff,
716 void* recvbuff,
717 size_t sendcount,
718 ncclDataType_t datatype,
719 ncclComm_t comm,
720 hipStream_t stream);
721/*! @cond include_hidden */
722ncclResult_t
723pncclAllGather(const void* sendbuff,
724 void* recvbuff,
725 size_t sendcount,
726 ncclDataType_t datatype,
727 ncclComm_t comm,
728 hipStream_t stream);
729/*! @endcond */
730
731/*! @brief Send
732 @details Send data from *sendbuff* to rank *peer*.
733 Rank *peer* needs to call ncclRecv with the same *datatype* and the same *count*
734 as this rank.
735 This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
736 need to progress concurrently to complete, they must be fused within a
737 ncclGroupStart / ncclGroupEnd section.
738 @return Result code. See @ref rccl_result_code for more details.
739
740 @param[in] sendbuff Data array to send
741 @param[in] count Number of elements to send
742 @param[in] datatype Data buffer element datatype
743 @param[in] peer Peer rank to send to
744 @param[in] comm Communicator group object to execute on
745 @param[in] stream HIP stream to execute collective on */
746ncclResult_t
747ncclSend(const void* sendbuff,
748 size_t count,
749 ncclDataType_t datatype,
750 int peer,
751 ncclComm_t comm,
752 hipStream_t stream);
753/*! @cond include_hidden */
754ncclResult_t
755pncclSend(const void* sendbuff,
756 size_t count,
757 ncclDataType_t datatype,
758 int peer,
759 ncclComm_t comm,
760 hipStream_t stream);
761/*! @endcond */
762
763/*! @brief Receive
764 @details Receive data from rank *peer* into *recvbuff*.
765 Rank *peer* needs to call ncclSend with the same datatype and the same count
766 as this rank.
767 This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
768 need to progress concurrently to complete, they must be fused within a
769 ncclGroupStart/ ncclGroupEnd section.
770 @return Result code. See @ref rccl_result_code for more details.
771
772 @param[out] recvbuff Data array to receive
773 @param[in] count Number of elements to receive
774 @param[in] datatype Data buffer element datatype
775 @param[in] peer Peer rank to receive from
776 @param[in] comm Communicator group object to execute on
777 @param[in] stream HIP stream to execute collective on */
778ncclResult_t
779ncclRecv(void* recvbuff,
780 size_t count,
781 ncclDataType_t datatype,
782 int peer,
783 ncclComm_t comm,
784 hipStream_t stream);
785/*! @cond include_hidden */
786ncclResult_t
787pncclRecv(void* recvbuff,
788 size_t count,
789 ncclDataType_t datatype,
790 int peer,
791 ncclComm_t comm,
792 hipStream_t stream);
793/*! @endcond */
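/* Usage sketch (illustrative only; assumes `comm`, `stream`, `rank`, `nranks`, and
 * device buffers `d_send`/`d_recv` of `count` floats): a ring exchange where each
 * rank sends to rank+1 and receives from rank-1. The two calls are grouped so they
 * can progress concurrently instead of deadlocking.
 * @code{.cpp}
 * int next = (rank + 1) % nranks;
 * int prev = (rank + nranks - 1) % nranks;
 *
 * ncclGroupStart();
 * ncclSend(d_send, count, ncclFloat32, next, comm, stream);
 * ncclRecv(d_recv, count, ncclFloat32, prev, comm, stream);
 * ncclGroupEnd();
 *
 * hipStreamSynchronize(stream);
 * @endcode */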
794
795/*! @brief Gather
796 @details Root device gathers *sendcount* values from other GPUs into *recvbuff*,
797 receiving data from rank i at offset i*sendcount.
798 Assumes recvcount is equal to nranks*sendcount, which means that *recvbuff*
799 should have a size of at least nranks*sendcount elements.
800 In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
801 *recvbuff* may be NULL on ranks other than *root*.
802 @return Result code. See @ref rccl_result_code for more details.
803
804 @param[in] sendbuff Data array to send
805 @param[out] recvbuff Data array to receive into on *root*.
806 @param[in] sendcount Number of elements to send per rank
807 @param[in] datatype Data buffer element datatype
808 @param[in] root Rank that receives data from all other ranks
809 @param[in] comm Communicator group object to execute on
810 @param[in] stream HIP stream to execute collective on */
811ncclResult_t
812ncclGather(const void* sendbuff,
813 void* recvbuff,
814 size_t sendcount,
815 ncclDataType_t datatype,
816 int root,
817 ncclComm_t comm,
818 hipStream_t stream);
819/*! @cond include_hidden */
820ncclResult_t
821pncclGather(const void* sendbuff,
822 void* recvbuff,
823 size_t sendcount,
824 ncclDataType_t datatype,
825 int root,
826 ncclComm_t comm,
827 hipStream_t stream);
828/*! @endcond */
829
830/*! @brief Scatter
831 @details Scatters data from *root* over the devices so that recvbuff on rank i will contain the i-th
832 block of the data on root.
833 Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff*
834 should have a size of at least nranks*recvcount elements.
835 In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
836 @return Result code. See @ref rccl_result_code for more details.
837
838 @param[in] sendbuff Data array to send (on *root* rank). May be NULL on other ranks.
839 @param[out] recvbuff Data array to receive partial subarray into
840 @param[in] recvcount Number of elements to receive per rank
841 @param[in] datatype Data buffer element datatype
842 @param[in] root Rank that scatters data to all other ranks
843 @param[in] comm Communicator group object to execute on
844 @param[in] stream HIP stream to execute collective on */
845ncclResult_t
846ncclScatter(const void* sendbuff,
847 void* recvbuff,
848 size_t recvcount,
849 ncclDataType_t datatype,
850 int root,
851 ncclComm_t comm,
852 hipStream_t stream);
853/*! @cond include_hidden */
854ncclResult_t
855pncclScatter(const void* sendbuff,
856 void* recvbuff,
857 size_t recvcount,
858 ncclDataType_t datatype,
859 int root,
860 ncclComm_t comm,
861 hipStream_t stream);
862/*! @endcond */
863
864/*! @brief All-To-All
865 @details Device (i) sends the (j)th block of data to device (j), where it is placed as the (i)th
866 block. Each block for sending/receiving has *count* elements, which means
867 that *recvbuff* and *sendbuff* should have a size of nranks*count elements.
868 In-place operation is NOT supported. It is the user's responsibility
869 to ensure that sendbuff and recvbuff are distinct.
870 @return Result code. See @ref rccl_result_code for more details.
871
872 @param[in] sendbuff Data array to send (contains blocks for each other rank)
873 @param[out] recvbuff Data array to receive (contains blocks from each other rank)
874 @param[in] count Number of elements to send between each pair of ranks
875 @param[in] datatype Data buffer element datatype
876 @param[in] comm Communicator group object to execute on
877 @param[in] stream HIP stream to execute collective on */
878ncclResult_t
879ncclAllToAll(const void* sendbuff,
880 void* recvbuff,
881 size_t count,
882 ncclDataType_t datatype,
883 ncclComm_t comm,
884 hipStream_t stream);
885/*! @cond include_hidden */
886ncclResult_t
887pncclAllToAll(const void* sendbuff,
888 void* recvbuff,
889 size_t count,
890 ncclDataType_t datatype,
891 ncclComm_t comm,
892 hipStream_t stream);
893/*! @endcond */
894
895/*! @brief All-To-Allv
896 @details Device (i) sends sendcounts[j] of data from offset sdispls[j]
897 to device (j). At the same time, device (i) receives recvcounts[j] of data
898 from device (j) to be placed at rdispls[j].
899 sendcounts, sdispls, recvcounts and rdispls are all measured in the units
900 of datatype, not bytes.
901 In-place operation will happen if sendbuff == recvbuff.
902 @return Result code. See @ref rccl_result_code for more details.
903
904 @param[in] sendbuff Data array to send (contains blocks for each other rank)
905 @param[in] sendcounts Array containing number of elements to send to each participating rank
906 @param[in] sdispls Array of offsets into *sendbuff* for each participating rank
907 @param[out] recvbuff Data array to receive (contains blocks from each other rank)
908 @param[in] recvcounts Array containing number of elements to receive from each participating
909 rank
910 @param[in] rdispls Array of offsets into *recvbuff* for each participating rank
911 @param[in] datatype Data buffer element datatype
912 @param[in] comm Communicator group object to execute on
913 @param[in] stream HIP stream to execute collective on */
914ncclResult_t
915ncclAllToAllv(const void* sendbuff,
916 const size_t sendcounts[],
917 const size_t sdispls[],
918 void* recvbuff,
919 const size_t recvcounts[],
920 const size_t rdispls[],
921 ncclDataType_t datatype,
922 ncclComm_t comm,
923 hipStream_t stream);
924/*! @cond include_hidden */
925ncclResult_t
926pncclAllToAllv(const void* sendbuff,
927 const size_t sendcounts[],
928 const size_t sdispls[],
929 void* recvbuff,
930 const size_t recvcounts[],
931 const size_t rdispls[],
932 ncclDataType_t datatype,
933 ncclComm_t comm,
934 hipStream_t stream);
935/*! @endcond */
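/* Usage sketch (illustrative only; assumes <vector>, plus `comm`, `stream`, `nranks`,
 * and device buffers `d_send`/`d_recv` of nranks*block floats): a uniform
 * ncclAllToAllv that is equivalent to ncclAllToAll with `block` elements per peer.
 * @code{.cpp}
 * std::vector<size_t> counts(nranks, block);
 * std::vector<size_t> displs(nranks);
 * for (int r = 0; r < nranks; ++r)
 *     displs[r] = static_cast<size_t>(r) * block; // offsets in elements, not bytes
 *
 * ncclAllToAllv(d_send, counts.data(), displs.data(),
 *               d_recv, counts.data(), displs.data(),
 *               ncclFloat32, comm, stream);
 * @endcode */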
936
937/*! @} */
938
939/*! @defgroup msccl_api MSCCL Algorithm
940 @brief API calls relating to the optional MSCCL algorithm datapath
941 @{ */
942
943/*! @brief Opaque handle to MSCCL algorithm */
944typedef int mscclAlgoHandle_t;
945
946/*! @brief MSCCL Load Algorithm
947 @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return
948 its handle via mscclAlgoHandle. This API is expected to be called by MSCCL
949 scheduler instead of end users.
950 @return Result code. See @ref rccl_result_code for more details.
951
952 @param[in] mscclAlgoFilePath Path to MSCCL algorithm file
953 @param[out] mscclAlgoHandle Returned handle to MSCCL algorithm
954 @param[in] rank Current rank */
955ncclResult_t
956mscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank);
957/*! @cond include_hidden */
958ncclResult_t
959pmscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank);
960/*! @endcond */
961
962/*! @brief MSCCL Run Algorithm
963 @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter
964 list merges all possible parameters required by different operations as this
965 is a general-purpose API. This API is expected to be called by MSCCL
966 scheduler instead of end users.
967 @return Result code. See @ref rccl_result_code for more details.
968
969 @param[in] sendBuff Data array to send
970 @param[in] sendCounts Array containing number of elements to send to each participating
971 rank
972 @param[in] sDisPls Array of offsets into *sendbuff* for each participating rank
973 @param[out] recvBuff Data array to receive
974 @param[in] recvCounts Array containing number of elements to receive from each
975 participating rank
976 @param[in] rDisPls Array of offsets into *recvbuff* for each participating rank
977 @param[in] count Number of elements
978 @param[in] dataType Data buffer element datatype
979 @param[in] root Root rank index
980 @param[in] peer Peer rank index
981 @param[in] op Reduction operator
982 @param[in] mscclAlgoHandle Handle to MSCCL algorithm
983 @param[in] comm Communicator group object to execute on
984 @param[in] stream HIP stream to execute collective on */
985ncclResult_t
986mscclRunAlgo(const void* sendBuff,
987 const size_t sendCounts[],
988 const size_t sDisPls[],
989 void* recvBuff,
990 const size_t recvCounts[],
991 const size_t rDisPls[],
992 size_t count,
993 ncclDataType_t dataType,
994 int root,
995 int peer,
996 ncclRedOp_t op,
997 mscclAlgoHandle_t mscclAlgoHandle,
998 ncclComm_t comm,
999 hipStream_t stream);
1000/*! @cond include_hidden */
1001ncclResult_t
1002pmscclRunAlgo(const void* sendBuff,
1003 const size_t sendCounts[],
1004 const size_t sDisPls[],
1005 void* recvBuff,
1006 const size_t recvCounts[],
1007 const size_t rDisPls[],
1008 size_t count,
1009 ncclDataType_t dataType,
1010 int root,
1011 int peer,
1012 ncclRedOp_t op,
1013 mscclAlgoHandle_t mscclAlgoHandle,
1014 ncclComm_t comm,
1015 hipStream_t stream);
1016/*! @endcond */
1017
1018/*! @brief MSCCL Unload Algorithm
1019 @deprecated This function has been removed from the public API.
1020 @details Unload MSCCL algorithm previously loaded using its handle. This API
1021 is expected to be called by MSCCL scheduler instead of end users.
1022 @return Result code. See @ref rccl_result_code for more details.
1023
1024 @param[in] mscclAlgoHandle Handle to MSCCL algorithm to unload
1025*/
1026ncclResult_t
1027mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
1028/*! @cond include_hidden */
1029ncclResult_t
1030pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
1031/*! @endcond */
1032/*! @} */
1033
1034/*! @defgroup rccl_group_api Group semantics
1035 @brief When managing multiple GPUs from a single thread, and since RCCL collective
1036 calls may perform inter-CPU synchronization, we need to "group" calls for
1037 different ranks/devices into a single call.
1038
1039 Grouping RCCL calls as being part of the same collective operation is done
1040 using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
1041 collective calls until the ncclGroupEnd call, which will wait for all calls
1042 to be complete. Note that for collective communication, ncclGroupEnd only
1043 guarantees that the operations are enqueued on the streams, not that
1044 the operation is effectively done.
1045
1046 Both collective communication and ncclCommInitRank can be used in conjunction
1047 with ncclGroupStart/ncclGroupEnd, but not together.
1048
1049 Group semantics also allow fusing multiple operations on the same device
1050 to improve performance (for aggregated collective calls), or to permit
1051 concurrent progress of multiple send/receive operations.
1052 @{ */
1053
1054/*! @brief Group Start
1055 @details Start a group call. All calls to RCCL until ncclGroupEnd will be fused into
1056 a single RCCL operation. Nothing will be started on the HIP stream until
1057 ncclGroupEnd.
1058 @return Result code. See @ref rccl_result_code for more details. */
1059ncclResult_t
1060ncclGroupStart();
1061/*! @cond include_hidden */
1062ncclResult_t
1063pncclGroupStart();
1064/*! @endcond */
1065
1066/*! @brief Group End
1067 @details End a group call. Start a fused RCCL operation consisting of all calls since
1068 ncclGroupStart. Operations on the HIP stream depending on the RCCL operations
1069 need to be called after ncclGroupEnd.
1070 @return Result code. See @ref rccl_result_code for more details. */
1071ncclResult_t
1072ncclGroupEnd();
1073/*! @cond include_hidden */
1074ncclResult_t
1075pncclGroupEnd();
1076/*! @endcond */
1077/*! @} */
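/* Usage sketch (illustrative only; assumes `comms`, `streams`, `sendbufs`, `recvbufs`
 * arrays of size `ndev`, e.g. from ncclCommInitAll, with comms[i] bound to device i):
 * driving several ranks from a single thread requires grouping the calls.
 * @code{.cpp}
 * ncclGroupStart();
 * for (int i = 0; i < ndev; ++i)
 *     ncclAllReduce(sendbufs[i], recvbufs[i], N, ncclFloat32, ncclSum, comms[i], streams[i]);
 * ncclGroupEnd(); // work is now enqueued on every stream, not necessarily finished
 *
 * for (int i = 0; i < ndev; ++i)
 * {
 *     hipSetDevice(i);
 *     hipStreamSynchronize(streams[i]);
 * }
 * @endcode */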
1078
1079#ifdef __cplusplus
1080} // end extern "C"
1081#endif
1082
1083#endif // end include guard