Collective routines#
ROCSHMEM_BARRIER_ALL#
-
__device__ void rocshmem_barrier_all()#
-
__device__ void rocshmem_barrier_all_wave()#
-
__device__ void rocshmem_barrier_all_wg()#
- Returns:
None.
Description: This routine performs a collective barrier across all PEs in the system. The caller is blocked until the barrier is resolved and all updates local and remote are completed. These APIs should be called from only one thread/wavefront/workgroup within the grid to avoid undefined behavior.
ROCSHMEM_BARRIER#
-
__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team)#
-
__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team)#
-
__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team)#
- Parameters:
ctx – Context with which to perform this operation.
- Returns:
None.
Description: This routine performs a collective barrier between all PEs in the system. The caller is blocked until the barrier is resolved.
ROCSHMEM_TEAM_SYNC#
-
__device__ void rocshmem_ctx_sync(rocshmem_ctx_t ctx, rocshmem_team_t team)#
-
__device__ void rocshmem_ctx_sync_wave(rocshmem_ctx_t ctx, rocshmem_team_t team)#
-
__device__ void rocshmem_ctx_sync_wg(rocshmem_ctx_t ctx, rocshmem_team_t team)#
- Parameters:
ctx – Context with which to perform this operation.
team – Team with which to perform this operation.
- Returns:
None.
Description: This routine registers the arrival of a PE at a barrier. The caller is blocked until the synchronization is resolved.
Unlike the shmem_barrier_all
routine, shmem_team_sync
only ensures the
completion and visibility of previously issued memory stores, but does not
ensure the completion of remote memory updates issued via OpenSHMEM routines.
ROCSHMEM_SYNC_ALL#
-
__device__ void rocshmem_sync_all()#
-
__device__ void rocshmem_sync_all_wave()#
-
__device__ void rocshmem_sync_all_wg()#
- Returns:
None.
Description:
These routines behaves the same way as rocshmem_team_sync_*
when called on the world team.
These APIs should be called from only one thread/wavefront/workgroup within the grid to avoid undefined behavior.
ROSHMEM_ALLTOALL#
-
__device__ void rocshmem_ctx_TYPENAME_alltoall_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, TYPE *dest, const TYPE *source, int nelems)#
- Parameters:
team – The team participating in the collective.
dest – Destination address. Must be an address on the symmetric heap.
source – Source address. Must be an address on the symmetric heap.
nelems – Number of data blocks transferred per pair of PEs.
- Returns:
None.
Description: This routine exchanges a fixed amount of contiguous data blocks between all pairs of PEs participating in the collective routine. This function must be called as a work-group collective.
Valid TYPENAME
and TYPE
values are listed in RMA Data Types.
ROCSHMEM_BROADCAST#
-
__device__ void rocshmem_ctx_TYPENAME_broadcast_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, TYPE *dest, const TYPE *source, int nelems, int pe_root)#
- Parameters:
ctx – Context with which to perform this collective.
team – The team participating in the collective.
dest – Destination address. Must be an address on the symmetric heap.
source – Source address. Must be an address on the symmetric heap.
nelems – Number of data blocks transferred per pair of PEs.
- Returns:
None.
Description: This routine performs a broadcast across PEs in the team. The caller is blocked until the broadcast completes.
Valid TYPENAME
and TYPE
values are listed in RMA Data Types.
ROCSHMEM_FCOLLECT#
-
__device__ void rocshmem_ctx_TYPENAME_fcollect_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, TYPE *dest, const TYPE *source, int nelems)#
- Parameters:
ctx – Context with which to perform this collective.
team – The team participating in the collective.
dest – Destination address. Must be an address on the symmetric heap.
source – Source address. Must be an address on the symmetric heap.
nelems – Number of data blocks transferred per pair of PEs.
- Returns:
None.
Description: This routine concatenates blocks of data from multiple PEs to an array in every PE participating in the collective routine.
ROCSHMEM_REDUCTION#
-
__device__ int rocshmem_ctx_TYPENAME_OPNAME_reduce_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, TYPE *dest, const TYPE *source, int nreduce)#
- Parameters:
ctx – Context with which to perform this collective.
team – The team participating in the collective.
dest – Destination address. Must be an address on the symmetric heap.
source – Source address. Must be an address on the symmetric heap.
nreduce – Number of data blocks transferred per pair of PEs.
- Returns:
Zero on successful local completion. Nonzero otherwise.
Description: This routine performs an allreduce operation across PEs in the team.
Valid TYPENAME
, TYPE
, and OPNAME
values are listed in Reduction Types, Names and Operations.
Supported reduction types and operations#
TYPE |
TYPENAME |
OPNAME |
Supported |
---|---|---|---|
char |
char |
max, min, sum, prod |
No |
signed char |
schar |
max, min, sum, prod |
No |
short |
short |
max, min, sum, prod |
Yes |
int |
int |
max, min, sum, prod |
Yes |
long |
long |
max, min, sum, prod |
Yes |
long long |
longlong |
max, min, sum, prod |
Yes |
ptrdiff_t |
ptrdiff |
max, min, sum, prod |
No |
unsigned char |
uchar |
and, or, xor, max, min, sum, prod |
No |
unsigned short |
ushort |
and, or, xor, max, min, sum, prod |
No |
unsigned int |
uint |
and, or, xor, max, min, sum, prod |
No |
unsigned long |
ulong |
and, or, xor, max, min, sum, prod |
No |
unsigned long long |
ulonglong |
and, or, xor, max, min, sum, prod |
No |
int8_t |
int8 |
and, or, xor, max, min, sum, prod |
No |
int16_t |
int16 |
and, or, xor, max, min, sum, prod |
No |
int32_t |
int32 |
and, or, xor, max, min, sum, prod |
No |
int64_t |
int64 |
and, or, xor, max, min, sum, prod |
No |
uint8_t |
uint8 |
and, or, xor, max, min, sum, prod |
No |
uint16_t |
uint16 |
and, or, xor, max, min, sum, prod |
No |
uint32_t |
uint32 |
and, or, xor, max, min, sum, prod |
No |
uint64_t |
uint64 |
and, or, xor, max, min, sum, prod |
No |
size_t |
size |
and, or, xor, max, min, sum, prod |
No |
float |
float |
max, min, sum, prod |
Yes |
double |
double |
max, min, sum, prod |
Yes |
long double |
longdouble |
max, min, sum, prod |
No |
double _Complex |
complexd |
sum, prod |
No |
float _Complex |
complexf |
sum, prod |
No |