Collective routines#
ROCSHMEM_BARRIER_ALL#
-
__device__ void rocshmem_ctx_wg_barrier_all(rocshmem_ctx_t ctx)#
-
__device__ void rocshmem_wg_barrier_all()#
- Parameters:
ctx – Context with which to perform this operation.
- Returns:
None.
Description: This routine performs a collective barrier between all PEs in the system. The caller is blocked until the barrier is resolved.
ROCSHMEM_TEAM_SYNC#
-
__device__ void rocshmem_ctx_wg_team_sync(rocshmem_ctx_t ctx, rocshmem_team_t team)#
-
__device__ void rocshmem_wg_team_sync(rocshmem_team_t team)#
- Parameters:
ctx – Context with which to perform this operation.
team – Team with which to perform this operation.
- Returns:
None.
Description: This routine registers the arrival of a PE at a barrier. The caller is blocked until the synchronization is resolved.
Unlike the rocshmem_wg_barrier_all routine, rocshmem_wg_team_sync only ensures the
completion and visibility of previously issued memory stores; it does not
ensure the completion of remote memory updates issued via rocSHMEM routines.
ROCSHMEM_SYNC_ALL#
-
__device__ void rocshmem_ctx_wg_sync_all(rocshmem_ctx_t ctx)#
-
__device__ void rocshmem_wg_sync_all()#
- Parameters:
ctx – Context with which to perform this operation.
- Returns:
None.
Description:
This routine behaves the same as rocshmem_wg_team_sync
when called on the world team.
ROCSHMEM_ALLTOALL#
-
__device__ void rocshmem_ctx_TYPENAME_wg_alltoall(rocshmem_ctx_t ctx, rocshmem_team_t team, TYPE *dest, const TYPE *source, int nelems)#
- Parameters:
ctx – Context with which to perform this collective.
team – The team participating in the collective.
dest – Destination address. Must be an address on the symmetric heap.
source – Source address. Must be an address on the symmetric heap.
nelems – Number of data blocks transferred per pair of PEs.
- Returns:
None.
Description: This routine exchanges a fixed amount of contiguous data blocks between all pairs of PEs participating in the collective routine. This function must be called as a work-group collective.
Valid TYPENAME
and TYPE
values are listed in RMA Data Types.
ROCSHMEM_BROADCAST#
-
__device__ void rocshmem_ctx_TYPENAME_wg_broadcast(rocshmem_ctx_t ctx, rocshmem_team_t team, TYPE *dest, const TYPE *source, int nelems, int pe_root)#
- Parameters:
ctx – Context with which to perform this collective.
team – The team participating in the collective.
dest – Destination address. Must be an address on the symmetric heap.
source – Source address. Must be an address on the symmetric heap.
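nelems – Number of elements in the dest and source arrays.
pe_root – Zero-based PE number, relative to the team, of the root PE whose source data is broadcast.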
- Returns:
None.
Description: This routine performs a broadcast across PEs in the team. The caller is blocked until the broadcast completes.
Valid TYPENAME
and TYPE
values are listed in RMA Data Types.
ROCSHMEM_FCOLLECT#
-
__device__ void rocshmem_ctx_TYPENAME_wg_fcollect(rocshmem_ctx_t ctx, rocshmem_team_t team, TYPE *dest, const TYPE *source, int nelems)#
- Parameters:
ctx – Context with which to perform this collective.
team – The team participating in the collective.
dest – Destination address. Must be an address on the symmetric heap.
source – Source address. Must be an address on the symmetric heap.
nelems – Number of elements in the source array contributed by each PE.
- Returns:
None.
Description: This routine concatenates blocks of data from multiple PEs to an array in every PE participating in the collective routine.
ROCSHMEM_REDUCTION#
-
__device__ int rocshmem_ctx_TYPENAME_OPNAME_wg_reduce(rocshmem_ctx_t ctx, rocshmem_team_t team, TYPE *dest, const TYPE *source, int nreduce)#
- Parameters:
ctx – Context with which to perform this collective.
team – The team participating in the collective.
dest – Destination address. Must be an address on the symmetric heap.
source – Source address. Must be an address on the symmetric heap.
nreduce – Number of elements in the dest and source arrays to be reduced.
- Returns:
Zero on successful local completion. Nonzero otherwise.
Description: This routine performs an allreduce operation across PEs in the team.
Valid TYPENAME
, TYPE
, and OPNAME
values are listed in Reduction Types, Names and Operations.
Supported reduction types and operations#
TYPE |
TYPENAME |
OPNAME |
Supported |
---|---|---|---|
char |
char |
max, min, sum, prod |
No |
signed char |
schar |
max, min, sum, prod |
No |
short |
short |
max, min, sum, prod |
Yes |
int |
int |
max, min, sum, prod |
Yes |
long |
long |
max, min, sum, prod |
Yes |
long long |
longlong |
max, min, sum, prod |
Yes |
ptrdiff_t |
ptrdiff |
max, min, sum, prod |
No |
unsigned char |
uchar |
and, or, xor, max, min, sum, prod |
No |
unsigned short |
ushort |
and, or, xor, max, min, sum, prod |
No |
unsigned int |
uint |
and, or, xor, max, min, sum, prod |
No |
unsigned long |
ulong |
and, or, xor, max, min, sum, prod |
No |
unsigned long long |
ulonglong |
and, or, xor, max, min, sum, prod |
No |
int8_t |
int8 |
and, or, xor, max, min, sum, prod |
No |
int16_t |
int16 |
and, or, xor, max, min, sum, prod |
No |
int32_t |
int32 |
and, or, xor, max, min, sum, prod |
No |
int64_t |
int64 |
and, or, xor, max, min, sum, prod |
No |
uint8_t |
uint8 |
and, or, xor, max, min, sum, prod |
No |
uint16_t |
uint16 |
and, or, xor, max, min, sum, prod |
No |
uint32_t |
uint32 |
and, or, xor, max, min, sum, prod |
No |
uint64_t |
uint64 |
and, or, xor, max, min, sum, prod |
No |
size_t |
size |
and, or, xor, max, min, sum, prod |
No |
float |
float |
max, min, sum, prod |
Yes |
double |
double |
max, min, sum, prod |
Yes |
long double |
longdouble |
max, min, sum, prod |
No |
double _Complex |
complexd |
sum, prod |
No |
float _Complex |
complexf |
sum, prod |
No |