Cooperative Groups#
Modules | |
Construct functions of Cooperative groups | |
User-exposed API of Cooperative groups | |
Namespaces | |
namespace | cooperative_groups::impl |
Functions | |
cooperative_groups::thread_group::thread_group (internal::group_type type, uint32_t size=static_cast< uint64_t >(0), uint64_t mask=static_cast< uint64_t >(0)) | |
uint32_t | cooperative_groups::thread_group::size () const |
unsigned int | cooperative_groups::thread_group::cg_type () const |
Returns the type of the group. | |
uint32_t | cooperative_groups::thread_group::thread_rank () const |
Rank of the calling thread within [0, size() ). | |
bool | cooperative_groups::thread_group::is_valid () const |
Returns true if the group has not violated any API constraints. | |
void | cooperative_groups::thread_group::sync () const |
Synchronizes the threads in the group. | |
cooperative_groups::multi_grid_group::multi_grid_group (uint32_t size) | |
Construct mutli-grid thread group (through the API this_multi_grid()) | |
uint32_t | cooperative_groups::multi_grid_group::num_grids () |
uint32_t | cooperative_groups::multi_grid_group::grid_rank () |
uint32_t | cooperative_groups::multi_grid_group::thread_rank () const |
Rank of the calling thread within [0, size() ). | |
bool | cooperative_groups::multi_grid_group::is_valid () const |
Returns true if the group has not violated any API constraints. | |
void | cooperative_groups::multi_grid_group::sync () const |
Synchronizes the threads in the group. | |
cooperative_groups::grid_group::grid_group (uint32_t size) | |
Construct grid thread group (through the API this_grid()) | |
uint32_t | cooperative_groups::grid_group::thread_rank () const |
Rank of the calling thread within [0, size() ). | |
bool | cooperative_groups::grid_group::is_valid () const |
Returns true if the group has not violated any API constraints. | |
void | cooperative_groups::grid_group::sync () const |
Synchronizes the threads in the group. | |
dim3 | cooperative_groups::grid_group::group_dim () const |
cooperative_groups::thread_block::thread_block (uint32_t size) | |
thread_group | cooperative_groups::thread_block::new_tiled_group (unsigned int tile_size) const |
static dim3 | cooperative_groups::thread_block::group_index () |
Returns 3-dimensional block index within the grid. | |
static dim3 | cooperative_groups::thread_block::thread_index () |
Returns 3-dimensional thread index within the block. | |
static uint32_t | cooperative_groups::thread_block::thread_rank () |
Rank of the calling thread within [0, size() ). | |
static uint32_t | cooperative_groups::thread_block::size () |
static bool | cooperative_groups::thread_block::is_valid () |
Returns true if the group has not violated any API constraints. | |
static void | cooperative_groups::thread_block::sync () |
Synchronizes the threads in the group. | |
dim3 | cooperative_groups::thread_block::group_dim () |
Returns the group dimensions. | |
cooperative_groups::tiled_group::tiled_group (unsigned int tileSize) | |
unsigned int | cooperative_groups::tiled_group::size () const |
unsigned int | cooperative_groups::tiled_group::thread_rank () const |
Rank of the calling thread within [0, size() ). | |
void | cooperative_groups::tiled_group::sync () const |
Synchronizes the threads in the group. | |
cooperative_groups::coalesced_group::coalesced_group (lane_mask member_mask) | |
unsigned int | cooperative_groups::coalesced_group::size () const |
unsigned int | cooperative_groups::coalesced_group::thread_rank () const |
Rank of the calling thread within [0, size() ). | |
void | cooperative_groups::coalesced_group::sync () const |
Synchronizes the threads in the group. | |
unsigned int | cooperative_groups::coalesced_group::meta_group_rank () const |
unsigned int | cooperative_groups::coalesced_group::meta_group_size () const |
Returns the number of groups created when the parent group was partitioned. | |
template<class T > | |
T | cooperative_groups::coalesced_group::shfl (T var, int srcRank) const |
Shuffle operation on group level. | |
template<class T > | |
T | cooperative_groups::coalesced_group::shfl_down (T var, unsigned int lane_delta) const |
Shuffle down operation on group level. | |
template<class T > | |
T | cooperative_groups::coalesced_group::shfl_up (T var, unsigned int lane_delta) const |
Shuffle up operation on group level. | |
unsigned long long | cooperative_groups::coalesced_group::ballot (int pred) const |
Ballot function on group level. | |
int | cooperative_groups::coalesced_group::any (int pred) const |
Any function on group level. | |
int | cooperative_groups::coalesced_group::all (int pred) const |
All function on group level. | |
template<typename T > | |
unsigned long long | cooperative_groups::coalesced_group::match_any (T value) const |
Match any function on group level. | |
template<typename T > | |
unsigned long long | cooperative_groups::coalesced_group::match_all (T value, int &pred) const |
Match all function on group level. | |
static constexpr unsigned int | cooperative_groups::tile_base< tileSize >::thread_rank () |
Rank of the thread within this tile. | |
static unsigned int | cooperative_groups::tile_base< tileSize >::size () |
Number of threads within this tile. | |
static void | cooperative_groups::thread_block_tile_base< size >::sync () |
template<class T > | |
T | cooperative_groups::thread_block_tile_base< size >::shfl (T var, int srcRank) const |
template<class T > | |
T | cooperative_groups::thread_block_tile_base< size >::shfl_down (T var, unsigned int lane_delta) const |
template<class T > | |
T | cooperative_groups::thread_block_tile_base< size >::shfl_up (T var, unsigned int lane_delta) const |
template<class T > | |
T | cooperative_groups::thread_block_tile_base< size >::shfl_xor (T var, unsigned int laneMask) const |
unsigned long long | cooperative_groups::thread_block_tile_base< size >::ballot (int pred) const |
int | cooperative_groups::thread_block_tile_base< size >::any (int pred) const |
int | cooperative_groups::thread_block_tile_base< size >::all (int pred) const |
template<typename T > | |
unsigned long long | cooperative_groups::thread_block_tile_base< size >::match_any (T value) const |
template<typename T > | |
unsigned long long | cooperative_groups::thread_block_tile_base< size >::match_all (T value, int &pred) const |
static unsigned int | cooperative_groups::parent_group_info< tileSize, ParentCGTy >::meta_group_rank () |
static unsigned int | cooperative_groups::parent_group_info< tileSize, ParentCGTy >::meta_group_size () |
Returns the number of groups created when the parent group was partitioned. | |
cooperative_groups::thread_block_tile_type< tileSize, ParentCGTy >::thread_block_tile_type () | |
cooperative_groups::thread_block_tile_type< tileSize, void >::thread_block_tile_type (unsigned int meta_group_rank, unsigned int meta_group_size) | |
unsigned int | cooperative_groups::thread_block_tile_type< tileSize, void >::meta_group_rank () const |
unsigned int | cooperative_groups::thread_block_tile_type< tileSize, void >::meta_group_size () const |
Returns the number of groups created when the parent group was partitioned. | |
thread_group | cooperative_groups::this_thread () |
thread_group | cooperative_groups::tiled_partition (const thread_block &parent, unsigned int tile_size) |
tiled_group | cooperative_groups::tiled_partition (const tiled_group &parent, unsigned int tile_size) |
coalesced_group | cooperative_groups::tiled_partition (const coalesced_group &parent, unsigned int tile_size) |
Variables | |
uint32_t | cooperative_groups::thread_group::_type |
uint32_t | cooperative_groups::thread_group::_size |
Type of the thread_group. | |
uint64_t | cooperative_groups::thread_group::_mask |
Total number of threads in the tread_group. | |
bool | cooperative_groups::thread_group::_tiled_info::is_tiled |
unsigned int | cooperative_groups::thread_group::_tiled_info::size |
unsigned int | cooperative_groups::thread_group::_tiled_info::meta_group_rank |
unsigned int | cooperative_groups::thread_group::_tiled_info::meta_group_size |
lane_mask | cooperative_groups::thread_group::_coalesced_info::member_mask |
unsigned int | cooperative_groups::thread_group::_coalesced_info::size |
struct _tiled_info | cooperative_groups::thread_group::_coalesced_info::tiled_info |
struct cooperative_groups::thread_group::_coalesced_info | cooperative_groups::thread_group::coalesced_info |
static constexpr unsigned int | cooperative_groups::tile_base< tileSize >::numThreads = tileSize |
Detailed Description
This section describes the cooperative groups functions of HIP runtime API.
The cooperative groups provides flexible thread parallel programming algorithms, threads cooperate and share data to perform collective computations.
- Note
- Cooperative groups feature is implemented on Linux, under development on Microsoft Windows.
Function Documentation
◆ all() [1/2]
|
inline |
All function on group level.
Returns non-zero if a predicate evaluates true for all threads.
- Parameters
-
pred [in] The predicate to evaluate on group threads.
◆ all() [2/2]
|
inline |
◆ any() [1/2]
|
inline |
Any function on group level.
Returns non-zero if a predicate evaluates true for any threads.
- Parameters
-
pred [in] The predicate to evaluate on group threads.
◆ any() [2/2]
|
inline |
◆ ballot() [1/2]
|
inline |
Ballot function on group level.
Returns a bit mask with the Nth bit set to one if the specified predicate evaluates as true on the Nth thread.
- Parameters
-
pred [in] The predicate to evaluate on group threads.
◆ ballot() [2/2]
|
inline |
◆ cg_type()
|
inline |
Returns the type of the group.
◆ coalesced_group()
|
inlineexplicitprotected |
◆ grid_group()
|
inlineexplicitprotected |
Construct grid thread group (through the API this_grid())
◆ grid_rank()
|
inline |
Rank of this invocation. In other words, an ID number within the range [0, num_grids()) of the GPU that kernel is running on.
◆ group_dim() [1/2]
|
inline |
Returns the group dimensions.
◆ group_dim() [2/2]
|
inline |
◆ group_index()
|
inlinestatic |
Returns 3-dimensional block index within the grid.
◆ is_valid() [1/4]
|
inlinestatic |
Returns true if the group has not violated any API constraints.
◆ is_valid() [2/4]
bool cooperative_groups::thread_group::is_valid | ( | ) | const |
Returns true if the group has not violated any API constraints.
◆ is_valid() [3/4]
|
inline |
Returns true if the group has not violated any API constraints.
◆ is_valid() [4/4]
|
inline |
Returns true if the group has not violated any API constraints.
◆ match_all() [1/2]
|
inline |
Match all function on group level.
Returns a bit mask containing a 1-bit for every participating thread if they all have the same value in value
as the caller thread. The predicate pred
is set to true if all participating threads have the same value in value
.
- Parameters
-
value [in] The value to examine on the current thread in group. pred [out] The predicate is set to true if all participating threads in the thread group have the same value.
◆ match_all() [2/2]
|
inline |
◆ match_any() [1/2]
|
inline |
Match any function on group level.
Returns a bit mask containing a 1-bit for every participating thread if that thread has the same value in value
as the caller thread.
- Parameters
-
value [in] The value to examine on the current thread in group.
◆ match_any() [2/2]
|
inline |
◆ meta_group_rank() [1/3]
|
inlinestatic |
Returns the linear rank of the group within the set of tiles partitioned from a parent group (bounded by meta_group_size)
◆ meta_group_rank() [2/3]
|
inline |
Returns the linear rank of the group within the set of tiles partitioned from a parent group (bounded by meta_group_size).
◆ meta_group_rank() [3/3]
|
inline |
Returns the linear rank of the group within the set of tiles partitioned from a parent group (bounded by meta_group_size)
◆ meta_group_size() [1/3]
|
inlinestatic |
Returns the number of groups created when the parent group was partitioned.
◆ meta_group_size() [2/3]
|
inline |
Returns the number of groups created when the parent group was partitioned.
◆ meta_group_size() [3/3]
|
inline |
Returns the number of groups created when the parent group was partitioned.
◆ multi_grid_group()
|
inlineexplicitprotected |
Construct mutli-grid thread group (through the API this_multi_grid())
◆ new_tiled_group()
|
inlineprotected |
◆ num_grids()
|
inline |
Number of invocations participating in this multi-grid group. In other words, the number of GPUs.
◆ shfl() [1/2]
|
inline |
Shuffle operation on group level.
Exchanging variables between threads without use of shared memory. Shuffle operation is a direct copy of var
from srcRank
thread ID of group.
- Template Parameters
-
T The type can be a 32-bit integer or single-precision floating point.
- Parameters
-
var [in] The source variable to copy. Only the srcRank thread ID of group is copied to other threads. srcRank [in] The source thread ID of the group for copy.
◆ shfl() [2/2]
|
inline |
◆ shfl_down() [1/2]
|
inline |
Shuffle down operation on group level.
Exchanging variables between threads without use of shared memory. Shuffle down operation is copy of var
from thread with thread ID of group relative higher with lane_delta
to caller thread ID.
- Template Parameters
-
T The type can be a 32-bit integer or single-precision floating point.
- Parameters
-
var [in] The source variable to copy. lane_delta [in] The lane_delta is the relative thread ID difference between caller thread ID and source of copy thread ID. sourceID = (threadID + lane_delta) % size()
◆ shfl_down() [2/2]
|
inline |
◆ shfl_up() [1/2]
|
inline |
Shuffle up operation on group level.
Exchanging variables between threads without use of shared memory. Shuffle up operation is copy of var
from thread with thread ID of group relative lower with lane_delta
to caller thread ID.
- Template Parameters
-
T The type can be a 32-bit integer or single-precision floating point.
- Parameters
-
var [in] The source variable to copy. lane_delta [in] The lane_delta is the relative thread ID difference between caller thread ID and source of copy thread ID. sourceID = (threadID - lane_delta) % size()
◆ shfl_up() [2/2]
|
inline |
◆ shfl_xor()
|
inline |
◆ size() [1/5]
|
inlinestatic |
Total number of threads in the thread_group, and this serves the purpose for all derived cooperative group types because their size
is directly saved during the construction.
◆ size() [2/5]
|
inlinestatic |
Number of threads within this tile.
◆ size() [3/5]
|
inline |
Total number of threads in the thread_group, and this serves the purpose for all derived cooperative group types because their size
is directly saved during the construction.
◆ size() [4/5]
|
inline |
Total number of threads in the thread_group, and this serves the purpose for all derived cooperative group types because their size
is directly saved during the construction.
◆ size() [5/5]
|
inline |
Total number of threads in the thread_group, and this serves the purpose for all derived cooperative group types because their size
is directly saved during the construction.
◆ sync() [1/7]
|
inlinestatic |
Synchronizes the threads in the group.
Causes all threads in the group to wait at this synchronization point, and for all shared and global memory accesses by the threads to complete, before running synchronization. This guarantees the visibility of accessed data for all threads in the group.
- Note
- There are potential read-after-write (RAW), write-after-read (WAR), or write-after-write (WAW) hazards, when threads in the group access the same addresses in shared or global memory. The data hazards can be avoided with synchronization of the group.
◆ sync() [2/7]
|
inlinestatic |
◆ sync() [3/7]
void cooperative_groups::thread_group::sync | ( | ) | const |
Synchronizes the threads in the group.
Causes all threads in the group to wait at this synchronization point, and for all shared and global memory accesses by the threads to complete, before running synchronization. This guarantees the visibility of accessed data for all threads in the group.
- Note
- There are potential read-after-write (RAW), write-after-read (WAR), or write-after-write (WAW) hazards, when threads in the group access the same addresses in shared or global memory. The data hazards can be avoided with synchronization of the group.
◆ sync() [4/7]
|
inline |
Synchronizes the threads in the group.
Causes all threads in the group to wait at this synchronization point, and for all shared and global memory accesses by the threads to complete, before running synchronization. This guarantees the visibility of accessed data for all threads in the group.
- Note
- There are potential read-after-write (RAW), write-after-read (WAR), or write-after-write (WAW) hazards, when threads in the group access the same addresses in shared or global memory. The data hazards can be avoided with synchronization of the group.
◆ sync() [5/7]
|
inline |
Synchronizes the threads in the group.
Causes all threads in the group to wait at this synchronization point, and for all shared and global memory accesses by the threads to complete, before running synchronization. This guarantees the visibility of accessed data for all threads in the group.
- Note
- There are potential read-after-write (RAW), write-after-read (WAR), or write-after-write (WAW) hazards, when threads in the group access the same addresses in shared or global memory. The data hazards can be avoided with synchronization of the group.
◆ sync() [6/7]
|
inline |
Synchronizes the threads in the group.
Causes all threads in the group to wait at this synchronization point, and for all shared and global memory accesses by the threads to complete, before running synchronization. This guarantees the visibility of accessed data for all threads in the group.
- Note
- There are potential read-after-write (RAW), write-after-read (WAR), or write-after-write (WAW) hazards, when threads in the group access the same addresses in shared or global memory. The data hazards can be avoided with synchronization of the group.
◆ sync() [7/7]
|
inline |
Synchronizes the threads in the group.
Causes all threads in the group to wait at this synchronization point, and for all shared and global memory accesses by the threads to complete, before running synchronization. This guarantees the visibility of accessed data for all threads in the group.
- Note
- There are potential read-after-write (RAW), write-after-read (WAR), or write-after-write (WAW) hazards, when threads in the group access the same addresses in shared or global memory. The data hazards can be avoided with synchronization of the group.
◆ this_thread()
thread_group cooperative_groups::this_thread | ( | ) |
◆ thread_block()
|
inlineexplicitprotected |
◆ thread_block_tile_type() [1/2]
|
inlineprotected |
◆ thread_block_tile_type() [2/2]
|
inlineprotected |
◆ thread_group()
|
inlineprotected |
Lanemask for coalesced and tiled partitioned group types, LSB represents lane 0, and MSB represents lane 63 Construct a thread group, and set thread group type and other essential thread group properties. This generic thread group is directly constructed only when the group is supposed to contain only the calling the thread (through the API - this_thread()
), and in all other cases, this thread group object is a sub-object of some other derived thread group object.
◆ thread_index()
|
inlinestatic |
Returns 3-dimensional thread index within the block.
◆ thread_rank() [1/7]
|
inlinestatic |
Rank of the calling thread within [0, size() ).
◆ thread_rank() [2/7]
|
inlinestaticconstexpr |
Rank of the thread within this tile.
◆ thread_rank() [3/7]
uint32_t cooperative_groups::thread_group::thread_rank | ( | ) | const |
Rank of the calling thread within [0, size() ).
◆ thread_rank() [4/7]
|
inline |
Rank of the calling thread within [0, size() ).
◆ thread_rank() [5/7]
|
inline |
Rank of the calling thread within [0, size() ).
◆ thread_rank() [6/7]
|
inline |
Rank of the calling thread within [0, size() ).
◆ thread_rank() [7/7]
|
inline |
Rank of the calling thread within [0, size() ).
◆ tiled_group()
|
inlineexplicitprotected |
◆ tiled_partition() [1/3]
coalesced_group cooperative_groups::tiled_partition | ( | const coalesced_group & | parent, |
unsigned int | tile_size | ||
) |
◆ tiled_partition() [2/3]
thread_group cooperative_groups::tiled_partition | ( | const thread_block & | parent, |
unsigned int | tile_size | ||
) |
◆ tiled_partition() [3/3]
tiled_group cooperative_groups::tiled_partition | ( | const tiled_group & | parent, |
unsigned int | tile_size | ||
) |
Variable Documentation
◆ _mask
|
protected |
Total number of threads in the tread_group.
◆ _size
|
protected |
Type of the thread_group.
◆ _type
|
protected |
◆ coalesced_info
|
protected |
◆ is_tiled
bool cooperative_groups::thread_group::_tiled_info::is_tiled |
◆ member_mask
lane_mask cooperative_groups::thread_group::_coalesced_info::member_mask |
◆ meta_group_rank
unsigned int cooperative_groups::thread_group::_tiled_info::meta_group_rank |
◆ meta_group_size
unsigned int cooperative_groups::thread_group::_tiled_info::meta_group_size |
◆ numThreads
|
staticconstexprprotected |
◆ size [1/2]
unsigned int cooperative_groups::thread_group::_tiled_info::size |
◆ size [2/2]
unsigned int cooperative_groups::thread_group::_coalesced_info::size |
◆ tiled_info
struct _tiled_info cooperative_groups::thread_group::_coalesced_info::tiled_info |
Friends
◆ binary_partition [1/3]
|
friend |
Binary partition.
This splits the input thread group into two partitions determined by predicate.
- Parameters
-
cgrp [in] The coalesced group for split. pred [in] The predicate used during the group split up.
◆ binary_partition [2/3]
|
friend |
◆ binary_partition [3/3]
|
friend |
◆ coalesced_threads
|
friend |
User-exposed API to create coalesced groups.
A collective operation that groups all active lanes into a new thread group.
- Note
- This function is implemented on Linux and is under development on Microsoft Windows.
◆ this_grid
|
friend |
User-exposed API interface to construct grid cooperative group type object - grid_group
.
Only these friend functions are allowed to construct an object of this class and access its resources.
User is not allowed to construct an object of type grid_group
directly. Instead, they should construct it through this API function.
- Note
- This function is implemented on Linux and is under development on Microsoft Windows.
◆ this_multi_grid
|
friend |
User-exposed API interface to construct grid cooperative group type object - multi_grid_group
.
Only these friend functions are allowed to construct an object of this class and access its resources.
User is not allowed to construct an object of type multi_grid_group
directly. Instead, they should construct it through this API function.
- Note
- This multi-grid cooperative API type is implemented on Linux, under development on Microsoft Windows.
◆ this_thread
|
friend |
◆ this_thread_block
|
friend |
User-exposed API interface to construct workgroup cooperative group type object - thread_block
.
Only these friend functions are allowed to construct an object of thi class and access its resources
User is not allowed to construct an object of type thread_block
directly. Instead, they should construct it through this API function.
- Note
- This function is implemented on Linux and is under development on Microsoft Windows.
◆ thread_block
|
friend |
◆ tiled_partition [1/7]
|
friend |
◆ tiled_partition [2/7]
|
friend |
◆ tiled_partition [3/7]
|
friend |
User-exposed API to partition groups.
A collective operation that partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
◆ tiled_partition [4/7]
|
friend |
User-exposed API to partition groups.
A collective operation that partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
◆ tiled_partition [5/7]
|
friend |
User-exposed API to partition groups.
A collective operation that partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
◆ tiled_partition [6/7]
|
friend |
User-exposed API to partition groups.
A collective operation that partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
◆ tiled_partition [7/7]
|
friend |