Exchange#
-
template<class T, unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
class block_exchange# The
block_exchange
class is a block level parallel primitive which provides methods for rearranging items partitioned across threads in a block.- Overview
The
block_exchange
class supports the following rearrangement methods:Transposing a blocked arrangement to a striped arrangement.
Transposing a striped arrangement to a blocked arrangement.
Transposing a blocked arrangement to a warp-striped arrangement.
Transposing a warp-striped arrangement to a blocked arrangement.
Scattering items to a blocked arrangement.
Scattering items to a striped arrangement.
Data is automatically be padded to ensure zero bank conflicts.
- Examples
In the examples exchange operation is performed on block of 128 threads, using type
int
with 8 items per thread.__global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; ... block_exchange_int b_exchange; b_exchange.blocked_to_striped(items, items, storage); ... }
- Template Parameters:
T – - the input type.
BlockSize – - the number of threads in a block.
ItemsPerThread – - the number of items contributed by each thread.
Public Types
-
using storage_type = storage_type_#
Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive.
Depending on the implementation the operations exposed by parallel primitive may require a temporary storage for thread communication. The storage should be allocated using keywords
. It can be aliased to an externally allocated memory, or be a part of a union type with other storage types to increase shared memory reusability.
Public Functions
-
template<class U>
__device__ inline void blocked_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])# Transposes a blocked arrangement of items to a striped arrangement across the thread block.
- Template Parameters:
U – - [inferred] the output type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
-
template<class U>
__device__ inline void blocked_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)# Transposes a blocked arrangement of items to a striped arrangement across the thread block, using temporary storage.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; ... block_exchange_int b_exchange; b_exchange.blocked_to_striped(items, items, storage); ... }
- Template Parameters:
U – - [inferred] the output type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
storage – [in] - reference to a temporary storage object of type storage_type.
-
template<class U>
__device__ inline void striped_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])# Transposes a striped arrangement of items to a blocked arrangement across the thread block.
- Template Parameters:
U – - [inferred] the output type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
-
template<class U>
__device__ inline void striped_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)# Transposes a striped arrangement of items to a blocked arrangement across the thread block, using temporary storage.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; ... block_exchange_int b_exchange; b_exchange.striped_to_blocked(items, items, storage); ... }
- Template Parameters:
U – - [inferred] the output type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
storage – [in] - reference to a temporary storage object of type storage_type.
-
template<class U>
__device__ inline void blocked_to_warp_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])# Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block.
- Template Parameters:
U – - [inferred] the output type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
-
template<class U>
__device__ inline void blocked_to_warp_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)# Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block, using temporary storage.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; ... block_exchange_int b_exchange; b_exchange.blocked_to_warp_striped(items, items, storage); ... }
- Template Parameters:
U – - [inferred] the output type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
storage – [in] - reference to a temporary storage object of type storage_type.
-
template<class U>
__device__ inline void warp_striped_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])# Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block.
- Template Parameters:
U – - [inferred] the output type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
-
template<class U>
__device__ inline void warp_striped_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)# Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block, using temporary storage.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; ... block_exchange_int b_exchange; b_exchange.warp_striped_to_blocked(items, items, storage); ... }
- Template Parameters:
U – - [inferred] the output type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
storage – [in] - reference to a temporary storage object of type storage_type.
-
template<class U, class Offset>
__device__ inline void scatter_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread])# Scatters items to a blocked arrangement based on their ranks across the thread block.
- Template Parameters:
U – - [inferred] the output type.
Offset – - [inferred] the rank type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
ranks – [out] - array that has rank of data.
-
template<class U, class Offset>
__device__ inline void gather_from_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread])# Gathers items from a striped arrangement based on their ranks across the thread block.
- Template Parameters:
U – - [inferred] the output type.
Offset – - [inferred] the rank type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
ranks – [out] - array that has rank of data.
-
template<class U, class Offset>
__device__ inline void scatter_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread], storage_type &storage)# Scatters items to a blocked arrangement based on their ranks across the thread block, using temporary storage.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; int ranks[8]; ... block_exchange_int b_exchange; b_exchange.scatter_to_blocked(items, items, ranks, storage); ... }
- Template Parameters:
U – - [inferred] the output type.
Offset – - [inferred] the rank type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
ranks – [out] - array that has rank of data.
storage – [in] - reference to a temporary storage object of type storage_type.
-
template<class U, class Offset>
__device__ inline void gather_from_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread], storage_type &storage)# Gathers items from a striped arrangement based on their ranks across the thread block, using temporary storage.
- Template Parameters:
U – - [inferred] the output type.
Offset – - [inferred] the rank type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
ranks – [out] - array that has rank of data.
storage – [in] - reference to a temporary storage object of type storage_type.
-
template<class U, class Offset>
__device__ inline void scatter_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread])# Scatters items to a striped arrangement based on their ranks across the thread block.
- Template Parameters:
U – - [inferred] the output type.
Offset – - [inferred] the rank type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
ranks – [out] - array that has rank of data.
-
template<class U, class Offset>
__device__ inline void scatter_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread], storage_type &storage)# Scatters items to a striped arrangement based on their ranks across the thread block, using temporary storage.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; int ranks[8]; ... block_exchange_int b_exchange; b_exchange.scatter_to_striped(items, items, ranks, storage); ... }
- Template Parameters:
U – - [inferred] the output type.
Offset – - [inferred] the rank type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
ranks – [out] - array that has rank of data.
storage – [in] - reference to a temporary storage object of type storage_type.
-
template<class U, class Offset>
__device__ inline void scatter_to_striped_guarded(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread])# Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank.
- Overview
Items with rank -1 are not scattered.
- Template Parameters:
U – - [inferred] the output type.
Offset – - [inferred] the rank type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
ranks – [in] - array that has rank of data.
-
template<class U, class Offset>
__device__ inline void scatter_to_striped_guarded(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread], storage_type &storage)# Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank, using temporary storage.
- Overview
Items with rank -1 are not scattered.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; int ranks[8]; ... block_exchange_int b_exchange; b_exchange.scatter_to_striped_guarded(items, items, ranks, storage); ... }
- Template Parameters:
U – - [inferred] the output type.
Offset – - [inferred] the rank type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
ranks – [in] - array that has rank of data.
storage – [in] - reference to a temporary storage object of type storage_type.
-
template<class U, class Offset, class ValidFlag>
__device__ inline void scatter_to_striped_flagged(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread], const ValidFlag (&is_valid)[ItemsPerThread])# Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity.
- Template Parameters:
U – - [inferred] the output type.
Offset – - [inferred] the rank type.
ValidFlag – - [inferred] the validity flag type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
ranks – [in] - array that has rank of data.
is_valid – [in] - array that has flags to denote validity.
-
template<class U, class Offset, class ValidFlag>
__device__ inline void scatter_to_striped_flagged(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread], const ValidFlag (&is_valid)[ItemsPerThread], storage_type &storage)# Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity, using temporary storage.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; int ranks[8]; int flags[8]; ... block_exchange_int b_exchange; b_exchange.scatter_to_striped_flagged(items, items, ranks, flags, storage); ... }
- Template Parameters:
U – - [inferred] the output type.
Offset – - [inferred] the rank type.
ValidFlag – - [inferred] the validity flag type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
ranks – [in] - array that has rank of data.
is_valid – [in] - array that has flags to denote validity.
storage – [in] - reference to a temporary storage object of type storage_type.