Exchange#
- 
template<class T, unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1, block_padding_hint PaddingHint = block_padding_hint::avoid_conflicts>
 class block_exchange#
- The - block_exchangeclass is a block level parallel primitive which provides methods for rearranging items partitioned across threads in a block.- Overview
- The - block_exchangeclass supports the following rearrangement methods:- Transposing a blocked arrangement to a striped arrangement. 
- Transposing a striped arrangement to a blocked arrangement. 
- Transposing a blocked arrangement to a warp-striped arrangement. 
- Transposing a warp-striped arrangement to a blocked arrangement. 
- Scattering items to a blocked arrangement. 
- Scattering items to a striped arrangement. 
 
- Data is automatically be padded to ensure zero bank conflicts. 
 
- Examples
- In the examples exchange operation is performed on block of 128 threads, using type - intwith 8 items per thread.- __global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; ... block_exchange_int b_exchange; b_exchange.blocked_to_striped(items, items, storage); ... } 
 - Template Parameters:
- T – - the input type. 
- BlockSize – - the number of threads in a block. 
- ItemsPerThread – - the number of items contributed by each thread. 
- PaddingHint – - a hint that decides when to use padding. May not always be applicable. 
 
 - Public Types - 
using storage_type = storage_type_#
- Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive. - Depending on the implementation the operations exposed by parallel primitive may require a temporary storage for thread communication. The storage should be allocated using keywords - . It can be aliased to an externally allocated memory, or be a part of a union type with other storage types to increase shared memory reusability.
 - Public Functions - 
template<class U>
 __device__ inline void blocked_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])#
- Transposes a blocked arrangement of items to a striped arrangement across the thread block. - Template Parameters:
- U – - [inferred] the output type. 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
 
 
 - 
template<class U>
 __device__ inline void blocked_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)#
- Transposes a blocked arrangement of items to a striped arrangement across the thread block, using temporary storage. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Example.
- __global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; ... block_exchange_int b_exchange; b_exchange.blocked_to_striped(items, items, storage); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
 
 
 - 
template<class U>
 __device__ inline void striped_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])#
- Transposes a striped arrangement of items to a blocked arrangement across the thread block. - Template Parameters:
- U – - [inferred] the output type. 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
 
 
 - 
template<class U>
 __device__ inline void striped_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)#
- Transposes a striped arrangement of items to a blocked arrangement across the thread block, using temporary storage. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Example.
- __global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; ... block_exchange_int b_exchange; b_exchange.striped_to_blocked(items, items, storage); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
 
 
 - 
template<class U>
 __device__ inline void blocked_to_warp_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])#
- Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block. - Template Parameters:
- U – - [inferred] the output type. 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
 
 
 - 
template<class U>
 __device__ inline void blocked_to_warp_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)#
- Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block, using temporary storage. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Example.
- __global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; ... block_exchange_int b_exchange; b_exchange.blocked_to_warp_striped(items, items, storage); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
 
 
 - 
template<class U>
 __device__ inline void warp_striped_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])#
- Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block. - Template Parameters:
- U – - [inferred] the output type. 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
 
 
 - 
template<class U>
 __device__ inline void warp_striped_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)#
- Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block, using temporary storage. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Example.
- __global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; ... block_exchange_int b_exchange; b_exchange.warp_striped_to_blocked(items, items, storage); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
 
 
 - 
template<class U, class Offset>
 __device__ inline void scatter_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread])#
- Scatters items to a blocked arrangement based on their ranks across the thread block. - Template Parameters:
- U – - [inferred] the output type. 
- Offset – - [inferred] the rank type. 
 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- ranks – [out] - array that has rank of data. 
 
 
 - 
template<class U, class Offset>
 __device__ inline void gather_from_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread])#
- Gathers items from a striped arrangement based on their ranks across the thread block. - Template Parameters:
- U – - [inferred] the output type. 
- Offset – - [inferred] the rank type. 
 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- ranks – [out] - array that has rank of data. 
 
 
 - 
template<class U, class Offset>
 __device__ inline void scatter_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread], storage_type &storage)#
- Scatters items to a blocked arrangement based on their ranks across the thread block, using temporary storage. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Example.
- __global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; int ranks[8]; ... block_exchange_int b_exchange; b_exchange.scatter_to_blocked(items, items, ranks, storage); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Offset – - [inferred] the rank type. 
 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- ranks – [out] - array that has rank of data. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
 
 
 - 
template<class U, class Offset>
 __device__ inline void gather_from_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread], storage_type &storage)#
- Gathers items from a striped arrangement based on their ranks across the thread block, using temporary storage. - Template Parameters:
- U – - [inferred] the output type. 
- Offset – - [inferred] the rank type. 
 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- ranks – [out] - array that has rank of data. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
 
 
 - 
template<class U, class Offset>
 __device__ inline void scatter_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread])#
- Scatters items to a striped arrangement based on their ranks across the thread block. - Template Parameters:
- U – - [inferred] the output type. 
- Offset – - [inferred] the rank type. 
 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- ranks – [out] - array that has rank of data. 
 
 
 - 
template<class U, class Offset>
 __device__ inline void scatter_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread], storage_type &storage)#
- Scatters items to a striped arrangement based on their ranks across the thread block, using temporary storage. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Example.
- __global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; int ranks[8]; ... block_exchange_int b_exchange; b_exchange.scatter_to_striped(items, items, ranks, storage); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Offset – - [inferred] the rank type. 
 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- ranks – [out] - array that has rank of data. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
 
 
 - 
template<unsigned int WarpSize = device_warp_size(), class U, class Offset>
 __device__ inline void scatter_to_warp_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread], storage_type &storage)#
- Scatters items to a warp striped arrangement based on their ranks across the thread block, using temporary storage. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Example.
- __global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; int ranks[8]; ... block_exchange_int b_exchange; b_exchange.scatter_to_warp_striped(items, items, ranks, storage); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Offset – - [inferred] the rank type. 
 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- ranks – [out] - array that has rank of data. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
 
 
 - 
template<class U, class Offset>
 __device__ inline void scatter_to_striped_guarded(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread])#
- Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank. - Overview
- Items with rank -1 are not scattered. 
 
 - Template Parameters:
- U – - [inferred] the output type. 
- Offset – - [inferred] the rank type. 
 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- ranks – [in] - array that has rank of data. 
 
 
 - 
template<class U, class Offset>
 __device__ inline void scatter_to_striped_guarded(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread], storage_type &storage)#
- Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank, using temporary storage. - Overview
- Items with rank -1 are not scattered. 
 
- Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Example.
- __global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; int ranks[8]; ... block_exchange_int b_exchange; b_exchange.scatter_to_striped_guarded(items, items, ranks, storage); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Offset – - [inferred] the rank type. 
 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- ranks – [in] - array that has rank of data. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
 
 
 - 
template<class U, class Offset, class ValidFlag>
 __device__ inline void scatter_to_striped_flagged(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread], const ValidFlag (&is_valid)[ItemsPerThread])#
- Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity. - Template Parameters:
- U – - [inferred] the output type. 
- Offset – - [inferred] the rank type. 
- ValidFlag – - [inferred] the validity flag type. 
 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- ranks – [in] - array that has rank of data. 
- is_valid – [in] - array that has flags to denote validity. 
 
 
 - 
template<class U, class Offset, class ValidFlag>
 __device__ inline void scatter_to_striped_flagged(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const Offset (&ranks)[ItemsPerThread], const ValidFlag (&is_valid)[ItemsPerThread], storage_type &storage)#
- Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity, using temporary storage. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Example.
- __global__ void example_kernel(...) { // specialize block_exchange for int, block of 128 threads and 8 items per thread using block_exchange_int = rocprim::block_exchange<int, 128, 8>; // allocate storage in shared memory __shared__ block_exchange_int::storage_type storage; int items[8]; int ranks[8]; int flags[8]; ... block_exchange_int b_exchange; b_exchange.scatter_to_striped_flagged(items, items, ranks, flags, storage); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Offset – - [inferred] the rank type. 
- ValidFlag – - [inferred] the validity flag type. 
 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- ranks – [in] - array that has rank of data. 
- is_valid – [in] - array that has flags to denote validity. 
- storage – [in] - reference to a temporary storage object of type storage_type.