Exchange#
- 
template<class T, unsigned int ItemsPerThread, unsigned int WarpSize = ::rocprim::device_warp_size()>
 class warp_exchange#
- The - warp_exchangeclass is a warp level parallel primitive which provides methods for rearranging items partitioned across threads in a warp.- Overview
- The - warp_exchangeclass supports the following rearrangement methods:- Transposing a blocked arrangement to a striped arrangement. 
- Transposing a striped arrangement to a blocked arrangement. 
 
 
- Examples
- In the example an exchange operation is performed on a warp of 8 threads, using type - intwith 4 items per thread.- __global__ void example_kernel(...) { constexpr unsigned int threads_per_block = 128; constexpr unsigned int threads_per_warp = 8; constexpr unsigned int items_per_thread = 4; constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; // specialize warp_exchange for int, warp of 8 threads and 4 items per thread using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>; // allocate storage in shared memory __shared__ warp_exchange_int::storage_type storage[warps_per_block]; int items[items_per_thread]; ... warp_exchange_int w_exchange; w_exchange.blocked_to_striped(items, items, storage[warp_id]); ... } 
 - Template Parameters:
- T – - the input type. 
- ItemsPerThread – - the number of items contributed by each thread. 
- WarpSize – - the number of threads in a warp. 
 
 - Public Types - 
using storage_type = storage_type_#
- Struct used to allocate a temporary memory that is required for thread communication during operations provided by the related parallel primitive. - Depending on the implemention the operations exposed by parallel primitive may require a temporary storage for thread communication. The storage should be allocated using keywords - . It can be aliased to an externally allocated memory, or be a part of a union type with other storage types to increase shared memory reusability.
 - Public Functions - 
template<class U>
 __device__ inline void blocked_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)#
- Transposes a blocked arrangement of items to a striped arrangement across the warp, using temporary storage. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Example.
- __global__ void example_kernel(...) { constexpr unsigned int threads_per_block = 128; constexpr unsigned int threads_per_warp = 8; constexpr unsigned int items_per_thread = 4; constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; // specialize warp_exchange for int, warp of 8 threads and 4 items per thread using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>; // allocate storage in shared memory __shared__ warp_exchange_int::storage_type storage[warps_per_block]; int items[items_per_thread]; ... warp_exchange_int w_exchange; w_exchange.blocked_to_striped(items, items, storage[warp_id]); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
 
 
 - 
template<class U>
 __device__ inline void blocked_to_striped_shuffle(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])#
- Transposes a blocked arrangement of items to a striped arrangement across the warp, using warp shuffle operations. Uses an optimized implementation for when WarpSize is equal to ItemsPerThread. Caution: this API is experimental. Performance might not be consistent. ItemsPerThread must be a divisor of WarpSize. - Example.
- __global__ void example_kernel(...) { constexpr unsigned int threads_per_block = 128; constexpr unsigned int threads_per_warp = 8; constexpr unsigned int items_per_thread = 4; constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; // specialize warp_exchange for int, warp of 8 threads and 4 items per thread using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>; int items[items_per_thread]; ... warp_exchange_int w_exchange; w_exchange.blocked_to_striped_shuffle(items, items); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
 
 
 - 
template<class U>
 __device__ inline void striped_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)#
- Transposes a striped arrangement of items to a blocked arrangement across the warp, using temporary storage. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Example.
- __global__ void example_kernel(...) { constexpr unsigned int threads_per_block = 128; constexpr unsigned int threads_per_warp = 8; constexpr unsigned int items_per_thread = 4; constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; // specialize warp_exchange for int, warp of 8 threads and 4 items per thread using warp_exchange_int = rocprim::warp_exchange<int, threads_per_warp, items_per_thread>; // allocate storage in shared memory __shared__ warp_exchange_int::storage_type storage[warps_per_block]; int items[items_per_thread]; ... warp_exchange_int w_exchange; w_exchange.striped_to_blocked(items, items, storage[warp_id]); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
 
 
 - 
template<class U>
 __device__ inline void striped_to_blocked_shuffle(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])#
- Transposes a striped arrangement of items to a blocked arrangement across the warp, using warp shuffle operations. Uses an optimized implementation for when WarpSize is equal to ItemsPerThread. Caution: this API is experimental. Performance might not be consistent. ItemsPerThread must be a divisor of WarpSize. - Example.
- __global__ void example_kernel(...) { constexpr unsigned int threads_per_block = 128; constexpr unsigned int threads_per_warp = 8; constexpr unsigned int items_per_thread = 4; constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; // specialize warp_exchange for int, warp of 8 threads and 4 items per thread using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>; int items[items_per_thread]; ... warp_exchange_int w_exchange; w_exchange.striped_to_blocked_shuffle(items, items); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
 
 
 - 
template<class U, class OffsetT>
 __device__ inline void scatter_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const OffsetT (&ranks)[ItemsPerThread], storage_type &storage)#
- Orders - inputvalues according to ranks using temporary storage, then writes the values to- outputin a striped manner. No values in- ranksshould exists that exceed- WarpSize*ItemsPerThread-1.- Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Example.
- __global__ void example_kernel(...) { constexpr unsigned int threads_per_block = 128; constexpr unsigned int threads_per_warp = 8; constexpr unsigned int items_per_thread = 4; constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; // specialize warp_exchange for int, warp of 8 threads and 4 items per thread using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>; // allocate storage in shared memory __shared__ warp_exchange_int::storage_type storage[warps_per_block]; int items[items_per_thread]; // data-type of `ranks` should be able to contain warp_size*items_per_thread unique elements // unsigned short is sufficient for up to 1024*64 elements unsigned short ranks[items_per_thread]; ... warp_exchange_int w_exchange; w_exchange.scatter_to_striped(items, items, ranks, storage[warp_id]); ... } 
 - Template Parameters:
- U – - [inferred] the output type. 
- Parameters:
- input – [in] - array that data is loaded from. 
- output – [out] - array that data is loaded to. 
- ranks – [in] - array containing the positions. 
- storage – [in] - reference to a temporary storage object of type storage_type.