Exchange#
-
template<class T, unsigned int ItemsPerThread, unsigned int WarpSize = ::rocprim::device_warp_size()>
class warp_exchange# The
warp_exchange
class is a warp level parallel primitive which provides methods for rearranging items partitioned across threads in a warp.- Overview
The
warp_exchange
class supports the following rearrangement methods:Transposing a blocked arrangement to a striped arrangement.
Transposing a striped arrangement to a blocked arrangement.
- Examples
In the example an exchange operation is performed on a warp of 8 threads, using type
int
with 4 items per thread.__global__ void example_kernel(...) { constexpr unsigned int threads_per_block = 128; constexpr unsigned int threads_per_warp = 8; constexpr unsigned int items_per_thread = 4; constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; // specialize warp_exchange for int, warp of 8 threads and 4 items per thread using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>; // allocate storage in shared memory __shared__ warp_exchange_int::storage_type storage[warps_per_block]; int items[items_per_thread]; ... warp_exchange_int w_exchange; w_exchange.blocked_to_striped(items, items, storage[warp_id]); ... }
- Template Parameters:
T – - the input type.
ItemsPerThread – - the number of items contributed by each thread.
WarpSize – - the number of threads in a warp.
Public Types
-
using storage_type = storage_type_#
Struct used to allocate a temporary memory that is required for thread communication during operations provided by the related parallel primitive.
Depending on the implemention the operations exposed by parallel primitive may require a temporary storage for thread communication. The storage should be allocated using keywords
. It can be aliased to an externally allocated memory, or be a part of a union type with other storage types to increase shared memory reusability.
Public Functions
-
template<class U>
__device__ inline void blocked_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)# Transposes a blocked arrangement of items to a striped arrangement across the warp, using temporary storage.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { constexpr unsigned int threads_per_block = 128; constexpr unsigned int threads_per_warp = 8; constexpr unsigned int items_per_thread = 4; constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; // specialize warp_exchange for int, warp of 8 threads and 4 items per thread using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>; // allocate storage in shared memory __shared__ warp_exchange_int::storage_type storage[warps_per_block]; int items[items_per_thread]; ... warp_exchange_int w_exchange; w_exchange.blocked_to_striped(items, items, storage[warp_id]); ... }
- Template Parameters:
U – - [inferred] the output type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
storage – [in] - reference to a temporary storage object of type storage_type.
-
template<class U>
__device__ inline void blocked_to_striped_shuffle(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])# Transposes a blocked arrangement of items to a striped arrangement across the warp, using warp shuffle operations. Caution: this API is experimental. Performance might not be consistent. ItemsPerThread must be a divisor of WarpSize.
- Example.
__global__ void example_kernel(...) { constexpr unsigned int threads_per_block = 128; constexpr unsigned int threads_per_warp = 8; constexpr unsigned int items_per_thread = 4; constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; // specialize warp_exchange for int, warp of 8 threads and 4 items per thread using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>; int items[items_per_thread]; ... warp_exchange_int w_exchange; w_exchange.blocked_to_striped_shuffle(items, items); ... }
- Template Parameters:
U – - [inferred] the output type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
-
template<class U>
__device__ inline void striped_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)# Transposes a striped arrangement of items to a blocked arrangement across the warp, using temporary storage.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { constexpr unsigned int threads_per_block = 128; constexpr unsigned int threads_per_warp = 8; constexpr unsigned int items_per_thread = 4; constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; // specialize warp_exchange for int, warp of 8 threads and 4 items per thread using warp_exchange_int = rocprim::warp_exchange<int, threads_per_warp, items_per_thread>; // allocate storage in shared memory __shared__ warp_exchange_int::storage_type storage[warps_per_block]; int items[items_per_thread]; ... warp_exchange_int w_exchange; w_exchange.striped_to_blocked(items, items, storage[warp_id]); ... }
- Template Parameters:
U – - [inferred] the output type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
storage – [in] - reference to a temporary storage object of type storage_type.
-
template<class U>
__device__ inline void striped_to_blocked_shuffle(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])# Transposes a striped arrangement of items to a blocked arrangement across the warp, using warp shuffle operations. Caution: this API is experimental. Performance might not be consistent. ItemsPerThread must be a divisor of WarpSize.
- Example.
__global__ void example_kernel(...) { constexpr unsigned int threads_per_block = 128; constexpr unsigned int threads_per_warp = 8; constexpr unsigned int items_per_thread = 4; constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; // specialize warp_exchange for int, warp of 8 threads and 4 items per thread using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>; int items[items_per_thread]; ... warp_exchange_int w_exchange; w_exchange.striped_to_blocked_shuffle(items, items); ... }
- Template Parameters:
U – - [inferred] the output type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
-
template<class U, class OffsetT>
__device__ inline void scatter_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const OffsetT (&ranks)[ItemsPerThread], storage_type &storage)# Orders
input
values according to ranks using temporary storage, then writes the values tooutput
in a striped manner. No values inranks
should exists that exceedWarpSize*ItemsPerThread-1
.- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { constexpr unsigned int threads_per_block = 128; constexpr unsigned int threads_per_warp = 8; constexpr unsigned int items_per_thread = 4; constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp; const unsigned int warp_id = hipThreadIdx_x / threads_per_warp; // specialize warp_exchange for int, warp of 8 threads and 4 items per thread using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>; // allocate storage in shared memory __shared__ warp_exchange_int::storage_type storage[warps_per_block]; int items[items_per_thread]; // data-type of `ranks` should be able to contain warp_size*items_per_thread unique elements // unsigned short is sufficient for up to 1024*64 elements unsigned short ranks[items_per_thread]; ... warp_exchange_int w_exchange; w_exchange.scatter_to_striped(items, items, ranks, storage[warp_id]); ... }
- Template Parameters:
U – - [inferred] the output type.
- Parameters:
input – [in] - array that data is loaded from.
output – [out] - array that data is loaded to.
ranks – [in] - array containing the positions.
storage – [in] - reference to a temporary storage object of type storage_type.