Exchange#

template<class T, unsigned int ItemsPerThread, unsigned int WarpSize = ::rocprim::device_warp_size()>
class warp_exchange#

The warp_exchange class is a warp level parallel primitive which provides methods for rearranging items partitioned across threads in a warp.

Overview

  • The warp_exchange class supports the following rearrangement methods:

    • Transposing a blocked arrangement to a striped arrangement.

    • Transposing a striped arrangement to a blocked arrangement.

Examples

In the example an exchange operation is performed on a warp of 8 threads, using type int with 4 items per thread.

__global__ void example_kernel(...)
{
    constexpr unsigned int threads_per_block = 128;
    constexpr unsigned int threads_per_warp  =   8;
    constexpr unsigned int items_per_thread  =   4;
    constexpr unsigned int warps_per_block   = threads_per_block / threads_per_warp;
    const unsigned int warp_id = hipThreadIdx_x / threads_per_warp;
    // specialize warp_exchange for int, warp of 8 threads and 4 items per thread
    using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>;
    // allocate storage in shared memory
    __shared__ warp_exchange_int::storage_type storage[warps_per_block];

    int items[items_per_thread];
    ...
    warp_exchange_int w_exchange;
    w_exchange.blocked_to_striped(items, items, storage[warp_id]);
    ...
}

Template Parameters:
  • T – - the input type.

  • ItemsPerThread – - the number of items contributed by each thread.

  • WarpSize – - the number of threads in a warp.

Public Types

using storage_type = storage_type_#

Struct used to allocate a temporary memory that is required for thread communication during operations provided by the related parallel primitive.

Depending on the implemention the operations exposed by parallel primitive may require a temporary storage for thread communication. The storage should be allocated using keywords . It can be aliased to an externally allocated memory, or be a part of a union type with other storage types to increase shared memory reusability.

Public Functions

template<class U>
__device__ inline void blocked_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)#

Transposes a blocked arrangement of items to a striped arrangement across the warp, using temporary storage.

Storage reusage

Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().

Example.
__global__ void example_kernel(...)
{
    constexpr unsigned int threads_per_block = 128;
    constexpr unsigned int threads_per_warp  =   8;
    constexpr unsigned int items_per_thread  =   4;
    constexpr unsigned int warps_per_block   = threads_per_block / threads_per_warp;
    const unsigned int warp_id = hipThreadIdx_x / threads_per_warp;
    // specialize warp_exchange for int, warp of 8 threads and 4 items per thread
    using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>;
    // allocate storage in shared memory
    __shared__ warp_exchange_int::storage_type storage[warps_per_block];

    int items[items_per_thread];
    ...
    warp_exchange_int w_exchange;
    w_exchange.blocked_to_striped(items, items, storage[warp_id]);
    ...
}

Template Parameters:

U – - [inferred] the output type.

Parameters:
  • input[in] - array that data is loaded from.

  • output[out] - array that data is loaded to.

  • storage[in] - reference to a temporary storage object of type storage_type.

template<class U>
__device__ inline void blocked_to_striped_shuffle(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])#

Transposes a blocked arrangement of items to a striped arrangement across the warp, using warp shuffle operations. Uses an optimized implementation for when WarpSize is equal to ItemsPerThread. Caution: this API is experimental. Performance might not be consistent. ItemsPerThread must be a divisor of WarpSize.

Example.
__global__ void example_kernel(...)
{
    constexpr unsigned int threads_per_block = 128;
    constexpr unsigned int threads_per_warp  =   8;
    constexpr unsigned int items_per_thread  =   4;
    constexpr unsigned int warps_per_block   = threads_per_block / threads_per_warp;
    const unsigned int warp_id = hipThreadIdx_x / threads_per_warp;
    // specialize warp_exchange for int, warp of 8 threads and 4 items per thread
    using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>;

    int items[items_per_thread];
    ...
    warp_exchange_int w_exchange;
    w_exchange.blocked_to_striped_shuffle(items, items);
    ...
}

Template Parameters:

U – - [inferred] the output type.

Parameters:
  • input[in] - array that data is loaded from.

  • output[out] - array that data is loaded to.

template<class U>
__device__ inline void striped_to_blocked(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], storage_type &storage)#

Transposes a striped arrangement of items to a blocked arrangement across the warp, using temporary storage.

Storage reusage

Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().

Example.
__global__ void example_kernel(...)
{
    constexpr unsigned int threads_per_block = 128;
    constexpr unsigned int threads_per_warp  =   8;
    constexpr unsigned int items_per_thread  =   4;
    constexpr unsigned int warps_per_block   = threads_per_block / threads_per_warp;
    const unsigned int warp_id = hipThreadIdx_x / threads_per_warp;
    // specialize warp_exchange for int, warp of 8 threads and 4 items per thread
    using warp_exchange_int = rocprim::warp_exchange<int, threads_per_warp, items_per_thread>;
    // allocate storage in shared memory
    __shared__ warp_exchange_int::storage_type storage[warps_per_block];

    int items[items_per_thread];
    ...
    warp_exchange_int w_exchange;
    w_exchange.striped_to_blocked(items, items, storage[warp_id]);
    ...
}

Template Parameters:

U – - [inferred] the output type.

Parameters:
  • input[in] - array that data is loaded from.

  • output[out] - array that data is loaded to.

  • storage[in] - reference to a temporary storage object of type storage_type.

template<class U>
__device__ inline void striped_to_blocked_shuffle(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread])#

Transposes a striped arrangement of items to a blocked arrangement across the warp, using warp shuffle operations. Uses an optimized implementation for when WarpSize is equal to ItemsPerThread. Caution: this API is experimental. Performance might not be consistent. ItemsPerThread must be a divisor of WarpSize.

Example.
__global__ void example_kernel(...)
{
    constexpr unsigned int threads_per_block = 128;
    constexpr unsigned int threads_per_warp  =   8;
    constexpr unsigned int items_per_thread  =   4;
    constexpr unsigned int warps_per_block   = threads_per_block / threads_per_warp;
    const unsigned int warp_id = hipThreadIdx_x / threads_per_warp;
    // specialize warp_exchange for int, warp of 8 threads and 4 items per thread
    using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>;

    int items[items_per_thread];
    ...
    warp_exchange_int w_exchange;
    w_exchange.striped_to_blocked_shuffle(items, items);
    ...
}

Template Parameters:

U – - [inferred] the output type.

Parameters:
  • input[in] - array that data is loaded from.

  • output[out] - array that data is loaded to.

template<class U, class OffsetT>
__device__ inline void scatter_to_striped(const T (&input)[ItemsPerThread], U (&output)[ItemsPerThread], const OffsetT (&ranks)[ItemsPerThread], storage_type &storage)#

Orders input values according to ranks using temporary storage, then writes the values to output in a striped manner. No values in ranks should exists that exceed WarpSize*ItemsPerThread-1 .

Storage reusage

Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().

Example.
__global__ void example_kernel(...)
{
    constexpr unsigned int threads_per_block = 128;
    constexpr unsigned int threads_per_warp  =   8;
    constexpr unsigned int items_per_thread  =   4;
    constexpr unsigned int warps_per_block   = threads_per_block / threads_per_warp;
    const unsigned int warp_id = hipThreadIdx_x / threads_per_warp;
    // specialize warp_exchange for int, warp of 8 threads and 4 items per thread
    using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>;
    // allocate storage in shared memory
    __shared__ warp_exchange_int::storage_type storage[warps_per_block];

    int items[items_per_thread];

    // data-type of `ranks` should be able to contain warp_size*items_per_thread unique elements
    // unsigned short is sufficient for up to 1024*64 elements
    unsigned short ranks[items_per_thread];
    ...
    warp_exchange_int w_exchange;
    w_exchange.scatter_to_striped(items, items, ranks, storage[warp_id]);
    ...
}

Template Parameters:

U – - [inferred] the output type.

Parameters:
  • input[in] - array that data is loaded from.

  • output[out] - array that data is loaded to.

  • ranks[in] - array containing the positions.

  • storage[in] - reference to a temporary storage object of type storage_type.