Sort#
-
template<class Key, unsigned int WarpSize = device_warp_size(), class Value = empty_type>
class warp_sort : private rocprim::detail::warp_sort_shuffle<Key, device_warp_size(), empty_type># The warp_sort class provides warp-wide methods for computing a parallel sort of items across thread warps. This class currently implements parallel bitonic sort, and only accepts warp sizes that are powers of two.
- Overview
WarpSize
must be power of two.WarpSize
must be equal to or less than the size of hardware warp (see rocprim::device_warp_size()). If it is less, sort is performed separately within groups determined by WarpSize. For example, ifWarpSize
is 4, hardware warp is 64, sort will be performed in logical warps grouped like this:{ {0, 1, 2, 3}, {4, 5, 6, 7 }, ..., {60, 61, 62, 63} }
(thread is represented here by its id within hardware warp).Accepts custom compare_functions for sorting across a warp.
Number of threads executing warp_sort’s function must be a multiple of
WarpSize
.
- Example:
Every thread within the warp uses the warp_sort class by first specializing the warp_sort type, and instantiating an object that will be used to invoke a member function.
__global__ void example_kernel(...) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; int value = input[i]; rocprim::warp_sort<int, 64> wsort; wsort.sort(value); input[i] = value; }
Below is a snippet demonstrating how to pass a custom compare function:
__device__ bool customCompare(const int& a, const int& b) { return a < b; } ... __global__ void example_kernel(...) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; int value = input[i]; rocprim::warp_sort<int, 64> wsort; wsort.sort(value, customCompare); input[i] = value; }
- Template Parameters:
Key – Data type for parameter Key
WarpSize – [optional] The number of threads in a warp
Value – [optional] Data type for parameter Value. By default, it’s empty_type
Public Types
-
typedef base_type::storage_type storage_type#
Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive.
Depending on the implemention the operations exposed by parallel primitive may require a temporary storage for thread communication. The storage should be allocated using keywords
. It can be aliased to an externally allocated memory, or be a part of a union with other storage types to increase shared memory reusability.
Public Functions
-
template<class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key &thread_key, BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= device_warp_size()), void>::type# Warp sort for any data type.
- Template Parameters:
BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>.
- Parameters:
thread_key – - input/output to pass to other threads
compare_function – - binary operation function object that will be used for sort. The signature of the function should be equivalent to the following:
bool f(const T &a, const T &b);
. The signature does not need to haveconst &
, but function object must not modify the objects passed to it.
-
template<class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key&, BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > device_warp_size()), void>::type# Warp sort for any data type. Invalid Warp Size.
-
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key (&thread_keys)[ItemsPerThread], BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= device_warp_size()), void>::type# Warp sort for any data type.
- Template Parameters:
BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>.
- Parameters:
thread_keys – - input/output keys to pass to other threads
compare_function – - binary operation function object that will be used for sort. The signature of the function should be equivalent to the following:
bool f(const T &a, const T &b);
. The signature does not need to haveconst &
, but function object must not modify the objects passed to it.
-
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key (&thread_keys)[ItemsPerThread], BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > device_warp_size()), void>::type# Warp sort for any data type. Invalid Warp Size.
-
template<class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key &thread_key, storage_type &storage, BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= device_warp_size()), void>::type# Warp sort for any data type using temporary storage.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { int value = ...; using warp_sort_int = rp::warp_sort<int, 64>; warp_sort_int wsort; __shared__ typename warp_sort_int::storage_type storage; wsort.sort(value, storage); ... }
- Template Parameters:
BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>.
- Parameters:
thread_key – - input/output to pass to other threads
storage – - temporary storage for inputs
compare_function – - binary operation function object that will be used for sort. The signature of the function should be equivalent to the following:
bool f(const T &a, const T &b);
. The signature does not need to haveconst &
, but function object must not modify the objects passed to it.
-
template<class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key&, storage_type&, BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > device_warp_size()), void>::type# Warp sort for any data type using temporary storage. Invalid Warp Size.
-
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key (&thread_keys)[ItemsPerThread], storage_type &storage, BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= device_warp_size()), void>::type# Warp sort for any data type using temporary storage.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { int value = ...; using warp_sort_int = rp::warp_sort<int, 64>; warp_sort_int wsort; __shared__ typename warp_sort_int::storage_type storage; wsort.sort(value, storage); ... }
- Template Parameters:
BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>.
- Parameters:
thread_keys – - input/output keys to pass to other threads
storage – - temporary storage for inputs
compare_function – - binary operation function object that will be used for sort. The signature of the function should be equivalent to the following:
bool f(const T &a, const T &b);
. The signature does not need to haveconst &
, but function object must not modify the objects passed to it.
-
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key (&thread_keys)[ItemsPerThread], storage_type&, BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > device_warp_size()), void>::type# Warp sort for any data type using temporary storage. Invalid Warp Size.
-
template<class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key &thread_key, Value &thread_value, BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= device_warp_size()), void>::type# Warp sort by key for any data type.
- Template Parameters:
BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>.
- Parameters:
thread_key – - input/output key to pass to other threads
thread_value – - input/output value to pass to other threads
compare_function – - binary operation function object that will be used for sort. The signature of the function should be equivalent to the following:
bool f(const T &a, const T &b);
. The signature does not need to haveconst &
, but function object must not modify the objects passed to it.
-
template<class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key&, Value&, BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > device_warp_size()), void>::type# Warp sort by key for any data type. Invalid Warp Size.
-
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key (&thread_keys)[ItemsPerThread], Value (&thread_values)[ItemsPerThread], BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= device_warp_size()), void>::type# Warp sort by key for any data type.
- Template Parameters:
BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>.
- Parameters:
thread_keys – - input/output keys to pass to other threads
thread_values – - input/outputs values to pass to other threads
compare_function – - binary operation function object that will be used for sort. The signature of the function should be equivalent to the following:
bool f(const T &a, const T &b);
. The signature does not need to haveconst &
, but function object must not modify the objects passed to it.
-
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key (&thread_keys)[ItemsPerThread], Value (&thread_values)[ItemsPerThread], BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > device_warp_size()), void>::type# Warp sort by key for any data type. Invalid Warp Size.
-
template<class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key &thread_key, Value &thread_value, storage_type &storage, BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= device_warp_size()), void>::type# Warp sort by key for any data type using temporary storage.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { int value = ...; using warp_sort_int = rp::warp_sort<int, 64>; warp_sort_int wsort; __shared__ typename warp_sort_int::storage_type storage; wsort.sort(key, value, storage); ... }
- Template Parameters:
BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>.
- Parameters:
thread_key – - input/output key to pass to other threads
thread_value – - input/output value to pass to other threads
storage – - temporary storage for inputs
compare_function – - binary operation function object that will be used for sort. The signature of the function should be equivalent to the following:
bool f(const T &a, const T &b);
. The signature does not need to haveconst &
, but function object must not modify the objects passed to it.
-
template<class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key&, Value&, storage_type&, BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > device_warp_size()), void>::type# Warp sort by key for any data type using temporary storage. Invalid Warp Size.
-
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key (&thread_keys)[ItemsPerThread], Value (&thread_values)[ItemsPerThread], storage_type &storage, BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= device_warp_size()), void>::type# Warp sort by key for any data type using temporary storage.
- Storage reusage
Synchronization barrier should be placed before
storage
is reused or repurposed:__syncthreads()
orrocprim::syncthreads()
.- Example.
__global__ void example_kernel(...) { int value = ...; using warp_sort_int = rp::warp_sort<int, 64>; warp_sort_int wsort; __shared__ typename warp_sort_int::storage_type storage; wsort.sort(key, value, storage); ... }
- Template Parameters:
BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>.
- Parameters:
thread_keys – - input/output keys to pass to other threads
thread_values – - input/output values to pass to other threads
storage – - temporary storage for inputs
compare_function – - binary operation function object that will be used for sort. The signature of the function should be equivalent to the following:
bool f(const T &a, const T &b);
. The signature does not need to haveconst &
, but function object must not modify the objects passed to it.
-
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::less<Key>, unsigned int FunctionWarpSize = WarpSize>
__device__ inline auto sort(Key (&thread_keys)[ItemsPerThread], Value (&thread_values)[ItemsPerThread], storage_type&, BinaryFunction compare_function = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > device_warp_size()), void>::type# Warp sort by key for any data type using temporary storage. Invalid Warp Size.