Sort#
Generic Block Sort#
- 
template<class Key, unsigned int BlockSizeX, unsigned int ItemsPerThread = 1, class Value = empty_type, block_sort_algorithm Algorithm = block_sort_algorithm::default_algorithm, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
 class block_sort#
- The block_sort class is a block level parallel primitive which provides methods sorting items (keys or key-value pairs) partitioned across threads in a block using comparison-based sort algorithm. - Overview
- Accepts custom compare_functions for sorting across a block. 
- Performance notes: - It is generally better if - BlockSizeand- ItemsPerThreadare powers of two.
- The overloaded functions with - sizeare generally slower.
 
 
- Examples
- In the examples sort is performed on a block of 256 threads, each thread provides 8 - intvalues, results are returned using the same variable as for input.- __global__ void example_kernel(...) { // specialize block_sort for int, block of 256 threads, and 8 items per thread // key-only sort using block_sort_int = rocprim::block_sort<int, 256, 8>; // allocate storage in shared memory __shared__ block_sort_int::storage_type storage; int input[8] = ...; // execute block sort (ascending) block_sort_int().sort( input, storage ); ... } 
 - Template Parameters:
- Key – - the key type. 
- BlockSize – - the number of threads in a block. 
- ItemsPerThread – - number of items processed by each thread. The total range will be BlockSize * ItemsPerThread long 
- Value – - the value type. Default type empty_type indicates a keys-only sort. 
- Algorithm – - selected sort algorithm. The available algorithms and default choice are documented in block_sort_algorithm. 
 
 - Public Types - 
using storage_type = typename base_type::storage_type#
- Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive. - Depending on the implemention the operations exposed by parallel primitive may require a temporary storage for thread communication. The storage should be allocated using keywords - . It can be aliased to an externally allocated memory, or be a part of a union type with other storage types to increase shared memory reusability.
 - Public Functions - 
template<class BinaryFunction = ::rocprim::less<Key>>
 __device__ inline void sort(Key &thread_key, BinaryFunction compare_function = BinaryFunction())#
- Block sort for any data type. - Template Parameters:
- BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>. 
- Parameters:
- thread_key – [inout] - reference to a key provided by a thread. 
- compare_function – [in] - comparison function object which returns true if the first argument is is ordered before the second. The signature of the function should be equivalent to the following: - bool f(const T &a, const T &b);. The signature does not need to have- const &, but function object must not modify the objects passed to it.
 
 
 - 
template<class BinaryFunction = ::rocprim::less<Key>>
 __device__ inline void sort(Key (&thread_keys)[ItemsPerThread], BinaryFunction compare_function = BinaryFunction())#
- This overload allows an array of - ItemsPerThreadkeys to be passed in so that each thread can process multiple items.
 - 
template<class BinaryFunction = ::rocprim::less<Key>>
 __device__ inline void sort(Key &thread_key, storage_type &storage, BinaryFunction compare_function = BinaryFunction())#
- Block sort for any data type. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Examples
- In the examples sort is performed on a block of 256 threads, each thread provides one - intvalue, results are returned using the same variable as for input.- __global__ void example_kernel(...) { // specialize block_sort for int, block of 256 threads // key-only sort using block_sort_int = rocprim::block_sort<int, 256>; // allocate storage in shared memory __shared__ block_sort_int::storage_type storage; int input = ...; // execute block sort (ascending) block_sort_int().sort( input, storage ); ... } 
 - Template Parameters:
- BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>. 
- Parameters:
- thread_key – [inout] - reference to a key provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- compare_function – [in] - comparison function object which returns true if the first argument is is ordered before the second. The signature of the function should be equivalent to the following: - bool f(const T &a, const T &b);. The signature does not need to have- const &, but function object must not modify the objects passed to it.
 
 
 - 
template<class BinaryFunction = ::rocprim::less<Key>>
 __device__ inline void sort(Key (&thread_keys)[ItemsPerThread], storage_type &storage, BinaryFunction compare_function = BinaryFunction())#
- This overload allows arrays of - ItemsPerThreadkeys to be passed in so that each thread can process multiple items.
 - 
template<class BinaryFunction = ::rocprim::less<Key>>
 __device__ inline void sort(Key &thread_key, Value &thread_value, BinaryFunction compare_function = BinaryFunction())#
- Block sort by key for any data type. - Template Parameters:
- BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>. 
- Parameters:
- thread_key – [inout] - reference to a key provided by a thread. 
- thread_value – [inout] - reference to a value provided by a thread. 
- compare_function – [in] - comparison function object which returns true if the first argument is is ordered before the second. The signature of the function should be equivalent to the following: - bool f(const T &a, const T &b);. The signature does not need to have- const &, but function object must not modify the objects passed to it.
 
 
 - 
template<class BinaryFunction = ::rocprim::less<Key>>
 __device__ inline void sort(Key (&thread_keys)[ItemsPerThread], Value (&thread_values)[ItemsPerThread], BinaryFunction compare_function = BinaryFunction())#
- This overload allows an array of - ItemsPerThreadkeys and values to be passed in so that each thread can process multiple items.
 - 
template<class BinaryFunction = ::rocprim::less<Key>>
 __device__ inline void sort(Key &thread_key, Value &thread_value, storage_type &storage, BinaryFunction compare_function = BinaryFunction())#
- Block sort by key for any data type. - In the examples sort is performed on a block of 256 threads, each thread provides 8 - intkeys and one- intvalue, results are returned using the same variable as for input.- __global__ void example_kernel(...) { // specialize block_sort for int, block of 256 threads, and 8 items per thread using block_sort_int = rocprim::block_sort<int, 256, 8, int>; // allocate storage in shared memory __shared__ block_sort_int::storage_type storage; int key[8] = ...; int value = ...; // execute block sort (ascending) block_sort_int().sort( key, value, storage ); ... } - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
 - Template Parameters:
- BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>. 
- Parameters:
- thread_key – [inout] - reference to a key provided by a thread. 
- thread_value – [inout] - reference to a value provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- compare_function – [in] - comparison function object which returns true if the first argument is is ordered before the second. The signature of the function should be equivalent to the following: - bool f(const T &a, const T &b);. The signature does not need to have- const &, but function object must not modify the objects passed to it.
 
 
 - 
template<class BinaryFunction = ::rocprim::less<Key>>
 __device__ inline void sort(Key (&thread_keys)[ItemsPerThread], Value (&thread_values)[ItemsPerThread], storage_type &storage, BinaryFunction compare_function = BinaryFunction())#
- This overload allows an array of - ItemsPerThreadkeys and values to be passed in so that each thread can process multiple items.
 - 
template<class BinaryFunction = ::rocprim::less<Key>>
 __device__ inline void sort(Key &thread_key, storage_type &storage, const unsigned int size, BinaryFunction compare_function = BinaryFunction())#
- Block sort for any data type. This function sorts up to - sizeelements blocked across threads.- Template Parameters:
- BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>. 
- Parameters:
- thread_key – [inout] - reference to a key provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- size – [in] - custom size of block to be sorted. 
- compare_function – [in] - comparison function object which returns true if the first argument is is ordered before the second. The signature of the function should be equivalent to the following: - bool f(const T &a, const T &b);. The signature does not need to have- const &, but function object must not modify the objects passed to it.
 
 
 - 
template<class BinaryFunction = ::rocprim::less<Key>>
 __device__ inline void sort(Key (&thread_keys)[ItemsPerThread], storage_type &storage, const unsigned int size, BinaryFunction compare_function = BinaryFunction())#
- Block sort for any data type. This function sorts up to - sizeelements blocked across threads.- Template Parameters:
- BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>. 
- Parameters:
- thread_keys – [inout] - reference to keys provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- size – [in] - custom size of block to be sorted. 
- compare_function – [in] - comparison function object which returns true if the first argument is is ordered before the second. The signature of the function should be equivalent to the following: - bool f(const T &a, const T &b);. The signature does not need to have- const &, but function object must not modify the objects passed to it.
 
 
 - 
template<class BinaryFunction = ::rocprim::less<Key>>
 __device__ inline void sort(Key &thread_key, Value &thread_value, storage_type &storage, const unsigned int size, BinaryFunction compare_function = BinaryFunction())#
- Block sort by key for any data type. This function sorts up to - sizeelements blocked across threads.- Template Parameters:
- BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>. 
- Parameters:
- thread_key – [inout] - reference to a key provided by a thread. 
- thread_value – [inout] - reference to a value provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- size – [in] - custom size of block to be sorted. 
- compare_function – [in] - comparison function object which returns true if the first argument is is ordered before the second. The signature of the function should be equivalent to the following: - bool f(const T &a, const T &b);. The signature does not need to have- const &, but function object must not modify the objects passed to it.
 
 
 - 
template<class BinaryFunction = ::rocprim::less<Key>>
 __device__ inline void sort(Key (&thread_keys)[ItemsPerThread], Value (&thread_values)[ItemsPerThread], storage_type &storage, const unsigned int size, BinaryFunction compare_function = BinaryFunction())#
- Block sort by key for any data type. This function sorts up to - sizeelements blocked across threads.- Template Parameters:
- BinaryFunction – - type of binary function used for sort. Default type is rocprim::less<T>. 
- Parameters:
- thread_keys – [inout] - reference to keys provided by a thread. 
- thread_values – [inout] - reference to values provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- size – [in] - custom size of block to be sorted. 
- compare_function – [in] - comparison function object which returns true if the first argument is is ordered before the second. The signature of the function should be equivalent to the following: - bool f(const T &a, const T &b);. The signature does not need to have- const &, but function object must not modify the objects passed to it.
 
 
 
- 
enum class rocprim::block_sort_algorithm#
- Available algorithms for block_sort primitive. - Values: - 
enumerator bitonic_sort#
- A bitonic sort based algorithm. - \par Stability \p bitonic_sort is <b>not stable</b>: it doesn't necessarily preserve the relative ordering of equivalent keys. That is, given two keys \p a and \p b and a binary boolean operation \p op such that: * \p a precedes \p b in the input keys, and * op(a, b) and op(b, a) are both false, then it is <b>not guaranteed</b> that \p a will precede \p b as well in the output (ordered) keys. 
 - 
enumerator merge_sort#
- A merge sort based algorithm. - \par Stability \p merge_sort <b>may</b> use \p stable_merge_sort as the underlying implementation. However, \p merge_sort is <b>not guaranteed to be stable</b>: it doesn't necessarily preserve the relative ordering of equivalent keys. That is, given two keys \p a and \p b and a binary boolean operation \p op such that: * \p a precedes \p b in the input keys, and * op(a, b) and op(b, a) are both false, then it is <b>not guaranteed</b> that \p a will precede \p b as well in the output (ordered) keys. 
 - 
enumerator stable_merge_sort#
- A merged sort based algorithm which sorts stably. - \par Stability \p stable_merge_sort is \b stable: it preserves the relative ordering of equivalent keys. That is, given two keys \p a and \p b and a binary boolean operation \p op such that: * \p a precedes \p b in the input keys, and * op(a, b) and op(b, a) are both false, then it is \b guaranteed that \p a will precede \p b as well in the output (ordered) keys. 
 - 
enumerator default_algorithm#
- Default block_sort algorithm. 
 
- 
enumerator bitonic_sort#
Radix sort#
- 
template<class Key, unsigned int BlockSizeX, unsigned int ItemsPerThread, class Value = empty_type, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1, unsigned int RadixBitsPerPass = (BlockSizeX * BlockSizeY * BlockSizeZ) % device_warp_size() == 0 ? 8 : 4, block_radix_rank_algorithm RadixRankAlgorithm = (BlockSizeX * BlockSizeY * BlockSizeZ) % device_warp_size() == 0 ? block_radix_rank_algorithm::match : block_radix_rank_algorithm::basic_memoize, block_padding_hint PaddingHint = block_padding_hint::lds_occupancy_bound>
 class block_radix_sort#
- The block_radix_sort class is a block level parallel primitive which provides methods for sorting of items (keys or key-value pairs) partitioned across threads in a block using radix sort algorithm. - Overview
- Keytype must be an arithmetic type (that is, an integral type or a floating-point type).
- Performance depends on - BlockSizeand- ItemsPerThread.- It is usually better for - BlockSizeto be a multiple of the size of the hardware warp.
- It is usually increased when - ItemsPerThreadis greater than one. However, when there are too many items per thread, each thread may need so much registers and/or shared memory that occupancy will fall too low, decreasing the performance.
- If - Keyis an integer type and the range of keys is known in advance, the performance can be improved by setting- begin_bitand- end_bit, for example if all keys are in range [100, 10000],- begin_bit = 0and- end_bit = 14will cover the whole range.
 
 
- Stability
- block_radix_sortis stable: it preserves the relative ordering of equivalent keys. That is, given two keys- aand- band a binary boolean operation- opsuch that:- aprecedes- bin the input keys, and
- op(a, b) and op(b, a) are both false, then it is guaranteed that - awill precede- bas well in the output (ordered) keys.
 
- Examples
- In the examples radix sort is performed on a block of 256 threads, each thread provides eight - intvalue, results are returned using the same array as for input.- __global__ void example_kernel(...) { // specialize block_radix_sort for int, block of 256 threads, // and eight items per thread; key-only sort using block_rsort_int = rocprim::block_radix_sort<int, 256, 8>; // allocate storage in shared memory __shared__ block_rsort_int::storage_type storage; int input[8] = ...; // execute block radix sort (ascending) block_rsort_int().sort( input, storage ); ... } 
 - Template Parameters:
- Key – - the key type. 
- BlockSize – - the number of threads in a block. 
- ItemsPerThread – - the number of items contributed by each thread. 
- Value – - the value type. Default type empty_type indicates a keys-only sort. 
- RadixBitsPerPass – - amount of bits to sort per pass. The Default is 4. 
- RadixRankAlgorithm – the rank algorithm used. 
 
 - Public Types - 
using storage_type = storage_type_#
- Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive. - Depending on the implemention the operations exposed by parallel primitive may require a temporary storage for thread communication. The storage should be allocated using keywords - . It can be aliased to an externally allocated memory, or be a part of a union type with other storage types to increase shared memory reusability.
 - Public Functions - 
template<class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort(Key (&keys)[ItemsPerThread], storage_type &storage, unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs ascending radix sort over keys partitioned across threads in a block. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Examples
- In the examples radix sort is performed on a block of 128 threads, each thread provides two - floatvalue, results are returned using the same array as for input.- __global__ void example_kernel(...) { // specialize block_radix_sort for float, block of 128 threads, // and two items per thread; key-only sort using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>; // allocate storage in shared memory __shared__ block_rsort_float::storage_type storage; float input[2] = ...; // execute block radix sort (ascending) block_rsort_float().sort( input, storage ); ... } - If the - inputvalues across threads in a block are- {[256, 255], ..., [4, 3], [2, 1]}}, then then after sort they will be equal- {[1, 2], [3, 4] ..., [255, 256]}.
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
 
 - 
template<class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort(Key (&keys)[ItemsPerThread], unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs ascending radix sort over keys partitioned across threads in a block. - This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. - This overload does not accept storage argument. Required shared memory is allocated by the method itself. 
 - Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
 
 - 
template<class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_desc(Key (&keys)[ItemsPerThread], storage_type &storage, unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs descending radix sort over keys partitioned across threads in a block. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Examples
- In the examples radix sort is performed on a block of 128 threads, each thread provides two - floatvalue, results are returned using the same array as for input.- __global__ void example_kernel(...) { // specialize block_radix_sort for float, block of 128 threads, // and two items per thread; key-only sort using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>; // allocate storage in shared memory __shared__ block_rsort_float::storage_type storage; float input[2] = ...; // execute block radix sort (descending) block_rsort_float().sort_desc( input, storage ); ... } - If the - inputvalues across threads in a block are- {[1, 2], [3, 4] ..., [255, 256]}, then after sort they will be equal- {[256, 255], ..., [4, 3], [2, 1]}.
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
 
 - 
template<class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_desc(Key (&keys)[ItemsPerThread], unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs descending radix sort over keys partitioned across threads in a block. - This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. - This overload does not accept storage argument. Required shared memory is allocated by the method itself. 
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort(Key (&keys)[ItemsPerThread], typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], storage_type &storage, unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs ascending radix sort over key-value pairs partitioned across threads in a block. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Examples
- In the examples radix sort is performed on a block of 128 threads, each thread provides two key-value - int-- floatpairs, results are returned using the same arrays as for input.- __global__ void example_kernel(...) { // specialize block_radix_sort for int-float pairs, block of 128 // threads, and two items per thread using block_rsort_ii = rocprim::block_radix_sort<int, 128, 2, int>; // allocate storage in shared memory __shared__ block_rsort_ii::storage_type storage; int keys[2] = ...; float values[2] = ...; // execute block radix sort-by-key (ascending) block_rsort_ii().sort( keys, values, storage ); ... } - If the - keysacross threads in a block are- {[256, 255], ..., [4, 3], [2, 1]}and the- valuesare- {[1, 1], [2, 2] ..., [128, 128]}, then after sort the- keyswill be equal- {[1, 2], [3, 4] ..., [255, 256]}and the- valueswill be equal- {[128, 128], [127, 127] ..., [2, 2], [1, 1]}.
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- values – [inout] - reference to an array of values provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
- Pre:
- Method is enabled only if - Valuetype is different than empty_type.
 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort(Key (&keys)[ItemsPerThread], typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs ascending radix sort over key-value pairs partitioned across threads in a block. - This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. - This overload does not accept storage argument. Required shared memory is allocated by the method itself. 
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- values – [inout] - reference to an array of values provided by a thread. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
- Pre:
- Method is enabled only if - Valuetype is different than empty_type.
 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_desc(Key (&keys)[ItemsPerThread], typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], storage_type &storage, unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs descending radix sort over key-value pairs partitioned across threads in a block. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Examples
- In the examples radix sort is performed on a block of 128 threads, each thread provides two key-value - int-- floatpairs, results are returned using the same arrays as for input.- __global__ void example_kernel(...) { // specialize block_radix_sort for int-float pairs, block of 128 // threads, and two items per thread using block_rsort_ii = rocprim::block_radix_sort<int, 128, 2, int>; // allocate storage in shared memory __shared__ block_rsort_ii::storage_type storage; int keys[2] = ...; float values[2] = ...; // execute block radix sort-by-key (descending) block_rsort_ii().sort_desc( keys, values, storage ); ... } - If the - keysacross threads in a block are- {[1, 2], [3, 4] ..., [255, 256]}and the- valuesare- {[128, 128], [127, 127] ..., [2, 2], [1, 1]}, then after sort the- keyswill be equal- {[256, 255], ..., [4, 3], [2, 1]}and the- valueswill be equal- {[1, 1], [2, 2] ..., [128, 128]}.
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- values – [inout] - reference to an array of values provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
- Pre:
- Method is enabled only if - Valuetype is different than empty_type.
 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_desc(Key (&keys)[ItemsPerThread], typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs descending radix sort over key-value pairs partitioned across threads in a block. - This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. - This overload does not accept storage argument. Required shared memory is allocated by the method itself. 
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- values – [inout] - reference to an array of values provided by a thread. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
- Pre:
- Method is enabled only if - Valuetype is different than empty_type.
 
 - 
template<class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_to_striped(Key (&keys)[ItemsPerThread], storage_type &storage, unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs ascending radix sort over keys partitioned across threads in a block, results are saved in a striped arrangement. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Examples
- In the examples radix sort is performed on a block of 128 threads, each thread provides two - floatvalue, results are returned using the same array as for input.- __global__ void example_kernel(...) { // specialize block_radix_sort for float, block of 128 threads, // and two items per thread; key-only sort using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>; // allocate storage in shared memory __shared__ block_rsort_float::storage_type storage; float keys[2] = ...; // execute block radix sort (ascending) block_rsort_float().sort_to_striped( keys, storage ); ... } - If the - inputvalues across threads in a block are- {[256, 255], ..., [4, 3], [2, 1]}}, then then after sort they will be equal- {[1, 129], [2, 130] ..., [128, 256]}.
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
 
 - 
template<class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_to_striped(Key (&keys)[ItemsPerThread], unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs ascending radix sort over keys partitioned across threads in a block, results are saved in a striped arrangement. - This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. - This overload does not accept storage argument. Required shared memory is allocated by the method itself. 
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
 
 - 
template<class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_desc_to_striped(Key (&keys)[ItemsPerThread], storage_type &storage, unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs descending radix sort over keys partitioned across threads in a block, results are saved in a striped arrangement. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Examples
- In the examples radix sort is performed on a block of 128 threads, each thread provides two - floatvalue, results are returned using the same array as for input.- __global__ void example_kernel(...) { // specialize block_radix_sort for float, block of 128 threads, // and two items per thread; key-only sort using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>; // allocate storage in shared memory __shared__ block_rsort_float::storage_type storage; float input[2] = ...; // execute block radix sort (descending) block_rsort_float().sort_desc_to_striped( input, storage ); ... } - If the - inputvalues across threads in a block are- {[1, 2], [3, 4] ..., [255, 256]}, then after sort they will be equal- {[256, 128], ..., [130, 2], [129, 1]}.
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
 
 - 
template<class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_desc_to_striped(Key (&keys)[ItemsPerThread], unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs descending radix sort over keys partitioned across threads in a block, results are saved in a striped arrangement. - This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. - This overload does not accept storage argument. Required shared memory is allocated by the method itself. 
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_to_striped(Key (&keys)[ItemsPerThread], typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], storage_type &storage, unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs ascending radix sort over key-value pairs partitioned across threads in a block, results are saved in a striped arrangement. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Examples
- In the examples radix sort is performed on a block of 4 threads, each thread provides two key-value - int-- floatpairs, results are returned using the same arrays as for input.- __global__ void example_kernel(...) { // specialize block_radix_sort for int-float pairs, block of 4 // threads, and two items per thread using block_rsort_ii = rocprim::block_radix_sort<int, 4, 2, int>; // allocate storage in shared memory __shared__ block_rsort_ii::storage_type storage; int keys[2] = ...; float values[2] = ...; // execute block radix sort-by-key (ascending) block_rsort_ii().sort_to_striped( keys, values, storage ); ... } - If the - keysacross threads in a block are- {[8, 7], [6, 5], [4, 3], [2, 1]}and the- valuesare- {[-1, -2], [-3, -4], [-5, -6], [-7, -8]}, then after sort the- keyswill be equal- {[1, 5], [2, 6], [3, 7], [4, 8]}and the- valueswill be equal- {[-8, -4], [-7, -3], [-6, -2], [-5, -1]}.
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- values – [inout] - reference to an array of values provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
- Pre:
- Method is enabled only if - Valuetype is different than empty_type.
 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_to_striped(Key (&keys)[ItemsPerThread], typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs ascending radix sort over key-value pairs partitioned across threads in a block, results are saved in a striped arrangement. - This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. - This overload does not accept storage argument. Required shared memory is allocated by the method itself. 
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- values – [inout] - reference to an array of values provided by a thread. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_desc_to_striped(Key (&keys)[ItemsPerThread], typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], storage_type &storage, unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs descending radix sort over key-value pairs partitioned across threads in a block, results are saved in a striped arrangement. - Storage reusage
- Synchronization barrier should be placed before - storageis reused or repurposed:- __syncthreads()or- rocprim::syncthreads().
- Examples
- In the examples radix sort is performed on a block of 4 threads, each thread provides two key-value - int-- floatpairs, results are returned using the same arrays as for input.- __global__ void example_kernel(...) { // specialize block_radix_sort for int-float pairs, block of 4 // threads, and two items per thread using block_rsort_ii = rocprim::block_radix_sort<int, 4, 2, int>; // allocate storage in shared memory __shared__ block_rsort_ii::storage_type storage; int keys[2] = ...; float values[2] = ...; // execute block radix sort-by-key (descending) block_rsort_ii().sort_desc_to_striped( keys, values, storage ); ... } - If the - keysacross threads in a block are- {[1, 2], [3, 4], [5, 6], [7, 8]}and the- valuesare- {[80, 70], [60, 50], [40, 30], [20, 10]}, then after sort the- keyswill be equal- {[8, 4], [7, 3], [6, 2], [5, 1]}and the- valueswill be equal- {[10, 50], [20, 60], [30, 70], [40, 80]}.
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- values – [inout] - reference to an array of values provided by a thread. 
- storage – [in] - reference to a temporary storage object of type storage_type. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
- Pre:
- Method is enabled only if - Valuetype is different than empty_type.
 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_desc_to_striped(Key (&keys)[ItemsPerThread], typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs descending radix sort over key-value pairs partitioned across threads in a block, results are saved in a striped arrangement. - This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. - This overload does not accept storage argument. Required shared memory is allocated by the method itself. 
 - Template Parameters:
- Decomposer – The type of the decomposer argument. Defaults to the identity decomposer. 
- Parameters:
- keys – [inout] - reference to an array of keys provided by a thread. 
- values – [inout] - reference to an array of values provided by a thread. 
- begin_bit – [in] - [optional] index of the first (least significant) bit used in key comparison. Must be in range - [0; 8 * sizeof(Key)). Default value:- 0.
- end_bit – [in] - [optional] past-the-end index (most significant) bit used in key comparison. Must be in range - (begin_bit; 8 * sizeof(Key)]. Default value:- * sizeof(Key).
- decomposer – [in] [optional] If - Keyis not an arithmetic type (integral, floating point), a custom decomposer functor should be passed that produces a- rocprim::tupleof references to fundamental types from this custom type.
 
 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_warp_striped_to_striped(Key (&keys)[ItemsPerThread], typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], storage_type &storage, unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs ascending radix sort over key-value pairs in a warp-striped order partitioned across threads in a block, results are saved in a striped arrangement. - See also 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_warp_striped_to_striped(Key (&keys)[ItemsPerThread], typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs ascending radix sort over key-value pairs in a warp-striped order - See also - block_radix_sort::sort_to_striped partitioned across threads in a block, results are saved in a striped arrangement. 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_warp_striped_to_striped(Key (&keys)[ItemsPerThread], storage_type &storage, unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs ascending radix sort over key-value pairs in a warp-striped order partitioned across threads in a block, results are saved in a striped arrangement. - See also 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_warp_striped_to_striped(Key (&keys)[ItemsPerThread], unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs ascending radix sort over key-value pairs in a warp-striped order partitioned across threads in a block, results are saved in a striped arrangement. - See also 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_desc_warp_striped_to_striped(Key (&keys)[ItemsPerThread], typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], storage_type &storage, unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs descending radix sort over key-value pairs in a warp-striped order partitioned across threads in a block, results are saved in a striped arrangement. 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_desc_warp_striped_to_striped(Key (&keys)[ItemsPerThread], typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs descending radix sort over key-value pairs in a warp-striped order partitioned across threads in a block, results are saved in a striped arrangement. 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_desc_warp_striped_to_striped(Key (&keys)[ItemsPerThread], storage_type &storage, unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs descending radix sort over key-value pairs in a warp-striped order partitioned across threads in a block, results are saved in a striped arrangement. 
 - 
template<bool WithValues = with_values, class Decomposer = ::rocprim::identity_decomposer>
 __device__ inline void sort_desc_warp_striped_to_striped(Key (&keys)[ItemsPerThread], unsigned int begin_bit = 0, unsigned int end_bit = 8 * sizeof(Key), Decomposer decomposer = {})#
- Performs descending radix sort over key-value pairs in a warp-striped order partitioned across threads in a block, results are saved in a striped arrangement.