|
__device__ __forceinline__ | BlockMergeSort (TempStorage &temp_storage) |
|
template<typename CompareOp > |
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
template<typename CompareOp > |
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
template<typename CompareOp > |
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
template<typename CompareOp , bool IS_LAST_TILE = true> |
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
template<typename CompareOp > |
__device__ __forceinline__ void | StableSort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
template<typename CompareOp > |
__device__ __forceinline__ void | StableSort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
template<typename CompareOp > |
__device__ __forceinline__ void | StableSort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
template<typename CompareOp , bool IS_LAST_TILE = true> |
__device__ __forceinline__ void | StableSort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
template<typename KeyT, int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1>
class hipcub::BlockMergeSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, BLOCK_DIM_Y, BLOCK_DIM_Z >
The BlockMergeSort class provides methods for sorting items partitioned across a CUDA thread block using a merge sorting method.
- Template Parameters
-
KeyT | KeyT type |
BLOCK_DIM_X | The thread block length in threads along the X dimension |
ITEMS_PER_THREAD | The number of items per thread |
ValueT | [optional] ValueT type (default: hipcub::NullType, which indicates a keys-only sort) |
BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1) |
BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1) |
- Overview
- BlockMergeSort arranges items into ascending order using a comparison functor with less-than semantics. Merge sort can handle arbitrary types and comparison functors, but is slower than BlockRadixSort when sorting arithmetic types into ascending/descending order.
- A Simple Example
- \blockcollective{BlockMergeSort}
- The code snippet below illustrates a sort of 512 integer keys that are partitioned across 128 threads * where each thread owns 4 consecutive items.
#include <cub/cub.cuh>
struct CustomLess
{
template <typename DataType>
__device__ bool operator()(const DataType &lhs, const DataType &rhs)
{
return lhs < rhs;
}
};
__global__ void ExampleKernel(...)
{
typedef cub::BlockMergeSort<int, 128, 4> BlockMergeSort;
__shared__ typename BlockMergeSort::TempStorage temp_storage_shuffle;
int thread_keys[4];
...
BlockMergeSort(temp_storage_shuffle).Sort(thread_data, CustomLess());
...
}
- Suppose the set of input
thread_keys
across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }
. The corresponding output thread_keys
in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
.
- Re-using dynamically allocating shared memory
- The following example under the examples/block folder illustrates usage of dynamically shared memory with BlockReduce and how to re-purpose the same memory region: example_block_reduce_dyn_smem.cu
This example can be easily adapted to the storage required by BlockMergeSort.
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1>
template<typename CompareOp >
__device__ __forceinline__ void hipcub::BlockMergeSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, BLOCK_DIM_Y, BLOCK_DIM_Z >::Sort |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
CompareOp |
compare_op |
|
) |
| |
|
inline |
Sorts items partitioned across a CUDA thread block using a merge sorting method.
- Sort is not guaranteed to be stable. That is, suppose that i and j are equivalent: neither one is less than the other. It is not guaranteed that the relative order of these two elements will be preserved by sort.
- Template Parameters
-
CompareOp | functor type having member bool operator()(KeyT lhs, KeyT rhs) CompareOp is a model of Strict Weak Ordering. |
- Parameters
-
| keys | [in-out] Keys to sort |
[in] | compare_op | Comparison function object which returns true if the first argument is ordered before the second |
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1>
template<typename CompareOp >
__device__ __forceinline__ void hipcub::BlockMergeSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, BLOCK_DIM_Y, BLOCK_DIM_Z >::Sort |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
CompareOp |
compare_op, |
|
|
int |
valid_items, |
|
|
KeyT |
oob_default |
|
) |
| |
|
inline |
Sorts items partitioned across a CUDA thread block using a merge sorting method.
- Sort is not guaranteed to be stable. That is, suppose that i and j are equivalent: neither one is less than the other. It is not guaranteed that the relative order of these two elements will be preserved by sort.
- The value of
oob_default
is assigned to all elements that are out of valid_items
boundaries. It's expected that oob_default
is ordered after any value in the valid_items
boundaries. The algorithm always sorts a fixed amount of elements, which is equal to ITEMS_PER_THREAD * BLOCK_THREADS. If there is a value that is ordered after oob_default
, it won't be placed within valid_items
boundaries.
- Template Parameters
-
CompareOp | functor type having member bool operator()(KeyT lhs, KeyT rhs) CompareOp is a model of Strict Weak Ordering. |
- Parameters
-
| keys | [in-out] Keys to sort |
[in] | compare_op | Comparison function object which returns true if the first argument is ordered before the second |
[in] | valid_items | Number of valid items to sort |
[in] | oob_default | Default value to assign out-of-bound items |
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1>
template<typename CompareOp >
__device__ __forceinline__ void hipcub::BlockMergeSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, BLOCK_DIM_Y, BLOCK_DIM_Z >::Sort |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
ValueT(&) |
items[ITEMS_PER_THREAD], |
|
|
CompareOp |
compare_op |
|
) |
| |
|
inline |
Sorts items partitioned across a CUDA thread block using a merge sorting method.
- Sort is not guaranteed to be stable. That is, suppose that i and j are equivalent: neither one is less than the other. It is not guaranteed that the relative order of these two elements will be preserved by sort.
- Template Parameters
-
CompareOp | functor type having member bool operator()(KeyT lhs, KeyT rhs) CompareOp is a model of Strict Weak Ordering. |
- Parameters
-
| keys | [in-out] Keys to sort |
| items | [in-out] Values to sort |
[in] | compare_op | Comparison function object which returns true if the first argument is ordered before the second |
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1>
template<typename CompareOp , bool IS_LAST_TILE = true>
__device__ __forceinline__ void hipcub::BlockMergeSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, BLOCK_DIM_Y, BLOCK_DIM_Z >::Sort |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
ValueT(&) |
items[ITEMS_PER_THREAD], |
|
|
CompareOp |
compare_op, |
|
|
int |
valid_items, |
|
|
KeyT |
oob_default |
|
) |
| |
|
inline |
Sorts items partitioned across a CUDA thread block using a merge sorting method.
- Sort is not guaranteed to be stable. That is, suppose that i and j are equivalent: neither one is less than the other. It is not guaranteed that the relative order of these two elements will be preserved by sort.
- The value of
oob_default
is assigned to all elements that are out of valid_items
boundaries. It's expected that oob_default
is ordered after any value in the valid_items
boundaries. The algorithm always sorts a fixed amount of elements, which is equal to ITEMS_PER_THREAD * BLOCK_THREADS. If there is a value that is ordered after oob_default
, it won't be placed within valid_items
boundaries.
- Template Parameters
-
CompareOp | functor type having member bool operator()(KeyT lhs, KeyT rhs) CompareOp is a model of Strict Weak Ordering. |
IS_LAST_TILE | True if valid_items isn't equal to the ITEMS_PER_TILE |
- Parameters
-
| keys | [in-out] Keys to sort |
| items | [in-out] Values to sort |
[in] | compare_op | Comparison function object which returns true if the first argument is ordered before the second |
[in] | valid_items | Number of valid items to sort |
[in] | oob_default | Default value to assign out-of-bound items |
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1>
template<typename CompareOp >
__device__ __forceinline__ void hipcub::BlockMergeSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, BLOCK_DIM_Y, BLOCK_DIM_Z >::StableSort |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
CompareOp |
compare_op |
|
) |
| |
|
inline |
Sorts items partitioned across a CUDA thread block using a merge sorting method.
- StableSort is stable: it preserves the relative ordering of equivalent elements. That is, if x and y are elements such that x precedes y, and if the two elements are equivalent (neither x < y nor y < x) then a postcondition of stable_sort is that x still precedes y.
- Template Parameters
-
CompareOp | functor type having member bool operator()(KeyT lhs, KeyT rhs) CompareOp is a model of Strict Weak Ordering. |
- Parameters
-
| keys | [in-out] Keys to sort |
[in] | compare_op | Comparison function object which returns true if the first argument is ordered before the second |
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1>
template<typename CompareOp >
__device__ __forceinline__ void hipcub::BlockMergeSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, BLOCK_DIM_Y, BLOCK_DIM_Z >::StableSort |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
ValueT(&) |
items[ITEMS_PER_THREAD], |
|
|
CompareOp |
compare_op |
|
) |
| |
|
inline |
Sorts items partitioned across a CUDA thread block using a merge sorting method.
- StableSort is stable: it preserves the relative ordering of equivalent elements. That is, if x and y are elements such that x precedes y, and if the two elements are equivalent (neither x < y nor y < x) then a postcondition of stable_sort is that x still precedes y.
- Template Parameters
-
CompareOp | functor type having member bool operator()(KeyT lhs, KeyT rhs) CompareOp is a model of Strict Weak Ordering. |
- Parameters
-
| keys | [in-out] Keys to sort |
| items | [in-out] Values to sort |
[in] | compare_op | Comparison function object which returns true if the first argument is ordered before the second |
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1>
template<typename CompareOp >
__device__ __forceinline__ void hipcub::BlockMergeSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, BLOCK_DIM_Y, BLOCK_DIM_Z >::StableSort |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
CompareOp |
compare_op, |
|
|
int |
valid_items, |
|
|
KeyT |
oob_default |
|
) |
| |
|
inline |
Sorts items partitioned across a CUDA thread block using a merge sorting method.
- StableSort is stable: it preserves the relative ordering of equivalent elements. That is, if x and y are elements such that x precedes y, and if the two elements are equivalent (neither x < y nor y < x) then a postcondition of stable_sort is that x still precedes y.
- The value of
oob_default
is assigned to all elements that are out of valid_items
boundaries. It's expected that oob_default
is ordered after any value in the valid_items
boundaries. The algorithm always sorts a fixed amount of elements, which is equal to ITEMS_PER_THREAD * BLOCK_THREADS. If there is a value that is ordered after oob_default
, it won't be placed within valid_items
boundaries.
- Template Parameters
-
CompareOp | functor type having member bool operator()(KeyT lhs, KeyT rhs) CompareOp is a model of Strict Weak Ordering. |
- Parameters
-
| keys | [in-out] Keys to sort |
[in] | compare_op | Comparison function object which returns true if the first argument is ordered before the second |
[in] | valid_items | Number of valid items to sort |
[in] | oob_default | Default value to assign out-of-bound items |
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1>
template<typename CompareOp , bool IS_LAST_TILE = true>
__device__ __forceinline__ void hipcub::BlockMergeSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, BLOCK_DIM_Y, BLOCK_DIM_Z >::StableSort |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
ValueT(&) |
items[ITEMS_PER_THREAD], |
|
|
CompareOp |
compare_op, |
|
|
int |
valid_items, |
|
|
KeyT |
oob_default |
|
) |
| |
|
inline |
Sorts items partitioned across a CUDA thread block using a merge sorting method.
- StableSort is stable: it preserves the relative ordering of equivalent elements. That is, if x and y are elements such that x precedes y, and if the two elements are equivalent (neither x < y nor y < x) then a postcondition of stable_sort is that x still precedes y.
- The value of
oob_default
is assigned to all elements that are out of valid_items
boundaries. It's expected that oob_default
is ordered after any value in the valid_items
boundaries. The algorithm always sorts a fixed amount of elements, which is equal to ITEMS_PER_THREAD * BLOCK_THREADS. If there is a value that is ordered after oob_default
, it won't be placed within valid_items
boundaries.
- Template Parameters
-
CompareOp | functor type having member bool operator()(KeyT lhs, KeyT rhs) CompareOp is a model of Strict Weak Ordering. |
IS_LAST_TILE | True if valid_items isn't equal to the ITEMS_PER_TILE |
- Parameters
-
| keys | [in-out] Keys to sort |
| items | [in-out] Values to sort |
[in] | compare_op | Comparison function object which returns true if the first argument is ordered before the second |
[in] | valid_items | Number of valid items to sort |
[in] | oob_default | Default value to assign out-of-bound items |