Data movement functions#
Direct Blocked#
Load#
-
template<class InputIterator, class T, unsigned int ItemsPerThread>
__device__ inline void rocprim::block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread])# Loads data from continuous memory into a blocked arrangement of items across the thread block.
The block arrangement is assumed to be (block-threads *
ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to load a range ofItemsPerThread
intoitems
.- Template Parameters:
InputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_input – - the input iterator from the thread block to load from
items – - array that data is loaded to
-
template<class InputIterator, class T, unsigned int ItemsPerThread>
__device__ inline void rocprim::block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid)# Loads data from continuous memory into a blocked arrangement of items across the thread block, which is guarded by range
valid
.The block arrangement is assumed to be (block-threads *
ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to load a range ofItemsPerThread
intoitems
.- Template Parameters:
InputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_input – - the input iterator from the thread block to load from
items – - array that data is loaded to
valid – - maximum range of valid numbers to load
-
template<class InputIterator, class T, unsigned int ItemsPerThread, class Default>
__device__ inline void rocprim::block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds)# Loads data from continuous memory into a blocked arrangement of items across the thread block, which is guarded by range with a fall-back value for out-of-bound elements.
The block arrangement is assumed to be (block-threads *
ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to load a range ofItemsPerThread
intoitems
.- Template Parameters:
InputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
Default – - [inferred] The data type of the default value
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_input – - the input iterator from the thread block to load from
items – - array that data is loaded to
valid – - maximum range of valid numbers to load
out_of_bounds – - default value assigned to out-of-bound items
Store#
-
template<class OutputIterator, class T, unsigned int ItemsPerThread>
__device__ inline void rocprim::block_store_direct_blocked(unsigned int flat_id, OutputIterator block_output, T (&items)[ItemsPerThread])# Stores a blocked arrangement of items from across the thread block into a blocked arrangement on continuous memory.
The block arrangement is assumed to be (block-threads *
ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to store a range ofItemsPerThread
items
to the thread block.- Template Parameters:
OutputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_output – - the input iterator from the thread block to store to
items – - array that data is stored to thread block
-
template<class OutputIterator, class T, unsigned int ItemsPerThread>
__device__ inline void rocprim::block_store_direct_blocked(unsigned int flat_id, OutputIterator block_output, T (&items)[ItemsPerThread], unsigned int valid)# Stores a blocked arrangement of items from across the thread block into a blocked arrangement on continuous memory, which is guarded by range
valid
.The block arrangement is assumed to be (block-threads *
ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to store a range ofItemsPerThread
items
to the thread block.- Template Parameters:
OutputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_output – - the input iterator from the thread block to store to
items – - array that data is stored to thread block
valid – - maximum range of valid numbers to store
Direct Blocked Vectorized#
Load#
-
template<class T, class U, unsigned int ItemsPerThread>
__device__ inline auto rocprim::block_load_direct_blocked_vectorized(unsigned int flat_id, T *block_input, U (&items)[ItemsPerThread]) -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value>::type# Loads data from continuous memory into a blocked arrangement of items across the thread block.
The block arrangement is assumed to be (block-threads *
ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to load a range ofItemsPerThread
intoitems
.The input offset (
block_input
+ offset) must be quad-item aligned.The following conditions will prevent vectorization and switch to default block_load_direct_blocked:
ItemsPerThread
is odd.The datatype
T
is not a primitive or a HIP vector type (e.g. int2, int4, etc.
The type
T
must be such that it can be implicitly converted toU
.- Template Parameters:
T – - [inferred] the input data type
U – - [inferred] the output data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_input – - the input iterator from the thread block to load from
items – - array that data is loaded to
Store#
-
template<class T, class U, unsigned int ItemsPerThread>
__device__ inline auto rocprim::block_store_direct_blocked_vectorized(unsigned int flat_id, T *block_output, U (&items)[ItemsPerThread]) -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value>::type# Stores a blocked arrangement of items from across the thread block into a blocked arrangement on continuous memory.
The block arrangement is assumed to be (block-threads *
ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to store a range ofItemsPerThread
items
to the thread block.The input offset (
block_output
+ offset) must be quad-item aligned.The following conditions will prevent vectorization and switch to default block_load_direct_blocked:
ItemsPerThread
is odd.The datatype
T
is not a primitive or a HIP vector type (e.g. int2, int4, etc.
The type
U
must be such that it can be implicitly converted toT
.- Template Parameters:
T – - [inferred] the output data type
U – - [inferred] the input data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_output – - the input iterator from the thread block to load from
items – - array that data is loaded to
Direct Striped#
Load#
-
template<unsigned int BlockSize, class InputIterator, class T, unsigned int ItemsPerThread>
__device__ inline void rocprim::block_load_direct_striped(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread])# Loads data from continuous memory into a striped arrangement of items across the thread block.
The striped arrangement is assumed to be (
BlockSize
*ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to load a range ofItemsPerThread
intoitems
.- Template Parameters:
BlockSize – - the number of threads in a block
InputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_input – - the input iterator from the thread block to load from
items – - array that data is loaded to
-
template<unsigned int BlockSize, class InputIterator, class T, unsigned int ItemsPerThread>
__device__ inline void rocprim::block_load_direct_striped(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid)# Loads data from continuous memory into a striped arrangement of items across the thread block, which is guarded by range
valid
.The striped arrangement is assumed to be (
BlockSize
*ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to load a range ofItemsPerThread
intoitems
.- Template Parameters:
BlockSize – - the number of threads in a block
InputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_input – - the input iterator from the thread block to load from
items – - array that data is loaded to
valid – - maximum range of valid numbers to load
-
template<unsigned int BlockSize, class InputIterator, class T, unsigned int ItemsPerThread, class Default>
__device__ inline void rocprim::block_load_direct_striped(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds)# Loads data from continuous memory into a striped arrangement of items across the thread block, which is guarded by range with a fall-back value for out-of-bound elements.
The striped arrangement is assumed to be (
BlockSize
*ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to load a range ofItemsPerThread
intoitems
.- Template Parameters:
BlockSize – - the number of threads in a block
InputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
Default – - [inferred] The data type of the default value
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_input – - the input iterator from the thread block to load from
items – - array that data is loaded to
valid – - maximum range of valid numbers to load
out_of_bounds – - default value assigned to out-of-bound items
Store#
-
template<unsigned int BlockSize, class OutputIterator, class T, unsigned int ItemsPerThread>
__device__ inline void rocprim::block_store_direct_striped(unsigned int flat_id, OutputIterator block_output, T (&items)[ItemsPerThread])# Stores a striped arrangement of items from across the thread block into a blocked arrangement on continuous memory.
The striped arrangement is assumed to be (
BlockSize
*ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to store a range ofItemsPerThread
items
to the thread block.- Template Parameters:
BlockSize – - the number of threads in a block
OutputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_output – - the input iterator from the thread block to store to
items – - array that data is stored to thread block
-
template<unsigned int BlockSize, class OutputIterator, class T, unsigned int ItemsPerThread>
__device__ inline void rocprim::block_store_direct_striped(unsigned int flat_id, OutputIterator block_output, T (&items)[ItemsPerThread], unsigned int valid)# Stores a striped arrangement of items from across the thread block into a blocked arrangement on continuous memory, which is guarded by range
valid
.The striped arrangement is assumed to be (
BlockSize
*ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to store a range ofItemsPerThread
items
to the thread block.- Template Parameters:
BlockSize – - the number of threads in a block
OutputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_output – - the input iterator from the thread block to store to
items – - array that data is stored to thread block
valid – - maximum range of valid numbers to store
Direct Warp Striped#
Load#
-
template<unsigned int WarpSize = device_warp_size(), class InputIterator, class T, unsigned int ItemsPerThread>
__device__ inline void block_load_direct_warp_striped(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread])# Loads data from continuous memory into a warp-striped arrangement of items across the thread block.
The warp-striped arrangement is assumed to be (
WarpSize
*ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to load a range ofItemsPerThread
intoitems
.The number of threads in the block must be a multiple of
WarpSize
.The default
WarpSize
is a hardware warpsize and is an optimal value.WarpSize
must be a power of two and equal or less than the size of hardware warp.Using
WarpSize
smaller than hardware warpsize could result in lower performance.
- Template Parameters:
WarpSize – - [optional] the number of threads in a warp
InputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_input – - the input iterator from the thread block to load from
items – - array that data is loaded to
-
template<unsigned int WarpSize = device_warp_size(), class InputIterator, class T, unsigned int ItemsPerThread>
__device__ inline void block_load_direct_warp_striped(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid)# Loads data from continuous memory into a warp-striped arrangement of items across the thread block, which is guarded by range
valid
.The warp-striped arrangement is assumed to be (
WarpSize
*ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to load a range ofItemsPerThread
intoitems
.The number of threads in the block must be a multiple of
WarpSize
.The default
WarpSize
is a hardware warpsize and is an optimal value.WarpSize
must be a power of two and equal or less than the size of hardware warp.Using
WarpSize
smaller than hardware warpsize could result in lower performance.
- Template Parameters:
WarpSize – - [optional] the number of threads in a warp
InputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_input – - the input iterator from the thread block to load from
items – - array that data is loaded to
valid – - maximum range of valid numbers to load
-
template<unsigned int WarpSize = device_warp_size(), class InputIterator, class T, unsigned int ItemsPerThread, class Default>
__device__ inline void block_load_direct_warp_striped(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds)# Loads data from continuous memory into a warp-striped arrangement of items across the thread block, which is guarded by range with a fall-back value for out-of-bound elements.
The warp-striped arrangement is assumed to be (
WarpSize
*ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to load a range ofItemsPerThread
intoitems
.The number of threads in the block must be a multiple of
WarpSize
.The default
WarpSize
is a hardware warpsize and is an optimal value.WarpSize
must be a power of two and equal or less than the size of hardware warp.Using
WarpSize
smaller than hardware warpsize could result in lower performance.
- Template Parameters:
WarpSize – - [optional] the number of threads in a warp
InputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
Default – - [inferred] The data type of the default value
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_input – - the input iterator from the thread block to load from
items – - array that data is loaded to
valid – - maximum range of valid numbers to load
out_of_bounds – - default value assigned to out-of-bound items
Store#
-
template<unsigned int WarpSize = device_warp_size(), class OutputIterator, class T, unsigned int ItemsPerThread>
__device__ inline void block_store_direct_warp_striped(unsigned int flat_id, OutputIterator block_output, T (&items)[ItemsPerThread])# Stores a warp-striped arrangement of items from across the thread block into a blocked arrangement on continuous memory.
The warp-striped arrangement is assumed to be (
WarpSize
*ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to store a range ofItemsPerThread
items
to the thread block.The number of threads in the block must be a multiple of
WarpSize
.The default
WarpSize
is a hardware warpsize and is an optimal value.WarpSize
must be a power of two and equal or less than the size of hardware warp.Using
WarpSize
smaller than hardware warpsize could result in lower performance.
- Template Parameters:
WarpSize – - [optional] the number of threads in a warp
OutputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_output – - the input iterator from the thread block to store to
items – - array that data is stored to thread block
-
template<unsigned int WarpSize = device_warp_size(), class OutputIterator, class T, unsigned int ItemsPerThread>
__device__ inline void block_store_direct_warp_striped(unsigned int flat_id, OutputIterator block_output, T (&items)[ItemsPerThread], unsigned int valid)# Stores a warp-striped arrangement of items from across the thread block into a blocked arrangement on continuous memory, which is guarded by range
valid
.The warp-striped arrangement is assumed to be (
WarpSize
*ItemsPerThread
) items across a thread block. Each thread uses aflat_id
to store a range ofItemsPerThread
items
to the thread block.The number of threads in the block must be a multiple of
WarpSize
.The default
WarpSize
is a hardware warpsize and is an optimal value.WarpSize
must be a power of two and equal or less than the size of hardware warp.Using
WarpSize
smaller than hardware warpsize could result in lower performance.
- Template Parameters:
WarpSize – - [optional] the number of threads in a warp
OutputIterator – - [inferred] an iterator type for input (can be a simple pointer
T – - [inferred] the data type
ItemsPerThread – - [inferred] the number of items to be processed by each thread
- Parameters:
flat_id – - a local flat 1D thread id in a block (tile) for the calling thread
block_output – - the input iterator from the thread block to store to
items – - array that data is stored to thread block
valid – - maximum range of valid numbers to store