Data movement functions#
Direct Blocked#
Load#
- 
template<class InputIterator, class T, unsigned int ItemsPerThread>
 __device__ inline void rocprim::block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread])#
- Loads data from continuous memory into a blocked arrangement of items across the thread block. - The block arrangement is assumed to be (block-threads * - ItemsPerThread) items across a thread block. Each thread uses a- flat_idto load a range of- ItemsPerThreadinto- items.- Template Parameters:
- InputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_input – - the input iterator from the thread block to load from 
- items – - array that data is loaded to 
 
 
- 
template<class InputIterator, class T, unsigned int ItemsPerThread>
 __device__ inline void rocprim::block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid)#
- Loads data from continuous memory into a blocked arrangement of items across the thread block, which is guarded by range - valid.- The block arrangement is assumed to be (block-threads * - ItemsPerThread) items across a thread block. Each thread uses a- flat_idto load a range of- ItemsPerThreadinto- items.- Template Parameters:
- InputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_input – - the input iterator from the thread block to load from 
- items – - array that data is loaded to 
- valid – - maximum range of valid numbers to load 
 
 
- 
template<class InputIterator, class T, unsigned int ItemsPerThread, class Default>
 __device__ inline void rocprim::block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds)#
- Loads data from continuous memory into a blocked arrangement of items across the thread block, which is guarded by range with a fall-back value for out-of-bound elements. - The block arrangement is assumed to be (block-threads * - ItemsPerThread) items across a thread block. Each thread uses a- flat_idto load a range of- ItemsPerThreadinto- items.- Template Parameters:
- InputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
- Default – - [inferred] The data type of the default value 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_input – - the input iterator from the thread block to load from 
- items – - array that data is loaded to 
- valid – - maximum range of valid numbers to load 
- out_of_bounds – - default value assigned to out-of-bound items 
 
 
Store#
- 
template<class OutputIterator, class T, unsigned int ItemsPerThread>
 __device__ inline void rocprim::block_store_direct_blocked(unsigned int flat_id, OutputIterator block_output, T (&items)[ItemsPerThread])#
- Stores a blocked arrangement of items from across the thread block into a blocked arrangement on continuous memory. - The block arrangement is assumed to be (block-threads * - ItemsPerThread) items across a thread block. Each thread uses a- flat_idto store a range of- ItemsPerThread- itemsto the thread block.- Template Parameters:
- OutputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_output – - the input iterator from the thread block to store to 
- items – - array that data is stored to thread block 
 
 
- 
template<class OutputIterator, class T, unsigned int ItemsPerThread>
 __device__ inline void rocprim::block_store_direct_blocked(unsigned int flat_id, OutputIterator block_output, T (&items)[ItemsPerThread], unsigned int valid)#
- Stores a blocked arrangement of items from across the thread block into a blocked arrangement on continuous memory, which is guarded by range - valid.- The block arrangement is assumed to be (block-threads * - ItemsPerThread) items across a thread block. Each thread uses a- flat_idto store a range of- ItemsPerThread- itemsto the thread block.- Template Parameters:
- OutputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_output – - the input iterator from the thread block to store to 
- items – - array that data is stored to thread block 
- valid – - maximum range of valid numbers to store 
 
 
Direct Blocked Vectorized#
Load#
- 
template<class T, class U, unsigned int ItemsPerThread>
 __device__ inline auto rocprim::block_load_direct_blocked_vectorized(unsigned int flat_id, T *block_input, U (&items)[ItemsPerThread]) -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value>::type#
- Loads data from continuous memory into a blocked arrangement of items across the thread block. - The block arrangement is assumed to be (block-threads * - ItemsPerThread) items across a thread block. Each thread uses a- flat_idto load a range of- ItemsPerThreadinto- items.- The input offset ( - block_input+ offset) must be quad-item aligned.- The following conditions will prevent vectorization and switch to default block_load_direct_blocked: - ItemsPerThreadis odd.
- The datatype - Tis not a primitive or a HIP vector type (e.g. int2, int4, etc.
 - The type - Tmust be such that it can be implicitly converted to- U.- Template Parameters:
- T – - [inferred] the input data type 
- U – - [inferred] the output data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_input – - the input iterator from the thread block to load from 
- items – - array that data is loaded to 
 
 
Store#
- 
template<class T, class U, unsigned int ItemsPerThread>
 __device__ inline auto rocprim::block_store_direct_blocked_vectorized(unsigned int flat_id, T *block_output, U (&items)[ItemsPerThread]) -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value>::type#
- Stores a blocked arrangement of items from across the thread block into a blocked arrangement on continuous memory. - The block arrangement is assumed to be (block-threads * - ItemsPerThread) items across a thread block. Each thread uses a- flat_idto store a range of- ItemsPerThread- itemsto the thread block.- The input offset ( - block_output+ offset) must be quad-item aligned.- The following conditions will prevent vectorization and switch to default block_load_direct_blocked: - ItemsPerThreadis odd.
- The datatype - Tis not a primitive or a HIP vector type (e.g. int2, int4, etc.
 - The type - Umust be such that it can be implicitly converted to- T.- Template Parameters:
- T – - [inferred] the output data type 
- U – - [inferred] the input data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_output – - the input iterator from the thread block to load from 
- items – - array that data is loaded to 
 
 
Direct Striped#
Load#
- 
template<unsigned int BlockSize, class InputIterator, class T, unsigned int ItemsPerThread>
 __device__ inline void rocprim::block_load_direct_striped(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread])#
- Loads data from continuous memory into a striped arrangement of items across the thread block. - The striped arrangement is assumed to be ( - BlockSize*- ItemsPerThread) items across a thread block. Each thread uses a- flat_idto load a range of- ItemsPerThreadinto- items.- Template Parameters:
- BlockSize – - the number of threads in a block 
- InputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_input – - the input iterator from the thread block to load from 
- items – - array that data is loaded to 
 
 
- 
template<unsigned int BlockSize, class InputIterator, class T, unsigned int ItemsPerThread>
 __device__ inline void rocprim::block_load_direct_striped(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid)#
- Loads data from continuous memory into a striped arrangement of items across the thread block, which is guarded by range - valid.- The striped arrangement is assumed to be ( - BlockSize*- ItemsPerThread) items across a thread block. Each thread uses a- flat_idto load a range of- ItemsPerThreadinto- items.- Template Parameters:
- BlockSize – - the number of threads in a block 
- InputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_input – - the input iterator from the thread block to load from 
- items – - array that data is loaded to 
- valid – - maximum range of valid numbers to load 
 
 
- 
template<unsigned int BlockSize, class InputIterator, class T, unsigned int ItemsPerThread, class Default>
 __device__ inline void rocprim::block_load_direct_striped(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds)#
- Loads data from continuous memory into a striped arrangement of items across the thread block, which is guarded by range with a fall-back value for out-of-bound elements. - The striped arrangement is assumed to be ( - BlockSize*- ItemsPerThread) items across a thread block. Each thread uses a- flat_idto load a range of- ItemsPerThreadinto- items.- Template Parameters:
- BlockSize – - the number of threads in a block 
- InputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
- Default – - [inferred] The data type of the default value 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_input – - the input iterator from the thread block to load from 
- items – - array that data is loaded to 
- valid – - maximum range of valid numbers to load 
- out_of_bounds – - default value assigned to out-of-bound items 
 
 
Store#
- 
template<unsigned int BlockSize, class OutputIterator, class T, unsigned int ItemsPerThread>
 __device__ inline void rocprim::block_store_direct_striped(unsigned int flat_id, OutputIterator block_output, T (&items)[ItemsPerThread])#
- Stores a striped arrangement of items from across the thread block into a blocked arrangement on continuous memory. - The striped arrangement is assumed to be ( - BlockSize*- ItemsPerThread) items across a thread block. Each thread uses a- flat_idto store a range of- ItemsPerThread- itemsto the thread block.- Template Parameters:
- BlockSize – - the number of threads in a block 
- OutputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_output – - the input iterator from the thread block to store to 
- items – - array that data is stored to thread block 
 
 
- 
template<unsigned int BlockSize, class OutputIterator, class T, unsigned int ItemsPerThread>
 __device__ inline void rocprim::block_store_direct_striped(unsigned int flat_id, OutputIterator block_output, T (&items)[ItemsPerThread], unsigned int valid)#
- Stores a striped arrangement of items from across the thread block into a blocked arrangement on continuous memory, which is guarded by range - valid.- The striped arrangement is assumed to be ( - BlockSize*- ItemsPerThread) items across a thread block. Each thread uses a- flat_idto store a range of- ItemsPerThread- itemsto the thread block.- Template Parameters:
- BlockSize – - the number of threads in a block 
- OutputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_output – - the input iterator from the thread block to store to 
- items – - array that data is stored to thread block 
- valid – - maximum range of valid numbers to store 
 
 
Direct Warp Striped#
Load#
- 
template<unsigned int WarpSize = device_warp_size(), class InputIterator, class T, unsigned int ItemsPerThread>
 __device__ inline void block_load_direct_warp_striped(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread])#
- Loads data from continuous memory into a warp-striped arrangement of items across the thread block. - The warp-striped arrangement is assumed to be ( - WarpSize*- ItemsPerThread) items across a thread block. Each thread uses a- flat_idto load a range of- ItemsPerThreadinto- items.- The number of threads in the block must be a multiple of - WarpSize.
- The default - WarpSizeis a hardware warpsize and is an optimal value.
- WarpSizemust be a power of two and equal or less than the size of hardware warp.
- Using - WarpSizesmaller than hardware warpsize could result in lower performance.
 - Template Parameters:
- WarpSize – - [optional] the number of threads in a warp 
- InputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_input – - the input iterator from the thread block to load from 
- items – - array that data is loaded to 
 
 
- 
template<unsigned int WarpSize = device_warp_size(), class InputIterator, class T, unsigned int ItemsPerThread>
 __device__ inline void block_load_direct_warp_striped(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid)#
- Loads data from continuous memory into a warp-striped arrangement of items across the thread block, which is guarded by range - valid.- The warp-striped arrangement is assumed to be ( - WarpSize*- ItemsPerThread) items across a thread block. Each thread uses a- flat_idto load a range of- ItemsPerThreadinto- items.- The number of threads in the block must be a multiple of - WarpSize.
- The default - WarpSizeis a hardware warpsize and is an optimal value.
- WarpSizemust be a power of two and equal or less than the size of hardware warp.
- Using - WarpSizesmaller than hardware warpsize could result in lower performance.
 - Template Parameters:
- WarpSize – - [optional] the number of threads in a warp 
- InputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_input – - the input iterator from the thread block to load from 
- items – - array that data is loaded to 
- valid – - maximum range of valid numbers to load 
 
 
- 
template<unsigned int WarpSize = device_warp_size(), class InputIterator, class T, unsigned int ItemsPerThread, class Default>
 __device__ inline void block_load_direct_warp_striped(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds)#
- Loads data from continuous memory into a warp-striped arrangement of items across the thread block, which is guarded by range with a fall-back value for out-of-bound elements. - The warp-striped arrangement is assumed to be ( - WarpSize*- ItemsPerThread) items across a thread block. Each thread uses a- flat_idto load a range of- ItemsPerThreadinto- items.- The number of threads in the block must be a multiple of - WarpSize.
- The default - WarpSizeis a hardware warpsize and is an optimal value.
- WarpSizemust be a power of two and equal or less than the size of hardware warp.
- Using - WarpSizesmaller than hardware warpsize could result in lower performance.
 - Template Parameters:
- WarpSize – - [optional] the number of threads in a warp 
- InputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
- Default – - [inferred] The data type of the default value 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_input – - the input iterator from the thread block to load from 
- items – - array that data is loaded to 
- valid – - maximum range of valid numbers to load 
- out_of_bounds – - default value assigned to out-of-bound items 
 
 
Store#
- 
template<unsigned int WarpSize = device_warp_size(), class OutputIterator, class T, unsigned int ItemsPerThread>
 __device__ inline void block_store_direct_warp_striped(unsigned int flat_id, OutputIterator block_output, T (&items)[ItemsPerThread])#
- Stores a warp-striped arrangement of items from across the thread block into a blocked arrangement on continuous memory. - The warp-striped arrangement is assumed to be ( - WarpSize*- ItemsPerThread) items across a thread block. Each thread uses a- flat_idto store a range of- ItemsPerThread- itemsto the thread block.- The number of threads in the block must be a multiple of - WarpSize.
- The default - WarpSizeis a hardware warpsize and is an optimal value.
- WarpSizemust be a power of two and equal or less than the size of hardware warp.
- Using - WarpSizesmaller than hardware warpsize could result in lower performance.
 - Template Parameters:
- WarpSize – - [optional] the number of threads in a warp 
- OutputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_output – - the input iterator from the thread block to store to 
- items – - array that data is stored to thread block 
 
 
- 
template<unsigned int WarpSize = device_warp_size(), class OutputIterator, class T, unsigned int ItemsPerThread>
 __device__ inline void block_store_direct_warp_striped(unsigned int flat_id, OutputIterator block_output, T (&items)[ItemsPerThread], unsigned int valid)#
- Stores a warp-striped arrangement of items from across the thread block into a blocked arrangement on continuous memory, which is guarded by range - valid.- The warp-striped arrangement is assumed to be ( - WarpSize*- ItemsPerThread) items across a thread block. Each thread uses a- flat_idto store a range of- ItemsPerThread- itemsto the thread block.- The number of threads in the block must be a multiple of - WarpSize.
- The default - WarpSizeis a hardware warpsize and is an optimal value.
- WarpSizemust be a power of two and equal or less than the size of hardware warp.
- Using - WarpSizesmaller than hardware warpsize could result in lower performance.
 - Template Parameters:
- WarpSize – - [optional] the number of threads in a warp 
- OutputIterator – - [inferred] an iterator type for input (can be a simple pointer 
- T – - [inferred] the data type 
- ItemsPerThread – - [inferred] the number of items to be processed by each thread 
 
- Parameters:
- flat_id – - a local flat 1D thread id in a block (tile) for the calling thread 
- block_output – - the input iterator from the thread block to store to 
- items – - array that data is stored to thread block 
- valid – - maximum range of valid numbers to store