Intrinsics#
Bitwise#
- 
__device__ inline int rocprim::get_bit(int x, int i)#
- Returns a single bit at ‘i’ from ‘x’. 
- 
__device__ inline unsigned int rocprim::bit_count(unsigned int x)#
- Bit count. - Returns the number of bit of - xset.
- 
__device__ inline unsigned int rocprim::bit_count(unsigned long long x)#
- Bit count. - Returns the number of bit of - xset.
- 
__host__ __device__ inline unsigned int rocprim::ctz(unsigned int x)#
- Count trailing zeroes. - Count the number of consecutive 0-bits, starting from the least significant bit. 
- 
__host__ __device__ inline unsigned int rocprim::ctz(unsigned long long x)#
- Count trailing zeroes. - Count the number of consecutive 0-bits, starting from the least significant bit. 
Warp size#
- 
__host__ __device__ inline constexpr unsigned int rocprim::warp_size()#
- [DEPRECATED] Returns a number of threads in a hardware warp. - It is constant for a device. This function is not supported for gfx1030 and newer architectures and will be removed in a future release. Please use the new host_warp_size() and device_warp_size() functions. 
- 
__host__ inline hipError_t rocprim::host_warp_size(const int device_id, unsigned int &warp_size)#
- Returns a number of threads in a hardware warp for the actual device. At host side this constant is available at runtime only. - It is constant for a device. - Parameters:
- device_id – - the device that should be queried. 
- warp_size – - out parameter for the warp size. 
 
- Returns:
- hipError_t any error that might occur. 
 
- 
__host__ inline hipError_t rocprim::host_warp_size(const hipStream_t stream, unsigned int &warp_size)#
- Returns the number of threads in a hardware warp for the device associated with the stream. At host side this constant is available at runtime only. - It is constant for a device. - Parameters:
- stream – - the stream, whose device should be queried. 
- warp_size – - out parameter for the warp size. 
 
- Returns:
- hipError_t any error that might occur. 
 
- 
__device__ inline constexpr unsigned int rocprim::device_warp_size()#
- Returns a number of threads in a hardware warp for the actual target. At device side this constant is available at compile time. - It is constant for a device. 
Lane and Warp ID#
- 
__device__ inline unsigned int warp_id()#
- Returns warp id in a block (tile). 
Flat ID#
- 
__device__ inline unsigned int flat_block_thread_id()#
- Returns flat (linear, 1D) thread identifier in a multidimensional block (tile). 
- 
__device__ inline unsigned int flat_block_id()#
- Returns flat (linear, 1D) block identifier in a multidimensional grid. 
Flat Size#
- 
__device__ inline unsigned int rocprim::flat_block_size()#
- Returns flat size of a multidimensional block (tile). 
- 
__device__ inline unsigned int rocprim::flat_tile_size()#
- Returns flat size of a multidimensional tile (block). 
Synchronization#
- 
__device__ inline void rocprim::syncthreads()#
- Synchronize all threads in a block (tile) 
- 
__device__ inline void rocprim::wave_barrier()#
- Synchronize all threads in the wavefront. - Wait for all threads in the wavefront before continuing execution. Memory ordering is guaranteed by this function between threads in the same wavefront. Threads can communicate by storing to global / shared memory, executing wave_barrier() then reading values stored by other threads in the same wavefront. - wave_barrier() should be executed by all threads in the wavefront in convergence, this means that if the function is executed in a conditional statement all threads in the wave must enter the conditional statement. - Note - On SIMT architectures all lanes come to a convergence point simultaneously, thus no special instruction is needed in the ISA. 
Active threads#
- 
__device__ inline lane_mask_type rocprim::ballot(int predicate)#
- Evaluate predicate for all active work-items in the warp and return an integer whose - i-th bit is set if and only if- predicateis- truefor the- i-th thread of the warp and the- i-th thread is active.- Parameters:
- predicate – - input to be evaluated for all active lanes 
 
- 
__device__ inline bool rocprim::group_elect(lane_mask_type mask)#
- Elect a single lane for each group in - mask.- Parameters:
- mask – [in] bit mask of the lanes in the same group as the calling lane. The - i-th bit should be set if lane- iis in the same group as the calling lane.
- Returns:
- truefor one unspecified lane in the- mask, false for everyone else. Returns- falsefor all lanes not in any group, that is lanes passing 0 as- mask.
- Pre:
- The relation specified by - maskmust be symmetric and transitive, in other words: the groups should be consistent between threads.
 
- 
__device__ inline unsigned int rocprim::masked_bit_count(lane_mask_type x, unsigned int add = 0)#
- Masked bit count. - For each thread, this function returns the number of active threads which have - i-th bit of- xset and come before the current thread.
- 
template<unsigned int LabelBits>
 __device__ inline lane_mask_type rocprim::match_any(unsigned int label, bool valid = true)#
- Group active lanes having the same bits of - label.- Threads that have the same least significant - LabelBitsbits are grouped into the same group. Every lane in the warp receives a mask of all active lanes participating in its group.- Template Parameters:
- LabelBits – number of bits to compare between labels 
- Parameters:
- label – [in] the label for the calling lane 
- valid – [in] lanes passing - falsewill be ignored for comparisons, such lanes will not be part of any group, and will always return an empty mask (0)
 
- Returns:
- A bit mask of lanes sharing the same bits for - label. The bit at index lane- i’s result includes bit- jin the lane mask if lane- jis part of the same group as lane- i, i.e. lane- iand- jcalled with the same value for label.
 
- 
__device__ inline lane_mask_type rocprim::match_any(unsigned int label, unsigned int label_bits, bool valid = true)#
- Group active lanes having the same bits of - label.- This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. - Threads that have the same least significant - label_bitsbits are grouped into the same group. Every lane in the warp receives a mask of all active lanes participating in its group.- This overload does not accept a template parameter for label bits. It is passed as a function parameter instead. - Parameters:
- label – [in] the label for the calling lane 
- label_bits – [in] number of bits to compare between labels 
- valid – [in] lanes passing - falsewill be ignored for comparisons, such lanes will not be part of any group, and will always return an empty mask (0)
 
- Returns:
- A bit mask of lanes sharing the same bits for - label. The bit at index lane- i’s result includes bit- jin the lane mask if lane- jis part of the same group as lane- i, i.e. lane- iand- jcalled with the same value for label.