Shuffle
-
template<class T>
__device__ inline T rocprim::warp_shuffle(const T &input, const int src_lane, const int width = device_warp_size())
Shuffle for any data type.
Each thread in the warp obtains input from the src_lane-th thread in the warp. If width is less than device_warp_size(), then each subsection of the warp behaves as a separate entity with a starting logical lane id of 0. If src_lane is not in the [0; width) range, the returned value is equal to the input passed by the (src_lane modulo width)-th thread.
Note: The optional width parameter must be a power of 2; results are undefined if it is not a power of 2, or if it is greater than device_warp_size().
- Parameters:
input – - input to pass to other threads
src_lane – - warp id of the thread whose input should be returned
width – - logical warp width
-
template<class T>
__device__ inline T rocprim::warp_shuffle_down(const T &input, const unsigned int delta, const int width = device_warp_size())
Shuffle down for any data type.
The i-th thread in the warp obtains input from the (i+delta)-th thread in the warp. If i+delta is not in the [0; width) range, the thread’s own input is returned.
Note: The optional width parameter must be a power of 2; results are undefined if it is not a power of 2, or if it is greater than device_warp_size().
- Parameters:
input – - input to pass to other threads
delta – - offset for calculating source lane id
width – - logical warp width
-
template<class T>
__device__ inline T rocprim::warp_shuffle_xor(const T &input, const int lane_mask, const int width = device_warp_size())
Shuffle XOR for any data type.
The i-th thread in the warp obtains input from the (i^lane_mask)-th thread in the warp.
Note: The optional width parameter must be a power of 2; results are undefined if it is not a power of 2, or if it is greater than device_warp_size().
- Parameters:
input – - input to pass to other threads
lane_mask – - mask used for calculating source lane id
width – - logical warp width