21     asm volatile(
"s_mov_b32 m0, %0" : : 
"s"(v) : 
"memory");
 
   27     asm volatile(
"s_add_u32 m0, %0, m0" : : 
"n"(v) : 
"memory");
 
   34     return  __shfl_up(v_local, lane_delta);
 
   36     static_assert(
sizeof(T) == 
sizeof(
int32_t), 
"wrong!");
 
   38     const uint32_t wrap_around_lane_delta = 
get_warp_size() - lane_delta;
 
   40     const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
 
   41         (__lane_id() << 2) + (wrap_around_lane_delta << 2), bit_cast<int32_t>(v_local));
 
   43     return bit_cast<T>(v_remote_tmp);
 
   51     return  __shfl_down(v_local, lane_delta);
 
   53     static_assert(
sizeof(T) == 
sizeof(
int32_t), 
"wrong!");
 
   55     const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
 
   56         (__lane_id() << 2) + (lane_delta << 2), bit_cast<int32_t>(v_local));
 
   58     return bit_cast<T>(v_remote_tmp);
 
   66     return  __shfl(v_local, src_lane);
 
   68     if constexpr(
sizeof(
int32_t) > 
sizeof(T))
 
   78         p_remote.x = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(p));
 
   82     else if constexpr(
sizeof(
int32_t) == 
sizeof(T))
 
   85             __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(v_local));
 
   87         return bit_cast<T>(v_remote_tmp);
 
   91         static_assert(
sizeof(T) % 
sizeof(
int32_t) == 0, 
"wrong!");
 
   94         auto vs               = bit_cast<vector_type>(v_local);
 
   95         auto vs_remote        = vector_type{};
 
   97             int32_t tmp = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(vs[i_e]));
 
  100         return bit_cast<T>(vs_remote);
 
  105 template <
typename T>
 
  108     static_assert(
sizeof(T) == 4);
 
  111     asm volatile(
"v_cmp_ge_u32 %[s_exec_flag], %[v_flag], 1" 
  112                  : [s_exec_flag] 
"=s"(exec_flag)
 
  113                  : [v_flag] 
"v"(v_flag));
 
  117 template <
typename X, 
typename Y>
 
  120     static_assert(
sizeof(X) == 4 && 
sizeof(Y) == 4);
 
  123     asm volatile(
"v_cmp_lt_u32 %[s_exec_flag], %[v_x], %[v_y]" 
  124                  : [s_exec_flag] 
"=s"(exec_flag)
 
  125                  : [v_x] 
"v"(x), [v_y] 
"v"(y));
 
#define CK_TILE_DEVICE
Definition: config.hpp:40
 
Definition: cluster_descriptor.hpp:13
 
CK_TILE_DEVICE auto cmp_lt_to_exec(const X &x, const Y &y)
Definition: utility.hpp:118
 
CK_TILE_DEVICE T warp_shuffle_up(const T &v_local, uint32_t lane_delta)
Definition: utility.hpp:31
 
uint32_t uint32x2_t
Definition: vector_type.hpp:152
 
CK_TILE_DEVICE T warp_shuffle(const T &v_local, uint32_t src_lane)
Definition: utility.hpp:63
 
int32_t index_t
Definition: integer.hpp:9
 
CK_TILE_DEVICE T warp_shuffle_down(const T &v_local, uint32_t lane_delta)
Definition: utility.hpp:48
 
int32_t int32_t
Definition: integer.hpp:10
 
CK_TILE_DEVICE void m0_set_with_memory(index_t v)
Definition: utility.hpp:19
 
CK_TILE_DEVICE auto flag_to_exec(const T &v_flag)
Definition: utility.hpp:106
 
CK_TILE_DEVICE void m0_inc_with_memory(index_t v)
Definition: utility.hpp:25
 
__host__ constexpr __device__ index_t get_warp_size()
Definition: get_id.hpp:10
 
Definition: functional.hpp:43