Go to the source code of this file.
 | 
| #define  | CK_TILE_S_CNT_MAX   0b1100'1111'0111'1111#define CK_TILE_VMCNT(cnt)                                              \    ([]() { static_assert(!((cnt) >> 6), "VMCNT only has 6 bits"); }(), \     ((cnt) & 0b1111) | (((cnt) & 0b110000) << 10))#define CK_TILE_EXPCNT(cnt) \    ([]() { static_assert(!((cnt) >> 3), "EXP only has 3 bits"); }(), ((cnt) << 4))#define CK_TILE_LGKMCNT(cnt) \    ([]() { static_assert(!((cnt) >> 4), "LGKM only has 4 bits"); }(), ((cnt) << 8))namespace ck_tile {template <typename, bool>struct safe_underlying_type;template <typename T>struct safe_underlying_type<T, true>{    using type = std::underlying_type_t<T>;};template <typename T>struct safe_underlying_type<T, false>{    using type = void;};template <typename T>using safe_underlying_type_t = typename safe_underlying_type<T, std::is_enum<T>::value>::type;enum struct address_space_enum : std::uint16_t{    generic = 0,    global,    lds,    sgpr,    constant,    vgpr};enum struct memory_operation_enum : std::uint16_t{    set = 0,    atomic_add,    atomic_max,    add};CK_TILE_HOST_DEVICE constexpr index_t get_warp_size(){#if defined(__GFX9__) || !defined(__HIP_DEVICE_COMPILE__)    return 64;#else    return 32;#endif}CK_TILE_DEVICE index_t get_grid_size() { return gridDim.x; }CK_TILE_DEVICE index_t get_block_size() { return blockDim.x; }CK_TILE_DEVICE index_t get_thread_local_1d_id() { return threadIdx.x; }CK_TILE_DEVICE index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; }CK_TILE_DEVICE index_t get_block_1d_id() { return blockIdx.x; }CK_TILE_DEVICE index_t get_lane_id() { return __lane_id(); }CK_TILE_DEVICE index_t get_warp_id(){    return __builtin_amdgcn_readfirstlane(threadIdx.x / get_warp_size());}CK_TILE_DEVICE index_t get_thread_id() { return threadIdx.x; }CK_TILE_DEVICE index_t get_block_id() { return blockIdx.x; }CK_TILE_DEVICE void block_sync_lds(){#if CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM                    __builtin_amdgcn_s_waitcnt(0xc07f);    __builtin_amdgcn_s_barrier();#else    __syncthreads();#endif}CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0){#ifdef __gfx12__    asm volatile("s_wait_loadcnt %0 \n"                 "s_barrier_signal -1 \n"                 "s_barrier_wait -1"                 :                 : "n"(cnt)                 : "memory");#else    asm volatile("s_waitcnt vmcnt(%0) \n"                 "s_barrier"                 :                 : "n"(cnt)                 : "memory");#endif}struct waitcnt_arg{            CK_TILE_DEVICE static constexpr index_t MAX = 0b11'00'1111'0'111'1111; | 
|   | 
| #define  | CK_CONSTANT_ADDRESS_SPACE | 
|   | 
◆ CK_CONSTANT_ADDRESS_SPACE
      
        
          | #define CK_CONSTANT_ADDRESS_SPACE | 
        
      
 
Value:    __attribute__((address_space( \
        static_cast<safe_underlying_type_t<address_space_enum>>(address_space_enum::constant))))
 
 
 
◆ CK_TILE_S_CNT_MAX
      
        
          | #define CK_TILE_S_CNT_MAX   0b1100'1111'0111'1111#define CK_TILE_VMCNT(cnt)                                              \    ([]() { static_assert(!((cnt) >> 6), "VMCNT only has 6 bits"); }(), \     ((cnt) & 0b1111) | (((cnt) & 0b110000) << 10))#define CK_TILE_EXPCNT(cnt) \    ([]() { static_assert(!((cnt) >> 3), "EXP only has 3 bits"); }(), ((cnt) << 4))#define CK_TILE_LGKMCNT(cnt) \    ([]() { static_assert(!((cnt) >> 4), "LGKM only has 4 bits"); }(), ((cnt) << 8))namespace ck_tile {template <typename, bool>struct safe_underlying_type;template <typename T>struct safe_underlying_type<T, true>{    using type = std::underlying_type_t<T>;};template <typename T>struct safe_underlying_type<T, false>{    using type = void;};template <typename T>using safe_underlying_type_t = typename safe_underlying_type<T, std::is_enum<T>::value>::type;enum struct address_space_enum : std::uint16_t{    generic = 0,    global,    lds,    sgpr,    constant,    vgpr};enum struct memory_operation_enum : std::uint16_t{    set = 0,    atomic_add,    atomic_max,    add};CK_TILE_HOST_DEVICE constexpr index_t get_warp_size(){#if defined(__GFX9__) || !defined(__HIP_DEVICE_COMPILE__)    return 64;#else    return 32;#endif}CK_TILE_DEVICE index_t get_grid_size() { return gridDim.x; }CK_TILE_DEVICE index_t get_block_size() { return blockDim.x; }CK_TILE_DEVICE index_t get_thread_local_1d_id() { return threadIdx.x; }CK_TILE_DEVICE index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; }CK_TILE_DEVICE index_t get_block_1d_id() { return blockIdx.x; }CK_TILE_DEVICE index_t get_lane_id() { return __lane_id(); }CK_TILE_DEVICE index_t get_warp_id(){    return __builtin_amdgcn_readfirstlane(threadIdx.x / get_warp_size());}CK_TILE_DEVICE index_t get_thread_id() { return threadIdx.x; }CK_TILE_DEVICE index_t get_block_id() { return blockIdx.x; }CK_TILE_DEVICE void block_sync_lds(){#if CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM                    __builtin_amdgcn_s_waitcnt(0xc07f);    __builtin_amdgcn_s_barrier();#else    __syncthreads();#endif}CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0){#ifdef __gfx12__    asm volatile("s_wait_loadcnt %0 \n"                 "s_barrier_signal -1 \n"                 "s_barrier_wait -1"                 :                 : "n"(cnt)                 : "memory");#else    asm volatile("s_waitcnt vmcnt(%0) \n"                 "s_barrier"                 :                 : "n"(cnt)                 : "memory");#endif}struct waitcnt_arg{            CK_TILE_DEVICE static constexpr index_t MAX = 0b11'00'1111'0'111'1111; | 
        
      
 
 
◆ block_sync_lds_direct_load()
◆ cast_pointer_to_constant_address_space()
◆ cast_pointer_to_generic_address_space()
◆ get_smem_capacity()
◆ s_nop()
◆ s_waitcnt()
template<index_t vmcnt = waitcnt_arg::kMaxVmCnt, index_t expcnt = waitcnt_arg::kMaxExpCnt, index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt> 
      
 
 
◆ s_waitcnt_barrier()
template<index_t vmcnt = waitcnt_arg::kMaxVmCnt, index_t expcnt = waitcnt_arg::kMaxExpCnt, index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>