Composable Kernel: include/ck_tile/core/arch/arch.hpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

// Address Space for AMDGCN
// https://llvm.org/docs/AMDGPUUsage.html#address-space

#include "ck_tile/core/config.hpp"
#include "ck_tile/core/numeric/integer.hpp"

#include <cstdint>
#include <type_traits>
#define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111
#define CK_TILE_VMCNT(cnt) \
    ([]() { static_assert(!((cnt) >> 6), "VMCNT only has 6 bits"); }(), \
     ((cnt) & 0b1111) | (((cnt) & 0b110000) << 10))
#define CK_TILE_EXPCNT(cnt) \
    ([]() { static_assert(!((cnt) >> 3), "EXP only has 3 bits"); }(), ((cnt) << 4))
#define CK_TILE_LGKMCNT(cnt) \
    ([]() { static_assert(!((cnt) >> 4), "LGKM only has 4 bits"); }(), ((cnt) << 8))
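
// Usage sketch (illustrative only, not part of the original header): the macros
// pack each counter into its field of the 16-bit s_waitcnt immediate (gfx9
// encoding: vmcnt in bits [3:0] and [15:14], expcnt in [6:4], lgkmcnt in [11:8]),
// so they can be OR-ed together. Note that, unlike waitcnt_arg further below, a
// field that is left out stays zero, i.e. it WILL be waited on.
#if 0
__device__ void example_waitcnt_macros()
{
    // wait until at most 4 vector-memory ops and 0 LGKM ops remain outstanding
    __builtin_amdgcn_s_waitcnt(CK_TILE_VMCNT(4) | CK_TILE_LGKMCNT(0));
}
#endif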

namespace ck_tile {

template <typename, bool>
struct safe_underlying_type;

template <typename T>
struct safe_underlying_type<T, true>
{
    using type = std::underlying_type_t<T>;
};

template <typename T>
struct safe_underlying_type<T, false>
{
    using type = void;
};

template <typename T>
using safe_underlying_type_t = typename safe_underlying_type<T, std::is_enum<T>::value>::type;
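
// Illustrative check (not in the original header): safe_underlying_type_t yields
// the underlying integer type for enums and void otherwise, so it can be
// instantiated with non-enum types without triggering a hard error inside
// std::underlying_type_t. The example enum is hypothetical.
#if 0
enum struct example_enum : std::uint8_t { a, b };
static_assert(std::is_same_v<safe_underlying_type_t<example_enum>, std::uint8_t>, "");
static_assert(std::is_same_v<safe_underlying_type_t<int>, void>, "");
#endif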

enum struct address_space_enum : std::uint16_t
{
    generic = 0,
    global,
    lds,
    sgpr,
    constant,
    vgpr
};
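
// Illustrative note (not in the original header): address_space_enum::constant
// (= 4) matches the AMDGPU "Constant" LLVM address space, which is what the
// CK_CONSTANT_ADDRESS_SPACE macro below feeds into __attribute__((address_space(N))).
#if 0
static_assert(static_cast<std::uint16_t>(address_space_enum::constant) == 4, "");
#endif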

enum struct memory_operation_enum : std::uint16_t
{
    set = 0,
    atomic_add,
    atomic_max,
    add
};

CK_TILE_HOST_DEVICE constexpr index_t get_warp_size()
{
#if defined(__GFX9__) || !defined(__HIP_DEVICE_COMPILE__)
    return 64;
#else
    return 32;
#endif
}
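
// Illustrative note (not in the original header): get_warp_size() is constexpr,
// so it can size compile-time data; it reports 64 on GFX9 (wave64) targets and
// during the host pass, and 32 on wave32 targets.
#if 0
constexpr index_t kLanes = get_warp_size(); // hypothetical compile-time use
static_assert(kLanes == 32 || kLanes == 64, "");
#endif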

CK_TILE_DEVICE index_t get_grid_size() { return gridDim.x; }

CK_TILE_DEVICE index_t get_block_size() { return blockDim.x; }

// TODO: deprecate these
CK_TILE_DEVICE index_t get_thread_local_1d_id() { return threadIdx.x; }

CK_TILE_DEVICE index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; }

CK_TILE_DEVICE index_t get_block_1d_id() { return blockIdx.x; }

// Use these instead
CK_TILE_DEVICE index_t get_lane_id() { return __lane_id(); }

CK_TILE_DEVICE index_t get_warp_id()
{
    return __builtin_amdgcn_readfirstlane(threadIdx.x / get_warp_size());
}

CK_TILE_DEVICE index_t get_thread_id() { return threadIdx.x; }

CK_TILE_DEVICE index_t get_block_id() { return blockIdx.x; }
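
// Usage sketch (illustrative, not part of the header): decomposing the 1-D
// thread index into (warp, lane) with the preferred accessors; the
// readfirstlane inside get_warp_id() turns the warp index into a scalar (SGPR)
// value that is uniform across the wavefront.
#if 0
__global__ void example_ids(float* out)
{
    const index_t warp = get_warp_id(); // wavefront-uniform
    const index_t lane = get_lane_id(); // per-lane
    out[get_block_id() * get_block_size() + warp * get_warp_size() + lane] = 1.f;
}
#endif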

CK_TILE_DEVICE void block_sync_lds()
{
#if CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
    // asm volatile("\
    // s_waitcnt lgkmcnt(0) \n \
    // s_barrier \
    // " ::);

    // 0xc07f waits lgkmcnt(0) while leaving vmcnt/expcnt at their maxima,
    // i.e. it does not synchronize outstanding vector-memory operations
    __builtin_amdgcn_s_waitcnt(0xc07f);
    __builtin_amdgcn_s_barrier();
#else
    __syncthreads();
#endif
}
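
// Usage sketch (illustrative, not part of the header): the canonical
// write -> sync -> read LDS pattern this barrier is meant for; tile size and
// block size (256) are hypothetical.
#if 0
__global__ void example_lds(const float* in, float* out)
{
    __shared__ float tile[256];
    tile[threadIdx.x] = in[get_thread_global_1d_id()];
    block_sync_lds(); // all LDS writes visible block-wide
    out[get_thread_global_1d_id()] = tile[255 - threadIdx.x];
}
#endif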

CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
{
    // the "n" constraints require cnt to fold to a compile-time constant after inlining
#ifdef __gfx12__
    asm volatile("s_wait_loadcnt %0 \n"
                 "s_barrier_signal -1 \n"
                 "s_barrier_wait -1"
                 :
                 : "n"(cnt)
                 : "memory");
#else
    asm volatile("s_waitcnt vmcnt(%0) \n"
                 "s_barrier"
                 :
                 : "n"(cnt)
                 : "memory");
#endif
}

// https://llvm.org/docs/AMDGPU/gfx9_waitcnt.html
struct waitcnt_arg
{
    // bit numbers (hex) -------------------------> FE'DC'BA98'7'654'3210
    // [V]M [E]XP [L]GKM counters and [U]NUSED ---> VV'UU'LLLL'U'EEE'VVVV
    CK_TILE_DEVICE static constexpr index_t MAX = 0b11'00'1111'0'111'1111;

    CK_TILE_DEVICE static constexpr index_t kMaxVmCnt   = 0b111111;
    CK_TILE_DEVICE static constexpr index_t kMaxExpCnt  = 0b111;
    CK_TILE_DEVICE static constexpr index_t kMaxLgkmCnt = 0b1111;

    template <index_t cnt>
    CK_TILE_DEVICE static constexpr index_t from_vmcnt()
    {
        static_assert(cnt >= 0 && !(cnt >> 6), "valid range is [0..63]");
        return MAX & ((cnt & 0b1111) | ((cnt & 0b110000) << 10));
    }

    template <index_t cnt>
    CK_TILE_DEVICE static constexpr index_t from_expcnt()
    {
        static_assert(cnt >= 0 && !(cnt >> 3), "valid range is [0..7]");
        return MAX & (cnt << 4);
    }

    template <index_t cnt>
    CK_TILE_DEVICE static constexpr index_t from_lgkmcnt()
    {
        static_assert(cnt >= 0 && !(cnt >> 4), "valid range is [0..15]");
        return MAX & (cnt << 8);
    }
};
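
// Illustrative check (not in the original header): each from_* helper returns
// only its own bit-field, so OR-ing them composes a complete s_waitcnt
// immediate; e.g. a vmcnt of 0b110101 splits into bits [3:0] and [15:14].
#if 0
static_assert(waitcnt_arg::from_vmcnt<0b110101>() == 0b1100'0000'0000'0101, "");
static_assert((waitcnt_arg::from_vmcnt<0>() | waitcnt_arg::from_expcnt<waitcnt_arg::kMaxExpCnt>() |
               waitcnt_arg::from_lgkmcnt<0>()) == 0b0000'0000'0111'0000, "");
#endif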

template <index_t vmcnt   = waitcnt_arg::kMaxVmCnt,
          index_t expcnt  = waitcnt_arg::kMaxExpCnt,
          index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
CK_TILE_DEVICE void s_waitcnt()
{
    __builtin_amdgcn_s_waitcnt(waitcnt_arg::from_vmcnt<vmcnt>() |
                               waitcnt_arg::from_expcnt<expcnt>() |
                               waitcnt_arg::from_lgkmcnt<lgkmcnt>());
}

template <index_t vmcnt   = waitcnt_arg::kMaxVmCnt,
          index_t expcnt  = waitcnt_arg::kMaxExpCnt,
          index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
CK_TILE_DEVICE void s_waitcnt_barrier()
{
    s_waitcnt<vmcnt, expcnt, lgkmcnt>();
    __builtin_amdgcn_s_barrier();
}
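
// Usage sketch (illustrative, not part of the header): a counter left at its
// default maximum is not waited on, so s_waitcnt<0>() drains only vector-memory
// operations, and s_waitcnt_barrier<0, waitcnt_arg::kMaxExpCnt, 0>() reproduces
// the barrier used by block_sync_lds_direct_load() below.
#if 0
__device__ void example_waitcnt()
{
    s_waitcnt<0>();                                     // wait vmcnt(0) only
    s_waitcnt_barrier<0, waitcnt_arg::kMaxExpCnt, 0>(); // vmcnt(0) + lgkmcnt(0) + s_barrier
}
#endif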

CK_TILE_DEVICE void block_sync_lds_direct_load()
{
#if 1
    // invoke clang builtins which *should* produce the same result as the inline asm below;
    // difference: the inline asm gets compiled to wait vmcnt(0) after the barrier
    s_waitcnt_barrier<0, waitcnt_arg::kMaxExpCnt, 0>();
#else
    // same content as in old CK (#999)
    asm volatile("\
    s_waitcnt vmcnt(0) \n \
    s_waitcnt lgkmcnt(0) \n \
    s_barrier \
    " ::);
#endif
}

CK_TILE_DEVICE void s_nop(index_t cnt = 0)
{
#if 1
    // the "n" constraint requires cnt to fold to a compile-time constant after inlining
    asm volatile("s_nop %0" : : "n"(cnt) :);
#else
    __builtin_amdgcn_sched_barrier(cnt);
#endif
}

#define CK_CONSTANT_ADDRESS_SPACE \
    __attribute__((address_space( \
        static_cast<safe_underlying_type_t<address_space_enum>>(address_space_enum::constant))))

template <typename T>
CK_TILE_DEVICE T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p)
{
    // cast a pointer in "Constant" address space (4) to "Generic" address space (0);
    // only a C-style pointer cast seems to compile
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
    return (T*)(p); // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}

template <typename T>
CK_TILE_HOST_DEVICE T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p)
{
    // cast a pointer in "Generic" address space (0) to "Constant" address space (4);
    // only a C-style pointer cast seems to compile
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
    return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}
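
// Usage sketch (illustrative, not part of the header): a pointer can be tagged
// with the constant address space on the host side and converted back to a
// generic pointer inside the kernel; the kernel below is hypothetical.
#if 0
__global__ void example_cast(const float CK_CONSTANT_ADDRESS_SPACE* p_const)
{
    const float* p = cast_pointer_to_generic_address_space(p_const);
    (void)p;
}
// host side: pass cast_pointer_to_constant_address_space(p_device) as the argument
#endif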

constexpr CK_TILE_HOST_DEVICE index_t get_smem_capacity()
{
#if defined(__gfx950__)
    return 163840; // 160 KiB of LDS
#else
    return 65536; // 64 KiB of LDS
#endif
}
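
// Illustrative check (not in the original header): since get_smem_capacity() is
// constexpr, a kernel can verify at compile time that its LDS tile fits; the
// tile size below is hypothetical.
#if 0
constexpr index_t kTileBytes = 256 * 256 * sizeof(float) / 16;
static_assert(kTileBytes <= get_smem_capacity(), "tile does not fit in LDS");
#endif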

} // namespace ck_tile