17 #define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111
18 #define CK_TILE_VMCNT(cnt) \
19 ([]() { static_assert(!((cnt) >> 6), "VMCNT only has 6 bits"); }(), \
20 ((cnt) & 0b1111) | (((cnt) & 0b110000) << 10))
21 #define CK_TILE_EXPCNT(cnt) \
22 ([]() { static_assert(!((cnt) >> 3), "EXP only has 3 bits"); }(), ((cnt) << 4))
23 #define CK_TILE_LGKMCNT(cnt) \
24 ([]() { static_assert(!((cnt) >> 4), "LGKM only has 4 bits"); }(), ((cnt) << 8))
28 template <
typename,
bool>
29 struct safe_underlying_type;
32 struct safe_underlying_type<T, true>
34 using type = std::underlying_type_t<T>;
38 struct safe_underlying_type<T, false>
64 namespace core::arch {
70 enum struct amdgcn_target_id
82 GFX103_GENERIC = 0x103F,
90 GFX11_GENERIC = 0x11FF,
93 GFX12_GENERIC = 0x12FF,
97 enum struct amdgcn_target_family_id
106 enum struct amdgcn_target_arch_id
113 enum struct amdgcn_target_wave_size_id
122 template <amdgcn_target_id TargetId = amdgcn_target_id::HOST,
123 amdgcn_target_family_id FamilyId = amdgcn_target_family_id::HOST,
124 amdgcn_target_arch_id ArchId = amdgcn_target_arch_id::HOST,
125 amdgcn_target_wave_size_id WaveSizeId = amdgcn_target_wave_size_id::HOST>
128 static constexpr amdgcn_target_id TARGET_ID = TargetId;
129 static constexpr amdgcn_target_family_id FAMILY_ID = FamilyId;
130 static constexpr amdgcn_target_arch_id ARCH_ID = ArchId;
131 static constexpr amdgcn_target_wave_size_id WAVE_SIZE_ID = WaveSizeId;
134 template <amdgcn_target_
id targetId>
135 static constexpr
auto make_amdgcn_gfx9_target()
137 return amdgcn_target<targetId,
138 amdgcn_target_family_id::GFX9,
139 amdgcn_target_arch_id::CDNA,
140 amdgcn_target_wave_size_id::WAVE64>{};
143 template <amdgcn_target_
id targetId>
144 static constexpr
auto make_amdgcn_gfx10_3_target()
146 return amdgcn_target<targetId,
147 amdgcn_target_family_id::GFX10_3,
148 amdgcn_target_arch_id::RDNA,
149 amdgcn_target_wave_size_id::WAVE32>{};
152 template <amdgcn_target_
id targetId>
153 static constexpr
auto make_amdgcn_gfx11_target()
155 return amdgcn_target<targetId,
156 amdgcn_target_family_id::GFX11,
157 amdgcn_target_arch_id::RDNA,
158 amdgcn_target_wave_size_id::WAVE32>{};
161 template <amdgcn_target_
id targetId>
162 static constexpr
auto make_amdgcn_gfx12_target()
164 return amdgcn_target<targetId,
165 amdgcn_target_family_id::GFX12,
166 amdgcn_target_arch_id::RDNA,
167 amdgcn_target_wave_size_id::WAVE32>{};
170 template <
typename CompilerTarget, amdgcn_target_id... TargetIds>
171 static constexpr
auto is_target_id_any_of()
173 return is_any_value_of(CompilerTarget::TARGET_ID, TargetIds...);
176 template <
typename CompilerTarget, amdgcn_target_family_id... FamilyIds>
177 static constexpr
auto is_target_family_any_of()
179 return is_any_value_of(CompilerTarget::FAMILY_ID, FamilyIds...);
182 template <
typename CompilerTarget>
183 static constexpr
bool is_target_family_gfx9()
185 return CompilerTarget::FAMILY_ID == amdgcn_target_family_id::GFX9;
188 template <
typename CompilerTarget>
189 static constexpr
bool is_target_family_gfx10_3()
191 return CompilerTarget::FAMILY_ID == amdgcn_target_family_id::GFX10_3;
194 template <
typename CompilerTarget>
195 static constexpr
bool is_target_family_gfx11()
197 return CompilerTarget::FAMILY_ID == amdgcn_target_family_id::GFX11;
200 template <
typename CompilerTarget>
201 static constexpr
bool is_target_family_gfx12()
203 return CompilerTarget::FAMILY_ID == amdgcn_target_family_id::GFX12;
206 template <
typename CompilerTarget>
207 static constexpr
bool is_target_arch_cdna()
209 return CompilerTarget::ARCH_ID == amdgcn_target_arch_id::CDNA;
212 template <
typename CompilerTarget>
213 static constexpr
bool is_target_arch_rdna()
215 return CompilerTarget::ARCH_ID == amdgcn_target_arch_id::RDNA;
218 template <
typename CompilerTarget>
219 static constexpr
bool is_target_wave_size_32()
221 return CompilerTarget::WAVE_SIZE_ID == amdgcn_target_wave_size_id::WAVE32;
224 template <
typename CompilerTarget>
225 static constexpr
bool is_target_wave_size_64()
227 return CompilerTarget::WAVE_SIZE_ID == amdgcn_target_wave_size_id::WAVE64;
232 #define MAP_COMPILER_STATE_TO_GFX9_TARGET(COMPILER_STATE, TARGET_ID) \
233 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
235 return make_amdgcn_gfx9_target<amdgcn_target_id::TARGET_ID>(); \
239 #define MAP_COMPILER_STATE_TO_GFX10_3_TARGET(COMPILER_STATE, TARGET_ID) \
240 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
242 return make_amdgcn_gfx10_3_target<amdgcn_target_id::TARGET_ID>(); \
246 #define MAP_COMPILER_STATE_TO_GFX11_TARGET(COMPILER_STATE, TARGET_ID) \
247 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
249 return make_amdgcn_gfx11_target<amdgcn_target_id::TARGET_ID>(); \
253 #define MAP_COMPILER_STATE_TO_GFX12_TARGET(COMPILER_STATE, TARGET_ID) \
254 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
256 return make_amdgcn_gfx12_target<amdgcn_target_id::TARGET_ID>(); \
265 constexpr
auto get_compiler_target()
267 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX908, GFX908);
268 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX90A, GFX90A);
269 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX942, GFX942);
270 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX950, GFX950);
271 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1030, GFX1030);
272 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1031, GFX1031);
273 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1032, GFX1032);
274 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1034, GFX1034);
275 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1035, GFX1035);
276 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1036, GFX1036);
277 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX10_3_GENERIC, GFX103_GENERIC);
278 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1100, GFX1100);
279 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1101, GFX1101);
280 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1102, GFX1102);
281 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1103, GFX1103);
282 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1150, GFX1150);
283 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1151, GFX1151);
284 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1152, GFX1152);
285 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX11_GENERIC, GFX11_GENERIC);
286 MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX1200, GFX1200);
287 MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX1201, GFX1201);
288 MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX12_GENERIC, GFX12_GENERIC);
291 if constexpr(amdgcn_compiler_target_state::CK_TILE_HOST_COMPILE)
293 return amdgcn_target<>{};
298 #undef MAP_COMPILER_STATE_TO_GFX9_TARGET
299 #undef MAP_COMPILER_STATE_TO_GFX10_3_TARGET
300 #undef MAP_COMPILER_STATE_TO_GFX11_TARGET
301 #undef MAP_COMPILER_STATE_TO_GFX12_TARGET
304 static_assert(!amdgcn_compiler_target_state::CK_TILE_DEVICE_COMPILE ||
305 get_compiler_target().TARGET_ID != amdgcn_target_id::HOST,
306 "Device compile must have a valid target device architecture");
309 static_assert(!amdgcn_compiler_target_state::CK_TILE_HOST_COMPILE ||
310 get_compiler_target().TARGET_ID == amdgcn_target_id::HOST,
311 "Host compile must target HOST architecture");
315 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(NAME_STRING, TARGET_ID) \
316 if(str.find(NAME_STRING) != std::string::npos) \
318 return amdgcn_target_id::TARGET_ID; \
330 CK_TILE_HOST auto hip_device_prop_gcn_arch_name_to_amdgcn_target_id(
char const* testStr)
332 auto str = std::string(testStr);
333 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx908", GFX908);
334 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx90a", GFX90A);
335 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx942", GFX942);
336 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx950", GFX950);
337 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1030", GFX1030);
338 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1031", GFX1031);
339 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1032", GFX1032);
340 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1034", GFX1034);
341 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1035", GFX1035);
342 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1036", GFX1036);
343 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx10_3_generic", GFX103_GENERIC);
344 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1100", GFX1100);
345 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1101", GFX1101);
346 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1102", GFX1102);
347 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1103", GFX1103);
348 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1150", GFX1150);
349 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1151", GFX1151);
350 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1152", GFX1152);
351 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx11_generic", GFX11_GENERIC);
352 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1200", GFX1200);
353 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1201", GFX1201);
354 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx12_generic", GFX12_GENERIC);
357 return amdgcn_target_id::HOST;
360 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID
368 template <
typename CompilerTarget, amdgcn_target_id... SupportedTargetIds>
369 using enable_if_target_id_t =
370 std::enable_if_t<is_any_value_of(CompilerTarget::TARGET_ID, SupportedTargetIds...)>;
379 template <
typename CompilerTarget, amdgcn_target_family_id... SupportedTargetFamilyIds>
380 using enable_if_target_family_id_t =
381 std::enable_if_t<is_any_value_of(CompilerTarget::FAMILY_ID, SupportedTargetFamilyIds...)>;
388 template <
typename CompilerTarget, amdgcn_target_arch_id... SupportedTargetArchIds>
389 using enable_if_target_arch_id_t =
390 std::enable_if_t<is_any_value_of(CompilerTarget::ARCH_ID, SupportedTargetArchIds...)>;
399 template <
typename CompilerTarget, amdgcn_target_wave_size_id... SupportedTargetWaveSizeIds>
400 using enable_if_target_wave_size_id_t =
401 std::enable_if_t<is_any_value_of(CompilerTarget::WAVE_SIZE_ID, SupportedTargetWaveSizeIds...)>;
409 template <
typename CompilerTarget>
410 using enable_if_target_family_gfx9_t =
411 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX9>;
417 template <
typename CompilerTarget>
418 using enable_if_target_family_gfx10_3_t =
419 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX10_3>;
425 template <
typename CompilerTarget>
426 using enable_if_target_family_gfx11_t =
427 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX11>;
433 template <
typename CompilerTarget>
434 using enable_if_target_family_gfx12_t =
435 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX12>;
441 template <
typename CompilerTarget>
442 using enable_if_target_arch_cdna_t =
443 enable_if_target_arch_id_t<CompilerTarget, amdgcn_target_arch_id::CDNA>;
449 template <
typename CompilerTarget>
450 using enable_if_target_arch_rdna_t =
451 enable_if_target_arch_id_t<CompilerTarget, amdgcn_target_arch_id::RDNA>;
457 template <
typename CompilerTarget>
458 using enable_if_target_wave32_t =
459 enable_if_target_wave_size_id_t<CompilerTarget, amdgcn_target_wave_size_id::WAVE32>;
465 template <
typename CompilerTarget>
466 using enable_if_target_wave64_t =
467 enable_if_target_wave_size_id_t<CompilerTarget, amdgcn_target_wave_size_id::WAVE64>;
469 #elif __cplusplus >= 202002L
479 const amdgcn_target_id TARGET_ID = amdgcn_target_id::HOST;
480 const amdgcn_target_family_id FAMILY_ID = amdgcn_target_family_id::HOST;
481 const amdgcn_target_arch_id ARCH_ID = amdgcn_target_arch_id::HOST;
482 const amdgcn_target_wave_size_id WAVE_SIZE_ID = amdgcn_target_wave_size_id::HOST;
485 static constexpr
auto make_amdgcn_gfx10_3_target(amdgcn_target_id targetId)
487 return amdgcn_target{.TARGET_ID = targetId,
488 .FAMILY_ID = amdgcn_target_family_id::GFX10_3,
489 .ARCH_ID = amdgcn_target_arch_id::RDNA,
490 .WAVE_SIZE_ID = amdgcn_target_wave_size_id::WAVE32};
493 static constexpr
auto make_amdgcn_gfx9_target(amdgcn_target_id targetId)
495 return amdgcn_target{.TARGET_ID = targetId,
496 .FAMILY_ID = amdgcn_target_family_id::GFX9,
497 .ARCH_ID = amdgcn_target_arch_id::CDNA,
498 .WAVE_SIZE_ID = amdgcn_target_wave_size_id::WAVE64};
501 static constexpr
auto make_amdgcn_gfx11_target(amdgcn_target_id targetId)
503 return amdgcn_target{.TARGET_ID = targetId,
504 .FAMILY_ID = amdgcn_target_family_id::GFX11,
505 .ARCH_ID = amdgcn_target_arch_id::RDNA,
506 .WAVE_SIZE_ID = amdgcn_target_wave_size_id::WAVE32};
509 static constexpr
auto make_amdgcn_gfx12_target(amdgcn_target_id targetId)
511 return amdgcn_target{.TARGET_ID = targetId,
512 .FAMILY_ID = amdgcn_target_family_id::GFX12,
513 .ARCH_ID = amdgcn_target_arch_id::RDNA,
514 .WAVE_SIZE_ID = amdgcn_target_wave_size_id::WAVE32};
517 static constexpr
bool is_target_family_gfx9(amdgcn_target target)
519 return target.FAMILY_ID == amdgcn_target_family_id::GFX9;
522 static constexpr
bool is_target_family_gfx10_3(amdgcn_target target)
524 return target.FAMILY_ID == amdgcn_target_family_id::GFX10_3;
527 static constexpr
bool is_target_family_gfx11(amdgcn_target target)
529 return target.FAMILY_ID == amdgcn_target_family_id::GFX11;
532 static constexpr
bool is_target_family_gfx12(amdgcn_target target)
534 return target.FAMILY_ID == amdgcn_target_family_id::GFX12;
537 static constexpr
bool is_target_arch_cdna(amdgcn_target target)
539 return target.ARCH_ID == amdgcn_target_arch_id::CDNA;
542 static constexpr
bool is_target_arch_rdna(amdgcn_target target)
544 return target.ARCH_ID == amdgcn_target_arch_id::RDNA;
547 static constexpr
bool is_target_wave_size_32(amdgcn_target target)
549 return target.WAVE_SIZE_ID == amdgcn_target_wave_size_id::WAVE32;
552 static constexpr
bool is_target_wave_size_64(amdgcn_target target)
554 return target.WAVE_SIZE_ID == amdgcn_target_wave_size_id::WAVE64;
558 #define MAP_COMPILER_STATE_TO_GFX10_3_TARGET(COMPILER_STATE, TARGET_ID) \
559 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
561 return make_amdgcn_gfx9_target(amdgcn_target_id::TARGET_ID); \
564 #define MAP_COMPILER_STATE_TO_GFX9_TARGET(COMPILER_STATE, TARGET_ID) \
565 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
567 return make_amdgcn_gfx9_target(amdgcn_target_id::TARGET_ID); \
570 #define MAP_COMPILER_STATE_TO_GFX11_TARGET(COMPILER_STATE, TARGET_ID) \
571 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
573 return make_amdgcn_gfx11_target(amdgcn_target_id::TARGET_ID); \
576 #define MAP_COMPILER_STATE_TO_GFX12_TARGET(COMPILER_STATE, TARGET_ID) \
577 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
579 return make_amdgcn_gfx12_target(amdgcn_target_id::TARGET_ID); \
588 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX908, GFX908);
589 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX90A, GFX90A);
590 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX942, GFX942);
591 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX950, GFX950);
592 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1030, GFX1030);
593 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1031, GFX1031);
594 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1032, GFX1032);
595 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1034, GFX1034);
596 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1035, GFX1035);
597 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1036, GFX1036);
598 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX10_3_GENERIC, GFX103_GENERIC);
599 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1100, GFX1100);
600 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1101, GFX1101);
601 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1102, GFX1102);
602 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1103, GFX1103);
603 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1150, GFX1150);
604 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1151, GFX1151);
605 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1152, GFX1152);
606 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX11_GENERIC, GFX11_GENERIC);
607 MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX1200, GFX1200);
608 MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX1201, GFX1201);
609 MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX12_GENERIC, GFX12_GENERIC);
612 return amdgcn_target{};
616 #undef MAP_COMPILER_STATE_TO_GFX9_TARGET
617 #undef MAP_COMPILER_STATE_TO_GFX10_3_TARGET
618 #undef MAP_COMPILER_STATE_TO_GFX11_TARGET
619 #undef MAP_COMPILER_STATE_TO_GFX12_TARGET
622 static_assert(!amdgcn_compiler_target_state::CK_TILE_DEVICE_COMPILE ||
623 get_compiler_target().TARGET_ID != amdgcn_target_id::HOST,
624 "Device compile must have a valid target device architecture");
627 static_assert(!amdgcn_compiler_target_state::CK_TILE_HOST_COMPILE ||
628 get_compiler_target().TARGET_ID == amdgcn_target_id::HOST,
629 "Host compile must target HOST architecture");
631 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET(NAME_STRING, TARGET_ID) \
632 if constexpr(str.find(NAME_STRING) != std::string::npos) \
634 return make_amdgcn_gfx9_target(amdgcn_target_id::TARGET_ID); \
638 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(NAME_STRING, TARGET_ID) \
639 if constexpr(str.find(NAME_STRING) != std::string::npos) \
641 return make_amdgcn_gfx10_3_target(amdgcn_target_id::TARGET_ID); \
645 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(NAME_STRING, TARGET_ID) \
646 if constexpr(str.find(NAME_STRING) != std::string::npos) \
648 return make_amdgcn_gfx11_target(amdgcn_target_id::TARGET_ID); \
652 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET(NAME_STRING, TARGET_ID) \
653 if constexpr(str.find(NAME_STRING) != std::string::npos) \
655 return make_amdgcn_gfx12_target(amdgcn_target_id::TARGET_ID); \
665 CK_TILE_HOST auto hip_device_prop_gcn_arch_name_to_amdgcn_target(
char const* testStr)
667 auto str = std::string(testStr);
668 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET(
"gfx908", GFX908);
669 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET(
"gfx90a", GFX90A);
670 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET(
"gfx942", GFX942);
671 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET(
"gfx950", GFX950);
672 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx1030", GFX1030);
673 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx1031", GFX1031);
674 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx1032", GFX1032);
675 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx1034", GFX1034);
676 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx1035", GFX1035);
677 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx1036", GFX1036);
678 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx10_3_generic", GFX103_GENERIC);
679 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1100", GFX1100);
680 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1101", GFX1101);
681 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1102", GFX1102);
682 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1103", GFX1103);
683 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1150", GFX1150);
684 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1151", GFX1151);
685 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1152", GFX1152);
686 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx11_generic", GFX11_GENERIC);
687 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET(
"gfx1200", GFX1200);
688 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET(
"gfx1201", GFX1201);
689 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET(
"gfx12_generic", GFX12_GENERIC);
692 return amdgcn_target{};
695 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET
696 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET
697 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET
698 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET
706 template <amdgcn_target CompilerTarget, amdgcn_target_id... SupportedTargetIds>
707 using enable_if_target_id_t =
708 std::enable_if_t<is_any_value_of(CompilerTarget.TARGET_ID, SupportedTargetIds...)>;
717 template <amdgcn_target CompilerTarget, amdgcn_target_family_id... SupportedTargetFamilyIds>
718 using enable_if_target_family_id_t =
719 std::enable_if_t<is_any_value_of(CompilerTarget.FAMILY_ID, SupportedTargetFamilyIds...)>;
726 template <amdgcn_target CompilerTarget, amdgcn_target_arch_id... SupportedTargetArchIds>
727 using enable_if_target_arch_id_t =
728 std::enable_if_t<is_any_value_of(CompilerTarget.ARCH_ID, SupportedTargetArchIds...)>;
737 template <amdgcn_target CompilerTarget, amdgcn_target_wave_size_id... SupportedTargetWaveSizeIds>
738 using enable_if_target_wave_size_id_t =
739 std::enable_if_t<is_any_value_of(CompilerTarget.WAVE_SIZE_ID, SupportedTargetWaveSizeIds...)>;
747 template <amdgcn_target CompilerTarget>
748 using enable_if_target_family_gfx9_t =
749 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX9>;
755 template <amdgcn_target CompilerTarget>
756 using enable_if_target_family_gfx10_3_t =
757 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX10_3>;
763 template <amdgcn_target CompilerTarget>
764 using enable_if_target_family_gfx11_t =
765 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX11>;
771 template <amdgcn_target CompilerTarget>
772 using enable_if_target_family_gfx12_t =
773 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX12>;
779 template <amdgcn_target CompilerTarget>
780 using enable_if_target_arch_cdna_t =
781 enable_if_target_arch_id_t<CompilerTarget, amdgcn_target_arch_id::CDNA>;
787 template <amdgcn_target CompilerTarget>
788 using enable_if_target_arch_rdna_t =
789 enable_if_target_arch_id_t<CompilerTarget, amdgcn_target_arch_id::RDNA>;
795 template <amdgcn_target CompilerTarget>
796 using enable_if_target_wave32_t =
797 enable_if_target_wave_size_id_t<CompilerTarget, amdgcn_target_wave_size_id::WAVE32>;
803 template <amdgcn_target CompilerTarget>
804 using enable_if_target_wave64_t =
805 enable_if_target_wave_size_id_t<CompilerTarget, amdgcn_target_wave_size_id::WAVE64>;
813 hipDeviceProp_t props{};
815 auto status = hipGetDevice(&device);
816 if(status != hipSuccess)
820 status = hipGetDeviceProperties(&props, device);
821 if(status != hipSuccess)
825 return props.major > 9;
832 return static_cast<index_t>(core::arch::get_compiler_target().WAVE_SIZE_ID);
849 template <
bool ReturnSgpr = true>
853 if constexpr(ReturnSgpr)
870 asm volatile(
"s_wait_loadcnt %0 \n"
871 "s_barrier_signal -1 \n"
877 asm volatile(
"s_waitcnt vmcnt(%0) \n"
885 struct WaitcntLayoutGfx12
896 struct WaitcntLayoutGfx11
907 struct WaitcntLayoutLegacy
917 return ((c & 0xF) << 0) | ((c & 0x30) << 10);
924 #if defined(__gfx12__)
925 using Waitcnt = WaitcntLayoutGfx12;
926 #elif defined(__gfx11__)
927 using Waitcnt = WaitcntLayoutGfx11;
929 using Waitcnt = WaitcntLayoutLegacy;
938 #if defined(__gfx12__) || defined(__gfx11__)
948 template <index_t cnt>
951 static_assert((cnt & ~Waitcnt::VM_MASK) == 0,
"vmcnt out of range");
952 return Waitcnt::pack_vm(cnt);
955 template <index_t cnt>
958 static_assert((cnt & ~Waitcnt::LGKM_MASK) == 0,
"lgkmcnt out of range");
959 return Waitcnt::pack_lgkm(cnt);
962 template <index_t cnt>
965 if constexpr(Waitcnt::HAS_EXP)
968 #if !defined(__gfx12__) && !defined(__gfx11__)
969 static_assert((cnt & ~Waitcnt::EXP_MASK) == 0,
"expcnt out of range");
970 return Waitcnt::pack_exp(cnt);
978 static_assert(cnt == 0,
"expcnt unsupported on this arch");
984 template <
index_t vmcnt = waitcnt_arg::kMaxVmCnt,
985 index_t expcnt = waitcnt_arg::kMaxExpCnt,
986 index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
989 #if defined(__gfx12__)
991 constexpr
index_t wait_mask = waitcnt_arg::from_vmcnt<vmcnt>() |
992 waitcnt_arg::from_expcnt<expcnt>() |
993 waitcnt_arg::from_lgkmcnt<lgkmcnt>();
995 asm volatile(
"s_wait_loadcnt_dscnt %0" : :
"n"(wait_mask) :
"memory");
997 __builtin_amdgcn_s_waitcnt(waitcnt_arg::from_vmcnt<vmcnt>() |
998 waitcnt_arg::from_expcnt<expcnt>() |
999 waitcnt_arg::from_lgkmcnt<lgkmcnt>());
1003 template <
index_t vmcnt = waitcnt_arg::kMaxVmCnt,
1004 index_t expcnt = waitcnt_arg::kMaxExpCnt,
1005 index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
1008 #if defined(__gfx12__)
1011 constexpr
index_t wait_mask = waitcnt_arg::from_vmcnt<vmcnt>() |
1012 waitcnt_arg::from_expcnt<expcnt>() |
1013 waitcnt_arg::from_lgkmcnt<lgkmcnt>();
1015 asm volatile(
"s_wait_loadcnt_dscnt %0\n"
1016 "s_barrier_signal -1\n"
1022 s_waitcnt<vmcnt, expcnt, lgkmcnt>();
1023 __builtin_amdgcn_s_barrier();
1027 template <index_t lgkmcnt = 0>
1030 s_waitcnt_barrier<waitcnt_arg::kMaxVmCnt, waitcnt_arg::kMaxExpCnt, lgkmcnt>();
1033 template <index_t vmcnt = 0>
1036 s_waitcnt_barrier<vmcnt, waitcnt_arg::kMaxExpCnt, waitcnt_arg::kMaxLgkmCnt>();
1042 asm volatile(
"s_nop %0" : :
"n"(cnt) :);
1044 __builtin_amdgcn_sched_barrier(cnt);
1048 #define CK_TILE_CONSTANT_ADDRESS_SPACE \
1049 __attribute__((address_space( \
1050 static_cast<safe_underlying_type_t<address_space_enum>>(address_space_enum::constant))))
1052 template <
typename T>
1057 #pragma clang diagnostic push
1058 #pragma clang diagnostic ignored "-Wold-style-cast"
1060 #pragma clang diagnostic pop
1063 template <
typename T>
1068 #pragma clang diagnostic push
1069 #pragma clang diagnostic ignored "-Wold-style-cast"
1070 return (T CK_TILE_CONSTANT_ADDRESS_SPACE*)p;
1071 #pragma clang diagnostic pop
1076 #if defined(__gfx950__)
1084 CK_TILE_HOST_DEVICE constexpr
const char* address_space_to_string(address_space_enum addr_space)
1088 case address_space_enum::generic:
return "generic";
1089 case address_space_enum::global:
return "global";
1090 case address_space_enum::lds:
return "lds";
1091 case address_space_enum::sgpr:
return "sgpr";
1092 case address_space_enum::constant:
return "constant";
1093 case address_space_enum::vgpr:
return "vgpr";
1094 default:
return "unknown";
1114 struct gfx_invalid_t
1122 #if defined(__gfx11__)
1129 CK_TILE_DEVICE static constexpr
auto get_n_words_per_128b() {
return 4; }
1132 CK_TILE_DEVICE static constexpr
auto get_n_lds_banks(gfx9_t) {
return 32; }
1134 CK_TILE_DEVICE static constexpr
auto get_n_lds_banks(gfx103_t) {
return 32; }
1136 CK_TILE_DEVICE static constexpr
auto get_n_lds_banks(gfx11_t) {
return 32; }
1138 CK_TILE_DEVICE static constexpr
auto get_n_lds_banks(gfx12_t) {
return 32; }
1140 CK_TILE_DEVICE static constexpr
auto get_n_lds_banks(gfx950_t) {
return 64; }
1142 CK_TILE_DEVICE static constexpr
auto get_n_lds_banks(gfx_invalid_t) {
return 0; }
1146 #if defined(__gfx103__)
1148 #elif defined(__gfx11__)
1150 #elif defined(__gfx12__)
1152 #elif defined(__gfx950__)
1154 #elif defined(__gfx9__)
1157 return gfx_invalid_t{};
1163 return detail::get_n_lds_banks(detail::arch_tag_dispatch());
1166 enum LLVMSchedGroupMask :
int32_t
1175 VMEM_WRITE = 1 << 6,
1179 ALL = (DS_WRITE << 1) - 1,
#define CK_TILE_DEVICE
Definition: config.hpp:45
#define CK_TILE_HOST
Definition: config.hpp:44
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:46
Definition: cluster_descriptor.hpp:13
CK_TILE_DEVICE void atomic_add(X *p_dst, const X &x)
__device__ uint32_t amd_wave_read_first_lane(uint16_t v)
Definition: amd_buffer_addressing.hpp:35
CK_TILE_HOST_DEVICE T add(const T &a, const T &b)
Definition: generic_memory_space_atomic.hpp:16
int32_t index_t
Definition: integer.hpp:9
__device__ index_t get_grid_size()
Definition: get_id.hpp:49
__device__ void s_nop()
Definition: synchronization.hpp:61
__device__ index_t get_block_size()
Definition: get_id.hpp:51
__device__ void block_sync_lds_direct_load()
Definition: synchronization.hpp:43
__device__ index_t get_block_1d_id()
Definition: get_id.hpp:47
__device__ index_t get_thread_global_1d_id()
Definition: get_id.hpp:43
__device__ X atomic_max(X *p_dst, const X &x)
constexpr __device__ index_t get_warp_size()
Definition: get_id.hpp:10
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * cast_pointer_to_constant_address_space(T *p)
Definition: amd_address_space.hpp:35
__device__ T * cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE *p)
Definition: amd_address_space.hpp:24
typename std::enable_if< B, T >::type enable_if_t
Definition: enable_if.hpp:27
__device__ index_t get_thread_local_1d_id()
Definition: get_id.hpp:41
__device__ void block_sync_lds()
Definition: synchronization.hpp:16
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1697
unsigned short uint16_t
Definition: stdint.h:125
signed int int32_t
Definition: stdint.h:123