/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-7.0.0/include/ck_tile/core/arch/arch.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-7.0.0/include/ck_tile/core/arch/arch.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-7.0.0/include/ck_tile/core/arch/arch.hpp Source File
arch.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
6 // Address Space for AMDGCN
7 // https://llvm.org/docs/AMDGPUUsage.html#address-space
8 
12 
13 namespace ck_tile {
14 
15 template <typename, bool>
17 
18 template <typename T>
19 struct safe_underlying_type<T, true>
20 {
21  using type = std::underlying_type_t<T>;
22 };
23 
24 template <typename T>
25 struct safe_underlying_type<T, false>
26 {
27  using type = void;
28 };
29 
30 template <typename T>
32 
33 enum struct address_space_enum : std::uint16_t
34 {
35  generic = 0,
36  global,
37  lds,
38  sgpr,
39  constant,
40  vgpr
41 };
42 
43 enum struct memory_operation_enum : std::uint16_t
44 {
45  set = 0,
46  atomic_add,
47  atomic_max,
48  add
49 };
50 
52 {
53 #if defined(__GFX9__) || !defined(__HIP_DEVICE_COMPILE__)
54  return 64;
55 #else
56  return 32;
57 #endif
58 }
59 
60 CK_TILE_DEVICE index_t get_grid_size() { return gridDim.x; }
61 
62 CK_TILE_DEVICE index_t get_block_size() { return blockDim.x; }
63 
64 // TODO: deprecate these
66 
67 CK_TILE_DEVICE index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; }
68 
69 CK_TILE_DEVICE index_t get_block_1d_id() { return blockIdx.x; }
70 
71 // Use these instead
72 CK_TILE_DEVICE index_t get_lane_id() { return __lane_id(); }
73 
75 {
76  return __builtin_amdgcn_readfirstlane(threadIdx.x / get_warp_size());
77 }
78 
79 CK_TILE_DEVICE index_t get_thread_id() { return threadIdx.x; }
80 
81 CK_TILE_DEVICE index_t get_block_id() { return blockIdx.x; }
82 
84 {
85 #if CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
86  // asm volatile("\
87  // s_waitcnt lgkmcnt(0) \n \
88  // s_barrier \
89  // " ::);
90 
91  __builtin_amdgcn_s_waitcnt(0xc07f);
92  __builtin_amdgcn_s_barrier();
93 #else
94  __syncthreads();
95 #endif
96 }
97 
99 {
100 #ifdef __gfx12__
101  asm volatile("s_wait_loadcnt %0 \n"
102  "s_barrier_signal -1 \n"
103  "s_barrier_wait -1"
104  :
105  : "n"(cnt)
106  : "memory");
107 #else
108  asm volatile("s_waitcnt vmcnt(%0) \n"
109  "s_barrier"
110  :
111  : "n"(cnt)
112  : "memory");
113 #endif
114 }
115 
117 {
118  asm volatile("\
119  s_waitcnt vmcnt(0) \n \
120  s_waitcnt lgkmcnt(0) \n \
121  s_barrier \
122  " ::);
123 }
124 
126 {
127 #if 1
128  asm volatile("s_nop %0" : : "n"(cnt) :);
129 #else
130  __builtin_amdgcn_sched_barrier(cnt);
131 #endif
132 }
133 
134 #define CK_CONSTANT_ADDRESS_SPACE \
135  __attribute__((address_space( \
136  static_cast<safe_underlying_type_t<address_space_enum>>(address_space_enum::constant))))
137 
138 template <typename T>
140 {
141  // cast a pointer in "Constant" address space (4) to "Generic" address space (0)
142  // only c-style pointer cast seems be able to be compiled
143 #pragma clang diagnostic push
144 #pragma clang diagnostic ignored "-Wold-style-cast"
145  return (T*)(p); // NOLINT(old-style-cast)
146 #pragma clang diagnostic pop
147 }
148 
149 template <typename T>
151 {
152  // cast a pointer in "Generic" address space (0) to "Constant" address space (4)
153  // only c-style pointer cast seems be able to be compiled;
154 #pragma clang diagnostic push
155 #pragma clang diagnostic ignored "-Wold-style-cast"
156  return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
157 #pragma clang diagnostic pop
158 }
159 
161 {
162 #if defined(__gfx950__)
163  return 163840;
164 #else
165  return 65536;
166 #endif
167 }
168 
169 } // namespace ck_tile
#define CK_CONSTANT_ADDRESS_SPACE
Definition: arch.hpp:134
#define CK_TILE_DEVICE
Definition: config.hpp:40
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:41
Definition: cluster_descriptor.hpp:13
constexpr CK_TILE_HOST_DEVICE index_t get_warp_size()
Definition: arch.hpp:51
CK_TILE_DEVICE index_t get_lane_id()
Definition: arch.hpp:72
CK_TILE_DEVICE void atomic_add(X *p_dst, const X &x)
memory_operation_enum
Definition: arch.hpp:44
CK_TILE_HOST_DEVICE T add(const T &a, const T &b)
Definition: generic_memory_space_atomic.hpp:12
CK_TILE_DEVICE index_t get_block_1d_id()
Definition: arch.hpp:69
CK_TILE_DEVICE void block_sync_lds()
Definition: arch.hpp:83
int32_t index_t
Definition: integer.hpp:9
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * cast_pointer_to_constant_address_space(T *p)
Definition: arch.hpp:150
CK_TILE_DEVICE void block_sync_lds_direct_load()
Definition: arch.hpp:116
constexpr CK_TILE_HOST_DEVICE index_t get_smem_capacity()
Definition: arch.hpp:160
CK_TILE_DEVICE void s_nop(index_t cnt=0)
Definition: arch.hpp:125
CK_TILE_DEVICE index_t get_thread_local_1d_id()
Definition: arch.hpp:65
CK_TILE_DEVICE index_t get_block_size()
Definition: arch.hpp:62
CK_TILE_DEVICE index_t get_warp_id()
Definition: arch.hpp:74
__device__ T * cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE *p)
Definition: arch.hpp:139
CK_TILE_DEVICE index_t get_thread_id()
Definition: arch.hpp:79
CK_TILE_DEVICE index_t get_thread_global_1d_id()
Definition: arch.hpp:67
address_space_enum
Definition: arch.hpp:34
CK_TILE_DEVICE index_t get_block_id()
Definition: arch.hpp:81
CK_TILE_DEVICE index_t get_grid_size()
Definition: arch.hpp:60
CK_TILE_DEVICE void block_sync_load_raw(index_t cnt=0)
Definition: arch.hpp:98
typename safe_underlying_type< T, std::is_enum< T >::value >::type safe_underlying_type_t
Definition: arch.hpp:31
void type
Definition: arch.hpp:27
std::underlying_type_t< T > type
Definition: arch.hpp:21
Definition: arch.hpp:16