/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-7.0.0/include/ck_tile/core/config.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-7.0.0/include/ck_tile/core/config.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-7.0.0/include/ck_tile/core/config.hpp Source File
config.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
6 #if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)
7 #define __gfx9__
8 #endif
9 #if defined(__gfx942__) || defined(__gfx950__)
10 #define __gfx94__
11 #endif
12 #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
13  defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \
14  defined(__gfx10_3_generic__)
15 #define __gfx103__
16 #endif
17 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
18  defined(__gfx1103__) || defined(__gfx1150__) || defined(__gfx1151__) || \
19  defined(__gfx1152__) || defined(__gfx11_generic__)
20 #define __gfx11__
21 #endif
22 #if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
23 #define __gfx12__
24 #endif
25 
26 #include "hip/hip_version.h"
27 #ifndef CK_TILE_DONT_USE_HIP_RUNTIME_HEADERS
28 #include "hip/hip_runtime.h"
29 #include "hip/hip_fp16.h"
30 #endif
31 
32 #ifdef __HIPCC__
33 #define CK_TILE_HOST inline __host__
34 #define CK_TILE_DEVICE inline __device__
35 #define CK_TILE_HOST_DEVICE inline __host__ __device__
36 #define CK_TILE_DEVICE_EXTERN __device__
37 #define CK_TILE_HOST_DEVICE_EXTERN __host__ __device__
38 #else
39 #define CK_TILE_HOST inline
40 #define CK_TILE_DEVICE inline
41 #define CK_TILE_HOST_DEVICE inline
42 #define CK_TILE_DEVICE_EXTERN
43 #define CK_TILE_HOST_DEVICE_EXTERN
44 #endif
45 
46 // implementing the "memory address space" attribute
47 // https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces-table
48 // WA for https://github.com/ROCm/composable_kernel/issues/1946
49 #if 0
50 #define CK_TILE_GENERIC_ADDR __attribute__((address_space(0)))
51 #define CK_TILE_GLOBAL_ADDR __attribute__((address_space(1)))
52 #define CK_TILE_LDS_ADDR __attribute__((address_space(3)))
53 #define CK_TILE_BUF_RES_ADDR __attribute__((address_space(8)))
54 #else
55 #define CK_TILE_GENERIC_ADDR
56 #define CK_TILE_GLOBAL_ADDR
57 #define CK_TILE_LDS_ADDR
58 #define CK_TILE_BUF_RES_ADDR
59 #endif
60 #ifndef CK_TILE_USE_CUSTOM_DATA_TYPE
61 #define CK_TILE_USE_CUSTOM_DATA_TYPE 0 // custom data type will generate extra move/bfi code
62 #endif
63 
64 #define CK_TILE_FLOAT_TO_BFLOAT16_STANDARD 0
65 #define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE_WITH_NAN 1
66 #define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE 2
67 #define CK_TILE_FLOAT_TO_BFLOAT16_STANDARD_ASM 3
68 #define CK_TILE_FLOAT_TO_BFLOAT16_RTA_ASM 4
69 
70 #ifndef CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT
71 #define CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE
72 #endif
73 
74 #define CK_TILE_FLOAT_TO_FP8_STANDARD 0
75 #define CK_TILE_FLOAT_TO_FP8_STOCHASTIC 1
76 
77 #ifndef CK_TILE_FLOAT_TO_FP8_DEFAULT
78 #define CK_TILE_FLOAT_TO_FP8_DEFAULT CK_TILE_FLOAT_TO_FP8_STANDARD
79 #endif
80 
81 // in the old rocm period, we have to use tuple array implementation to implement this
82 // so turn on the _USE_TUPLE if meet compiler error, otherwise _USE_ARRAY by default.
83 #define CK_TILE_STATICALLY_INDEXED_ARRAY_USE_ARRAY 0
84 #define CK_TILE_STATICALLY_INDEXED_ARRAY_USE_TUPLE 1
85 #ifndef CK_TILE_STATICALLY_INDEXED_ARRAY_DEFAULT
86 #define CK_TILE_STATICALLY_INDEXED_ARRAY_DEFAULT CK_TILE_STATICALLY_INDEXED_ARRAY_USE_TUPLE
87 #endif
88 
89 #define CK_TILE_THREAD_BUFFER_USE_ARRAY 0
90 #define CK_TILE_THREAD_BUFFER_USE_TUPLE 1
91 #ifndef CK_TILE_THREAD_BUFFER_DEFAULT
92 #define CK_TILE_THREAD_BUFFER_DEFAULT CK_TILE_THREAD_BUFFER_USE_ARRAY
93 #endif
94 
95 #ifndef CK_TILE_TUPLE_CTOR_WITH_INITIALIZER_LIST
96 #if CK_TILE_THREAD_BUFFER_DEFAULT == CK_TILE_THREAD_BUFFER_USE_TUPLE
97 // if using tuple-array as thread_buffer implementation, need to support {} brace init
98 // ... with similiar behavior as array
99 #define CK_TILE_TUPLE_CTOR_WITH_INITIALIZER_LIST 1
100 #else
101 #define CK_TILE_TUPLE_CTOR_WITH_INITIALIZER_LIST 0
102 #endif
103 #endif
104 
105 #ifndef CK_TILE_USE_LAUNCH_BOUNDS
106 #define CK_TILE_USE_LAUNCH_BOUNDS 1
107 #endif
108 
109 #ifndef CK_TILE_TIME_KERNEL
110 #define CK_TILE_TIME_KERNEL 1
111 #endif
112 
113 #define CK_TILE_MAX_THREAD_PER_BLOCK 256
114 #define CK_TILE_MIN_BLOCK_PER_CU 2
115 
116 #ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
117 #define CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
118 #endif
119 
120 #ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
121 #define CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
122 #endif
123 
124 #ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
125 #define CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
126 #endif
127 
128 #ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
129 #define CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK 1
130 #endif
131 
132 #ifndef CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
133 #define CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM 1
134 #endif
135 
136 #ifndef CK_TILE_USE_AMD_BUFFER_LOAD
137 #define CK_TILE_USE_AMD_BUFFER_LOAD 1
138 #endif
139 
140 #ifndef CK_TILE_USE_AMD_BUFFER_STORE
141 #define CK_TILE_USE_AMD_BUFFER_STORE 1
142 #endif
143 
144 #ifndef CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER
145 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1
146 #endif
147 
148 #ifndef CK_TILE_USE_PK4_LAYOUT_SHUFFLE
149 #define CK_TILE_USE_PK4_LAYOUT_SHUFFLE 1
150 #endif
151 
152 // buffer atomic add: floating point
153 #ifndef __HIP_DEVICE_COMPILE__ // for host code
154 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
155 #elif defined(__gfx9__) // for GPU code
156 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
157 #else // for GPU code
158 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
159 #endif
160 
161 #if(defined(__gfx90a__) || defined(__gfx94__)) // for GPU code
162 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
163 #else
164 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
165 #endif
166 
167 #ifndef CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
168 #define CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
169 #endif
170 
171 #ifndef CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
172 #define CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
173 #endif
174 
175 #ifndef CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE
176 #if HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 1 && HIP_VERSION_PATCH >= 40091
177 #define CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE 1
178 #else
179 #define CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE 0
180 #endif
181 #endif
182 
183 // workaround for ROCm 6.2 and later
184 #ifndef CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
185 #if(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2 && HIP_VERSION_PATCH >= 41133) || \
186  (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 3 && HIP_VERSION_PATCH >= 42131) || \
187  (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR > 3)
188 #define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 1
189 #else
190 #define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 0
191 #endif
192 #endif
193 
194 #ifndef CK_TILE_DEBUG_LOG
195 #define CK_TILE_DEBUG_LOG 0
196 #endif
197 
198 #ifndef __HIP_DEVICE_COMPILE__ // for host code
199 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0xffffffff
200 #elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || \
201  defined(__gfx9__) // for GPU code
202 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x00020000
203 #elif defined(__gfx103__) // for GPU code
204 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31014000
205 #elif defined(__gfx11__) || defined(__gfx12__) // for GPU code
206 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31004000
207 #endif
208 
209 #ifndef CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
210 #define CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
211 #endif
212 
213 #ifndef CK_TILE_USE_SUBDWORD_TILE_CAST
214 #define CK_TILE_USE_SUBDWORD_TILE_CAST 0
215 #endif
216 
217 #ifndef CK_TILE_USE_PK_FP16_TILE_CAST
218 #define CK_TILE_USE_PK_FP16_TILE_CAST 0
219 #endif
220 
221 // TODO: better solve this inside compiler
222 #ifndef CK_TILE_FMHA_FWD_FAST_EXP2
223 #define CK_TILE_FMHA_FWD_FAST_EXP2 0
224 #endif
225 
226 #ifndef CK_TILE_FMHA_FLOAT_TO_FLOAT16_RTN
227 #define CK_TILE_FMHA_FLOAT_TO_FLOAT16_RTN 0
228 #endif
229 
230 #ifndef CK_TILE_BUFFER_LOAD_RAW_BF16_WA
231 #define CK_TILE_BUFFER_LOAD_RAW_BF16_WA 1
232 #endif
233 
234 // workaround: compiler not emiting reciprocal instruction frm __frcp_rn()
235 #ifndef CK_TILE_WORKAROUND_SWDEV_383542
236 #define CK_TILE_WORKAROUND_SWDEV_383542 1
237 #endif
238 
239 #ifndef CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
240 #define CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID 1
241 #endif
242 
243 #ifndef CK_TILE_USE_OCP_FP8
244 #if defined(__HIP_DEVICE_COMPILE__)
245 #if defined(__gfx950__) || defined(__gfx12__)
246 #define CK_TILE_USE_OCP_FP8 1
247 #else
248 #define CK_TILE_USE_OCP_FP8 0
249 #endif
250 #else
251 #define CK_TILE_USE_OCP_FP8 0
252 #endif
253 #endif
254 
255 #ifndef CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
256 #if __clang_major__ == 20
257 #define CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN 1
258 #else
259 #define CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN 0
260 #endif
261 #endif
262 
263 #ifndef CK_TILE_WA_ISSUE_2028
264 #define CK_TILE_WA_ISSUE_2028 0
265 #endif
266 
267 // Y pointed to R, we don't see a valuable use case.
268 // Will enforce encoding to check Y not pointed to R if set to zero
269 #ifndef CK_TILE_ENC_SUPPORT_Y_TO_R
270 #define CK_TILE_ENC_SUPPORT_Y_TO_R 0
271 #endif