/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck_tile/core/config.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck_tile/core/config.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck_tile/core/config.hpp Source File
config.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
6 #if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
7  defined(__gfx942__) || defined(__gfx950__)
8 #define __gfx9__
9 #endif
10 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
11 #define __gfx94__
12 #endif
13 #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
14  defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \
15  defined(__gfx10_3_generic__)
16 #define __gfx103__
17 #endif
18 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
19  defined(__gfx1103__) || defined(__gfx11_generic__)
20 #define __gfx11__
21 #endif
22 #if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
23 #define __gfx12__
24 #endif
25 
26 #include "hip/hip_version.h"
27 #ifndef CK_TILE_DONT_USE_HIP_RUNTIME_HEADERS
28 #include "hip/hip_runtime.h"
29 #include "hip/hip_fp16.h"
30 #endif
31 
32 #ifdef __HIPCC__
33 #define CK_TILE_HOST inline __host__
34 #define CK_TILE_DEVICE inline __device__
35 #define CK_TILE_HOST_DEVICE inline __host__ __device__
36 #define CK_TILE_DEVICE_EXTERN __device__
37 #define CK_TILE_HOST_DEVICE_EXTERN __host__ __device__
38 #else
39 #define CK_TILE_HOST inline
40 #define CK_TILE_DEVICE inline
41 #define CK_TILE_HOST_DEVICE inline
42 #define CK_TILE_DEVICE_EXTERN
43 #define CK_TILE_HOST_DEVICE_EXTERN
44 #endif
45 
46 // implementing the "memory address space" attribute
47 // https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces-table
48 #ifdef __HIPCC_
49 #define CK_TILE_GENERIC_ADDR __attribute__((address_space(0)))
50 #define CK_TILE_GLOBAL_ADDR __attribute__((address_space(1)))
51 #define CK_TILE_LDS_ADDR __attribute__((address_space(3)))
52 #define CK_TILE_BUF_RES_ADDR __attribute__((address_space(8)))
53 #else
54 #define CK_TILE_GENERIC_ADDR
55 #define CK_TILE_GLOBAL_ADDR
56 #define CK_TILE_LDS_ADDR
57 #define CK_TILE_BUF_RES_ADDR
58 #endif
59 #ifndef CK_TILE_USE_CUSTOM_DATA_TYPE
60 #define CK_TILE_USE_CUSTOM_DATA_TYPE 0 // custom data type will generate extra move/bfi code
61 #endif
62 
63 #define CK_TILE_FLOAT_TO_BFLOAT16_STANDARD 0
64 #define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE_WITH_NAN 1
65 #define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE 2
66 #define CK_TILE_FLOAT_TO_BFLOAT16_STANDARD_ASM 3
67 #define CK_TILE_FLOAT_TO_BFLOAT16_RTA_ASM 4
68 
69 #ifndef CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT
70 #define CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE
71 #endif
72 
73 #define CK_TILE_FLOAT_TO_FP8_STANDARD 0
74 #define CK_TILE_FLOAT_TO_FP8_STOCHASTIC 1
75 
76 #ifndef CK_TILE_FLOAT_TO_FP8_DEFAULT
77 #define CK_TILE_FLOAT_TO_FP8_DEFAULT CK_TILE_FLOAT_TO_FP8_STANDARD
78 #endif
79 
80 // in the old rocm period, we have to use tuple array implementation to implement this
81 // so turn on the _USE_TUPLE if meet compiler error, otherwise _USE_ARRAY by default.
82 #define CK_TILE_STATICALLY_INDEXED_ARRAY_USE_ARRAY 0
83 #define CK_TILE_STATICALLY_INDEXED_ARRAY_USE_TUPLE 1
84 #ifndef CK_TILE_STATICALLY_INDEXED_ARRAY_DEFAULT
85 #define CK_TILE_STATICALLY_INDEXED_ARRAY_DEFAULT CK_TILE_STATICALLY_INDEXED_ARRAY_USE_TUPLE
86 #endif
87 
88 #define CK_TILE_THREAD_BUFFER_USE_ARRAY 0
89 #define CK_TILE_THREAD_BUFFER_USE_TUPLE 1
90 #ifndef CK_TILE_THREAD_BUFFER_DEFAULT
91 #define CK_TILE_THREAD_BUFFER_DEFAULT CK_TILE_THREAD_BUFFER_USE_ARRAY
92 #endif
93 
94 #ifndef CK_TILE_TUPLE_CTOR_WITH_INITIALIZER_LIST
95 #if CK_TILE_THREAD_BUFFER_DEFAULT == CK_TILE_THREAD_BUFFER_USE_TUPLE
96 // if using tuple-array as thread_buffer implementation, need to support {} brace init
97 // ... with similiar behavior as array
98 #define CK_TILE_TUPLE_CTOR_WITH_INITIALIZER_LIST 1
99 #else
100 #define CK_TILE_TUPLE_CTOR_WITH_INITIALIZER_LIST 0
101 #endif
102 #endif
103 
104 #ifndef CK_TILE_USE_LAUNCH_BOUNDS
105 #define CK_TILE_USE_LAUNCH_BOUNDS 1
106 #endif
107 
108 #ifndef CK_TILE_TIME_KERNEL
109 #define CK_TILE_TIME_KERNEL 1
110 #endif
111 
112 #define CK_TILE_MAX_THREAD_PER_BLOCK 256
113 #define CK_TILE_MIN_BLOCK_PER_CU 2
114 
115 #ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
116 #define CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
117 #endif
118 
119 #ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
120 #define CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
121 #endif
122 
123 #ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
124 #define CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
125 #endif
126 
127 #ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
128 #define CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK 1
129 #endif
130 
131 #ifndef CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
132 #define CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM 1
133 #endif
134 
135 #ifndef CK_TILE_USE_AMD_BUFFER_LOAD
136 #define CK_TILE_USE_AMD_BUFFER_LOAD 1
137 #endif
138 
139 #ifndef CK_TILE_USE_AMD_BUFFER_STORE
140 #define CK_TILE_USE_AMD_BUFFER_STORE 1
141 #endif
142 
143 #ifndef CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER
144 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1
145 #endif
146 
147 #ifndef CK_TILE_USE_PK4_LAYOUT_SHUFFLE
148 #define CK_TILE_USE_PK4_LAYOUT_SHUFFLE 1
149 #endif
150 
151 // buffer atomic add: floating point
152 #ifndef __HIP_DEVICE_COMPILE__ // for host code
153 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
154 #elif defined(__gfx9__) // for GPU code
155 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
156 #else // for GPU code
157 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
158 #endif
159 
160 #if(defined(__gfx90a__) || defined(__gfx94__)) // for GPU code
161 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
162 #else
163 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
164 #endif
165 
166 #ifndef CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
167 #define CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
168 #endif
169 
170 #ifndef CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
171 #define CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
172 #endif
173 
174 #ifndef CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE
175 #if HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 1 && HIP_VERSION_PATCH >= 40091
176 #define CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE 1
177 #else
178 #define CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE 0
179 #endif
180 #endif
181 
182 // workaround for ROCm 6.2 and later
183 #ifndef CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
184 #if(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2 && HIP_VERSION_PATCH >= 41133) || \
185  (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 3 && HIP_VERSION_PATCH >= 42131) || \
186  (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR > 3)
187 #define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 1
188 #else
189 #define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 0
190 #endif
191 #endif
192 
193 #ifndef CK_TILE_DEBUG_LOG
194 #define CK_TILE_DEBUG_LOG 0
195 #endif
196 
197 #ifndef __HIP_DEVICE_COMPILE__ // for host code
198 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0xffffffff
199 #elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || \
200  defined(__gfx9__) // for GPU code
201 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x00020000
202 #elif defined(__gfx103__) // for GPU code
203 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31014000
204 #elif defined(__gfx11__) || defined(__gfx12__) // for GPU code
205 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31004000
206 #endif
207 
208 #ifndef CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
209 #define CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
210 #endif
211 
212 #ifndef CK_TILE_USE_SUBDWORD_TILE_CAST
213 #define CK_TILE_USE_SUBDWORD_TILE_CAST 0
214 #endif
215 
216 #ifndef CK_TILE_USE_PK_FP16_TILE_CAST
217 #define CK_TILE_USE_PK_FP16_TILE_CAST 0
218 #endif
219 
220 // TODO: better solve this inside compiler
221 #ifndef CK_TILE_FMHA_FWD_FAST_EXP2
222 #define CK_TILE_FMHA_FWD_FAST_EXP2 0
223 #endif
224 
225 #ifndef CK_TILE_BUFFER_LOAD_RAW_BF16_WA
226 #define CK_TILE_BUFFER_LOAD_RAW_BF16_WA 1
227 #endif
228 
229 // workaround: compiler not emiting reciprocal instruction frm __frcp_rn()
230 #ifndef CK_TILE_WORKAROUND_SWDEV_383542
231 #define CK_TILE_WORKAROUND_SWDEV_383542 1
232 #endif
233 
234 #ifndef CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
235 #define CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID 1
236 #endif
237 
238 #ifndef __HIP_DEVICE_COMPILE__ // for host code
239 #ifdef CK_TILE_USE_OCP_FP8
240 #define CK_TILE_USE_OCP_FP8 1
241 #else
242 #define CK_TILE_USE_OCP_FP8 0
243 #endif
244 #elif defined(__gfx950__) || defined(__gfx12__) // for GPU code
245 #define CK_TILE_USE_OCP_FP8 1
246 #else // for GPU code
247 #define CK_TILE_USE_OCP_FP8 0
248 #endif