/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_buffer_addressing.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_buffer_addressing.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_buffer_addressing.hpp Source File
amd_buffer_addressing.hpp
Go to the documentation of this file.
1 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2 // SPDX-License-Identifier: MIT
3 
4 #pragma once
5 #include "data_type.hpp"
7 
8 namespace ck {
9 
10 template <typename T>
12 {
13  __device__ constexpr BufferResource() : content{} {}
14 
15  // 128 bit SGPRs to supply buffer resource in buffer instructions
16  // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
21 };
22 
23 template <typename T>
24 __device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size)
25 {
26  BufferResource<T> wave_buffer_resource;
27 
28  // wavewise base address (64 bit)
29  wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
30  // wavewise range (32 bit)
31  wave_buffer_resource.range(Number<2>{}) = element_space_size * sizeof(T);
32  // wavewise setting (32 bit)
33  wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
34 
35  return wave_buffer_resource.content;
36 }
37 
38 template <typename T>
40 {
41  BufferResource<T> wave_buffer_resource;
42 
43  // wavewise base address (64 bit)
44  wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
45  // wavewise range (32 bit)
46  wave_buffer_resource.range(Number<2>{}) = 0xffffffff; // max possible range
47  // wavewise setting (32 bit)
48  wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
49 
50  return wave_buffer_resource.content;
51 }
52 
53 // buffer load i8
54 __device__ int8_t
56  index_t voffset,
57  index_t soffset,
58  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8");
59 
60 __device__ int8x2_t
62  index_t voffset,
63  index_t soffset,
64  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8");
65 
66 __device__ int8x4_t
68  index_t voffset,
69  index_t soffset,
70  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8");
71 
72 // buffer load i16
73 __device__ bhalf_t
75  index_t voffset,
76  index_t soffset,
77  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16");
78 
79 __device__ bhalf2_t
81  index_t voffset,
82  index_t soffset,
83  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16");
84 
85 __device__ bhalf4_t
87  index_t voffset,
88  index_t soffset,
89  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16");
90 
91 // buffer load i32
92 __device__ int32_t
94  index_t voffset,
95  index_t soffset,
96  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");
97 
98 __device__ int32x2_t
100  index_t voffset,
101  index_t soffset,
102  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");
103 
104 __device__ int32x4_t
106  index_t voffset,
107  index_t soffset,
108  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
109 
110 // buffer load fp16
111 __device__ half_t
113  index_t voffset,
114  index_t soffset,
115  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");
116 
117 __device__ half2_t
119  index_t voffset,
120  index_t soffset,
121  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16");
122 
123 __device__ half4_t
125  index_t voffset,
126  index_t soffset,
127  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16");
128 
129 // buffer load fp32
130 __device__ float
132  index_t voffset,
133  index_t soffset,
134  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32");
135 
136 __device__ float2_t
138  index_t voffset,
139  index_t soffset,
140  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32");
141 
142 __device__ float4_t
144  index_t voffset,
145  index_t soffset,
146  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32");
147 
148 // buffer store i8
149 __device__ void
151  int32x4_t rsrc,
152  index_t voffset,
153  index_t soffset,
154  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8");
155 
156 __device__ void
158  int32x4_t rsrc,
159  index_t voffset,
160  index_t soffset,
161  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8");
162 
163 __device__ void
165  int32x4_t rsrc,
166  index_t voffset,
167  index_t soffset,
168  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8");
169 
170 // buffer store i16
171 __device__ void
173  int32x4_t rsrc,
174  index_t voffset,
175  index_t soffset,
176  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");
177 
178 __device__ void
180  int32x4_t rsrc,
181  index_t voffset,
182  index_t soffset,
183  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16");
184 
185 __device__ void
187  int32x4_t rsrc,
188  index_t voffset,
189  index_t soffset,
190  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16");
191 
192 // buffer store i32
193 __device__ void
195  int32x4_t rsrc,
196  index_t voffset,
197  index_t soffset,
198  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32");
199 
200 __device__ void
202  int32x4_t rsrc,
203  index_t voffset,
204  index_t soffset,
205  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32");
206 
207 __device__ void
209  int32x4_t rsrc,
210  index_t voffset,
211  index_t soffset,
212  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32");
213 
214 // buffer store fp16
215 __device__ void
217  int32x4_t rsrc,
218  index_t voffset,
219  index_t soffset,
220  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");
221 
222 __device__ void
224  int32x4_t rsrc,
225  index_t voffset,
226  index_t soffset,
227  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16");
228 
229 __device__ void
231  int32x4_t rsrc,
232  index_t voffset,
233  index_t soffset,
234  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16");
235 
236 // buffer store fp32
237 __device__ void
239  int32x4_t rsrc,
240  index_t voffset,
241  index_t soffset,
242  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32");
243 
244 __device__ void
246  int32x4_t rsrc,
247  index_t voffset,
248  index_t soffset,
249  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32");
250 
251 __device__ void
253  int32x4_t rsrc,
254  index_t voffset,
255  index_t soffset,
256  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");
257 
258 // buffer atomic-add fp16
260  half2_t vdata,
261  int32x4_t rsrc,
262  index_t voffset,
263  index_t soffset,
264  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");
265 
266 // buffer atomic-add i32
268  int32_t vdata,
269  int32x4_t rsrc,
270  index_t voffset,
271  index_t soffset,
272  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32");
273 
274 // buffer atomic-add fp32
276  float vdata,
277  int32x4_t rsrc,
278  index_t voffset,
279  index_t soffset,
280  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32");
281 
282 // buffer atomic-add fp32
283 __device__ double
285  int32x4_t rsrc, // dst_wave_buffer_resource
286  int voffset, // dst_thread_addr_offset
287  int soffset, // dst_wave_addr_offset
288  int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
289 
290 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
291 __device__ typename vector_type<int8_t, N>::type
292 amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
293  index_t src_thread_addr_offset,
294  index_t src_wave_addr_offset)
295 {
296  static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
297  "wrong! not implemented");
298 
299  if constexpr(N == 1)
300  {
301  return llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource,
302  src_thread_addr_offset,
303  src_wave_addr_offset,
304  static_cast<index_t>(coherence));
305  }
306  else if constexpr(N == 2)
307  {
308 
309  int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
310  src_thread_addr_offset,
311  src_wave_addr_offset,
312  static_cast<index_t>(coherence));
313 
314  return bit_cast<int8x2_t>(tmp);
315  }
316  else if constexpr(N == 4)
317  {
318  int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource,
319  src_thread_addr_offset,
320  src_wave_addr_offset,
321  static_cast<index_t>(coherence));
322 
323  return bit_cast<int8x4_t>(tmp);
324  }
325  else if constexpr(N == 8)
326  {
327  int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource,
328  src_thread_addr_offset,
329  src_wave_addr_offset,
330  static_cast<index_t>(coherence));
331 
332  return bit_cast<int8x8_t>(tmp);
333  }
334  else if constexpr(N == 16)
335  {
336  int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
337  src_thread_addr_offset,
338  src_wave_addr_offset,
339  static_cast<index_t>(coherence));
340  return bit_cast<int8x16_t>(tmp);
341  }
342  else if constexpr(N == 32)
343  {
344  int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
345  src_thread_addr_offset,
346  src_wave_addr_offset,
347  static_cast<index_t>(coherence));
348  int32x4_t tmp1 =
349  llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
350  src_thread_addr_offset,
351  src_wave_addr_offset + 4 * sizeof(int32_t),
352  static_cast<index_t>(coherence));
354 
355  tmp.AsType<int32x4_t>()(Number<0>{}) = tmp0;
356  tmp.AsType<int32x4_t>()(Number<1>{}) = tmp1;
357 
358  return bit_cast<int8x32_t>(tmp);
359  }
360  else if constexpr(N == 64)
361  {
362  int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
363  src_thread_addr_offset,
364  src_wave_addr_offset,
365  static_cast<index_t>(coherence));
366  int32x4_t tmp1 =
367  llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
368  src_thread_addr_offset,
369  src_wave_addr_offset + 4 * sizeof(int32_t),
370  static_cast<index_t>(coherence));
371  int32x4_t tmp2 =
372  llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
373  src_thread_addr_offset,
374  src_wave_addr_offset + 8 * sizeof(int32_t),
375  static_cast<index_t>(coherence));
376  int32x4_t tmp3 =
377  llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
378  src_thread_addr_offset,
379  src_wave_addr_offset + 12 * sizeof(int32_t),
380  static_cast<index_t>(coherence));
381 
383 
384  tmp.AsType<int32x4_t>()(Number<0>{}) = tmp0;
385  tmp.AsType<int32x4_t>()(Number<1>{}) = tmp1;
386  tmp.AsType<int32x4_t>()(Number<2>{}) = tmp2;
387  tmp.AsType<int32x4_t>()(Number<3>{}) = tmp3;
388 
389  return bit_cast<int8x64_t>(tmp);
390  }
391 }
392 
393 template <typename T,
394  index_t N,
396 __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
397  index_t src_thread_addr_offset,
398  index_t src_wave_addr_offset)
399 {
400  static_assert(
401  (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
402  (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
403  (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
404  (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
405  (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
406  (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
407  (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
408  (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
409  (is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
410  (is_same<T, pk_i4_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
412  (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
413  "wrong! not implemented");
414 
415  using r_t = typename vector_type<T, N>::type;
416  auto raw_data = amd_buffer_load_impl_raw<sizeof(T) * N, coherence>(
417  src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
418  return bit_cast<r_t>(raw_data);
419 }
420 
421 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
422 __device__ void
424  int32x4_t dst_wave_buffer_resource,
425  index_t dst_thread_addr_offset,
426  index_t dst_wave_addr_offset)
427 {
428  static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
429  "wrong! not implemented");
430 
431  if constexpr(N == 1)
432  {
433  llvm_amdgcn_raw_buffer_store_i8(src_thread_data,
434  dst_wave_buffer_resource,
435  dst_thread_addr_offset,
436  dst_wave_addr_offset,
437  static_cast<index_t>(coherence));
438  }
439  else if constexpr(N == 2)
440  {
441 
442  llvm_amdgcn_raw_buffer_store_i16(bit_cast<int16_t>(src_thread_data),
443  dst_wave_buffer_resource,
444  dst_thread_addr_offset,
445  dst_wave_addr_offset,
446  static_cast<index_t>(coherence));
447  }
448  else if constexpr(N == 4)
449  {
450  llvm_amdgcn_raw_buffer_store_i32(bit_cast<int32_t>(src_thread_data),
451  dst_wave_buffer_resource,
452  dst_thread_addr_offset,
453  dst_wave_addr_offset,
454  static_cast<index_t>(coherence));
455  }
456  else if constexpr(N == 8)
457  {
458  llvm_amdgcn_raw_buffer_store_i32x2(bit_cast<int32x2_t>(src_thread_data),
459  dst_wave_buffer_resource,
460  dst_thread_addr_offset,
461  dst_wave_addr_offset,
462  static_cast<index_t>(coherence));
463  }
464  else if constexpr(N == 16)
465  {
466  llvm_amdgcn_raw_buffer_store_i32x4(bit_cast<int32x4_t>(src_thread_data),
467  dst_wave_buffer_resource,
468  dst_thread_addr_offset,
469  dst_wave_addr_offset,
470  static_cast<index_t>(coherence));
471  }
472  else if constexpr(N == 32)
473  {
474  vector_type<int32_t, 8> tmp{bit_cast<int32x8_t>(src_thread_data)};
475 
476  llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<0>{}],
477  dst_wave_buffer_resource,
478  dst_thread_addr_offset,
479  dst_wave_addr_offset,
480  static_cast<index_t>(coherence));
481 
482  llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<1>{}],
483  dst_wave_buffer_resource,
484  dst_thread_addr_offset,
485  dst_wave_addr_offset + sizeof(int32_t) * 4,
486  static_cast<index_t>(coherence));
487  }
488  else if constexpr(N == 64)
489  {
490  vector_type<int32_t, 16> tmp{bit_cast<int32x16_t>(src_thread_data)};
491 
492  llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<0>{}],
493  dst_wave_buffer_resource,
494  dst_thread_addr_offset,
495  dst_wave_addr_offset,
496  static_cast<index_t>(coherence));
497 
498  llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<1>{}],
499  dst_wave_buffer_resource,
500  dst_thread_addr_offset,
501  dst_wave_addr_offset + sizeof(int32_t) * 4,
502  static_cast<index_t>(coherence));
503 
504  llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<2>{}],
505  dst_wave_buffer_resource,
506  dst_thread_addr_offset,
507  dst_wave_addr_offset + sizeof(int32_t) * 8,
508  static_cast<index_t>(coherence));
509 
510  llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<3>{}],
511  dst_wave_buffer_resource,
512  dst_thread_addr_offset,
513  dst_wave_addr_offset + sizeof(int32_t) * 12,
514  static_cast<index_t>(coherence));
515  }
516 }
517 
518 template <typename T,
519  index_t N,
521 __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src_thread_data,
522  int32x4_t dst_wave_buffer_resource,
523  index_t dst_thread_addr_offset,
524  index_t dst_wave_addr_offset)
525 {
526  static_assert(
527  (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
528  (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
529  (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
530  (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
531  (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
532  (is_same<T, f8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
533  (is_same<T, bf8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
535  (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
536  (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
537  "wrong! not implemented");
538 
539  using r_t = typename vector_type<int8_t, sizeof(T) * N>::type;
540 
541  amd_buffer_store_impl_raw<sizeof(T) * N, coherence>(bit_cast<r_t>(src_thread_data),
542  dst_wave_buffer_resource,
543  dst_thread_addr_offset,
544  dst_wave_addr_offset);
545 }
546 
547 template <typename T, index_t N>
548 __device__ void amd_global_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
549  T* addr)
550 {
551  static_assert((is_same<T, bhalf_t>::value && (N == 2 || N == 4 || N == 8)) ||
552  (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)),
553  "wrong! not implemented");
554 
555  if constexpr(is_same<T, half_t>::value)
556  {
557  vector_type<half_t, N> tmp{src_thread_data};
558  static_for<0, N / 2, 1>{}([&](auto i) {
559  __builtin_amdgcn_global_atomic_fadd_v2f16(bit_cast<half2_t*>(addr) + i,
560  tmp.template AsType<half2_t>()[i]);
561  });
562  }
563 #if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__)
564  else if constexpr(is_same<T, bhalf_t>::value)
565  {
566  vector_type<bhalf_t, N> tmp{src_thread_data};
567  static_for<0, N / 2, 1>{}([&](auto i) {
568  __builtin_amdgcn_global_atomic_fadd_v2bf16(bit_cast<bhalf2_t*>(addr) + i,
569  tmp.template AsType<bhalf2_t>()[i]);
570  });
571  }
572 #endif
573 }
574 
575 template <typename T, index_t N>
576 __device__ void amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
577  int32x4_t dst_wave_buffer_resource,
578  index_t dst_thread_addr_offset,
579  index_t dst_wave_addr_offset)
580 {
581  static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
582  (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)) ||
583  (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
584  "wrong! not implemented");
585 
586  if constexpr(is_same<T, float>::value)
587  {
588  if constexpr(N == 1)
589  {
591  dst_wave_buffer_resource,
592  dst_thread_addr_offset,
593  dst_wave_addr_offset,
594  0);
595  }
596  else if constexpr(N == 2)
597  {
598  vector_type<float, 2> tmp{src_thread_data};
599 
600  llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
601  dst_wave_buffer_resource,
602  dst_thread_addr_offset,
603  dst_wave_addr_offset,
604  0);
605 
606  llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
607  dst_wave_buffer_resource,
608  dst_thread_addr_offset,
609  dst_wave_addr_offset + sizeof(float),
610  0);
611  }
612  else if constexpr(N == 4)
613  {
614  vector_type<float, 4> tmp{src_thread_data};
615 
616  llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
617  dst_wave_buffer_resource,
618  dst_thread_addr_offset,
619  dst_wave_addr_offset,
620  0);
621 
622  llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
623  dst_wave_buffer_resource,
624  dst_thread_addr_offset,
625  dst_wave_addr_offset + sizeof(float),
626  0);
627 
628  llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<2>{}],
629  dst_wave_buffer_resource,
630  dst_thread_addr_offset,
631  dst_wave_addr_offset + 2 * sizeof(float),
632  0);
633 
634  llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<3>{}],
635  dst_wave_buffer_resource,
636  dst_thread_addr_offset,
637  dst_wave_addr_offset + 3 * sizeof(float),
638  0);
639  }
640  }
641  else if constexpr(is_same<T, half_t>::value)
642  {
643  if constexpr(N == 2)
644  {
646  dst_wave_buffer_resource,
647  dst_thread_addr_offset,
648  dst_wave_addr_offset,
649  0);
650  }
651  else if constexpr(N == 4)
652  {
653  vector_type<half_t, 4> tmp{src_thread_data};
654 
655  static_for<0, 2, 1>{}([&](auto i) {
657  dst_wave_buffer_resource,
658  dst_thread_addr_offset,
659  dst_wave_addr_offset + i * sizeof(half2_t),
660  0);
661  });
662  }
663  else if constexpr(N == 8)
664  {
665  vector_type<half_t, 8> tmp{src_thread_data};
666 
667  static_for<0, 4, 1>{}([&](auto i) {
669  dst_wave_buffer_resource,
670  dst_thread_addr_offset,
671  dst_wave_addr_offset + i * sizeof(half2_t),
672  0);
673  });
674  }
675  }
676  else if constexpr(is_same<T, int32_t>::value)
677  {
678  if constexpr(N == 1)
679  {
681  dst_wave_buffer_resource,
682  dst_thread_addr_offset,
683  dst_wave_addr_offset,
684  0);
685  }
686  else if constexpr(N == 2)
687  {
688  vector_type<int32_t, 2> tmp{src_thread_data};
689 
691  dst_wave_buffer_resource,
692  dst_thread_addr_offset,
693  dst_wave_addr_offset,
694  0);
695 
697  dst_wave_buffer_resource,
698  dst_thread_addr_offset,
699  dst_wave_addr_offset + sizeof(int32_t),
700  0);
701  }
702  else if constexpr(N == 4)
703  {
704  vector_type<int32_t, 4> tmp{src_thread_data};
705 
707  dst_wave_buffer_resource,
708  dst_thread_addr_offset,
709  dst_wave_addr_offset,
710  0);
711 
713  dst_wave_buffer_resource,
714  dst_thread_addr_offset,
715  dst_wave_addr_offset + sizeof(int32_t),
716  0);
717 
719  dst_wave_buffer_resource,
720  dst_thread_addr_offset,
721  dst_wave_addr_offset + 2 * sizeof(int32_t),
722  0);
723 
725  dst_wave_buffer_resource,
726  dst_thread_addr_offset,
727  dst_wave_addr_offset + 3 * sizeof(int32_t),
728  0);
729  }
730  }
731 }
732 
733 template <typename T, index_t N>
734 __device__ void amd_buffer_atomic_max_impl(const typename vector_type<T, N>::type src_thread_data,
735  int32x4_t dst_wave_buffer_resource,
736  index_t dst_thread_addr_offset,
737  index_t dst_wave_addr_offset)
738 {
739  static_assert((is_same<T, double>::value && (N == 1 || N == 2 || N == 4)),
740  "wrong! not implemented");
741  if constexpr(is_same<T, double>::value)
742  {
743  if constexpr(N == 1)
744  {
746  dst_wave_buffer_resource,
747  dst_thread_addr_offset,
748  dst_wave_addr_offset,
749  0);
750  }
751  else if constexpr(N == 2)
752  {
753  vector_type<double, 2> tmp{src_thread_data};
754 
755  llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<0>{}],
756  dst_wave_buffer_resource,
757  dst_thread_addr_offset,
758  dst_wave_addr_offset,
759  0);
760 
761  llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<1>{}],
762  dst_wave_buffer_resource,
763  dst_thread_addr_offset,
764  dst_wave_addr_offset + sizeof(double),
765  0);
766  }
767  else if constexpr(N == 4)
768  {
769  vector_type<double, 4> tmp{src_thread_data};
770 
771  llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<0>{}],
772  dst_wave_buffer_resource,
773  dst_thread_addr_offset,
774  dst_wave_addr_offset,
775  0);
776 
777  llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<1>{}],
778  dst_wave_buffer_resource,
779  dst_thread_addr_offset,
780  dst_wave_addr_offset + sizeof(double),
781  0);
782 
783  llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<2>{}],
784  dst_wave_buffer_resource,
785  dst_thread_addr_offset,
786  dst_wave_addr_offset + 2 * sizeof(double),
787  0);
788 
789  llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<3>{}],
790  dst_wave_buffer_resource,
791  dst_thread_addr_offset,
792  dst_wave_addr_offset + 3 * sizeof(double),
793  0);
794  }
795  }
796 }
797 
798 // buffer_load requires:
799 // 1) p_src_wave must point to global memory space
800 // 2) p_src_wave must be a wavewise pointer.
801 // It is user's responsibility to make sure that is true.
802 template <typename T,
803  index_t N,
805 __device__ typename vector_type_maker<T, N>::type::type
807  index_t src_thread_element_offset,
808  bool src_thread_element_valid,
809  index_t src_element_space_size)
810 {
811  const int32x4_t src_wave_buffer_resource =
812  make_wave_buffer_resource(p_src_wave, src_element_space_size);
813 
814  index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
815 
816  using vector_t = typename vector_type_maker<T, N>::type::type;
817  using scalar_t = typename scalar_type<vector_t>::type;
818 
819  constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
820 
821 #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
822  uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
823  return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
824  src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
825 
826 #else
827 
828  vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
829  src_wave_buffer_resource, src_thread_addr_offset, 0)};
830  return src_thread_element_valid ? tmp : vector_t(0);
831 #endif
832 }
833 
834 // buffer_load requires:
835 // 1) p_src_wave must point to global memory space
836 // 2) p_src_wave must be a wavewise pointer.
837 // It is user's responsibility to make sure that is true.
838 template <typename T,
839  index_t N,
841 __device__ typename vector_type_maker<T, N>::type::type
843  index_t src_thread_element_offset,
844  bool src_thread_element_valid,
845  index_t src_element_space_size,
846  T customized_value)
847 {
848  const int32x4_t src_wave_buffer_resource =
849  make_wave_buffer_resource(p_src_wave, src_element_space_size);
850 
851  index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
852 
853  using vector_t = typename vector_type_maker<T, N>::type::type;
854  using scalar_t = typename scalar_type<vector_t>::type;
855 
856  constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
857 
858  vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
859  src_wave_buffer_resource, src_thread_addr_offset, 0)};
860 
861  return src_thread_element_valid ? tmp : vector_t(customized_value);
862 }
863 
864 // buffer_store requires:
865 // 1) p_dst_wave must point to global memory
866 // 2) p_dst_wave must be a wavewise pointer.
867 // It is user's responsibility to make sure that is true.
868 template <typename T,
869  index_t N,
871 __device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_data,
872  T* p_dst_wave,
873  const index_t dst_thread_element_offset,
874  const bool dst_thread_element_valid,
875  const index_t dst_element_space_size)
876 {
877  const int32x4_t dst_wave_buffer_resource =
878  make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
879 
880  index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
881 
882  using vector_t = typename vector_type_maker<T, N>::type::type;
883  using scalar_t = typename scalar_type<vector_t>::type;
884  constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
885 
886 #if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
887  uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
888  amd_buffer_store_impl<scalar_t, vector_size, coherence>(
889  src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
890 #else
891  if(dst_thread_element_valid)
892  {
893  amd_buffer_store_impl<scalar_t, vector_size, coherence>(
894  src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
895  }
896 #endif
897 }
898 
899 // buffer_atomic_add requires:
900 // 1) p_dst_wave must point to global memory
901 // 2) p_dst_wave must be a wavewise pointer.
902 // It is user's responsibility to make sure that is true.
903 template <typename T, index_t N>
904 __device__ void
906  T* p_dst_wave,
907  const index_t dst_thread_element_offset,
908  const bool dst_thread_element_valid,
909  const index_t dst_element_space_size)
910 {
911  const int32x4_t dst_wave_buffer_resource =
912  make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
913 
914  index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
915 
916  using vector_t = typename vector_type_maker<T, N>::type::type;
917  using scalar_t = typename scalar_type<vector_t>::type;
918  constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
919 
920  if constexpr(is_same<T, bhalf_t>::value)
921  {
922  if(dst_thread_element_valid)
923  {
924  amd_global_atomic_add_impl<scalar_t, vector_size>(
925  src_thread_data, p_dst_wave + dst_thread_element_offset);
926  }
927  }
928  else
929  {
930 #if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
931  uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
932 
933  amd_buffer_atomic_add_impl<scalar_t, vector_size>(
934  src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
935 #else
936  if(dst_thread_element_valid)
937  {
938  amd_buffer_atomic_add_impl<scalar_t, vector_size>(
939  src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
940  }
941 #endif
942  }
943 }
944 
945 // buffer_atomic_max requires:
946 // 1) p_dst_wave must point to global memory
947 // 2) p_dst_wave must be a wavewise pointer.
948 // It is user's responsibility to make sure that is true.
949 template <typename T, index_t N>
950 __device__ void
952  T* p_dst_wave,
953  const index_t dst_thread_element_offset,
954  const bool dst_thread_element_valid,
955  const index_t dst_element_space_size)
956 {
957  const int32x4_t dst_wave_buffer_resource =
958  make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
959 
960  index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
961 
962  using vector_t = typename vector_type_maker<T, N>::type::type;
963  using scalar_t = typename scalar_type<vector_t>::type;
964  constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
965 
966 #if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
967  uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
968 
969  amd_buffer_atomic_max_impl<scalar_t, vector_size>(
970  src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
971 #else
972  if(dst_thread_element_valid)
973  {
974  amd_buffer_atomic_max_impl<scalar_t, vector_size>(
975  src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
976  }
977 #endif
978 }
979 
980 // Direct loads from global to LDS.
981 __device__ void
983  __attribute__((address_space(3))) uint32_t* lds_ptr,
984  index_t size,
985  index_t voffset,
986  index_t soffset,
987  index_t offset,
988  index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");
989 
990 #ifndef __HIPCC_RTC__
991 template <typename T, index_t NumElemsPerThread>
992 __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
993  const index_t global_offset,
994  T* lds_base_ptr,
995  const index_t lds_offset,
996  const bool is_valid,
997  const index_t src_element_space_size)
998 {
999  // Direct loads require that each thread reads and writes exactly a single DWORD.
1000  constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
1001 #if defined(__gfx950__)
1002  constexpr auto dword_bytes = 4;
1003  static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
1004  bytes_per_thread == dword_bytes * 4);
1005 #elif defined(__gfx942__)
1006  constexpr auto dword_bytes = 4;
1007  static_assert(bytes_per_thread == dword_bytes);
1008 #endif
1009 
1010  const int32x4_t src_resource =
1011  make_wave_buffer_resource(global_base_ptr, src_element_space_size);
1012  const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
1013 
1014 #if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
1015  T* lds_ptr = lds_base_ptr + lds_offset;
1016 #ifndef CK_CODE_GEN_RTC
1017  auto const lds_ptr_sgpr =
1018  __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
1019 #else
1020  auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((reinterpret_cast<size_t>(lds_ptr)));
1021 #endif
1022  asm volatile("s_mov_b32 m0, %0; \n\t"
1023  "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
1024  "v"(global_offset_bytes),
1025  "s"(src_resource)
1026  : "memory");
1027 #else
1028  // LDS pointer must be attributed with the LDS address space.
1029  __attribute__((address_space(3))) uint32_t* lds_ptr =
1030 #ifndef CK_CODE_GEN_RTC
1031  reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
1032  reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
1033 #else
1034  reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
1035  reinterpret_cast<size_t>(lds_base_ptr + lds_offset));
1036 #endif
1037 
1039  src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
1040 #endif
1041 }
1042 #endif
1043 
1044 } // namespace ck
#define CK_BUFFER_RESOURCE_3RD_DWORD
Definition: ck.hpp:81
Definition: ck.hpp:270
__device__ void llvm_amdgcn_raw_buffer_store_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
__device__ int32x4_t make_wave_buffer_resource_with_default_range(T *p_wave)
Definition: amd_buffer_addressing.hpp:39
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition: statically_indexed_array.hpp:45
__device__ void amd_buffer_store(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:871
__device__ void amd_direct_load_global_to_lds(const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)
Definition: amd_buffer_addressing.hpp:992
__device__ void amd_buffer_atomic_max(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:951
__device__ int32x4_t llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
__device__ void llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
__device__ void llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
__device__ void amd_buffer_store_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:521
__device__ float llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
__device__ int32x2_t llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
AmdBufferCoherenceEnum
Definition: amd_buffer_coherence.hpp:9
typename vector_type< bhalf_t, 4 >::type bhalf4_t
Definition: dtype_vector.hpp:2162
__device__ int32x4_t make_wave_buffer_resource(T *p_wave, index_t element_space_size)
Definition: amd_buffer_addressing.hpp:24
typename vector_type< int32_t, 2 >::type int32x2_t
Definition: dtype_vector.hpp:2168
__device__ void llvm_amdgcn_raw_buffer_store_fp16(half_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
typename vector_type< int8_t, 2 >::type int8x2_t
Definition: dtype_vector.hpp:2177
__device__ half_t llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
__device__ bhalf_t llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
__device__ void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
__device__ void llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
__device__ int8x4_t llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
__device__ void amd_buffer_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:576
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition: amd_buffer_addressing.hpp:842
__device__ float4_t llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
__device__ bhalf2_t llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
typename vector_type< float, 2 >::type float2_t
Definition: dtype_vector.hpp:2146
__device__ void llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
typename vector_type< half_t, 4 >::type half4_t
Definition: dtype_vector.hpp:2155
__device__ void amd_global_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, T *addr)
Definition: amd_buffer_addressing.hpp:548
_Float16 half_t
Definition: data_type.hpp:31
__device__ void llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
ushort bhalf_t
Definition: data_type.hpp:30
__device__ vector_type< T, N >::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:396
typename vector_type< bhalf_t, 2 >::type bhalf2_t
Definition: dtype_vector.hpp:2161
__device__ void llvm_amdgcn_raw_buffer_store_i16(bhalf_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
__device__ void llvm_amdgcn_raw_buffer_store_i16x2(bhalf2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
typename vector_type< float, 4 >::type float4_t
Definition: dtype_vector.hpp:2147
__device__ half2_t llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
__device__ void amd_buffer_atomic_add(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:905
__device__ void llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
__device__ bhalf4_t llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
__device__ int8_t llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
__device__ float2_t llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
typename vector_type< half_t, 2 >::type half2_t
Definition: dtype_vector.hpp:2154
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_zero(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
Definition: amd_buffer_addressing.hpp:806
__device__ void amd_buffer_atomic_max_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:734
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
typename vector_type< int32_t, 4 >::type int32x4_t
Definition: dtype_vector.hpp:2169
int32_t index_t
Definition: ck.hpp:301
__device__ void llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
typename vector_type< int8_t, 4 >::type int8x4_t
Definition: dtype_vector.hpp:2178
__device__ void llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
__device__ int32_t llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
__device__ void amd_buffer_store_impl_raw(const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:423
__device__ void llvm_amdgcn_raw_buffer_store_i16x4(bhalf4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:292
__device__ int8x2_t llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
typename remove_cv< T >::type remove_cv_t
Definition: type.hpp:295
__device__ void llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
__device__ half4_t llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
__device__ void llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
signed short int16_t
Definition: stdint.h:122
_W64 unsigned int uintptr_t
Definition: stdint.h:164
unsigned int uint32_t
Definition: stdint.h:126
signed int int32_t
Definition: stdint.h:123
signed char int8_t
Definition: stdint.h:121
Definition: integral_constant.hpp:20
Definition: type.hpp:177
Definition: data_type.hpp:39
Definition: functional2.hpp:33
Definition: dtype_vector.hpp:31
Definition: dtype_vector.hpp:11
Definition: amd_buffer_addressing.hpp:12
int32x4_t content
Definition: amd_buffer_addressing.hpp:17
StaticallyIndexedArray< int32_t, 4 > config
Definition: amd_buffer_addressing.hpp:20
constexpr __device__ BufferResource()
Definition: amd_buffer_addressing.hpp:13
StaticallyIndexedArray< int32_t, 4 > range
Definition: amd_buffer_addressing.hpp:19
StaticallyIndexedArray< T *, 2 > address
Definition: amd_buffer_addressing.hpp:18