/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-7.0.0/include/ck_tile/core/tensor/buffer_view.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-7.0.0/include/ck_tile/core/tensor/buffer_view.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-7.0.0/include/ck_tile/core/tensor/buffer_view.hpp Source File
buffer_view.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
8 #if __clang_major__ == 20
10 #else
12 #endif
22 
23 namespace ck_tile {
24 
25 // T may be scalar or vector
26 // X may be scalar or vector
27 // T and X have same scalar type
28 // X contains multiple T
29 // FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
30 // transforms of tensor_view/Tensor
31 // FIXME: amd_buffer_coherence_enum is only meaningful for buffer addressing. Need to split
32 // buffer_view definition for different memory address space (Global/GenericLds/Vgpr)
33 template <address_space_enum BufferAddressSpace,
34  typename T,
35  typename BufferSizeType,
36  bool InvalidElementUseNumericalZeroValue,
38 struct buffer_view;
39 
40 // Address Space: generic
41 // T may be scalar or vector
42 // X may be scalar or vector
43 // T and X have same scalar type
44 // X contains multiple T
45 // FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
46 // transforms of tensor_view/Tensor
47 template <typename T, typename BufferSizeType, bool InvalidElementUseNumericalZeroValue>
49  T,
50  BufferSizeType,
51  InvalidElementUseNumericalZeroValue,
53 {
54  using type = T; // element type of the view
55 
56  T* p_data_ = nullptr; // base pointer; indexed in units of T by operator[] / operator()
57  BufferSizeType buffer_size_; // size as passed to the constructors (stored unchanged)
58  remove_cvref_t<T> invalid_element_value_ = T{0}; // returned by get() for invalid elements when InvalidElementUseNumericalZeroValue is false
59 
61  : p_data_{}, buffer_size_{}, invalid_element_value_{}
62  {
63  }
64 
65  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data, BufferSizeType buffer_size)
    // Builds a view from a base pointer and a size; the invalid-element value is set to 0.
66  : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
67  {
68  }
69 
70  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data,
71  BufferSizeType buffer_size,
72  T invalid_element_value)
    // Builds a view from a base pointer and a size, with a caller-supplied value that get()
    // returns for invalid elements when InvalidElementUseNumericalZeroValue is false.
73  : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
74  {
75  }
76 
78 
80  {
82  }
83 
84  // Read-only element access: returns p_data_[i]. i is an offset counted in units of T
85  // FIXME: doesn't do is_valid check (i is used as-is, with no validity or range test)
86  CK_TILE_DEVICE constexpr const T& operator[](index_t i) const { return p_data_[i]; }
87 
88  // Mutable element access: returns a reference to p_data_[i]. i is an offset counted in units of T
89  // FIXME: doesn't do is_valid check (i is used as-is, with no validity or range test)
90  CK_TILE_DEVICE constexpr T& operator()(index_t i) { return p_data_[i]; }
91 
92  // i is offset of T, not X. i should be aligned to X
93  template <typename X,
94  bool oob_conditional_check = true,
95  typename std::enable_if<
96  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
97  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
98  bool>::type = false>
99  CK_TILE_DEVICE constexpr auto get(index_t i,
100  index_t linear_offset,
101  bool is_valid_element,
103  {
104  // X contains multiple T
105  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
106 
107  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
108 
109  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
110  "wrong! X should contain multiple T");
111 
112  if(is_valid_element)
113  {
114 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
115  X tmp;
116 
117  __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
118 
119  return tmp;
120 #else
121  return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
122 #endif
123  }
124  else
125  {
126  if constexpr(InvalidElementUseNumericalZeroValue)
127  {
128  return X{numeric<remove_cvref_t<T>>::zero()};
129  }
130  else
131  {
132  return X{invalid_element_value_};
133  }
134  }
135  }
136 
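137  /*
138  In the generic address space, the buffer view does not support the transpose instruction.
139  Any attempt to use it is reported as a compilation error.
     NOTE(review): the static_assert message below says "global memory space" although this is
     the generic address space specialization; it looks like a copy-paste and should be confirmed.
140  */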
141  template <typename X,
142  bool oob_conditional_check = true,
143  typename std::enable_if<
144  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
145  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
146  bool>::type = false>
148  index_t linear_offset,
149  bool is_valid_element,
151  {
152  static_assert(false, "Error: transpose load not supported in global memory space.");
153  ignore = i;
154  ignore = linear_offset;
155  ignore = is_valid_element;
156  return;
157  }
158 
159  // Applies memory operation Op with value x, forwarding (i, linear_offset, is_valid_element)
     // to set()/get(). i is offset of T, not X. i should be aligned to X.
     // Only Op == set and Op == add have a branch here; for any other Op the body is empty.
160  template <memory_operation_enum Op,
161  typename X,
162  typename std::enable_if<
163  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
164  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
165  bool>::type = false>
166  CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
167  {
168  if constexpr(Op == memory_operation_enum::set)
169  {
170  this->template set<X>(i, linear_offset, is_valid_element, x); // plain store of x
171  }
172  // FIXME: remove memory_operation_enum::add
173  else if constexpr(Op == memory_operation_enum::add)
174  {
175  auto tmp = this->template get<X>(i, linear_offset, is_valid_element); // read the current value first
176  this->template set<X>(i, linear_offset, is_valid_element, x + tmp); // then store the sum (separate get and set calls)
177  }
178  }
179 
180  // Stores x (of type X) at &p_data_[i + linear_offset] when is_valid_element is true;
     // does nothing otherwise. i is offset of T, not X. i should be aligned to X
181  template <typename X,
182  typename std::enable_if<
183  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
184  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
185  bool>::type = false>
186  CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
187  {
188  // X contains multiple T
189  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
190 
191  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
192 
193  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
194  "wrong! X should contain multiple T");
195 
196  if(is_valid_element)
197  {
198 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
199  X tmp = x; // local copy used as the memcpy source
200 
201  __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X)); // copies sizeof(X) bytes into the buffer
202 #else
203  *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x; // reinterprets the element address as X* and assigns
204 #endif
205  }
206  }
207 
208  // FIXME: remove. Always returns false for this view.
209  CK_TILE_DEVICE static constexpr bool is_static_buffer() { return false; }
210 
211  // FIXME: remove. Always returns true for this view.
212  CK_TILE_DEVICE static constexpr bool is_dynamic_buffer() { return true; }
213 
215  {
216  printf("buffer_view{");
217 
218  // AddressSpace
219  printf("AddressSpace: generic, ");
220 
221  // p_data_
222  printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
223 
224  // buffer_size_
225  printf("buffer_size_: ");
226  print(buffer_size_);
227  printf(", ");
228 
229  // invalid_element_value_
230  printf("invalid_element_value_: ");
231  print(invalid_element_value_);
232 
233  printf("}");
234  }
235 };
236 
237 // Address Space: Global
238 // T may be scalar or vector
239 // X may be scalar or vector
240 // T and X have same scalar type
241 // X contains multiple T
242 // FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
243 // transforms of tensor_view/Tensor
244 template <typename T,
245  typename BufferSizeType,
246  bool InvalidElementUseNumericalZeroValue,
247  amd_buffer_coherence_enum Coherence>
249  T,
250  BufferSizeType,
251  InvalidElementUseNumericalZeroValue,
252  Coherence>
253 {
254  using type = T; // element type of the view
255 
256  T* p_data_ = nullptr; // base pointer; indexed in units of T by operator[] / operator()
257  BufferSizeType buffer_size_; // the pointer-taking constructors store buffer_size / PackedSize here
     // NOTE(review): listing line 258 is absent from this extract; the constructors initialize a
     // cached_buf_res_ member that is presumably declared there -- confirm against the real header.
259  remove_cvref_t<T> invalid_element_value_ = T{0}; // passed to the buffer load / returned by get() when InvalidElementUseNumericalZeroValue is false
260 
261  static constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize; // divisor applied to buffer_size in the constructors
262 
264  : p_data_{}, buffer_size_{}, cached_buf_res_{0}, invalid_element_value_{}
265  {
266  }
267 
268  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data, BufferSizeType buffer_size)
     // Builds a view from a base pointer and a size. The stored size is buffer_size / PackedSize;
     // cached_buf_res_ and the invalid-element value both start at 0.
269  : p_data_{p_data},
270  buffer_size_{buffer_size / PackedSize},
271  cached_buf_res_{0},
272  invalid_element_value_{0}
273  {
274  }
275 
276  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data,
277  BufferSizeType buffer_size,
278  T invalid_element_value)
     // Builds a view from a base pointer and a size, with a caller-supplied invalid-element value.
     // The stored size is buffer_size / PackedSize; cached_buf_res_ starts at 0.
279  : p_data_{p_data},
280  buffer_size_{buffer_size / PackedSize},
281  cached_buf_res_{0},
282  invalid_element_value_{invalid_element_value}
283  {
284  }
285 
286  // this is intentionally non-constexpr (it will call some intrinsic internally)
287  // Must call for buffers that need *_raw load/store
289  {
290  cached_buf_res_ = make_wave_buffer_resource(p_data_, (buffer_size_) * sizeof(type));
291  }
292 
294  {
296  }
297 
298  // Read-only element access: returns p_data_[i]. i is an offset counted in units of T
299  // FIXME: doesn't do is_valid check (i is used as-is, with no validity or range test)
300  CK_TILE_DEVICE constexpr const T& operator[](index_t i) const { return p_data_[i]; }
301 
302  // Mutable element access: returns a reference to p_data_[i]. i is an offset counted in units of T
303  // FIXME: doesn't do is_valid check (i is used as-is, with no validity or range test)
304  CK_TILE_DEVICE constexpr T& operator()(index_t i) { return p_data_[i]; }
305 
306  // i is offset of T, not X. i should be aligned to X
307  template <typename X,
308  bool oob_conditional_check = true,
309  typename std::enable_if<
310  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
311  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
312  bool>::type = false>
313  CK_TILE_DEVICE constexpr auto get(index_t i,
314  index_t linear_offset,
315  bool is_valid_element,
317  {
318  // X contains multiple T
319  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
320 
321  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
322 
323  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
324  "wrong! X should contain multiple T");
325 
326 #if CK_TILE_USE_AMD_BUFFER_LOAD
327  bool constexpr use_amd_buffer_addressing = true;
328 #else
329  bool constexpr use_amd_buffer_addressing = false;
330 #endif
331 
332  if constexpr(use_amd_buffer_addressing)
333  {
334  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
335 
336  if constexpr(InvalidElementUseNumericalZeroValue)
337  {
338  return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>,
339  t_per_x,
340  Coherence,
341  oob_conditional_check>(
342  p_data_, i + linear_offset, is_valid_element, buffer_size_);
343  }
344  else
345  {
347  remove_cvref_t<T>,
348  t_per_x,
349  Coherence,
350  oob_conditional_check>(p_data_,
351  i + linear_offset,
352  is_valid_element,
353  buffer_size_,
354  invalid_element_value_);
355  }
356  }
357  else
358  {
359  if(is_valid_element)
360  {
361 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
362  X tmp;
363 
364  __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
365 
366  return tmp;
367 #else
368  return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
369 #endif
370  }
371  else
372  {
373  if constexpr(InvalidElementUseNumericalZeroValue)
374  {
375  return X{numeric<remove_cvref_t<T>>::zero()};
376  }
377  else
378  {
379  return X{invalid_element_value_};
380  }
381  }
382  }
383  }
384 
385  /*
386  In the global memory address space, we do not support the transpose instruction in the buffer
387  view. Will report compilation error when developer wants to use it.
388  */
389  template <typename X,
390  bool oob_conditional_check = true,
391  typename std::enable_if<
392  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
393  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
394  bool>::type = false>
396  index_t linear_offset,
397  bool is_valid_element,
399  {
400  static_assert(false, "Error: transpose load not supported in global memory space.");
401  ignore = i;
402  ignore = linear_offset;
403  ignore = is_valid_element;
404  return;
405  }
406 
407  // i is offset of T, not X. i should be aligned to X
408  template <typename X,
409  bool oob_conditional_check = true,
410  bool pre_nop = false,
411  typename std::enable_if<
412  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
413  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
414  bool>::type = false>
416  index_t v_offset,
417  index_t i_offset,
418  bool is_valid_element,
419  bool_constant<pre_nop> = {}) const
420  {
421  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
422 
423  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
424 
425  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
426  "wrong! X should contain multiple T");
427 
428  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
429 
430  amd_buffer_load_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check, pre_nop>(
431  dst, cached_buf_res_, v_offset, i_offset, is_valid_element, bool_constant<pre_nop>{});
432  }
433 
434  // i is offset of T, not X. i should be aligned to X
435  template <typename X,
436  bool oob_conditional_check = true,
437  typename std::enable_if<
438  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
439  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
440  bool>::type = false>
442  index_t i,
443  index_t linear_offset,
444  bool is_valid_element,
446  {
447  // X is vector of T
448  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
449  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
450 
451  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
452  "wrong! X should contain multiple T");
453 
454  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
455  const int32x4_t src_wave_buffer_resource =
456  make_wave_buffer_resource(p_data_, (buffer_size_) * sizeof(type));
457 
458  amd_async_buffer_load_with_oob<remove_cvref_t<T>, t_per_x, Coherence>(
459  smem,
460  src_wave_buffer_resource,
461  i,
462  linear_offset,
463  is_valid_element,
465  }
466 
467  // i is offset of T, not X. i should be aligned to X
468  template <typename X,
469  bool pre_nop = false,
470  typename std::enable_if<
471  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
472  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
473  bool>::type = false>
475  index_t i,
476  index_t linear_offset,
477  bool /*is_valid_element*/,
478  bool_constant<pre_nop> = {}) const
479  {
480  // X is vector of T
481  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
482  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
483 
484  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
485  "wrong! X should contain multiple T");
486 
487  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
488 
489  amd_async_buffer_load_with_oob_raw<remove_cvref_t<T>, t_per_x, Coherence>(
490  smem, cached_buf_res_, i, linear_offset, bool_constant<pre_nop>{});
491  }
492 
493  // i is offset of T, not X. i should be aligned to X
494  template <memory_operation_enum Op,
495  typename X,
496  bool oob_conditional_check = true,
497  typename std::enable_if<
498  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
499  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
500  bool>::type = false>
502  index_t linear_offset,
503  bool is_valid_element,
504  const X& x,
506  {
507  if constexpr(Op == memory_operation_enum::set)
508  {
509  this->template set<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
510  }
511  else if constexpr(Op == memory_operation_enum::atomic_add)
512  {
513  this->template atomic_add<X, oob_conditional_check>(
514  i, linear_offset, is_valid_element, x);
515  }
516  else if constexpr(Op == memory_operation_enum::atomic_max)
517  {
518  this->template atomic_max<X, oob_conditional_check>(
519  i, linear_offset, is_valid_element, x);
520  }
521  // FIXME: remove memory_operation_enum::add
522  else if constexpr(Op == memory_operation_enum::add)
523  {
524  auto tmp =
525  this->template get<X, oob_conditional_check>(i, linear_offset, is_valid_element);
526  this->template set<X, oob_conditional_check>(
527  i, linear_offset, is_valid_element, x + tmp);
528  // tmp += x;
529  // this->template set<X>(i, is_valid_element, tmp);
530  }
531  }
532 
533  // i is offset of T, not X. i should be aligned to X
534  template <memory_operation_enum Op,
535  typename X,
536  bool oob_conditional_check = true,
537  bool pre_nop = false,
538  typename std::enable_if<
539  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
540  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
541  bool>::type = false>
543  index_t linear_offset,
544  bool is_valid_element,
545  const X& x,
548  {
549  if constexpr(Op == memory_operation_enum::set)
550  {
551  this->template set_raw<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
552  }
553  else if constexpr(Op == memory_operation_enum::atomic_add)
554  {
555  this->template atomic_add_raw<X, oob_conditional_check, pre_nop>(
556  i, linear_offset, is_valid_element, x);
557  }
558  else if constexpr(Op == memory_operation_enum::atomic_max)
559  {
560  // this->template atomic_max_raw<X>(i, linear_offset, is_valid_element, x);
561  }
562  }
563 
564  // Stores x (of type X) at element offset i + linear_offset.
     // i is offset of T, not X. i should be aligned to X.
     // With CK_TILE_USE_AMD_BUFFER_STORE the store goes through amd_buffer_store, which is
     // handed is_valid_element and buffer_size_; otherwise a plain store is done only when
     // is_valid_element is true.
     // NOTE(review): oob_conditional_check is accepted but not used anywhere in this body.
565  template <typename X,
566  bool oob_conditional_check = true,
567  typename std::enable_if<
568  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
569  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
570  bool>::type = false>
571  CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
572  {
573  // X contains multiple T
574  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
575 
576  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
577 
578  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
579  "wrong! X should contain multiple T");
580 
581 #if CK_TILE_USE_AMD_BUFFER_STORE
582  bool constexpr use_amd_buffer_addressing = true;
583 #else
584  bool constexpr use_amd_buffer_addressing = false;
585 #endif
586 
587  if constexpr(use_amd_buffer_addressing)
588  {
589  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; // number of T elements in one X
590 
591  amd_buffer_store<remove_cvref_t<T>, t_per_x, Coherence>(
592  x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
593  }
594  else
595  {
596  if(is_valid_element)
597  {
598 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
599  X tmp = x; // local copy used as the memcpy source
600 
601  __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X)); // copies sizeof(X) bytes into the buffer
602 #else
603  *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x; // reinterprets the element address as X* and assigns
604 #endif
605  }
606  }
607  }
608 
609  // Stores x through amd_buffer_store_raw. Unlike set(), i and linear_offset are passed to
     // the helper as two separate arguments instead of being summed here.
     // i is offset of T, not X. i should be aligned to X
610  template <typename X,
611  bool oob_conditional_check = true,
612  typename std::enable_if<
613  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
614  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
615  bool>::type = false>
616  CK_TILE_DEVICE void set_raw(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
617  {
618  // X contains multiple T
619  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
620 
621  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
622 
623  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
624  "wrong! X should contain multiple T");
625 
626  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; // number of T elements in one X
627  amd_buffer_store_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check>(
628  x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
629  }
630 
631  template <typename X,
632  bool oob_conditional_check = true,
633  typename std::enable_if<
634  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
635  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
636  bool>::type = false>
     // Atomically adds x at element offset i + linear_offset.
     // Depending on the CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_{INTEGER,FLOAT} macros and on the
     // scalar type, the add goes through amd_buffer_atomic_add (which is handed
     // is_valid_element and buffer_size_) or through atomic_add_g, called only when
     // is_valid_element is true.
     // NOTE(review): oob_conditional_check and Coherence are not used in this body.
637  CK_TILE_DEVICE void
638  atomic_add(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
639  {
640  using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
641 
642  // X contains multiple T
643  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
644 
645  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
646 
647  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
648  "wrong! X should contain multiple T");
649 
650  static_assert(get_address_space() == address_space_enum::global, "only support global mem");
651 
     // Buffer addressing is selected for int32_t (integer macro), and for float or half_t
     // with an even scalar_per_x_vector (float macro); everything else uses the fallback.
652 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
653  bool constexpr use_amd_buffer_addressing =
654  std::is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
655  std::is_same_v<remove_cvref_t<scalar_t>, float> ||
656  (std::is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
657 #elif CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
658  bool constexpr use_amd_buffer_addressing =
659  std::is_same_v<remove_cvref_t<scalar_t>, int32_t>;
660 #elif(!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
661  bool constexpr use_amd_buffer_addressing =
662  std::is_same_v<remove_cvref_t<scalar_t>, float> ||
663  (std::is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
664 #else
665  bool constexpr use_amd_buffer_addressing = false;
666 #endif
667 
668  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; // number of T elements in one X
669 
670  if constexpr(use_amd_buffer_addressing)
671  {
672  amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
673  x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
674  }
675  else
676  {
677  if(is_valid_element)
678  {
679  atomic_add_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
680  }
681  }
682  }
683 
684  template <typename X,
685  bool oob_conditional_check = true,
686  bool pre_nop = true, // NOTE(review): defaults to true here, while the other pre_nop parameters visible in this file default to false -- confirm intended
687  typename std::enable_if<
688  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
689  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
690  bool>::type = false>
     // Atomic add through amd_buffer_atomic_add_raw. i and linear_offset are passed to the
     // helper as two separate arguments, together with is_valid_element and buffer_size_.
691  CK_TILE_DEVICE void
692  atomic_add_raw(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
693  {
694  // using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
695 
696  // X contains multiple T
697  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
698 
699  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
700 
701  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
702  "wrong! X should contain multiple T");
703 
704  static_assert(get_address_space() == address_space_enum::global, "only support global mem");
705 
706  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; // number of T elements in one X
707 
708  amd_buffer_atomic_add_raw<remove_cvref_t<T>,
709  t_per_x,
710  Coherence,
711  oob_conditional_check,
712  pre_nop>(
713  x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
714  }
715 
716  template <typename X,
717  bool oob_conditional_check = true,
718  typename std::enable_if<
719  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
720  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
721  bool>::type = false>
     // Atomic max of x at element offset i + linear_offset.
     // amd_buffer_atomic_max is used only when CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 is
     // set and the scalar type is double; otherwise atomic_max_g is called, and only when
     // is_valid_element is true.
     // NOTE(review): oob_conditional_check and Coherence are not used in this body.
722  CK_TILE_DEVICE void
723  atomic_max(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
724  {
725  // X contains multiple T
726  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
727 
728  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
729 
730  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
731  "wrong! X should contain multiple T");
732 
733  static_assert(get_address_space() == address_space_enum::global, "only support global mem");
734 
735 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
736  using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
737  bool constexpr use_amd_buffer_addressing = std::is_same_v<remove_cvref_t<scalar_t>, double>;
738 #else
739  bool constexpr use_amd_buffer_addressing = false;
740 #endif
741 
742  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; // number of T elements in one X
743 
744  if constexpr(use_amd_buffer_addressing)
745  {
746  amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
747  x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
748  }
749  else if(is_valid_element)
750  {
751  atomic_max_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
752  }
753  }
754 
755  // FIXME: remove. Always returns false for this view.
756  CK_TILE_DEVICE static constexpr bool is_static_buffer() { return false; }
757 
758  // FIXME: remove. Always returns true for this view.
759  CK_TILE_DEVICE static constexpr bool is_dynamic_buffer() { return true; }
760 
762  {
763  printf("buffer_view{");
764 
765  // AddressSpace
766  printf("AddressSpace: Global, ");
767 
768  // p_data_
769  printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
770 
771  // buffer_size_
772  printf("buffer_size_: ");
773  print(buffer_size_);
774  printf(", ");
775 
776  // invalid_element_value_
777  printf("invalid_element_value_: ");
778  print(invalid_element_value_);
779 
780  printf("}");
781  }
782 };
783 
784 // Address Space: LDS
785 // T may be scalar or vector
786 // X may be scalar or vector
787 // T and X have same scalar type
788 // X contains multiple T
789 // FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
790 // transforms of tensor_view/Tensor
791 template <typename T, typename BufferSizeType, bool InvalidElementUseNumericalZeroValue>
793  T,
794  BufferSizeType,
795  InvalidElementUseNumericalZeroValue,
797 {
798  using type = T; // element type of the view
799 
800  T* p_data_ = nullptr; // base pointer; indexed in units of T by operator[] / operator()
801  BufferSizeType buffer_size_; // size as passed to the constructors (stored unchanged)
802  remove_cvref_t<T> invalid_element_value_ = T{0}; // returned for invalid elements when InvalidElementUseNumericalZeroValue is false
803 
805  : p_data_{}, buffer_size_{}, invalid_element_value_{}
806  {
807  }
808 
809  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data, BufferSizeType buffer_size)
     // Builds a view from a base pointer and a size; the invalid-element value is set to 0.
810  : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
811  {
812  }
813 
814  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data,
815  BufferSizeType buffer_size,
816  T invalid_element_value)
     // Builds a view from a base pointer and a size, with a caller-supplied value that is
     // returned for invalid elements when InvalidElementUseNumericalZeroValue is false.
817  : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
818  {
819  }
820 
822 
824  {
826  }
827 
828  // Read-only element access: returns p_data_[i]. i is an offset counted in units of T
829  // FIXME: doesn't do is_valid check (i is used as-is, with no validity or range test)
830  CK_TILE_DEVICE constexpr const T& operator[](index_t i) const { return p_data_[i]; }
831 
832  // Mutable element access: returns a reference to p_data_[i]. i is an offset counted in units of T
833  // FIXME: doesn't do is_valid check (i is used as-is, with no validity or range test)
834  CK_TILE_DEVICE constexpr T& operator()(index_t i) { return p_data_[i]; }
835 
836  // i is offset of T, not X. i should be aligned to X
837  template <typename X,
838  bool oob_conditional_check = true,
839  typename std::enable_if<
840  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
841  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
842  bool>::type = false>
843  CK_TILE_DEVICE constexpr auto get(index_t i,
844  index_t linear_offset,
845  bool is_valid_element,
847  {
848  // X contains multiple T
849  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
850 
851  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
852 
853  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
854  "wrong! X should contain multiple T");
855 
856  if(is_valid_element)
857  {
858 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
859  X tmp;
860 
861  __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
862 
863  return tmp;
864 #else
865  using buf_t = ext_vector_t<typename vector_traits<remove_cvref_t<T>>::scalar_type,
866  scalar_per_t_vector * scalar_per_x_vector>;
867  // using buf_t = ushort __attribute__((ext_vector_type(8)));
868  auto rtn = *c_style_pointer_cast<const buf_t*>(&p_data_[i + linear_offset]);
869  return bit_cast<X>(rtn);
870 #endif
871  }
872  else
873  {
874  if constexpr(InvalidElementUseNumericalZeroValue)
875  {
876  return X{numeric<remove_cvref_t<T>>::zero()};
877  }
878  else
879  {
880  return X{invalid_element_value_};
881  }
882  }
883  }
884 
885  // i is offset of T, not X. i should be aligned to X
886  template <typename X,
887  bool oob_conditional_check = true,
888  bool pre_nop = false,
889  typename std::enable_if<
890  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
891  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
892  bool>::type = false>
894  index_t v_offset,
895  index_t i_offset,
896  bool /*is_valid_element*/,
897  bool_constant<pre_nop> = {}) const
898  {
899  smem_load<sizeof(X)>{}(dst, v_offset * sizeof(T), i_offset * sizeof(T));
900  }
901 
902  template <typename X,
903  typename std::enable_if<
904  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
905  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
906  bool>::type = false>
     // Transposed load starting at p_data_ + i + linear_offset.
     // When __gfx950__ is defined, a valid element is loaded via amd_transpose_load_to_vgpr.
     // NOTE(review): when __gfx950__ is not defined, a valid element yields an all-zero X with
     // no diagnostic -- confirm callers never reach this path on other targets.
     // Invalid elements yield numeric zero or invalid_element_value_, depending on
     // InvalidElementUseNumericalZeroValue.
907  CK_TILE_DEVICE constexpr auto transpose_get([[maybe_unused]] index_t i, // unused on the non-gfx950 path
908  [[maybe_unused]] index_t linear_offset, // unused on the non-gfx950 path
909  bool is_valid_element) const
910  {
911  // X contains multiple T
912  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
913 
914  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
915 
916  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
917  "wrong! X should contain multiple T");
918 
919  if(is_valid_element)
920  {
921 #if defined(__gfx950__)
922  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; // number of T elements in one X
923  constexpr address_space_enum addr_space = get_address_space();
924  return amd_transpose_load_to_vgpr<remove_cvref_t<T>, t_per_x, addr_space>(
925  p_data_ + i + linear_offset);
926 #else
927  return X{numeric<remove_cvref_t<T>>::zero()}; // fallback: zeros, nothing is read from memory
928 #endif
929  }
930  else
931  {
932  if constexpr(InvalidElementUseNumericalZeroValue)
933  {
934  return X{numeric<remove_cvref_t<T>>::zero()};
935  }
936  else
937  {
938  return X{invalid_element_value_};
939  }
940  }
941  }
942 
943  // Applies memory operation Op with value x, forwarding (i, linear_offset, is_valid_element)
     // to set()/get(). i is offset of T, not X. i should be aligned to X.
     // Only Op == set and Op == add have a branch here; for any other Op the body is empty.
944  template <memory_operation_enum Op,
945  typename X,
946  typename std::enable_if<
947  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
948  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
949  bool>::type = false>
950  CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
951  {
952  if constexpr(Op == memory_operation_enum::set)
953  {
954  this->template set<X>(i, linear_offset, is_valid_element, x); // plain store of x
955  }
956  // FIXME: remove memory_operation_enum::add
957  else if constexpr(Op == memory_operation_enum::add)
958  {
959  auto tmp = this->template get<X>(i, linear_offset, is_valid_element); // read the current value first
960  this->template set<X>(i, linear_offset, is_valid_element, x + tmp); // then store the sum (separate get and set calls)
961  }
962  }
963 
// Store one X into p_data_ at element offset (i + linear_offset), counted in units of T.
// Nothing is written when is_valid_element is false.
// NOTE(review): in this listing the static_assert condition and the conditions of the
// `if constexpr` arms in the int8 branch are not visible; only their bodies are shown.
964  // i is offset of T, not X. i should be aligned to X
965  template <typename X,
966  typename std::enable_if<
967  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
968  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
969  bool>::type = false>
970  CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
971  {
972  // X contains multiple T
973  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
974 
975  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
976 
977  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
978  "wrong! X should contain multiple T");
979 
// The int8 store workaround below is compiled in only when this macro is non-zero.
980 #if CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
981  bool constexpr workaround_int8_ds_write_issue = true;
982 #else
983  bool constexpr workaround_int8_ds_write_issue = false;
984 #endif
985 
986  i += linear_offset; // simplicity
// Workaround path: taken when T's scalar type is int8_t and the workaround is enabled.
// Each arm stores x through a pointer to a wider integer type of the same total width.
987  if constexpr(std::is_same_v<typename vector_traits<remove_cvref_t<T>>::scalar_type,
988  int8_t> &&
989  workaround_int8_ds_write_issue)
990  {
991  if(is_valid_element)
992  {
993  // HACK: compiler would lower IR "store<i8, 16> address_space(3)" into inefficient
994  // ISA, so I try to let compiler emit IR "store<i32, 4>" which would be lower to
995  // ds_write_b128
996  // TODO: remove this after compiler fix
997  static_assert(
1014  // int8 on thread buffer
1023  // ext_vector_type for pk_int4 must use int8_t as type
1040  "wrong! not implemented for this combination, please add "
1041  "implementation");
1042 
1043  if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
1049  {
1050  // HACK: cast pointer of x is bad
1051  // TODO: remove this after compiler fix
1052  *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
1053  *c_style_pointer_cast<const int8_t*>(&x);
1054  }
1055  else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
1061  {
1062  // HACK: cast pointer of x is bad
1063  // TODO: remove this after compiler fix
1064  *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
1065  *c_style_pointer_cast<const int16_t*>(&x);
1066  }
1067  else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
1073  {
1074  // HACK: cast pointer of x is bad
1075  // TODO: remove this after compiler fix
1076  *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
1077  *c_style_pointer_cast<const int32_t*>(&x);
1078  }
1079  else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
1085  {
1086  // HACK: cast pointer of x is bad
1087  // TODO: remove this after compiler fix
1088  *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
1089  *c_style_pointer_cast<const int32x2_t*>(&x);
1090  }
1091  else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
1095  {
1096  // HACK: cast pointer of x is bad
1097  // TODO: remove this after compiler fix
1098  *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
1099  *c_style_pointer_cast<const int32x4_t*>(&x);
1100  }
1101  else if constexpr((std::is_same_v<remove_cvref_t<T>, int8x4_t> &&
1105  {
1106  // HACK: cast pointer of x is bad
1107  // TODO: remove this after compiler fix
1108  *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
1109  *c_style_pointer_cast<const int32_t*>(&x);
1110  }
1111  else if constexpr((std::is_same_v<remove_cvref_t<T>, int8x8_t> &&
1115  {
1116  // HACK: cast pointer of x is bad
1117  // TODO: remove this after compiler fix
1118  *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
1119  *c_style_pointer_cast<const int32x2_t*>(&x);
1120  }
1121  else if constexpr((std::is_same_v<remove_cvref_t<T>, int8x16_t> &&
1125  {
1126  // HACK: cast pointer of x is bad
1127  // TODO: remove this after compiler fix
1128  *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
1129  *c_style_pointer_cast<const int32x4_t*>(&x);
1130  }
1131  }
1132  }
// General path: every other element type, or the workaround disabled.
1133  else
1134  {
1135  if(is_valid_element)
1136  {
1137 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1138  X tmp = x;
1139 
1140  __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
1141 #else
// NOTE(review): buf_t has scalar_per_t_vector * scalar_per_x_vector lanes. If
// vector_traits<X>::vector_size already counts scalars (as the static_assert above
// suggests), buf_t is wider than X whenever T is itself a vector, and the
// reinterpret_cast below would read past x. Presumably only scalar T reaches this
// path; confirm, or use scalar_per_x_vector alone.
1142  using buf_t = ext_vector_t<typename vector_traits<remove_cvref_t<T>>::scalar_type,
1143  scalar_per_t_vector * scalar_per_x_vector>;
1144 
1145  *c_style_pointer_cast<buf_t*>(&p_data_[i]) = reinterpret_cast<const buf_t&>(x);
1146 #endif
1147  }
1148  }
1149  }
1150 
1151  // FIXME: remove
1152  CK_TILE_DEVICE static constexpr bool is_static_buffer() { return false; }
1153 
1154  // FIXME: remove
1155  CK_TILE_DEVICE static constexpr bool is_dynamic_buffer() { return true; }
1156 
1158  {
1159  printf("buffer_view{");
1160 
1161  // AddressSpace
1162  printf("AddressSpace: Lds, ");
1163 
1164  // p_data_
1165  printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
1166 
1167  // buffer_size_
1168  printf("buffer_size_: ");
1169  print(buffer_size_);
1170  printf(", ");
1171 
1172  // invalid_element_value_
1173  printf("invalid_element_value_: ");
1174  print(invalid_element_value_);
1175 
1176  printf("}");
1177  }
1178 };
1179 
1180 // Address Space: Vgpr
1181 // T may be scalar or vector
1182 // X may be scalar or vector
1183 // T and X have same scalar type
1184 // X contains multiple T
1185 // FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
1186 // transforms of tensor_view/Tensor
1187 template <typename T, typename BufferSizeType, bool InvalidElementUseNumericalZeroValue>
1189  T,
1190  BufferSizeType,
1191  InvalidElementUseNumericalZeroValue,
1193 {
1194  using type = T;
1195 
1196  T* p_data_ = nullptr;
1197  BufferSizeType buffer_size_;
1198  remove_cvref_t<T> invalid_element_value_ = T{0};
1199 
1201  : p_data_{}, buffer_size_{}, invalid_element_value_{}
1202  {
1203  }
1204 
1205  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data, BufferSizeType buffer_size)
1206  : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
1207  {
1208  }
1209 
1210  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data,
1211  BufferSizeType buffer_size,
1212  T invalid_element_value)
1213  : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
1214  {
1215  }
1216 
1218 
1220  {
1221  return address_space_enum::vgpr;
1222  }
1223 
1224  // i is offset of T
1225  // FIXME: doesn't do is_valid check
1226  CK_TILE_DEVICE constexpr const T& operator[](index_t i) const { return p_data_[i]; }
1227 
1228  // i is offset of T
1229  // FIXME: doesn't do is_valid check
1230  CK_TILE_DEVICE constexpr T& operator()(index_t i) { return p_data_[i]; }
1231 
1232  // i is offset of T, not X. i should be aligned to X
1233  template <typename X,
1234  bool oob_conditional_check = true,
1235  typename std::enable_if<
1236  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1237  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
1238  bool>::type = false>
1239  CK_TILE_DEVICE constexpr auto get(index_t i,
1240  index_t /*linear_offset*/,
1241  bool is_valid_element,
1243  {
1244  // X contains multiple T
1245  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
1246 
1247  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
1248 
1249  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
1250  "wrong! X should contain multiple T");
1251 
1252  if(is_valid_element)
1253  {
1254 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1255  X tmp;
1256 
1257  __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
1258 
1259  return tmp;
1260 #else
1261  return *c_style_pointer_cast<const X*>(&p_data_[i]);
1262 #endif
1263  }
1264  else
1265  {
1266  if constexpr(InvalidElementUseNumericalZeroValue)
1267  {
1268  return X{numeric<remove_cvref_t<T>>::zero()};
1269  }
1270  else
1271  {
1272  return X{invalid_element_value_};
1273  }
1274  }
1275  }
1276 
1277  // i is offset of T, not X. i should be aligned to X
1278  template <memory_operation_enum Op,
1279  typename X,
1280  typename std::enable_if<
1281  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1282  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
1283  bool>::type = false>
1284  CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
1285  {
1286  if constexpr(Op == memory_operation_enum::set)
1287  {
1288  this->template set<X>(i, linear_offset, is_valid_element, x);
1289  }
1290  // FIXME: remove memory_operation_enum::add
1291  else if constexpr(Op == memory_operation_enum::add)
1292  {
1293  auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
1294  this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
1295  }
1296  }
1297 
1298  // i is offset of T, not X. i should be aligned to X
1299  template <typename X,
1300  typename std::enable_if<
1301  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1302  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
1303  bool>::type = false>
1304  CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
1305  {
1306  // X contains multiple T
1307  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
1308 
1309  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
1310 
1311  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
1312  "wrong! X should contain multiple T");
1313 
1314  if(is_valid_element)
1315  {
1316 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1317  X tmp = x;
1318 
1319  __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X));
1320 #else
1321  *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
1322 #endif
1323  }
1324  }
1325 
1326  // FIXME: remove
1327  CK_TILE_DEVICE static constexpr bool is_static_buffer() { return false; }
1328 
1329  // FIXME: remove
1330  CK_TILE_DEVICE static constexpr bool is_dynamic_buffer() { return true; }
1331 
1333  {
1334  printf("buffer_view{");
1335 
1336  // AddressSpace
1337  printf("AddressSpace: Vgpr, ");
1338 
1339  // p_data_
1340  printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
1341 
1342  // buffer_size_
1343  printf("buffer_size_: ");
1344  print(buffer_size_);
1345  printf(", ");
1346 
1347  // invalid_element_value_
1348  printf("invalid_element_value_: ");
1349  print(invalid_element_value_);
1350 
1351  printf("}");
1352  }
1353 };
1354 
1355 template <address_space_enum BufferAddressSpace,
1357  typename T,
1358  typename BufferSizeType>
1359 CK_TILE_HOST_DEVICE constexpr auto make_buffer_view(T* p, BufferSizeType buffer_size)
1360 {
1362 }
1363 
1364 template <address_space_enum BufferAddressSpace,
1366  typename T,
1367  typename BufferSizeType,
1368  typename X,
1369  typename std::enable_if<std::is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value,
1370  bool>::type = false>
1371 CK_TILE_HOST_DEVICE constexpr auto
1372 make_buffer_view(T* p, BufferSizeType buffer_size, X invalid_element_value)
1373 {
1375  p, buffer_size, invalid_element_value};
1376 }
1377 
1378 } // namespace ck_tile
#define CK_TILE_DEVICE
Definition: config.hpp:40
#define CK_TILE_LDS_ADDR
Definition: config.hpp:57
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:41
Definition: cluster_descriptor.hpp:13
int8_t __attribute((ext_vector_type(16))) pk_int4x16_t
Definition: vector_type.hpp:238
memory_operation_enum
Definition: arch.hpp:44
tuple_array< T, N > thread_buffer
Definition: thread_buffer.hpp:14
int8_t __attribute((ext_vector_type(8))) pk_int4x8_t
Definition: vector_type.hpp:237
int8_t __attribute((ext_vector_type(4))) int8x4_t
Definition: vector_type.hpp:180
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition: amd_buffer_addressing.hpp:2464
int8_t int8_t
Definition: int8.hpp:20
amd_buffer_coherence_enum
Definition: amd_buffer_addressing.hpp:1316
constexpr CK_TILE_HOST_DEVICE auto make_buffer_view(T *p, BufferSizeType buffer_size)
Definition: buffer_view.hpp:1359
int8_t __attribute((ext_vector_type(16))) int8x16_t
Definition: vector_type.hpp:182
int32_t index_t
Definition: integer.hpp:9
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:21
int8_t __attribute((ext_vector_type(2))) int8x2_t
Definition: vector_type.hpp:179
constexpr detail::ignore_t ignore
Definition: ignore.hpp:20
typename impl::ext_vector< T, N >::type ext_vector_t
Definition: vector_type.hpp:83
int32_t int32_t
Definition: integer.hpp:10
int8_t __attribute((ext_vector_type(4))) pk_int4x4_t
Definition: vector_type.hpp:236
int32_t int32x4_t
Definition: vector_type.hpp:144
address_space_enum
Definition: arch.hpp:34
_Float16 half_t
Definition: half.hpp:111
CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void *ptr, uint32_t size=0xffffffff)
Definition: amd_buffer_addressing.hpp:40
int8_t __attribute((ext_vector_type(8))) int8x8_t
Definition: vector_type.hpp:181
std::enable_if< B, T > enable_if
Definition: enable_if.hpp:24
constexpr bool is_same_v
Definition: type.hpp:283
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:186
constexpr CK_TILE_DEVICE auto transpose_get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:147
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:166
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:70
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:99
constexpr CK_TILE_DEVICE auto transpose_get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:395
constexpr CK_TILE_DEVICE auto async_get_raw(remove_cvref_t< T > *smem, index_t i, index_t linear_offset, bool, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:474
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:300
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:313
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x, bool_constant< oob_conditional_check >={})
Definition: buffer_view.hpp:501
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:293
CK_TILE_DEVICE void update_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
Definition: buffer_view.hpp:542
constexpr CK_TILE_DEVICE auto async_get(CK_TILE_LDS_ADDR remove_cvref_t< T > *smem, index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:441
constexpr CK_TILE_DEVICE auto get_raw(remove_cvref_t< X > &dst, index_t v_offset, index_t i_offset, bool is_valid_element, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:415
CK_TILE_DEVICE void atomic_add(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:638
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:276
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:268
CK_TILE_DEVICE void set_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:616
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:571
CK_TILE_DEVICE void atomic_max(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:723
CK_TILE_DEVICE void atomic_add_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:692
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:970
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:843
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:950
constexpr CK_TILE_DEVICE auto transpose_get([[maybe_unused]] index_t i, [[maybe_unused]] index_t linear_offset, bool is_valid_element) const
Definition: buffer_view.hpp:907
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:814
constexpr CK_TILE_DEVICE auto get_raw(remove_cvref_t< X > &dst, index_t v_offset, index_t i_offset, bool, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:893
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:809
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:1304
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:1210
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:1205
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:1284
constexpr CK_TILE_DEVICE auto get(index_t i, index_t, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:1239
Definition: buffer_view.hpp:38
Definition: integral_constant.hpp:13
Definition: numeric.hpp:81
Definition: numeric.hpp:18
Definition: pk_int4.hpp:21
Definition: amd_buffer_addressing.hpp:836
Definition: vector_type.hpp:89