/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck_tile/core/tensor/buffer_view.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck_tile/core/tensor/buffer_view.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck_tile/core/tensor/buffer_view.hpp Source File
buffer_view.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
17 
18 namespace ck_tile {
19 
20 // T may be scalar or vector
21 // X may be scalar or vector
22 // T and X have same scalar type
23 // X contains multiple T
24 // FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
25 // transforms of tensor_view/Tensor
26 // FIXME: amd_buffer_coherence_enum is only meaningful for buffer addressing. Need to split
27 // buffer_view definition for different memory address space (Global/GenericLds/Vgpr)
28 template <address_space_enum BufferAddressSpace,
29  typename T,
30  typename BufferSizeType,
31  bool InvalidElementUseNumericalZeroValue,
33 struct buffer_view;
34 
35 // Address Space: generic
36 // T may be scalar or vector
37 // X may be scalar or vector
38 // T and X have same scalar type
39 // X contains multiple T
40 // FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
41 // transforms of tensor_view/Tensor
42 template <typename T, typename BufferSizeType, bool InvalidElementUseNumericalZeroValue>
44  T,
45  BufferSizeType,
46  InvalidElementUseNumericalZeroValue,
48 {
49  using type = T;
50 
51  T* p_data_ = nullptr;
52  BufferSizeType buffer_size_;
53  remove_cvref_t<T> invalid_element_value_ = T{0};
54 
56  : p_data_{}, buffer_size_{}, invalid_element_value_{}
57  {
58  }
59 
60  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data, BufferSizeType buffer_size)
61  : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
62  {
63  }
64 
65  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data,
66  BufferSizeType buffer_size,
67  T invalid_element_value)
68  : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
69  {
70  }
71 
73 
75  {
77  }
78 
79  // i is offset of T
80  // FIXME: doesn't do is_valid check
81  CK_TILE_DEVICE constexpr const T& operator[](index_t i) const { return p_data_[i]; }
82 
83  // i is offset of T
84  // FIXME: doesn't do is_valid check
85  CK_TILE_DEVICE constexpr T& operator()(index_t i) { return p_data_[i]; }
86 
87  // i is offset of T, not X. i should be aligned to X
88  template <typename X,
89  bool oob_conditional_check = true,
90  typename std::enable_if<
91  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
92  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
93  bool>::type = false>
94  CK_TILE_DEVICE constexpr auto get(index_t i,
95  index_t linear_offset,
96  bool is_valid_element,
98  {
99  // X contains multiple T
100  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
101 
102  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
103 
104  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
105  "wrong! X should contain multiple T");
106 
107  if(is_valid_element)
108  {
109 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
110  X tmp;
111 
112  __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
113 
114  return tmp;
115 #else
116  return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
117 #endif
118  }
119  else
120  {
121  if constexpr(InvalidElementUseNumericalZeroValue)
122  {
123  return X{numeric<remove_cvref_t<T>>::zero()};
124  }
125  else
126  {
127  return X{invalid_element_value_};
128  }
129  }
130  }
131 
132  // i is offset of T, not X. i should be aligned to X
133  template <memory_operation_enum Op,
134  typename X,
135  typename std::enable_if<
136  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
137  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
138  bool>::type = false>
139  CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
140  {
141  if constexpr(Op == memory_operation_enum::set)
142  {
143  this->template set<X>(i, linear_offset, is_valid_element, x);
144  }
145  // FIXME: remove memory_operation_enum::add
146  else if constexpr(Op == memory_operation_enum::add)
147  {
148  auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
149  this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
150  }
151  }
152 
153  // i is offset of T, not X. i should be aligned to X
154  template <typename X,
155  typename std::enable_if<
156  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
157  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
158  bool>::type = false>
159  CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
160  {
161  // X contains multiple T
162  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
163 
164  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
165 
166  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
167  "wrong! X should contain multiple T");
168 
169  if(is_valid_element)
170  {
171 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
172  X tmp = x;
173 
174  __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X));
175 #else
176  *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
177 #endif
178  }
179  }
180 
181  // FIXME: remove
182  CK_TILE_DEVICE static constexpr bool is_static_buffer() { return false; }
183 
184  // FIXME: remove
185  CK_TILE_DEVICE static constexpr bool is_dynamic_buffer() { return true; }
186 
188  {
189  printf("buffer_view{");
190 
191  // AddressSpace
192  printf("AddressSpace: generic, ");
193 
194  // p_data_
195  printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
196 
197  // buffer_size_
198  printf("buffer_size_: ");
199  print(buffer_size_);
200  printf(", ");
201 
202  // invalid_element_value_
203  printf("invalid_element_value_: ");
204  print(invalid_element_value_);
205 
206  printf("}");
207  }
208 };
209 
210 // Address Space: Global
211 // T may be scalar or vector
212 // X may be scalar or vector
213 // T and X have same scalar type
214 // X contains multiple T
215 // FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
216 // transforms of tensor_view/Tensor
217 template <typename T,
218  typename BufferSizeType,
219  bool InvalidElementUseNumericalZeroValue,
220  amd_buffer_coherence_enum Coherence>
222  T,
223  BufferSizeType,
224  InvalidElementUseNumericalZeroValue,
225  Coherence>
226 {
227  using type = T;
228 
229  T* p_data_ = nullptr;
230  BufferSizeType buffer_size_;
232  remove_cvref_t<T> invalid_element_value_ = T{0};
233 
235  : p_data_{}, buffer_size_{}, cached_buf_res_{0}, invalid_element_value_{}
236  {
237  }
238 
239  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data, BufferSizeType buffer_size)
240  : p_data_{p_data}, buffer_size_{buffer_size}, cached_buf_res_{0}, invalid_element_value_{0}
241  {
242  }
243 
244  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data,
245  BufferSizeType buffer_size,
246  T invalid_element_value)
247  : p_data_{p_data},
248  buffer_size_{buffer_size},
249  cached_buf_res_{0},
250  invalid_element_value_{invalid_element_value}
251  {
252  }
253 
254  // this is non constexpr intentionally (will call some intrinsic internally)
255  // Must call for buffers that need *_raw load/store
257  {
258  cached_buf_res_ = make_wave_buffer_resource(p_data_, buffer_size_ * sizeof(type));
259  }
260 
262  {
264  }
265 
266  // i is offset of T
267  // FIXME: doesn't do is_valid check
268  CK_TILE_DEVICE constexpr const T& operator[](index_t i) const { return p_data_[i]; }
269 
270  // i is offset of T
271  // FIXME: doesn't do is_valid check
272  CK_TILE_DEVICE constexpr T& operator()(index_t i) { return p_data_[i]; }
273 
274  // i is offset of T, not X. i should be aligned to X
275  template <typename X,
276  bool oob_conditional_check = true,
277  typename std::enable_if<
278  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
279  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
280  bool>::type = false>
281  CK_TILE_DEVICE constexpr auto get(index_t i,
282  index_t linear_offset,
283  bool is_valid_element,
285  {
286  // X contains multiple T
287  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
288 
289  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
290 
291  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
292  "wrong! X should contain multiple T");
293 
294 #if CK_TILE_USE_AMD_BUFFER_LOAD
295  bool constexpr use_amd_buffer_addressing = true;
296 #else
297  bool constexpr use_amd_buffer_addressing = false;
298 #endif
299 
300  if constexpr(use_amd_buffer_addressing)
301  {
302  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
303 
304  if constexpr(InvalidElementUseNumericalZeroValue)
305  {
306  return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>,
307  t_per_x,
308  Coherence,
309  oob_conditional_check>(
310  p_data_, i + linear_offset, is_valid_element, buffer_size_);
311  }
312  else
313  {
315  remove_cvref_t<T>,
316  t_per_x,
317  Coherence,
318  oob_conditional_check>(p_data_,
319  i + linear_offset,
320  is_valid_element,
321  buffer_size_,
322  invalid_element_value_);
323  }
324  }
325  else
326  {
327  if(is_valid_element)
328  {
329 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
330  X tmp;
331 
332  __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
333 
334  return tmp;
335 #else
336  return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
337 #endif
338  }
339  else
340  {
341  if constexpr(InvalidElementUseNumericalZeroValue)
342  {
343  return X{numeric<remove_cvref_t<T>>::zero()};
344  }
345  else
346  {
347  return X{invalid_element_value_};
348  }
349  }
350  }
351  }
352 
353  // i is offset of T, not X. i should be aligned to X
354  template <typename X,
355  bool oob_conditional_check = true,
356  bool pre_nop = false,
357  typename std::enable_if<
358  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
359  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
360  bool>::type = false>
362  index_t v_offset,
363  index_t i_offset,
364  bool is_valid_element,
365  bool_constant<pre_nop> = {}) const
366  {
367  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
368 
369  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
370 
371  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
372  "wrong! X should contain multiple T");
373 
374  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
375 
376  amd_buffer_load_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check, pre_nop>(
377  dst, cached_buf_res_, v_offset, i_offset, is_valid_element, bool_constant<pre_nop>{});
378  }
379 
380  // i is offset of T, not X. i should be aligned to X
381  template <typename X,
382  bool oob_conditional_check = true,
383  typename std::enable_if<
384  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
385  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
386  bool>::type = false>
388  index_t i,
389  index_t linear_offset,
390  bool is_valid_element,
392  {
393  // X is vector of T
394  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
395  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
396 
397  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
398  "wrong! X should contain multiple T");
399 
400  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
401 
402  amd_async_buffer_load_with_oob<remove_cvref_t<T>, t_per_x, Coherence>(
403  smem,
404  cached_buf_res_,
405  i,
406  linear_offset,
407  is_valid_element,
409  }
410 
411  // i is offset of T, not X. i should be aligned to X
412  template <typename X,
413  bool pre_nop = false,
414  typename std::enable_if<
415  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
416  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
417  bool>::type = false>
419  index_t i,
420  index_t linear_offset,
421  bool /*is_valid_element*/,
422  bool_constant<pre_nop> = {}) const
423  {
424  // X is vector of T
425  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
426  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
427 
428  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
429  "wrong! X should contain multiple T");
430 
431  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
432 
433  amd_async_buffer_load_with_oob_raw<remove_cvref_t<T>, t_per_x, Coherence>(
434  smem, cached_buf_res_, i, linear_offset, bool_constant<pre_nop>{});
435  }
436 
437  // i is offset of T, not X. i should be aligned to X
438  template <memory_operation_enum Op,
439  typename X,
440  bool oob_conditional_check = true,
441  typename std::enable_if<
442  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
443  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
444  bool>::type = false>
446  index_t linear_offset,
447  bool is_valid_element,
448  const X& x,
450  {
451  if constexpr(Op == memory_operation_enum::set)
452  {
453  this->template set<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
454  }
455  else if constexpr(Op == memory_operation_enum::atomic_add)
456  {
457  this->template atomic_add<X, oob_conditional_check>(
458  i, linear_offset, is_valid_element, x);
459  }
460  else if constexpr(Op == memory_operation_enum::atomic_max)
461  {
462  this->template atomic_max<X, oob_conditional_check>(
463  i, linear_offset, is_valid_element, x);
464  }
465  // FIXME: remove memory_operation_enum::add
466  else if constexpr(Op == memory_operation_enum::add)
467  {
468  auto tmp =
469  this->template get<X, oob_conditional_check>(i, linear_offset, is_valid_element);
470  this->template set<X, oob_conditional_check>(
471  i, linear_offset, is_valid_element, x + tmp);
472  // tmp += x;
473  // this->template set<X>(i, is_valid_element, tmp);
474  }
475  }
476 
477  // i is offset of T, not X. i should be aligned to X
478  template <memory_operation_enum Op,
479  typename X,
480  bool oob_conditional_check = true,
481  bool pre_nop = false,
482  typename std::enable_if<
483  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
484  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
485  bool>::type = false>
487  index_t linear_offset,
488  bool is_valid_element,
489  const X& x,
492  {
493  if constexpr(Op == memory_operation_enum::set)
494  {
495  this->template set_raw<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
496  }
497  else if constexpr(Op == memory_operation_enum::atomic_add)
498  {
499  this->template atomic_add_raw<X, oob_conditional_check, pre_nop>(
500  i, linear_offset, is_valid_element, x);
501  }
502  else if constexpr(Op == memory_operation_enum::atomic_max)
503  {
504  // this->template atomic_max_raw<X>(i, linear_offset, is_valid_element, x);
505  }
506  }
507 
508  // i is offset of T, not X. i should be aligned to X
509  template <typename X,
510  bool oob_conditional_check = true,
511  typename std::enable_if<
512  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
513  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
514  bool>::type = false>
515  CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
516  {
517  // X contains multiple T
518  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
519 
520  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
521 
522  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
523  "wrong! X should contain multiple T");
524 
525 #if CK_TILE_USE_AMD_BUFFER_STORE
526  bool constexpr use_amd_buffer_addressing = true;
527 #else
528  bool constexpr use_amd_buffer_addressing = false;
529 #endif
530 
531  if constexpr(use_amd_buffer_addressing)
532  {
533  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
534 
535  amd_buffer_store<remove_cvref_t<T>, t_per_x, Coherence>(
536  x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
537  }
538  else
539  {
540  if(is_valid_element)
541  {
542 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
543  X tmp = x;
544 
545  __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X));
546 #else
547  *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
548 #endif
549  }
550  }
551  }
552 
553  // i is offset of T, not X. i should be aligned to X
554  template <typename X,
555  bool oob_conditional_check = true,
556  typename std::enable_if<
557  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
558  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
559  bool>::type = false>
560  CK_TILE_DEVICE void set_raw(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
561  {
562  // X contains multiple T
563  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
564 
565  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
566 
567  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
568  "wrong! X should contain multiple T");
569 
570  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
571  amd_buffer_store_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check>(
572  x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
573  }
574 
575  template <typename X,
576  bool oob_conditional_check = true,
577  typename std::enable_if<
578  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
579  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
580  bool>::type = false>
581  CK_TILE_DEVICE void
582  atomic_add(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
583  {
584  using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
585 
586  // X contains multiple T
587  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
588 
589  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
590 
591  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
592  "wrong! X should contain multiple T");
593 
594  static_assert(get_address_space() == address_space_enum::global, "only support global mem");
595 
596 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
597  bool constexpr use_amd_buffer_addressing =
598  std::is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
599  std::is_same_v<remove_cvref_t<scalar_t>, float> ||
600  (std::is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
601 #elif CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
602  bool constexpr use_amd_buffer_addressing =
603  std::is_same_v<remove_cvref_t<scalar_t>, int32_t>;
604 #elif(!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
605  bool constexpr use_amd_buffer_addressing =
606  std::is_same_v<remove_cvref_t<scalar_t>, float> ||
607  (std::is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
608 #else
609  bool constexpr use_amd_buffer_addressing = false;
610 #endif
611 
612  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
613 
614  if constexpr(use_amd_buffer_addressing)
615  {
616  amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
617  x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
618  }
619  else
620  {
621  if(is_valid_element)
622  {
623  atomic_add_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
624  }
625  }
626  }
627 
628  template <typename X,
629  bool oob_conditional_check = true,
630  bool pre_nop = true,
631  typename std::enable_if<
632  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
633  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
634  bool>::type = false>
635  CK_TILE_DEVICE void
636  atomic_add_raw(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
637  {
638  // using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
639 
640  // X contains multiple T
641  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
642 
643  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
644 
645  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
646  "wrong! X should contain multiple T");
647 
648  static_assert(get_address_space() == address_space_enum::global, "only support global mem");
649 
650  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
651 
652  amd_buffer_atomic_add_raw<remove_cvref_t<T>,
653  t_per_x,
654  Coherence,
655  oob_conditional_check,
656  pre_nop>(
657  x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
658  }
659 
660  template <typename X,
661  bool oob_conditional_check = true,
662  typename std::enable_if<
663  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
664  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
665  bool>::type = false>
666  CK_TILE_DEVICE void
667  atomic_max(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
668  {
669  // X contains multiple T
670  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
671 
672  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
673 
674  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
675  "wrong! X should contain multiple T");
676 
677  static_assert(get_address_space() == address_space_enum::global, "only support global mem");
678 
679 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
680  using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
681  bool constexpr use_amd_buffer_addressing = std::is_same_v<remove_cvref_t<scalar_t>, double>;
682 #else
683  bool constexpr use_amd_buffer_addressing = false;
684 #endif
685 
686  constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
687 
688  if constexpr(use_amd_buffer_addressing)
689  {
690  amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
691  x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
692  }
693  else if(is_valid_element)
694  {
695  atomic_max_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
696  }
697  }
698 
699  // FIXME: remove
700  CK_TILE_DEVICE static constexpr bool is_static_buffer() { return false; }
701 
702  // FIXME: remove
703  CK_TILE_DEVICE static constexpr bool is_dynamic_buffer() { return true; }
704 
706  {
707  printf("buffer_view{");
708 
709  // AddressSpace
710  printf("AddressSpace: Global, ");
711 
712  // p_data_
713  printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
714 
715  // buffer_size_
716  printf("buffer_size_: ");
717  print(buffer_size_);
718  printf(", ");
719 
720  // invalid_element_value_
721  printf("invalid_element_value_: ");
722  print(invalid_element_value_);
723 
724  printf("}");
725  }
726 };
727 
728 // Address Space: LDS
729 // T may be scalar or vector
730 // X may be scalar or vector
731 // T and X have same scalar type
732 // X contains multiple T
733 // FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
734 // transforms of tensor_view/Tensor
735 template <typename T, typename BufferSizeType, bool InvalidElementUseNumericalZeroValue>
737  T,
738  BufferSizeType,
739  InvalidElementUseNumericalZeroValue,
741 {
742  using type = T;
743 
744  T* p_data_ = nullptr;
745  BufferSizeType buffer_size_;
746  remove_cvref_t<T> invalid_element_value_ = T{0};
747 
749  : p_data_{}, buffer_size_{}, invalid_element_value_{}
750  {
751  }
752 
753  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data, BufferSizeType buffer_size)
754  : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
755  {
756  }
757 
758  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data,
759  BufferSizeType buffer_size,
760  T invalid_element_value)
761  : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
762  {
763  }
764 
766 
768  {
770  }
771 
772  // i is offset of T
773  // FIXME: doesn't do is_valid check
774  CK_TILE_DEVICE constexpr const T& operator[](index_t i) const { return p_data_[i]; }
775 
776  // i is offset of T
777  // FIXME: doesn't do is_valid check
778  CK_TILE_DEVICE constexpr T& operator()(index_t i) { return p_data_[i]; }
779 
780  // i is offset of T, not X. i should be aligned to X
781  template <typename X,
782  bool oob_conditional_check = true,
783  typename std::enable_if<
784  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
785  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
786  bool>::type = false>
787  CK_TILE_DEVICE constexpr auto get(index_t i,
788  index_t linear_offset,
789  bool is_valid_element,
791  {
792  // X contains multiple T
793  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
794 
795  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
796 
797  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
798  "wrong! X should contain multiple T");
799 
800  if(is_valid_element)
801  {
802 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
803  X tmp;
804 
805  __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
806 
807  return tmp;
808 #else
809  using buf_t = ext_vector_t<typename vector_traits<remove_cvref_t<T>>::scalar_type,
810  scalar_per_t_vector * scalar_per_x_vector>;
811  // using buf_t = ushort __attribute__((ext_vector_type(8)));
812  auto rtn = *c_style_pointer_cast<const buf_t*>(&p_data_[i + linear_offset]);
813  return bit_cast<X>(rtn);
814 #endif
815  }
816  else
817  {
818  if constexpr(InvalidElementUseNumericalZeroValue)
819  {
820  return X{numeric<remove_cvref_t<T>>::zero()};
821  }
822  else
823  {
824  return X{invalid_element_value_};
825  }
826  }
827  }
828 
829  // i is offset of T, not X. i should be aligned to X
830  template <typename X,
831  bool oob_conditional_check = true,
832  bool pre_nop = false,
833  typename std::enable_if<
834  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
835  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
836  bool>::type = false>
838  index_t v_offset,
839  index_t i_offset,
840  bool /*is_valid_element*/,
841  bool_constant<pre_nop> = {}) const
842  {
843  smem_load<sizeof(X)>{}(dst, v_offset * sizeof(T), i_offset * sizeof(T));
844  }
845 
846  // i is offset of T, not X. i should be aligned to X
847  template <memory_operation_enum Op,
848  typename X,
849  typename std::enable_if<
850  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
851  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
852  bool>::type = false>
853  CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
854  {
855  if constexpr(Op == memory_operation_enum::set)
856  {
857  this->template set<X>(i, linear_offset, is_valid_element, x);
858  }
859  // FIXME: remove memory_operation_enum::add
860  else if constexpr(Op == memory_operation_enum::add)
861  {
862  auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
863  this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
864  }
865  }
866 
867  // i is offset of T, not X. i should be aligned to X
868  template <typename X,
869  typename std::enable_if<
870  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
871  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
872  bool>::type = false>
873  CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
874  {
875  // X contains multiple T
876  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
877 
878  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
879 
880  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
881  "wrong! X should contain multiple T");
882 
883 #if CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
884  bool constexpr workaround_int8_ds_write_issue = true;
885 #else
886  bool constexpr workaround_int8_ds_write_issue = false;
887 #endif
888 
889  i += linear_offset; // simplicity
890  if constexpr(std::is_same<typename vector_traits<remove_cvref_t<T>>::scalar_type,
891  int8_t>::value &&
892  workaround_int8_ds_write_issue)
893  {
894  if(is_valid_element)
895  {
896  // HACK: compiler would lower IR "store<i8, 16> address_space(3)" into inefficient
897  // ISA, so I try to let compiler emit IR "store<i32, 4>" which would be lower to
898  // ds_write_b128
899  // TODO: remove this after compiler fix
900  static_assert((std::is_same<remove_cvref_t<T>, int8_t>::value &&
901  std::is_same<remove_cvref_t<X>, int8_t>::value) ||
902  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
903  std::is_same<remove_cvref_t<X>, int8x2_t>::value) ||
904  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
905  std::is_same<remove_cvref_t<X>, int8x4_t>::value) ||
906  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
907  std::is_same<remove_cvref_t<X>, int8x8_t>::value) ||
908  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
909  std::is_same<remove_cvref_t<X>, int8x16_t>::value) ||
910  (std::is_same<remove_cvref_t<T>, int8x4_t>::value &&
911  std::is_same<remove_cvref_t<X>, int8x4_t>::value) ||
912  (std::is_same<remove_cvref_t<T>, int8x8_t>::value &&
913  std::is_same<remove_cvref_t<X>, int8x8_t>::value) ||
914  (std::is_same<remove_cvref_t<T>, int8x16_t>::value &&
915  std::is_same<remove_cvref_t<X>, int8x16_t>::value),
916  "wrong! not implemented for this combination, please add "
917  "implementation");
918 
919  if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
920  std::is_same<remove_cvref_t<X>, int8_t>::value)
921  {
922  // HACK: cast pointer of x is bad
923  // TODO: remove this after compiler fix
924  *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
925  *c_style_pointer_cast<const int8_t*>(&x);
926  }
927  else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
928  std::is_same<remove_cvref_t<X>, int8x2_t>::value)
929  {
930  // HACK: cast pointer of x is bad
931  // TODO: remove this after compiler fix
932  *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
933  *c_style_pointer_cast<const int16_t*>(&x);
934  }
935  else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
936  std::is_same<remove_cvref_t<X>, int8x4_t>::value)
937  {
938  // HACK: cast pointer of x is bad
939  // TODO: remove this after compiler fix
940  *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
941  *c_style_pointer_cast<const int32_t*>(&x);
942  }
943  else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
944  std::is_same<remove_cvref_t<X>, int8x8_t>::value)
945  {
946  // HACK: cast pointer of x is bad
947  // TODO: remove this after compiler fix
948  *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
949  *c_style_pointer_cast<const int32x2_t*>(&x);
950  }
951  else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
952  std::is_same<remove_cvref_t<X>, int8x16_t>::value)
953  {
954  // HACK: cast pointer of x is bad
955  // TODO: remove this after compiler fix
956  *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
957  *c_style_pointer_cast<const int32x4_t*>(&x);
958  }
959  else if constexpr(std::is_same<remove_cvref_t<T>, int8x4_t>::value &&
960  std::is_same<remove_cvref_t<X>, int8x4_t>::value)
961  {
962  // HACK: cast pointer of x is bad
963  // TODO: remove this after compiler fix
964  *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
965  *c_style_pointer_cast<const int32_t*>(&x);
966  }
967  else if constexpr(std::is_same<remove_cvref_t<T>, int8x8_t>::value &&
968  std::is_same<remove_cvref_t<X>, int8x8_t>::value)
969  {
970  // HACK: cast pointer of x is bad
971  // TODO: remove this after compiler fix
972  *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
973  *c_style_pointer_cast<const int32x2_t*>(&x);
974  }
975  else if constexpr(std::is_same<remove_cvref_t<T>, int8x16_t>::value &&
976  std::is_same<remove_cvref_t<X>, int8x16_t>::value)
977  {
978  // HACK: cast pointer of x is bad
979  // TODO: remove this after compiler fix
980  *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
981  *c_style_pointer_cast<const int32x4_t*>(&x);
982  }
983  }
984  }
985  else
986  {
987  if(is_valid_element)
988  {
989 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
990  X tmp = x;
991 
992  __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
993 #else
994  using buf_t = ext_vector_t<typename vector_traits<remove_cvref_t<T>>::scalar_type,
995  scalar_per_t_vector * scalar_per_x_vector>;
996 
997  *c_style_pointer_cast<buf_t*>(&p_data_[i]) = reinterpret_cast<const buf_t&>(x);
998 #endif
999  }
1000  }
1001  }
1002 
1003  // FIXME: remove
1004  CK_TILE_DEVICE static constexpr bool is_static_buffer() { return false; }
1005 
1006  // FIXME: remove
1007  CK_TILE_DEVICE static constexpr bool is_dynamic_buffer() { return true; }
1008 
// Debug dump of this view through printf, in the form
//   buffer_view{AddressSpace: Lds, p_data_: <ptr>, buffer_size_: <..>, invalid_element_value_: <..>}
// buffer_size_ and invalid_element_value_ are formatted by print() overloads
// that are declared outside this listing.
// NOTE(review): the signature line of this member (listing line 1009) is
// missing from this extract; only the body is visible here.
1010  {
1011  printf("buffer_view{");
1012 
1013  // AddressSpace
1014  printf("AddressSpace: Lds, ");
1015 
1016  // p_data_
      // const_cast strips any cv-qualifier of T so the pointer converts to void* for %p
1017  printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
1018 
1019  // buffer_size_
1020  printf("buffer_size_: ");
1021  print(buffer_size_);
1022  printf(", ");
1023 
1024  // invalid_element_value_
1025  printf("invalid_element_value_: ");
1026  print(invalid_element_value_);
1027 
1028  printf("}");
1029  }
1030 };
1031 
1032 // Address Space: Vgpr
1033 // T may be scalar or vector
1034 // X may be scalar or vector
1035 // T and X have same scalar type
1036 // X contains multiple T
1037 // FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
1038 // transforms of tensor_view/Tensor
1039 template <typename T, typename BufferSizeType, bool InvalidElementUseNumericalZeroValue>
1041  T,
1042  BufferSizeType,
1043  InvalidElementUseNumericalZeroValue,
1045 {
// Element type this view was instantiated with.
1046  using type = T;
1047 
// Base pointer of the viewed storage; the accessors below index from it in units of T.
1048  T* p_data_ = nullptr;
// Size value handed in at construction. The accessors shown in this
// specialization do not consult it; it is only printed by print().
1049  BufferSizeType buffer_size_;
// Value returned by get() for an invalid element when
// InvalidElementUseNumericalZeroValue is false.
1050  remove_cvref_t<T> invalid_element_value_ = T{0};
1051 
// NOTE(review): the constructor signature line (listing line 1052) is missing
// from this extract. The initializer list below value-initializes all three members.
1053  : p_data_{}, buffer_size_{}, invalid_element_value_{}
1054  {
1055  }
1056 
1057  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data, BufferSizeType buffer_size)
1058  : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
1059  {
1060  }
1061 
1062  CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data,
1063  BufferSizeType buffer_size,
1064  T invalid_element_value)
1065  : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
1066  {
1067  }
1068 
1070 
// Reports the address space of this specialization: always address_space_enum::vgpr.
// NOTE(review): the signature line (listing line 1071) is missing from this
// extract; presumably this is the body of get_address_space() -- confirm against
// the original header.
1072  {
1073  return address_space_enum::vgpr;
1074  }
1075 
1076  // i is offset of T
1077  // FIXME: doesn't do is_valid check
1078  CK_TILE_DEVICE constexpr const T& operator[](index_t i) const { return p_data_[i]; }
1079 
1080  // i is offset of T
1081  // FIXME: doesn't do is_valid check
1082  CK_TILE_DEVICE constexpr T& operator()(index_t i) { return p_data_[i]; }
1083 
1084  // i is offset of T, not X. i should be aligned to X
1085  template <typename X,
1086  bool oob_conditional_check = true,
1087  typename std::enable_if<
1088  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1089  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
1090  bool>::type = false>
1091  CK_TILE_DEVICE constexpr auto get(index_t i,
1092  index_t /*linear_offset*/,
1093  bool is_valid_element,
1095  {
1096  // X contains multiple T
1097  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
1098 
1099  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
1100 
1101  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
1102  "wrong! X should contain multiple T");
1103 
1104  if(is_valid_element)
1105  {
1106 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1107  X tmp;
1108 
1109  __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
1110 
1111  return tmp;
1112 #else
1113  return *c_style_pointer_cast<const X*>(&p_data_[i]);
1114 #endif
1115  }
1116  else
1117  {
1118  if constexpr(InvalidElementUseNumericalZeroValue)
1119  {
1120  return X{numeric<remove_cvref_t<T>>::zero()};
1121  }
1122  else
1123  {
1124  return X{invalid_element_value_};
1125  }
1126  }
1127  }
1128 
1129  // i is offset of T, not X. i should be aligned to X
1130  template <memory_operation_enum Op,
1131  typename X,
1132  typename std::enable_if<
1133  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1134  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
1135  bool>::type = false>
1136  CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
1137  {
1138  if constexpr(Op == memory_operation_enum::set)
1139  {
1140  this->template set<X>(i, linear_offset, is_valid_element, x);
1141  }
1142  // FIXME: remove memory_operation_enum::add
1143  else if constexpr(Op == memory_operation_enum::add)
1144  {
1145  auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
1146  this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
1147  }
1148  }
1149 
1150  // i is offset of T, not X. i should be aligned to X
1151  template <typename X,
1152  typename std::enable_if<
1153  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1154  typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
1155  bool>::type = false>
1156  CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
1157  {
1158  // X contains multiple T
1159  constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
1160 
1161  constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
1162 
1163  static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
1164  "wrong! X should contain multiple T");
1165 
1166  if(is_valid_element)
1167  {
1168 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1169  X tmp = x;
1170 
1171  __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X));
1172 #else
1173  *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
1174 #endif
1175  }
1176  }
1177 
1178  // FIXME: remove
1179  CK_TILE_DEVICE static constexpr bool is_static_buffer() { return false; }
1180 
1181  // FIXME: remove
1182  CK_TILE_DEVICE static constexpr bool is_dynamic_buffer() { return true; }
1183 
// Debug dump of this view through printf, in the form
//   buffer_view{AddressSpace: Vgpr, p_data_: <ptr>, buffer_size_: <..>, invalid_element_value_: <..>}
// buffer_size_ and invalid_element_value_ are formatted by print() overloads
// that are declared outside this listing.
// NOTE(review): the signature line of this member (listing line 1184) is
// missing from this extract; only the body is visible here.
1185  {
1186  printf("buffer_view{");
1187 
1188  // AddressSpace
1189  printf("AddressSpace: Vgpr, ");
1190 
1191  // p_data_
      // const_cast strips any cv-qualifier of T so the pointer converts to void* for %p
1192  printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
1193 
1194  // buffer_size_
1195  printf("buffer_size_: ");
1196  print(buffer_size_);
1197  printf(", ");
1198 
1199  // invalid_element_value_
1200  printf("invalid_element_value_: ");
1201  print(invalid_element_value_);
1202 
1203  printf("}");
1204  }
1205 };
1206 
// Factory overload taking a data pointer and a buffer size; the address space
// is chosen by the BufferAddressSpace template argument.
// NOTE(review): listing lines 1208 (a template parameter) and 1213 (the only
// statement of the body) are missing from this extract. Presumably the body
// returns a buffer_view built from (p, buffer_size) -- confirm against the
// original header.
1207 template <address_space_enum BufferAddressSpace,
1209  typename T,
1210  typename BufferSizeType>
1211 CK_TILE_HOST_DEVICE constexpr auto make_buffer_view(T* p, BufferSizeType buffer_size)
1212 {
1214 }
1215 
// Factory overload that additionally takes the value to use for invalid
// elements. It participates in overload resolution only when X and T are the
// same type after removing cv-qualifiers and references.
// NOTE(review): listing lines 1217 (a template parameter) and 1226 (the start
// of the return statement) are missing from this extract. The visible tail
// brace-initializes the result from (p, buffer_size, invalid_element_value).
1216 template <address_space_enum BufferAddressSpace,
1218  typename T,
1219  typename BufferSizeType,
1220  typename X,
1221  typename std::enable_if<std::is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value,
1222  bool>::type = false>
1223 CK_TILE_HOST_DEVICE constexpr auto
1224 make_buffer_view(T* p, BufferSizeType buffer_size, X invalid_element_value)
1225 {
1227  p, buffer_size, invalid_element_value};
1228 }
1229 
1230 } // namespace ck_tile
#define CK_TILE_DEVICE
Definition: config.hpp:40
#define CK_TILE_LDS_ADDR
Definition: config.hpp:56
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:41
Definition: cluster_descriptor.hpp:13
memory_operation_enum
Definition: arch.hpp:44
int8_t __attribute((ext_vector_type(4))) int8x4_t
Definition: vector_type.hpp:150
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition: amd_buffer_addressing.hpp:2195
int8_t int8_t
Definition: int8.hpp:20
amd_buffer_coherence_enum
Definition: amd_buffer_addressing.hpp:1179
constexpr CK_TILE_HOST_DEVICE auto make_buffer_view(T *p, BufferSizeType buffer_size)
Definition: buffer_view.hpp:1211
int8_t __attribute((ext_vector_type(16))) int8x16_t
Definition: vector_type.hpp:152
int32_t index_t
Definition: integer.hpp:9
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:20
int8_t __attribute((ext_vector_type(2))) int8x2_t
Definition: vector_type.hpp:149
typename impl::ext_vector< T, N >::type ext_vector_t
Definition: vector_type.hpp:54
int32_t int32x4_t
Definition: vector_type.hpp:114
address_space_enum
Definition: arch.hpp:34
_Float16 half_t
Definition: half.hpp:111
CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void *ptr, uint32_t size=0xffffffff)
Definition: amd_buffer_addressing.hpp:26
int8_t __attribute((ext_vector_type(8))) int8x8_t
Definition: vector_type.hpp:151
std::enable_if< B, T > enable_if
Definition: enable_if.hpp:10
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:159
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:139
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:65
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:94
constexpr CK_TILE_DEVICE auto async_get_raw(remove_cvref_t< T > *smem, index_t i, index_t linear_offset, bool, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:418
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:268
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:281
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x, bool_constant< oob_conditional_check >={})
Definition: buffer_view.hpp:445
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:261
CK_TILE_DEVICE void update_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
Definition: buffer_view.hpp:486
constexpr CK_TILE_DEVICE auto async_get(CK_TILE_LDS_ADDR remove_cvref_t< T > *smem, index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:387
constexpr CK_TILE_DEVICE auto get_raw(remove_cvref_t< X > &dst, index_t v_offset, index_t i_offset, bool is_valid_element, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:361
CK_TILE_DEVICE void atomic_add(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:582
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:244
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:239
CK_TILE_DEVICE void set_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:560
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:515
CK_TILE_DEVICE void atomic_max(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:667
CK_TILE_DEVICE void atomic_add_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:636
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:873
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:787
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:853
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:758
constexpr CK_TILE_DEVICE auto get_raw(remove_cvref_t< X > &dst, index_t v_offset, index_t i_offset, bool, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:837
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:753
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:1156
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:1062
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:1057
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:1136
constexpr CK_TILE_DEVICE auto get(index_t i, index_t, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:1091
Definition: buffer_view.hpp:33
Definition: integral_constant.hpp:13
Definition: amd_buffer_addressing.hpp:699
Definition: vector_type.hpp:60