8 #if __clang_major__ == 20
35 typename BufferSizeType,
36 bool InvalidElementUseNumericalZeroValue,
47 template <
typename T,
typename BufferSizeType,
bool Inval
idElementUseNumericalZeroValue>
51 InvalidElementUseNumericalZeroValue,
61 : p_data_{}, buffer_size_{}, invalid_element_value_{}
66 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
71 BufferSizeType buffer_size,
72 T invalid_element_value)
73 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
94 bool oob_conditional_check =
true,
96 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
101 bool is_valid_element,
109 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
110 "wrong! X should contain multiple T");
114 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
117 __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]),
sizeof(X));
121 return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
126 if constexpr(InvalidElementUseNumericalZeroValue)
128 return X{numeric<remove_cvref_t<T>>::zero()};
132 return X{invalid_element_value_};
141 template <
typename X,
142 bool oob_conditional_check =
true,
144 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
145 typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
149 bool is_valid_element,
152 static_assert(
false,
"Error: transpose load not supported in global memory space.");
155 ignore = is_valid_element;
163 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
164 typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
170 this->
template set<X>(i, linear_offset, is_valid_element, x);
175 auto tmp = this->
template get<X>(i, linear_offset, is_valid_element);
176 this->
template set<X>(i, linear_offset, is_valid_element, x + tmp);
181 template <
typename X,
183 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
193 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
194 "wrong! X should contain multiple T");
198 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
201 __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp,
sizeof(X));
203 *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
216 printf(
"buffer_view{");
219 printf(
"AddressSpace: generic, ");
222 printf(
"p_data_: %p, ",
static_cast<void*
>(
const_cast<remove_cvref_t<T>*
>(p_data_)));
225 printf(
"buffer_size_: ");
230 printf(
"invalid_element_value_: ");
231 print(invalid_element_value_);
244 template <
typename T,
245 typename BufferSizeType,
246 bool InvalidElementUseNumericalZeroValue,
251 InvalidElementUseNumericalZeroValue,
256 T* p_data_ =
nullptr;
264 : p_data_{}, buffer_size_{}, cached_buf_res_{0}, invalid_element_value_{}
270 buffer_size_{buffer_size / PackedSize},
272 invalid_element_value_{0}
277 BufferSizeType buffer_size,
278 T invalid_element_value)
280 buffer_size_{buffer_size / PackedSize},
282 invalid_element_value_{invalid_element_value}
307 template <
typename X,
308 bool oob_conditional_check =
true,
310 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
315 bool is_valid_element,
323 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
324 "wrong! X should contain multiple T");
326 #if CK_TILE_USE_AMD_BUFFER_LOAD
327 bool constexpr use_amd_buffer_addressing =
true;
329 bool constexpr use_amd_buffer_addressing =
false;
332 if constexpr(use_amd_buffer_addressing)
334 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
336 if constexpr(InvalidElementUseNumericalZeroValue)
338 return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>,
341 oob_conditional_check>(
342 p_data_, i + linear_offset, is_valid_element, buffer_size_);
350 oob_conditional_check>(p_data_,
354 invalid_element_value_);
361 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
364 __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]),
sizeof(X));
368 return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
373 if constexpr(InvalidElementUseNumericalZeroValue)
375 return X{numeric<remove_cvref_t<T>>::zero()};
379 return X{invalid_element_value_};
389 template <
typename X,
390 bool oob_conditional_check =
true,
392 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
393 typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
397 bool is_valid_element,
400 static_assert(
false,
"Error: transpose load not supported in global memory space.");
403 ignore = is_valid_element;
408 template <
typename X,
409 bool oob_conditional_check =
true,
410 bool pre_nop =
false,
412 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
413 typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
418 bool is_valid_element,
425 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
426 "wrong! X should contain multiple T");
428 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
430 amd_buffer_load_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check, pre_nop>(
435 template <
typename X,
436 bool oob_conditional_check =
true,
438 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
439 typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
444 bool is_valid_element,
451 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
452 "wrong! X should contain multiple T");
454 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
455 const int32x4_t src_wave_buffer_resource =
458 amd_async_buffer_load_with_oob<remove_cvref_t<T>, t_per_x, Coherence>(
460 src_wave_buffer_resource,
468 template <
typename X,
469 bool pre_nop =
false,
471 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
472 typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
484 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
485 "wrong! X should contain multiple T");
487 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
489 amd_async_buffer_load_with_oob_raw<remove_cvref_t<T>, t_per_x, Coherence>(
496 bool oob_conditional_check =
true,
498 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
499 typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
503 bool is_valid_element,
509 this->
template set<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
513 this->
template atomic_add<X, oob_conditional_check>(
514 i, linear_offset, is_valid_element, x);
518 this->
template atomic_max<X, oob_conditional_check>(
519 i, linear_offset, is_valid_element, x);
525 this->
template get<X, oob_conditional_check>(i, linear_offset, is_valid_element);
526 this->
template set<X, oob_conditional_check>(
527 i, linear_offset, is_valid_element, x + tmp);
536 bool oob_conditional_check =
true,
537 bool pre_nop =
false,
539 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
540 typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
544 bool is_valid_element,
551 this->
template set_raw<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
555 this->
template atomic_add_raw<X, oob_conditional_check, pre_nop>(
556 i, linear_offset, is_valid_element, x);
565 template <
typename X,
566 bool oob_conditional_check =
true,
568 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
569 typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
578 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
579 "wrong! X should contain multiple T");
581 #if CK_TILE_USE_AMD_BUFFER_STORE
582 bool constexpr use_amd_buffer_addressing =
true;
584 bool constexpr use_amd_buffer_addressing =
false;
587 if constexpr(use_amd_buffer_addressing)
589 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
591 amd_buffer_store<remove_cvref_t<T>, t_per_x, Coherence>(
592 x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
598 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
601 __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp,
sizeof(X));
603 *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
610 template <
typename X,
611 bool oob_conditional_check =
true,
613 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
623 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
624 "wrong! X should contain multiple T");
626 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
627 amd_buffer_store_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check>(
628 x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
631 template <
typename X,
632 bool oob_conditional_check =
true,
634 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
647 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
648 "wrong! X should contain multiple T");
652 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
653 bool constexpr use_amd_buffer_addressing =
654 std::is_same_v<remove_cvref_t<scalar_t>,
int32_t> ||
655 std::is_same_v<remove_cvref_t<scalar_t>,
float> ||
656 (std::is_same_v<remove_cvref_t<scalar_t>,
half_t> && scalar_per_x_vector % 2 == 0);
657 #elif CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
658 bool constexpr use_amd_buffer_addressing =
659 std::is_same_v<remove_cvref_t<scalar_t>,
int32_t>;
660 #elif(!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
661 bool constexpr use_amd_buffer_addressing =
662 std::is_same_v<remove_cvref_t<scalar_t>,
float> ||
663 (std::is_same_v<remove_cvref_t<scalar_t>,
half_t> && scalar_per_x_vector % 2 == 0);
665 bool constexpr use_amd_buffer_addressing =
false;
668 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
670 if constexpr(use_amd_buffer_addressing)
672 amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
673 x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
679 atomic_add_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
684 template <
typename X,
685 bool oob_conditional_check =
true,
688 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
701 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
702 "wrong! X should contain multiple T");
706 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
708 amd_buffer_atomic_add_raw<remove_cvref_t<T>,
711 oob_conditional_check,
713 x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
716 template <
typename X,
717 bool oob_conditional_check =
true,
719 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
730 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
731 "wrong! X should contain multiple T");
735 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
737 bool constexpr use_amd_buffer_addressing = std::is_same_v<remove_cvref_t<scalar_t>,
double>;
739 bool constexpr use_amd_buffer_addressing =
false;
742 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
744 if constexpr(use_amd_buffer_addressing)
746 amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
747 x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
749 else if(is_valid_element)
751 atomic_max_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
763 printf(
"buffer_view{");
766 printf(
"AddressSpace: Global, ");
769 printf(
"p_data_: %p, ",
static_cast<void*
>(
const_cast<remove_cvref_t<T>*
>(p_data_)));
772 printf(
"buffer_size_: ");
777 printf(
"invalid_element_value_: ");
778 print(invalid_element_value_);
791 template <
typename T,
typename BufferSizeType,
bool Inval
idElementUseNumericalZeroValue>
795 InvalidElementUseNumericalZeroValue,
800 T* p_data_ =
nullptr;
805 : p_data_{}, buffer_size_{}, invalid_element_value_{}
810 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
815 BufferSizeType buffer_size,
816 T invalid_element_value)
817 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
837 template <
typename X,
838 bool oob_conditional_check =
true,
840 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
845 bool is_valid_element,
853 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
854 "wrong! X should contain multiple T");
858 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
861 __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]),
sizeof(X));
866 scalar_per_t_vector * scalar_per_x_vector>;
868 auto rtn = *c_style_pointer_cast<const buf_t*>(&p_data_[i + linear_offset]);
869 return bit_cast<X>(rtn);
874 if constexpr(InvalidElementUseNumericalZeroValue)
876 return X{numeric<remove_cvref_t<T>>::zero()};
880 return X{invalid_element_value_};
886 template <
typename X,
887 bool oob_conditional_check =
true,
888 bool pre_nop =
false,
890 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
891 typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
899 smem_load<
sizeof(X)>{}(dst, v_offset *
sizeof(T), i_offset *
sizeof(T));
902 template <
typename X,
904 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
905 typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
908 [[maybe_unused]]
index_t linear_offset,
909 bool is_valid_element)
const
916 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
917 "wrong! X should contain multiple T");
921 #if defined(__gfx950__)
922 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
924 return amd_transpose_load_to_vgpr<remove_cvref_t<T>, t_per_x, addr_space>(
925 p_data_ + i + linear_offset);
932 if constexpr(InvalidElementUseNumericalZeroValue)
938 return X{invalid_element_value_};
947 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
954 this->
template set<X>(i, linear_offset, is_valid_element, x);
959 auto tmp = this->
template get<X>(i, linear_offset, is_valid_element);
960 this->
template set<X>(i, linear_offset, is_valid_element, x + tmp);
965 template <
typename X,
967 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
977 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
978 "wrong! X should contain multiple T");
980 #if CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
981 bool constexpr workaround_int8_ds_write_issue =
true;
983 bool constexpr workaround_int8_ds_write_issue =
false;
989 workaround_int8_ds_write_issue)
1040 "wrong! not implemented for this combination, please add "
1052 *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
1053 *c_style_pointer_cast<const int8_t*>(&x);
1064 *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
1065 *c_style_pointer_cast<const int16_t*>(&x);
1076 *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
1077 *c_style_pointer_cast<const int32_t*>(&x);
1088 *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
1089 *c_style_pointer_cast<const int32x2_t*>(&x);
1098 *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
1099 *c_style_pointer_cast<const int32x4_t*>(&x);
1108 *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
1109 *c_style_pointer_cast<const int32_t*>(&x);
1118 *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
1119 *c_style_pointer_cast<const int32x2_t*>(&x);
1128 *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
1129 *c_style_pointer_cast<const int32x4_t*>(&x);
1135 if(is_valid_element)
1137 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1140 __builtin_memcpy(&(p_data_[i]), &tmp,
sizeof(X));
1143 scalar_per_t_vector * scalar_per_x_vector>;
1145 *c_style_pointer_cast<buf_t*>(&p_data_[i]) =
reinterpret_cast<const buf_t&
>(x);
1159 printf(
"buffer_view{");
1162 printf(
"AddressSpace: Lds, ");
1165 printf(
"p_data_: %p, ",
static_cast<void*
>(
const_cast<remove_cvref_t<T>*
>(p_data_)));
1168 printf(
"buffer_size_: ");
1169 print(buffer_size_);
1173 printf(
"invalid_element_value_: ");
1174 print(invalid_element_value_);
1187 template <
typename T,
typename BufferSizeType,
bool Inval
idElementUseNumericalZeroValue>
1191 InvalidElementUseNumericalZeroValue,
1196 T* p_data_ =
nullptr;
1201 : p_data_{}, buffer_size_{}, invalid_element_value_{}
1206 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
1211 BufferSizeType buffer_size,
1212 T invalid_element_value)
1213 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
1233 template <
typename X,
1234 bool oob_conditional_check =
true,
1236 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1238 bool>::type =
false>
1241 bool is_valid_element,
1249 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
1250 "wrong! X should contain multiple T");
1252 if(is_valid_element)
1254 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1257 __builtin_memcpy(&tmp, &(p_data_[i]),
sizeof(X));
1261 return *c_style_pointer_cast<const X*>(&p_data_[i]);
1266 if constexpr(InvalidElementUseNumericalZeroValue)
1268 return X{numeric<remove_cvref_t<T>>::zero()};
1272 return X{invalid_element_value_};
1281 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1282 typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
1283 bool>::type =
false>
1288 this->
template set<X>(i, linear_offset, is_valid_element, x);
1293 auto tmp = this->
template get<X>(i, linear_offset, is_valid_element);
1294 this->
template set<X>(i, linear_offset, is_valid_element, x + tmp);
1299 template <
typename X,
1301 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1303 bool>::type =
false>
1311 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
1312 "wrong! X should contain multiple T");
1314 if(is_valid_element)
1316 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1319 __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp,
sizeof(X));
1321 *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
1334 printf(
"buffer_view{");
1337 printf(
"AddressSpace: Vgpr, ");
1340 printf(
"p_data_: %p, ",
static_cast<void*
>(
const_cast<remove_cvref_t<T>*
>(p_data_)));
1343 printf(
"buffer_size_: ");
1344 print(buffer_size_);
1348 printf(
"invalid_element_value_: ");
1349 print(invalid_element_value_);
1358 typename BufferSizeType>
1367 typename BufferSizeType,
1369 typename std::enable_if<std::is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value,
1370 bool>::type =
false>
1375 p, buffer_size, invalid_element_value};
#define CK_TILE_DEVICE
Definition: config.hpp:40
#define CK_TILE_LDS_ADDR
Definition: config.hpp:57
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:41
Definition: cluster_descriptor.hpp:13
int8_t __attribute((ext_vector_type(16))) pk_int4x16_t
Definition: vector_type.hpp:238
memory_operation_enum
Definition: arch.hpp:44
tuple_array< T, N > thread_buffer
Definition: thread_buffer.hpp:14
int8_t __attribute((ext_vector_type(8))) pk_int4x8_t
Definition: vector_type.hpp:237
int8_t __attribute((ext_vector_type(4))) int8x4_t
Definition: vector_type.hpp:180
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition: amd_buffer_addressing.hpp:2464
int8_t int8_t
Definition: int8.hpp:20
amd_buffer_coherence_enum
Definition: amd_buffer_addressing.hpp:1316
constexpr CK_TILE_HOST_DEVICE auto make_buffer_view(T *p, BufferSizeType buffer_size)
Definition: buffer_view.hpp:1359
int8_t __attribute((ext_vector_type(16))) int8x16_t
Definition: vector_type.hpp:182
int32_t index_t
Definition: integer.hpp:9
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:21
int8_t __attribute((ext_vector_type(2))) int8x2_t
Definition: vector_type.hpp:179
constexpr detail::ignore_t ignore
Definition: ignore.hpp:20
typename impl::ext_vector< T, N >::type ext_vector_t
Definition: vector_type.hpp:83
int32_t int32_t
Definition: integer.hpp:10
int8_t __attribute((ext_vector_type(4))) pk_int4x4_t
Definition: vector_type.hpp:236
int32_t int32x4_t
Definition: vector_type.hpp:144
address_space_enum
Definition: arch.hpp:34
_Float16 half_t
Definition: half.hpp:111
CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void *ptr, uint32_t size=0xffffffff)
Definition: amd_buffer_addressing.hpp:40
int8_t __attribute((ext_vector_type(8))) int8x8_t
Definition: vector_type.hpp:181
std::enable_if< B, T > enable_if
Definition: enable_if.hpp:24
constexpr bool is_same_v
Definition: type.hpp:283
CK_TILE_HOST_DEVICE void print() const
Definition: buffer_view.hpp:214
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:65
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:186
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:60
constexpr CK_TILE_DEVICE auto transpose_get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:147
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:77
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:90
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:212
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:166
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:79
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:70
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:99
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:86
T type
Definition: buffer_view.hpp:54
BufferSizeType buffer_size_
Definition: buffer_view.hpp:57
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:209
constexpr CK_TILE_DEVICE auto transpose_get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:395
int32x4_t cached_buf_res_
Definition: buffer_view.hpp:258
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:759
BufferSizeType buffer_size_
Definition: buffer_view.hpp:257
constexpr CK_TILE_DEVICE auto async_get_raw(remove_cvref_t< T > *smem, index_t i, index_t linear_offset, bool, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:474
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:300
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:288
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:304
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:313
CK_TILE_HOST_DEVICE void print() const
Definition: buffer_view.hpp:761
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x, bool_constant< oob_conditional_check >={})
Definition: buffer_view.hpp:501
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:293
CK_TILE_DEVICE void update_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
Definition: buffer_view.hpp:542
constexpr CK_TILE_DEVICE auto async_get(CK_TILE_LDS_ADDR remove_cvref_t< T > *smem, index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:441
T type
Definition: buffer_view.hpp:254
constexpr CK_TILE_DEVICE auto get_raw(remove_cvref_t< X > &dst, index_t v_offset, index_t i_offset, bool is_valid_element, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:415
CK_TILE_DEVICE void atomic_add(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:638
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:756
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:276
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:268
CK_TILE_DEVICE void set_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:616
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:263
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:571
CK_TILE_DEVICE void atomic_max(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:723
CK_TILE_DEVICE void atomic_add_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:692
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:970
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:804
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:1152
BufferSizeType buffer_size_
Definition: buffer_view.hpp:801
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:1155
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:821
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:843
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:950
constexpr CK_TILE_DEVICE auto transpose_get([[maybe_unused]] index_t i, [[maybe_unused]] index_t linear_offset, bool is_valid_element) const
Definition: buffer_view.hpp:907
CK_TILE_HOST_DEVICE void print() const
Definition: buffer_view.hpp:1157
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:830
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:834
T type
Definition: buffer_view.hpp:798
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:814
constexpr CK_TILE_DEVICE auto get_raw(remove_cvref_t< X > &dst, index_t v_offset, index_t i_offset, bool, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:893
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:809
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:823
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:1330
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:1217
CK_TILE_HOST_DEVICE void print() const
Definition: buffer_view.hpp:1332
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:1327
T type
Definition: buffer_view.hpp:1194
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:1230
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:1304
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:1200
BufferSizeType buffer_size_
Definition: buffer_view.hpp:1197
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:1210
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:1226
constexpr CK_TILE_HOST_DEVICE buffer_view(T *p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:1205
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:1284
constexpr CK_TILE_DEVICE auto get(index_t i, index_t, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:1239
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:1219
Definition: buffer_view.hpp:38
Definition: integral_constant.hpp:13
Definition: numeric.hpp:81
Definition: numeric.hpp:18
Definition: pk_int4.hpp:21
Definition: amd_buffer_addressing.hpp:836
Definition: vector_type.hpp:89