34 template <
typename BottomTensorView_,
35 typename WindowLengths_,
36 typename StaticTileDistribution_,
40 tile_window_with_static_distribution<BottomTensorView_,
42 StaticTileDistribution_,
46 StaticTileDistribution_>
51 StaticTileDistribution_,
55 StaticTileDistribution_>;
59 static_assert(NumCoord == 1);
61 static_assert(Base::Traits::NumAccess % NumCoord == 0,
62 "wrong! # of access is not divisible by NumCoord");
83 if constexpr(Base::BottomTensorView::buffer_view::get_address_space() ==
84 address_space_enum::global)
86 auto use_lane_id_0 = partition_index;
121 window_origin + window_adaptor_thread_coord_tmp.get_bottom_index();
124 bottom_tensor_view.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp);
128 using Traits =
typename Base::Traits;
129 using SFC_Ys =
typename Traits::SFC_Ys;
132 auto window_adaptor_thread_coord = window_adaptor_thread_coord_tmp;
133 auto bottom_tensor_thread_coord = bottom_tensor_thread_coord_tmp;
135 constexpr
auto idx_diff_ys =
143 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
145 coords(iCoord) =
make_tuple(window_adaptor_thread_coord, bottom_tensor_thread_coord);
151 template <
index_t i_access_unsupport_ = -1,
bool oob_conditional_check =
true>
156 0, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
159 template <
index_t i_access_unsupport_ = -1,
160 bool oob_conditional_check =
true,
167 auto dst_tensor = make_static_distributed_tensor<typename Base::DataType>(tile_dstr);
170 number<i_access_unsupport_>{},
171 bool_constant<oob_conditional_check>{});
185 template <
typename TileWindow_,
186 typename ElementWise_,
187 index_t i_access_unsupport_ = -1,
188 bool oob_conditional_check =
true>
190 ElementWise_ elementwise,
195 auto dst_tensor = make_static_distributed_tensor<typename Base::DataType>(tile_dstr);
199 number<i_access_unsupport_>{},
200 bool_constant<oob_conditional_check>{});
204 template <
typename DistributedTensor,
205 typename TileWindow_,
206 typename ElementWise_,
207 index_t i_access_unsupport_ = -1,
208 bool oob_conditional_check =
true>
210 const TileWindow_& tile_window,
211 ElementWise_ elementwise,
216 using Traits =
typename Base::Traits;
217 using vector_t =
typename Traits::vector_t;
218 using SFC_Ys =
typename Traits::SFC_Ys;
221 constexpr
auto sizeOfTuple = TileWindow_::size();
223 static_for<0, NumCoord, 1>{}([&](
auto iCoord) {
225 auto window_adaptor_thread_coord =
226 tile_window[number<0>{}].pre_computed_coords_[iCoord][
I0];
227 auto bottom_tensor_thread_coord =
228 tile_window[number<0>{}].pre_computed_coords_[iCoord][
I1];
230 static_for<0, NumAccessPerCoord, 1>{}([&](
auto iCoordAccess) {
231 constexpr
auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
234 constexpr
auto idx_ys_start = SFC_Ys::get_index(iAccess);
239 return tile_window[number<jj>{}]
240 .get_bottom_tensor_view()
241 .
template get_vectorized_elements<vector_t>(
242 bottom_tensor_thread_coord,
244 bool_constant<oob_conditional_check>{});
246 number<sizeOfTuple>{});
249 static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](
auto j) {
252 return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
255 number<Base::NDimY>{});
258 tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
263 elementwise(dst_tensor.get_thread_buffer().template at<d>(),
272 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
275 generate_tuple([&](
auto) {
return number<0>{}; }, number<Base::NDimP>{}),
279 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
285 template <
typename DistributedTensor,
286 index_t i_access_unsupport_ = -1,
287 bool oob_conditional_check =
true>
293 0, dst_tensor, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
296 template <
typename offset_t>
299 constexpr
auto bottom_tensor_idx_off =
to_multi_index(offset_t{});
306 typename StaticTileDistribution,
307 index_t i_access_unsupport_ = -1,
308 bool oob_conditional_check =
true,
316 using Traits =
typename Base::Traits;
317 using vector_t =
typename Traits::vector_t;
318 using SFC_Ys =
typename Traits::SFC_Ys;
322 const index_t linear_off = [&]() {
323 if constexpr(std::is_integral_v<offset_t>)
325 else if constexpr(is_constant_v<offset_t>)
331 static_for<0, NumCoord, 1>{}([&](
auto iCoord) {
336 static_for<0, NumAccessPerCoord, 1>{}([&](
auto iCoordAccess) {
337 constexpr
auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
340 constexpr
auto idx_ys_start = SFC_Ys::get_index(iAccess);
343 const vector_t vec_value =
345 bottom_tensor_thread_coord,
347 bool_constant<oob_conditional_check>{});
349 static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](
auto j) {
352 return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
355 number<Base::NDimY>{});
358 tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
363 .template get_as<typename Base::DataType>()[j / Traits::PackedSize];
368 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
371 generate_tuple([&](
auto) {
return number<0>{}; }, number<Base::NDimP>{}),
375 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
381 template <
typename DstTile,
382 index_t i_access_unsupport_ = -1,
383 bool oob_conditional_check =
true,
384 bool pre_nop =
false>
388 bool_constant<pre_nop> = {})
const
390 using Traits =
typename Base::Traits;
391 using vector_t =
typename Traits::vector_t;
392 using SFC_Ys =
typename Traits::SFC_Ys;
393 static constexpr
index_t YElementSize =
394 typename Base::TileDstr{}.get_ys_to_d_descriptor().get_element_space_size();
395 static_assert(YElementSize % (Traits::PackedSize * Traits::ScalarPerVector) == 0);
396 using vectorized_tbuf =
397 array<vector_t, YElementSize / (Traits::PackedSize * Traits::ScalarPerVector)>;
401 auto& dst_vec_tbuf =
reinterpret_cast<vectorized_tbuf&
>(dst_tensor.get_thread_buffer());
404 static_for<0, NumCoord, 1>{}([&](
auto iCoord) {
409 static_for<0, NumAccessPerCoord, 1>{}([&](
auto iCoordAccess) {
410 constexpr
auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
411 constexpr
auto pre_nop_ = [&]() {
412 if constexpr(pre_nop && iCoord == 0 && iCoordAccess == 0)
415 return bool_constant<false>{};
419 constexpr
auto idx_ys_start = SFC_Ys::get_index(iAccess);
421 tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start) /
423 static_assert(d % Traits::ScalarPerVector == 0);
426 dst_vec_tbuf.template at<d / Traits::ScalarPerVector>(),
427 bottom_tensor_thread_coord,
429 bool_constant<oob_conditional_check>{},
431 #if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE || \
432 CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
439 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
442 generate_tuple([&](
auto) {
return number<0>{}; }, number<Base::NDimP>{}),
446 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
453 template <
typename LdsTileWindow_,
454 index_t i_access_unsupport_ = -1,
455 bool oob_conditional_check =
true,
456 bool pre_nop =
false>
460 bool_constant<pre_nop> = {})
const
462 using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
464 using LdsDataType =
typename LdsTileWindow::DataType;
468 static_assert(LdsTileWindow::get_num_of_dimension() == 3);
471 lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
472 make_tuple(number<0>{}, number<0>{}, number<0>{})) *
476 lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
477 make_tuple(number<0>{}, number<1>{}, number<0>{})) *
478 sizeof(LdsDataType) -
482 lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
483 make_tuple(number<1>{}, number<0>{}, number<0>{})) *
484 sizeof(LdsDataType) -
489 size_per_buf + size_per_wave * get_warp_id(bool_constant<false>{});
493 using Traits =
typename Base::Traits;
495 using vector_t =
typename Traits::vector_t;
496 using SFC_Ys =
typename Traits::SFC_Ys;
498 LdsDataType* smem = lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_;
501 static_for<0, NumCoord, 1>{}([&](
auto iCoord) {
506 static_for<0, NumAccessPerCoord, 1>{}([&](
auto iCoordAccess) {
507 constexpr
auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
508 constexpr
auto pre_nop_ = [&]() {
509 if constexpr(pre_nop && iCoord == 0 && iCoordAccess == 0)
512 return bool_constant<false>{};
517 smem, bottom_tensor_thread_coord, 0, pre_nop_);
522 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
525 generate_tuple([&](
auto) {
return number<0>{}; }, number<Base::NDimP>{}),
529 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
537 template <
typename LdsTileWindow_,
538 index_t i_access_unsupport_ = -1,
539 bool oob_conditional_check =
true,
540 bool static_move_ys =
false,
541 typename = std::enable_if_t<std::is_class_v<remove_cvref_t<LdsTileWindow_>>>>
543 LdsTileWindow_&& lds_tile,
546 bool_constant<static_move_ys> = {})
const
548 using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
549 using LdsDataType =
typename LdsTileWindow::DataType;
550 using Traits =
typename Base::Traits;
552 using vector_t =
typename Traits::vector_t;
553 using SFC_Ys =
typename Traits::SFC_Ys;
556 const auto window_origin = lds_tile.get_window_origin();
557 const auto& bottom_tensor_view = lds_tile.get_bottom_tensor_view();
558 const auto& tensor_descriptor = bottom_tensor_view.get_tensor_descriptor();
559 auto lds_base_ptr = bottom_tensor_view.get_buffer_view().p_data_;
561 static_for<0, NumCoord, 1>{}([&](
auto iCoord) {
568 static_for<0, NumAccessPerCoord, 1>{}([&](
auto iCoordAccess) {
569 constexpr
auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
571 constexpr
auto idx_ys_offset = [&]() {
572 constexpr
auto idx_off_ys = SFC_Ys::get_step_between(number<0>{}, iAccess);
574 StaticTileDistribution_{}.get_ps_ys_to_xs_adaptor(),
577 return adapter_ys_offset.get_bottom_index();
579 const auto lds_ys_offset = [&]() {
580 if constexpr(static_move_ys)
582 const auto coord_ys_offset =
584 return coord_ys_offset.get_offset();
591 auto lds_bottom_tensor_thread_idx =
592 window_origin + window_adaptor_warp_coord.get_bottom_index();
593 const auto lds_coord =
598 lds_coord.get_offset() / Traits::PackedSize +
599 lds_ys_offset / Traits::PackedSize;
601 const auto dram_ys_offset = [&]() {
602 if constexpr(static_move_ys)
606 return coord_ys_offset.get_offset();
614 bottom_tensor_thread_coord,
615 offset + dram_ys_offset,
616 bool_constant<oob_conditional_check>{});
621 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
623 generate_tuple([&](
auto) {
return number<0>{}; }, number<Base::NDimP>{}),
626 if constexpr(!static_move_ys)
628 window_adaptor_thread_coord,
629 bottom_tensor_thread_coord,
632 if constexpr(!static_move_ys)
634 window_adaptor_warp_coord, bottom_tensor_warp_coord, idx_diff_ps_ys);
640 template <
typename Policy,
index_t i_access_unsupport_ = -1,
bool oob_conditional_check =
true>
644 return this->
template load_transpose_with_offset<Policy>(
645 0, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
648 template <
typename Policy,
index_t i_access_unsupport_ = -1,
bool oob_conditional_check =
true>
654 auto dst_tensor = make_static_distributed_tensor<typename Base::DataType>(tile_dstr);
655 this->
template load_transpose_with_offset<Policy>(offset,
657 number<i_access_unsupport_>{},
658 bool_constant<oob_conditional_check>{});
662 template <
typename Policy,
663 typename DistributedTensor,
664 index_t i_access_unsupport_ = -1,
665 bool oob_conditional_check =
true>
667 DistributedTensor& dst_tensor,
671 using Traits =
typename Base::Traits;
672 using vector_t =
typename Traits::vector_t;
673 using SFC_Ys =
typename Traits::SFC_Ys;
677 constexpr
auto group_func = Policy::group_func;
680 static_for<0, NumCoord, 1>{}([&](
auto iCoord) {
685 static_for<0, NumAccessPerCoord, 1>{}([&](
auto iCoordAccess) {
686 constexpr
auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
689 constexpr
auto idx_ys_start = SFC_Ys::get_index(iAccess);
692 const vector_t vec_value =
694 .template get_transpose_vectorized_elements<vector_t>(
695 bottom_tensor_thread_coord, offset);
697 static_for<0, Traits::ScalarPerVector, 1>{}([&](
auto j) {
700 return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
703 number<Base::NDimY>{});
705 constexpr
auto grouped_idx_ys = group_func(orig_idx_ys);
707 constexpr
index_t linear_distributed_index =
708 tile_dstr.get_ys_to_d_descriptor().calculate_offset(grouped_idx_ys);
710 dst_tensor.get_thread_buffer().template at<linear_distributed_index>() =
711 vec_value.template get_as<typename Base::DataType>()[j];
716 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
719 generate_tuple([&](
auto) {
return number<0>{}; }, number<Base::NDimP>{}),
723 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
729 template <
index_t i_access_unsupport_ = -1,
bool oob_conditional_check =
true>
735 using Traits =
typename Base::Traits;
737 using vector_t =
typename Traits::vector_t;
738 using SFC_Ys =
typename Traits::SFC_Ys;
743 static_for<0, NumCoord, 1>{}([&](
auto iCoord) {
747 static_for<0, NumAccessPerCoord, 1>{}([&](
auto iCoordAccess) {
748 constexpr
auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
751 constexpr
auto idx_ys_start = SFC_Ys::get_index(iAccess);
757 static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](
auto j) {
760 return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
763 number<Base::NDimY>{});
766 tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
769 vec_value.template get_as<typename Base::DataType>()(j / Traits::PackedSize) =
770 dstr_tensor.get_thread_buffer().template at<d>();
777 bottom_tensor_thread_coord,
780 bool_constant<oob_conditional_check>{});
785 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
788 generate_tuple([&](
auto) {
return number<0>{}; }, number<Base::NDimP>{}),
792 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
798 template <
index_t i_access_unsupport_ = -1>
804 using Traits =
typename Base::Traits;
806 using vector_t =
typename Traits::vector_t;
807 using SFC_Ys =
typename Traits::SFC_Ys;
810 static constexpr
bool oob_conditional_check =
true;
813 static_for<0, NumCoord, 1>{}([&](
auto iCoord) {
818 static_for<0, NumAccessPerCoord, 1>{}([&](
auto iCoordAccess) {
819 constexpr
auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
822 constexpr
auto idx_ys_start = SFC_Ys::get_index(iAccess);
826 static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](
auto j) {
829 return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
832 number<Base::NDimY>{});
834 tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
836 vec_value.template get_as<typename Base::DataType>()(j / Traits::PackedSize) =
842 .template set_vectorized_elements_raw<vector_t, oob_conditional_check>(
843 bottom_tensor_thread_coord, 0, vec_value);
848 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
851 generate_tuple([&](
auto) {
return number<0>{}; }, number<Base::NDimP>{}),
855 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
861 template <
index_t i_access_unsupport_ = -1,
bool oob_conditional_check =
true>
868 using Traits =
typename Base::Traits;
870 using vector_t =
typename Traits::vector_t;
871 using SFC_Ys =
typename Traits::SFC_Ys;
876 static_for<0, NumCoord, 1>{}([&](
auto iCoord) {
881 static_for<0, NumAccessPerCoord, 1>{}([&](
auto iCoordAccess) {
882 constexpr
auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
885 constexpr
auto idx_ys_start = SFC_Ys::get_index(iAccess);
890 static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](
auto j) {
893 return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
896 number<Base::NDimY>{});
899 tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
902 vec_value.template get_as<typename Base::DataType>()(j / Traits::PackedSize) =
908 bottom_tensor_thread_coord,
911 bool_constant<oob_conditional_check>{});
916 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
919 generate_tuple([&](
auto) {
return number<0>{}; }, number<Base::NDimP>{}),
923 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
929 template <
index_t i_access_unsupport_ = -1,
bool oob_conditional_check =
true,
bool pre_nop>
935 bool_constant<pre_nop> = {})
const
937 using Traits =
typename Base::Traits;
939 using vector_t =
typename Traits::vector_t;
940 using SFC_Ys =
typename Traits::SFC_Ys;
945 static_for<0, NumCoord, 1>{}([&](
auto iCoord) {
950 static_for<0, NumAccessPerCoord, 1>{}([&](
auto iCoordAccess) {
951 constexpr
auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
954 constexpr
auto idx_ys_start = SFC_Ys::get_index(iAccess);
959 static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](
auto j) {
962 return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
965 number<Base::NDimY>{});
968 tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
971 vec_value.template get_as<typename Base::DataType>()(j / Traits::PackedSize) =
977 bottom_tensor_thread_coord,
980 bool_constant<oob_conditional_check>{},
981 bool_constant<pre_nop>{});
986 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
989 generate_tuple([&](
auto) {
return number<0>{}; }, number<Base::NDimP>{}),
993 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
1008 if constexpr(Base::BottomTensorView::buffer_view::get_address_space() ==
1009 address_space_enum::global)
1023 const auto window_adaptor_thread_coord_tmp =
1029 this->
window_origin_ + window_adaptor_thread_coord_tmp.get_bottom_index();
1036 using Traits =
typename Base::Traits;
1037 using SFC_Ys =
typename Traits::SFC_Ys;
1040 auto window_adaptor_thread_coord = window_adaptor_thread_coord_tmp;
1041 auto bottom_tensor_thread_coord = bottom_tensor_thread_coord_tmp;
1043 constexpr
auto idx_diff_ys =
1051 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
1054 make_tuple(window_adaptor_thread_coord, bottom_tensor_thread_coord);
1065 Base::BottomTensorView::buffer_view::get_address_space() == address_space_enum::global,
1072 template <
typename TensorView_,
1073 typename WindowLengths_,
1074 typename StaticTileDistribution_,
1078 const WindowLengths_& window_lengths,
1079 const multi_index<TensorView_::get_num_of_dimension()>& origin,
1083 return tile_window_with_static_distribution<remove_cvref_t<TensorView_>,
1084 remove_cvref_t<WindowLengths_>,
1085 remove_cvref_t<StaticTileDistribution_>,
1087 tensor_view, window_lengths, origin, tile_distribution};
1090 template <
typename TensorView_,
1091 typename WindowLengths_,
1092 typename StaticTileDistribution_,
1094 typename = std::enable_if_t<is_tensor_view_v<TensorView_> &&
1095 is_tile_distribution_v<StaticTileDistribution_>>>
1098 const WindowLengths_& window_lengths,
1099 const multi_index<TensorView_::get_num_of_dimension()>& origin,
1104 return tile_window_with_static_distribution<remove_cvref_t<TensorView_>,
1105 remove_cvref_t<WindowLengths_>,
1106 remove_cvref_t<StaticTileDistribution_>,
1108 tensor_view, window_lengths, origin, tile_distribution, partition_index};
1112 template <
typename TensorView_,
1113 typename WindowLengths_,
1114 typename StaticTileDistribution_,
1118 const WindowLengths_& window_lengths,
1119 const multi_index<TensorView_::get_num_of_dimension()>& origin,
1123 auto w = tile_window_with_static_distribution<remove_cvref_t<TensorView_>,
1124 remove_cvref_t<WindowLengths_>,
1125 remove_cvref_t<StaticTileDistribution_>,
1127 tensor_view, window_lengths, origin, tile_distribution};
1132 template <
typename TensorView_,
1133 typename WindowLengths_,
1134 typename StaticTileDistribution_,
1139 StaticTileDistribution_,
1143 StaticTileDistribution_,
1144 NumCoord>::BottomTensorIndex& step)
1149 template <
typename TensorView_,
1150 typename WindowLengths_,
1151 typename StaticTileDistribution_,
1156 StaticTileDistribution_,
1160 StaticTileDistribution_,
1161 NumCoord>::BottomTensorIndex& step)
1165 StaticTileDistribution_,
1168 static constexpr
auto N = T::size();
1172 template <
typename TileWindowWithStaticDistributionType,
1178 static constexpr
auto N = TileWindowWithStaticDistributionType::size();
1190 template <
typename BottomTensorView_,
typename WindowLengths_>
1192 :
public tile_window_base<tile_window_with_static_lengths<BottomTensorView_, WindowLengths_>,
1226 template <
typename DataType>
1231 const char* label =
"")
const
1236 printf(
"%s Window Range [%d:%d, %d:%d] (origin: %d, %d):\n",
1245 for(
index_t i = start_i; i < end_i; i++)
1247 for(
index_t j = start_j; j < end_j; j++)
1252 make_tuple(window_origin[0] + i, window_origin[1] + j));
1256 auto buf =
tensor_view.template get_vectorized_elements<ThreadBuf>(coord, 0);
1258 printf(
" %s[%d,%d] = %f", label, i, j, type_convert<float>(
value));
1266 template <
typename TensorView_,
typename WindowLengths_>
1269 const WindowLengths_& window_lengths,
1270 const multi_index<TensorView_::get_num_of_dimension()>& origin)
1273 "wrong! lengths should be static");
1281 template <
typename TensorView,
typename WindowLengths>
1284 const multi_index<TensorView::get_num_of_dimension()>& origin)
1290 template <
typename TensorView,
typename WindowLengths,
typename StaticTileDistribution>
1293 const multi_index<TensorView::get_num_of_dimension()>& origin,
1302 template <
typename TensorView,
typename WindowLengths,
typename StaticTileDistribution>
1313 template <
typename TensorView,
typename WindowLengths,
typename StaticTileDistribution>
1326 template <
typename TensorView,
typename WindowLengths,
typename StaticTileDistribution>
1336 template <
typename TensorView_,
typename WindowLengths_>
1345 template <
typename NewTensorView_,
1346 typename OldTensorView_,
1347 typename WindowLengths_,
1348 typename StaticTileDistribution_,
1354 StaticTileDistribution_,
1355 NumCoord>& tile_window)
1360 tile_window.get_tile_distribution());
1363 template <
typename NewTensorView_,
typename OldTensorView_,
typename WindowLengths_>
1365 const NewTensorView_& new_tensor_view,
1379 template <
typename T>
1392 template <
typename BottomTensorView_,
1393 typename WindowLengths_,
1394 typename StaticTileDistribution_,
1399 StaticTileDistribution_,
1411 template <
typename T>
1422 template <
typename T>
1433 template <
typename BottomTensorView_,
typename WindowLengths_>
1446 template <
typename T>
#define CK_TILE_DEVICE
Definition: config.hpp:45
#define CK_TILE_LDS_ADDR
Definition: config.hpp:62
Definition: cluster_descriptor.hpp:13
constexpr decltype(auto) apply(F &&f, Tuple &&t)
Definition: tuple.hpp:526
constexpr CK_TILE_HOST_DEVICE auto to_array(const std::vector< X > &x)
Definition: array.hpp:286
constexpr bool is_tile_window_with_static_distribution_v
Helper variable template to check if a type is a tile window with static distribution.
Definition: tile_window.hpp:1412
__device__ uint32_t amd_wave_read_first_lane(uint16_t v)
Definition: amd_buffer_addressing.hpp:35
constexpr CK_TILE_HOST_DEVICE void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const Index &coord_step)
Definition: tensor_coordinate.hpp:72
CK_TILE_DEVICE auto replace_bottom_tensor_view(const NewTensorView_ &new_tensor_view, const tile_scatter_gather< OldTensorView_, WindowLengths_, StaticTileDistribution_, StaticPageIndexArray_, StaticValidArray_, HsGatherDim, NumCoord > &tile_window)
Definition: tile_scatter_gather.hpp:1041
CK_TILE_DEVICE auto make_tile_window_raw(const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, number< NumCoord >={})
Definition: tile_window.hpp:1117
constexpr CK_TILE_HOST_DEVICE auto make_tensor_adaptor_coordinate(const Adaptor &adaptor, const TopIndex &idx_top)
Definition: tensor_adaptor_coordinate.hpp:56
constant< b > bool_constant
Definition: integral_constant.hpp:43
int32_t index_t
Definition: integer.hpp:9
constexpr CK_TILE_HOST_DEVICE auto make_tensor_coordinate(const TensorDesc &tensor_desc, const TopIndex &idx_top)
Definition: tensor_coordinate.hpp:60
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:21
constexpr CK_TILE_HOST_DEVICE auto to_multi_index(const T &x)
Definition: multi_index.hpp:33
constexpr bool is_tile_window_with_static_lengths_v
Helper variable template to check if a type is a tile window with static lengths.
Definition: tile_window.hpp:1447
CK_TILE_HOST_DEVICE auto get_partition_index(Distribution)
Definition: tile_distribution.hpp:21
constexpr CK_TILE_DEVICE auto make_tile_window(null_tensor_view, const WindowLengths &window_lengths, const multi_index< WindowLengths::size()> &, Ts &&...)
Definition: null_tile_window.hpp:75
CK_TILE_DEVICE void move_tile_window(null_tile_window< WindowLengths > &, const typename null_tile_window< WindowLengths >::BottomTensorIndex &)
Definition: null_tile_window.hpp:95
constexpr CK_TILE_HOST_DEVICE auto generate_tuple(F &&f, number< N >)
Definition: tuple.hpp:429
CK_TILE_DEVICE void m0_set_with_memory(index_t v)
Definition: utility.hpp:19
constexpr CK_TILE_HOST_DEVICE auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:360
CK_TILE_DEVICE void m0_inc_with_memory(index_t v)
Definition: utility.hpp:25
constexpr CK_TILE_HOST_DEVICE auto container_concat(const X &x, const Ys &... ys)
Definition: container_helper.hpp:363
typename conditional< predicate, X, Y >::type conditional_t
Definition: functional.hpp:115
bool_constant< false > false_type
Definition: integral_constant.hpp:63
typename std::enable_if< B, T >::type enable_if_t
Definition: enable_if.hpp:27
bool_constant< true > true_type
Definition: integral_constant.hpp:62
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1697
Definition: integral_constant.hpp:13
Definition: type_traits.hpp:76
Type trait to determine if a type is a tile window with static distribution.
Definition: tile_window.hpp:1381
Type trait to determine if a type is a tile window with static lengths.
Definition: tile_window.hpp:1424
Definition: coordinate_transform.hpp:1392
Definition: static_distributed_tensor.hpp:21
constexpr CK_TILE_HOST_DEVICE const auto & get_thread_buffer() const
Definition: static_distributed_tensor.hpp:58
Definition: functional.hpp:81
Definition: functional.hpp:43
Definition: tensor_view.hpp:41
constexpr CK_TILE_HOST_DEVICE auto & get_tensor_descriptor() const
Definition: tensor_view.hpp:61
Definition: tile_distribution.hpp:70
constexpr CK_TILE_HOST_DEVICE const auto & get_ps_ys_to_xs_adaptor() const
Definition: tile_distribution.hpp:124
This class provides description of tile windowed view on the device memory.
Definition: tile_window_base.hpp:31
BottomTensorView bottom_tensor_view_
Definition: tile_window_base.hpp:85
remove_cvref_t< typename BottomTensorView::DataType > DataType
Definition: tile_window_base.hpp:36
constexpr CK_TILE_DEVICE auto get_window_origin() const
Definition: tile_window_base.hpp:45
BottomTensorIndex window_origin_
Definition: tile_window_base.hpp:79
constexpr CK_TILE_DEVICE auto get_bottom_tensor_view() const
Definition: tile_window_base.hpp:47
CK_TILE_DEVICE void move(const BottomTensorIndex &step)
Definition: tile_window_base.hpp:67
constexpr CK_TILE_DEVICE auto get_window_lengths() const
Definition: tile_window_base.hpp:46
remove_reference_t< BottomTensorView_ > BottomTensorView
Definition: tile_window_base.hpp:33
remove_cvref_t< WindowLengths_ > WindowLengths
Definition: tile_window_base.hpp:34
WindowLengths window_lengths_
Definition: tile_window_base.hpp:81
This class provides tile (windowed) view and access to the device memory.
Definition: tile_window.hpp:47
CK_TILE_DEVICE void store_raw(const static_distributed_tensor< typename Base::DataType, typename Base::TileDstr > &dstr_tensor, number< i_access_unsupport_ >={}) const
Definition: tile_window.hpp:800
CK_TILE_DEVICE void move_extended(const typename Base::BottomTensorIndex &step)
Definition: tile_window.hpp:1000
constexpr CK_TILE_DEVICE tile_window_with_static_distribution(const typename Base::BottomTensorView &bottom_tensor_view, const typename Base::WindowLengths &window_lengths, const typename Base::BottomTensorIndex &window_origin, const typename Base::TileDstr &tile_distribution, decltype(get_partition_index(tile_distribution)) partition_index)
Definition: tile_window.hpp:67
CK_TILE_DEVICE void async_load_raw(LdsTileWindow_ &&lds_tile, number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) const
Definition: tile_window.hpp:457
CK_TILE_DEVICE void set_window_origin_extended(const typename Base::BottomTensorIndex &)
Definition: tile_window.hpp:1019
array< tuple< typename Base::WindowAdaptorCoord, typename Base::BottomTensorCoord >, NumCoord > pre_computed_coords_
Definition: tile_window.hpp:1062
CK_TILE_DEVICE auto load_transpose(number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window.hpp:641
CK_TILE_DEVICE auto load_transpose_with_offset(index_t offset, number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window.hpp:649
CK_TILE_DEVICE void update_raw(const static_distributed_tensor< typename Base::DataType, typename Base::TileDstr > &dstr_tensor, number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) const
Definition: tile_window.hpp:931
CK_TILE_DEVICE void load_with_offset(offset_t offset, static_distributed_tensor< DataType, StaticTileDistribution > &dst_tensor, number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window.hpp:310
constexpr CK_TILE_DEVICE tile_window_with_static_distribution()=default
CK_TILE_DEVICE void async_load_with_offset(index_t offset, LdsTileWindow_ &&lds_tile, number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}, bool_constant< static_move_ys >={}) const
Definition: tile_window.hpp:542
constexpr CK_TILE_DEVICE auto get_load_offset(offset_t={}) const
Definition: tile_window.hpp:297
CK_TILE_DEVICE auto load(number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window.hpp:152
CK_TILE_DEVICE void load(DistributedTensor &dst_tensor, number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window.hpp:288
constexpr CK_TILE_DEVICE auto prepare_coords(const typename Base::BottomTensorView &bottom_tensor_view, const typename Base::BottomTensorIndex &window_origin, const typename Base::TileDstr &tile_distribution, decltype(get_partition_index(tile_distribution)) partition_index) const
Definition: tile_window.hpp:108
static constexpr auto I0
Definition: tile_window.hpp:57
CK_TILE_DEVICE auto load(const TileWindow_ &tile_window, ElementWise_ elementwise, number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}) const
Load tile with elementwise function.
Definition: tile_window.hpp:189
CK_TILE_DEVICE void load_raw(DstTile &dst_tensor, number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) const
Definition: tile_window.hpp:385
CK_TILE_DEVICE void load(DistributedTensor &dst_tensor, const TileWindow_ &tile_window, ElementWise_ elementwise, number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window.hpp:209
static constexpr auto I1
Definition: tile_window.hpp:58
CK_TILE_DEVICE auto load_with_offset(offset_t offset, number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window.hpp:162
CK_TILE_DEVICE void update(const static_distributed_tensor< typename Base::DataType, typename Base::TileDstr > &dstr_tensor, number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window.hpp:863
constexpr CK_TILE_DEVICE tile_window_with_static_distribution(const typename Base::BottomTensorView &bottom_tensor_view, const typename Base::WindowLengths &window_lengths, const typename Base::BottomTensorIndex &window_origin, const typename Base::TileDstr &tile_distribution)
Definition: tile_window.hpp:94
std::conditional_t< Base::BottomTensorView::buffer_view::get_address_space()==address_space_enum::global, array< tuple< typename Base::WindowAdaptorCoord, typename Base::BottomTensorCoord >, NumCoord >, std::byte > pre_computed_warp_coords_
Definition: tile_window.hpp:1068
static constexpr index_t NumAccessPerCoord
Definition: tile_window.hpp:63
CK_TILE_DEVICE void load_transpose_with_offset(index_t offset, DistributedTensor &dst_tensor, number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window.hpp:666
CK_TILE_DEVICE void store(const static_distributed_tensor< typename Base::DataType, typename Base::TileDstr > &dstr_tensor, number< i_access_unsupport_ >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window.hpp:730
This class provides description of tile windowed view on the device memory.
Definition: tile_window.hpp:1195
constexpr CK_TILE_DEVICE tile_window_with_static_lengths()=default
CK_TILE_DEVICE void print_tile_window_range(index_t start_i, index_t end_i, index_t start_j, index_t end_j, const char *label="") const
Definition: tile_window.hpp:1227
constexpr CK_TILE_DEVICE tile_window_with_static_lengths(const typename Base::BottomTensorView &bottom_tensor_view, const typename Base::WindowLengths &window_lengths, const typename Base::BottomTensorIndex &window_origin)
Definition: tile_window.hpp:1203
Definition: tile_window_base.hpp:94
remove_cvref_t< StaticTileDistribution_ > TileDstr
Definition: tile_window_base.hpp:95
CK_TILE_DEVICE void move_window_adaptor_and_bottom_tensor_thread_coordinate(WindowAdaptorCoord &window_adaptor_thread_coord, BottomTensorCoord &bottom_tensor_thread_coord, const ATopIndex &idx_diff_adaptor_top) const
Definition: tile_window_base.hpp:129
TileDstr tile_dstr_
Definition: tile_window_base.hpp:253
Definition: tuple.hpp:192