22 #define WINDOW_DISPATCH_ISSUE() \
23 if constexpr(i_access < 0) \
25 static_for<0, NumAccess, 1>{}([&](auto ia) { issue(ia); }); \
29 static_assert(i_access < NumAccess); \
30 issue(number<i_access>{}); \
43 template <
typename BottomTensorView_,
44 typename WindowLengths_,
45 typename StaticTileDistribution_,
46 typename LinearBottomDims_>
50 StaticTileDistribution_,
54 StaticTileDistribution_>
58 StaticTileDistribution_,
62 StaticTileDistribution_>;
66 static_assert(LinearBottomDims::size() == Base::BottomTensorView::get_num_of_dimension());
74 static constexpr
auto get_num_non_linear_access()
76 constexpr
auto sfc_access_lens = Base::Traits::SFC_Ys::access_lengths;
77 using ys_to_rhs_major =
typename decltype(
78 typename Base::TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
80 constexpr
auto non_linear = [&]() {
83 constexpr
auto rhs_major = ys_to_rhs_major{}[i_dim_y];
84 constexpr
auto target_h_dim =
number<rhs_major - 1>{};
87 cnt *= sfc_access_lens[i_dim_y];
109 static constexpr
auto get_non_linear_access_map()
111 constexpr
auto sfc_access_lens = Base::Traits::SFC_Ys::access_lengths;
112 using ys_to_rhs_major =
typename decltype(
113 typename Base::TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
114 constexpr
auto non_linear_map = [&]() {
117 index_t cumulative_non_linear_len_ = 1;
120 constexpr
auto rhs_major = ys_to_rhs_major{}[i_dim_y];
121 constexpr
auto target_h_dim =
number<rhs_major - 1>{};
125 constexpr
auto current_len_ = sfc_access_lens[i_dim_y];
128 for(
auto i_ = 0; i_ < cumulative_len_; i_++)
130 current_m_(i_) = m_[i_];
132 for(
auto j_ = 0; j_ < current_len_; j_++)
134 auto j_offset_ = is_linear_dim ? 0 : j_ * cumulative_non_linear_len_;
135 for(
auto i_ = 0; i_ < cumulative_len_; i_++)
137 m_(j_ * cumulative_len_ + i_) = current_m_[i_] + j_offset_;
140 cumulative_len_ *= current_len_;
142 cumulative_non_linear_len_ *= current_len_;
147 return TO_SEQUENCE(non_linear_map, Base::Traits::NumAccess);
150 static constexpr
auto get_non_linear_access_histogram()
152 constexpr
auto m_ = get_non_linear_access_map();
162 static constexpr
auto get_non_linear_access_histogram_prefix_sum()
164 constexpr
auto h_ = get_non_linear_access_histogram();
166 return h_prefix_sum_;
202 window_origin + window_adaptor_thread_coord_tmp.get_bottom_index();
208 using SFC_Ys =
typename Base::Traits::SFC_Ys;
212 constexpr
auto need_save_non_linear_coord =
215 if constexpr(need_save_non_linear_coord)
227 if constexpr(i_access != (
NumAccess - 1))
229 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(i_access);
235 window_adaptor_thread_coord_tmp,
236 bottom_tensor_thread_coord_tmp,
242 template <index_t i_access>
245 using SFC_Ys =
typename Base::Traits::SFC_Ys;
247 using ys_to_rhs_major =
typename decltype(
248 typename Base::TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
252 constexpr
auto rhs_major = ys_to_rhs_major{}[i_dim_y];
253 constexpr
auto target_h_dim =
number<rhs_major - 1>{};
265 constexpr
auto adaptor_ =
typename Base::TileDstr{}.get_ps_ys_to_xs_adaptor();
266 constexpr
auto idx_ =
269 return adaptor_.calculate_bottom_index(idx_);
272 template <index_t i_access>
276 constexpr
auto is_pure_linear_tensor =
278 if constexpr(is_pure_linear_tensor)
284 return bottom_tensor_coord.get_offset();
292 constexpr
index_t linear_offset = [&]() {
293 constexpr
auto x_idx_ = linear_coord;
295 static_assert(x_idx_.size() == x_len_.size());
296 constexpr
index_t x_dims_ = x_idx_.size();
300 auto r_i_ =
number<x_dims_ - i_ - 1>{};
301 cu_offset_ += x_idx_[r_i_] * cu_stride_;
302 cu_stride_ *= x_len_[r_i_];
306 return linear_offset;
310 template <
index_t i_access = -1,
bool oob_conditional_check =
true>
313 using vector_t =
typename Base::Traits::vector_t;
314 using SFC_Ys =
typename Base::Traits::SFC_Ys;
318 auto dst_tensor = make_static_distributed_tensor<typename Base::DataType>(tile_dstr);
320 auto issue = [&](
auto i_access_) {
321 constexpr
auto IAccess = number<i_access_>{};
330 const vector_t vec_value =
332 bottom_tensor_thread_coord,
335 bool_constant<oob_conditional_check>{});
338 constexpr
auto idx_diff_ys = SFC_Ys::get_index(IAccess);
340 static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](
auto j) {
343 return jj == Base::Traits::VectorDimY ? (idx_diff_ys[jj] + j)
346 number<Base::NDimY>{});
348 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
349 Base::Traits::PackedSize;
351 dst_tensor.get_thread_buffer().template at<d>() =
353 .template get_as<typename Base::DataType>()[j / Base::Traits::PackedSize];
362 template <
typename DstTile,
index_t i_access = -1,
bool oob_conditional_check =
true>
367 using vector_t =
typename Base::Traits::vector_t;
368 using SFC_Ys =
typename Base::Traits::SFC_Ys;
374 auto issue = [&](
auto i_access_) {
375 constexpr
auto IAccess = number<i_access_>{};
384 const vector_t vec_value =
386 bottom_tensor_thread_coord,
389 bool_constant<oob_conditional_check>{});
391 constexpr
auto idx_diff_ys = SFC_Ys::get_index(IAccess);
393 static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](
auto j) {
396 return jj == Base::Traits::VectorDimY ? (idx_diff_ys[jj] + j)
399 number<Base::NDimY>{});
401 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
402 Base::Traits::PackedSize;
404 dst_tensor.get_thread_buffer().template at<d>() =
406 .template get_as<typename Base::DataType>()[j / Base::Traits::PackedSize];
415 template <
typename DstTile,
417 bool oob_conditional_check =
true,
418 bool pre_nop =
false>
422 bool_constant<pre_nop> = {})
const
424 using vector_t =
typename Base::Traits::vector_t;
425 using SFC_Ys =
typename Base::Traits::SFC_Ys;
426 static constexpr
index_t YElementSize =
427 typename Base::TileDstr{}.get_ys_to_d_descriptor().get_element_space_size();
428 static_assert(YElementSize % (Base::Traits::PackedSize * Base::Traits::ScalarPerVector) ==
430 using vectorized_tbuf =
432 YElementSize / (Base::Traits::PackedSize * Base::Traits::ScalarPerVector)>;
436 auto& dst_vec_tbuf =
reinterpret_cast<vectorized_tbuf&
>(dst_tensor.get_thread_buffer());
438 auto issue = [&](
auto i_access_) {
439 constexpr
auto IAccess = number<i_access_>{};
440 constexpr
auto pre_nop_ = [&]() {
441 if constexpr(pre_nop && i_access_ == 0 &&
442 Base::BottomTensorView::buffer_view::get_address_space() ==
444 return bool_constant<true>{};
446 return bool_constant<false>{};
455 constexpr
auto idx_ys_start = SFC_Ys::get_index(IAccess);
457 tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start) /
458 Base::Traits::PackedSize;
459 static_assert(d % Base::Traits::ScalarPerVector == 0);
462 dst_vec_tbuf.template at<d / Base::Traits::ScalarPerVector>(),
463 bottom_tensor_thread_coord,
466 bool_constant<oob_conditional_check>{},
468 #if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE || \
469 CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
478 template <
typename LdsTileWindow_,
480 bool oob_conditional_check =
true,
481 bool pre_nop =
false>
485 bool_constant<pre_nop> = {})
const
487 using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
488 using LdsDataType =
typename LdsTileWindow::DataType;
493 static_assert(Base::BottomTensorView::buffer_view::get_address_space() ==
497 static_assert(LdsTileWindow::get_num_of_dimension() == 3);
500 lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
501 make_tuple(number<0>{}, number<0>{}, number<0>{})) *
505 lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
506 make_tuple(number<0>{}, number<1>{}, number<0>{})) *
507 sizeof(LdsDataType) -
511 lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
512 make_tuple(number<1>{}, number<0>{}, number<0>{})) *
513 sizeof(LdsDataType) -
519 using vector_t =
typename Base::Traits::vector_t;
521 LdsDataType* smem = lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_;
524 auto issue = [&](
auto i_access_) {
525 constexpr
auto IAccess = number<i_access_>{};
526 constexpr
auto pre_nop_ = [&]() {
527 if constexpr(pre_nop && i_access_ == 0)
530 return bool_constant<false>{};
539 smem, bottom_tensor_thread_coord, 0, bottom_tensor_flag, pre_nop_);
542 if constexpr(i_access_ != (
NumAccess - 1))
551 template <
typename LdsTileWindow_,
index_t i_access = -1,
bool oob_conditional_check =
true>
556 using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
557 using LdsDataType =
typename LdsTileWindow::DataType;
558 using vector_t =
typename traits::vector_t;
561 static_assert(Base::BottomTensorView::buffer_view::get_address_space() ==
563 "Requires global memory");
566 const auto window_origin = lds_tile.get_window_origin();
567 const auto& bottom_tensor_view = lds_tile.get_bottom_tensor_view();
568 const auto& tensor_descriptor = bottom_tensor_view.get_tensor_descriptor();
569 auto smem_base_ptr = bottom_tensor_view.get_buffer_view().p_data_;
571 auto issue = [&](
auto i_access_) {
572 constexpr
auto IAccess = number<i_access_>{};
580 auto lds_bottom_tensor_thread_idx =
581 window_origin + window_adaptor_coord.get_bottom_index();
582 const auto lds_coord =
585 CK_TILE_LDS_ADDR LdsDataType* smem = smem_base_ptr + lds_coord.get_offset();
590 bottom_tensor_thread_coord,
593 bool_constant<oob_conditional_check>{});
599 template <
typename Policy,
index_t i_access_unsupport_ = -1,
bool oob_conditional_check =
true>
603 auto dst_tensor = make_static_distributed_tensor<typename Base::DataType>(tile_dstr);
604 this->
template load_transpose_linear<Policy>(
609 template <
typename Policy,
610 typename DistributedTensor,
612 bool oob_conditional_check =
true>
617 using vector_t =
typename traits::vector_t;
618 using SFC_Ys =
typename traits::SFC_Ys;
622 constexpr
auto group_func = Policy::group_func;
624 auto issue = [&](
auto i_access_) {
625 constexpr
auto IAccess = number<i_access_>{};
630 constexpr
auto idx_ys_start = SFC_Ys::get_index(IAccess);
633 const vector_t vec_value =
635 bottom_tensor_thread_coord, 0);
637 static_for<0, traits::ScalarPerVector, 1>{}([&](
auto j) {
640 return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
642 number<Base::NDimY>{});
644 constexpr
index_t linear_distributed_index =
645 tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
646 dst_tensor.get_thread_buffer().template at<linear_distributed_index>() =
647 vec_value.template get_as<typename Base::DataType>()[j];
653 template <
index_t i_access = -1,
bool oob_conditional_check =
true>
660 using vector_t =
typename Base::Traits::vector_t;
661 using SFC_Ys =
typename Base::Traits::SFC_Ys;
666 auto issue = [&](
auto i_access_) {
667 constexpr
auto IAccess = number<i_access_>{};
673 constexpr
auto idx_ys_start = SFC_Ys::get_index(IAccess);
678 static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](
auto j) {
681 return jj == Base::Traits::VectorDimY ? (idx_ys_start[jj] + j)
684 number<Base::NDimY>{});
686 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
687 Base::Traits::PackedSize;
689 vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
690 dstr_tensor.get_thread_buffer().template at<d>();
695 bottom_tensor_thread_coord,
699 bool_constant<oob_conditional_check>{});
705 template <
index_t i_access = -1>
711 using vector_t =
typename Base::Traits::vector_t;
712 using SFC_Ys =
typename Base::Traits::SFC_Ys;
715 static constexpr
bool oob_conditional_check =
true;
718 auto issue = [&](
auto i_access_) {
719 constexpr
auto IAccess = number<i_access_>{};
726 constexpr
auto idx_ys_start = SFC_Ys::get_index(IAccess);
730 static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](
auto j) {
733 return jj == Base::Traits::VectorDimY ? (idx_ys_start[jj] + j)
736 number<Base::NDimY>{});
737 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
738 Base::Traits::PackedSize;
739 vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
745 .template set_vectorized_elements_raw<vector_t, oob_conditional_check>(
746 bottom_tensor_thread_coord, linear_offset, bottom_tensor_flag, vec_value);
752 template <
index_t i_access = -1,
bool oob_conditional_check =
true>
760 using vector_t =
typename Base::Traits::vector_t;
761 using SFC_Ys =
typename Base::Traits::SFC_Ys;
766 auto issue = [&](
auto i_access_) {
767 constexpr
auto IAccess = number<i_access_>{};
774 constexpr
auto idx_ys_start = SFC_Ys::get_index(IAccess);
779 static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](
auto j) {
782 return jj == Base::Traits::VectorDimY ? (idx_ys_start[jj] + j)
785 number<Base::NDimY>{});
787 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
788 Base::Traits::PackedSize;
790 vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
796 bottom_tensor_thread_coord,
800 bool_constant<oob_conditional_check>{});
806 template <
index_t i_access = -1,
bool oob_conditional_check =
true,
bool pre_nop =
false>
812 bool_constant<pre_nop> = {})
const
815 using vector_t =
typename Base::Traits::vector_t;
816 using SFC_Ys =
typename Base::Traits::SFC_Ys;
821 auto issue = [&](
auto i_access_) {
822 constexpr
auto IAccess = number<i_access_>{};
829 constexpr
auto idx_ys_start = SFC_Ys::get_index(IAccess);
834 static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](
auto j) {
837 return jj == Base::Traits::VectorDimY ? (idx_ys_start[jj] + j)
840 number<Base::NDimY>{});
842 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
843 Base::Traits::PackedSize;
845 vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
851 bottom_tensor_thread_coord,
855 bool_constant<oob_conditional_check>{},
856 bool_constant<pre_nop>{});
868 constexpr
auto need_update_non_linear_coord =
871 if constexpr(need_update_non_linear_coord)
898 this->
window_origin_ + window_adaptor_thread_coord_tmp.get_bottom_index();
904 using SFC_Ys =
typename Base::Traits::SFC_Ys;
908 constexpr
auto need_save_non_linear_coord =
911 if constexpr(need_save_non_linear_coord)
917 if constexpr(i_access != (
NumAccess - 1))
919 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(i_access);
925 window_adaptor_thread_coord_tmp,
926 bottom_tensor_thread_coord_tmp,
939 #undef WINDOW_DISPATCH_ISSUE
942 template <address_space_enum, index_t len_>
948 template <index_t len_>
956 template <index_t len_>
964 template <
typename TensorView_>
967 TensorView_::get_num_of_dimension()>::type;
985 template <
typename TensorView_,
986 typename WindowLengths_,
987 typename StaticTileDistribution_,
991 const WindowLengths_& window_lengths,
992 const multi_index<TensorView_::get_num_of_dimension()>& origin,
994 LinearBottomDims_ = {})
996 static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
997 return tile_window_linear<remove_cvref_t<TensorView_>,
998 remove_cvref_t<WindowLengths_>,
999 remove_cvref_t<StaticTileDistribution_>,
1000 remove_cvref_t<LinearBottomDims_>>{
1001 tensor_view, window_lengths, origin, tile_distribution};
1005 typename TileWindow_,
1006 typename StaticTileDistribution_,
1007 typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
1011 LinearBottomDims_ = {})
1014 tile_window.get_window_lengths(),
1015 tile_window.get_window_origin(),
1017 LinearBottomDims_{});
1021 template <
typename TensorView_,
1022 typename WindowLengths_,
1023 typename StaticTileDistribution_,
1024 typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
1027 const WindowLengths_& window_lengths,
1028 const multi_index<TensorView_::get_num_of_dimension()>& origin,
1030 LinearBottomDims_ = {})
1032 static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
1033 auto w = tile_window_linear<remove_cvref_t<TensorView_>,
1034 remove_cvref_t<WindowLengths_>,
1035 remove_cvref_t<StaticTileDistribution_>,
1036 remove_cvref_t<LinearBottomDims_>>{
1037 tensor_view, window_lengths, origin, tile_distribution};
1043 typename TileWindow_,
1044 typename StaticTileDistribution_,
1045 typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
1049 LinearBottomDims_ = {})
1052 tile_window.get_window_lengths(),
1053 tile_window.get_window_origin(),
1055 LinearBottomDims_{});
1058 template <
typename TensorView_,
1059 typename WindowLengths_,
1060 typename StaticTileDistribution_,
1061 typename LinearBottomDims_>
1067 StaticTileDistribution_,
1068 LinearBottomDims_>::BottomTensorIndex& step)
1081 template <
typename T>
1097 template <
typename BottomTensorView_,
1098 typename WindowLengths_,
1099 typename StaticTileDistribution_,
1100 typename LinearBottomDims_>
1103 StaticTileDistribution_,
1115 template <
typename T>
#define CK_TILE_DEVICE
Definition: config.hpp:40
#define CK_TILE_LDS_ADDR
Definition: config.hpp:57
Definition: cluster_descriptor.hpp:13
typename impl::default_linear_bottom_dims_impl< TensorView_::buffer_view::get_address_space(), TensorView_::get_num_of_dimension()>::type default_linear_bottom_dims
Definition: tile_window_linear.hpp:967
CK_TILE_DEVICE index_t get_lane_id()
Definition: arch.hpp:72
constexpr CK_TILE_HOST_DEVICE void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const Index &coord_step)
Definition: tensor_coordinate.hpp:72
constexpr CK_TILE_HOST_DEVICE auto make_tensor_adaptor_coordinate(const Adaptor &adaptor, const TopIndex &idx_top)
Definition: tensor_adaptor_coordinate.hpp:55
constant< b > bool_constant
Definition: integral_constant.hpp:39
int32_t index_t
Definition: integer.hpp:9
constexpr CK_TILE_HOST_DEVICE auto make_tensor_coordinate(const TensorDesc &tensor_desc, const TopIndex &idx_top)
Definition: tensor_coordinate.hpp:60
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:21
constant< v > number
Definition: integral_constant.hpp:33
constexpr CK_TILE_HOST_DEVICE index_t reduce_on_sequence(Seq, Reduce f, number< Init >)
Definition: sequence.hpp:973
constexpr CK_TILE_HOST_DEVICE bool coordinate_has_valid_offset_assuming_top_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition: tensor_coordinate.hpp:79
CK_TILE_DEVICE auto make_tile_window_linear_raw(const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={})
Definition: tile_window_linear.hpp:1026
CK_TILE_DEVICE index_t get_warp_id()
Definition: arch.hpp:74
constexpr bool is_tile_window_linear_v
Helper variable template to check if a type is a linear tile window.
Definition: tile_window_linear.hpp:1116
CK_TILE_DEVICE void move_tile_window(null_tile_window< WindowLengths > &, const typename null_tile_window< WindowLengths >::BottomTensorIndex &)
Definition: null_tile_window.hpp:92
constexpr CK_TILE_DEVICE auto make_tile_window_linear(const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={})
Definition: tile_window_linear.hpp:990
constexpr CK_TILE_HOST_DEVICE auto generate_tuple(F &&f, number< N >)
Definition: tuple.hpp:406
CK_TILE_DEVICE void m0_set_with_memory(index_t v)
Definition: utility.hpp:19
constexpr CK_TILE_HOST_DEVICE auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:337
address_space_enum
Definition: arch.hpp:34
CK_TILE_DEVICE void m0_inc_with_memory(index_t v)
Definition: utility.hpp:25
constexpr CK_TILE_HOST_DEVICE auto histogram_sorted_sequence(SeqSortedSamples, sequence< r, rs... >)
Definition: sequence.hpp:1093
constexpr CK_TILE_HOST_DEVICE auto container_concat(const X &x, const Ys &... ys)
Definition: container_helper.hpp:363
constexpr auto prefix_sum_sequence(Seq)
Definition: sequence.hpp:899
bool_constant< false > false_type
Definition: integral_constant.hpp:63
bool_constant< true > true_type
Definition: integral_constant.hpp:62
Definition: sequence.hpp:278
A fixed-size array container similar to std::array with additional utilities.
Definition: array.hpp:43
Definition: integral_constant.hpp:13
typename sequence_merge< typename uniform_sequence_gen< len_ - 1, 0 >::type, sequence< 1 > >::type type
Definition: tile_window_linear.hpp:953
typename uniform_sequence_gen< len_, 1 >::type type
Definition: tile_window_linear.hpp:960
Definition: tile_window_linear.hpp:944
typename uniform_sequence_gen< len_, 0 >::type type
Definition: tile_window_linear.hpp:945
Type trait to determine if a type is a linear tile window.
Definition: tile_window_linear.hpp:1083
Definition: sequence.hpp:227
Definition: sequence.hpp:52
Definition: static_distributed_tensor.hpp:21
constexpr CK_TILE_HOST_DEVICE const auto & get_thread_buffer() const
Definition: static_distributed_tensor.hpp:58
Definition: functional.hpp:43
Definition: tensor_view.hpp:41
Definition: tile_distribution.hpp:72
constexpr CK_TILE_HOST_DEVICE const auto & get_ps_ys_to_xs_adaptor() const
Definition: tile_distribution.hpp:126
BottomTensorView bottom_tensor_view_
Definition: tile_window_base.hpp:85
remove_cvref_t< typename BottomTensorView::DataType > DataType
Definition: tile_window_base.hpp:36
BottomTensorIndex window_origin_
Definition: tile_window_base.hpp:79
constexpr CK_TILE_DEVICE auto get_bottom_tensor_view() const
Definition: tile_window_base.hpp:47
CK_TILE_DEVICE void move(const BottomTensorIndex &step)
Definition: tile_window_base.hpp:67
remove_reference_t< BottomTensorView_ > BottomTensorView
Definition: tile_window_base.hpp:33
remove_cvref_t< WindowLengths_ > WindowLengths
Definition: tile_window_base.hpp:34
WindowLengths window_lengths_
Definition: tile_window_base.hpp:81
Definition: tile_window_linear.hpp:72
decltype(get_non_linear_access_histogram_prefix_sum()) AccessPrefixSum_NonLinear
Definition: tile_window_linear.hpp:173
decltype(get_non_linear_access_map()) AccessMap_NonLinear
Definition: tile_window_linear.hpp:171
static constexpr index_t NumAccess_NonLinear
Definition: tile_window_linear.hpp:170
decltype(get_non_linear_access_histogram()) AccessHistogram_NonLinear
Definition: tile_window_linear.hpp:172
Definition: tile_window_linear.hpp:55
static constexpr auto I0
Definition: tile_window_linear.hpp:68
CK_TILE_DEVICE void set_window_origin_extended(const typename Base::BottomTensorIndex &)
Definition: tile_window_linear.hpp:889
CK_TILE_DEVICE auto load(number< i_access >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window_linear.hpp:311
constexpr CK_TILE_DEVICE tile_window_linear()=default
array< typename Base::WindowAdaptorCoord, traits::NumAccess_NonLinear > cached_window_adaptor_coords_
Definition: tile_window_linear.hpp:935
CK_TILE_DEVICE auto async_load(LdsTileWindow_ &&lds_tile, number< i_access >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window_linear.hpp:552
CK_TILE_DEVICE void load_raw(DstTile &dst_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) const
Definition: tile_window_linear.hpp:419
static constexpr CK_TILE_DEVICE index_t get_bottom_linear_offset(number< i_access >)
Definition: tile_window_linear.hpp:273
CK_TILE_DEVICE auto load_transpose() const
Definition: tile_window_linear.hpp:600
typename traits::AccessHistogram_NonLinear AccessHistogram_NonLinear
Definition: tile_window_linear.hpp:179
typename traits::AccessMap_NonLinear AccessMap_NonLinear
Definition: tile_window_linear.hpp:178
constexpr CK_TILE_DEVICE tile_window_linear(const typename Base::BottomTensorView &bottom_tensor_view, const typename Base::WindowLengths &window_lengths, const typename Base::BottomTensorIndex &window_origin, const typename Base::TileDstr &tile_distribution)
Definition: tile_window_linear.hpp:184
static constexpr index_t NumAccess
Definition: tile_window_linear.hpp:176
CK_TILE_DEVICE void store_raw(const static_distributed_tensor< typename Base::DataType, typename Base::TileDstr > &dstr_tensor, number< i_access >={}) const
Definition: tile_window_linear.hpp:707
array< bool, Base::Traits::NumAccess > cached_flags_
Definition: tile_window_linear.hpp:936
static constexpr CK_TILE_DEVICE auto get_bottom_linear_coordinate(number< i_access >)
Definition: tile_window_linear.hpp:243
CK_TILE_DEVICE void update(const static_distributed_tensor< typename Base::DataType, typename Base::TileDstr > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window_linear.hpp:754
CK_TILE_DEVICE void store(const static_distributed_tensor< typename Base::DataType, typename Base::TileDstr > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window_linear.hpp:654
CK_TILE_DEVICE void update_raw(const static_distributed_tensor< typename Base::DataType, typename Base::TileDstr > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) const
Definition: tile_window_linear.hpp:808
typename traits::AccessPrefixSum_NonLinear AccessPrefixSum_NonLinear
Definition: tile_window_linear.hpp:180
CK_TILE_DEVICE auto load_transpose_linear(DistributedTensor &dst_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window_linear.hpp:613
static constexpr index_t NumAccess_NonLinear
Definition: tile_window_linear.hpp:177
CK_TILE_DEVICE auto load(DstTile &dst_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window_linear.hpp:363
CK_TILE_DEVICE void move_extended(const typename Base::BottomTensorIndex &step)
Definition: tile_window_linear.hpp:863
array< typename Base::BottomTensorCoord, traits::NumAccess_NonLinear > cached_coords_
Definition: tile_window_linear.hpp:933
CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_ &&lds_tile, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) const
Definition: tile_window_linear.hpp:482
remove_cvref_t< LinearBottomDims_ > LinearBottomDims
Definition: tile_window_linear.hpp:64
static constexpr auto I1
Definition: tile_window_linear.hpp:69
Definition: tile_window_base.hpp:94
static constexpr index_t NDimY
Definition: tile_window_base.hpp:103
remove_cvref_t< StaticTileDistribution_ > TileDstr
Definition: tile_window_base.hpp:95
CK_TILE_DEVICE void move_window_adaptor_and_bottom_tensor_thread_coordinate(WindowAdaptorCoord &window_adaptor_thread_coord, BottomTensorCoord &bottom_tensor_thread_coord, const ATopIndex &idx_diff_adaptor_top) const
Definition: tile_window_base.hpp:129
TileDstr tile_dstr_
Definition: tile_window_base.hpp:253
#define WINDOW_DISPATCH_ISSUE()
Definition: tile_window_linear.hpp:22
#define TO_SEQUENCE(a, n)
Definition: to_sequence.hpp:10