21 #define WINDOW_DISPATCH_ISSUE() \
22 if constexpr(i_access < 0) \
24 static_for<0, NumAccess, 1>{}([&](auto ia) { issue(ia); }); \
28 static_assert(i_access < NumAccess); \
29 issue(number<i_access>{}); \
41 template <
typename BottomTensorView_,
42 typename WindowLengths_,
43 typename StaticTileDistribution_,
44 typename LinearBottomDims_>
57 static_assert(LinearBottomDims::size() == BottomTensorView::get_num_of_dimension());
62 static constexpr
index_t NDimP = TileDstr::get_num_of_dimension_p();
63 static constexpr
index_t NDimY = TileDstr::get_num_of_dimension_y();
71 "wrong! lengths should be static");
74 static_assert(
NDimBottomTensor == WindowAdaptor::get_num_of_bottom_dimension(),
75 "wrong! inconsistent # of diemsnions");
90 CK_TILE_DEVICE static constexpr
auto get_window_adaptor_ys_safe_vector_length_strides()
93 const auto [bottom_tensor_top_dim_vector_lengths,
94 bottom_tensor_top_dim_vector_strides] =
95 BottomTensorDesc::get_top_dimension_safe_vector_length_strides();
98 const auto window_adaptor_bottom_dim_vector_lengths =
99 bottom_tensor_top_dim_vector_lengths;
100 const auto window_adaptor_bottom_dim_vector_strides =
101 bottom_tensor_top_dim_vector_strides;
104 array<
index_t, WindowAdaptor::get_num_of_hidden_dimension()>
105 window_adaptor_vector_lengths{-1};
106 array<
index_t, WindowAdaptor::get_num_of_hidden_dimension()>
107 window_adaptor_vector_strides{-1};
109 constexpr
auto window_adaptor_bottom_dims =
110 WindowAdaptor::get_bottom_dimension_hidden_ids();
113 window_adaptor_bottom_dims,
114 window_adaptor_bottom_dim_vector_lengths);
116 window_adaptor_bottom_dims,
117 window_adaptor_bottom_dim_vector_strides);
119 const auto [window_adaptor_ps_ys_vector_lengths, window_adaptor_ps_ys_vector_strides] =
120 WindowAdaptor{}.get_top_dimension_safe_vector_length_strides(
121 window_adaptor_vector_lengths, window_adaptor_vector_strides);
124 constexpr
auto y_dims =
133 static constexpr
auto get_vector_dim_y_scalar_per_vector()
135 const auto [ys_vector_lengths, ys_vector_strides] =
136 get_window_adaptor_ys_safe_vector_length_strides();
143 if(ys_vector_strides[i] == 1 && ys_vector_lengths[i] > ScalarPerVector_)
145 ScalarPerVector_ = ys_vector_lengths[i];
150 return make_tuple(VectorDimY_, ScalarPerVector_);
156 get_vector_dim_y_scalar_per_vector().template at<1>();
161 static constexpr
auto scalars_per_access_ = [] {
166 constexpr
auto NDimY_ =
NDimY;
168 return TO_SEQUENCE(scalars_per_access_arr, NDimY_);
171 static constexpr
auto get_space_filling_curve()
173 constexpr
auto thread_tensor_lengths_ys =
181 decltype(scalars_per_access_),
186 using SFC_Ys = decltype(get_space_filling_curve());
190 static_assert(0 <
NumAccess,
"Wrong! NumAccess should be larger than 0");
193 static constexpr
auto get_num_non_linear_access()
195 constexpr
auto sfc_access_lens = SFC_Ys::access_lengths;
196 using ys_to_rhs_major =
197 typename decltype(
TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
199 constexpr
auto non_linear = [&]() {
202 constexpr
auto rhs_major = ys_to_rhs_major{}[i_dim_y];
203 constexpr
auto target_h_dim =
number<rhs_major - 1>{};
206 cnt *= sfc_access_lens[i_dim_y];
228 static constexpr
auto get_non_linear_access_map()
230 constexpr
auto sfc_access_lens = SFC_Ys::access_lengths;
231 using ys_to_rhs_major =
232 typename decltype(
TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
233 constexpr
auto non_linear_map = [&]() {
234 array<index_t, NumAccess> m_{0};
236 index_t cumulative_non_linear_len_ = 1;
237 static_for<0, NDimY, 1>{}([&](
auto i_y) {
238 constexpr
auto i_dim_y =
number<
NDimY - i_y - 1>{};
239 constexpr
auto rhs_major = ys_to_rhs_major{}[i_dim_y];
240 constexpr
auto target_h_dim =
number<rhs_major - 1>{};
243 array<index_t, NumAccess> current_m_{0};
244 constexpr
auto current_len_ = sfc_access_lens[i_dim_y];
247 for(
auto i_ = 0; i_ < cumulative_len_; i_++)
249 current_m_(i_) = m_[i_];
251 for(
auto j_ = 0; j_ < current_len_; j_++)
253 auto j_offset_ = is_linear_dim ? 0 : j_ * cumulative_non_linear_len_;
254 for(
auto i_ = 0; i_ < cumulative_len_; i_++)
256 m_(j_ * cumulative_len_ + i_) = current_m_[i_] + j_offset_;
259 cumulative_len_ *= current_len_;
261 cumulative_non_linear_len_ *= current_len_;
269 static constexpr
auto get_non_linear_access_histogram()
271 constexpr
auto m_ = get_non_linear_access_map();
275 typename arithmetic_sequence_gen<0, get_num_non_linear_access() + 1, 1>::type{};
282 static constexpr
auto get_non_linear_access_histogram_prefix_sum()
284 constexpr
auto h_ = get_non_linear_access_histogram();
286 return h_prefix_sum_;
321 window_origin + window_adaptor_thread_coord_tmp.get_bottom_index();
331 constexpr
auto need_save_non_linear_coord =
334 if constexpr(need_save_non_linear_coord)
345 if constexpr(i_access != (
NumAccess - 1))
347 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(i_access);
353 window_adaptor_thread_coord_tmp,
354 bottom_tensor_thread_coord_tmp,
383 template <
typename ATopIndex>
387 const ATopIndex& idx_diff_adaptor_top)
const
392 window_adaptor_thread_coord,
393 idx_diff_adaptor_top,
394 idx_diff_adaptor_bottom);
397 bottom_tensor_thread_coord,
398 idx_diff_adaptor_bottom);
401 template <index_t i_access>
406 using ys_to_rhs_major =
407 typename decltype(
TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
411 constexpr
auto rhs_major = ys_to_rhs_major{}[i_dim_y];
412 constexpr
auto target_h_dim =
number<rhs_major - 1>{};
424 constexpr
auto adaptor_ =
TileDstr{}.get_ps_ys_to_xs_adaptor();
425 constexpr
auto idx_ =
428 return adaptor_.calculate_bottom_index(idx_);
431 template <index_t i_access>
435 constexpr
auto is_pure_linear_tensor =
437 if constexpr(is_pure_linear_tensor)
441 auto bottom_tensor_coord =
443 return bottom_tensor_coord.get_offset();
451 constexpr
index_t linear_offset = [&]() {
452 constexpr
auto x_idx_ = linear_coord;
453 constexpr
auto x_len_ =
TileDstr{}.get_lengths();
454 static_assert(x_idx_.size() == x_len_.size());
455 constexpr
index_t x_dims_ = x_idx_.size();
459 auto r_i_ =
number<x_dims_ - i_ - 1>{};
460 cu_offset_ += x_idx_[r_i_] * cu_stride_;
461 cu_stride_ *= x_len_[r_i_];
465 return linear_offset;
471 template <
index_t i_access = -1,
bool oob_conditional_check =
true>
477 constexpr
auto tile_dstr =
TileDstr{};
479 auto dst_tensor = make_static_distributed_tensor<DataType>(tile_dstr);
481 auto issue = [&](
auto i_access_) {
482 constexpr
auto IAccess = number<i_access_>{};
491 const vector_t vec_value =
493 bottom_tensor_thread_coord,
496 bool_constant<oob_conditional_check>{});
499 constexpr
auto idx_diff_ys = SFC_Ys::get_index(IAccess);
501 static_for<0, traits::ScalarPerVector, 1>{}([&](
auto j) {
508 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
510 dst_tensor.get_thread_buffer().template at<d>() =
511 vec_value.template get_as<DataType>()[j];
514 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
517 dst_tensor.get_thread_buffer().template get_as<vector_t>()(
527 template <
typename DstTile,
index_t i_access = -1,
bool oob_conditional_check =
true>
535 constexpr
auto tile_dstr =
TileDstr{};
539 auto issue = [&](
auto i_access_) {
540 constexpr
auto IAccess = number<i_access_>{};
549 const vector_t vec_value =
551 bottom_tensor_thread_coord,
554 bool_constant<oob_conditional_check>{});
557 constexpr
auto idx_diff_ys = SFC_Ys::get_index(IAccess);
559 static_for<0, traits::ScalarPerVector, 1>{}([&](
auto j) {
566 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
568 dst_tensor.get_thread_buffer().template at<d>() =
569 vec_value.template get_as<DataType>()[j];
572 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
575 dst_tensor.get_thread_buffer().template get_as<vector_t>()(
585 template <
typename DstTile,
587 bool oob_conditional_check =
true,
588 bool pre_nop =
false>
592 bool_constant<pre_nop> = {})
const
596 static constexpr
index_t YElementSize =
597 TileDstr{}.get_ys_to_d_descriptor().get_element_space_size();
601 constexpr
auto tile_dstr =
TileDstr{};
603 auto& dst_vec_tbuf =
reinterpret_cast<vectorized_tbuf&
>(dst_tensor.get_thread_buffer());
605 auto issue = [&](
auto i_access_) {
606 constexpr
auto IAccess = number<i_access_>{};
607 constexpr
auto pre_nop_ = [&]() {
608 if constexpr(pre_nop && i_access_ == 0 &&
609 BottomTensorView::buffer_view::get_address_space() ==
611 return bool_constant<true>{};
613 return bool_constant<false>{};
622 constexpr
auto idx_ys_start = SFC_Ys::get_index(IAccess);
623 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
627 dst_vec_tbuf.template at<d / traits::ScalarPerVector>(),
628 bottom_tensor_thread_coord,
631 bool_constant<oob_conditional_check>{},
633 #if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE || \
634 CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
643 template <
typename LdsTileWindow_,
645 bool oob_conditional_check =
true,
646 bool pre_nop =
false>
650 bool_constant<pre_nop> = {})
const
652 using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
653 using LdsDataType =
typename LdsTileWindow::DataType;
658 static_assert(BottomTensorView::buffer_view::get_address_space() ==
662 static_assert(LdsTileWindow::get_num_of_dimension() == 3);
665 lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
666 make_tuple(number<0>{}, number<0>{}, number<0>{})) *
670 lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
671 make_tuple(number<0>{}, number<1>{}, number<0>{})) *
672 sizeof(LdsDataType) -
676 lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
677 make_tuple(number<1>{}, number<0>{}, number<0>{})) *
678 sizeof(LdsDataType) -
686 LdsDataType* smem = lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_;
689 auto issue = [&](
auto i_access_) {
690 constexpr
auto IAccess = number<i_access_>{};
691 constexpr
auto pre_nop_ = [&]() {
692 if constexpr(pre_nop && i_access_ == 0)
695 return bool_constant<false>{};
704 smem, bottom_tensor_thread_coord, 0, bottom_tensor_flag, pre_nop_);
707 if constexpr(i_access_ != (
NumAccess - 1))
716 template <
typename LdsTileWindow_,
index_t i_access = -1,
bool oob_conditional_check =
true>
721 using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
722 using LdsDataType =
typename LdsTileWindow::DataType;
727 static_assert(BottomTensorView::buffer_view::get_address_space() ==
731 static_assert(LdsTileWindow::get_num_of_dimension() == 3);
736 constexpr
index_t size_per_buf =
737 lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
738 make_tuple(number<0>{}, number<0>{}, number<0>{}));
740 constexpr
index_t size_per_wave =
741 lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
742 make_tuple(number<0>{}, number<1>{}, number<0>{})) -
745 constexpr
index_t size_per_issue =
746 lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
747 make_tuple(number<1>{}, number<0>{}, number<0>{})) -
756 lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_ + m0_init_value;
759 auto issue = [&](
auto i_access_) {
760 constexpr
auto IAccess = number<i_access_>{};
768 bottom_tensor_thread_coord,
771 bool_constant<oob_conditional_check>{});
774 if constexpr(i_access_ != (
NumAccess - 1))
776 smem += size_per_issue;
783 template <
index_t i_access = -1,
bool oob_conditional_check =
true>
792 constexpr
auto tile_dstr =
TileDstr{};
795 auto issue = [&](
auto i_access_) {
796 constexpr
auto IAccess = number<i_access_>{};
802 constexpr
auto idx_ys_start = SFC_Ys::get_index(IAccess);
807 static_for<0, traits::ScalarPerVector, 1>{}([&](
auto j) {
814 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
816 vec_value.template get_as<DataType>()(j) =
822 bottom_tensor_thread_coord,
826 bool_constant<oob_conditional_check>{});
832 template <
index_t i_access = -1>
839 constexpr
auto tile_dstr =
TileDstr{};
840 static constexpr
bool oob_conditional_check =
true;
843 auto issue = [&](
auto i_access_) {
844 constexpr
auto IAccess = number<i_access_>{};
851 constexpr
auto idx_ys_start = SFC_Ys::get_index(IAccess);
855 static_for<0, traits::ScalarPerVector, 1>{}([&](
auto j) {
861 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
862 vec_value.template get_as<DataType>()(j) =
868 .template set_vectorized_elements_raw<vector_t, oob_conditional_check>(
869 bottom_tensor_thread_coord, linear_offset, bottom_tensor_flag, vec_value);
875 template <
index_t i_access = -1,
bool oob_conditional_check =
true>
884 constexpr
auto tile_dstr =
TileDstr{};
887 auto issue = [&](
auto i_access_) {
888 constexpr
auto IAccess = number<i_access_>{};
895 constexpr
auto idx_ys_start = SFC_Ys::get_index(IAccess);
900 static_for<0, traits::ScalarPerVector, 1>{}([&](
auto j) {
907 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
909 vec_value.template get_as<DataType>()(j) =
915 bottom_tensor_thread_coord,
919 bool_constant<oob_conditional_check>{});
925 template <
index_t i_access = -1,
bool oob_conditional_check =
true,
bool pre_nop =
false>
929 bool_constant<pre_nop> = {})
const
935 constexpr
auto tile_dstr =
TileDstr{};
938 auto issue = [&](
auto i_access_) {
939 constexpr
auto IAccess = number<i_access_>{};
946 constexpr
auto idx_ys_start = SFC_Ys::get_index(IAccess);
951 static_for<0, traits::ScalarPerVector, 1>{}([&](
auto j) {
958 constexpr
index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
960 vec_value.template get_as<DataType>()(j) =
966 bottom_tensor_thread_coord,
970 bool_constant<oob_conditional_check>{},
971 bool_constant<pre_nop>{});
987 constexpr
auto need_update_non_linear_coord =
990 if constexpr(need_update_non_linear_coord)
1013 TileDstr{}.get_ps_ys_to_xs_adaptor(),
1018 window_origin_ + window_adaptor_thread_coord_tmp.get_bottom_index();
1028 constexpr
auto need_save_non_linear_coord =
1031 if constexpr(need_save_non_linear_coord)
1036 if constexpr(i_access != (
NumAccess - 1))
1038 constexpr
auto idx_diff_ys = SFC_Ys::get_forward_step(i_access);
1044 window_adaptor_thread_coord_tmp,
1045 bottom_tensor_thread_coord_tmp,
1073 #undef WINDOW_DISPATCH_ISSUE
1076 template <address_space_enum, index_t len_>
1082 template <index_t len_>
1090 template <index_t len_>
1098 template <
typename TensorView_>
1101 TensorView_::get_num_of_dimension()>::type;
1119 template <
typename TensorView_,
1120 typename WindowLengths_,
1121 typename StaticTileDistribution_,
1125 const WindowLengths_& window_lengths,
1126 const multi_index<TensorView_::get_num_of_dimension()>& origin,
1128 LinearBottomDims_ = {})
1130 static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
1131 return tile_window_linear<remove_cvref_t<TensorView_>,
1132 remove_cvref_t<WindowLengths_>,
1133 remove_cvref_t<StaticTileDistribution_>,
1134 remove_cvref_t<LinearBottomDims_>>{
1135 tensor_view, window_lengths, origin, tile_distribution};
1139 typename TileWindow_,
1140 typename StaticTileDistribution_,
1141 typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
1145 LinearBottomDims_ = {})
1148 tile_window.get_window_lengths(),
1149 tile_window.get_window_origin(),
1151 LinearBottomDims_{});
1155 template <
typename TensorView_,
1156 typename WindowLengths_,
1157 typename StaticTileDistribution_,
1158 typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
1161 const WindowLengths_& window_lengths,
1162 const multi_index<TensorView_::get_num_of_dimension()>& origin,
1164 LinearBottomDims_ = {})
1166 static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
1167 auto w = tile_window_linear<remove_cvref_t<TensorView_>,
1168 remove_cvref_t<WindowLengths_>,
1169 remove_cvref_t<StaticTileDistribution_>,
1170 remove_cvref_t<LinearBottomDims_>>{
1171 tensor_view, window_lengths, origin, tile_distribution};
1177 typename TileWindow_,
1178 typename StaticTileDistribution_,
1179 typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
1183 LinearBottomDims_ = {})
1186 tile_window.get_window_lengths(),
1187 tile_window.get_window_origin(),
1189 LinearBottomDims_{});
1192 template <
typename TensorView_,
1193 typename WindowLengths_,
1194 typename StaticTileDistribution_,
1195 typename LinearBottomDims_>
1201 StaticTileDistribution_,
1202 LinearBottomDims_>::BottomTensorIndex& step)
#define CK_TILE_DEVICE
Definition: config.hpp:40
#define CK_TILE_LDS_ADDR
Definition: config.hpp:56
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:41
Definition: cluster_descriptor.hpp:13
typename impl::default_linear_bottom_dims_impl< TensorView_::buffer_view::get_address_space(), TensorView_::get_num_of_dimension()>::type default_linear_bottom_dims
Definition: tile_window_linear.hpp:1101
CK_TILE_DEVICE index_t get_lane_id()
Definition: arch.hpp:69
constexpr CK_TILE_HOST_DEVICE void set_container_subset(array< T, N > &y, sequence< Is... > picks, const array< T, sizeof...(Is)> &x)
Definition: container_helper.hpp:420
constexpr CK_TILE_HOST_DEVICE void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const Index &coord_step)
Definition: tensor_coordinate.hpp:72
tuple_array< T, N > thread_buffer
Definition: thread_buffer.hpp:14
constexpr CK_TILE_HOST_DEVICE auto make_tensor_adaptor_coordinate(const Adaptor &adaptor, const TopIndex &idx_top)
Definition: tensor_adaptor_coordinate.hpp:55
constant< b > bool_constant
Definition: integral_constant.hpp:39
constexpr CK_TILE_HOST_DEVICE auto generate_array(F &&f, number< N >)
Definition: sequence.hpp:1106
int32_t index_t
Definition: integer.hpp:9
constexpr CK_TILE_HOST_DEVICE auto make_tensor_coordinate(const TensorDesc &tensor_desc, const TopIndex &idx_top)
Definition: tensor_coordinate.hpp:60
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:20
constant< v > number
Definition: integral_constant.hpp:33
constexpr CK_TILE_HOST_DEVICE index_t reduce_on_sequence(Seq, Reduce f, number< Init >)
Definition: sequence.hpp:973
constexpr CK_TILE_HOST_DEVICE bool coordinate_has_valid_offset_assuming_top_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition: tensor_coordinate.hpp:79
CK_TILE_DEVICE auto make_tile_window_linear_raw(const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={})
Definition: tile_window_linear.hpp:1160
CK_TILE_DEVICE index_t get_warp_id()
Definition: arch.hpp:71
constexpr CK_TILE_HOST_DEVICE auto to_sequence(tuple< number< Is >... >)
Definition: sequence.hpp:1046
CK_TILE_DEVICE void move_tile_window(null_tile_window< WindowLengths > &, const typename null_tile_window< WindowLengths >::BottomTensorIndex &)
Definition: null_tile_window.hpp:92
constexpr CK_TILE_DEVICE auto make_tile_window_linear(const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={})
Definition: tile_window_linear.hpp:1124
constexpr CK_TILE_HOST_DEVICE auto generate_tuple(F &&f, number< N >)
Definition: tuple.hpp:400
CK_TILE_DEVICE void m0_set_with_memory(index_t v)
Definition: utility.hpp:19
typename std::remove_reference< T >::type remove_reference_t
Definition: type_traits.hpp:14
constexpr CK_TILE_HOST_DEVICE auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:337
address_space_enum
Definition: arch.hpp:34
constexpr CK_TILE_HOST_DEVICE void move_tensor_adaptor_coordinate(const Adaptor &adaptor, AdaptorCoord &coord, const TopIndex &idx_diff_top, BottomIndex &idx_diff_bottom)
Definition: tensor_adaptor_coordinate.hpp:97
constexpr CK_TILE_HOST_DEVICE auto get_container_subset(const array< T, N > &arr, sequence< Is... >)
Definition: container_helper.hpp:389
CK_TILE_DEVICE void m0_inc_with_memory(index_t v)
Definition: utility.hpp:25
constexpr CK_TILE_HOST_DEVICE auto histogram_sorted_sequence(SeqSortedSamples, sequence< r, rs... >)
Definition: sequence.hpp:1093
impl::is_static_impl< remove_cvref_t< T > > is_static
Definition: type_traits.hpp:86
constexpr CK_TILE_HOST_DEVICE auto container_concat(const X &x, const Ys &... ys)
Definition: container_helper.hpp:363
constexpr auto prefix_sum_sequence(Seq)
Definition: sequence.hpp:899
Definition: sequence.hpp:278
typename std::conditional< kHasContent, type0, type1 >::type type
Definition: sequence.hpp:293
Definition: integral_constant.hpp:13
typename sequence_merge< typename uniform_sequence_gen< len_ - 1, 0 >::type, sequence< 1 > >::type type
Definition: tile_window_linear.hpp:1087
typename uniform_sequence_gen< len_, 1 >::type type
Definition: tile_window_linear.hpp:1094
Definition: tile_window_linear.hpp:1078
typename uniform_sequence_gen< len_, 0 >::type type
Definition: tile_window_linear.hpp:1079
Definition: type_traits.hpp:75
Definition: sequence.hpp:227
Definition: sequence.hpp:52
Definition: space_filling_curve.hpp:20
Definition: static_distributed_tensor.hpp:21
constexpr CK_TILE_HOST_DEVICE const auto & get_thread_buffer() const
Definition: static_distributed_tensor.hpp:56
Definition: functional.hpp:43
Definition: tensor_view.hpp:41
Definition: tile_distribution.hpp:72
constexpr CK_TILE_HOST_DEVICE const auto & get_ps_ys_to_xs_adaptor() const
Definition: tile_distribution.hpp:126
Definition: tile_window_linear.hpp:87
decltype(get_space_filling_curve()) SFC_Ys
Definition: tile_window_linear.hpp:186
decltype(get_non_linear_access_histogram_prefix_sum()) AccessPrefixSum_NonLinear
Definition: tile_window_linear.hpp:293
thread_buffer< DataType, ScalarPerVector > vector_t
Definition: tile_window_linear.hpp:158
static constexpr index_t NumAccess
Definition: tile_window_linear.hpp:188
static constexpr index_t VectorDimY
Definition: tile_window_linear.hpp:154
static constexpr index_t ScalarPerVector
Definition: tile_window_linear.hpp:155
decltype(get_non_linear_access_map()) AccessMap_NonLinear
Definition: tile_window_linear.hpp:291
static constexpr index_t NumAccess_NonLinear
Definition: tile_window_linear.hpp:290
decltype(get_non_linear_access_histogram()) AccessHistogram_NonLinear
Definition: tile_window_linear.hpp:292
Definition: tile_window_linear.hpp:46
CK_TILE_DEVICE void move_window_adaptor_and_bottom_tensor_thread_coordinate(WindowAdaptorCoord &window_adaptor_thread_coord, BottomTensorCoord &bottom_tensor_thread_coord, const ATopIndex &idx_diff_adaptor_top) const
Definition: tile_window_linear.hpp:384
static constexpr auto I0
Definition: tile_window_linear.hpp:65
array< bool, traits::NumAccess > cached_flags_
Definition: tile_window_linear.hpp:1070
constexpr CK_TILE_DEVICE auto get_num_of_access() const
Definition: tile_window_linear.hpp:469
CK_TILE_DEVICE auto load(number< i_access >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window_linear.hpp:472
array< BottomTensorCoord, traits::NumAccess_NonLinear > cached_coords_
Definition: tile_window_linear.hpp:1069
constexpr CK_TILE_DEVICE tile_window_linear()=default
static constexpr index_t NDimWindowAdaptorTop
Definition: tile_window_linear.hpp:59
CK_TILE_DEVICE auto async_load(LdsTileWindow_ &&lds_tile, number< i_access >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window_linear.hpp:717
array< index_t, NDimWindowAdaptorTop > AdaptorTopIndex
Definition: tile_window_linear.hpp:77
CK_TILE_DEVICE void load_raw(DstTile &dst_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) const
Definition: tile_window_linear.hpp:589
constexpr CK_TILE_DEVICE auto get_tile_distribution() const
Definition: tile_window_linear.hpp:369
static constexpr CK_TILE_DEVICE bool has_static_tile_distribution()
Definition: tile_window_linear.hpp:362
static constexpr index_t NDimY
Definition: tile_window_linear.hpp:63
static constexpr CK_TILE_DEVICE index_t get_bottom_linear_offset(number< i_access >)
Definition: tile_window_linear.hpp:432
static constexpr index_t NDimP
Definition: tile_window_linear.hpp:62
typename traits::AccessHistogram_NonLinear AccessHistogram_NonLinear
Definition: tile_window_linear.hpp:299
typename traits::AccessMap_NonLinear AccessMap_NonLinear
Definition: tile_window_linear.hpp:298
static constexpr index_t NumAccess
Definition: tile_window_linear.hpp:296
constexpr CK_TILE_DEVICE auto get_window_lengths() const
Definition: tile_window_linear.hpp:367
TileDstr tile_dstr_
Definition: tile_window_linear.hpp:1066
static constexpr CK_TILE_DEVICE auto get_bottom_linear_coordinate(number< i_access >)
Definition: tile_window_linear.hpp:402
constexpr CK_TILE_DEVICE void set_bottom_tensor_view_data_ptr(typename BottomTensorView::DataType *data)
Definition: tile_window_linear.hpp:376
array< index_t, NDimBottomTensor > BottomTensorIndex
Definition: tile_window_linear.hpp:78
remove_cvref_t< typename BottomTensorView::DataType > DataType
Definition: tile_window_linear.hpp:54
CK_TILE_DEVICE void move(const BottomTensorIndex &step)
Definition: tile_window_linear.hpp:980
remove_cvref_t< StaticTileDistribution_ > TileDstr
Definition: tile_window_linear.hpp:49
CK_TILE_DEVICE void update(const static_distributed_tensor< DataType, TileDstr > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window_linear.hpp:876
static constexpr index_t NDimBottomTensor
Definition: tile_window_linear.hpp:60
typename traits::AccessPrefixSum_NonLinear AccessPrefixSum_NonLinear
Definition: tile_window_linear.hpp:300
BottomTensorIndex window_origin_
Definition: tile_window_linear.hpp:1061
WindowLengths window_lengths_
Definition: tile_window_linear.hpp:1058
static constexpr index_t NumAccess_NonLinear
Definition: tile_window_linear.hpp:297
typename TileDstr::PsYs2XsAdaptor WindowAdaptor
Definition: tile_window_linear.hpp:51
decltype(make_tensor_coordinate(BottomTensorDesc{}, BottomTensorIndex{})) BottomTensorCoord
Definition: tile_window_linear.hpp:84
constexpr CK_TILE_DEVICE auto get_window_origin() const
Definition: tile_window_linear.hpp:373
CK_TILE_DEVICE void set_window_origin(const BottomTensorIndex &new_window_origin)
Definition: tile_window_linear.hpp:1008
typename BottomTensorView::TensorDesc BottomTensorDesc
Definition: tile_window_linear.hpp:52
CK_TILE_DEVICE void update_raw(const static_distributed_tensor< DataType, TileDstr > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) const
Definition: tile_window_linear.hpp:926
decltype(make_tensor_adaptor_coordinate(WindowAdaptor{}, AdaptorTopIndex{})) WindowAdaptorCoord
Definition: tile_window_linear.hpp:81
static constexpr CK_TILE_DEVICE index_t get_num_of_dimension()
Definition: tile_window_linear.hpp:360
CK_TILE_DEVICE void store_raw(const static_distributed_tensor< DataType, TileDstr > &dstr_tensor, number< i_access >={}) const
Definition: tile_window_linear.hpp:833
CK_TILE_DEVICE auto load(DstTile &dst_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window_linear.hpp:528
CK_TILE_HOST_DEVICE void init_raw()
Definition: tile_window_linear.hpp:1051
constexpr CK_TILE_DEVICE auto get_bottom_tensor_view() const
Definition: tile_window_linear.hpp:371
remove_reference_t< BottomTensorView_ > BottomTensorView
Definition: tile_window_linear.hpp:47
CK_TILE_DEVICE void store(const static_distributed_tensor< DataType, TileDstr > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}) const
Definition: tile_window_linear.hpp:784
CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_ &&lds_tile, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) const
Definition: tile_window_linear.hpp:647
remove_cvref_t< LinearBottomDims_ > LinearBottomDims
Definition: tile_window_linear.hpp:55
remove_cvref_t< WindowLengths_ > WindowLengths
Definition: tile_window_linear.hpp:48
static constexpr auto I1
Definition: tile_window_linear.hpp:66
BottomTensorView bottom_tensor_view_
Definition: tile_window_linear.hpp:1055
constexpr CK_TILE_DEVICE tile_window_linear(const BottomTensorView &bottom_tensor_view, const WindowLengths &window_lengths, const BottomTensorIndex &window_origin, const TileDstr &tile_distribution)
Definition: tile_window_linear.hpp:304
#define WINDOW_DISPATCH_ISSUE()
Definition: tile_window_linear.hpp:21
#define TO_SEQUENCE(a, n)
Definition: to_sequence.hpp:10