Composable Kernel: include/ck_tile/core/tensor/tile_window_linear.hpp Source File

tile_window_linear.hpp
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
19 
20 namespace ck_tile {
21 
22 #define WINDOW_DISPATCH_ISSUE() \
23  if constexpr(i_access < 0) \
24  { \
25  static_for<0, NumAccess, 1>{}([&](auto ia) { issue(ia); }); \
26  } \
27  else \
28  { \
29  static_assert(i_access < NumAccess); \
30  issue(number<i_access>{}); \
31  }
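// Added note (not from the original source): WINDOW_DISPATCH_ISSUE() expands
// against a local lambda named `issue` and the enclosing `i_access` template
// parameter. A minimal sketch of the two dispatch modes, assuming a window `w`:
//
//   w.load(number<-1>{}); // i_access < 0: static_for issues all NumAccess accesses
//   w.load(number<3>{});  // i_access >= 0: issues only access 3 (static_assert < NumAccess)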
32 
33 //
34 // This version of tile window will pre-cache offsets/flags based on need.
35 //
36 // LinearBottomDims_, e.g. seq<0, 1> for a 2d tensor: the last one is the linear dim,
37 // so the last dim can use an immediate offset for indexing, which can save registers.
38 // TODO: if using this struct, prefer load_raw()/store_raw(), which can control
39 // the immediate offset on the fly.
40 // The space-filling curve is non-snaked here!
41 // This struct inherits from tile_window_with_tile_dstr_base, which is an intermediary base class
42 // with the ultimate parent class being tile_window_base.
43 template <typename BottomTensorView_,
44  typename WindowLengths_,
45  typename StaticTileDistribution_,
46  typename LinearBottomDims_>
47 struct tile_window_linear
48  : public tile_window_with_tile_dstr_base<tile_window_linear<BottomTensorView_,
49  WindowLengths_,
50  StaticTileDistribution_,
51  LinearBottomDims_>,
52  BottomTensorView_,
53  WindowLengths_,
54  StaticTileDistribution_>
55 {
56  using Base = tile_window_with_tile_dstr_base<tile_window_linear<BottomTensorView_,
57  WindowLengths_,
58  StaticTileDistribution_,
59  LinearBottomDims_>,
60  BottomTensorView_,
61  WindowLengths_,
62  StaticTileDistribution_>;
63 
64  using LinearBottomDims = remove_cvref_t<LinearBottomDims_>;
65 
66  static_assert(LinearBottomDims::size() == Base::BottomTensorView::get_num_of_dimension());
67 
68  static constexpr auto I0 = number<0>{};
69  static constexpr auto I1 = number<1>{};
70 
71  struct traits
72  {
73  private:
74  static constexpr auto get_num_non_linear_access()
75  {
76  constexpr auto sfc_access_lens = Base::Traits::SFC_Ys::access_lengths;
77  using ys_to_rhs_major = typename decltype(
78  typename Base::TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
79 
80  constexpr auto non_linear = [&]() {
81  index_t cnt = 1;
82  static_for<0, Base::NDimY, 1>{}([&](auto i_dim_y) {
83  constexpr auto rhs_major = ys_to_rhs_major{}[i_dim_y];
84  constexpr auto target_h_dim = number<rhs_major - 1>{}; // no r dim here!
85  if constexpr(LinearBottomDims{}[target_h_dim] == 0)
86  {
87  cnt *= sfc_access_lens[i_dim_y];
88  }
89  });
90  return cnt;
91  }();
92 
93  return non_linear;
94  }
95 
96  // example:
97  // non_linear_access_map: sequence<0, 0, 0, 0, 1, 1, 1, 1> for 8 accesses, 2 registers
98  // used in total
99  // -> histogram : sequence<4, 4>
100  // -> prefixsum : sequence<0, 4, 8>
101  // non_linear_access_map: sequence<0, 1, 2, 3, 4, 5, 6, 7> for 8 accesses, 8 registers
102  // used in total, will pre-cache 8
103  // -> histogram : sequence<1, 1, 1, 1, 1, 1, 1, 1>
104  // -> prefixsum : sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>
105  // non_linear_access_map: sequence<0, 0, 1, 1, 2, 2, 3, 3> for 8 accesses, 4 registers
106  // used in total, will pre-cache 4
107  // -> histogram : sequence<2, 2, 2, 2>
108  // -> prefixsum : sequence<0, 2, 4, 6, 8>
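// Added note: the prefix sum is what decides which accesses must cache a fresh
// coordinate. E.g. for non_linear_access_map sequence<0, 0, 1, 1> the prefix sum
// is sequence<0, 2, 4>, so only accesses 0 and 2 satisfy
// AccessPrefixSum_NonLinear[map[i]] == i and store into cached_coords_;
// accesses 1 and 3 reuse those coords plus a constexpr linear offset.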
109  static constexpr auto get_non_linear_access_map()
110  {
111  constexpr auto sfc_access_lens = Base::Traits::SFC_Ys::access_lengths;
112  using ys_to_rhs_major = typename decltype(
113  typename Base::TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
114  constexpr auto non_linear_map = [&]() {
115  array<index_t, Base::Traits::NumAccess> m_{0};
116  index_t cumulative_len_ = 1;
117  index_t cumulative_non_linear_len_ = 1;
118  static_for<0, Base::NDimY, 1>{}([&](auto i_y) {
119  constexpr auto i_dim_y = number<Base::NDimY - i_y - 1>{}; // from right to left
120  constexpr auto rhs_major = ys_to_rhs_major{}[i_dim_y];
121  constexpr auto target_h_dim = number<rhs_major - 1>{}; // no r dim here!
122  constexpr auto is_linear_dim = LinearBottomDims{}[target_h_dim];
123 
124  array<index_t, Base::Traits::NumAccess> current_m_{0};
125  constexpr auto current_len_ = sfc_access_lens[i_dim_y];
126 
127  // copy cumulative length as current pattern
128  for(auto i_ = 0; i_ < cumulative_len_; i_++)
129  {
130  current_m_(i_) = m_[i_];
131  }
132  for(auto j_ = 0; j_ < current_len_; j_++)
133  {
134  auto j_offset_ = is_linear_dim ? 0 : j_ * cumulative_non_linear_len_;
135  for(auto i_ = 0; i_ < cumulative_len_; i_++)
136  {
137  m_(j_ * cumulative_len_ + i_) = current_m_[i_] + j_offset_;
138  }
139  }
140  cumulative_len_ *= current_len_;
141  if(!is_linear_dim)
142  cumulative_non_linear_len_ *= current_len_;
143  });
144  return m_;
145  }();
146 
147  return TO_SEQUENCE(non_linear_map, Base::Traits::NumAccess);
148  }
149 
150  static constexpr auto get_non_linear_access_histogram()
151  {
152  constexpr auto m_ = get_non_linear_access_map();
153 
154  constexpr auto r_ =
155  typename arithmetic_sequence_gen<0, get_num_non_linear_access() + 1, 1>::type{};
156 
157  constexpr auto h_ = histogram_sorted_sequence(m_, r_);
158 
159  return h_;
160  }
161 
162  static constexpr auto get_non_linear_access_histogram_prefix_sum()
163  {
164  constexpr auto h_ = get_non_linear_access_histogram();
165  constexpr auto h_prefix_sum_ = prefix_sum_sequence(h_);
166  return h_prefix_sum_;
167  }
168 
169  public:
170  static constexpr index_t NumAccess_NonLinear = get_num_non_linear_access();
171  using AccessMap_NonLinear = decltype(get_non_linear_access_map()); // sequence
172  using AccessHistogram_NonLinear = decltype(get_non_linear_access_histogram());
173  using AccessPrefixSum_NonLinear = decltype(get_non_linear_access_histogram_prefix_sum());
174  };
175 
176  static constexpr index_t NumAccess = Base::Traits::NumAccess;
177  static constexpr index_t NumAccess_NonLinear = traits::NumAccess_NonLinear;
178  using AccessMap_NonLinear = typename traits::AccessMap_NonLinear;
179  using AccessHistogram_NonLinear = typename traits::AccessHistogram_NonLinear;
180  using AccessPrefixSum_NonLinear = typename traits::AccessPrefixSum_NonLinear;
181 
182  CK_TILE_DEVICE constexpr tile_window_linear() = default;
183 
184  CK_TILE_DEVICE constexpr tile_window_linear(
185  const typename Base::BottomTensorView& bottom_tensor_view,
186  const typename Base::WindowLengths& window_lengths,
187  const typename Base::BottomTensorIndex& window_origin,
188  const typename Base::TileDstr& tile_distribution)
190  {
191  this->bottom_tensor_view_ = bottom_tensor_view;
192  this->window_lengths_ = window_lengths;
193  this->window_origin_ = window_origin;
194  this->tile_dstr_ = tile_distribution;
195  auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate(
196  tile_distribution.get_ps_ys_to_xs_adaptor(),
197  container_concat(
198  make_tuple(get_warp_id(), get_lane_id()),
199  generate_tuple([&](auto) { return number<0>{}; }, number<Base::NDimY>{})));
200 
201  typename Base::BottomTensorIndex bottom_tensor_thread_origin_idx_tmp =
202  window_origin + window_adaptor_thread_coord_tmp.get_bottom_index();
203 
204  auto bottom_tensor_thread_coord_tmp = make_tensor_coordinate(
205  this->bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp);
206 
207  // pre-compute coords/flags here for future load/store() calls (might allocate more registers)
208  using SFC_Ys = typename Base::Traits::SFC_Ys;
209 
210  static_for<0, NumAccess, 1>{}([&](auto i_access) {
211  constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
212  constexpr auto need_save_non_linear_coord =
213  bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};
214 
215  if constexpr(need_save_non_linear_coord)
216  {
217  cached_coords_(non_linear_id) = bottom_tensor_thread_coord_tmp;
218  cached_window_adaptor_coords_(non_linear_id) = window_adaptor_thread_coord_tmp;
219  }
220 
221  // TODO: need pad_tensor_view to check which dims need to use a flag check;
222  // the cached flag is independent from the non-linear coord,
223  // but needs to be updated in move_tile, with the proper dims
224  cached_flags_(i_access) = coordinate_has_valid_offset_assuming_top_index_is_valid(
225  this->bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_coord_tmp);
226 
227  if constexpr(i_access != (NumAccess - 1))
228  {
229  constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(i_access); // tuple of number
230  constexpr auto idx_diff_ps_ys = container_concat(
231  generate_tuple([&](auto) { return number<0>{}; }, number<Base::NDimP>{}),
232  idx_diff_ys);
233 
234  this->move_window_adaptor_and_bottom_tensor_thread_coordinate(
235  window_adaptor_thread_coord_tmp,
236  bottom_tensor_thread_coord_tmp,
237  idx_diff_ps_ys);
238  }
239  });
240  }
241 
242  template <index_t i_access>
243  static constexpr CK_TILE_DEVICE auto get_bottom_linear_coordinate(number<i_access>)
244  {
245  using SFC_Ys = typename Base::Traits::SFC_Ys;
246  constexpr auto idx_ys = SFC_Ys::get_index(number<i_access>{});
247  using ys_to_rhs_major = typename decltype(
248  typename Base::TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
249 
250  constexpr auto modified_idx_ys = generate_tuple(
251  [&](auto i_dim_y) {
252  constexpr auto rhs_major = ys_to_rhs_major{}[i_dim_y];
253  constexpr auto target_h_dim = number<rhs_major - 1>{}; // no r dim here!
254  if constexpr(LinearBottomDims{}[target_h_dim] == 0)
255  {
256  return number<0>{};
257  }
258  else
259  {
260  return number<idx_ys[i_dim_y]>{};
261  }
262  },
263  number<Base::NDimY>{});
264 
265  constexpr auto adaptor_ = typename Base::TileDstr{}.get_ps_ys_to_xs_adaptor();
266  constexpr auto idx_ =
267  container_concat(make_tuple(number<0>{}, number<0>{}), modified_idx_ys);
268 
269  return adaptor_.calculate_bottom_index(idx_);
270  }
271 
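// Added note: the linear coordinate above zeroes every y-component that maps to
// a non-linear bottom dim, isolating the compile-time part of the index. E.g.
// with LinearBottomDims = seq<0, 1>, only y-dims that land on the last H-dim
// keep their space-filling-curve index; all others contribute 0.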
272  template <index_t i_access>
273  static constexpr CK_TILE_DEVICE index_t get_bottom_linear_offset(number<i_access>)
274  {
275  constexpr auto linear_coord = get_bottom_linear_coordinate(number<i_access>{});
276  constexpr auto is_pure_linear_tensor =
277  reduce_on_sequence(LinearBottomDims{}, multiplies{}, number<1>{});
278  if constexpr(is_pure_linear_tensor)
279  {
280  // this case is usually an LDS window; everything is known at compile time.
281  // we directly use BottomTensorView transform to compute the offset, in case padding
282  auto bottom_tensor_coord = make_tensor_coordinate(
283  typename Base::BottomTensorView{}.get_tensor_descriptor(), linear_coord);
284  return bottom_tensor_coord.get_offset();
285  }
286  else
287  {
288  // this case is usually a global window, where the last dim can be linear
289  // we hack here and use the original TileDstr to compute the linear offset
290  // ... hoping that there is no extra padding between other dims, which makes sense
291  // since that would introduce a runtime length (so a linear offset can't be used)
292  constexpr index_t linear_offset = [&]() {
293  constexpr auto x_idx_ = linear_coord;
294  constexpr auto x_len_ = typename Base::TileDstr{}.get_lengths();
295  static_assert(x_idx_.size() == x_len_.size());
296  constexpr index_t x_dims_ = x_idx_.size();
297  index_t cu_stride_ = 1;
298  index_t cu_offset_ = 0;
299  static_for<0, x_dims_, 1>{}([&](auto i_) {
300  auto r_i_ = number<x_dims_ - i_ - 1>{};
301  cu_offset_ += x_idx_[r_i_] * cu_stride_;
302  cu_stride_ *= x_len_[r_i_];
303  });
304  return cu_offset_;
305  }();
306  return linear_offset;
307  }
308  }
309 
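// Worked example (added): in the non-pure-linear branch above the offset is a
// plain row-major fold of the linear coordinate over the TileDstr lengths.
// Assuming lengths (4, 8) and a linear coord (2, 3):
//   cu_offset_ = 3 * 1 + 2 * 8 = 19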
310  template <index_t i_access = -1, bool oob_conditional_check = true>
311  CK_TILE_DEVICE auto load(number<i_access> = {}, bool_constant<oob_conditional_check> = {}) const
312  {
313  using vector_t = typename Base::Traits::vector_t;
314  using SFC_Ys = typename Base::Traits::SFC_Ys;
315 
316  constexpr auto tile_dstr = typename Base::TileDstr{};
317 
318  auto dst_tensor = make_static_distributed_tensor<typename Base::DataType>(tile_dstr);
319 
320  auto issue = [&](auto i_access_) {
321  constexpr auto IAccess = number<i_access_>{};
322 
323  constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
324  auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
325  auto bottom_tensor_flag = cached_flags_[IAccess];
326 
327  constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
328 
329  // read from bottom tensor
330  const vector_t vec_value =
331  this->get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
332  bottom_tensor_thread_coord,
333  linear_offset,
334  bottom_tensor_flag,
335  bool_constant<oob_conditional_check>{});
336 
337  // data index [y0, y1, ...]
338  constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess);
339  // write into distributed tensor
340  static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](auto j) {
341  constexpr auto idx_ys = generate_tuple(
342  [&](auto jj) {
343  return jj == Base::Traits::VectorDimY ? (idx_diff_ys[jj] + j)
344  : idx_diff_ys[jj];
345  },
346  number<Base::NDimY>{});
347 
348  constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
349  Base::Traits::PackedSize;
350 
351  dst_tensor.get_thread_buffer().template at<d>() =
352  vec_value
353  .template get_as<typename Base::DataType>()[j / Base::Traits::PackedSize];
354  });
355  };
356 
357  WINDOW_DISPATCH_ISSUE();
358 
359  return dst_tensor;
360  }
361 
362  template <typename DstTile, index_t i_access = -1, bool oob_conditional_check = true>
363  CK_TILE_DEVICE auto load(DstTile& dst_tensor,
364  number<i_access> = {},
365  bool_constant<oob_conditional_check> = {}) const
366  {
367  using vector_t = typename Base::Traits::vector_t;
368  using SFC_Ys = typename Base::Traits::SFC_Ys;
369 
370  constexpr auto tile_dstr = typename Base::TileDstr{};
371 
372  // auto dst_tensor = make_static_distributed_tensor<DataType>(tile_dstr);
373 
374  auto issue = [&](auto i_access_) {
375  constexpr auto IAccess = number<i_access_>{};
376 
377  constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
378  auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
379  auto bottom_tensor_flag = cached_flags_[IAccess];
380 
381  constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
382 
383  // read from bottom tensor
384  const vector_t vec_value =
385  this->get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
386  bottom_tensor_thread_coord,
387  linear_offset,
388  bottom_tensor_flag,
389  bool_constant<oob_conditional_check>{});
390  // data index [y0, y1, ...]
391  constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess);
392  // write into distributed tensor
393  static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](auto j) {
394  constexpr auto idx_ys = generate_tuple(
395  [&](auto jj) {
396  return jj == Base::Traits::VectorDimY ? (idx_diff_ys[jj] + j)
397  : idx_diff_ys[jj];
398  },
399  number<Base::NDimY>{});
400 
401  constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
402  Base::Traits::PackedSize;
403 
404  dst_tensor.get_thread_buffer().template at<d>() =
405  vec_value
406  .template get_as<typename Base::DataType>()[j / Base::Traits::PackedSize];
407  });
408  };
409 
410  WINDOW_DISPATCH_ISSUE();
411 
412  return dst_tensor;
413  }
414 
415  template <typename DstTile,
416  index_t i_access = -1,
417  bool oob_conditional_check = true,
418  bool pre_nop = false>
419  CK_TILE_DEVICE void load_raw(DstTile& dst_tensor,
420  number<i_access> = {}, // negative means loop over all num_access
421  bool_constant<oob_conditional_check> = {},
422  bool_constant<pre_nop> = {}) const
423  {
424  using vector_t = typename Base::Traits::vector_t;
425  using SFC_Ys = typename Base::Traits::SFC_Ys;
426  static constexpr index_t YElementSize =
427  typename Base::TileDstr{}.get_ys_to_d_descriptor().get_element_space_size();
428  static_assert(YElementSize % (Base::Traits::PackedSize * Base::Traits::ScalarPerVector) ==
429  0);
430  using vectorized_tbuf =
431  array<vector_t,
432  YElementSize / (Base::Traits::PackedSize * Base::Traits::ScalarPerVector)>;
433 
434  constexpr auto tile_dstr = typename Base::TileDstr{};
435 
436  auto& dst_vec_tbuf = reinterpret_cast<vectorized_tbuf&>(dst_tensor.get_thread_buffer());
437 
438  auto issue = [&](auto i_access_) {
439  constexpr auto IAccess = number<i_access_>{};
440  constexpr auto pre_nop_ = [&]() {
441  if constexpr(pre_nop && i_access_ == 0 &&
442  Base::BottomTensorView::buffer_view::get_address_space() ==
443  address_space_enum::global)
444  return bool_constant<true>{};
445  else
446  return bool_constant<false>{};
447  }();
448 
449  constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
450  auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
451  constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
452  auto bottom_tensor_flag = cached_flags_[IAccess];
453 
454  // data index [y0, y1, ...]
455  constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
456  constexpr index_t d =
457  tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start) /
458  Base::Traits::PackedSize;
459  static_assert(d % Base::Traits::ScalarPerVector == 0);
460 
461  this->get_bottom_tensor_view().template get_vectorized_elements_raw<vector_t>(
462  dst_vec_tbuf.template at<d / Base::Traits::ScalarPerVector>(),
463  bottom_tensor_thread_coord,
464  linear_offset,
465  bottom_tensor_flag,
466  bool_constant<oob_conditional_check>{},
467  pre_nop_);
468 #if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE || \
469  CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
470  asm volatile(""); // this started from rocm-6.2, but it's the same symptom, so reuse this flag
471 #endif
472  };
473 
474  WINDOW_DISPATCH_ISSUE();
475  }
476 
477  // TODO: currently async load is only implemented in inline asm
478  template <typename LdsTileWindow_,
479  index_t i_access = -1,
480  bool oob_conditional_check = true,
481  bool pre_nop = false>
482  CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_&& lds_tile,
483  number<i_access> = {},
484  bool_constant<oob_conditional_check> = {},
485  bool_constant<pre_nop> = {}) const
486  {
487  using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
488  using LdsDataType = typename LdsTileWindow::DataType;
489 
490  // currently we only support the case where every dim is non-linear;
491  // it's actually not performant if we have a linear dim (e.g. fast-changing)
492  static_assert(NumAccess_NonLinear == NumAccess);
493  static_assert(Base::BottomTensorView::buffer_view::get_address_space() ==
494  address_space_enum::global);
495 
496  // issues * warps * lanes
497  static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded
498 
499  const index_t size_per_buf =
500  lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
501  make_tuple(number<0>{}, number<0>{}, number<0>{})) *
502  sizeof(LdsDataType);
503 
504  const index_t size_per_wave =
505  lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
506  make_tuple(number<0>{}, number<1>{}, number<0>{})) *
507  sizeof(LdsDataType) -
508  size_per_buf;
509 
510  const index_t size_per_issue =
511  lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
512  make_tuple(number<1>{}, number<0>{}, number<0>{})) *
513  sizeof(LdsDataType) -
514  size_per_buf;
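// Illustrative numbers (added; assumes a packed row-major [2 issues, 4 waves,
// 64 lanes] fp16 LDS view): size_per_buf = 0 bytes,
// size_per_wave = 64 * 2 = 128 bytes, size_per_issue = 4 * 64 * 2 = 512 bytes.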
515 
516  const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id();
517  m0_set_with_memory(m0_init_value); // This should be wave independent
518 
519  using vector_t = typename Base::Traits::vector_t;
520 
521  LdsDataType* smem = lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_;
522 
523  // loop over thread tensor space [y0, y1, ...]
524  auto issue = [&](auto i_access_) {
525  constexpr auto IAccess = number<i_access_>{};
526  constexpr auto pre_nop_ = [&]() {
527  if constexpr(pre_nop && i_access_ == 0)
528  return bool_constant<true>{};
529  else
530  return bool_constant<false>{};
531  }();
532 
533  constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
534  auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
535  auto bottom_tensor_flag = cached_flags_[IAccess]; // get this flag anyway
536 
537  // read from bottom tensor
538  this->get_bottom_tensor_view().template async_get_vectorized_elements_raw<vector_t>(
539  smem, bottom_tensor_thread_coord, 0, bottom_tensor_flag, pre_nop_);
540 
541  // move thread coordinate
542  if constexpr(i_access_ != (NumAccess - 1))
543  {
544  m0_inc_with_memory(size_per_issue);
545  }
546  };
547 
548  WINDOW_DISPATCH_ISSUE();
549  }
550 
551  template <typename LdsTileWindow_, index_t i_access = -1, bool oob_conditional_check = true>
552  CK_TILE_DEVICE auto async_load(LdsTileWindow_&& lds_tile,
553  number<i_access> = {},
554  bool_constant<oob_conditional_check> = {}) const
555  {
556  using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
557  using LdsDataType = typename LdsTileWindow::DataType;
558  using vector_t = typename traits::vector_t;
559 
560  static_assert(NumAccess_NonLinear == NumAccess, "Unsupported configuration");
561  static_assert(Base::BottomTensorView::buffer_view::get_address_space() ==
562  address_space_enum::global,
563  "Requires global memory");
564 
565  // Precompute invariant values outside the lambda
566  const auto window_origin = lds_tile.get_window_origin();
567  const auto& bottom_tensor_view = lds_tile.get_bottom_tensor_view();
568  const auto& tensor_descriptor = bottom_tensor_view.get_tensor_descriptor();
569  auto smem_base_ptr = bottom_tensor_view.get_buffer_view().p_data_;
570 
571  auto issue = [&](auto i_access_) {
572  constexpr auto IAccess = number<i_access_>{};
573  constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
574 
575  // Use precomputed values
576  auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
577  auto window_adaptor_coord = cached_window_adaptor_coords_[non_linear_id];
578  auto bottom_tensor_flag = cached_flags_[IAccess];
579 
580  auto lds_bottom_tensor_thread_idx =
581  window_origin + window_adaptor_coord.get_bottom_index();
582  const auto lds_coord =
583  make_tensor_coordinate(tensor_descriptor, lds_bottom_tensor_thread_idx);
584 
585  CK_TILE_LDS_ADDR LdsDataType* smem = smem_base_ptr + lds_coord.get_offset();
586 
587  // Read from bottom tensor
588  this->get_bottom_tensor_view().template async_get_vectorized_elements<vector_t>(
589  smem,
590  bottom_tensor_thread_coord,
591  0,
592  bottom_tensor_flag,
593  bool_constant<oob_conditional_check>{});
594  };
595 
596  WINDOW_DISPATCH_ISSUE();
597  }
598 
599  template <typename Policy, index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
600  CK_TILE_DEVICE auto load_transpose() const
601  {
602  constexpr auto tile_dstr = typename Base::TileDstr{};
603  auto dst_tensor = make_static_distributed_tensor<typename Base::DataType>(tile_dstr);
604  this->template load_transpose_linear<Policy>(
605  dst_tensor, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
606  return dst_tensor;
607  }
608 
609  template <typename Policy,
610  typename DistributedTensor,
611  index_t i_access = -1,
612  bool oob_conditional_check = true>
613  CK_TILE_DEVICE auto load_transpose_linear(DistributedTensor& dst_tensor,
614  number<i_access> = {},
615  bool_constant<oob_conditional_check> = {}) const
616  {
617  using vector_t = typename traits::vector_t;
618  using SFC_Ys = typename traits::SFC_Ys;
619 
620  constexpr auto tile_dstr = typename Base::TileDstr{};
621 
622  constexpr auto group_func = Policy::group_func;
623 
624  auto issue = [&](auto i_access_) {
625  constexpr auto IAccess = number<i_access_>{};
626  constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
627  auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
628  auto bottom_tensor_flag = cached_flags_[IAccess];
629 
630  constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
631 
632  // read from bottom tensor
633  const vector_t vec_value =
634  this->get_bottom_tensor_view().template get_transpose_vectorized_elements<vector_t>(
635  bottom_tensor_thread_coord, 0);
636  // write into distributed tensor
637  static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
638  constexpr auto idx_ys = generate_tuple(
639  [&](auto jj) {
640  return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
641  },
642  number<Base::NDimY>{});
643 
644  constexpr index_t linear_distributed_index =
645  tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
646  dst_tensor.get_thread_buffer().template at<linear_distributed_index>() =
647  vec_value.template get_as<typename Base::DataType>()[j];
648  });
649  };
650  WINDOW_DISPATCH_ISSUE();
651  }
652 
653  template <index_t i_access = -1, bool oob_conditional_check = true>
654  CK_TILE_DEVICE void store(const static_distributed_tensor<typename Base::DataType,
655  typename Base::TileDstr>& dstr_tensor,
656  number<i_access> = {},
657  bool_constant<oob_conditional_check> = {}) const
658  {
659 
660  using vector_t = typename Base::Traits::vector_t;
661  using SFC_Ys = typename Base::Traits::SFC_Ys;
662 
663  constexpr auto tile_dstr = typename Base::TileDstr{};
664 
665  // loop over thread tensor space [y0, y1, ...]
666  auto issue = [&](auto i_access_) {
667  constexpr auto IAccess = number<i_access_>{};
668  constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
669  auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
670  constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
671  auto bottom_tensor_flag = cached_flags_[IAccess];
672  // data index [y0, y1, ...]
673  constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
674 
675  // read from distributed tensor
676  vector_t vec_value;
677 
678  static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](auto j) {
679  constexpr auto idx_ys = generate_tuple(
680  [&](auto jj) {
681  return jj == Base::Traits::VectorDimY ? (idx_ys_start[jj] + j)
682  : idx_ys_start[jj];
683  },
684  number<Base::NDimY>{});
685 
686  constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
687  Base::Traits::PackedSize;
688 
689  vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
690  dstr_tensor.get_thread_buffer().template at<d>();
691  });
692 
693  // write into bottom tensor
694  this->get_bottom_tensor_view().template set_vectorized_elements<vector_t>(
695  bottom_tensor_thread_coord,
696  linear_offset,
697  bottom_tensor_flag,
698  vec_value,
699  bool_constant<oob_conditional_check>{});
700  };
701 
702  WINDOW_DISPATCH_ISSUE();
703  }
704 
705  template <index_t i_access = -1>
706  CK_TILE_DEVICE void
707  store_raw(const static_distributed_tensor<typename Base::DataType, typename Base::TileDstr>&
708  dstr_tensor,
709  number<i_access> = {}) const
710  {
711  using vector_t = typename Base::Traits::vector_t;
712  using SFC_Ys = typename Base::Traits::SFC_Ys;
713 
714  constexpr auto tile_dstr = typename Base::TileDstr{};
715  static constexpr bool oob_conditional_check = true;
716 
717  // loop over thread tensor space [y0, y1, ...]
718  auto issue = [&](auto i_access_) {
719  constexpr auto IAccess = number<i_access_>{};
720  constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
721  auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
722  constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
723  auto bottom_tensor_flag = cached_flags_[IAccess];
724 
725  // data index [y0, y1, ...]
726  constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
727 
728  // read from distributed tensor
729  vector_t vec_value;
730  static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](auto j) {
731  constexpr auto idx_ys = generate_tuple(
732  [&](auto jj) {
733  return jj == Base::Traits::VectorDimY ? (idx_ys_start[jj] + j)
734  : idx_ys_start[jj];
735  },
736  number<Base::NDimY>{});
737  constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
738  Base::Traits::PackedSize;
739  vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
740  dstr_tensor.get_thread_buffer().template at<d>();
741  });
742 
743  // write into bottom tensor
744  this->get_bottom_tensor_view()
745  .template set_vectorized_elements_raw<vector_t, oob_conditional_check>(
746  bottom_tensor_thread_coord, linear_offset, bottom_tensor_flag, vec_value);
747  };
748 
749  WINDOW_DISPATCH_ISSUE();
750  }
751 
752  template <index_t i_access = -1, bool oob_conditional_check = true>
753  CK_TILE_DEVICE void
754  update(const static_distributed_tensor<typename Base::DataType, typename Base::TileDstr>&
755  dstr_tensor,
756  number<i_access> = {},
757  bool_constant<oob_conditional_check> = {}) const
758  {
759 
760  using vector_t = typename Base::Traits::vector_t;
761  using SFC_Ys = typename Base::Traits::SFC_Ys;
762 
763  constexpr auto tile_dstr = typename Base::TileDstr{};
764 
765  // loop over thread tensor space [y0, y1, ...]
766  auto issue = [&](auto i_access_) {
767  constexpr auto IAccess = number<i_access_>{};
768  constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
769  auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
770  constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
771  auto bottom_tensor_flag = cached_flags_[IAccess];
772 
773  // data index [y0, y1, ...]
774  constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
775 
776  // read from distributed tensor
777  vector_t vec_value;
778 
779  static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](auto j) {
780  constexpr auto idx_ys = generate_tuple(
781  [&](auto jj) {
782  return jj == Base::Traits::VectorDimY ? (idx_ys_start[jj] + j)
783  : idx_ys_start[jj];
784  },
785  number<Base::NDimY>{});
786 
787  constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
788  Base::Traits::PackedSize;
789 
790  vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
791  dstr_tensor.get_thread_buffer().template at<d>();
792  });
793 
794  // write into bottom tensor
795  this->get_bottom_tensor_view().template update_vectorized_elements<vector_t>(
796  bottom_tensor_thread_coord,
797  linear_offset,
798  bottom_tensor_flag,
799  vec_value,
800  bool_constant<oob_conditional_check>{});
801  };
802 
803  WINDOW_DISPATCH_ISSUE();
804  }
805 
806  template <index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
807  CK_TILE_DEVICE void
808  update_raw(const static_distributed_tensor<typename Base::DataType, typename Base::TileDstr>&
809  dstr_tensor,
810  number<i_access> = {},
811  bool_constant<oob_conditional_check> = {},
812  bool_constant<pre_nop> = {}) const
813  {
814 
815  using vector_t = typename Base::Traits::vector_t;
816  using SFC_Ys = typename Base::Traits::SFC_Ys;
817 
818  constexpr auto tile_dstr = typename Base::TileDstr{};
819 
820  // loop over thread tensor space [y0, y1, ...]
821  auto issue = [&](auto i_access_) {
822  constexpr auto IAccess = number<i_access_>{};
823  constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
824  auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
825  constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
826  auto bottom_tensor_flag = cached_flags_[IAccess];
827 
828  // data index [y0, y1, ...]
829  constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
830 
831  // read from distributed tensor
832  vector_t vec_value;
833 
834  static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](auto j) {
835  constexpr auto idx_ys = generate_tuple(
836  [&](auto jj) {
837  return jj == Base::Traits::VectorDimY ? (idx_ys_start[jj] + j)
838  : idx_ys_start[jj];
839  },
840  number<Base::NDimY>{});
841 
842  constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
843  Base::Traits::PackedSize;
844 
845  vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
846  dstr_tensor.get_thread_buffer().template at<d>();
847  });
848 
849  // write into bottom tensor
850  this->get_bottom_tensor_view().template update_vectorized_elements_raw<vector_t>(
851  bottom_tensor_thread_coord,
852  linear_offset,
853  bottom_tensor_flag,
854  vec_value,
855  bool_constant<oob_conditional_check>{},
856  bool_constant<pre_nop>{});
857  };
858 
859  WINDOW_DISPATCH_ISSUE();
860  }
861  // *_extended() functions act like virtual functions with a default implementation existing
862  // in the base class
863  CK_TILE_DEVICE void move_extended(const typename Base::BottomTensorIndex& step)
864  {
865  static_for<0, NumAccess, 1>{}([&](auto i_access) {
866  constexpr auto IAccess = number<i_access>{};
867  constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
868  constexpr auto need_update_non_linear_coord =
869  bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};
870 
871  if constexpr(need_update_non_linear_coord)
872  {
873  move_tensor_coordinate(this->bottom_tensor_view_.get_tensor_descriptor(),
874  cached_coords_(non_linear_id),
875  step);
876  }
877 
878  // move the current coord with linear_coords
879  auto tmp_coords = cached_coords_[non_linear_id];
880  constexpr auto linear_coord = get_bottom_linear_coordinate(IAccess);
881  move_tensor_coordinate(
882  this->bottom_tensor_view_.get_tensor_descriptor(), tmp_coords, linear_coord);
883 
884  cached_flags_(i_access) = coordinate_has_valid_offset_assuming_top_index_is_valid(
885  this->bottom_tensor_view_.get_tensor_descriptor(), tmp_coords);
886  });
887  }
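// Added note: move_extended() advances each distinct non-linear cached coord
// exactly once, then recomputes cached_flags_ per access by re-applying that
// access's constexpr linear coordinate to a temporary copy of the coord, so the
// out-of-bounds flags stay exact without caching one coordinate per access.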
888 
889  CK_TILE_DEVICE void set_window_origin_extended(const typename Base::BottomTensorIndex&)
890  {
891  auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate(
892  typename Base::TileDstr{}.get_ps_ys_to_xs_adaptor(),
893  container_concat(
894  make_tuple(get_warp_id(), get_lane_id()),
895  generate_tuple([&](auto) { return number<0>{}; }, number<Base::NDimY>{})));
896 
897  typename Base::BottomTensorIndex bottom_tensor_thread_origin_idx_tmp =
898  this->window_origin_ + window_adaptor_thread_coord_tmp.get_bottom_index();
899 
900  auto bottom_tensor_thread_coord_tmp = make_tensor_coordinate(
901  this->bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp);
902 
903  // pre-compute coords/flags here for future load/store() calls (might allocate more registers)
904  using SFC_Ys = typename Base::Traits::SFC_Ys;
905 
906  static_for<0, NumAccess, 1>{}([&](auto i_access) {
907  constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
908  constexpr auto need_save_non_linear_coord =
909  bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};
910 
911  if constexpr(need_save_non_linear_coord)
912  {
913  cached_coords_(non_linear_id) = bottom_tensor_thread_coord_tmp;
914  cached_window_adaptor_coords_(non_linear_id) = window_adaptor_thread_coord_tmp;
915  }
916 
917  if constexpr(i_access != (NumAccess - 1))
918  {
919  constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(i_access); // tuple of number
920  constexpr auto idx_diff_ps_ys = container_concat(
921  generate_tuple([&](auto) { return number<0>{}; }, number<Base::NDimP>{}),
922  idx_diff_ys);
923 
924  this->move_window_adaptor_and_bottom_tensor_thread_coordinate(
925  window_adaptor_thread_coord_tmp,
926  bottom_tensor_thread_coord_tmp,
927  idx_diff_ps_ys);
928  }
929  });
930  }
931 
932  // this contains pre-computed per-access coordinates and flags:
933  array<typename Base::BottomTensorCoord, traits::NumAccess_NonLinear> cached_coords_;
935  array<typename Base::WindowAdaptorCoord, traits::NumAccess_NonLinear> cached_window_adaptor_coords_;
936  array<bool, Base::Traits::NumAccess> cached_flags_;
937 };
938 
939 #undef WINDOW_DISPATCH_ISSUE
940 
941 namespace impl {
942 template <address_space_enum, index_t len_>
943 struct default_linear_bottom_dims_impl
944 {
945  using type = typename uniform_sequence_gen<len_, 0>::type;
946 };
947 
948 template <index_t len_>
949 struct default_linear_bottom_dims_impl<address_space_enum::global, len_>
950 {
951  // global defaults to seq<0,0,....1>
952  using type = typename sequence_merge<typename uniform_sequence_gen<len_ - 1, 0>::type,
953  sequence<1>>::type;
954 };
955 
956 template <index_t len_>
957 struct default_linear_bottom_dims_impl<address_space_enum::lds, len_>
958 {
959  // lds defaults to seq<1,1.....1>
960  using type = typename uniform_sequence_gen<len_, 1>::type;
961 };
962 } // namespace impl
963 
964 template <typename TensorView_>
965 using default_linear_bottom_dims =
966  typename impl::default_linear_bottom_dims_impl<TensorView_::buffer_view::get_address_space(),
967  TensorView_::get_num_of_dimension()>::type;
968 
969 // if using this API, a tile_window_linear will be created
970 // this structure has the chance to use immediate values and save registers
971 // LinearBottomDims_ must be passed in properly to control which dim is linear,
972 // so that a constexpr offset (linear_offset) is generated for this dim
973 // (and finally passed to the immediate offset of the buffer/lds instruction)
974 //
975 // Note: there is no internal check for which dim is OK to use a linear offset;
976 // users must make sure by themselves
977 //
977 //
978 // e.g.
979 // 2d global matrix: set LinearBottomDims_=seq<0, 1>; the last dim will generate an
980 // immediate offset if each thread has multiple issues along the last dim
981 //
982 // 2d LDS buffer: set LinearBottomDims_=seq<1, 1>; then only one vgpr is used as offset,
983 // everything else just uses immediate offsets.
984 //
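// A minimal creation sketch (added; `view`, `dstr`, `i_m`, `i_n` are
// illustrative names for a 2d global tensor view, a static tile distribution
// and the window origin):
//
//   auto win = make_tile_window_linear(
//       view, make_tuple(number<64>{}, number<64>{}), {i_m, i_n}, dstr,
//       sequence<0, 1>{});          // last dim linear -> immediate offsets
//   auto tile = win.load();         // uses cached coords + immediate offsets
//   move_tile_window(win, {0, 64}); // only the non-linear cached coords move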
985 template <typename TensorView_,
986  typename WindowLengths_,
987  typename StaticTileDistribution_,
988  typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
989 CK_TILE_DEVICE constexpr auto
990 make_tile_window_linear(const TensorView_& tensor_view,
991  const WindowLengths_& window_lengths,
992  const multi_index<TensorView_::get_num_of_dimension()>& origin,
993  const StaticTileDistribution_& tile_distribution,
994  LinearBottomDims_ = {})
995 {
996  static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
997  return tile_window_linear<remove_cvref_t<TensorView_>,
998  remove_cvref_t<WindowLengths_>,
999  remove_cvref_t<StaticTileDistribution_>,
1000  remove_cvref_t<LinearBottomDims_>>{
1001  tensor_view, window_lengths, origin, tile_distribution};
1002 }
1003 
1004 template <
1005  typename TileWindow_,
1006  typename StaticTileDistribution_,
1007  typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
1008 CK_TILE_DEVICE constexpr auto
1009 make_tile_window_linear(const TileWindow_& tile_window,
1010  const StaticTileDistribution_& tile_distribution,
1011  LinearBottomDims_ = {})
1012 {
1013  return make_tile_window_linear(tile_window.get_bottom_tensor_view(),
1014  tile_window.get_window_lengths(),
1015  tile_window.get_window_origin(),
1016  tile_distribution,
1017  LinearBottomDims_{});
1018 }
1019 
1020 // this version must not be called in a constexpr context
1021 template <typename TensorView_,
1022  typename WindowLengths_,
1023  typename StaticTileDistribution_,
1024  typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
1025 CK_TILE_DEVICE auto
1026 make_tile_window_linear_raw(const TensorView_& tensor_view,
1027  const WindowLengths_& window_lengths,
1028  const multi_index<TensorView_::get_num_of_dimension()>& origin,
1029  const StaticTileDistribution_& tile_distribution,
1030  LinearBottomDims_ = {})
1031 {
1032  static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
1033  auto w = tile_window_linear<remove_cvref_t<TensorView_>,
1034  remove_cvref_t<WindowLengths_>,
1035  remove_cvref_t<StaticTileDistribution_>,
1036  remove_cvref_t<LinearBottomDims_>>{
1037  tensor_view, window_lengths, origin, tile_distribution};
1038  w.init_raw();
1039  return w;
1040 }
1041 
1042 template <
1043  typename TileWindow_,
1044  typename StaticTileDistribution_,
1045  typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
1046 CK_TILE_DEVICE constexpr auto
1047 make_tile_window_linear_raw(const TileWindow_& tile_window,
1048  const StaticTileDistribution_& tile_distribution,
1049  LinearBottomDims_ = {})
1050 {
1051  return make_tile_window_linear_raw(tile_window.get_bottom_tensor_view(),
1052  tile_window.get_window_lengths(),
1053  tile_window.get_window_origin(),
1054  tile_distribution,
1055  LinearBottomDims_{});
1056 }
1057 
1058 template <typename TensorView_,
1059  typename WindowLengths_,
1060  typename StaticTileDistribution_,
1061  typename LinearBottomDims_>
1062 CK_TILE_DEVICE void move_tile_window(
1063  tile_window_linear<TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_>&
1064  window,
1065  const typename tile_window_linear<TensorView_,
1066  WindowLengths_,
1067  StaticTileDistribution_,
1068  LinearBottomDims_>::BottomTensorIndex& step)
1069 {
1070  window.move(step);
1071 }
1072 
1081 template <typename T>
1082 struct is_tile_window_linear : public std::false_type
1083 {
1084 };
1085 
1097 template <typename BottomTensorView_,
1098  typename WindowLengths_,
1099  typename StaticTileDistribution_,
1100  typename LinearBottomDims_>
1101 struct is_tile_window_linear<tile_window_linear<BottomTensorView_,
1102  WindowLengths_,
1103  StaticTileDistribution_,
1104  LinearBottomDims_>> : std::true_type
1105 {
1106 };
1107 
1115 template <typename T>
1116 inline constexpr bool is_tile_window_linear_v = is_tile_window_linear<T>::value;
1117 
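// Added usage sketch: the trait enables compile-time branching on the window
// type, e.g. (assuming `win` is some tile window and `tile` a matching
// static_distributed_tensor):
//
//   if constexpr(is_tile_window_linear_v<decltype(win)>)
//       win.load_raw(tile); // linear windows can exploit immediate offsets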
1118 } // namespace ck_tile