threadwise_tensor_slice_transfer.hpp

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

// Headers needed by the utilities used below (restored; the exact include set in
// the original listing was lost to extraction).
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_space_filling_curve.hpp"

namespace ck {
// Assume:
//   1. src:
//     1. SrcDesc is known at compile-time
//     2. SrcBuffer is StaticBuffer
//     3. SrcSliceOriginIdx is known at compile-time
//   2. dst:
//     1. DstDesc is not known at compile-time
//     2. DstBuffer is DynamicBuffer
//     3. DstSliceOriginIdx is not known at compile-time
template <typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename ElementwiseOperation,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t DstVectorDim,
          index_t DstScalarPerVector,
          InMemoryDataOperationEnum DstInMemOp,
          index_t DstScalarStrideInVector,
          bool DstResetCoordinateAfterRun,
          typename enable_if<SrcDesc::IsKnownAtCompileTime(), bool>::type = false>
struct ThreadwiseTensorSliceTransfer_v1r3
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));

    using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));

    __device__ constexpr ThreadwiseTensorSliceTransfer_v1r3(const DstDesc& dst_desc,
                                                            const Index& dst_slice_origin_idx,
                                                            const ElementwiseOperation& element_op)
        : dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin_idx)),
          element_op_{element_op}
    {
        static_assert(SrcDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc needs to be known at compile-time");
        static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
                      "wrong! Not divisible");
    }

    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
    {
        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
    }

    template <typename SrcSliceOriginIdx, typename SrcBuffer, typename DstBuffer>
    __device__ void Run(const SrcDesc&,
                        const SrcSliceOriginIdx&,
                        const SrcBuffer& src_buf,
                        const DstDesc& dst_desc,
                        DstBuffer& dst_buf)
    {
        static_assert(SrcDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc needs to be known at compile-time");

        static_assert(is_known_at_compile_time<remove_cvref_t<SrcSliceOriginIdx>>::value,
                      "wrong! SrcSliceOrigin needs to be known at compile-time");

        static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer needs to be StaticBuffer");

        // SrcDesc and src_slice_origin_idx are known at compile-time
        constexpr auto src_desc             = remove_cvref_t<SrcDesc>{};
        constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});

        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

        constexpr auto dst_scalar_step_in_vector =
            generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});

        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                    DimAccessOrder,
                                                    remove_cv_t<decltype(dst_scalar_per_access)>>;

        // TODO: Use SpaceFillingCurve::ScalarsPerAccess instead of DstScalarPerVector?
        static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector,
                      "wrong! DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");

        vector_type_maker_t<DstData, DstScalarPerVector> dst_vector;
        using dst_vector_t = typename decltype(dst_vector)::type;

        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();

        static_for<0, num_access, 1>{}([&](auto idx_1d) {
            constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);

            // copy data from src_buf into dst_vector
            // TODO: It's a hack here to use \p dst_scalar_step_in_vector. Use SpaceFillingCurve?
            static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
                constexpr index_t src_offset = src_desc.CalculateOffset(
                    src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);

                DstData v;

                // apply element-wise operation
                element_op_(v, src_buf[Number<src_offset>{}]);

                dst_vector.template AsType<DstData>()(i) = v;
            });

            const bool is_dst_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);

            // copy data from dst_vector into dst_buf
            dst_buf.template Update<DstInMemOp, dst_vector_t>(
                dst_coord_.GetOffset(),
                is_dst_valid,
                dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);

            if constexpr(idx_1d.value != num_access - 1)
            {
                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);

                move_tensor_coordinate(
                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
            }
        });

        // move dst coordinate back to slice origin (or not)
        if constexpr(DstResetCoordinateAfterRun)
        {
            const auto dst_reset_step =
                make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());

            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
        }
    }

    __device__ static constexpr auto GetDstCoordinateResetStep()
    {
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                    DimAccessOrder,
                                                    remove_cv_t<decltype(dst_scalar_per_access)>>;

        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
        if constexpr(num_access == 0)
        {
            return typename SpaceFillingCurve::Index{};
        }
        else
        {
            constexpr auto reset_step =
                SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});

            return reset_step;
        }
    }

    // dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by Run(), then need to adjust the step here
        const auto adjusted_step_idx =
            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
                                       : dst_slice_origin_step_idx + GetDstCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);

        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
    }

    private:
    DstCoord dst_coord_;
    const ElementwiseOperation element_op_;
}; // struct ThreadwiseTensorSliceTransfer_v1r3
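
// Illustrative usage sketch (not part of the library): a thread holding an
// 8-element result tile in registers streams it to a run-time global tensor via
// ThreadwiseTensorSliceTransfer_v1r3. The data type, the 2x4 slice shape, the
// vector width, and the names p_dst, M, N, m0, n0 are assumptions chosen for
// this example only.
//
//   using PassThrough = tensor_operation::element_wise::PassThrough;
//
//   constexpr auto src_desc =
//       make_naive_tensor_descriptor_packed(make_tuple(Number<2>{}, Number<4>{}));
//   const auto dst_desc = make_naive_tensor_descriptor_packed(make_tuple(M, N));
//
//   StaticBuffer<AddressSpaceEnum::Vgpr, float, 8, true> src_buf; // per-thread results
//   auto dst_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
//       p_dst, dst_desc.GetElementSpaceSize());
//
//   auto transfer = ThreadwiseTensorSliceTransfer_v1r3<float,
//                                                      float,
//                                                      decltype(src_desc),
//                                                      decltype(dst_desc),
//                                                      PassThrough,
//                                                      Sequence<2, 4>, // SliceLengths
//                                                      Sequence<0, 1>, // DimAccessOrder
//                                                      1,              // DstVectorDim
//                                                      4,              // DstScalarPerVector
//                                                      InMemoryDataOperationEnum::Set,
//                                                      1,
//                                                      true>{
//       dst_desc, make_multi_index(m0, n0), PassThrough{}};
//
//   transfer.Run(src_desc, make_tuple(Number<0>{}, Number<0>{}), src_buf, dst_desc, dst_buf);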

// Assume:
//   1. src:
//     1. SrcDesc is not known at compile-time
//     2. SrcBuffer is DynamicBuffer
//     3. src_slice_origin_idx is not known at compile-time
//   2. dst:
//     1. DstDesc is known at compile-time
//     2. DstBuffer is StaticBuffer
//     3. dst_slice_origin_idx is known at compile-time
template <typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t SrcVectorDim,
          index_t SrcScalarPerVector,
          index_t SrcScalarStrideInVector,
          bool SrcResetCoordinateAfterRun,
          bool InvalidElementAsNaN = false,
          typename enable_if<DstDesc::IsKnownAtCompileTime(), bool>::type = false>
struct ThreadwiseTensorSliceTransfer_v2
{
    static_assert((InvalidElementAsNaN && !ck::is_integral<DstData>::value) ||
                      (!InvalidElementAsNaN),
                  "Filling invalid element as NaN is only for floating point types");

    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));

    using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));

    __device__ constexpr ThreadwiseTensorSliceTransfer_v2(const SrcDesc& src_desc,
                                                          const Index& src_slice_origin_idx)
        : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin_idx))
    {
        static_assert(DstDesc::IsKnownAtCompileTime(),
                      "wrong! DstDesc needs to be known at compile-time");
        static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
                      "wrong! Not divisible");
    }

    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
    {
        src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
    }

    template <typename SrcBuffer, typename DstBuffer, typename DstSliceOriginIdx>
    __device__ void Run(const SrcDesc& src_desc,
                        const SrcBuffer& src_buf,
                        const DstDesc&,
                        const DstSliceOriginIdx&,
                        DstBuffer& dst_buf)
    {
        static_assert(DstDesc::IsKnownAtCompileTime(),
                      "wrong! DstDesc needs to be known at compile-time");

        static_assert(is_known_at_compile_time<remove_cvref_t<DstSliceOriginIdx>>::value,
                      "wrong! DstSliceOrigin needs to be known at compile-time");

        static_assert(
            is_same<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>::value,
            "wrong! inconsistent type");

        // DstDesc and dst_slice_origin_idx are known at compile-time
        constexpr auto dst_desc             = remove_cvref_t<DstDesc>{};
        constexpr auto dst_slice_origin_idx = DstSliceOriginIdx{};

        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto src_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

        constexpr auto src_scalar_step_in_vector =
            generate_sequence(detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});

        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                    DimAccessOrder,
                                                    remove_cv_t<decltype(src_scalar_per_access)>>;

        // loop over tensor and copy
        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();

        static_for<0, num_access, 1>{}([&](auto idx_1d) {
            vector_type_maker_t<SrcData, SrcScalarPerVector> src_vector;

            using src_vector_t = typename decltype(src_vector)::type;

            constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d);

            const bool is_src_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);

            // copy data from src_buf into src_vector
            src_vector.template AsType<src_vector_t>()(Number<0>{}) =
                src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid);

            // copy data from src_vector into dst_buf
            static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                constexpr index_t dst_offset =
                    dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx +
                                             i * src_scalar_step_in_vector);

                if constexpr(InvalidElementAsNaN)
                {
                    dst_buf(Number<dst_offset>{}) =
                        is_src_valid
                            ? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
                            : NumericLimits<DstData>::QuietNaN();
                }
                else
                {
                    dst_buf(Number<dst_offset>{}) =
                        type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
                }
            });

            if constexpr(idx_1d.value != num_access - 1)
            {
                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);

                move_tensor_coordinate(
                    src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
            }
        });

        // move src coordinate back to slice origin (or not)
        if constexpr(SrcResetCoordinateAfterRun)
        {
            const auto src_reset_step =
                make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep());

            move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
        }
    }

    __device__ static constexpr auto GetSrcCoordinateResetStep()
    {
        constexpr auto src_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                    DimAccessOrder,
                                                    remove_cv_t<decltype(src_scalar_per_access)>>;

        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
        if constexpr(num_access == 0)
        {
            return typename SpaceFillingCurve::Index{};
        }
        else
        {
            constexpr auto reset_step =
                SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});

            return reset_step;
        }
    }

    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                       const Index& src_slice_origin_step_idx)
    {
        // if src coord was not reset by Run(), then need to adjust the step here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);

        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }

    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    template <typename SrcMoveSliceWindowStepHack>
    __device__ void
    MoveSrcSliceWindow(const SrcDesc& src_desc,
                       const Index& src_slice_origin_step_idx,
                       const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
    {
        // if src coord was not reset by Run(), then need to adjust the step here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(
            src_desc, adjusted_step_idx, src_move_slice_window_step_hack);

        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }

    private:
    SrcCoord src_coord_;
}; // struct ThreadwiseTensorSliceTransfer_v2
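
// Illustrative usage sketch (not part of the library): each thread gathers a 1x4
// row of a run-time global tensor into a register-resident StaticBuffer through
// ThreadwiseTensorSliceTransfer_v2. Shapes, types, and the names p_src, M, N, m0,
// n0 are assumptions chosen for this example only.
//
//   const auto src_desc = make_naive_tensor_descriptor_packed(make_tuple(M, N));
//   constexpr auto dst_desc =
//       make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<4>{}));
//
//   auto src_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
//       p_src, src_desc.GetElementSpaceSize());
//   StaticBuffer<AddressSpaceEnum::Vgpr, float, 4, true> dst_buf;
//
//   auto transfer = ThreadwiseTensorSliceTransfer_v2<float,
//                                                    float,
//                                                    decltype(src_desc),
//                                                    decltype(dst_desc),
//                                                    Sequence<1, 4>, // SliceLengths
//                                                    Sequence<0, 1>, // DimAccessOrder
//                                                    1,              // SrcVectorDim
//                                                    4,              // SrcScalarPerVector
//                                                    1,              // SrcScalarStrideInVector
//                                                    true>{src_desc, make_multi_index(m0, n0)};
//
//   transfer.Run(src_desc, src_buf, dst_desc, make_tuple(Number<0>{}, Number<0>{}), dst_buf);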

// Assume:
//   1. src_desc and dst_desc are not known at compile-time
//   2. SrcBuffer and DstBuffer are DynamicBuffer
//   3. src_slice_origin and dst_slice_origin are not known at compile-time
//   4. Use thread buffer
template <typename SliceLengths,
          InMemoryDataOperationEnum DstInMemOp,
          typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorDim,
          index_t DstVectorDim,
          index_t SrcScalarPerVector,
          index_t DstScalarPerVector,
          index_t SrcScalarStrideInVector,
          index_t DstScalarStrideInVector,
          bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
                                           // RunRead(), will be fused with MoveSrcSliceWindow to
                                           // save addr computation
          bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
                                           // RunWrite(), will be fused with MoveDstSliceWindow to
                                           // save addr computation
struct ThreadwiseTensorSliceTransfer_v3
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
    using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));

    using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
    using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));

    __device__ constexpr ThreadwiseTensorSliceTransfer_v3(const SrcDesc& src_desc,
                                                          const Index& src_slice_origin,
                                                          const DstDesc& dst_desc,
                                                          const Index& dst_slice_origin)
        : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
          dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin))
    {
        static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
                      "wrong! Not divisible");
        static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
                      "wrong! Not divisible");
    }

    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
    {
        src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
    }

    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
    {
        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
    }

    template <typename SrcBuffer, typename SrcStepHacks>
    __device__ void
    RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
    {
        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");

        static_assert(
            is_same<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>::value,
            "wrong! SrcBuffer and SrcData data type are inconsistent");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto src_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

        constexpr auto src_scalar_step_in_vector =
            generate_sequence(detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});

        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;

        constexpr auto src_dim_access_order = SrcDimAccessOrder{};

        constexpr auto ordered_src_access_lengths =
            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);

        // make forward steps
        const auto src_forward_steps = generate_tuple(
            [&](auto i) {
                Index forward_step_idx;

                static_for<0, nDim, 1>{}([&](auto j) {
                    forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
                });

                return make_tensor_coordinate_step(
                    src_desc, forward_step_idx, src_step_hacks[I0][i]);
            },
            Number<nDim>{});

        // make backward steps
        const auto src_backward_steps = generate_tuple(
            [&](auto i) {
                Index backward_step_idx;

                static_for<0, nDim, 1>{}([&](auto j) {
                    backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
                });

                return make_tensor_coordinate_step(
                    src_desc, backward_step_idx, src_step_hacks[I1][i]);
            },
            Number<nDim>{});

        // loop over tensor and copy
        static_ford<decltype(ordered_src_access_lengths)>{}([&](auto ordered_src_access_idx) {
            // judge move forward or move backward
            constexpr auto forward_sweep = [&]() {
                StaticallyIndexedArray<bool, nDim> forward_sweep_;

                forward_sweep_(I0) = true;

                static_for<1, nDim, 1>{}([&](auto i) {
                    index_t tmp = ordered_src_access_idx[I0];

                    static_for<1, i, 1>{}([&](auto j) {
                        tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
                    });

                    forward_sweep_(i) = tmp % 2 == 0;
                });

                return forward_sweep_;
            }();
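
            // The sweep direction implements a snake (boustrophedon) traversal: a
            // dimension is walked forward when the combined index of all slower
            // dimensions is even, backward when it is odd, so consecutive accesses
            // are always one step apart and no coordinate jump is needed between
            // rows. For example, with ordered access lengths {2, 3} the visit order
            // of (idx0, idx1) is (0,0) (0,1) (0,2) (1,2) (1,1) (1,0).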

            // calculate src data index
            constexpr auto src_data_idx = [&]() {
                Index ordered_idx;

                static_for<0, nDim, 1>{}([&](auto i) {
                    ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
                                                      : ordered_src_access_lengths[i] - 1 -
                                                            ordered_src_access_idx[i];
                });

                return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
                       src_scalar_per_access;
            }();

            vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;

            using src_vector_t = typename decltype(src_tmp_vector)::type;

            const bool is_src_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);

            // copy data from src_buf to src_tmp_vector
            src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid);

            // copy data from src_tmp_vector to buffer_
            static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);

                buffer_(Number<buffer_offset>{}) = src_tmp_vector.template AsType<SrcData>()[i];
            });

            constexpr auto move_on_dim = [&]() constexpr
            {
                StaticallyIndexedArray<bool, nDim> move_on_dim_;

                static_for<0, nDim, 1>{}([&](auto i) {
                    move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;

                    static_for<i + 1, nDim, 1>{}([&](auto j) {
                        move_on_dim_(i) &=
                            ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
                    });
                });

                return move_on_dim_;
            }
            ();

            // move
            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(move_on_dim[i])
                {
                    if constexpr(forward_sweep[i])
                    {
                        move_tensor_coordinate(
                            src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
                    }
                    else
                    {
                        move_tensor_coordinate(
                            src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
                    }
                }
            });
        });

        // move src coordinate back to slice origin (or not)
        if constexpr(SrcResetCoordinateAfterRun)
        {
            const auto src_reset_step =
                make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep());

            move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
        }
    }

    template <typename DstBuffer, typename DstStepHacks>
    __device__ void
    RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks)
    {
        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
                          DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");

        static_assert(
            is_same<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>::value,
            "wrong! SrcBuffer or DstBuffer data type is wrong");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        // dst scalar per access on each dim
        // TODO: don't use this
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

        constexpr auto dst_scalar_step_in_vector =
            generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});

        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;

        constexpr auto dst_dim_access_order = DstDimAccessOrder{};

        constexpr auto ordered_dst_access_lengths =
            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);

        // make forward steps
        const auto dst_forward_steps = generate_tuple(
            [&](auto i) {
                Index forward_step_idx;

                static_for<0, nDim, 1>{}([&](auto j) {
                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
                });

                return make_tensor_coordinate_step(
                    dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
            },
            Number<nDim>{});

        // make backward steps
        const auto dst_backward_steps = generate_tuple(
            [&](auto i) {
                Index backward_step_idx;

                static_for<0, nDim, 1>{}([&](auto j) {
                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
                });

                return make_tensor_coordinate_step(
                    dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
            },
            Number<nDim>{});

        // loop over tensor and copy
        static_ford<decltype(ordered_dst_access_lengths)>{}([&](auto ordered_dst_access_idx) {
            // judge move forward or move backward
            constexpr auto forward_sweep = [&]() {
                StaticallyIndexedArray<bool, nDim> forward_sweep_;

                forward_sweep_(I0) = true;

                static_for<1, nDim, 1>{}([&](auto i) {
                    index_t tmp = ordered_dst_access_idx[I0];

                    static_for<1, i, 1>{}([&](auto j) {
                        tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
                    });

                    forward_sweep_(i) = tmp % 2 == 0;
                });

                return forward_sweep_;
            }();

            // calculate dst data index
            constexpr auto dst_data_idx = [&]() {
                Index ordered_idx;

                static_for<0, nDim, 1>{}([&](auto i) {
                    ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
                                                      : ordered_dst_access_lengths[i] - 1 -
                                                            ordered_dst_access_idx[i];
                });

                return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
                       dst_scalar_per_access;
            }();

            vector_type_maker_t<DstData, DstScalarPerVector> dst_tmp_vector;

            // copy data from buffer_ to dst_tmp_vector
            static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);

                dst_tmp_vector.template AsType<DstData>()(i) =
                    type_convert<DstData>(buffer_[Number<buffer_offset>{}]);
            });

            using dst_vector_t = typename decltype(dst_tmp_vector)::type;

            // copy data from dst_tmp_vector to dst_buf
            const bool is_dst_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);

            dst_buf.template Set<dst_vector_t>(
                dst_coord_.GetOffset(),
                is_dst_valid,
                dst_tmp_vector.template AsType<dst_vector_t>()[Number<0>{}]);

            constexpr auto move_on_dim = [&]() constexpr
            {
                StaticallyIndexedArray<bool, nDim> move_on_dim_;

                static_for<0, nDim, 1>{}([&](auto i) {
                    move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;

                    static_for<i + 1, nDim, 1>{}([&](auto j) {
                        move_on_dim_(i) &=
                            ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
                    });
                });

                return move_on_dim_;
            }
            ();

            // move
            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(move_on_dim[i])
                {
                    if constexpr(forward_sweep[i])
                    {
                        move_tensor_coordinate(
                            dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
                    }
                    else
                    {
                        move_tensor_coordinate(
                            dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
                    }
                }
            });
        });

        // move dst coordinate back to slice origin (or not)
        if constexpr(DstResetCoordinateAfterRun)
        {
            const auto dst_reset_step =
                make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());

            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
        }
    }

    template <typename SrcBuffer>
    __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
    {
        constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform();

        constexpr auto zeros = typename uniform_sequence_gen<ntransform_src, 0>::type{};

        constexpr auto src_step_hacks =
            make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
                       generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));

        RunRead(src_desc, src_buf, src_step_hacks);
    }

    template <typename DstBuffer>
    __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf)
    {
        constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform();

        constexpr auto zeros = typename uniform_sequence_gen<ntransform_dst, 0>::type{};

        constexpr auto dst_step_hacks =
            make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
                       generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));

        RunWrite(dst_desc, dst_buf, dst_step_hacks);
    }

    __device__ static constexpr auto GetSrcCoordinateResetStep()
    {
        constexpr auto I0 = Number<0>{};

        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto src_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;

        constexpr auto src_dim_access_order = SrcDimAccessOrder{};

        constexpr auto ordered_src_access_lengths =
            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);

        // judge move forward or move backward during the last iteration
        constexpr auto forward_sweep = [&]() {
            StaticallyIndexedArray<bool, nDim> forward_sweep_;

            forward_sweep_(I0) = true;

            static_for<1, nDim, 1>{}([&](auto i) {
                index_t tmp = ordered_src_access_lengths[I0] - 1;

                static_for<1, i, 1>{}([&](auto j) {
                    tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
                });

                forward_sweep_(i) = tmp % 2 == 0;
            });

            return forward_sweep_;
        }();

        // calculate src data index after the last iteration in RunRead(), if it has not been
        // reset by RunRead()
        constexpr auto src_data_idx = [&]() {
            Index ordered_idx;

            static_for<0, nDim, 1>{}([&](auto i) {
                ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
            });

            return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
                   src_scalar_per_access;
        }();

        constexpr auto reset_src_data_step = [&]() {
            Index reset_src_data_step_;

            static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; });

            return reset_src_data_step_;
        }();

        return reset_src_data_step;
    }

    __device__ static constexpr auto GetDstCoordinateResetStep()
    {
        constexpr auto I0 = Number<0>{};

        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;

        constexpr auto dst_dim_access_order = DstDimAccessOrder{};

        constexpr auto ordered_dst_access_lengths =
            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);

        // judge move forward or move backward during the last iteration
        constexpr auto forward_sweep = [&]() {
            StaticallyIndexedArray<bool, nDim> forward_sweep_;

            forward_sweep_(I0) = true;

            static_for<1, nDim, 1>{}([&](auto i) {
                index_t tmp = ordered_dst_access_lengths[I0] - 1;

                static_for<1, i, 1>{}([&](auto j) {
                    tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
                });

                forward_sweep_(i) = tmp % 2 == 0;
            });

            return forward_sweep_;
        }();

        // calculate dst data index after the last iteration in RunWrite(), if it has not been
        // reset by RunWrite()
        constexpr auto dst_data_idx = [&]() {
            Index ordered_idx;

            static_for<0, nDim, 1>{}([&](auto i) {
                ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
            });

            return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
                   dst_scalar_per_access;
        }();

        constexpr auto reset_dst_data_step = [&]() {
            Index reset_dst_data_step_;

            static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });

            return reset_dst_data_step_;
        }();

        return reset_dst_data_step;
    }

    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                       const Index& src_slice_origin_step_idx)
    {
        // if src coord was not reset by RunRead(), then need to adjust the step here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);

        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }

    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    template <typename SrcMoveSliceWindowStepHack>
    __device__ void
    MoveSrcSliceWindow(const SrcDesc& src_desc,
                       const Index& src_slice_origin_step_idx,
                       const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
    {
        // if src coord was not reset by RunRead(), then need to adjust the step here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(
            src_desc, adjusted_step_idx, src_move_slice_window_step_hack);

        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }

    // dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by RunWrite(), then need to adjust the step here
        const auto adjusted_step_idx =
            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
                                       : dst_slice_origin_step_idx + GetDstCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);

        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
    }

    private:
    static constexpr auto buffer_desc_ =
        make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{}));

    static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();

    StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> buffer_;

    SrcCoord src_coord_;
    DstCoord dst_coord_;
};
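
// Illustrative usage sketch (not part of the library): staging a 2x4 tile from a
// run-time global source into LDS through the VGPR-resident thread buffer of
// ThreadwiseTensorSliceTransfer_v3. The descriptors, buffers, and index names
// (src_desc, dst_desc, src_buf, dst_buf, m0, n0) are assumptions for the example.
//
//   auto transfer = ThreadwiseTensorSliceTransfer_v3<Sequence<2, 4>, // SliceLengths
//                                                    InMemoryDataOperationEnum::Set,
//                                                    float,
//                                                    float,
//                                                    decltype(src_desc),
//                                                    decltype(dst_desc),
//                                                    Sequence<0, 1>, // SrcDimAccessOrder
//                                                    Sequence<0, 1>, // DstDimAccessOrder
//                                                    1,              // SrcVectorDim
//                                                    1,              // DstVectorDim
//                                                    4,              // SrcScalarPerVector
//                                                    4,              // DstScalarPerVector
//                                                    1,
//                                                    1,
//                                                    true,
//                                                    true>{
//       src_desc, make_multi_index(m0, n0), dst_desc, make_multi_index(0, 0)};
//
//   transfer.RunRead(src_desc, src_buf);   // global -> VGPR thread buffer
//   transfer.RunWrite(dst_desc, dst_buf);  // VGPR thread buffer -> LDS
//   transfer.MoveSrcSliceWindow(src_desc, make_multi_index(0, 4)); // advance window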

// Assume:
//   1. src:
//     1. SrcDesc is known at compile-time
//     2. SrcBuffer is DynamicBuffer
//     3. src_ref_idx is known at run-time
//     4. SrcRefToOriginDisplacement is known at compile-time
//     5. use #-step
//   2. dst:
//     1. DstDesc is known at compile-time
//     2. DstBuffer is StaticBuffer
//     3. DstOriginIdx is known at compile-time
//     4. use direct address calculation
//   3. vector access on src
template <typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t SrcVectorDim,
          index_t SrcScalarPerVector,
          index_t SrcScalarStrideInVector,
          typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                             bool>::type = false>
struct ThreadwiseTensorSliceTransfer_v4
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));

    using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));

    static constexpr index_t PackedSize = []() {
        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
            return 2;
        else
            return 1;
    }();
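
    // Explanatory note on the packing rule: with SrcData = pk_i4_t, each element
    // stores two 4-bit values, so PackedSize is 2; a vector then holds
    // SrcScalarPerVector / PackedSize elements and the dynamic-buffer offsets
    // below are divided by PackedSize. For any other SrcData the divisor is 1
    // and offsets are unchanged.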

    __device__ constexpr ThreadwiseTensorSliceTransfer_v4(const Index& src_ref_idx)
        : src_ref_coord_(make_tensor_coordinate(SrcDesc{}, src_ref_idx))
    {
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc and DstDesc need to be known at compile-time");

        static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
                      "wrong! Not divisible");

        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
        {
            static_assert(SrcScalarPerVector % PackedSize == 0, "pk data N cannot be 1");
        }
    }

    template <typename SrcRefToOriginDisplacement,
              typename DstOriginIdx,
              typename SrcBuffer,
              typename DstBuffer>
    __device__ void Run(const SrcDesc&,
                        const SrcRefToOriginDisplacement&,
                        const SrcBuffer& src_buf,
                        const DstDesc&,
                        const DstOriginIdx&,
                        DstBuffer& dst_buf) const
    {
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc and DstDesc need to be known at compile-time");

        static_assert(
            is_same<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>::value &&
                is_same<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>::value,
            "wrong! SrcBuffer or DstBuffer data type is wrong");

        static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer needs to be StaticBuffer");

        static_assert(is_known_at_compile_time<remove_cvref_t<SrcRefToOriginDisplacement>>::value &&
                          is_known_at_compile_time<remove_cvref_t<DstOriginIdx>>::value,
                      "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known "
                      "at compile-time");

        // SrcDesc and DstDesc are known at compile-time
        constexpr auto src_desc = remove_cvref_t<SrcDesc>{};
        constexpr auto dst_desc = remove_cvref_t<DstDesc>{};

        // SrcRefToOriginDisplacement and DstOriginIdx are known at compile-time
        constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{});
        constexpr auto dst_origin_idx             = to_multi_index(DstOriginIdx{});

        // scalar per access of each dim
        constexpr auto src_scalar_per_access = generate_sequence_v2(
            [&](auto i) constexpr {
                if constexpr(i == SrcVectorDim)
                {
                    return Number<SrcScalarPerVector>{};
                }
                else
                {
                    return Number<1>{};
                }
            },
            Number<nDim>{});

        // scalar step (if stepping on SrcVectorDim) of each dim
        constexpr auto src_scalar_step_in_vector = generate_sequence_v2(
            [&](auto i) constexpr {
                if constexpr(i == SrcVectorDim)
                {
                    return Number<1>{};
                }
                else
                {
                    return Number<0>{};
                }
            },
            Number<nDim>{});

        constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;

        constexpr auto dim_access_order = DimAccessOrder{};

        constexpr auto ordered_access_lengths =
            container_reorder_given_new2old(access_lengths, dim_access_order);

        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
#if 0
            // TODO: unable to compile
            // position in slice window
            constexpr auto data_to_origin_disp_idx =
                container_reorder_given_old2new(ordered_access_idx, dim_access_order) *
                src_scalar_per_access;
#else
            // position in slice window
            constexpr auto data_to_origin_disp_idx =
                ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
#endif
            // src coordinate
            constexpr auto src_ref_to_data_disp_idx =
                src_ref_to_origin_disp_idx + data_to_origin_disp_idx;

            constexpr auto src_ref_to_data_disp_coord_step =
                make_tensor_coordinate_step(src_desc, src_ref_to_data_disp_idx);

            auto src_data_coord = src_ref_coord_;

            move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step);

            vector_type_maker_t<SrcData, SrcScalarPerVector / PackedSize> src_tmp_vector;

            using src_vector_t = typename decltype(src_tmp_vector)::type;

            const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                src_desc, src_data_coord);

            // copy data from src_buf into src_tmp_vector
            if constexpr(SrcBuffer::IsDynamicBuffer())
            {
                src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                    src_buf.template Get<src_vector_t>(src_data_coord.GetOffset() / PackedSize,
                                                       is_src_valid);
            }
            else if constexpr(SrcBuffer::IsStaticBuffer())
            {
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t src_offset = src_desc.CalculateOffset(
                        src_ref_to_origin_disp_idx + data_to_origin_disp_idx +
                        i * src_scalar_step_in_vector);

                    src_tmp_vector.template AsType<SrcData>()(i) = src_buf[Number<src_offset>{}];
                });
            }

            if constexpr(is_same<remove_cvref_t<SrcData>, pk_i4_t>::value)
            {
                // copy data from src_tmp_vector to dst_tmp_vector (casting from SrcData to
                // DstData)
                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;

                constexpr index_t pack_size = 8;

                static_assert(SrcScalarPerVector % pack_size == 0, "");

                using src_v_t = typename vector_type_maker_t<SrcData, pack_size / PackedSize>::type;
                using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;

                static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
                    ck::tensor_operation::element_wise::PassThroughPack8{}(
                        dst_tmp_vector.template AsType<dst_v_t>()(i),
                        src_tmp_vector.template AsType<src_v_t>()[i]);
                });

                // copy data from dst_tmp_vector into dst_buf
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
                });
            }
            else if constexpr(is_same<remove_cvref_t<SrcData>, f8_t>::value &&
                              is_same<remove_cvref_t<DstData>, half_t>::value &&
                              SrcScalarPerVector % 2 == 0)
            {
                // copy data from src_tmp_vector to dst_tmp_vector (casting from SrcData to
                // DstData)
                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;

                constexpr index_t pack_size = 2;

                using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
                using src_v_t = typename vector_type_maker_t<SrcData, pack_size>::type;
                static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
                    ck::tensor_operation::element_wise::PassThroughPack2{}(
                        dst_tmp_vector.template AsType<dst_v_t>()(i),
                        src_tmp_vector.template AsType<src_v_t>()[i]);
                });

                // copy data from dst_tmp_vector into dst_buf
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
                });
            }
            else
            {
                // copy data from src_tmp_vector to dst_tmp_vector (casting from SrcData to
                // DstData)
                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;

                // TODO: if SrcData and DstData are vector types, then static_cast may not compile
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    dst_tmp_vector.template AsType<DstData>()(i) =
                        type_convert<DstData>(src_tmp_vector.template AsType<SrcData>()[i]);
                });

                // copy data from dst_tmp_vector into dst_buf
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
                });
            }
        });
    }

    // Fuse scale
    template <typename SrcRefToOriginDisplacement,
              typename DstOriginIdx,
              typename SrcBuffer,
              typename DstBuffer>
    __device__ void Run(const SrcDesc&,
                        const SrcRefToOriginDisplacement&,
                        const SrcBuffer& src_buf,
                        const DstData& scale,
                        const DstDesc&,
                        const DstOriginIdx&,
                        DstBuffer& dst_buf) const
    {
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc and DstDesc need to be known at compile-time");

        static_assert(
            is_same<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>::value &&
                is_same<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>::value,
            "wrong! SrcBuffer or DstBuffer data type is wrong");

        static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer needs to be StaticBuffer");

        static_assert(is_known_at_compile_time<remove_cvref_t<SrcRefToOriginDisplacement>>::value &&
                          is_known_at_compile_time<remove_cvref_t<DstOriginIdx>>::value,
                      "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known "
                      "at compile-time");

        // SrcDesc and DstDesc are known at compile-time
        constexpr auto src_desc = remove_cvref_t<SrcDesc>{};
        constexpr auto dst_desc = remove_cvref_t<DstDesc>{};

        // SrcRefToOriginDisplacement and DstOriginIdx are known at compile-time
        constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{});
        constexpr auto dst_origin_idx             = to_multi_index(DstOriginIdx{});

        // scalar per access of each dim
        constexpr auto src_scalar_per_access = generate_sequence_v2(
            [&](auto i) constexpr {
                if constexpr(i == SrcVectorDim)
                {
                    return Number<SrcScalarPerVector>{};
                }
                else
                {
                    return Number<1>{};
                }
            },
            Number<nDim>{});

        // scalar step (if stepping on SrcVectorDim) of each dim
        constexpr auto src_scalar_step_in_vector = generate_sequence_v2(
            [&](auto i) constexpr {
                if constexpr(i == SrcVectorDim)
                {
                    return Number<1>{};
                }
                else
                {
                    return Number<0>{};
                }
            },
            Number<nDim>{});

        constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;

        constexpr auto dim_access_order = DimAccessOrder{};

        constexpr auto ordered_access_lengths =
            container_reorder_given_new2old(access_lengths, dim_access_order);

        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
#if 0
            // TODO: unable to compile
            // position in slice window
            constexpr auto data_to_origin_disp_idx =
                container_reorder_given_old2new(ordered_access_idx, dim_access_order) *
                src_scalar_per_access;
#else
            // position in slice window
            constexpr auto data_to_origin_disp_idx =
                ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
#endif
            // src coordinate
            constexpr auto src_ref_to_data_disp_idx =
                src_ref_to_origin_disp_idx + data_to_origin_disp_idx;

            constexpr auto src_ref_to_data_disp_coord_step =
                make_tensor_coordinate_step(src_desc, src_ref_to_data_disp_idx);

            auto src_data_coord = src_ref_coord_;

            move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step);

            vector_type_maker_t<SrcData, SrcScalarPerVector / PackedSize> src_tmp_vector;

            using src_vector_t = typename decltype(src_tmp_vector)::type;

            const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                src_desc, src_data_coord);

            // copy data from src_buf into src_tmp_vector
            if constexpr(SrcBuffer::IsDynamicBuffer())
            {
                src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                    src_buf.template Get<src_vector_t>(src_data_coord.GetOffset() / PackedSize,
                                                       is_src_valid);
            }
            else if constexpr(SrcBuffer::IsStaticBuffer())
            {
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t src_offset = src_desc.CalculateOffset(
                        src_ref_to_origin_disp_idx + data_to_origin_disp_idx +
                        i * src_scalar_step_in_vector);

                    src_tmp_vector.template AsType<SrcData>()(i) = src_buf[Number<src_offset>{}];
                });
            }

            if constexpr(is_same<remove_cvref_t<SrcData>, pk_i4_t>::value)
            {
                // copy data from src_tmp_vector to dst_tmp_vector (casting from SrcData to
                // DstData) while applying the fused scale
                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;
                vector_type<DstData, 2> scale_vector;
                scale_vector.template AsType<DstData>()(Number<0>{}) = scale;
                scale_vector.template AsType<DstData>()(Number<1>{}) = scale;

                constexpr index_t pack_size = 8;

                static_assert(SrcScalarPerVector % pack_size == 0, "");

                using src_v_t = typename vector_type_maker_t<SrcData, pack_size / PackedSize>::type;
                using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
                using scale_v_t = typename vector_type_maker_t<DstData, 2>::type;

                static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
                    ck::tensor_operation::element_wise::DequantPack8{}(
                        dst_tmp_vector.template AsType<dst_v_t>()(i),
                        src_tmp_vector.template AsType<src_v_t>()[i],
                        scale_vector.template AsType<scale_v_t>()[Number<0>{}]);
                });

                // copy data from dst_tmp_vector into dst_buf
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
                });
            }
            else if constexpr(is_same<remove_cvref_t<SrcData>, f8_t>::value &&
                              is_same<remove_cvref_t<DstData>, half_t>::value &&
                              SrcScalarPerVector % 2 == 0)
            {
                // copy data from src_tmp_vector to dst_tmp_vector (casting from SrcData to
                // DstData)
                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;

                constexpr index_t pack_size = 2;

                using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
                using src_v_t = typename vector_type_maker_t<SrcData, pack_size>::type;
                static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
                    ck::tensor_operation::element_wise::PassThroughPack2{}(
                        dst_tmp_vector.template AsType<dst_v_t>()(i),
                        src_tmp_vector.template AsType<src_v_t>()[i]);
                });

                // copy data from dst_tmp_vector into dst_buf
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
                });
            }
            else
            {
                // copy data from src_tmp_vector to dst_tmp_vector (casting from SrcData to
                // DstData)
                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;

                // TODO: if SrcData and DstData are vector types, then static_cast may not compile
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    dst_tmp_vector.template AsType<DstData>()(i) =
                        type_convert<DstData>(src_tmp_vector.template AsType<SrcData>()[i]);
                });

                // copy data from dst_tmp_vector into dst_buf
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
                });
            }
        });
    }

    template <typename SrcSliceMoveStepIdx>
    __device__ void MoveSrcSliceWindow(const SrcDesc&,
                                       const SrcSliceMoveStepIdx& src_slice_move_step_idx)
    {
        constexpr auto src_desc = SrcDesc{};

        const auto src_slice_move_step_iter =
            make_tensor_coordinate_step(src_desc, to_multi_index(src_slice_move_step_idx));

        move_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter);
    }

    __device__ void SetSrcCoord(const Index& src_ref_idx)
    {
        src_ref_coord_ = make_tensor_coordinate(SrcDesc{}, src_ref_idx);
    }

    private:
    SrcCoord src_ref_coord_;
}; // struct ThreadwiseTensorSliceTransfer_v4
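
// Illustrative usage sketch (not part of the library): v4 is typically used to pull
// vectors from a compile-time-described shared (LDS) buffer into per-thread
// registers around an MFMA/WMMA inner loop. The descriptors and names (lds_desc,
// reg_desc, lds_buf, reg_buf, k0, lane_offset) are assumptions for the example.
//
//   auto transfer = ThreadwiseTensorSliceTransfer_v4<half_t,
//                                                    half_t,
//                                                    decltype(lds_desc), // compile-time
//                                                    decltype(reg_desc), // compile-time
//                                                    Sequence<1, 8>, // SliceLengths
//                                                    Sequence<0, 1>, // DimAccessOrder
//                                                    1,              // SrcVectorDim
//                                                    8,              // SrcScalarPerVector
//                                                    1>{make_multi_index(k0, lane_offset)};
//
//   // ref-to-origin displacement and dst origin are compile-time tuples
//   transfer.Run(lds_desc,
//                make_tuple(Number<0>{}, Number<0>{}),
//                lds_buf,
//                reg_desc,
//                make_tuple(Number<0>{}, Number<0>{}),
//                reg_buf);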

// Assume (comment reconstructed from the enable_if constraint and the asserts in
// Run() below):
//   1. SrcDesc and DstDesc are known at compile-time
//   2. SrcBuffer and DstBuffer are StaticBuffer
//   3. SrcSliceOriginIdx and DstSliceOriginIdx are known at compile-time
template <typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename ElementwiseOperation,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t DstVectorDim,
          index_t DstScalarPerVector,
          typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                             bool>::type = false>
struct ThreadwiseTensorSliceTransfer_StaticToStatic
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    __device__ constexpr ThreadwiseTensorSliceTransfer_StaticToStatic(
        const ElementwiseOperation& element_op)
        : element_op_{element_op}
    {
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! Desc needs to be known at compile-time");

        static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
                      "wrong! Not divisible");
    }

    template <typename SrcSliceOriginIdx,
              typename DstSliceOriginIdx,
              typename SrcBuffer,
              typename DstBuffer>
    __device__ void Run(const SrcDesc&,
                        const SrcSliceOriginIdx&,
                        const SrcBuffer& src_buf,
                        const DstDesc&,
                        const DstSliceOriginIdx&,
                        DstBuffer& dst_buf)
    {
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! Desc needs to be known at compile-time");

        static_assert(is_known_at_compile_time<remove_cvref_t<SrcSliceOriginIdx>>::value &&
                          is_known_at_compile_time<remove_cvref_t<DstSliceOriginIdx>>::value,
                      "wrong! SliceOrigin needs to be known at compile-time");

        static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
                      "wrong! Buffer needs to be StaticBuffer");

        // SrcDesc and src_slice_origin_idx are known at compile-time
        constexpr auto src_desc             = remove_cvref_t<SrcDesc>{};
        constexpr auto dst_desc             = remove_cvref_t<DstDesc>{};
        constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
        constexpr auto dst_slice_origin_idx = to_multi_index(DstSliceOriginIdx{});

        // scalar per access on each dim
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

        constexpr auto dst_scalar_step_in_vector =
            generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});

        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                    DimAccessOrder,
                                                    remove_cv_t<decltype(dst_scalar_per_access)>>;

        static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector,
                      "wrong! DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");

        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();

        static_for<0, num_access, 1>{}([&](auto idx_1d) {
            constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);

            // copy data from src_buf into dst_buf
            static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
                constexpr index_t src_offset = src_desc.CalculateOffset(
                    src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);

                constexpr index_t dst_offset = dst_desc.CalculateOffset(
                    dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);

                DstData v;

                // apply element-wise operation
                element_op_(v, src_buf[Number<src_offset>{}]);

                // apply type convert
                dst_buf(Number<dst_offset>{}) = v;
            });
        });
    }

    ElementwiseOperation element_op_;
};
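
// Illustrative usage sketch (not part of the library): converting a register tile
// in place, e.g. float accumulators to half_t for the epilogue. Both descriptors
// are compile-time; acc_desc, out_desc, acc_buf, and out_buf are assumed names.
//
//   auto transfer = ThreadwiseTensorSliceTransfer_StaticToStatic<
//       float, half_t, decltype(acc_desc), decltype(out_desc),
//       tensor_operation::element_wise::PassThrough,
//       Sequence<1, 8>, Sequence<0, 1>, 1, 8>{
//       tensor_operation::element_wise::PassThrough{}};
//
//   transfer.Run(acc_desc, make_tuple(Number<0>{}, Number<0>{}), acc_buf,
//                out_desc, make_tuple(Number<0>{}, Number<0>{}), out_buf);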

// Specialized for gfx11
// A single Wave32 is composed of two rows, and data exchange is allowed between
// these two rows. The Dst buf of this RowLane is filled from two Src bufs:
// SrcA: the thread buffer held by this RowLane on this row
// SrcB: the thread buffer held by this RowLane on the other row
template <typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename ElementwiseOperation,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t DstVectorDim,
          index_t DstScalarPerVector,
          uint32_t LowEightRowlaneIdx,
          uint32_t HighEightRowLaneIdx,
          bool IntraRowSwizzlePerm,
          typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                             bool>::type = false>
struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    __device__ constexpr ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow(const Index& src_idx)
    {
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! Desc needs to be known at compile-time");

        static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
                      "wrong! Not divisible");
        ignore = src_idx;
    }

    template <typename SrcSliceOriginIdx,
              typename DstSliceOriginIdx,
              typename SrcBuffer,
              typename DstBuffer>
    __device__ void Run(const SrcDesc&,
                        const SrcSliceOriginIdx&,
                        const SrcBuffer& src_buf,
                        const DstDesc&,
                        const DstSliceOriginIdx&,
                        DstBuffer& dst_buf) const
    {
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! Desc needs to be known at compile-time");

        static_assert(is_known_at_compile_time<remove_cvref_t<SrcSliceOriginIdx>>::value &&
                          is_known_at_compile_time<remove_cvref_t<DstSliceOriginIdx>>::value,
                      "wrong! SliceOrigin needs to be known at compile-time");

        static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
                      "wrong! Buffer needs to be StaticBuffer");

        // SrcDesc and src_slice_origin_idx are known at compile-time
        constexpr auto src_desc             = remove_cvref_t<SrcDesc>{};
        constexpr auto dst_desc             = remove_cvref_t<DstDesc>{};
        constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
        constexpr auto dst_slice_origin_idx = to_multi_index(DstSliceOriginIdx{});

        // scalar per access on each dim
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

        constexpr auto dst_scalar_step_in_vector =
            generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});

        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                    DimAccessOrder,
                                                    remove_cv_t<decltype(dst_scalar_per_access)>>;

        static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector,
                      "wrong! DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");

        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();

        static_for<0, num_access, 1>{}([&](auto idx_1d) {
            constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);

            // copy data from src_buf into dst_buf
            static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
                // src_desc error, non constexpr, caused by merge transform
                constexpr index_t src_offset = src_desc.CalculateOffset(
                    src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);

                constexpr index_t dst_offset = dst_desc.CalculateOffset(
                    dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);

                SrcData v_this_row, v_theother_row;
                // int type temp value due to intrinsic requirement
                int temp = 0;

                // apply element-wise operation
                element_op_(v_this_row, src_buf[Number<src_offset>{}]);

                // apply intra-row permute
                if constexpr(IntraRowSwizzlePerm)
                {
                    temp = __builtin_amdgcn_permlane16(
                        temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
                    v_this_row = type_convert_sp<SrcData>(temp);
                }

                // apply inter-row permute
                temp = __builtin_amdgcn_permlanex16(temp,
                                                    type_convert_sp<int>(v_this_row),
                                                    LowEightRowlaneIdx,
                                                    HighEightRowLaneIdx,
                                                    1,
                                                    0);
                v_theother_row = type_convert_sp<SrcData>(temp);

                if(get_thread_local_1d_id() % 32 < 16)
                {
                    // apply type convert
                    dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_this_row);
                    dst_buf(Number<dst_offset + DstScalarPerVector>{}) =
                        type_convert_sp<DstData>(v_theother_row);
                }
                else
                {
                    // apply type convert
                    dst_buf(Number<dst_offset + DstScalarPerVector>{}) =
                        type_convert_sp<DstData>(v_this_row);
                    dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_theother_row);
                }
            });
        });
    }

    ElementwiseOperation element_op_{};
};
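
// Note on the swizzle constants (explanatory, based on the ISA semantics of
// v_permlane16/v_permlanex16): each nibble of the two 32-bit selects picks the
// source lane for one of the 16 lanes in a row, lane 0 in the low nibble of the
// first operand. 0xb3a29180 / 0xf7e6d5c4 therefore reads lanes
// 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15, i.e. it interleaves the two 8-lane
// halves of a row; permlanex16 with LowEightRowlaneIdx / HighEightRowLaneIdx
// performs the same kind of selection but sourced from the paired row of the
// Wave32.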
1681 
1682 // Specialized for gfx12
1683 template <typename SrcData,
1684  typename DstData,
1685  typename SrcDesc,
1686  typename DstDesc,
1687  typename ElementwiseOperation,
1688  typename SliceLengths,
1689  typename DimAccessOrder,
1690  index_t DstVectorDim,
1691  index_t DstScalarPerVector,
1692  bool IntraRowSwizzlePerm,
1693  typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1694  bool>::type = false>
1696 {
1697  static constexpr index_t nDim = SliceLengths::Size();
1698 
1700 
1702  {
1703  static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1704  "wrong! Desc need to known at compile-time");
1705 
1706  static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
1707  "wrong! Not divisible");
1708  ignore = src_idx;
1709  }
1710 
1711  template <typename SrcSliceOriginIdx,
1712  typename DstSliceOriginIdx,
1713  typename SrcBuffer,
1714  typename DstBuffer>
1715  __device__ void Run(const SrcDesc&,
1716  const SrcSliceOriginIdx&,
1717  const SrcBuffer& src_buf,
1718  const DstDesc&,
1719  const DstSliceOriginIdx&,
1720  DstBuffer& dst_buf) const
1721  {
1722  static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1723  "wrong! Desc need to known at compile-time");
1724 
1727  "wrong! SliceOrigin need to known at compile-time");
1728 
1729  static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
1730  "wrong! Buffer need to be StaticBuffer");
1731 
1732  // SrcDesc and src_slice_origin_idx are known at compile-time
1733  constexpr auto src_desc = remove_cvref_t<SrcDesc>{};
1734  constexpr auto dst_desc = remove_cvref_t<DstDesc>{};
1735  constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
1736  constexpr auto dst_slice_origin_idx = to_multi_index(DstSliceOriginIdx{});
1737 
1738  // scalar per access on each dim
1739  constexpr auto dst_scalar_per_access = generate_sequence(
1740  detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
1741 
1742  constexpr auto dst_scalar_step_in_vector =
1743  generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
1744 
1745  using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
1746  DimAccessOrder,
1747  remove_cv_t<decltype(dst_scalar_per_access)>>;
1748 
1749  static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector,
1750  "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
1751 
1752  constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
1753 
1754  static_for<0, num_access, 1>{}([&](auto idx_1d) {
1755  constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);
1756 
1757  // copy data from src_buf into dst_buf
1758  static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
1759  // note: src_offset must be constexpr; a merge transform in src_desc makes CalculateOffset non-constexpr and fails here
1760  constexpr index_t src_offset = src_desc.CalculateOffset(
1761  src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1762 
1763  constexpr index_t dst_offset = dst_desc.CalculateOffset(
1764  dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1765 
1766  SrcData v_this_row;
1767  // the permlane intrinsics operate on 32-bit ints, so stage the value in an int temporary
1768  int temp = 0;
1769 
1770  // apply element-wise operation
1771  element_op_(v_this_row, src_buf[Number<src_offset>{}]);
1772 
1773  // apply intra-row permute.
1774  if constexpr(IntraRowSwizzlePerm)
1775  {
1776  temp = __builtin_amdgcn_permlane16(
1777  temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
1778  v_this_row = type_convert_sp<SrcData>(temp);
1779  }
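// Same selector decoding as in the InterRow variant above; this gfx12 path
// stops after the intra-row swizzle and writes straight out.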
1780 
1781  // apply type convert
1782  dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_this_row);
1783  });
1784  });
1785  }
1786  ElementwiseOperation element_op_{};
1787 };
1788 
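// Usage sketch (hypothetical: the descriptor, buffers and indices below are
// invented for illustration and are not defined in this file):
//
//   constexpr auto desc = make_naive_tensor_descriptor_packed(make_tuple(Number<8>{}));
//   using Transfer = ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow<
//       half_t, half_t, decltype(desc), decltype(desc),
//       tensor_operation::element_wise::PassThrough,
//       Sequence<8>,      // SliceLengths
//       Sequence<0>,      // DimAccessOrder
//       0,                // DstVectorDim
//       8,                // DstScalarPerVector
//       false>;           // IntraRowSwizzlePerm
//   Transfer transfer{make_multi_index(0)};
//   transfer.Run(desc, make_tuple(Number<0>{}), src_thread_buf,
//                desc, make_tuple(Number<0>{}), dst_thread_buf);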
1789 } // namespace ck