Composable Kernel: include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_space_filling_curve.hpp"

namespace ck {
// Assume:
// 1. src:
//    1. SrcDesc is known at compile-time
//    2. SrcBuffer is StaticBuffer
//    3. SrcSliceOriginIdx is known at compile-time
// 2. dst:
//    1. DstDesc is not known at compile-time
//    2. DstBuffer is DynamicBuffer
//    3. DstSliceOriginIdx is not known at compile-time
template <typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename ElementwiseOperation,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t DstVectorDim,
          index_t DstScalarPerVector,
          InMemoryDataOperationEnum DstInMemOp,
          index_t DstScalarStrideInVector,
          bool DstResetCoordinateAfterRun,
          typename enable_if<SrcDesc::IsKnownAtCompileTime(), bool>::type = false>
struct ThreadwiseTensorSliceTransfer_v1r3
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));

    using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));

    __device__ constexpr ThreadwiseTensorSliceTransfer_v1r3(const DstDesc& dst_desc,
                                                            const Index& dst_slice_origin_idx,
                                                            const ElementwiseOperation& element_op)
        : dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin_idx)),
          element_op_{element_op}
    {
        static_assert(SrcDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc needs to be known at compile-time");
        static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
                      "wrong! Not divisible");
    }

    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
    {
        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
    }

    template <typename SrcSliceOriginIdx, typename SrcBuffer, typename DstBuffer>
    __device__ void Run(const SrcDesc&,
                        const SrcSliceOriginIdx&,
                        const SrcBuffer& src_buf,
                        const DstDesc& dst_desc,
                        DstBuffer& dst_buf)
    {
        static_assert(SrcDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc needs to be known at compile-time");

        static_assert(is_known_at_compile_time<remove_cvref_t<SrcSliceOriginIdx>>::value,
                      "wrong! SrcSliceOrigin needs to be known at compile-time");

        static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer needs to be a StaticBuffer");

        // SrcDesc and src_slice_origin_idx are known at compile-time
        constexpr auto src_desc             = remove_cvref_t<SrcDesc>{};
        constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});

        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

        constexpr auto dst_scalar_step_in_vector =
            generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});

        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                    DimAccessOrder,
                                                    remove_cv_t<decltype(dst_scalar_per_access)>>;

        // TODO: use SpaceFillingCurve::ScalarsPerAccess instead of DstScalarPerVector?
        static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector,
                      "wrong! DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");

        typename vector_type_maker<DstData, DstScalarPerVector>::type dst_vector;
        using dst_vector_t = typename vector_type_maker<DstData, DstScalarPerVector>::type::type;

        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();

        static_for<0, num_access, 1>{}([&](auto idx_1d) {
            constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);

            // copy data from src_buf into dst_vector
            // TODO: it's a hack to use dst_scalar_step_in_vector here; use SpaceFillingCurve?
            static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
                constexpr index_t src_offset = src_desc.CalculateOffset(
                    src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);

                DstData v;

                // apply element-wise operation
                element_op_(v, src_buf[Number<src_offset>{}]);

                dst_vector.template AsType<DstData>()(i) = v;
            });

            const bool is_dst_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);

            // copy data from dst_vector into dst_buf
            dst_buf.template Update<DstInMemOp, dst_vector_t>(
                dst_coord_.GetOffset(),
                is_dst_valid,
                dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);

            if constexpr(idx_1d.value != num_access - 1)
            {
                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);

                move_tensor_coordinate(
                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
            }
        });

        // move dst coordinate back to slice origin (or not)
        if constexpr(DstResetCoordinateAfterRun)
        {
            const auto dst_reset_step =
                make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());

            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
        }
    }

    __device__ static constexpr auto GetDstCoordinateResetStep()
    {
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                    DimAccessOrder,
                                                    remove_cv_t<decltype(dst_scalar_per_access)>>;

        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
        if constexpr(num_access == 0)
        {
            return typename SpaceFillingCurve::Index{};
        }
        else
        {
            constexpr auto reset_step =
                SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});

            return reset_step;
        }
    }

    // dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by Run(), then adjust the step here
        const auto adjusted_step_idx =
            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
                                       : dst_slice_origin_step_idx + GetDstCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);

        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
    }

    private:
    DstCoord dst_coord_;
    const ElementwiseOperation element_op_;
}; // struct ThreadwiseTensorSliceTransfer_v1r3
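
// Illustrative sketch, not part of the original header: one plausible way for a thread to
// use ThreadwiseTensorSliceTransfer_v1r3 to store a 4-element register slice to global
// memory. All names, shapes and template values below are hypothetical, and PassThrough is
// assumed to be available from the element-wise operation headers.
__device__ void example_v1r3_store(float* p_dst)
{
    // compile-time source: 4 floats held in registers
    constexpr auto src_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<4>{}));
    // run-time destination: 1D view of 256 floats in global memory
    const auto dst_desc = make_naive_tensor_descriptor_packed(make_tuple(256));

    StaticBuffer<AddressSpaceEnum::Vgpr, float, 4, true> src_buf;
    auto dst_buf =
        make_dynamic_buffer<AddressSpaceEnum::Global>(p_dst, dst_desc.GetElementSpaceSize());

    auto transfer =
        ThreadwiseTensorSliceTransfer_v1r3<float,
                                           float,
                                           decltype(src_desc),
                                           decltype(dst_desc),
                                           tensor_operation::element_wise::PassThrough,
                                           Sequence<4>, // SliceLengths
                                           Sequence<0>, // DimAccessOrder
                                           0,           // DstVectorDim
                                           4,           // DstScalarPerVector
                                           InMemoryDataOperationEnum::Set,
                                           1,    // DstScalarStrideInVector
                                           true> // DstResetCoordinateAfterRun
        {dst_desc, make_multi_index(0), tensor_operation::element_wise::PassThrough{}};

    // the source slice origin must be a compile-time index, e.g. a tuple of Number<>
    transfer.Run(src_desc, make_tuple(Number<0>{}), src_buf, dst_desc, dst_buf);
}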

template <typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t SrcVectorDim,
          index_t SrcScalarPerVector,
          index_t SrcScalarStrideInVector,
          bool SrcResetCoordinateAfterRun,
          bool InvalidElementAsNaN = false,
          typename enable_if<DstDesc::IsKnownAtCompileTime(), bool>::type = false>
struct ThreadwiseTensorSliceTransfer_v2
{
    static_assert((InvalidElementAsNaN && !ck::is_integral<DstData>::value) ||
                      (!InvalidElementAsNaN),
                  "Filling invalid elements with NaN is only allowed for floating point types");

    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));

    using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));

    static constexpr index_t PackedSize = []() {
        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
            return 2;
        else
            return 1;
    }();

    __device__ constexpr ThreadwiseTensorSliceTransfer_v2(const SrcDesc& src_desc,
                                                          const Index& src_slice_origin_idx)
        : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin_idx))
    {
        static_assert(DstDesc::IsKnownAtCompileTime(),
                      "wrong! DstDesc needs to be known at compile-time");
        static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
                      "wrong! Not divisible");

        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t> ||
                     is_same_v<remove_cvref_t<SrcData>, f4x2_pk_t>)
        {
            static_assert(SrcScalarPerVector % PackedSize == 0, "pk data N cannot be 1");
        }
    }

    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
    {
        src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
    }

    template <typename SrcBuffer, typename DstBuffer, typename DstSliceOriginIdx>
    __device__ void Run(const SrcDesc& src_desc,
                        const SrcBuffer& src_buf,
                        const DstDesc&,
                        const DstSliceOriginIdx&,
                        DstBuffer& dst_buf)
    {
        static_assert(DstDesc::IsKnownAtCompileTime(),
                      "wrong! DstDesc needs to be known at compile-time");

        static_assert(is_known_at_compile_time<remove_cvref_t<DstSliceOriginIdx>>::value,
                      "wrong! DstSliceOrigin needs to be known at compile-time");

        static_assert(
            is_same<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>::value,
            "wrong! inconsistent type");

        // DstDesc and dst_slice_origin_idx are known at compile-time
        constexpr auto dst_desc             = remove_cvref_t<DstDesc>{};
        constexpr auto dst_slice_origin_idx = DstSliceOriginIdx{};

        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto src_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

        constexpr auto src_scalar_step_in_vector =
            generate_sequence(detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});

        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                    DimAccessOrder,
                                                    remove_cv_t<decltype(src_scalar_per_access)>>;

        // loop over tensor and copy
        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();

        static_for<0, num_access, 1>{}([&](auto idx_1d) {
            typename vector_type_maker<SrcData, SrcScalarPerVector / PackedSize>::type src_vector;

            using src_vector_t =
                typename vector_type_maker<SrcData, SrcScalarPerVector / PackedSize>::type::type;
            constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d);

            const bool is_src_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);

            // copy data from src_buf into src_vector
            src_vector.template AsType<src_vector_t>()(Number<0>{}) =
                src_buf.template Get<src_vector_t>(src_coord_.GetOffset() / PackedSize,
                                                   is_src_valid);

            // copy data from src_vector into dst_buf
            static_for<0, SrcScalarPerVector / PackedSize, 1>{}([&](auto i) {
                constexpr index_t dst_offset =
                    dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx +
                                             i * src_scalar_step_in_vector);

                if constexpr(InvalidElementAsNaN)
                {
                    dst_buf(Number<dst_offset>{}) =
                        is_src_valid
                            ? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
                            : NumericLimits<DstData>::QuietNaN();
                }
                else
                {
                    dst_buf(Number<dst_offset>{}) =
                        type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
                }
            });

            if constexpr(idx_1d.value != num_access - 1)
            {
                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);

                move_tensor_coordinate(
                    src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
            }
        });

        // move src coordinate back to slice origin (or not)
        if constexpr(SrcResetCoordinateAfterRun)
        {
            const auto src_reset_step =
                make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep());

            move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
        }
    }

    __device__ static constexpr auto GetSrcCoordinateResetStep()
    {
        constexpr auto src_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                    DimAccessOrder,
                                                    remove_cv_t<decltype(src_scalar_per_access)>>;

        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
        if constexpr(num_access == 0)
        {
            return typename SpaceFillingCurve::Index{};
        }
        else
        {
            constexpr auto reset_step =
                SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});

            return reset_step;
        }
    }

    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                       const Index& src_slice_origin_step_idx)
    {
        // if src coord was not reset by Run(), then adjust the step here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);

        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }

    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    template <typename SrcMoveSliceWindowStepHack>
    __device__ void
    MoveSrcSliceWindow(const SrcDesc& src_desc,
                       const Index& src_slice_origin_step_idx,
                       const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
    {
        // if src coord was not reset by Run(), then adjust the step here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(
            src_desc, adjusted_step_idx, src_move_slice_window_step_hack);

        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }

    private:
    SrcCoord src_coord_;
}; // struct ThreadwiseTensorSliceTransfer_v2
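
// Illustrative sketch, not part of the original header: a thread loading a 4-element slice
// from global memory into a static (register) buffer with ThreadwiseTensorSliceTransfer_v2,
// then advancing its window. All names, shapes and strides are hypothetical.
__device__ void example_v2_load(const float* p_src)
{
    const auto src_desc     = make_naive_tensor_descriptor_packed(make_tuple(256));
    constexpr auto dst_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<4>{}));

    const auto src_buf =
        make_dynamic_buffer<AddressSpaceEnum::Global>(p_src, src_desc.GetElementSpaceSize());
    StaticBuffer<AddressSpaceEnum::Vgpr, float, 4, true> dst_buf;

    auto transfer = ThreadwiseTensorSliceTransfer_v2<float,
                                                     float,
                                                     decltype(src_desc),
                                                     decltype(dst_desc),
                                                     Sequence<4>, // SliceLengths
                                                     Sequence<0>, // DimAccessOrder
                                                     0,           // SrcVectorDim
                                                     4,           // SrcScalarPerVector
                                                     1,           // SrcScalarStrideInVector
                                                     true>        // SrcResetCoordinateAfterRun
        {src_desc, make_multi_index(get_thread_local_1d_id() * 4)};

    // dst slice origin is a compile-time index; dst lives in registers
    transfer.Run(src_desc, src_buf, dst_desc, make_tuple(Number<0>{}), dst_buf);

    // move the source window forward for the next iteration
    transfer.MoveSrcSliceWindow(src_desc, make_multi_index(64));
}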

template <typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t SrcVectorDim,
          index_t SrcScalarPerVector,
          index_t SrcScalarStrideInVector,
          bool SrcResetCoordinateAfterRun,
          index_t scale_gather_num,
          bool InvalidElementAsNaN = false,
          typename enable_if<DstDesc::IsKnownAtCompileTime(), bool>::type = false>
struct ThreadwiseTensorSliceTransfer_v2_gather
{
    static_assert((InvalidElementAsNaN && !ck::is_integral<DstData>::value) ||
                      (!InvalidElementAsNaN),
                  "Filling invalid elements with NaN is only allowed for floating point types");

    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));

    using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));

    static constexpr index_t PackedSize = []() {
        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
            return 2;
        else
            return 1;
    }();

    __device__ constexpr ThreadwiseTensorSliceTransfer_v2_gather(
        const SrcDesc& src_desc,
        const Index& src_slice_origin_idx,
        const StaticallyIndexedArray<index_t, scale_gather_num>& scale_gather_offsets)
        : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin_idx)),
          scale_gather_offsets_(scale_gather_offsets)
    {
        static_assert(DstDesc::IsKnownAtCompileTime(),
                      "wrong! DstDesc needs to be known at compile-time");
        static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
                      "wrong! Not divisible");

        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
        {
            static_assert(SrcScalarPerVector % PackedSize == 0, "pk data N cannot be 1");
        }
    }

    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
    {
        // dim 0 is addressed through scale_gather_offsets_, so its origin component is zeroed
        auto adjusted_origin_idx = [&]() {
            Index idx;

            static_for<0, nDim, 1>{}(
                [&](auto i) { idx(i) = i.value == 0 ? 0 : src_slice_origin_idx[Number<i>{}]; });

            return idx;
        }();

        src_coord_ = make_tensor_coordinate(src_desc, adjusted_origin_idx);
    }

    template <typename SrcBuffer, typename DstBuffer, typename DstSliceOriginIdx>
    __device__ void Run(const SrcDesc& src_desc,
                        const SrcBuffer& src_buf,
                        const DstDesc&,
                        const DstSliceOriginIdx&,
                        DstBuffer& dst_buf)
    {
        static_assert(DstDesc::IsKnownAtCompileTime(),
                      "wrong! DstDesc needs to be known at compile-time");

        static_assert(is_known_at_compile_time<remove_cvref_t<DstSliceOriginIdx>>::value,
                      "wrong! DstSliceOrigin needs to be known at compile-time");

        static_assert(
            is_same<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>::value,
            "wrong! inconsistent type");

        // DstDesc and dst_slice_origin_idx are known at compile-time
        constexpr auto dst_desc             = remove_cvref_t<DstDesc>{};
        constexpr auto dst_slice_origin_idx = DstSliceOriginIdx{};

        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto src_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

        constexpr auto src_scalar_step_in_vector =
            generate_sequence(detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});

        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                    DimAccessOrder,
                                                    remove_cv_t<decltype(src_scalar_per_access)>>;

        // loop over tensor and copy
        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();

        static_for<0, scale_gather_num, 1>{}([&](auto gather_idx) {
            constexpr auto current_dst_origin =
                to_multi_index(dst_slice_origin_idx) + make_multi_index(gather_idx, 0);

            static_for<0, num_access, 1>{}([&](auto idx_1d) {
                typename vector_type_maker<SrcData, SrcScalarPerVector / PackedSize>::type
                    src_vector;

                using src_vector_t =
                    typename vector_type_maker<SrcData,
                                               SrcScalarPerVector / PackedSize>::type::type;
                constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d);

                const bool is_src_valid =
                    coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc,
                                                                                src_coord_);

                // copy data from src_buf into src_vector, shifted by this row's gather offset
                src_vector.template AsType<src_vector_t>()(Number<0>{}) =
                    src_buf.template Get<src_vector_t>(src_coord_.GetOffset() / PackedSize +
                                                           scale_gather_offsets_(gather_idx),
                                                       is_src_valid);

                // copy data from src_vector into dst_buf
                static_for<0, SrcScalarPerVector / PackedSize, 1>{}([&](auto i) {
                    constexpr index_t dst_offset =
                        dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) +
                                                 src_data_idx + i * src_scalar_step_in_vector);
                    constexpr auto full_dst_offset =
                        dst_desc.CalculateOffset(current_dst_origin) + dst_offset;

                    if constexpr(InvalidElementAsNaN)
                    {
                        dst_buf(Number<full_dst_offset>{}) =
                            is_src_valid
                                ? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
                                : NumericLimits<DstData>::QuietNaN();
                    }
                    else
                    {
                        dst_buf(Number<full_dst_offset>{}) =
                            type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
                    }
                });

                if constexpr(idx_1d.value != num_access - 1)
                {
                    constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);

                    move_tensor_coordinate(
                        src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
                }
            });
        });

        // move src coordinate back to slice origin (or not)
        if constexpr(SrcResetCoordinateAfterRun)
        {
            const auto src_reset_step =
                make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep());

            move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
        }
    }

    __device__ static constexpr auto GetSrcCoordinateResetStep()
    {
        constexpr auto src_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                    DimAccessOrder,
                                                    remove_cv_t<decltype(src_scalar_per_access)>>;

        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
        if constexpr(num_access == 0)
        {
            return typename SpaceFillingCurve::Index{};
        }
        else
        {
            constexpr auto reset_step =
                SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});

            return reset_step;
        }
    }

    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                       const Index& src_slice_origin_step_idx)
    {
        // if src coord was not reset by Run(), then adjust the step here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);

        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }

    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    template <typename SrcMoveSliceWindowStepHack>
    __device__ void
    MoveSrcSliceWindow(const SrcDesc& src_desc,
                       const Index& src_slice_origin_step_idx,
                       const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
    {
        // if src coord was not reset by Run(), then adjust the step here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(
            src_desc, adjusted_step_idx, src_move_slice_window_step_hack);

        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }

    private:
    SrcCoord src_coord_;
    StaticallyIndexedArray<index_t, scale_gather_num> scale_gather_offsets_;
}; // struct ThreadwiseTensorSliceTransfer_v2_gather
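
// Illustrative sketch, not part of the original header: building the per-gather offsets
// consumed by the constructor above. The gather count (4) and the names row_ids and
// scale_row_stride are hypothetical.
__device__ void example_make_scale_gather_offsets(const index_t* row_ids,
                                                  index_t scale_row_stride)
{
    StaticallyIndexedArray<index_t, 4> scale_gather_offsets;

    // one element offset per gathered row; Run() adds scale_gather_offsets_(gather_idx)
    // to the source coordinate before each vector load
    static_for<0, 4, 1>{}(
        [&](auto i) { scale_gather_offsets(i) = row_ids[i.value] * scale_row_stride; });

    // scale_gather_offsets would then be passed to the ThreadwiseTensorSliceTransfer_v2_gather
    // constructor together with the source descriptor and slice origin.
}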

// Assume:
// 1. src_desc and dst_desc are not known at compile-time
// 2. SrcBuffer and DstBuffer are DynamicBuffer
// 3. src_slice_origin and dst_slice_origin are not known at compile-time
// 4. use thread buffer
template <typename SliceLengths,
          InMemoryDataOperationEnum DstInMemOp,
          typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorDim,
          index_t DstVectorDim,
          index_t SrcScalarPerVector,
          index_t DstScalarPerVector,
          index_t SrcScalarStrideInVector,
          index_t DstScalarStrideInVector,
          bool SrcResetCoordinateAfterRun, // whether to move the src coordinate back after each
                                           // RunRead(); fused with MoveSrcSliceWindow to save
                                           // address computation
          bool DstResetCoordinateAfterRun> // whether to move the dst coordinate back after each
                                           // RunWrite(); fused with MoveDstSliceWindow to save
                                           // address computation
struct ThreadwiseTensorSliceTransfer_v3
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
    using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));

    using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
    using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));

    __device__ constexpr ThreadwiseTensorSliceTransfer_v3(const SrcDesc& src_desc,
                                                          const Index& src_slice_origin,
                                                          const DstDesc& dst_desc,
                                                          const Index& dst_slice_origin)
        : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
          dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin))
    {
        static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
                      "wrong! Not divisible");
        static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
                      "wrong! Not divisible");
    }

    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
    {
        src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
    }

    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
    {
        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
    }

    template <typename SrcBuffer, typename SrcStepHacks>
    __device__ void
    RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
    {
        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");

        static_assert(
            is_same<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>::value,
            "wrong! SrcBuffer and SrcData data types are inconsistent");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto src_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

        constexpr auto src_scalar_step_in_vector =
            generate_sequence(detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});

        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;

        constexpr auto src_dim_access_order = SrcDimAccessOrder{};

        constexpr auto ordered_src_access_lengths =
            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);

        // make forward steps
        const auto src_forward_steps = generate_tuple(
            [&](auto i) {
                Index forward_step_idx;

                static_for<0, nDim, 1>{}([&](auto j) {
                    forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
                });

                return make_tensor_coordinate_step(
                    src_desc, forward_step_idx, src_step_hacks[I0][i]);
            },
            Number<nDim>{});

        // make backward steps
        const auto src_backward_steps = generate_tuple(
            [&](auto i) {
                Index backward_step_idx;

                static_for<0, nDim, 1>{}([&](auto j) {
                    backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
                });

                return make_tensor_coordinate_step(
                    src_desc, backward_step_idx, src_step_hacks[I1][i]);
            },
            Number<nDim>{});

        // loop over tensor and copy
        static_ford<decltype(ordered_src_access_lengths)>{}([&](auto ordered_src_access_idx) {
            // judge whether to move forward or backward on each dim (snake-like traversal)
            constexpr auto forward_sweep = [&]() {
                StaticallyIndexedArray<bool, nDim> forward_sweep_;

                forward_sweep_(I0) = true;

                static_for<1, nDim, 1>{}([&](auto i) {
                    index_t tmp = ordered_src_access_idx[I0];

                    static_for<1, i, 1>{}([&](auto j) {
                        tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
                    });

                    forward_sweep_(i) = tmp % 2 == 0;
                });

                return forward_sweep_;
            }();

            // calculate src data index
            constexpr auto src_data_idx = [&]() {
                Index ordered_idx;

                static_for<0, nDim, 1>{}([&](auto i) {
                    ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
                                                      : ordered_src_access_lengths[i] - 1 -
                                                            ordered_src_access_idx[i];
                });

                return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
                       src_scalar_per_access;
            }();

            vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;

            using src_vector_t = typename decltype(src_tmp_vector)::type;

            const bool is_src_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);

            // copy data from src_buf to src_tmp_vector
            src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid);

            // copy data from src_tmp_vector to buffer_
            static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);

                buffer_(Number<buffer_offset>{}) = src_tmp_vector.template AsType<SrcData>()[i];
            });

            constexpr auto move_on_dim = [&]() constexpr
            {
                StaticallyIndexedArray<bool, nDim> move_on_dim_{};

                static_for<0, nDim, 1>{}([&](auto i) {
                    move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;

                    static_for<i + 1, nDim, 1>{}([&](auto j) {
                        move_on_dim_(i) &=
                            ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
                    });
                });

                return move_on_dim_;
            }
            ();

            // move
            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(move_on_dim[i])
                {
                    if constexpr(forward_sweep[i])
                    {
                        move_tensor_coordinate(
                            src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
                    }
                    else
                    {
                        move_tensor_coordinate(
                            src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
                    }
                }
            });
        });

        // move src coordinate back to slice origin (or not)
        if constexpr(SrcResetCoordinateAfterRun)
        {
            const auto src_reset_step =
                make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep());

            move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
        }
    }

    template <typename DstBuffer, typename DstStepHacks>
    __device__ void
    RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks)
    {
        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
                          DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "wrong!");

        static_assert(
            is_same<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>::value,
            "wrong! DstBuffer and DstData data types are inconsistent");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        // dst scalar per access on each dim
        // TODO: don't use this
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

        constexpr auto dst_scalar_step_in_vector =
            generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});

        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;

        constexpr auto dst_dim_access_order = DstDimAccessOrder{};

        constexpr auto ordered_dst_access_lengths =
            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);

        // make forward steps
        const auto dst_forward_steps = generate_tuple(
            [&](auto i) {
                Index forward_step_idx;

                static_for<0, nDim, 1>{}([&](auto j) {
                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
                });

                return make_tensor_coordinate_step(
                    dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
            },
            Number<nDim>{});

        // make backward steps
        const auto dst_backward_steps = generate_tuple(
            [&](auto i) {
                Index backward_step_idx;

                static_for<0, nDim, 1>{}([&](auto j) {
                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
                });

                return make_tensor_coordinate_step(
                    dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
            },
            Number<nDim>{});

        // loop over tensor and copy
        static_ford<decltype(ordered_dst_access_lengths)>{}([&](auto ordered_dst_access_idx) {
            // judge whether to move forward or backward on each dim (snake-like traversal)
            constexpr auto forward_sweep = [&]() {
                StaticallyIndexedArray<bool, nDim> forward_sweep_;

                forward_sweep_(I0) = true;

                static_for<1, nDim, 1>{}([&](auto i) {
                    index_t tmp = ordered_dst_access_idx[I0];

                    static_for<1, i, 1>{}([&](auto j) {
                        tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
                    });

                    forward_sweep_(i) = tmp % 2 == 0;
                });

                return forward_sweep_;
            }();

            // calculate dst data index
            constexpr auto dst_data_idx = [&]() {
                Index ordered_idx;

                static_for<0, nDim, 1>{}([&](auto i) {
                    ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
                                                      : ordered_dst_access_lengths[i] - 1 -
                                                            ordered_dst_access_idx[i];
                });

                return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
                       dst_scalar_per_access;
            }();

            vector_type_maker_t<DstData, DstScalarPerVector> dst_tmp_vector;

            // copy data from buffer_ to dst_tmp_vector
            static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);

                dst_tmp_vector.template AsType<DstData>()(i) =
                    type_convert<DstData>(buffer_[Number<buffer_offset>{}]);
            });

            using dst_vector_t = typename decltype(dst_tmp_vector)::type;

            // copy data from dst_tmp_vector to dst_buf
            const bool is_dst_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);

            dst_buf.template Set<dst_vector_t>(
                dst_coord_.GetOffset(),
                is_dst_valid,
                dst_tmp_vector.template AsType<dst_vector_t>()[Number<0>{}]);

            constexpr auto move_on_dim = [&]() constexpr
            {
                StaticallyIndexedArray<bool, nDim> move_on_dim_{};

                static_for<0, nDim, 1>{}([&](auto i) {
                    move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;

                    static_for<i + 1, nDim, 1>{}([&](auto j) {
                        move_on_dim_(i) &=
                            ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
                    });
                });

                return move_on_dim_;
            }
            ();

            // move
            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(move_on_dim[i])
                {
                    if constexpr(forward_sweep[i])
                    {
                        move_tensor_coordinate(
                            dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
                    }
                    else
                    {
                        move_tensor_coordinate(
                            dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
                    }
                }
            });
        });

        // move dst coordinate back to slice origin (or not)
        if constexpr(DstResetCoordinateAfterRun)
        {
            const auto dst_reset_step =
                make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());

            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
        }
    }

    template <typename SrcBuffer>
    __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
    {
        constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform();

        constexpr auto zeros = typename uniform_sequence_gen<ntransform_src, 0>::type{};

        constexpr auto src_step_hacks =
            make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
                       generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));

        RunRead(src_desc, src_buf, src_step_hacks);
    }

    template <typename DstBuffer>
    __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf)
    {
        constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform();

        constexpr auto zeros = typename uniform_sequence_gen<ntransform_dst, 0>::type{};

        constexpr auto dst_step_hacks =
            make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
                       generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));

        RunWrite(dst_desc, dst_buf, dst_step_hacks);
    }

    __device__ static constexpr auto GetSrcCoordinateResetStep()
    {
        constexpr auto I0 = Number<0>{};

        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto src_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;

        constexpr auto src_dim_access_order = SrcDimAccessOrder{};

        constexpr auto ordered_src_access_lengths =
            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);

        // judge whether the last iteration moved forward or backward on each dim
        constexpr auto forward_sweep = [&]() {
            StaticallyIndexedArray<bool, nDim> forward_sweep_;

            forward_sweep_(I0) = true;

            static_for<1, nDim, 1>{}([&](auto i) {
                index_t tmp = ordered_src_access_lengths[I0] - 1;

                static_for<1, i, 1>{}([&](auto j) {
                    tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
                });

                forward_sweep_(i) = tmp % 2 == 0;
            });

            return forward_sweep_;
        }();

        // calculate the src data index after the last iteration of RunRead(), assuming it
        // has not been reset by RunRead()
        constexpr auto src_data_idx = [&]() {
            Index ordered_idx;

            static_for<0, nDim, 1>{}([&](auto i) {
                ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
            });

            return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
                   src_scalar_per_access;
        }();

        constexpr auto reset_src_data_step = [&]() {
            Index reset_src_data_step_;

            static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; });

            return reset_src_data_step_;
        }();

        return reset_src_data_step;
    }

    __device__ static constexpr auto GetDstCoordinateResetStep()
    {
        constexpr auto I0 = Number<0>{};

        // scalar per access on each dim
        // TODO: don't use lambda_scalar_per_access
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;

        constexpr auto dst_dim_access_order = DstDimAccessOrder{};

        constexpr auto ordered_dst_access_lengths =
            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);

        // judge whether the last iteration moved forward or backward on each dim
        constexpr auto forward_sweep = [&]() {
            StaticallyIndexedArray<bool, nDim> forward_sweep_;

            forward_sweep_(I0) = true;

            static_for<1, nDim, 1>{}([&](auto i) {
                index_t tmp = ordered_dst_access_lengths[I0] - 1;

                static_for<1, i, 1>{}([&](auto j) {
                    tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
                });

                forward_sweep_(i) = tmp % 2 == 0;
            });

            return forward_sweep_;
        }();

        // calculate the dst data index after the last iteration of RunWrite(), assuming it
        // has not been reset by RunWrite()
        constexpr auto dst_data_idx = [&]() {
            Index ordered_idx;

            static_for<0, nDim, 1>{}([&](auto i) {
                ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
            });

            return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
                   dst_scalar_per_access;
        }();

        constexpr auto reset_dst_data_step = [&]() {
            Index reset_dst_data_step_;

            static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });

            return reset_dst_data_step_;
        }();

        return reset_dst_data_step;
    }

    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                       const Index& src_slice_origin_step_idx)
    {
        // if src coord was not reset by RunRead(), then adjust the step here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);

        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }

    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    template <typename SrcMoveSliceWindowStepHack>
    __device__ void
    MoveSrcSliceWindow(const SrcDesc& src_desc,
                       const Index& src_slice_origin_step_idx,
                       const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
    {
        // if src coord was not reset by RunRead(), then adjust the step here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(
            src_desc, adjusted_step_idx, src_move_slice_window_step_hack);

        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }

    // dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by RunWrite(), then adjust the step here
        const auto adjusted_step_idx =
            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
                                       : dst_slice_origin_step_idx + GetDstCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);

        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
    }

    private:
    static constexpr auto buffer_desc_ =
        make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{}));

    static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();

    StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> buffer_;

    SrcCoord src_coord_;
    DstCoord dst_coord_;
};
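
// Illustrative sketch, not part of the original header: staging a per-thread 2x4 tile from
// global memory into LDS through the internal VGPR buffer of ThreadwiseTensorSliceTransfer_v3.
// Descriptors, shapes and pointer names are hypothetical.
__device__ void example_v3_copy(const float* p_global, float* p_lds)
{
    const auto src_desc = make_naive_tensor_descriptor_packed(make_tuple(8, 8));
    const auto dst_desc = make_naive_tensor_descriptor_packed(make_tuple(8, 8));

    const auto src_buf =
        make_dynamic_buffer<AddressSpaceEnum::Global>(p_global, src_desc.GetElementSpaceSize());
    auto dst_buf =
        make_dynamic_buffer<AddressSpaceEnum::Lds>(p_lds, dst_desc.GetElementSpaceSize());

    auto transfer = ThreadwiseTensorSliceTransfer_v3<Sequence<2, 4>, // SliceLengths
                                                     InMemoryDataOperationEnum::Set,
                                                     float,
                                                     float,
                                                     decltype(src_desc),
                                                     decltype(dst_desc),
                                                     Sequence<0, 1>, // SrcDimAccessOrder
                                                     Sequence<0, 1>, // DstDimAccessOrder
                                                     1,              // SrcVectorDim
                                                     1,              // DstVectorDim
                                                     4,              // SrcScalarPerVector
                                                     4,              // DstScalarPerVector
                                                     1,              // SrcScalarStrideInVector
                                                     1,              // DstScalarStrideInVector
                                                     true,           // SrcResetCoordinateAfterRun
                                                     true>           // DstResetCoordinateAfterRun
        {src_desc, make_multi_index(0, 0), dst_desc, make_multi_index(0, 0)};

    transfer.RunRead(src_desc, src_buf);  // global -> internal VGPR buffer
    transfer.RunWrite(dst_desc, dst_buf); // VGPR buffer -> LDS

    // advance the read window by one tile in the row dimension for the next iteration
    transfer.MoveSrcSliceWindow(src_desc, make_multi_index(2, 0));
}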

// Assume:
// 1. src:
//    1. SrcDesc is known at compile-time
//    2. SrcBuffer is DynamicBuffer
//    3. src_ref_idx is known at run-time
//    4. SrcRefToOriginDisplacement is known at compile-time
//    5. use #-step
// 2. dst:
//    1. DstDesc is known at compile-time
//    2. DstBuffer is StaticBuffer
//    3. DstOriginIdx is known at compile-time
//    4. use direct address calculation
// 3. vector access on src
template <typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t SrcVectorDim,
          index_t SrcScalarPerVector,
          index_t SrcScalarStrideInVector,
          typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                             bool>::type = false>
struct ThreadwiseTensorSliceTransfer_v4
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));

    using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));

    static constexpr index_t PackedSize = []() {
        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
            return 2;
        else
            return 1;
    }();

    __device__ constexpr ThreadwiseTensorSliceTransfer_v4(const Index& src_ref_idx)
        : src_ref_coord_(make_tensor_coordinate(SrcDesc{}, src_ref_idx))
    {
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc and DstDesc need to be known at compile-time");

        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t> ||
                     is_same_v<remove_cvref_t<SrcData>, f4x2_pk_t>)
        {
            static_assert(SrcScalarPerVector % PackedSize == 0, "pk data N cannot be 1");
        }
    }

    template <typename SrcRefToOriginDisplacement,
              typename DstOriginIdx,
              typename SrcBuffer,
              typename DstBuffer>
    __device__ void Run(const SrcDesc&,
                        const SrcRefToOriginDisplacement&,
                        const SrcBuffer& src_buf,
                        const DstDesc&,
                        const DstOriginIdx&,
                        DstBuffer& dst_buf) const
    {
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc and DstDesc need to be known at compile-time");

        static_assert(
            is_same<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>::value &&
                is_same<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>::value,
            "wrong! SrcBuffer or DstBuffer data type is wrong");

        static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer needs to be a StaticBuffer");

        static_assert(is_known_at_compile_time<remove_cvref_t<SrcRefToOriginDisplacement>>::value &&
                          is_known_at_compile_time<remove_cvref_t<DstOriginIdx>>::value,
                      "wrong! SrcRefToOriginDisplacement and DstOriginIdx need to be known "
                      "at compile-time");

        // SrcDesc and DstDesc are known at compile-time
        constexpr auto src_desc = remove_cvref_t<SrcDesc>{};
        constexpr auto dst_desc = remove_cvref_t<DstDesc>{};

        // SrcRefToOriginDisplacement and DstOriginIdx are known at compile-time
        constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{});
        constexpr auto dst_origin_idx             = to_multi_index(DstOriginIdx{});

        // scalar per access of each dim
        constexpr auto src_scalar_per_access = generate_sequence_v2(
            [&](auto i) constexpr {
                if constexpr(i == SrcVectorDim)
                {
                    return Number<SrcScalarPerVector>{};
                }
                else
                {
                    return Number<1>{};
                }
            },
            Number<nDim>{});

        // scalar step (if stepping on SrcVectorDim) of each dim
        constexpr auto src_scalar_step_in_vector = generate_sequence_v2(
            [&](auto i) constexpr {
                if constexpr(i == SrcVectorDim)
                {
                    return Number<1>{};
                }
                else
                {
                    return Number<0>{};
                }
            },
            Number<nDim>{});

        constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;

        constexpr auto dim_access_order = DimAccessOrder{};

        constexpr auto ordered_access_lengths =
            container_reorder_given_new2old(access_lengths, dim_access_order);

        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
#if 0
            // TODO: unable to compile
            // position in slice window
            constexpr auto data_to_origin_disp_idx =
                container_reorder_given_old2new(ordered_access_idx, dim_access_order) *
                src_scalar_per_access;
#else
            // position in slice window
            constexpr auto data_to_origin_disp_idx =
                ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
#endif
            // src coordinate
            constexpr auto src_ref_to_data_disp_idx =
                src_ref_to_origin_disp_idx + data_to_origin_disp_idx;

            constexpr auto src_ref_to_data_disp_coord_step =
                make_tensor_coordinate_step(src_desc, src_ref_to_data_disp_idx);

            auto src_data_coord = src_ref_coord_;

            move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step);

            vector_type_maker_t<SrcData, SrcScalarPerVector / PackedSize> src_tmp_vector;

            using src_vector_t = typename decltype(src_tmp_vector)::type;

            const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                src_desc, src_data_coord);

            // copy data from src_buf into src_tmp_vector
            if constexpr(SrcBuffer::IsDynamicBuffer())
            {
                src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                    src_buf.template Get<src_vector_t>(src_data_coord.GetOffset() / PackedSize,
                                                       is_src_valid);
            }
            else if constexpr(SrcBuffer::IsStaticBuffer())
            {
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t src_offset = src_desc.CalculateOffset(
                        src_ref_to_origin_disp_idx + data_to_origin_disp_idx +
                        i * src_scalar_step_in_vector);

                    src_tmp_vector.template AsType<SrcData>()(i) = src_buf[Number<src_offset>{}];
                });
            }

            if constexpr(is_same<remove_cvref_t<SrcData>, pk_i4_t>::value)
            {
                // unpack and convert data from SrcData to DstData
                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;

                constexpr index_t pack_size = 8;

                static_assert(SrcScalarPerVector % pack_size == 0, "");

                using src_v_t = typename vector_type_maker_t<SrcData, pack_size / PackedSize>::type;
                using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;

                static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
                    ck::tensor_operation::element_wise::PassThroughPack8{}(
                        dst_tmp_vector.template AsType<dst_v_t>()(i),
                        src_tmp_vector.template AsType<src_v_t>()[i]);
                });

                // copy data from dst_tmp_vector into dst_buf
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
                });
            }
            else if constexpr(is_same<remove_cvref_t<SrcData>, f8_t>::value &&
                              is_same<remove_cvref_t<DstData>, half_t>::value &&
                              SrcScalarPerVector % 2 == 0)
            {
                // convert data from SrcData to DstData two elements at a time
                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;

                constexpr index_t pack_size = 2;

                using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
                using src_v_t = typename vector_type_maker_t<SrcData, pack_size>::type;
                static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
                    ck::tensor_operation::element_wise::PassThroughPack2{}(
                        dst_tmp_vector.template AsType<dst_v_t>()(i),
                        src_tmp_vector.template AsType<src_v_t>()[i]);
                });

                // copy data from dst_tmp_vector into dst_buf
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
                });
            }
            else
            {
                // copy data from src_tmp_vector to dst_tmp_vector (cast from SrcData to DstData)
                vector_type_maker_t<DstData, SrcScalarPerVector / PackedSize> dst_tmp_vector;

                // TODO: if SrcData and DstData are vector types, then static_cast may not compile
                static_for<0, SrcScalarPerVector / PackedSize, 1>{}([&](auto i) {
                    dst_tmp_vector.template AsType<DstData>()(i) =
                        type_convert<DstData>(src_tmp_vector.template AsType<SrcData>()[i]);
                });

                // copy data from dst_tmp_vector into dst_buf
                static_for<0, SrcScalarPerVector / PackedSize, 1>{}([&](auto i) {
                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
                });
            }
        });
    }

    // Run() variant with a fused dequantization scale
    template <typename SrcRefToOriginDisplacement,
              typename DstOriginIdx,
              typename SrcBuffer,
              typename DstBuffer>
    __device__ void Run(const SrcDesc&,
                        const SrcRefToOriginDisplacement&,
                        const SrcBuffer& src_buf,
                        const DstData& scale,
                        const DstDesc&,
                        const DstOriginIdx&,
                        DstBuffer& dst_buf) const
    {
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! SrcDesc and DstDesc need to be known at compile-time");

        static_assert(
            is_same<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>::value &&
                is_same<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>::value,
            "wrong! SrcBuffer or DstBuffer data type is wrong");

        static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer needs to be a StaticBuffer");

        static_assert(is_known_at_compile_time<remove_cvref_t<SrcRefToOriginDisplacement>>::value &&
                          is_known_at_compile_time<remove_cvref_t<DstOriginIdx>>::value,
                      "wrong! SrcRefToOriginDisplacement and DstOriginIdx need to be known "
                      "at compile-time");

        // SrcDesc and DstDesc are known at compile-time
        constexpr auto src_desc = remove_cvref_t<SrcDesc>{};
        constexpr auto dst_desc = remove_cvref_t<DstDesc>{};

        // SrcRefToOriginDisplacement and DstOriginIdx are known at compile-time
        constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{});
        constexpr auto dst_origin_idx             = to_multi_index(DstOriginIdx{});

        // scalar per access of each dim
        constexpr auto src_scalar_per_access = generate_sequence_v2(
            [&](auto i) constexpr {
                if constexpr(i == SrcVectorDim)
                {
                    return Number<SrcScalarPerVector>{};
                }
                else
                {
                    return Number<1>{};
                }
            },
            Number<nDim>{});

        // scalar step (if stepping on SrcVectorDim) of each dim
        constexpr auto src_scalar_step_in_vector = generate_sequence_v2(
            [&](auto i) constexpr {
                if constexpr(i == SrcVectorDim)
                {
                    return Number<1>{};
                }
                else
                {
                    return Number<0>{};
                }
            },
            Number<nDim>{});

        constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;

        constexpr auto dim_access_order = DimAccessOrder{};

        constexpr auto ordered_access_lengths =
            container_reorder_given_new2old(access_lengths, dim_access_order);

        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
#if 0
            // TODO: unable to compile
            // position in slice window
            constexpr auto data_to_origin_disp_idx =
                container_reorder_given_old2new(ordered_access_idx, dim_access_order) *
                src_scalar_per_access;
#else
            // position in slice window
            constexpr auto data_to_origin_disp_idx =
                ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
#endif
            // src coordinate
            constexpr auto src_ref_to_data_disp_idx =
                src_ref_to_origin_disp_idx + data_to_origin_disp_idx;

            constexpr auto src_ref_to_data_disp_coord_step =
                make_tensor_coordinate_step(src_desc, src_ref_to_data_disp_idx);

            auto src_data_coord = src_ref_coord_;

            move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step);

            vector_type_maker_t<SrcData, SrcScalarPerVector / PackedSize> src_tmp_vector;

            using src_vector_t = typename decltype(src_tmp_vector)::type;

            const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                src_desc, src_data_coord);

            // copy data from src_buf into src_tmp_vector
            if constexpr(SrcBuffer::IsDynamicBuffer())
            {
                src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                    src_buf.template Get<src_vector_t>(src_data_coord.GetOffset() / PackedSize,
                                                       is_src_valid);
            }
            else if constexpr(SrcBuffer::IsStaticBuffer())
            {
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t src_offset = src_desc.CalculateOffset(
                        src_ref_to_origin_disp_idx + data_to_origin_disp_idx +
                        i * src_scalar_step_in_vector);

                    src_tmp_vector.template AsType<SrcData>()(i) = src_buf[Number<src_offset>{}];
                });
            }

            if constexpr(is_same<remove_cvref_t<SrcData>, pk_i4_t>::value)
            {
                // unpack, convert and scale data from SrcData to DstData
                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;
                vector_type<DstData, 2> scale_vector;
                scale_vector.template AsType<DstData>()(Number<0>{}) = scale;
                scale_vector.template AsType<DstData>()(Number<1>{}) = scale;

                constexpr index_t pack_size = 8;

                static_assert(SrcScalarPerVector % pack_size == 0, "");

                using src_v_t = typename vector_type_maker_t<SrcData, pack_size / PackedSize>::type;
                using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
                using scale_v_t = typename vector_type_maker_t<DstData, 2>::type;

                static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
                    ck::tensor_operation::element_wise::DequantPack8{}(
                        dst_tmp_vector.template AsType<dst_v_t>()(i),
                        src_tmp_vector.template AsType<src_v_t>()[i],
                        scale_vector.template AsType<scale_v_t>()[Number<0>{}]);
                });

                // copy data from dst_tmp_vector into dst_buf
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
                });
            }
            else if constexpr(is_same<remove_cvref_t<SrcData>, f8_t>::value &&
                              is_same<remove_cvref_t<DstData>, half_t>::value &&
                              SrcScalarPerVector % 2 == 0)
            {
                // convert data from SrcData to DstData two elements at a time
                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;

                constexpr index_t pack_size = 2;

                using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
                using src_v_t = typename vector_type_maker_t<SrcData, pack_size>::type;
                static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
                    ck::tensor_operation::element_wise::PassThroughPack2{}(
                        dst_tmp_vector.template AsType<dst_v_t>()(i),
                        src_tmp_vector.template AsType<src_v_t>()[i]);
                });

                // copy data from dst_tmp_vector into dst_buf
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
                });
            }
            else
            {
                // copy data from src_tmp_vector to dst_tmp_vector (cast from SrcData to DstData)
                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;

                // TODO: if SrcData and DstData are vector types, then static_cast may not compile
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    dst_tmp_vector.template AsType<DstData>()(i) =
                        type_convert<DstData>(src_tmp_vector.template AsType<SrcData>()[i]);
                });

                // copy data from dst_tmp_vector into dst_buf
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
                });
            }
        });
    }

    template <typename SrcSliceMoveStepIdx>
    __device__ void MoveSrcSliceWindow(const SrcDesc&,
                                       const SrcSliceMoveStepIdx& src_slice_move_step_idx)
    {
        constexpr auto src_desc = SrcDesc{};

        const auto src_slice_move_step_iter =
            make_tensor_coordinate_step(src_desc, to_multi_index(src_slice_move_step_idx));

        move_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter);
    }

    __device__ void SetSrcCoord(const Index& src_ref_idx)
    {
        src_ref_coord_ = make_tensor_coordinate(SrcDesc{}, src_ref_idx);
    }

    private:
    SrcCoord src_ref_coord_;
};
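
// Illustrative sketch, not part of the original header: ThreadwiseTensorSliceTransfer_v4
// keeps a run-time reference coordinate and reads vectors at compile-time displacements
// from it into a static (register) destination. Shapes and names are hypothetical.
__device__ void example_v4_load(const half_t* p_lds)
{
    constexpr auto src_desc =
        make_naive_tensor_descriptor_packed(make_tuple(Number<16>{}, Number<8>{}));
    constexpr auto dst_desc =
        make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<8>{}));

    const auto src_buf =
        make_dynamic_buffer<AddressSpaceEnum::Lds>(p_lds, src_desc.GetElementSpaceSize());
    StaticBuffer<AddressSpaceEnum::Vgpr, half_t, 8, true> dst_buf;

    auto transfer = ThreadwiseTensorSliceTransfer_v4<half_t,
                                                     half_t,
                                                     decltype(src_desc),
                                                     decltype(dst_desc),
                                                     Sequence<1, 8>, // SliceLengths
                                                     Sequence<0, 1>, // DimAccessOrder
                                                     1,              // SrcVectorDim
                                                     8,              // SrcScalarPerVector
                                                     1>              // SrcScalarStrideInVector
        {make_multi_index(get_thread_local_1d_id() % 16, 0)};

    // both the displacement from the reference and the dst origin are compile-time indices
    transfer.Run(src_desc,
                 make_tuple(Number<0>{}, Number<0>{}),
                 src_buf,
                 dst_desc,
                 make_tuple(Number<0>{}, Number<0>{}),
                 dst_buf);
}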
1705 
1712 template <typename SrcData,
1713  typename DstData,
1714  typename SrcDesc,
1715  typename DstDesc,
1716  typename ElementwiseOperation,
1717  typename SliceLengths,
1718  typename DimAccessOrder,
1719  index_t DstVectorDim,
1720  index_t DstScalarPerVector,
1721  typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1722  bool>::type = false>
1724 {
1725  static constexpr index_t nDim = SliceLengths::Size();
1726 
1728 
1729  static constexpr index_t PackedSize = []() {
1730  if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
1731  return 2;
1732  else
1733  return 1;
1734  }();
1735 
1737  const ElementwiseOperation& element_op)
1738  : element_op_{element_op}
1739  {
1740  static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1741  "wrong! Desc need to known at compile-time");
1742 
1743  static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
1744  "wrong! Not divisible");
1745  }
1746 
1747  template <typename SrcSliceOriginIdx,
1748  typename DstSliceOriginIdx,
1749  typename SrcBuffer,
1750  typename DstBuffer>
1751  __device__ void Run(const SrcDesc&,
1752  const SrcSliceOriginIdx&,
1753  const SrcBuffer& src_buf,
1754  const DstDesc&,
1755  const DstSliceOriginIdx&,
1756  DstBuffer& dst_buf) const
1757  {
1758  static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1759  "wrong! Desc needs to be known at compile-time");
1760 
1761  static_assert(is_known_at_compile_time<remove_cvref_t<SrcSliceOriginIdx>>::value &&
1762  is_known_at_compile_time<remove_cvref_t<DstSliceOriginIdx>>::value,
1763  "wrong! SliceOrigin needs to be known at compile-time");
1764 
1765  static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
1766  "wrong! Buffer needs to be StaticBuffer");
1767 
1768  // the descriptors and slice origins are all known at compile-time
1769  constexpr auto src_desc = remove_cvref_t<SrcDesc>{};
1770  constexpr auto dst_desc = remove_cvref_t<DstDesc>{};
1771  constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
1772  constexpr auto dst_slice_origin_idx = to_multi_index(DstSliceOriginIdx{});
1773 
1774  // scalar per access on each dim
1775  constexpr auto dst_scalar_per_access = generate_sequence(
1776  detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
1777 
1778  constexpr auto dst_scalar_step_in_vector =
1779  generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
1780 
1781  using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
1782  DimAccessOrder,
1783  remove_cv_t<decltype(dst_scalar_per_access)>>;
1784 
1785  static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector,
1786  "wrong! DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
1787 
1788  constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
1789 
1790  if constexpr(is_same<remove_cvref_t<SrcData>, pk_i4_t>::value)
1791  {
1792  static_for<0, num_access, 1>{}([&](auto idx_1d) {
1793  typename vector_type_maker<SrcData, DstScalarPerVector / PackedSize>::type
1794  src_tmp_vector;
1795 
1796  constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);
1797 
1798  // copy data from src_buf into src_tmp_vector
1799  static_for<0, DstScalarPerVector / PackedSize, 1>{}([&](auto i) {
1800  constexpr index_t src_offset = src_desc.CalculateOffset(
1801  src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1802 
1803  src_tmp_vector.template AsType<SrcData>()(i) = src_buf[Number<src_offset>{}];
1804  });
1805 
1806  // copy data from src_tmp_vector to dst_tmp_vector (cast data from SrcData to
1807  // DstData)
1808  typename vector_type_maker<DstData, DstScalarPerVector>::type dst_tmp_vector;
1809 
1810  constexpr index_t pack_size = 8;
1811 
1812  static_assert(DstScalarPerVector % pack_size == 0, "wrong! not divisible by pack_size");
1813 
1814  using src_v_t = typename vector_type_maker_t<SrcData, pack_size / PackedSize>::type;
1815  using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
1816 
1817  static_for<0, DstScalarPerVector / pack_size, 1>{}([&](auto i) {
1818  ck::tensor_operation::element_wise::PassThroughPack8{}(
1819  dst_tmp_vector.template AsType<dst_v_t>()(i),
1820  src_tmp_vector.template AsType<src_v_t>()[i]);
1821  });
1822 
1823  // copy data from dst_tmp_vector into dst_buf
1824  static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
1825  constexpr index_t dst_offset = dst_desc.CalculateOffset(
1826  dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1827 
1828  dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
1829  });
1830  });
1831  }
1832  else
1833  {
1834  static_for<0, num_access, 1>{}([&](auto idx_1d) {
1835  constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);
1836 
1837  // apply element_op_ and copy data from src_buf into dst_buf
1838  static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
1839  constexpr index_t src_offset = src_desc.CalculateOffset(
1840  src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1841 
1842  constexpr index_t dst_offset = dst_desc.CalculateOffset(
1843  dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1844 
1845  DstData v;
1846 
1847  // apply element-wise operation
1848  element_op_(v, src_buf[Number<src_offset>{}]);
1849 
1850  // apply type convert
1851  dst_buf(Number<dst_offset>{}) = v;
1852  });
1853  });
1854  }
1855  }
1856 
1857  ElementwiseOperation element_op_;
1858 };
1859 
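// Illustrative sketch (editorial addition; hypothetical names, not CK API): in
// the transfers above, dst_scalar_per_access is 1 in every dimension except
// DstVectorDim, where it equals DstScalarPerVector, and the space-filling curve
// then walks the slice in steps of that vector shape. A stand-alone model of
// that per-dimension access width:
//
//   template <int NDim, int VectorDim, int ScalarPerVector>
//   struct ScalarPerAccessModel
//   {
//       static constexpr int At(int dim) { return dim == VectorDim ? ScalarPerVector : 1; }
//   };
//
//   // e.g. a 2-d slice vectorized along dim 1 with 8 scalars per vector:
//   static_assert(ScalarPerAccessModel<2, 1, 8>::At(0) == 1, "scalar dim");
//   static_assert(ScalarPerAccessModel<2, 1, 8>::At(1) == 8, "vector dim");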
1860 // Specialized for gfx11.
1861 // A single wave32 is composed of two rows of 16 lanes, and data exchange is
1862 // allowed between these two rows. The dst buffer of this RowLane is filled
1863 // from two src buffers (the permlane select masks are decoded after this struct):
1864 // SrcA: the thread buffer held by this RowLane on this row
1865 // SrcB: the thread buffer held by this RowLane on the other row
1866 template <typename SrcData,
1867  typename DstData,
1868  typename SrcDesc,
1869  typename DstDesc,
1870  typename ElementwiseOperation,
1871  typename SliceLengths,
1872  typename DimAccessOrder,
1873  index_t DstVectorDim,
1874  index_t DstScalarPerVector,
1875  uint32_t LowEightRowlaneIdx,
1876  uint32_t HighEightRowLaneIdx,
1877  bool IntraRowSwizzlePerm,
1878  typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1879  bool>::type = false>
1880 struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
1881 {
1882  static constexpr index_t nDim = SliceLengths::Size();
1883 
1884  using Index = MultiIndex<nDim>;
1885 
1886  __device__ constexpr ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow(const Index& src_idx)
1887  {
1888  static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1889  "wrong! Desc needs to be known at compile-time");
1890 
1891  static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
1892  "wrong! Not divisible");
1893  ignore = src_idx;
1894  }
1895 
1896  template <typename SrcSliceOriginIdx,
1897  typename DstSliceOriginIdx,
1898  typename SrcBuffer,
1899  typename DstBuffer>
1900  __device__ void Run(const SrcDesc&,
1901  const SrcSliceOriginIdx&,
1902  const SrcBuffer& src_buf,
1903  const DstDesc&,
1904  const DstSliceOriginIdx&,
1905  DstBuffer& dst_buf) const
1906  {
1907  static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1908  "wrong! Desc needs to be known at compile-time");
1909 
1910  static_assert(is_known_at_compile_time<remove_cvref_t<SrcSliceOriginIdx>>::value &&
1911  is_known_at_compile_time<remove_cvref_t<DstSliceOriginIdx>>::value,
1912  "wrong! SliceOrigin needs to be known at compile-time");
1913 
1914  static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
1915  "wrong! Buffer needs to be StaticBuffer");
1916 
1917  // the descriptors and slice origins are all known at compile-time
1918  constexpr auto src_desc = remove_cvref_t<SrcDesc>{};
1919  constexpr auto dst_desc = remove_cvref_t<DstDesc>{};
1920  constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
1921  constexpr auto dst_slice_origin_idx = to_multi_index(DstSliceOriginIdx{});
1922 
1923  // scalar per access on each dim
1924  constexpr auto dst_scalar_per_access = generate_sequence(
1925  detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
1926 
1927  constexpr auto dst_scalar_step_in_vector =
1928  generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
1929 
1930  using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
1931  DimAccessOrder,
1932  remove_cv_t<decltype(dst_scalar_per_access)>>;
1933 
1934  static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector,
1935  "wrong! DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
1936 
1937  constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
1938 
1939  static_for<0, num_access, 1>{}([&](auto idx_1d) {
1940  constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);
1941 
1942  // copy data from src_buf into dst_buf
1943  static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
1944  // note: a merge transform in SrcDesc would make CalculateOffset non-constexpr here
1945  constexpr index_t src_offset = src_desc.CalculateOffset(
1946  src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1947 
1948  constexpr index_t dst_offset = dst_desc.CalculateOffset(
1949  dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1950 
1951  SrcData v_this_row, v_theother_row;
1952  // the temporary is kept as int because the permlane intrinsics operate on b32 values
1953  int temp = 0;
1954 
1955  // apply element-wise operation
1956  element_op_(v_this_row, src_buf[Number<src_offset>{}]);
1957 
1958  // apply intra-row permute.
1959  if constexpr(IntraRowSwizzlePerm)
1960  {
1961  temp = __builtin_amdgcn_permlane16(
1962  temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
1963  v_this_row = type_convert_sp<SrcData>(temp);
1964  }
1965 
1966  // apply inter-row permute.
1967  temp = __builtin_amdgcn_permlanex16(temp,
1968  type_convert_sp<int>(v_this_row),
1969  LowEightRowlaneIdx,
1970  HighEightRowLaneIdx,
1971  1,
1972  0);
1973  v_theother_row = type_convert_sp<SrcData>(temp);
1974 
1975  if(get_thread_local_1d_id() % 32 < 16)
1976  {
1977  // apply type convert
1978  dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_this_row);
1979  dst_buf(Number<dst_offset + DstScalarPerVector>{}) =
1980  type_convert_sp<DstData>(v_theother_row);
1981  }
1982  else
1983  {
1984  // apply type convert
1985  dst_buf(Number<dst_offset + DstScalarPerVector>{}) =
1986  type_convert_sp<DstData>(v_this_row);
1987  dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_theother_row);
1988  }
1989  });
1990  });
1991  }
1992  ElementwiseOperation element_op_{};
1993 };
1994 
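// Illustrative sketch (editorial addition; hypothetical names, not CK API): the
// permlane16/permlanex16 intrinsics used above take two 32-bit selects that
// encode, four bits per lane, the source lane for each of the 16 lanes of a row
// (the first select covers lanes 0..7, the second lanes 8..15). Decoding the
// swizzle constants used by the IntraRowSwizzlePerm path shows that they
// interleave the low and high halves of a 16-lane row:
//
//   constexpr int permlane_src_lane(unsigned lo, unsigned hi, int lane)
//   {
//       // lane in [0, 16): nibble `lane` of the concatenated 64-bit select
//       return lane < 8 ? (lo >> (4 * lane)) & 0xF : (hi >> (4 * (lane - 8))) & 0xF;
//   }
//
//   static_assert(permlane_src_lane(0xb3a29180u, 0xf7e6d5c4u, 0) == 0x0, "");
//   static_assert(permlane_src_lane(0xb3a29180u, 0xf7e6d5c4u, 1) == 0x8, "");
//   static_assert(permlane_src_lane(0xb3a29180u, 0xf7e6d5c4u, 2) == 0x1, "");
//   static_assert(permlane_src_lane(0xb3a29180u, 0xf7e6d5c4u, 15) == 0xF, "");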
1995 // Specialized for gfx12 (intra-row permute only; no inter-row exchange)
1996 template <typename SrcData,
1997  typename DstData,
1998  typename SrcDesc,
1999  typename DstDesc,
2000  typename ElementwiseOperation,
2001  typename SliceLengths,
2002  typename DimAccessOrder,
2003  index_t DstVectorDim,
2004  index_t DstScalarPerVector,
2005  bool IntraRowSwizzlePerm,
2006  typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
2007  bool>::type = false>
2008 struct ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow
2009 {
2010  static constexpr index_t nDim = SliceLengths::Size();
2011 
2012  using Index = MultiIndex<nDim>;
2013 
2014  __device__ constexpr ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow(const Index& src_idx)
2015  {
2016  static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
2017  "wrong! Desc needs to be known at compile-time");
2018 
2019  static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
2020  "wrong! Not divisible");
2021  ignore = src_idx;
2022  }
2023 
2024  template <typename SrcSliceOriginIdx,
2025  typename DstSliceOriginIdx,
2026  typename SrcBuffer,
2027  typename DstBuffer>
2028  __device__ void Run(const SrcDesc&,
2029  const SrcSliceOriginIdx&,
2030  const SrcBuffer& src_buf,
2031  const DstDesc&,
2032  const DstSliceOriginIdx&,
2033  DstBuffer& dst_buf) const
2034  {
2035  static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
2036  "wrong! Desc needs to be known at compile-time");
2037 
2038  static_assert(is_known_at_compile_time<remove_cvref_t<SrcSliceOriginIdx>>::value &&
2039  is_known_at_compile_time<remove_cvref_t<DstSliceOriginIdx>>::value,
2040  "wrong! SliceOrigin needs to be known at compile-time");
2041 
2042  static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
2043  "wrong! Buffer needs to be StaticBuffer");
2044 
2045  // the descriptors and slice origins are all known at compile-time
2046  constexpr auto src_desc = remove_cvref_t<SrcDesc>{};
2047  constexpr auto dst_desc = remove_cvref_t<DstDesc>{};
2048  constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
2049  constexpr auto dst_slice_origin_idx = to_multi_index(DstSliceOriginIdx{});
2050 
2051  // scalar per access on each dim
2052  constexpr auto dst_scalar_per_access = generate_sequence(
2053  detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
2054 
2055  constexpr auto dst_scalar_step_in_vector =
2056  generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
2057 
2058  using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
2059  DimAccessOrder,
2060  remove_cv_t<decltype(dst_scalar_per_access)>>;
2061 
2062  static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector,
2063  "wrong! DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
2064 
2065  constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
2066 
2067  static_for<0, num_access, 1>{}([&](auto idx_1d) {
2068  constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);
2069 
2070  // copy data from src_buf into dst_buf
2071  static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
2072  // note: a merge transform in SrcDesc would make CalculateOffset non-constexpr here
2073  constexpr index_t src_offset = src_desc.CalculateOffset(
2074  src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
2075 
2076  constexpr index_t dst_offset = dst_desc.CalculateOffset(
2077  dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
2078 
2079  SrcData v_this_row;
2080  // the temporary is kept as int because the permlane intrinsics operate on b32 values
2081  int temp = 0;
2082 
2083  // apply element-wise operation
2084  element_op_(v_this_row, src_buf[Number<src_offset>{}]);
2085 
2086  // apply intra-row permute.
2087  if constexpr(IntraRowSwizzlePerm)
2088  {
2089  temp = __builtin_amdgcn_permlane16(
2090  temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
2091  v_this_row = type_convert_sp<SrcData>(temp);
2092  }
2093 
2094  // apply type convert
2095  dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_this_row);
2096  });
2097  });
2098  }
2099  ElementwiseOperation element_op_{};
2100 };
2101 
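// Illustrative sketch (editorial addition; hypothetical names, not CK API): the
// transfers above invoke the elementwise operation with an out-parameter
// convention, element_op_(dst, src), rather than a return value. A functor
// matching that convention looks like:
//
//   struct PassThroughExample
//   {
//       template <typename Y, typename X>
//       void operator()(Y& y, const X& x) const
//       {
//           y = x; // identity; real ops may scale, clamp, convert, ...
//       }
//   };
//
// so element_op_(v, src_buf[Number<src_offset>{}]) above writes the (possibly
// transformed) source value into v before the final type convert into dst_buf.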
2102 } // namespace ck