25 template <
typename SrcData,
29 typename ElementwiseOperation,
30 typename SliceLengths,
31 typename DimAccessOrder,
35 index_t DstScalarStrideInVector,
36 bool DstResetCoordinateAfterRun,
37 typename enable_if<SrcDesc::IsKnownAtCompileTime(),
bool>::type =
false>
49 const Index& dst_slice_origin_idx,
50 const ElementwiseOperation& element_op)
52 element_op_{element_op}
54 static_assert(SrcDesc::IsKnownAtCompileTime(),
55 "wrong! SrcDesc need to known at compile-time");
57 "wrong! Not divisible");
65 template <
typename SrcSliceOriginIdx,
typename SrcBuffer,
typename DstBuffer>
66 __device__
void Run(
const SrcDesc&,
67 const SrcSliceOriginIdx&,
68 const SrcBuffer& src_buf,
69 const DstDesc& dst_desc,
72 static_assert(SrcDesc::IsKnownAtCompileTime(),
73 "wrong! SrcDesc need to known at compile-time");
76 "wrong! SrcSliceOrigin need to known at compile-time");
78 static_assert(SrcBuffer::IsStaticBuffer(),
"wrong! SrcBuffer need to be StaticBuffer");
82 constexpr
auto src_slice_origin_idx =
to_multi_index(SrcSliceOriginIdx{});
89 constexpr
auto dst_scalar_step_in_vector =
98 "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
110 constexpr
index_t src_offset = src_desc.CalculateOffset(
111 src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
118 dst_vector.template AsType<DstData>()(i) = v;
121 const bool is_dst_valid =
125 dst_buf.template Update<DstInMemOp, dst_vector_t>(
126 dst_coord_.GetOffset(),
128 dst_vector.template AsType<dst_vector_t>()[
Number<0>{}]);
130 if constexpr(idx_1d.value != num_access - 1)
140 if constexpr(DstResetCoordinateAfterRun)
142 const auto dst_reset_step =
159 if constexpr(num_access == 0)
165 constexpr
auto reset_step =
174 const Index& dst_slice_origin_step_idx)
177 const auto adjusted_step_idx =
178 DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
189 const ElementwiseOperation element_op_;
221 template <
typename SrcData,
225 typename SliceLengths,
226 typename DimAccessOrder,
229 index_t SrcScalarStrideInVector,
230 bool SrcResetCoordinateAfterRun,
231 bool InvalidElementAsNaN =
false,
232 typename enable_if<DstDesc::IsKnownAtCompileTime(),
bool>::type =
false>
236 (!InvalidElementAsNaN),
237 "Filling invalid element as NaN is only for floating point types");
255 const Index& src_slice_origin_idx)
258 static_assert(DstDesc::IsKnownAtCompileTime(),
259 "wrong! SrcDesc need to known at compile-time");
261 "wrong! Not divisible");
266 static_assert(SrcScalarPerVector %
PackedSize == 0,
"pk data N cannot be 1");
275 template <
typename SrcBuffer,
typename DstBuffer,
typename DstSliceOriginIdx>
276 __device__
void Run(
const SrcDesc& src_desc,
277 const SrcBuffer& src_buf,
279 const DstSliceOriginIdx&,
282 static_assert(DstDesc::IsKnownAtCompileTime(),
283 "wrong! DstDesc need to known at compile-time");
286 "wrong! DstSliceOrigin need to known at compile-time");
290 "wrong! inconsistent type");
294 constexpr
auto dst_slice_origin_idx = DstSliceOriginIdx{};
301 constexpr
auto src_scalar_step_in_vector =
318 const bool is_src_valid =
322 src_vector.template AsType<src_vector_t>()(
Number<0>{}) =
323 src_buf.template Get<src_vector_t>(src_coord_.GetOffset() /
PackedSize,
329 dst_desc.CalculateOffset(
to_multi_index(dst_slice_origin_idx) + src_data_idx +
330 i * src_scalar_step_in_vector);
332 if constexpr(InvalidElementAsNaN)
336 ? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
342 type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
346 if constexpr(idx_1d.value != num_access - 1)
356 if constexpr(SrcResetCoordinateAfterRun)
358 const auto src_reset_step =
375 if constexpr(num_access == 0)
381 constexpr
auto reset_step =
390 const Index& src_slice_origin_step_idx)
393 const auto adjusted_step_idx =
394 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
404 template <
typename SrcMoveSliceWindowStepHack>
407 const Index& src_slice_origin_step_idx,
408 const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
411 const auto adjusted_step_idx =
412 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
417 src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
426 template <
typename SrcData,
430 typename SliceLengths,
431 typename DimAccessOrder,
434 index_t SrcScalarStrideInVector,
435 bool SrcResetCoordinateAfterRun,
437 bool InvalidElementAsNaN =
false,
438 typename enable_if<DstDesc::IsKnownAtCompileTime(),
bool>::type =
false>
442 (!InvalidElementAsNaN),
443 "Filling invalid element as NaN is only for floating point types");
461 const SrcDesc& src_desc,
462 const Index& src_slice_origin_idx,
465 scale_gather_offsets_(scale_gather_offsets)
467 static_assert(DstDesc::IsKnownAtCompileTime(),
468 "wrong! SrcDesc need to known at compile-time");
470 "wrong! Not divisible");
474 static_assert(SrcScalarPerVector %
PackedSize == 0,
"pk data N cannot be 1");
480 auto adjusted_origin_idx = [&]() {
484 [&](
auto i) { idx(i) = i.value == 0 ? 0 : src_slice_origin_idx[
Number<i>{}]; });
492 template <
typename SrcBuffer,
typename DstBuffer,
typename DstSliceOriginIdx>
493 __device__
void Run(
const SrcDesc& src_desc,
494 const SrcBuffer& src_buf,
496 const DstSliceOriginIdx&,
499 static_assert(DstDesc::IsKnownAtCompileTime(),
500 "wrong! DstDesc need to known at compile-time");
503 "wrong! DstSliceOrigin need to known at compile-time");
507 "wrong! inconsistent type");
511 constexpr
auto dst_slice_origin_idx = DstSliceOriginIdx{};
518 constexpr
auto src_scalar_step_in_vector =
529 constexpr
auto current_dst_origin =
541 const bool is_src_valid =
546 src_vector.template AsType<src_vector_t>()(
Number<0>{}) =
547 src_buf.template Get<src_vector_t>(src_coord_.GetOffset() /
PackedSize +
548 scale_gather_offsets_(gather_idx),
555 src_data_idx + i * src_scalar_step_in_vector);
556 constexpr
auto full_dst_offset =
557 dst_desc.CalculateOffset(current_dst_origin) + dst_offset;
559 if constexpr(InvalidElementAsNaN)
561 dst_buf(full_dst_offset) =
563 ? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
569 type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
573 if constexpr(idx_1d.value != num_access - 1)
584 if constexpr(SrcResetCoordinateAfterRun)
586 const auto src_reset_step =
603 if constexpr(num_access == 0)
609 constexpr
auto reset_step =
618 const Index& src_slice_origin_step_idx)
621 const auto adjusted_step_idx =
622 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
632 template <
typename SrcMoveSliceWindowStepHack>
635 const Index& src_slice_origin_step_idx,
636 const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
639 const auto adjusted_step_idx =
640 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
645 src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
660 template <
typename SliceLengths,
666 typename SrcDimAccessOrder,
667 typename DstDimAccessOrder,
672 index_t SrcScalarStrideInVector,
673 index_t DstScalarStrideInVector,
674 bool SrcResetCoordinateAfterRun,
677 bool DstResetCoordinateAfterRun>
692 const Index& src_slice_origin,
693 const DstDesc& dst_desc,
694 const Index& dst_slice_origin)
699 "wrong! Not divisible");
701 "wrong! Not divisible");
714 template <
typename SrcBuffer,
typename SrcStepHacks>
716 RunRead(
const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const SrcStepHacks& src_step_hacks)
724 "wrong! SrcBuffer and SrcData data type are inconsistent");
734 constexpr
auto src_scalar_step_in_vector =
737 constexpr
auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
739 constexpr
auto src_dim_access_order = SrcDimAccessOrder{};
741 constexpr
auto ordered_src_access_lengths =
747 Index forward_step_idx;
750 forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
754 src_desc, forward_step_idx, src_step_hacks[I0][i]);
761 Index backward_step_idx;
764 backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
768 src_desc, backward_step_idx, src_step_hacks[I1][i]);
773 static_ford<decltype(ordered_src_access_lengths)>{}([&](
auto ordered_src_access_idx) {
775 constexpr
auto forward_sweep = [&]() {
778 forward_sweep_(I0) =
true;
781 index_t tmp = ordered_src_access_idx[I0];
784 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
787 forward_sweep_(i) = tmp % 2 == 0;
790 return forward_sweep_;
794 constexpr
auto src_data_idx = [&]() {
798 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
799 : ordered_src_access_lengths[i] - 1 -
800 ordered_src_access_idx[i];
804 src_scalar_per_access;
809 using src_vector_t =
typename decltype(src_tmp_vector)::type;
811 const bool is_src_valid =
815 src_tmp_vector.template AsType<src_vector_t>()(
Number<0>{}) =
816 src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid);
820 constexpr
index_t buffer_offset =
821 buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);
826 constexpr
auto move_on_dim = [&]() constexpr
831 move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
835 ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
845 if constexpr(move_on_dim[i])
847 if constexpr(forward_sweep[i])
850 src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
855 src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
862 if constexpr(SrcResetCoordinateAfterRun)
864 const auto src_reset_step =
871 template <
typename DstBuffer,
typename DstStepHacks>
873 RunWrite(
const DstDesc& dst_desc, DstBuffer& dst_buf,
const DstStepHacks& dst_step_hacks)
881 "wrong! SrcBuffer or DstBuffer data type is wrong");
891 constexpr
auto dst_scalar_step_in_vector =
894 constexpr
auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
896 constexpr
auto dst_dim_access_order = DstDimAccessOrder{};
898 constexpr
auto ordered_dst_access_lengths =
904 Index forward_step_idx;
907 forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
911 dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
918 Index backward_step_idx;
921 backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
925 dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
930 static_ford<decltype(ordered_dst_access_lengths)>{}([&](
auto ordered_dst_access_idx) {
932 constexpr
auto forward_sweep = [&]() {
935 forward_sweep_(I0) =
true;
938 index_t tmp = ordered_dst_access_idx[I0];
941 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
944 forward_sweep_(i) = tmp % 2 == 0;
947 return forward_sweep_;
951 constexpr
auto dst_data_idx = [&]() {
955 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
956 : ordered_dst_access_lengths[i] - 1 -
957 ordered_dst_access_idx[i];
961 dst_scalar_per_access;
968 constexpr
index_t buffer_offset =
969 buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);
971 dst_tmp_vector.template AsType<DstData>()(i) =
975 using dst_vector_t =
typename decltype(dst_tmp_vector)::type;
978 const bool is_dst_valid =
981 dst_buf.template Set<dst_vector_t>(
982 dst_coord_.GetOffset(),
984 dst_tmp_vector.template AsType<dst_vector_t>()[
Number<0>{}]);
986 constexpr
auto move_on_dim = [&]() constexpr
991 move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
995 ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
1005 if constexpr(move_on_dim[i])
1007 if constexpr(forward_sweep[i])
1010 dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
1015 dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
1022 if constexpr(DstResetCoordinateAfterRun)
1024 const auto dst_reset_step =
1031 template <
typename SrcBuffer>
1032 __device__
void RunRead(
const SrcDesc& src_desc,
const SrcBuffer& src_buf)
1034 constexpr
index_t ntransform_src = SrcDesc::GetNumOfTransform();
1038 constexpr
auto src_step_hacks =
1042 RunRead(src_desc, src_buf, src_step_hacks);
1045 template <
typename DstBuffer>
1046 __device__
void RunWrite(
const DstDesc& dst_desc, DstBuffer& dst_buf)
1048 constexpr
index_t ntransform_dst = DstDesc::GetNumOfTransform();
1052 constexpr
auto dst_step_hacks =
1056 RunWrite(dst_desc, dst_buf, dst_step_hacks);
1068 constexpr
auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
1070 constexpr
auto src_dim_access_order = SrcDimAccessOrder{};
1072 constexpr
auto ordered_src_access_lengths =
1076 constexpr
auto forward_sweep = [&]() {
1079 forward_sweep_(I0) =
true;
1082 index_t tmp = ordered_src_access_lengths[I0] - 1;
1085 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
1088 forward_sweep_(i) = tmp % 2 == 0;
1091 return forward_sweep_;
1096 constexpr
auto src_data_idx = [&]() {
1100 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
1104 src_scalar_per_access;
1108 constexpr
auto reset_src_data_step = [&]() {
1109 Index reset_src_data_step_;
1113 return reset_src_data_step_;
1116 return reset_src_data_step;
1128 constexpr
auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
1130 constexpr
auto dst_dim_access_order = DstDimAccessOrder{};
1132 constexpr
auto ordered_dst_access_lengths =
1136 constexpr
auto forward_sweep = [&]() {
1139 forward_sweep_(I0) =
true;
1142 index_t tmp = ordered_dst_access_lengths[I0] - 1;
1145 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
1148 forward_sweep_(i) = tmp % 2 == 0;
1151 return forward_sweep_;
1156 constexpr
auto dst_data_idx = [&]() {
1160 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
1164 dst_scalar_per_access;
1168 constexpr
auto reset_dst_data_step = [&]() {
1169 Index reset_dst_data_step_;
1173 return reset_dst_data_step_;
1176 return reset_dst_data_step;
1181 const Index& src_slice_origin_step_idx)
1184 const auto adjusted_step_idx =
1185 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
1195 template <
typename SrcMoveSliceWindowStepHack>
1198 const Index& src_slice_origin_step_idx,
1199 const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
1202 const auto adjusted_step_idx =
1203 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
1208 src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
1214 const Index& dst_slice_origin_step_idx)
1217 const auto adjusted_step_idx =
1218 DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
1228 static constexpr
auto buffer_desc_ =
1231 static constexpr
auto buffer_size_ = buffer_desc_.GetElementSpaceSize();
1233 StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> buffer_;
1252 template <
typename SrcData,
1256 typename SliceLengths,
1257 typename DimAccessOrder,
1260 index_t SrcScalarStrideInVector,
1261 typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1262 bool>::type =
false>
1283 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1284 "wrong! SrcDesc and DstDesc need to known at compile-time");
1289 static_assert(SrcScalarPerVector %
PackedSize == 0,
"pk data N cannot be 1");
1293 template <
typename SrcRefToOriginDisplacement,
1294 typename DstOriginIdx,
1297 __device__
void Run(
const SrcDesc&,
1298 const SrcRefToOriginDisplacement&,
1299 const SrcBuffer& src_buf,
1301 const DstOriginIdx&,
1302 DstBuffer& dst_buf)
const
1304 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1305 "wrong! SrcDesc and DstDesc need to known at compile-time");
1310 "wrong! SrcBuffer or DstBuffer data type is wrong");
1312 static_assert(DstBuffer::IsStaticBuffer(),
"wrong! DstBuffer need to be StaticBuffer");
1316 "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known "
1324 constexpr
auto src_ref_to_origin_disp_idx =
to_multi_index(SrcRefToOriginDisplacement{});
1329 [&](
auto i) constexpr {
1330 if constexpr(i == SrcVectorDim)
1343 [&](
auto i) constexpr {
1344 if constexpr(i == SrcVectorDim)
1355 constexpr
auto access_lengths = SliceLengths{} / src_scalar_per_access;
1357 constexpr
auto dim_access_order = DimAccessOrder{};
1359 constexpr
auto ordered_access_lengths =
1362 static_ford<decltype(ordered_access_lengths)>{}([&](
auto ordered_access_idx) {
1366 constexpr
auto data_to_origin_disp_idx =
1368 src_scalar_per_access;
1371 constexpr
auto data_to_origin_disp_idx =
1372 ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
1375 constexpr
auto src_ref_to_data_disp_idx =
1376 src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
1378 constexpr
auto src_ref_to_data_disp_coord_step =
1381 auto src_data_coord = src_ref_coord_;
1387 using src_vector_t =
typename decltype(src_tmp_vector)::type;
1390 src_desc, src_data_coord);
1393 if constexpr(SrcBuffer::IsDynamicBuffer())
1395 src_tmp_vector.template AsType<src_vector_t>()(
Number<0>{}) =
1396 src_buf.template Get<src_vector_t>(src_data_coord.GetOffset() /
PackedSize,
1399 else if constexpr(SrcBuffer::IsStaticBuffer())
1402 constexpr
index_t src_offset = src_desc.CalculateOffset(
1403 src_ref_to_origin_disp_idx + data_to_origin_disp_idx +
1404 i * src_scalar_step_in_vector);
1416 constexpr
index_t pack_size = 8;
1418 static_assert(SrcScalarPerVector % pack_size == 0,
"");
1423 static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](
auto i) {
1425 dst_tmp_vector.template AsType<dst_v_t>()(i),
1426 src_tmp_vector.template AsType<src_v_t>()[i]);
1431 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1432 dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
1439 SrcScalarPerVector % 2 == 0)
1445 constexpr
index_t pack_size = 2;
1449 static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](
auto i) {
1451 dst_tmp_vector.template AsType<dst_v_t>()(i),
1452 src_tmp_vector.template AsType<src_v_t>()[i]);
1457 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1458 dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
1471 dst_tmp_vector.template AsType<DstData>()(i) =
1472 type_convert<DstData>(src_tmp_vector.template AsType<SrcData>()[i]);
1477 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1478 dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
1487 template <
typename SrcRefToOriginDisplacement,
1488 typename DstOriginIdx,
1491 __device__
void Run(
const SrcDesc&,
1492 const SrcRefToOriginDisplacement&,
1493 const SrcBuffer& src_buf,
1494 const DstData& scale,
1496 const DstOriginIdx&,
1497 DstBuffer& dst_buf)
const
1499 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1500 "wrong! SrcDesc and DstDesc need to known at compile-time");
1505 "wrong! SrcBuffer or DstBuffer data type is wrong");
1507 static_assert(DstBuffer::IsStaticBuffer(),
"wrong! DstBuffer need to be StaticBuffer");
1511 "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known "
1519 constexpr
auto src_ref_to_origin_disp_idx =
to_multi_index(SrcRefToOriginDisplacement{});
1524 [&](
auto i) constexpr {
1525 if constexpr(i == SrcVectorDim)
1538 [&](
auto i) constexpr {
1539 if constexpr(i == SrcVectorDim)
1550 constexpr
auto access_lengths = SliceLengths{} / src_scalar_per_access;
1552 constexpr
auto dim_access_order = DimAccessOrder{};
1554 constexpr
auto ordered_access_lengths =
1557 static_ford<decltype(ordered_access_lengths)>{}([&](
auto ordered_access_idx) {
1561 constexpr
auto data_to_origin_disp_idx =
1563 src_scalar_per_access;
1566 constexpr
auto data_to_origin_disp_idx =
1567 ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
1570 constexpr
auto src_ref_to_data_disp_idx =
1571 src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
1573 constexpr
auto src_ref_to_data_disp_coord_step =
1576 auto src_data_coord = src_ref_coord_;
1582 using src_vector_t =
typename decltype(src_tmp_vector)::type;
1585 src_desc, src_data_coord);
1588 if constexpr(SrcBuffer::IsDynamicBuffer())
1590 src_tmp_vector.template AsType<src_vector_t>()(
Number<0>{}) =
1591 src_buf.template Get<src_vector_t>(src_data_coord.GetOffset() /
PackedSize,
1594 else if constexpr(SrcBuffer::IsStaticBuffer())
1597 constexpr
index_t src_offset = src_desc.CalculateOffset(
1598 src_ref_to_origin_disp_idx + data_to_origin_disp_idx +
1599 i * src_scalar_step_in_vector);
1611 scale_vector.template AsType<DstData>()(
Number<0>{}) = scale;
1612 scale_vector.template AsType<DstData>()(
Number<1>{}) = scale;
1614 constexpr
index_t pack_size = 8;
1616 static_assert(SrcScalarPerVector % pack_size == 0,
"");
1622 static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](
auto i) {
1624 dst_tmp_vector.template AsType<dst_v_t>()(i),
1625 src_tmp_vector.template AsType<src_v_t>()[i],
1626 scale_vector.template AsType<scale_v_t>()[
Number<0>{}]);
1631 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1632 dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
1639 SrcScalarPerVector % 2 == 0)
1645 constexpr
index_t pack_size = 2;
1649 static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](
auto i) {
1651 dst_tmp_vector.template AsType<dst_v_t>()(i),
1652 src_tmp_vector.template AsType<src_v_t>()[i]);
1657 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1658 dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
1671 dst_tmp_vector.template AsType<DstData>()(i) =
1672 type_convert<DstData>(src_tmp_vector.template AsType<SrcData>()[i]);
1677 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1678 dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
1686 template <
typename SrcSliceMoveStepIdx>
1688 const SrcSliceMoveStepIdx& src_slice_move_step_idx)
1690 constexpr
auto src_desc = SrcDesc{};
1692 const auto src_slice_move_step_iter =
1712 template <
typename SrcData,
1716 typename ElementwiseOperation,
1717 typename SliceLengths,
1718 typename DimAccessOrder,
1721 typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1722 bool>::type =
false>
1737 const ElementwiseOperation& element_op)
1740 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1741 "wrong! Desc need to known at compile-time");
1744 "wrong! Not divisible");
1747 template <
typename SrcSliceOriginIdx,
1748 typename DstSliceOriginIdx,
1751 __device__
void Run(
const SrcDesc&,
1752 const SrcSliceOriginIdx&,
1753 const SrcBuffer& src_buf,
1755 const DstSliceOriginIdx&,
1756 DstBuffer& dst_buf)
const
1758 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1759 "wrong! Desc need to known at compile-time");
1763 "wrong! SliceOrigin need to known at compile-time");
1765 static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
1766 "wrong! Buffer need to be StaticBuffer");
1771 constexpr
auto src_slice_origin_idx =
to_multi_index(SrcSliceOriginIdx{});
1772 constexpr
auto dst_slice_origin_idx =
to_multi_index(DstSliceOriginIdx{});
1778 constexpr
auto dst_scalar_step_in_vector =
1786 "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
1800 constexpr
index_t src_offset = src_desc.CalculateOffset(
1801 src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1810 constexpr
index_t pack_size = 8;
1812 static_assert(DstScalarPerVector % pack_size == 0,
"");
1817 static_for<0, DstScalarPerVector / pack_size, 1>{}([&](
auto i) {
1819 dst_tmp_vector.template AsType<dst_v_t>()(i),
1820 src_tmp_vector.template AsType<src_v_t>()[i]);
1825 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1826 dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1839 constexpr
index_t src_offset = src_desc.CalculateOffset(
1840 src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1842 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1843 dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1866 template <
typename SrcData,
1870 typename ElementwiseOperation,
1871 typename SliceLengths,
1872 typename DimAccessOrder,
1875 uint32_t LowEightRowlaneIdx,
1876 uint32_t HighEightRowLaneIdx,
1877 bool IntraRowSwizzlePerm,
1878 typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1879 bool>::type =
false>
1888 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1889 "wrong! Desc need to known at compile-time");
1892 "wrong! Not divisible");
1896 template <
typename SrcSliceOriginIdx,
1897 typename DstSliceOriginIdx,
1900 __device__
void Run(
const SrcDesc&,
1901 const SrcSliceOriginIdx&,
1902 const SrcBuffer& src_buf,
1904 const DstSliceOriginIdx&,
1905 DstBuffer& dst_buf)
const
1907 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1908 "wrong! Desc need to known at compile-time");
1912 "wrong! SliceOrigin need to known at compile-time");
1914 static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
1915 "wrong! Buffer need to be StaticBuffer");
1920 constexpr
auto src_slice_origin_idx =
to_multi_index(SrcSliceOriginIdx{});
1921 constexpr
auto dst_slice_origin_idx =
to_multi_index(DstSliceOriginIdx{});
1927 constexpr
auto dst_scalar_step_in_vector =
1935 "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
1945 constexpr
index_t src_offset = src_desc.CalculateOffset(
1946 src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1948 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1949 dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1951 SrcData v_this_row, v_theother_row;
1959 if constexpr(IntraRowSwizzlePerm)
1961 temp = __builtin_amdgcn_permlane16(
1962 temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
1963 v_this_row = type_convert_sp<SrcData>(temp);
1967 temp = __builtin_amdgcn_permlanex16(temp,
1968 type_convert_sp<int>(v_this_row),
1970 HighEightRowLaneIdx,
1973 v_theother_row = type_convert_sp<SrcData>(temp);
1980 type_convert_sp<DstData>(v_theother_row);
1986 type_convert_sp<DstData>(v_this_row);
1996 template <
typename SrcData,
2000 typename ElementwiseOperation,
2001 typename SliceLengths,
2002 typename DimAccessOrder,
2005 bool IntraRowSwizzlePerm,
2006 typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
2007 bool>::type =
false>
2016 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
2017 "wrong! Desc need to known at compile-time");
2020 "wrong! Not divisible");
2024 template <
typename SrcSliceOriginIdx,
2025 typename DstSliceOriginIdx,
2028 __device__
void Run(
const SrcDesc&,
2029 const SrcSliceOriginIdx&,
2030 const SrcBuffer& src_buf,
2032 const DstSliceOriginIdx&,
2033 DstBuffer& dst_buf)
const
2035 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
2036 "wrong! Desc need to known at compile-time");
2040 "wrong! SliceOrigin need to known at compile-time");
2042 static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
2043 "wrong! Buffer need to be StaticBuffer");
2048 constexpr
auto src_slice_origin_idx =
to_multi_index(SrcSliceOriginIdx{});
2049 constexpr
auto dst_slice_origin_idx =
to_multi_index(DstSliceOriginIdx{});
2055 constexpr
auto dst_scalar_step_in_vector =
2063 "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
2073 constexpr
index_t src_offset = src_desc.CalculateOffset(
2074 src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
2076 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
2077 dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
2087 if constexpr(IntraRowSwizzlePerm)
2089 temp = __builtin_amdgcn_permlane16(
2090 temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
2091 v_this_row = type_convert_sp<SrcData>(temp);
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition: statically_indexed_array.hpp:45
__host__ constexpr __device__ bool coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition: tensor_descriptor.hpp:560
__host__ constexpr __device__ auto make_multi_index(Xs &&... xs)
Definition: array_multi_index.hpp:15
__host__ constexpr __device__ auto generate_tuple(F &&f, Number< N >)
Definition: tuple_helper.hpp:21
InMemoryDataOperationEnum
Definition: ck.hpp:278
__host__ constexpr __device__ auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition: tensor_descriptor_helper.hpp:101
f8_fnuz_t f8_t
Definition: amd_ck_fp8.hpp:1737
__host__ constexpr __device__ auto to_multi_index(const T &x)
Definition: array_multi_index.hpp:28
_Float16 half_t
Definition: data_type.hpp:30
__host__ constexpr __device__ auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition: tensor_descriptor.hpp:407
constexpr detail::ignore_t ignore
Definition: ignore.hpp:20
__host__ constexpr __device__ auto generate_sequence(F, Number< N >)
Definition: sequence_helper.hpp:18
__host__ constexpr __device__ auto generate_sequence_v2(F &&f, Number< N >)
Definition: sequence_helper.hpp:25
__host__ constexpr __device__ auto sequence_to_tuple_of_number(Sequence< Is... >)
Definition: container_helper.hpp:380
std::enable_if< B, T > enable_if
Definition: enable_if.hpp:24
constexpr bool is_same_v
Definition: type.hpp:283
__host__ constexpr __device__ auto container_reorder_given_new2old(const Array< TData, NSize > &old_array, Sequence< IRs... >)
Definition: container_helper.hpp:43
__host__ constexpr __device__ auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:211
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition: type.hpp:297
int32_t index_t
Definition: ck.hpp:300
__host__ constexpr __device__ void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
Definition: tensor_descriptor.hpp:508
__host__ constexpr __device__ auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition: tensor_descriptor.hpp:444
__device__ index_t get_thread_local_1d_id()
Definition: get_id.hpp:19
typename remove_cv< T >::type remove_cv_t
Definition: type.hpp:295
__host__ constexpr __device__ auto container_reorder_given_old2new(const Array< TData, NSize > &old_array, Sequence< IRs... > old2new)
Definition: container_helper.hpp:54
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition: dtype_vector.hpp:54
__host__ static constexpr __device__ T QuietNaN()
Definition: numeric_limits.hpp:313
Definition: tensor_space_filling_curve.hpp:20
static __device__ constexpr __host__ auto GetForwardStep(Number< AccessIdx1d >)
Definition: tensor_space_filling_curve.hpp:66
__host__ static constexpr __device__ index_t GetNumOfAccess()
Definition: tensor_space_filling_curve.hpp:41
static constexpr index_t ScalarPerVector
Definition: tensor_space_filling_curve.hpp:25
static __device__ constexpr __host__ Index GetIndex(Number< AccessIdx1d >)
Definition: tensor_space_filling_curve.hpp:81
static __device__ constexpr __host__ auto GetStepBetween(Number< AccessIdx1dBegin >, Number< AccessIdx1dEnd >)
Definition: tensor_space_filling_curve.hpp:52
Definition: threadwise_tensor_slice_transfer.hpp:1881
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:1882
constexpr __device__ ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow(const Index &src_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1886
ElementwiseOperation element_op_
Definition: threadwise_tensor_slice_transfer.hpp:1992
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:1900
Definition: threadwise_tensor_slice_transfer.hpp:2009
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:2010
constexpr __device__ ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow(const Index &src_idx)
Definition: threadwise_tensor_slice_transfer.hpp:2014
ElementwiseOperation element_op_
Definition: threadwise_tensor_slice_transfer.hpp:2099
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:2028
Threadwise data transfer.
Definition: threadwise_tensor_slice_transfer.hpp:1724
static constexpr index_t PackedSize
Definition: threadwise_tensor_slice_transfer.hpp:1729
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:1751
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:1725
ElementwiseOperation element_op_
Definition: threadwise_tensor_slice_transfer.hpp:1857
constexpr __device__ ThreadwiseTensorSliceTransfer_StaticToStatic(const ElementwiseOperation &element_op)
Definition: threadwise_tensor_slice_transfer.hpp:1736
Definition: threadwise_tensor_slice_transfer.hpp:39
static constexpr __device__ auto GetDstCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:149
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:40
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:42
decltype(make_tensor_coordinate(DstDesc{}, Index{})) DstCoord
Definition: threadwise_tensor_slice_transfer.hpp:44
constexpr __device__ ThreadwiseTensorSliceTransfer_v1r3(const DstDesc &dst_desc, const Index &dst_slice_origin_idx, const ElementwiseOperation &element_op)
Definition: threadwise_tensor_slice_transfer.hpp:48
decltype(make_tensor_coordinate_step(DstDesc{}, Index{})) DstCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:46
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:173
__device__ void SetDstSliceOrigin(const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:60
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &dst_desc, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:66
Definition: threadwise_tensor_slice_transfer.hpp:440
__device__ void Run(const SrcDesc &src_desc, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:493
static constexpr index_t PackedSize
Definition: threadwise_tensor_slice_transfer.hpp:453
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx, const SrcMoveSliceWindowStepHack &src_move_slice_window_step_hack)
Definition: threadwise_tensor_slice_transfer.hpp:634
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:478
decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition: threadwise_tensor_slice_transfer.hpp:449
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:447
decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:451
static constexpr __device__ auto GetSrcCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:593
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:617
constexpr __device__ ThreadwiseTensorSliceTransfer_v2_gather(const SrcDesc &src_desc, const Index &src_slice_origin_idx, const StaticallyIndexedArray< index_t, scale_gather_num > &scale_gather_offsets)
Definition: threadwise_tensor_slice_transfer.hpp:460
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:445
Helper structure that facilitates transfer of source (grid) data to destination threads.
Definition: threadwise_tensor_slice_transfer.hpp:234
constexpr __device__ ThreadwiseTensorSliceTransfer_v2(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:254
__device__ void Run(const SrcDesc &src_desc, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:276
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:241
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:389
static constexpr __device__ auto GetSrcCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:365
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx, const SrcMoveSliceWindowStepHack &src_move_slice_window_step_hack)
Definition: threadwise_tensor_slice_transfer.hpp:406
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:270
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:239
decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:245
static constexpr index_t PackedSize
Definition: threadwise_tensor_slice_transfer.hpp:247
decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition: threadwise_tensor_slice_transfer.hpp:243
Definition: threadwise_tensor_slice_transfer.hpp:681
decltype(make_tensor_coordinate(DstDesc{}, Index{})) DstCoord
Definition: threadwise_tensor_slice_transfer.hpp:686
decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:688
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:683
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx, const SrcMoveSliceWindowStepHack &src_move_slice_window_step_hack)
Definition: threadwise_tensor_slice_transfer.hpp:1197
__device__ void RunRead(const SrcDesc &src_desc, const SrcBuffer &src_buf, const SrcStepHacks &src_step_hacks)
Definition: threadwise_tensor_slice_transfer.hpp:716
decltype(make_tensor_coordinate_step(DstDesc{}, Index{})) DstCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:689
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1213
__device__ void SetDstSliceOrigin(const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:709
__device__ void RunWrite(const DstDesc &dst_desc, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:1046
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:704
static constexpr __device__ auto GetSrcCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:1059
static constexpr __device__ auto GetDstCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:1119
constexpr __device__ ThreadwiseTensorSliceTransfer_v3(const SrcDesc &src_desc, const Index &src_slice_origin, const DstDesc &dst_desc, const Index &dst_slice_origin)
Definition: threadwise_tensor_slice_transfer.hpp:691
decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition: threadwise_tensor_slice_transfer.hpp:685
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:682
__device__ void RunRead(const SrcDesc &src_desc, const SrcBuffer &src_buf)
Definition: threadwise_tensor_slice_transfer.hpp:1032
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1180
__device__ void RunWrite(const DstDesc &dst_desc, DstBuffer &dst_buf, const DstStepHacks &dst_step_hacks)
Definition: threadwise_tensor_slice_transfer.hpp:873
Definition: threadwise_tensor_slice_transfer.hpp:1264
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:1265
__device__ void Run(const SrcDesc &, const SrcRefToOriginDisplacement &, const SrcBuffer &src_buf, const DstDesc &, const DstOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:1297
static constexpr index_t PackedSize
Definition: threadwise_tensor_slice_transfer.hpp:1273
decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition: threadwise_tensor_slice_transfer.hpp:1269
constexpr __device__ ThreadwiseTensorSliceTransfer_v4(const Index &src_ref_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1280
decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:1271
__device__ void Run(const SrcDesc &, const SrcRefToOriginDisplacement &, const SrcBuffer &src_buf, const DstData &scale, const DstDesc &, const DstOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:1491
__device__ void SetSrcCoord(const Index &src_ref_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1697
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:1267
__device__ void MoveSrcSliceWindow(const SrcDesc &, const SrcSliceMoveStepIdx &src_slice_move_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1687
Definition: threadwise_tensor_slice_transfer_util.hpp:20
Definition: threadwise_tensor_slice_transfer_util.hpp:29
Definition: data_type.hpp:41
Definition: integral_constant.hpp:20
Definition: is_known_at_compile_time.hpp:14
Definition: data_type.hpp:186
Definition: functional2.hpp:33
Definition: functional3.hpp:97
Definition: unary_element_wise_operation.hpp:241
Definition: unary_element_wise_operation.hpp:277
Definition: unary_element_wise_operation.hpp:133
Definition: dtype_vector.hpp:30
Definition: dtype_vector.hpp:10