25 template <
typename SrcData,
 
   29           typename ElementwiseOperation,
 
   30           typename SliceLengths,
 
   31           typename DimAccessOrder,
 
   35           index_t DstScalarStrideInVector,
 
   36           bool DstResetCoordinateAfterRun,
 
   37           typename enable_if<SrcDesc::IsKnownAtCompileTime(), 
bool>::type = 
false>
 
   49                                                             const Index& dst_slice_origin_idx,
 
   50                                                             const ElementwiseOperation& element_op)
 
   52           element_op_{element_op}
 
   54         static_assert(SrcDesc::IsKnownAtCompileTime(),
 
   55                       "wrong! SrcDesc need to known at compile-time");
 
   57                       "wrong! Not divisible");
 
   65     template <
typename SrcSliceOriginIdx, 
typename SrcBuffer, 
typename DstBuffer>
 
   66     __device__ 
void Run(
const SrcDesc&,
 
   67                         const SrcSliceOriginIdx&,
 
   68                         const SrcBuffer& src_buf,
 
   69                         const DstDesc& dst_desc,
 
   72         static_assert(SrcDesc::IsKnownAtCompileTime(),
 
   73                       "wrong! SrcDesc need to known at compile-time");
 
   76                       "wrong! SrcSliceOrigin need to known at compile-time");
 
   78         static_assert(SrcBuffer::IsStaticBuffer(), 
"wrong! SrcBuffer need to be StaticBuffer");
 
   82         constexpr 
auto src_slice_origin_idx = 
to_multi_index(SrcSliceOriginIdx{});
 
   89         constexpr 
auto dst_scalar_step_in_vector =
 
   98                       "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
 
  110                 constexpr 
index_t src_offset = src_desc.CalculateOffset(
 
  111                     src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
 
  118                 dst_vector.template AsType<DstData>()(i) = v;
 
  121             const bool is_dst_valid =
 
  125             dst_buf.template Update<DstInMemOp, dst_vector_t>(
 
  126                 dst_coord_.GetOffset(),
 
  128                 dst_vector.template AsType<dst_vector_t>()[
Number<0>{}]);
 
  130             if constexpr(idx_1d.value != num_access - 1)
 
  140         if constexpr(DstResetCoordinateAfterRun)
 
  142             const auto dst_reset_step =
 
  159         if constexpr(num_access == 0)
 
  165             constexpr 
auto reset_step =
 
  174                                        const Index& dst_slice_origin_step_idx)
 
  177         const auto adjusted_step_idx =
 
  178             DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
 
  189     const ElementwiseOperation element_op_;
 
  221 template <
typename SrcData,
 
  225           typename SliceLengths,
 
  226           typename DimAccessOrder,
 
  229           index_t SrcScalarStrideInVector,
 
  230           bool SrcResetCoordinateAfterRun,
 
  231           bool InvalidElementAsNaN                                        = 
false,
 
  232           typename enable_if<DstDesc::IsKnownAtCompileTime(), 
bool>::type = 
false>
 
  236                       (!InvalidElementAsNaN),
 
  237                   "Filling invalid element as NaN is only for floating point types");
 
  255                                                           const Index& src_slice_origin_idx)
 
  258         static_assert(DstDesc::IsKnownAtCompileTime(),
 
  259                       "wrong! SrcDesc need to known at compile-time");
 
  261                       "wrong! Not divisible");
 
  266             static_assert(SrcScalarPerVector % 
PackedSize == 0, 
"pk data N cannot be 1");
 
  275     template <
typename SrcBuffer, 
typename DstBuffer, 
typename DstSliceOriginIdx>
 
  276     __device__ 
void Run(
const SrcDesc& src_desc,
 
  277                         const SrcBuffer& src_buf,
 
  279                         const DstSliceOriginIdx&,
 
  282         static_assert(DstDesc::IsKnownAtCompileTime(),
 
  283                       "wrong! DstDesc need to known at compile-time");
 
  286                       "wrong! DstSliceOrigin need to known at compile-time");
 
  290             "wrong! inconsistent type");
 
  294         constexpr 
auto dst_slice_origin_idx = DstSliceOriginIdx{};
 
  301         constexpr 
auto src_scalar_step_in_vector =
 
  318             const bool is_src_valid =
 
  322             src_vector.template AsType<src_vector_t>()(
Number<0>{}) =
 
  323                 src_buf.template Get<src_vector_t>(src_coord_.GetOffset() / 
PackedSize,
 
  329                     dst_desc.CalculateOffset(
to_multi_index(dst_slice_origin_idx) + src_data_idx +
 
  330                                              i * src_scalar_step_in_vector);
 
  332                 if constexpr(InvalidElementAsNaN)
 
  336                             ? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
 
  342                         type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
 
  346             if constexpr(idx_1d.value != num_access - 1)
 
  356         if constexpr(SrcResetCoordinateAfterRun)
 
  358             const auto src_reset_step =
 
  375         if constexpr(num_access == 0)
 
  381             constexpr 
auto reset_step =
 
  390                                        const Index& src_slice_origin_step_idx)
 
  393         const auto adjusted_step_idx =
 
  394             SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
 
  404     template <
typename SrcMoveSliceWindowStepHack>
 
  407                        const Index& src_slice_origin_step_idx,
 
  408                        const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
 
  411         const auto adjusted_step_idx =
 
  412             SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
 
  417             src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
 
  426 template <
typename SrcData,
 
  430           typename SliceLengths,
 
  431           typename DimAccessOrder,
 
  434           index_t SrcScalarStrideInVector,
 
  435           bool SrcResetCoordinateAfterRun,
 
  437           bool InvalidElementAsNaN                                        = 
false,
 
  438           typename enable_if<DstDesc::IsKnownAtCompileTime(), 
bool>::type = 
false>
 
  442                       (!InvalidElementAsNaN),
 
  443                   "Filling invalid element as NaN is only for floating point types");
 
  461         const SrcDesc& src_desc,
 
  462         const Index& src_slice_origin_idx,
 
  465           scale_gather_offsets_(scale_gather_offsets)
 
  467         static_assert(DstDesc::IsKnownAtCompileTime(),
 
  468                       "wrong! SrcDesc need to known at compile-time");
 
  470                       "wrong! Not divisible");
 
  474             static_assert(SrcScalarPerVector % 
PackedSize == 0, 
"pk data N cannot be 1");
 
  480         auto adjusted_origin_idx = [&]() {
 
  484                 [&](
auto i) { idx(i) = i.value == 0 ? 0 : src_slice_origin_idx[
Number<i>{}]; });
 
  492     template <
typename SrcBuffer, 
typename DstBuffer, 
typename DstSliceOriginIdx>
 
  493     __device__ 
void Run(
const SrcDesc& src_desc,
 
  494                         const SrcBuffer& src_buf,
 
  496                         const DstSliceOriginIdx&,
 
  499         static_assert(DstDesc::IsKnownAtCompileTime(),
 
  500                       "wrong! DstDesc need to known at compile-time");
 
  503                       "wrong! DstSliceOrigin need to known at compile-time");
 
  507             "wrong! inconsistent type");
 
  511         constexpr 
auto dst_slice_origin_idx = DstSliceOriginIdx{};
 
  518         constexpr 
auto src_scalar_step_in_vector =
 
  529             constexpr 
auto current_dst_origin =
 
  541                 const bool is_src_valid =
 
  546                 src_vector.template AsType<src_vector_t>()(
Number<0>{}) =
 
  547                     src_buf.template Get<src_vector_t>(src_coord_.GetOffset() / 
PackedSize +
 
  548                                                            scale_gather_offsets_(gather_idx),
 
  555                                                  src_data_idx + i * src_scalar_step_in_vector);
 
  556                     constexpr 
auto full_dst_offset =
 
  557                         dst_desc.CalculateOffset(current_dst_origin) + dst_offset;
 
  559                     if constexpr(InvalidElementAsNaN)
 
  561                         dst_buf(full_dst_offset) =
 
  563                                 ? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
 
  569                             type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
 
  573                 if constexpr(idx_1d.value != num_access - 1)
 
  584         if constexpr(SrcResetCoordinateAfterRun)
 
  586             const auto src_reset_step =
 
  603         if constexpr(num_access == 0)
 
  609             constexpr 
auto reset_step =
 
  618                                        const Index& src_slice_origin_step_idx)
 
  621         const auto adjusted_step_idx =
 
  622             SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
 
  632     template <
typename SrcMoveSliceWindowStepHack>
 
  635                        const Index& src_slice_origin_step_idx,
 
  636                        const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
 
  639         const auto adjusted_step_idx =
 
  640             SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
 
  645             src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
 
  660 template <
typename SliceLengths,
 
  666           typename SrcDimAccessOrder,
 
  667           typename DstDimAccessOrder,
 
  672           index_t SrcScalarStrideInVector,
 
  673           index_t DstScalarStrideInVector,
 
  674           bool SrcResetCoordinateAfterRun, 
 
  677           bool DstResetCoordinateAfterRun> 
 
  692                                                           const Index& src_slice_origin,
 
  693                                                           const DstDesc& dst_desc,
 
  694                                                           const Index& dst_slice_origin)
 
  699                       "wrong! Not divisible");
 
  701                       "wrong! Not divisible");
 
  714     template <
typename SrcBuffer, 
typename SrcStepHacks>
 
  716     RunRead(
const SrcDesc& src_desc, 
const SrcBuffer& src_buf, 
const SrcStepHacks& src_step_hacks)
 
  724             "wrong! SrcBuffer and SrcData data type are inconsistent");
 
  734         constexpr 
auto src_scalar_step_in_vector =
 
  737         constexpr 
auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
 
  739         constexpr 
auto src_dim_access_order = SrcDimAccessOrder{};
 
  741         constexpr 
auto ordered_src_access_lengths =
 
  747                 Index forward_step_idx;
 
  750                     forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
 
  754                     src_desc, forward_step_idx, src_step_hacks[I0][i]);
 
  761                 Index backward_step_idx;
 
  764                     backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
 
  768                     src_desc, backward_step_idx, src_step_hacks[I1][i]);
 
  773         static_ford<decltype(ordered_src_access_lengths)>{}([&](
auto ordered_src_access_idx) {
 
  775             constexpr 
auto forward_sweep = [&]() {
 
  778                 forward_sweep_(I0) = 
true;
 
  781                     index_t tmp = ordered_src_access_idx[I0];
 
  784                         tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
 
  787                     forward_sweep_(i) = tmp % 2 == 0;
 
  790                 return forward_sweep_;
 
  794             constexpr 
auto src_data_idx = [&]() {
 
  798                     ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
 
  799                                                       : ordered_src_access_lengths[i] - 1 -
 
  800                                                             ordered_src_access_idx[i];
 
  804                        src_scalar_per_access;
 
  809             using src_vector_t = 
typename decltype(src_tmp_vector)::type;
 
  811             const bool is_src_valid =
 
  815             src_tmp_vector.template AsType<src_vector_t>()(
Number<0>{}) =
 
  816                 src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid);
 
  820                 constexpr 
index_t buffer_offset =
 
  821                     buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);
 
  826             constexpr 
auto move_on_dim = [&]() constexpr {
 
  830                     move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
 
  834                             ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
 
  843                 if constexpr(move_on_dim[i])
 
  845                     if constexpr(forward_sweep[i])
 
  848                             src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
 
  853                             src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
 
  860         if constexpr(SrcResetCoordinateAfterRun)
 
  862             const auto src_reset_step =
 
  869     template <
typename DstBuffer, 
typename DstStepHacks>
 
  871     RunWrite(
const DstDesc& dst_desc, DstBuffer& dst_buf, 
const DstStepHacks& dst_step_hacks)
 
  879             "wrong! SrcBuffer or DstBuffer data type is wrong");
 
  889         constexpr 
auto dst_scalar_step_in_vector =
 
  892         constexpr 
auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
 
  894         constexpr 
auto dst_dim_access_order = DstDimAccessOrder{};
 
  896         constexpr 
auto ordered_dst_access_lengths =
 
  902                 Index forward_step_idx;
 
  905                     forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
 
  909                     dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
 
  916                 Index backward_step_idx;
 
  919                     backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
 
  923                     dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
 
  928         static_ford<decltype(ordered_dst_access_lengths)>{}([&](
auto ordered_dst_access_idx) {
 
  930             constexpr 
auto forward_sweep = [&]() {
 
  933                 forward_sweep_(I0) = 
true;
 
  936                     index_t tmp = ordered_dst_access_idx[I0];
 
  939                         tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
 
  942                     forward_sweep_(i) = tmp % 2 == 0;
 
  945                 return forward_sweep_;
 
  949             constexpr 
auto dst_data_idx = [&]() {
 
  953                     ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
 
  954                                                       : ordered_dst_access_lengths[i] - 1 -
 
  955                                                             ordered_dst_access_idx[i];
 
  959                        dst_scalar_per_access;
 
  966                 constexpr 
index_t buffer_offset =
 
  967                     buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);
 
  969                 dst_tmp_vector.template AsType<DstData>()(i) =
 
  973             using dst_vector_t = 
typename decltype(dst_tmp_vector)::type;
 
  976             const bool is_dst_valid =
 
  979             dst_buf.template Set<dst_vector_t>(
 
  980                 dst_coord_.GetOffset(),
 
  982                 dst_tmp_vector.template AsType<dst_vector_t>()[
Number<0>{}]);
 
  984             constexpr 
auto move_on_dim = [&]() constexpr {
 
  988                     move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
 
  992                             ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
 
 1001                 if constexpr(move_on_dim[i])
 
 1003                     if constexpr(forward_sweep[i])
 
 1006                             dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
 
 1011                             dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
 
 1018         if constexpr(DstResetCoordinateAfterRun)
 
 1020             const auto dst_reset_step =
 
 1027     template <
typename SrcBuffer>
 
 1028     __device__ 
void RunRead(
const SrcDesc& src_desc, 
const SrcBuffer& src_buf)
 
 1030         constexpr 
index_t ntransform_src = SrcDesc::GetNumOfTransform();
 
 1034         constexpr 
auto src_step_hacks =
 
 1038         RunRead(src_desc, src_buf, src_step_hacks);
 
 1041     template <
typename DstBuffer>
 
 1042     __device__ 
void RunWrite(
const DstDesc& dst_desc, DstBuffer& dst_buf)
 
 1044         constexpr 
index_t ntransform_dst = DstDesc::GetNumOfTransform();
 
 1048         constexpr 
auto dst_step_hacks =
 
 1052         RunWrite(dst_desc, dst_buf, dst_step_hacks);
 
 1064         constexpr 
auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
 
 1066         constexpr 
auto src_dim_access_order = SrcDimAccessOrder{};
 
 1068         constexpr 
auto ordered_src_access_lengths =
 
 1072         constexpr 
auto forward_sweep = [&]() {
 
 1075             forward_sweep_(I0) = 
true;
 
 1078                 index_t tmp = ordered_src_access_lengths[I0] - 1;
 
 1081                     tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
 
 1084                 forward_sweep_(i) = tmp % 2 == 0;
 
 1087             return forward_sweep_;
 
 1092         constexpr 
auto src_data_idx = [&]() {
 
 1096                 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
 
 1100                    src_scalar_per_access;
 
 1104         constexpr 
auto reset_src_data_step = [&]() {
 
 1105             Index reset_src_data_step_;
 
 1109             return reset_src_data_step_;
 
 1112         return reset_src_data_step;
 
 1124         constexpr 
auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
 
 1126         constexpr 
auto dst_dim_access_order = DstDimAccessOrder{};
 
 1128         constexpr 
auto ordered_dst_access_lengths =
 
 1132         constexpr 
auto forward_sweep = [&]() {
 
 1135             forward_sweep_(I0) = 
true;
 
 1138                 index_t tmp = ordered_dst_access_lengths[I0] - 1;
 
 1141                     tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
 
 1144                 forward_sweep_(i) = tmp % 2 == 0;
 
 1147             return forward_sweep_;
 
 1152         constexpr 
auto dst_data_idx = [&]() {
 
 1156                 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
 
 1160                    dst_scalar_per_access;
 
 1164         constexpr 
auto reset_dst_data_step = [&]() {
 
 1165             Index reset_dst_data_step_;
 
 1169             return reset_dst_data_step_;
 
 1172         return reset_dst_data_step;
 
 1177                                        const Index& src_slice_origin_step_idx)
 
 1180         const auto adjusted_step_idx =
 
 1181             SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
 
 1191     template <
typename SrcMoveSliceWindowStepHack>
 
 1194                        const Index& src_slice_origin_step_idx,
 
 1195                        const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
 
 1198         const auto adjusted_step_idx =
 
 1199             SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
 
 1204             src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
 
 1210                                        const Index& dst_slice_origin_step_idx)
 
 1213         const auto adjusted_step_idx =
 
 1214             DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
 
 1224     static constexpr 
auto buffer_desc_ =
 
 1227     static constexpr 
auto buffer_size_ = buffer_desc_.GetElementSpaceSize();
 
 1229     StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> buffer_;
 
 1248 template <
typename SrcData,
 
 1252           typename SliceLengths,
 
 1253           typename DimAccessOrder,
 
 1256           index_t SrcScalarStrideInVector,
 
 1257           typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
 
 1258                              bool>::type = 
false>
 
 1279         static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
 
 1280                       "wrong! SrcDesc and DstDesc need to known at compile-time");
 
 1285             static_assert(SrcScalarPerVector % 
PackedSize == 0, 
"pk data N cannot be 1");
 
 1289     template <
typename SrcRefToOriginDisplacement,
 
 1290               typename DstOriginIdx,
 
 1293     __device__ 
void Run(
const SrcDesc&,
 
 1294                         const SrcRefToOriginDisplacement&,
 
 1295                         const SrcBuffer& src_buf,
 
 1297                         const DstOriginIdx&,
 
 1298                         DstBuffer& dst_buf)
 const 
 1300         static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
 
 1301                       "wrong! SrcDesc and DstDesc need to known at compile-time");
 
 1306             "wrong! SrcBuffer or DstBuffer data type is wrong");
 
 1308         static_assert(DstBuffer::IsStaticBuffer(), 
"wrong! DstBuffer need to be StaticBuffer");
 
 1312                       "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known " 
 1320         constexpr 
auto src_ref_to_origin_disp_idx = 
to_multi_index(SrcRefToOriginDisplacement{});
 
 1325             [&](
auto i) constexpr {
 
 1326                 if constexpr(i == SrcVectorDim)
 
 1339             [&](
auto i) constexpr {
 
 1340                 if constexpr(i == SrcVectorDim)
 
 1351         constexpr 
auto access_lengths = SliceLengths{} / src_scalar_per_access;
 
 1353         constexpr 
auto dim_access_order = DimAccessOrder{};
 
 1355         constexpr 
auto ordered_access_lengths =
 
 1358         static_ford<decltype(ordered_access_lengths)>{}([&](
auto ordered_access_idx) {
 
 1362             constexpr 
auto data_to_origin_disp_idx =
 
 1364                 src_scalar_per_access;
 
 1367             constexpr 
auto data_to_origin_disp_idx =
 
 1368                 ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
 
 1371             constexpr 
auto src_ref_to_data_disp_idx =
 
 1372                 src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
 
 1374             constexpr 
auto src_ref_to_data_disp_coord_step =
 
 1377             auto src_data_coord = src_ref_coord_;
 
 1383             using src_vector_t = 
typename decltype(src_tmp_vector)::type;
 
 1386                 src_desc, src_data_coord);
 
 1389             if constexpr(SrcBuffer::IsDynamicBuffer())
 
 1391                 src_tmp_vector.template AsType<src_vector_t>()(
Number<0>{}) =
 
 1392                     src_buf.template Get<src_vector_t>(src_data_coord.GetOffset() / 
PackedSize,
 
 1395             else if constexpr(SrcBuffer::IsStaticBuffer())
 
 1398                     constexpr 
index_t src_offset = src_desc.CalculateOffset(
 
 1399                         src_ref_to_origin_disp_idx + data_to_origin_disp_idx +
 
 1400                         i * src_scalar_step_in_vector);
 
 1412                 constexpr 
index_t pack_size = 8;
 
 1414                 static_assert(SrcScalarPerVector % pack_size == 0, 
"");
 
 1419                 static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](
auto i) {
 
 1421                         dst_tmp_vector.template AsType<dst_v_t>()(i),
 
 1422                         src_tmp_vector.template AsType<src_v_t>()[i]);
 
 1427                     constexpr 
index_t dst_offset = dst_desc.CalculateOffset(
 
 1428                         dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
 
 1435                               SrcScalarPerVector % 2 == 0)
 
 1441                 constexpr 
index_t pack_size = 2;
 
 1445                 static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](
auto i) {
 
 1447                         dst_tmp_vector.template AsType<dst_v_t>()(i),
 
 1448                         src_tmp_vector.template AsType<src_v_t>()[i]);
 
 1453                     constexpr 
index_t dst_offset = dst_desc.CalculateOffset(
 
 1454                         dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
 
 1467                     dst_tmp_vector.template AsType<DstData>()(i) =
 
 1468                         type_convert<DstData>(src_tmp_vector.template AsType<SrcData>()[i]);
 
 1473                     constexpr 
index_t dst_offset = dst_desc.CalculateOffset(
 
 1474                         dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
 
 1483     template <
typename SrcRefToOriginDisplacement,
 
 1484               typename DstOriginIdx,
 
 1487     __device__ 
void Run(
const SrcDesc&,
 
 1488                         const SrcRefToOriginDisplacement&,
 
 1489                         const SrcBuffer& src_buf,
 
 1490                         const DstData& scale,
 
 1492                         const DstOriginIdx&,
 
 1493                         DstBuffer& dst_buf)
 const 
 1495         static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
 
 1496                       "wrong! SrcDesc and DstDesc need to known at compile-time");
 
 1501             "wrong! SrcBuffer or DstBuffer data type is wrong");
 
 1503         static_assert(DstBuffer::IsStaticBuffer(), 
"wrong! DstBuffer need to be StaticBuffer");
 
 1507                       "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known " 
 1515         constexpr 
auto src_ref_to_origin_disp_idx = 
to_multi_index(SrcRefToOriginDisplacement{});
 
 1520             [&](
auto i) constexpr {
 
 1521                 if constexpr(i == SrcVectorDim)
 
 1534             [&](
auto i) constexpr {
 
 1535                 if constexpr(i == SrcVectorDim)
 
 1546         constexpr 
auto access_lengths = SliceLengths{} / src_scalar_per_access;
 
 1548         constexpr 
auto dim_access_order = DimAccessOrder{};
 
 1550         constexpr 
auto ordered_access_lengths =
 
 1553         static_ford<decltype(ordered_access_lengths)>{}([&](
auto ordered_access_idx) {
 
 1557             constexpr 
auto data_to_origin_disp_idx =
 
 1559                 src_scalar_per_access;
 
 1562             constexpr 
auto data_to_origin_disp_idx =
 
 1563                 ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
 
 1566             constexpr 
auto src_ref_to_data_disp_idx =
 
 1567                 src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
 
 1569             constexpr 
auto src_ref_to_data_disp_coord_step =
 
 1572             auto src_data_coord = src_ref_coord_;
 
 1578             using src_vector_t = 
typename decltype(src_tmp_vector)::type;
 
 1581                 src_desc, src_data_coord);
 
 1584             if constexpr(SrcBuffer::IsDynamicBuffer())
 
 1586                 src_tmp_vector.template AsType<src_vector_t>()(
Number<0>{}) =
 
 1587                     src_buf.template Get<src_vector_t>(src_data_coord.GetOffset() / 
PackedSize,
 
 1590             else if constexpr(SrcBuffer::IsStaticBuffer())
 
 1593                     constexpr 
index_t src_offset = src_desc.CalculateOffset(
 
 1594                         src_ref_to_origin_disp_idx + data_to_origin_disp_idx +
 
 1595                         i * src_scalar_step_in_vector);
 
 1607                 scale_vector.template AsType<DstData>()(
Number<0>{}) = scale;
 
 1608                 scale_vector.template AsType<DstData>()(
Number<1>{}) = scale;
 
 1610                 constexpr 
index_t pack_size = 8;
 
 1612                 static_assert(SrcScalarPerVector % pack_size == 0, 
"");
 
 1618                 static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](
auto i) {
 
 1620                         dst_tmp_vector.template AsType<dst_v_t>()(i),
 
 1621                         src_tmp_vector.template AsType<src_v_t>()[i],
 
 1622                         scale_vector.template AsType<scale_v_t>()[
Number<0>{}]);
 
 1627                     constexpr 
index_t dst_offset = dst_desc.CalculateOffset(
 
 1628                         dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
 
 1635                               SrcScalarPerVector % 2 == 0)
 
 1641                 constexpr 
index_t pack_size = 2;
 
 1645                 static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](
auto i) {
 
 1647                         dst_tmp_vector.template AsType<dst_v_t>()(i),
 
 1648                         src_tmp_vector.template AsType<src_v_t>()[i]);
 
 1653                     constexpr 
index_t dst_offset = dst_desc.CalculateOffset(
 
 1654                         dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
 
 1667                     dst_tmp_vector.template AsType<DstData>()(i) =
 
 1668                         type_convert<DstData>(src_tmp_vector.template AsType<SrcData>()[i]);
 
 1673                     constexpr 
index_t dst_offset = dst_desc.CalculateOffset(
 
 1674                         dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
 
 1682     template <
typename SrcSliceMoveStepIdx>
 
 1684                                        const SrcSliceMoveStepIdx& src_slice_move_step_idx)
 
 1686         constexpr 
auto src_desc = SrcDesc{};
 
 1688         const auto src_slice_move_step_iter =
 
 1708 template <
typename SrcData,
 
 1712           typename ElementwiseOperation,
 
 1713           typename SliceLengths,
 
 1714           typename DimAccessOrder,
 
 1717           typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
 
 1718                              bool>::type = 
false>
 
 1733         const ElementwiseOperation& element_op)
 
 1736         static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
 
 1737                       "wrong! Desc need to known at compile-time");
 
 1740                       "wrong! Not divisible");
 
 1743     template <
typename SrcSliceOriginIdx,
 
 1744               typename DstSliceOriginIdx,
 
 1747     __device__ 
void Run(
const SrcDesc&,
 
 1748                         const SrcSliceOriginIdx&,
 
 1749                         const SrcBuffer& src_buf,
 
 1751                         const DstSliceOriginIdx&,
 
 1752                         DstBuffer& dst_buf)
 const 
 1754         static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
 
 1755                       "wrong! Desc need to known at compile-time");
 
 1759                       "wrong! SliceOrigin need to known at compile-time");
 
 1761         static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
 
 1762                       "wrong! Buffer need to be StaticBuffer");
 
 1767         constexpr 
auto src_slice_origin_idx = 
to_multi_index(SrcSliceOriginIdx{});
 
 1768         constexpr 
auto dst_slice_origin_idx = 
to_multi_index(DstSliceOriginIdx{});
 
 1774         constexpr 
auto dst_scalar_step_in_vector =
 
 1782                       "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
 
 1796                     constexpr 
index_t src_offset = src_desc.CalculateOffset(
 
 1797                         src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
 
 1806                 constexpr 
index_t pack_size = 8;
 
 1808                 static_assert(DstScalarPerVector % pack_size == 0, 
"");
 
 1813                 static_for<0, DstScalarPerVector / pack_size, 1>{}([&](
auto i) {
 
 1815                         dst_tmp_vector.template AsType<dst_v_t>()(i),
 
 1816                         src_tmp_vector.template AsType<src_v_t>()[i]);
 
 1821                     constexpr 
index_t dst_offset = dst_desc.CalculateOffset(
 
 1822                         dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
 
 1835                     constexpr 
index_t src_offset = src_desc.CalculateOffset(
 
 1836                         src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
 
 1838                     constexpr 
index_t dst_offset = dst_desc.CalculateOffset(
 
 1839                         dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
 
 1862 template <
typename SrcData,
 
 1866           typename ElementwiseOperation,
 
 1867           typename SliceLengths,
 
 1868           typename DimAccessOrder,
 
 1871           uint32_t LowEightRowlaneIdx,
 
 1872           uint32_t HighEightRowLaneIdx,
 
 1873           bool IntraRowSwizzlePerm,
 
 1874           typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
 
 1875                              bool>::type = 
false>
 
 1884         static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
 
 1885                       "wrong! Desc need to known at compile-time");
 
 1888                       "wrong! Not divisible");
 
 1892     template <
typename SrcSliceOriginIdx,
 
 1893               typename DstSliceOriginIdx,
 
 1896     __device__ 
void Run(
const SrcDesc&,
 
 1897                         const SrcSliceOriginIdx&,
 
 1898                         const SrcBuffer& src_buf,
 
 1900                         const DstSliceOriginIdx&,
 
 1901                         DstBuffer& dst_buf)
 const 
 1903         static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
 
 1904                       "wrong! Desc need to known at compile-time");
 
 1908                       "wrong! SliceOrigin need to known at compile-time");
 
 1910         static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
 
 1911                       "wrong! Buffer need to be StaticBuffer");
 
 1916         constexpr 
auto src_slice_origin_idx = 
to_multi_index(SrcSliceOriginIdx{});
 
 1917         constexpr 
auto dst_slice_origin_idx = 
to_multi_index(DstSliceOriginIdx{});
 
 1923         constexpr 
auto dst_scalar_step_in_vector =
 
 1931                       "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
 
 1941                 constexpr 
index_t src_offset = src_desc.CalculateOffset(
 
 1942                     src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
 
 1944                 constexpr 
index_t dst_offset = dst_desc.CalculateOffset(
 
 1945                     dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
 
 1947                 SrcData v_this_row, v_theother_row;
 
 1955                 if constexpr(IntraRowSwizzlePerm)
 
 1957                     temp = __builtin_amdgcn_permlane16(
 
 1958                         temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
 
 1959                     v_this_row = type_convert_sp<SrcData>(temp);
 
 1963                 temp           = __builtin_amdgcn_permlanex16(temp,
 
 1964                                                     type_convert_sp<int>(v_this_row),
 
 1966                                                     HighEightRowLaneIdx,
 
 1969                 v_theother_row = type_convert_sp<SrcData>(temp);
 
 1976                         type_convert_sp<DstData>(v_theother_row);
 
 1982                         type_convert_sp<DstData>(v_this_row);
 
 1992 template <
typename SrcData,
 
 1996           typename ElementwiseOperation,
 
 1997           typename SliceLengths,
 
 1998           typename DimAccessOrder,
 
 2001           bool IntraRowSwizzlePerm,
 
 2002           typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
 
 2003                              bool>::type = 
false>
 
 2012         static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
 
 2013                       "wrong! Desc need to known at compile-time");
 
 2016                       "wrong! Not divisible");
 
 2020     template <
typename SrcSliceOriginIdx,
 
 2021               typename DstSliceOriginIdx,
 
 2024     __device__ 
void Run(
const SrcDesc&,
 
 2025                         const SrcSliceOriginIdx&,
 
 2026                         const SrcBuffer& src_buf,
 
 2028                         const DstSliceOriginIdx&,
 
 2029                         DstBuffer& dst_buf)
 const 
 2031         static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
 
 2032                       "wrong! Desc need to known at compile-time");
 
 2036                       "wrong! SliceOrigin need to known at compile-time");
 
 2038         static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
 
 2039                       "wrong! Buffer need to be StaticBuffer");
 
 2044         constexpr 
auto src_slice_origin_idx = 
to_multi_index(SrcSliceOriginIdx{});
 
 2045         constexpr 
auto dst_slice_origin_idx = 
to_multi_index(DstSliceOriginIdx{});
 
 2051         constexpr 
auto dst_scalar_step_in_vector =
 
 2059                       "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
 
 2069                 constexpr 
index_t src_offset = src_desc.CalculateOffset(
 
 2070                     src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
 
 2072                 constexpr 
index_t dst_offset = dst_desc.CalculateOffset(
 
 2073                     dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
 
 2083                 if constexpr(IntraRowSwizzlePerm)
 
 2085                     temp = __builtin_amdgcn_permlane16(
 
 2086                         temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
 
 2087                     v_this_row = type_convert_sp<SrcData>(temp);
 
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition: statically_indexed_array.hpp:45
 
__host__ constexpr __device__ bool coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition: tensor_descriptor.hpp:560
 
__host__ constexpr __device__ auto make_multi_index(Xs &&... xs)
Definition: array_multi_index.hpp:15
 
__host__ constexpr __device__ auto generate_tuple(F &&f, Number< N >)
Definition: tuple_helper.hpp:21
 
InMemoryDataOperationEnum
Definition: ck.hpp:275
 
__host__ constexpr __device__ auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition: tensor_descriptor_helper.hpp:101
 
f8_fnuz_t f8_t
Definition: amd_ck_fp8.hpp:1737
 
__host__ constexpr __device__ auto to_multi_index(const T &x)
Definition: array_multi_index.hpp:28
 
_Float16 half_t
Definition: data_type.hpp:30
 
__host__ constexpr __device__ auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition: tensor_descriptor.hpp:407
 
constexpr detail::ignore_t ignore
Definition: ignore.hpp:20
 
__host__ constexpr __device__ auto generate_sequence(F, Number< N >)
Definition: sequence_helper.hpp:18
 
__host__ constexpr __device__ auto generate_sequence_v2(F &&f, Number< N >)
Definition: sequence_helper.hpp:25
 
__host__ constexpr __device__ auto sequence_to_tuple_of_number(Sequence< Is... >)
Definition: container_helper.hpp:380
 
std::enable_if< B, T > enable_if
Definition: enable_if.hpp:24
 
constexpr bool is_same_v
Definition: type.hpp:283
 
__host__ constexpr __device__ auto container_reorder_given_new2old(const Array< TData, NSize > &old_array, Sequence< IRs... >)
Definition: container_helper.hpp:43
 
__host__ constexpr __device__ auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:211
 
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition: type.hpp:297
 
int32_t index_t
Definition: ck.hpp:297
 
__host__ constexpr __device__ void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
Definition: tensor_descriptor.hpp:508
 
__host__ constexpr __device__ auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition: tensor_descriptor.hpp:444
 
__device__ index_t get_thread_local_1d_id()
Definition: get_id.hpp:19
 
typename remove_cv< T >::type remove_cv_t
Definition: type.hpp:295
 
__host__ constexpr __device__ auto container_reorder_given_old2new(const Array< TData, NSize > &old_array, Sequence< IRs... > old2new)
Definition: container_helper.hpp:54
 
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition: dtype_vector.hpp:54
 
__host__ static constexpr __device__ T QuietNaN()
Definition: numeric_limits.hpp:313
 
Definition: tensor_space_filling_curve.hpp:20
 
static __device__ constexpr __host__ auto GetForwardStep(Number< AccessIdx1d >)
Definition: tensor_space_filling_curve.hpp:66
 
__host__ static constexpr __device__ index_t GetNumOfAccess()
Definition: tensor_space_filling_curve.hpp:41
 
static constexpr index_t ScalarPerVector
Definition: tensor_space_filling_curve.hpp:25
 
static __device__ constexpr __host__ Index GetIndex(Number< AccessIdx1d >)
Definition: tensor_space_filling_curve.hpp:81
 
static __device__ constexpr __host__ auto GetStepBetween(Number< AccessIdx1dBegin >, Number< AccessIdx1dEnd >)
Definition: tensor_space_filling_curve.hpp:52
 
Definition: threadwise_tensor_slice_transfer.hpp:1877
 
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:1878
 
constexpr __device__ ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow(const Index &src_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1882
 
ElementwiseOperation element_op_
Definition: threadwise_tensor_slice_transfer.hpp:1988
 
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:1896
 
Definition: threadwise_tensor_slice_transfer.hpp:2005
 
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:2006
 
constexpr __device__ ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow(const Index &src_idx)
Definition: threadwise_tensor_slice_transfer.hpp:2010
 
ElementwiseOperation element_op_
Definition: threadwise_tensor_slice_transfer.hpp:2095
 
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:2024
 
Threadwise data transfer.
Definition: threadwise_tensor_slice_transfer.hpp:1720
 
static constexpr index_t PackedSize
Definition: threadwise_tensor_slice_transfer.hpp:1725
 
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:1747
 
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:1721
 
ElementwiseOperation element_op_
Definition: threadwise_tensor_slice_transfer.hpp:1853
 
constexpr __device__ ThreadwiseTensorSliceTransfer_StaticToStatic(const ElementwiseOperation &element_op)
Definition: threadwise_tensor_slice_transfer.hpp:1732
 
Definition: threadwise_tensor_slice_transfer.hpp:39
 
static constexpr __device__ auto GetDstCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:149
 
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:40
 
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:42
 
decltype(make_tensor_coordinate(DstDesc{}, Index{})) DstCoord
Definition: threadwise_tensor_slice_transfer.hpp:44
 
constexpr __device__ ThreadwiseTensorSliceTransfer_v1r3(const DstDesc &dst_desc, const Index &dst_slice_origin_idx, const ElementwiseOperation &element_op)
Definition: threadwise_tensor_slice_transfer.hpp:48
 
decltype(make_tensor_coordinate_step(DstDesc{}, Index{})) DstCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:46
 
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:173
 
__device__ void SetDstSliceOrigin(const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:60
 
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &dst_desc, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:66
 
Definition: threadwise_tensor_slice_transfer.hpp:440
 
__device__ void Run(const SrcDesc &src_desc, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:493
 
static constexpr index_t PackedSize
Definition: threadwise_tensor_slice_transfer.hpp:453
 
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx, const SrcMoveSliceWindowStepHack &src_move_slice_window_step_hack)
Definition: threadwise_tensor_slice_transfer.hpp:634
 
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:478
 
decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition: threadwise_tensor_slice_transfer.hpp:449
 
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:447
 
decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:451
 
static constexpr __device__ auto GetSrcCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:593
 
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:617
 
constexpr __device__ ThreadwiseTensorSliceTransfer_v2_gather(const SrcDesc &src_desc, const Index &src_slice_origin_idx, const StaticallyIndexedArray< index_t, scale_gather_num > &scale_gather_offsets)
Definition: threadwise_tensor_slice_transfer.hpp:460
 
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:445
 
Helper structure that facilitates transfer of source (grid) data to destination threads.
Definition: threadwise_tensor_slice_transfer.hpp:234
 
constexpr __device__ ThreadwiseTensorSliceTransfer_v2(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:254
 
__device__ void Run(const SrcDesc &src_desc, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:276
 
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:241
 
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:389
 
static constexpr __device__ auto GetSrcCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:365
 
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx, const SrcMoveSliceWindowStepHack &src_move_slice_window_step_hack)
Definition: threadwise_tensor_slice_transfer.hpp:406
 
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:270
 
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:239
 
decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:245
 
static constexpr index_t PackedSize
Definition: threadwise_tensor_slice_transfer.hpp:247
 
decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition: threadwise_tensor_slice_transfer.hpp:243
 
Definition: threadwise_tensor_slice_transfer.hpp:681
 
decltype(make_tensor_coordinate(DstDesc{}, Index{})) DstCoord
Definition: threadwise_tensor_slice_transfer.hpp:686
 
decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:688
 
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:683
 
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx, const SrcMoveSliceWindowStepHack &src_move_slice_window_step_hack)
Definition: threadwise_tensor_slice_transfer.hpp:1193
 
__device__ void RunRead(const SrcDesc &src_desc, const SrcBuffer &src_buf, const SrcStepHacks &src_step_hacks)
Definition: threadwise_tensor_slice_transfer.hpp:716
 
decltype(make_tensor_coordinate_step(DstDesc{}, Index{})) DstCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:689
 
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1209
 
__device__ void SetDstSliceOrigin(const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:709
 
__device__ void RunWrite(const DstDesc &dst_desc, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:1042
 
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:704
 
static constexpr __device__ auto GetSrcCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:1055
 
static constexpr __device__ auto GetDstCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:1115
 
constexpr __device__ ThreadwiseTensorSliceTransfer_v3(const SrcDesc &src_desc, const Index &src_slice_origin, const DstDesc &dst_desc, const Index &dst_slice_origin)
Definition: threadwise_tensor_slice_transfer.hpp:691
 
decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition: threadwise_tensor_slice_transfer.hpp:685
 
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:682
 
__device__ void RunRead(const SrcDesc &src_desc, const SrcBuffer &src_buf)
Definition: threadwise_tensor_slice_transfer.hpp:1028
 
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1176
 
__device__ void RunWrite(const DstDesc &dst_desc, DstBuffer &dst_buf, const DstStepHacks &dst_step_hacks)
Definition: threadwise_tensor_slice_transfer.hpp:871
 
Definition: threadwise_tensor_slice_transfer.hpp:1260
 
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:1261
 
__device__ void Run(const SrcDesc &, const SrcRefToOriginDisplacement &, const SrcBuffer &src_buf, const DstDesc &, const DstOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:1293
 
static constexpr index_t PackedSize
Definition: threadwise_tensor_slice_transfer.hpp:1269
 
decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition: threadwise_tensor_slice_transfer.hpp:1265
 
constexpr __device__ ThreadwiseTensorSliceTransfer_v4(const Index &src_ref_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1276
 
decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:1267
 
__device__ void Run(const SrcDesc &, const SrcRefToOriginDisplacement &, const SrcBuffer &src_buf, const DstData &scale, const DstDesc &, const DstOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:1487
 
__device__ void SetSrcCoord(const Index &src_ref_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1693
 
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:1263
 
__device__ void MoveSrcSliceWindow(const SrcDesc &, const SrcSliceMoveStepIdx &src_slice_move_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1683
 
Definition: threadwise_tensor_slice_transfer_util.hpp:20
 
Definition: threadwise_tensor_slice_transfer_util.hpp:29
 
Definition: data_type.hpp:41
 
Definition: integral_constant.hpp:20
 
Definition: is_known_at_compile_time.hpp:14
 
Definition: data_type.hpp:197
 
Definition: functional2.hpp:33
 
Definition: functional3.hpp:97
 
Definition: unary_element_wise_operation.hpp:241
 
Definition: unary_element_wise_operation.hpp:277
 
Definition: unary_element_wise_operation.hpp:133
 
Definition: dtype_vector.hpp:30
 
Definition: dtype_vector.hpp:10