// Perform an optimized copy between two tensor partitions (threadwise copy).
// Tensors must have the same size.
template <typename DimAccessOrderTuple,
          index_t VectorDim,
          index_t ScalarPerVector,
          typename SrcTensorType,
          typename DstTensorType>
__device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
{
    static_assert(is_detected<is_tuple, DimAccessOrderTuple>::value);
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    const auto& in_grid_desc  = layout(src_tensor).GetUnrolledDescriptor();
    const auto& out_grid_desc = layout(dst_tensor).GetUnrolledDescriptor();

    using SrcShapeType = remove_cvref_t<decltype(shape(src_tensor))>;
    constexpr index_t num_dims = SrcShapeType::Size();

    // Per-thread slice lengths are taken from the compile-time source shape.
    constexpr auto thread_slice_lengths =
        generate_sequence_v2([](auto I) { return size<I>(SrcShapeType{}); }, Number<num_dims>{});
    constexpr auto dim_access_order = generate_sequence_v2(
        [](auto I) { return DimAccessOrderTuple{}.At(I); }, Number<num_dims>{});
 
    if constexpr(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
    {
        // Both buffers are dynamic (runtime-addressed, e.g. global or LDS memory).
        auto transfer = ThreadwiseTensorSliceTransfer_v7<
            Tuple<typename SrcTensorType::TensorElementType>,
            Tuple<typename DstTensorType::TensorElementType>,
            decltype(tie(in_grid_desc)),
            decltype(tie(out_grid_desc)),
            tensor_operation::element_wise::PassThrough,
            Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
            decltype(thread_slice_lengths),
            decltype(dim_access_order),
            VectorDim,
            ScalarPerVector,
            Sequence<true>,
            Sequence<true>>{in_grid_desc,
                            make_tuple(src_tensor.GetMultiIdxOffsets()),
                            out_grid_desc,
                            make_tuple(dst_tensor.GetMultiIdxOffsets()),
                            tensor_operation::element_wise::PassThrough{}};

        transfer.Run(tie(in_grid_desc),
                     tie(src_tensor.GetBuffer()),
                     tie(out_grid_desc),
                     tie(dst_tensor.GetBuffer()));
    }
    else if constexpr(!SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
    {
        // Static (register) source, dynamic destination.
        const auto src_slice_origin_idxs =
            generate_tuple([&](auto) { return I0; }, Number<num_dims>{});

        auto transfer =
            ThreadwiseTensorSliceTransfer_v1r3<typename SrcTensorType::TensorElementType,
                                               typename DstTensorType::TensorElementType,
                                               decltype(in_grid_desc),
                                               decltype(out_grid_desc),
                                               tensor_operation::element_wise::PassThrough,
                                               decltype(thread_slice_lengths),
                                               decltype(dim_access_order),
                                               VectorDim,
                                               ScalarPerVector,
                                               InMemoryDataOperationEnum::Set,
                                               I1,
                                               true>{out_grid_desc,
                                                     dst_tensor.GetMultiIdxOffsets(),
                                                     tensor_operation::element_wise::PassThrough{}};

        transfer.Run(in_grid_desc,
                     src_slice_origin_idxs,
                     src_tensor.GetBuffer(),
                     out_grid_desc,
                     dst_tensor.GetBuffer());
    }
    else if constexpr(SrcTensorType::IsDynamicBuffer && !DstTensorType::IsDynamicBuffer)
    {
        // Dynamic source, static (register) destination.
        const auto dst_slice_origin_idxs =
            generate_tuple([&](auto) { return I0; }, Number<num_dims>{});

        auto transfer = ThreadwiseTensorSliceTransfer_v2<
            std::remove_const_t<typename SrcTensorType::TensorElementType>,
            std::remove_const_t<typename DstTensorType::TensorElementType>,
            decltype(in_grid_desc),
            decltype(out_grid_desc),
            decltype(thread_slice_lengths),
            decltype(dim_access_order),
            VectorDim,
            ScalarPerVector,
            I1,
            false>{in_grid_desc, src_tensor.GetMultiIdxOffsets()};

        transfer.Run(in_grid_desc,
                     src_tensor.GetBuffer(),
                     out_grid_desc,
                     dst_slice_origin_idxs,
                     dst_tensor.GetBuffer());
    }
    else
    {
        // Both buffers are static (register): direct element-by-element copy.
        static_for<0, SrcShapeType::Size(), 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); });
    }
}
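
For orientation, a minimal call-site sketch for this threadwise overload: the per-thread partitions src_partition/dst_partition are hypothetical names, and the access order and vector settings are chosen purely for illustration.

// Row-major access order; vectorize 4 elements along the last dimension.
// src_partition / dst_partition: assumed per-thread partitions of equal 2-D shape.
using DimAccessOrder = Tuple<Number<0>, Number<1>>;
copy<DimAccessOrder, /*VectorDim=*/1, /*ScalarPerVector=*/4>(src_partition, dst_partition);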
 
// Generic copy with a default (identity) dimension access order and without
// vectorized accesses. Tensors must have the same size.
template <typename SrcTensorType, typename DstTensorType>
__host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
{
    using SrcShapeType = remove_cvref_t<decltype(shape(src_tensor))>;
    constexpr index_t num_dims = SrcShapeType::Size();

    constexpr auto dim_access_order_tuple =
        generate_tuple([](auto i) { return Number<i>{}; }, Number<num_dims>{});
    constexpr index_t vector_dim        = num_dims - 1;
    constexpr index_t scalar_per_vector = 1;
    copy<decltype(dim_access_order_tuple), vector_dim, scalar_per_vector>(src_tensor, dst_tensor);
}
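
Under these defaults the generic overload forwards to the threadwise overload above with the identity dimension order, the last dimension as the vector dimension, and one scalar per vector; for a 2-D tensor the forwarded call is equivalent to the following sketch.

// What copy(src_tensor, dst_tensor) expands to when num_dims == 2 (illustrative only).
copy<Tuple<Number<0>, Number<1>>, /*VectorDim=*/1, /*ScalarPerVector=*/1>(src_tensor, dst_tensor);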
 
// Perform an optimized blockwise copy between two tensors. Tensors must have
// the same size.
template <typename DimAccessOrderTuple,
          index_t VectorDim,
          index_t ScalarPerVector,
          typename SrcTensorType,
          typename DstTensorType,
          typename ThreadShape,
          typename ThreadUnrolledDesc>
__device__ void
blockwise_copy(const SrcTensorType& src_tensor,
               DstTensorType& dst_tensor,
               [[maybe_unused]] const Layout<ThreadShape, ThreadUnrolledDesc>& thread_layout)
{
    static_assert(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer);
    static_assert(is_detected<is_tuple, DimAccessOrderTuple>::value);

    const auto& in_grid_desc  = layout(src_tensor).GetUnrolledDescriptor();
    const auto& out_grid_desc = layout(dst_tensor).GetUnrolledDescriptor();

    using SrcShapeType = remove_cvref_t<decltype(shape(src_tensor))>;
    constexpr index_t num_dims = SrcShapeType::Size();

    // Tile lengths come from the source shape; the thread cluster lengths come
    // from the compile-time thread layout shape.
    constexpr auto tile_lengths_seq =
        generate_sequence_v2([](auto I) { return size<I>(SrcShapeType{}); }, Number<num_dims>{});
    constexpr auto thread_layout_seq =
        generate_sequence_v2([](auto I) { return size<I>(ThreadShape{}); }, Number<num_dims>{});
    constexpr auto dim_access_order = generate_sequence_v2(
        [](auto I) { return DimAccessOrderTuple{}.At(I); }, Number<num_dims>{});

    using ThisThreadBlock = ThisThreadBlock<size(ThreadShape{})>;

    auto transfer = ThreadGroupTensorSliceTransfer_v7<
        ThisThreadBlock,
        Tuple<typename SrcTensorType::TensorElementType>,
        Tuple<typename DstTensorType::TensorElementType>,
        decltype(tie(in_grid_desc)),
        decltype(tie(out_grid_desc)),
        tensor_operation::element_wise::PassThrough,
        Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
        std::remove_const_t<decltype(tile_lengths_seq)>,
        std::remove_const_t<decltype(thread_layout_seq)>,
        std::remove_const_t<decltype(dim_access_order)>,
        std::remove_const_t<decltype(dim_access_order)>,
        VectorDim,
        ScalarPerVector,
        Sequence<true>,
        Sequence<true>>{in_grid_desc,
                        make_tuple(src_tensor.GetMultiIdxOffsets()),
                        out_grid_desc,
                        make_tuple(dst_tensor.GetMultiIdxOffsets()),
                        tensor_operation::element_wise::PassThrough{}};

    transfer.Run(tie(in_grid_desc),
                 tie(src_tensor.GetBuffer()),
                 tie(out_grid_desc),
                 tie(dst_tensor.GetBuffer()));
}
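
A call-site sketch for the blockwise variant: the 4x64 thread layout, the tile tensors src_tile/dst_tile, and the make_layout/make_tuple usage here are assumptions chosen for illustration, not prescribed by this header.

// Spread the tile copy over a 4x64 (256-thread) workgroup (hypothetical values).
constexpr auto thread_layout = make_layout(make_tuple(Number<4>{}, Number<64>{}),
                                           make_tuple(Number<64>{}, Number<1>{}));
blockwise_copy<Tuple<Number<0>, Number<1>>, /*VectorDim=*/1, /*ScalarPerVector=*/4>(
    src_tile, dst_tile, thread_layout);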
 
__device__ void blockwise_copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor, [[maybe_unused]] const Layout< ThreadShape, ThreadUnrolledDesc > &thread_layout)
Perform optimized blockwise copy between two tensors. Tensors must have the same size.
Definition: copy.hpp:180

__device__ void copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor)
Perform optimized copy between two tensor partitions (threadwise copy). Tensors must have the same size.
Definition: copy.hpp:36
 
__host__ __device__ constexpr const auto & shape(const LayoutType &layout)
Get Layout shape.
Definition: layout_utils.hpp:431

__host__ __device__ constexpr auto generate_tuple(F &&f, Number< N >)
Definition: tuple_helper.hpp:21

constexpr Tuple< Args &... > tie(Args &... args) noexcept
Definition: tuple.hpp:218

__host__ __device__ constexpr auto generate_sequence_v2(F &&f, Number< N >)
Definition: sequence_helper.hpp:25

__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:211

remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition: type.hpp:297

int32_t index_t
Definition: ck.hpp:300

Layout wrapper that performs the tensor descriptor logic.
Definition: layout.hpp:24

__host__ __device__ constexpr const auto & layout(const Tensor< BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType > &tensor)
Get Tensor Layout.
Definition: tensor_utils.hpp:162