include/ck/wrapper/operations/copy.hpp File Reference
#include "ck/wrapper/utils/tensor_utils.hpp"#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp"#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp"#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"#include "ck/tensor_description/tensor_space_filling_curve.hpp"Go to the source code of this file.
Functions

template<typename DimAccessOrderTuple, index_t VectorDim, index_t ScalarPerVector, typename SrcTensorType, typename DstTensorType>
__device__ void copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor)
    Perform an optimized copy between two tensor partitions (threadwise copy). Tensors must have the same size.

template<typename SrcTensorType, typename DstTensorType>
__host__ __device__ void copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor)
    Perform a generic copy between two tensor partitions (threadwise copy). Tensors must have the same size.

template<typename DimAccessOrderTuple, index_t VectorDim, index_t ScalarPerVector, typename SrcTensorType, typename DstTensorType, typename ThreadShape, typename ThreadUnrolledDesc>
__device__ void blockwise_copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor, [[maybe_unused]] const Layout<ThreadShape, ThreadUnrolledDesc> &thread_layout)
    Perform an optimized blockwise copy between two tensors. Tensors must have the same size.
Function Documentation
◆ blockwise_copy()
template<typename DimAccessOrderTuple, index_t VectorDim, index_t ScalarPerVector, typename SrcTensorType, typename DstTensorType, typename ThreadShape, typename ThreadUnrolledDesc>
__device__ void blockwise_copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor, [[maybe_unused]] const Layout<ThreadShape, ThreadUnrolledDesc> &thread_layout)
Perform an optimized blockwise copy between two tensors. Tensors must have the same size.
- Note
- Vgpr and Sgpr tensors are not currently supported.
- Template Parameters
- DimAccessOrderTuple: Tuple with the dimension access order.
- VectorDim: Dimension for vectorized read and write.
- ScalarPerVector: Number of scalars per vectorized read and write.
- Parameters
- src_tensor: Source tensor.
- dst_tensor: Destination tensor.
- thread_layout: Thread layout for each dimension of the copy.
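For illustration, a minimal sketch of a blockwise_copy call inside a kernel. The per-block tile tensors, the thread grid shape, and the ck::wrapper::make_layout call used to build the thread layout are assumptions of this example, not definitions from this header.

```cpp
// Sketch only: `src_block_tile` and `dst_block_tile` are assumed to be equally
// sized tensors (e.g. a global-memory tile and an LDS tile) created elsewhere
// with the ck::wrapper tensor utilities.
using DimAccessOrder = ck::Tuple<ck::Number<0>, ck::Number<1>>; // traverse dim 0, then dim 1
constexpr ck::index_t vector_dim        = 1; // vectorize along the contiguous dimension
constexpr ck::index_t scalar_per_vector = 4; // elements moved per vectorized access

// Assumed 4 x 64 thread grid cooperating on the tile (shape, strides, and the
// make_layout usage are example assumptions).
const auto thread_layout = ck::wrapper::make_layout(
    ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}),
    ck::make_tuple(ck::Number<64>{}, ck::Number<1>{}));

// All threads in the block participate in the copy.
ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
    src_block_tile, dst_block_tile, thread_layout);
```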
◆ copy() [1/2]
template<typename DimAccessOrderTuple, index_t VectorDim, index_t ScalarPerVector, typename SrcTensorType, typename DstTensorType>
__device__ void copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor)
Perform an optimized copy between two tensor partitions (threadwise copy). Tensors must have the same size.
- Template Parameters
- DimAccessOrderTuple: Tuple with the dimension access order.
- VectorDim: Dimension for vectorized read and write.
- ScalarPerVector: Number of scalars per vectorized read and write.
- Parameters
- src_tensor: Source tensor.
- dst_tensor: Destination tensor.
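For illustration, a minimal sketch of the optimized threadwise copy. The per-thread partitions src_thread_tile and dst_thread_tile, as well as the ck::wrapper namespace qualification, are assumptions of this example.

```cpp
// Sketch only: assumes `src_thread_tile` and `dst_thread_tile` are equally
// sized per-thread tensor partitions obtained elsewhere (e.g. from local
// partitioning of a block tile).
using DimAccessOrder = ck::Tuple<ck::Number<0>, ck::Number<1>>; // traverse dim 0, then dim 1
constexpr ck::index_t vector_dim        = 1; // vectorize along the contiguous dimension
constexpr ck::index_t scalar_per_vector = 8; // elements moved per vectorized access

ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(src_thread_tile,
                                                                 dst_thread_tile);
```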
◆ copy() [2/2]
template<typename SrcTensorType, typename DstTensorType>
__host__ __device__ void copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor)
Perform a generic copy between two tensor partitions (threadwise copy). Tensors must have the same size.
- Parameters
- src_tensor: Source tensor.
- dst_tensor: Destination tensor.
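For illustration, a minimal sketch of the generic overload, which takes no access-order or vectorization template arguments and so serves as a simple fallback; the two partition tensors are assumed to exist.

```cpp
// Sketch only: `src_thread_tile` and `dst_thread_tile` are assumed to be
// equally sized tensor partitions. The generic overload copies without
// explicit vectorization hints.
ck::wrapper::copy(src_thread_tile, dst_thread_tile);
```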