Wrapper#
Description#
The CK library provides a lightweight wrapper for more complex operations implemented in the library.
Example:
const auto shape_4x2x4         = ck::make_tuple(4, ck::make_tuple(2, 4));
const auto strides_s2x1x8      = ck::make_tuple(2, ck::make_tuple(1, 8));
const auto layout = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
std::array<ck::index_t, 32> data;
auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
for(ck::index_t w = 0; w < size(tensor); w++) {
    tensor(w) = w;
}
// slice() == slice(0, -1) (whole dimension)
auto tensor_slice = tensor(ck::wrapper::slice(1, 3), ck::make_tuple(ck::wrapper::slice(), ck::wrapper::slice()));
std::cout << "dims:2,(2,4) strides:2,(1,8)" << std::endl;
for(ck::index_t h = 0; h < ck::wrapper::size<0>(tensor_slice); h++)
{
    for(ck::index_t w = 0; w < ck::wrapper::size<1>(tensor_slice); w++)
    {
        std::cout << tensor_slice(h, w) << " ";
    }
    std::cout << std::endl;
}
Output:
dims:2,(2,4) strides:2,(1,8)
1 5 9 13 17 21 25 29
2 6 10 14 18 22 26 30
Tutorials:
Advanced examples:
Layout#
- 
template<typename Shape, typename UnrolledDescriptorType>
 struct Layout#
- Layout wrapper that performs the tensor descriptor logic. - Template Parameters:
- Shape – Tuple of Number<> (for compile-time layout) or index_t (dynamic layout). It is possible to pass nested shapes (e.g. ((4, 2), 2)), nested dimensions are merged. 
- UnrolledDescriptorType – Tensor descriptor for unnested shape dims. 
 
 
Layout helpers#
Functions
- 
template<typename Shape, typename Strides>
 __host__ __device__ constexpr auto make_layout(const Shape &shape, const Strides &strides)#
- Make layout function. - Template Parameters:
- Shape – Shape for layout. 
- Strides – Strides for layout. 
 
- Returns:
- Constructed layout. 
 
- 
template<typename Shape>
 __host__ __device__ constexpr auto make_layout(const Shape &shape)#
- Make layout function with packed strides (column-major). - Template Parameters:
- Shape – Shape for layout. 
- Returns:
- Constructed layout. 
 
- 
template<typename T>
 __host__ __device__ constexpr T get(const T &dim)#
- Get dim. - Parameters:
- dim – Dimension. 
- Returns:
- Returned the same dimension. 
 
- 
template<index_t idx, typename ...Dims>
 __host__ __device__ constexpr auto get(const Tuple<Dims...> &tuple)#
- Get element from tuple (Shape/Strides/Idxs). - Template Parameters:
- idx – Index to lookup. 
- Parameters:
- tuple – Tuple to lookup. 
- Returns:
- Requested element. 
 
- 
template<index_t idx, typename Shape, typename UnrolledDesc>
 __host__ __device__ constexpr auto get(const Layout<Shape, UnrolledDesc> &layout)#
- Get sub layout. - Template Parameters:
- idx – Index to lookup. 
- Parameters:
- layout – Layout to create sub layout. 
- Returns:
- Requested sub layout. 
 
- 
template<index_t Idx, index_t... Idxs, typename T>
 __host__ __device__ constexpr auto get(const T &elem)#
- Hierarchical get. - Template Parameters:
- Idxs – Indexes to lookup. 
- Parameters:
- elem – Element to lookup. 
- Returns:
- Requested element. 
 
- 
template<typename T>
 __host__ __device__ constexpr T size(const T &dim)#
- Get size. - Parameters:
- dim – Size. 
- Returns:
- Returned the same size. 
 
- 
template<index_t idx, typename Shape, typename UnrolledDescriptorType>
 __host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType> &layout)#
- Length get (product if tuple). - Template Parameters:
- idx – Index to lookup. 
- Parameters:
- layout – Layout to get Shape of. 
- Returns:
- Requested length. 
 
- 
template<typename ...ShapeDims>
 __host__ __device__ constexpr auto size(const Tuple<ShapeDims...> &shape)#
- Shape size (product of dims). - Parameters:
- shape – Shape to lookup. 
- Returns:
- Requested size. 
 
- 
template<typename Shape, typename UnrolledDescriptorType>
 __host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType> &layout)#
- Layout size (product of dims). - Parameters:
- layout – Layout to calculate shape size. 
- Returns:
- Requested size. 
 
- 
template<index_t idx, typename ...Ts>
 __host__ __device__ constexpr auto size(const Tuple<Ts...> &tuple)#
- Length get from tuple (product if tuple). - Template Parameters:
- idx – Index to lookup. 
- Parameters:
- tuple – Tuple to lookup. 
- Returns:
- Requested length. 
 
- 
template<index_t Idx, index_t... Idxs, typename T>
 __host__ __device__ constexpr auto size(const T &elem)#
- Hierarchical size. - Template Parameters:
- Idx – First index to lookup (to avoid empty Idxs). 
- Idxs – Next indexes to lookup. 
 
- Parameters:
- elem – Element to lookup. 
- Returns:
- Requested element. 
 
- 
template<typename Shape, typename UnrolledDescriptorType>
 __host__ __device__ constexpr auto rank([[maybe_unused]] const Layout<Shape, UnrolledDescriptorType> &layout)#
- Get layout rank (num elements in shape). - Parameters:
- layout – Layout to calculate rank. 
- Returns:
- Requested rank. 
 
- 
template<typename ...Dims>
 __host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple<Dims...> &tuple)#
- Get tuple rank (num elements in tuple). Return 1 if scalar passed. - Parameters:
- tuple – Tuple to calculate rank. 
- Returns:
- Requested rank. 
 
- 
template<index_t IDim>
 __host__ __device__ constexpr index_t rank([[maybe_unused]] const Number<IDim> &dim)#
- Rank for scalar. - Parameters:
- dim – Dimension scalar. 
- Returns:
- Returned 1. 
 
- 
__host__ __device__ constexpr index_t rank([[maybe_unused]] const index_t &dim)#
- Rank for scalar. - Parameters:
- dim – Dimension scalar. 
- Returns:
- Returned 1. 
 
- 
template<index_t... Idxs, typename T>
 __host__ __device__ constexpr auto rank(const T &elem)#
- Hierarchical rank. - Template Parameters:
- Idxs – Indexes to lookup. 
- Parameters:
- elem – Element to lookup. 
- Returns:
- Requested rank. 
 
- 
template<typename Shape, typename UnrolledDescriptorType>
 __host__ __device__ constexpr auto depth(const Layout<Shape, UnrolledDescriptorType> &layout)#
- Get depth of the layout shape (return 0 if scalar). - Parameters:
- layout – Layout to calculate depth. 
- Returns:
- Requested depth. 
 
- 
template<typename ...Dims>
 __host__ __device__ constexpr auto depth(const Tuple<Dims...> &tuple)#
- Get depth of the tuple. (return 0 if scalar) - Parameters:
- tuple – Tuple to calculate depth. 
- Returns:
- Requested depth. 
 
- 
template<index_t IDim>
 __host__ __device__ constexpr index_t depth([[maybe_unused]] const Number<IDim> &dim)#
- Depth for scalar. - Parameters:
- dim – Scalar. 
- Returns:
- Returned 0. 
 
- 
__host__ __device__ constexpr index_t depth([[maybe_unused]] const index_t &dim)#
- Depth for scalar. - Parameters:
- dim – Scalar. 
- Returns:
- Returned 0. 
 
- 
template<index_t... Idxs, typename T>
 __host__ __device__ constexpr auto depth(const T &elem)#
- Hierarchical depth. - Template Parameters:
- Idxs – Indexes to lookup. 
- Parameters:
- elem – Element to lookup. 
- Returns:
- Requested depth. 
 
- 
template<typename LayoutType>
 __host__ __device__ constexpr const auto &shape(const LayoutType &layout)#
- Get Layout shape. - Parameters:
- layout – Layout to get shape from. 
- Returns:
- Requested shape. 
 
- 
template<typename Shape, typename UnrolledDesc, typename TileLengths>
 __host__ __device__ constexpr auto pad(const Layout<Shape, UnrolledDesc> &layout, const TileLengths &tile_lengths)#
- Pad layout shapes to be adjusted to tile lengths. - Parameters:
- layout – Layout to pad. 
- tile_lengths – Tile lengths to align layout shape. 
 
- Returns:
- Padded layout. 
 
- 
template<index_t Idx, typename Shape, typename UnrolledDesc, typename NewLengths, typename NewIdxs>
 __host__ __device__ constexpr auto unmerge(const Layout<Shape, UnrolledDesc> &layout, const NewLengths &new_lengths, [[maybe_unused]] const NewIdxs &new_indexes)#
- Unmerge selected dim in layout. - Template Parameters:
- Idx – Index to dimension being unmerged. 
- Parameters:
- layout – Layout to unmerge. 
- new_lengths – Dimensions into which the indicated dimension will be divided. 
- new_indexes – Indexes to shuffle dims. Dims for unmerged dim should be nested. 
 
- Returns:
- Unmerged layout. 
 
Tensor#
- 
template<MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
 struct Tensor#
- Tensor wrapper that performs static and dynamic buffer logic. The tensor is based on a descriptor stored in the Layout. Additionally, tensor can be sliced or shifted using multi-index offset. - Template Parameters:
- BufferAddressSpace – Memory type (Generic, Global, LDS, VGPR, SGPR). 
- ElementType – Element data type. 
- Shape – Tensor shape (layout component). 
- UnrolledDescriptorType – Flatten descriptor (layout component). 
 
 
Tensor helpers#
Typedefs
- 
using MemoryTypeEnum = AddressSpaceEnum#
- Memory type, allowed members: - Generic, 
- Global, 
- Lds, 
- Sgpr, 
- Vgpr, 
 
Functions
- 
template<MemoryTypeEnum MemoryType, typename ElementType, typename Shape, typename UnrolledDescriptorType>
 constexpr auto make_tensor(ElementType *pointer, const Layout<Shape, UnrolledDescriptorType> &layout)#
- Make tensor function. - Template Parameters:
- MemoryType – Type of memory. 
- Parameters:
- pointer – Pointer to the memory. 
- layout – Tensor layout. 
 
- Returns:
- Constructed tensor. 
 
- 
template<MemoryTypeEnum MemoryType, typename ElementType, typename Shape, typename UnrolledDescriptorType>
 constexpr auto make_register_tensor(const Layout<Shape, UnrolledDescriptorType> &layout)#
- Make SGPR or VGPR tensor function. - Template Parameters:
- MemoryType – Type of memory. 
- ElementType – Memory data type. 
 
- Returns:
- Constructed tensor. 
 
- 
template<MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
 __host__ __device__ void clear(Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType> &tensor)#
- Clear tensor. (Only for Vgpr/Sgpr) - Parameters:
- tensor – Tensor to be cleared. 
 
- 
template<MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
 __host__ __device__ constexpr const auto &layout(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType> &tensor)#
- Get Tensor layout. - Parameters:
- tensor – Tensor to get layout of. 
- Returns:
- Requested layout. 
 
- 
template<index_t... Idxs, MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
 __host__ __device__ constexpr auto size(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType> &tensor)#
- Product of tensor shape dims. - Template Parameters:
- Idxs – Indexes to access specific shape dim (optional). 
- Parameters:
- tensor – Tensor to get Shape of. 
- Returns:
- Requested size. 
 
- 
template<index_t... Idxs, MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
 __host__ __device__ constexpr auto rank(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType> &tensor)#
- Rank of Shape tuple. - Template Parameters:
- Idxs – Indexes to access specific shape dim (optional). 
- Parameters:
- tensor – Tensor to get rank of. 
- Returns:
- Requested rank. 
 
- 
template<index_t... Idxs, MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
 __host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType> &tensor)#
- Depth of Shape tuple. - Template Parameters:
- Idxs – Indexes to access specific shape dim (optional). 
- Parameters:
- tensor – Tensor to get depth of. 
- Returns:
- Requested depth. 
 
- 
template<MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
 __host__ __device__ constexpr const auto &shape(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType> &tensor)#
- Get Tensor shape. - Parameters:
- tensor – Tensor to get shape from. 
- Returns:
- Requested shape. 
 
- 
template<typename FromType, typename ToType>
 constexpr auto slice(const FromType from, const ToType to)#
- Get dim slice. - Parameters:
- from – Beginning of the interval. 
- to – End of the interval. (could be also negative to index from the end) 
 
- Returns:
- Requested slice. Could be used to create sliced tensor from other tensor. 
 
- 
template<typename ToType>
 constexpr auto slice(const ToType to)#
- Get dim slice. (Assumed that from is equal to 1) - Parameters:
- to – End of the interval. (could be also negative to index from the end) 
- Returns:
- Requested slice. Could be used to create sliced tensor from other tensor. 
 
- 
constexpr auto slice()#
- Get whole dim slice (from = 0, to = -1). - Returns:
- Requested slice. Could be used to create sliced tensor from other tensor. 
 
Functions
- 
template<typename TensorType, typename ThreadShape, typename ThreadUnrolledDesc, typename ProjectionTuple>
 __host__ __device__ constexpr auto make_local_partition(TensorType &tensor, [[maybe_unused]] const Layout<ThreadShape, ThreadUnrolledDesc> &thread_layout, const index_t thread_id, const ProjectionTuple &projection)#
- Create local partition for thread (currently only packed partition is supported). - Parameters:
- Returns:
- Partition tensor. 
 
- 
template<typename TensorType, typename ThreadShape, typename ThreadUnrolledDesc>
 __host__ __device__ constexpr auto make_local_partition(TensorType &tensor, const Layout<ThreadShape, ThreadUnrolledDesc> &thread_lengths, const index_t thread_id)#
- Create local partition for thread (currently only packed partition is supported). 
- 
template<typename TensorType, typename BlockShapeTuple, typename BlockIdxs, typename ProjectionTuple>
 __host__ __device__ constexpr auto make_local_tile(const TensorType &tensor, const BlockShapeTuple &tile_shape, const BlockIdxs &block_idxs, const ProjectionTuple &projection)#
- Create local tile for thread block (currently only packed tile is supported). - Note - Currently, to get the best performance, use a 2d tile_shape. - Parameters:
- tensor – Tensor for partition. 
- tile_shape – Shapes of requested tile. 
- block_idxs – Tuple of block indexes represented as integer. If slice, then get whole dim. 
- projection – Projection is used to remove selected dim from partitioning. Use slice(X) to remove a dimension, where X is the dim size. Use Number<1>{} to keep it.
 
- Returns:
- Tile tensor. 
 
- 
template<typename TensorType, typename BlockShapeTuple, typename BlockIdxs>
 __host__ __device__ constexpr auto make_local_tile(const TensorType &tensor, const BlockShapeTuple &tile_shape, const BlockIdxs &block_idxs)#
- Create local tile for thread block (currently only packed tile is supported). - Note - Currently, to get the best performance, please use a 2d shape. - Parameters:
- tensor – Tensor for partition. 
- tile_shape – Shapes of requested tile. 
- block_idxs – Tuple of block indexes represented as integer. If slice, then get whole dim. 
 
- Returns:
- Tile tensor. 
 
Operations#
Functions
- 
template<typename DimAccessOrderTuple, index_t VectorDim, index_t ScalarPerVector, typename SrcTensorType, typename DstTensorType>
 __device__ void copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor)#
- Perform optimized copy between two tensors partitions (threadwise copy). Tensors must have the same size. - Template Parameters:
- DimAccessOrderTuple – Tuple with dimension access order. 
- VectorDim – Dimension for vectorized read and write. 
- ScalarPerVector – Number of scalars per vectorized read and write. 
 
- Parameters:
- src_tensor – Source tensor. 
- dst_tensor – Destination tensor. 
 
 
- 
template<typename SrcTensorType, typename DstTensorType>
 __host__ __device__ void copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor)#
- Perform generic copy between two tensors partitions (threadwise copy). Tensors must have the same size. - Parameters:
- src_tensor – Source tensor. 
- dst_tensor – Destination tensor. 
 
 
- 
template<typename DimAccessOrderTuple, index_t VectorDim, index_t ScalarPerVector, typename SrcTensorType, typename DstTensorType, typename ThreadShape, typename ThreadUnrolledDesc>
 __device__ void blockwise_copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor, [[maybe_unused]] const Layout<ThreadShape, ThreadUnrolledDesc> &thread_layout)#
- Perform optimized blockwise copy between two tensors. Tensors must have the same size. - Note - Currently Vgpr and Sgpr are not supported. - Template Parameters:
- DimAccessOrderTuple – Tuple with dimension access order. 
- VectorDim – Dimension for vectorized read and write. 
- ScalarPerVector – Number of scalars per vectorized read and write. 
 
- Parameters:
- src_tensor – Source tensor. 
- dst_tensor – Destination tensor. 
- thread_layout – Thread layout per each dimension for copy. 
 
 
Functions
- 
template<typename DataType, index_t BlockSize, typename GemmTraits, typename ATensorType, typename BTensorType, typename CTensorType>
 __device__ void blockwise_gemm_xdl(const ATensorType &a_local_tile_tensor, const BTensorType &b_local_tile_tensor, CTensorType &c_reg_tensor)#
- Perform blockwise gemm xdl on tensors stored in lds. Result will be stored in Vgpr register. A data layout must be (MPerBlock, KPerBlock) or (K0PerBlock, MPerBlock, K1) and B data layout must be (NPerBlock, KPerBlock) or (K0PerBlock, NPerBlock, K1). - Note - C output Vgpr register layout (8D): - MXdlPerWave - The number of MFMA instructions run by single wave in M dimension per tile. 
- NXdlPerWave - The number of MFMA instructions run by single wave in N dimension per tile. 
- MWave - Equals to 1 since this is for single wave. 
- NWave - Equals to 1 since this is for single wave. 
- NumGroupsPerBlock - Mfma instruction internal layout (depends on the instruction size). 
- NumInputsBlock - Mfma instruction internal layout (depends on the instruction size). 
- GroupSize - Mfma instruction internal layout (depends on the instruction size). 
- NumThreadsPerBlock - Mfma instruction internal layout (depends on the instruction size). 
 - Template Parameters:
- DataType – Input data types. 
- BlockSize – Number of threads in block. 
- GemmTraits – Traits of gemm xdl operation. 
 
- Parameters:
- a_local_tile_tensor – A tensor in LDS memory for blockwise gemm (MPerBlock, KPerBlock) or (K0PerBlock, MPerBlock, K1) layout. 
- b_local_tile_tensor – B tensor in LDS memory for blockwise gemm (NPerBlock, KPerBlock) or (K0PerBlock, NPerBlock, K1) layout. 
- c_reg_tensor – C tensor VGPR memory for blockwise gemm. 
 
 
- 
template<typename DataType, typename ATileLayout, typename BTileLayout, index_t BlockSize, typename GemmTraits, typename CTensorType>
 __host__ __device__ constexpr auto make_blockwise_gemm_xdl_c_local_partition(CTensorType &c_local_tile_tensor)#
- Create local partition per thread for C tensor. - Note - C output global memory layout (8D): - MXdlPerWave - The number of MFMA instructions run by single wave in M dimension. 
- NXdlPerWave - The number of MFMA instructions run by single wave in N dimension. 
- MWave - The number of waves in single tile M dimension per tile. 
- NWave - The number of waves in single tile N dimension per tile. 
- NumGroupsPerBlock - Mfma instruction internal layout (depends on the instruction size). 
- NumInputsBlock - Mfma instruction internal layout (depends on the instruction size). 
- GroupSize - Mfma instruction internal layout (depends on the instruction size). 
- NumThreadsPerBlock - Mfma instruction internal layout (depends on the instruction size). 
 - Template Parameters:
- DataType – Input data types. 
- ATileLayout – A tensor layout. 
- BTileLayout – B tensor layout. 
- BlockSize – Number of threads in block. 
- GemmTraits – Traits of gemm xdl operation. 
 
- Parameters:
- c_local_tile_tensor – C tensor in LDS memory for blockwise gemm (MPerBlock, NPerBlock) layout. 
- Returns:
- Partition c tensor for blockwise gemm. 
 
- 
template<typename DataType, typename ATileLayout, typename BTileLayout, index_t BlockSize, typename GemmTraits>
 __host__ __device__ constexpr auto make_blockwise_gemm_xdl_c_vgpr()#
- Create local partition per thread for C tensor. - Note - C output Vgpr register layout (8D): - MXdlPerWave - The number of MFMA instructions run by single wave in M dimension per tile. 
- NXdlPerWave - The number of MFMA instructions run by single wave in N dimension per tile. 
- MWave - Equals to 1 since this is for single wave. 
- NWave - Equals to 1 since this is for single wave. 
- NumGroupsPerBlock - Mfma instruction internal layout (depends on the instruction size). 
- NumInputsBlock - Mfma instruction internal layout (depends on the instruction size). 
- GroupSize - Mfma instruction internal layout (depends on the instruction size). 
- NumThreadsPerBlock - Mfma instruction internal layout (depends on the instruction size). 
 - Template Parameters:
- DataType – Input data types. 
- ATileLayout – A tensor layout. 
- BTileLayout – B tensor layout. 
- BlockSize – Number of threads in block. 
- GemmTraits – Traits of gemm xdl operation. 
 
- Returns:
- Vgpr c tensor for blockwise gemm.