Wrapper#
Description#
The CK library provides a lightweight wrapper for more complex operations implemented in the library.
Example:
const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4));
const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8));
const auto layout = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
std::array<ck::index_t, 32> data;
auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
for(ck::index_t w = 0; w < size(tensor); w++) {
tensor(w) = w;
}
// slice() == slice(0, -1) (whole dimension)
auto tensor_slice = tensor(ck::wrapper::slice(1, 3), ck::make_tuple(ck::wrapper::slice(), ck::wrapper::slice()));
std::cout << "dims:2,(2,4) strides:2,(1,8)" << std::endl;
for(ck::index_t h = 0; h < ck::wrapper::size<0>(tensor_slice); h++)
{
for(ck::index_t w = 0; w < ck::wrapper::size<1>(tensor_slice); w++)
{
std::cout << tensor_slice(h, w) << " ";
}
std::cout << std::endl;
}
Output:
dims:2,(2,4) strides:2,(1,8)
1 5 9 13 17 21 25 29
2 6 10 14 18 22 26 30
Tutorials:
Advanced examples:
Layout#
-
template<typename Shape, typename UnrolledDescriptorType>
struct Layout# Layout wrapper that performs the tensor descriptor logic.
- Template Parameters:
Shape – Tuple of Number<> (for compile-time layout) or index_t (dynamic layout). It is possible to pass nested shapes (e.g. ((4, 2), 2)), nested dimensions are merged.
UnrolledDescriptorType – Tensor descriptor for unnested shape dims.
Layout helpers#
Functions
-
template<typename Shape, typename Strides>
__host__ __device__ constexpr auto make_layout(const Shape &shape, const Strides &strides)# Make layout function.
- Template Parameters:
Shape – Shape for layout.
Strides – Strides for layout.
- Returns:
Constructed layout.
-
template<typename Shape>
__host__ __device__ constexpr auto make_layout(const Shape &shape)# Make layout function with packed strides (column-major).
- Template Parameters:
Shape – Shape for layout.
- Returns:
Constructed layout.
-
template<typename T>
__host__ __device__ constexpr T get(const T &dim)# Get dim.
- Parameters:
dim – Dimension.
- Returns:
Returned the same dimension.
-
template<index_t idx, typename ...Dims>
__host__ __device__ constexpr auto get(const Tuple<Dims...> &tuple)# Get element from tuple (Shape/Strides/Idxs).
- Template Parameters:
idx – Index to lookup.
- Parameters:
tuple – Tuple to lookup.
- Returns:
Requested element.
-
template<index_t idx, typename Shape, typename UnrolledDesc>
__host__ __device__ constexpr auto get(const Layout<Shape, UnrolledDesc> &layout)# Get sub layout.
- Template Parameters:
idx – Index to lookup.
- Parameters:
layout – Layout to create sub layout.
- Returns:
Requested sub layout.
-
template<index_t Idx, index_t... Idxs, typename T>
__host__ __device__ constexpr auto get(const T &elem)# Hierarchical get.
- Template Parameters:
Idxs – Indexes to lookup.
- Parameters:
elem – Element to lookup.
- Returns:
Requested element.
-
template<typename T>
__host__ __device__ constexpr T size(const T &dim)# Get size.
- Parameters:
dim – Size.
- Returns:
Returned the same size.
-
template<index_t idx, typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType> &layout)# Length get (product if tuple).
- Template Parameters:
idx – Index to lookup.
- Parameters:
layout – Layout to get Shape of.
- Returns:
Requested length.
-
template<typename ...ShapeDims>
__host__ __device__ constexpr auto size(const Tuple<ShapeDims...> &shape)# Shape size (product of dims).
- Parameters:
shape – Shape to lookup.
- Returns:
Requested size.
-
template<typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType> &layout)# Layout size (product of dims).
- Parameters:
layout – Layout to calculate shape size.
- Returns:
Requested size.
-
template<index_t idx, typename ...Ts>
__host__ __device__ constexpr auto size(const Tuple<Ts...> &tuple)# Length get from tuple (product if tuple).
- Template Parameters:
idx – Index to lookup.
- Parameters:
tuple – Tuple to lookup.
- Returns:
Requested length.
-
template<index_t Idx, index_t... Idxs, typename T>
__host__ __device__ constexpr auto size(const T &elem)# Hierarchical size.
- Template Parameters:
Idx – First index to lookup (to avoid empty Idxs).
Idxs – Next indexes to lookup.
- Parameters:
elem – Element to lookup.
- Returns:
Requested element.
-
template<typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto rank([[maybe_unused]] const Layout<Shape, UnrolledDescriptorType> &layout)# Get layout rank (num elements in shape).
- Parameters:
layout – Layout to calculate rank.
- Returns:
Requested rank.
-
template<typename ...Dims>
__host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple<Dims...> &tuple)# Get tuple rank (num elements in tuple). Return 1 if scalar passed.
- Parameters:
tuple – Tuple to calculate rank.
- Returns:
Requested rank.
-
template<index_t IDim>
__host__ __device__ constexpr index_t rank([[maybe_unused]] const Number<IDim> &dim)# Rank for scalar.
- Parameters:
dim – Dimension scalar.
- Returns:
Returned 1.
-
__host__ __device__ constexpr index_t rank([[maybe_unused]] const index_t &dim)#
Rank for scalar.
- Parameters:
dim – Dimension scalar.
- Returns:
Returned 1.
-
template<index_t... Idxs, typename T>
__host__ __device__ constexpr auto rank(const T &elem)# Hierarchical rank.
- Template Parameters:
Idxs – Indexes to lookup.
- Parameters:
elem – Element to lookup.
- Returns:
Requested rank.
-
template<typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto depth(const Layout<Shape, UnrolledDescriptorType> &layout)# Get depth of the layout shape (return 0 if scalar).
- Parameters:
layout – Layout to calculate depth.
- Returns:
Requested depth.
-
template<typename ...Dims>
__host__ __device__ constexpr auto depth(const Tuple<Dims...> &tuple)# Get depth of the tuple. (return 0 if scalar)
- Parameters:
tuple – Tuple to calculate depth.
- Returns:
Requested depth.
-
template<index_t IDim>
__host__ __device__ constexpr index_t depth([[maybe_unused]] const Number<IDim> &dim)# Depth for scalar.
- Parameters:
dim – Scalar.
- Returns:
Returned 0.
-
__host__ __device__ constexpr index_t depth([[maybe_unused]] const index_t &dim)#
Depth for scalar.
- Parameters:
dim – Scalar.
- Returns:
Returned 0.
-
template<index_t... Idxs, typename T>
__host__ __device__ constexpr auto depth(const T &elem)# Hierarchical depth.
- Template Parameters:
Idxs – Indexes to lookup.
- Parameters:
elem – Element to lookup.
- Returns:
Requested depth.
-
template<typename LayoutType>
__host__ __device__ constexpr const auto &shape(const LayoutType &layout)# Get Layout shape.
- Parameters:
layout – Layout to get shape from.
- Returns:
Requested shape.
-
template<typename Shape, typename UnrolledDesc, typename TileLengths>
__host__ __device__ constexpr auto pad(const Layout<Shape, UnrolledDesc> &layout, const TileLengths &tile_lengths)# Pad layout shapes to be adjusted to tile lengths.
- Parameters:
layout – Layout to pad.
tile_lengths – Tile lengths to align layout shape.
- Returns:
Padded layout.
-
template<index_t Idx, typename Shape, typename UnrolledDesc, typename NewLengths, typename NewIdxs>
__host__ __device__ constexpr auto unmerge(const Layout<Shape, UnrolledDesc> &layout, const NewLengths &new_lengths, [[maybe_unused]] const NewIdxs &new_indexes)# Unmerge selected dim in layout.
- Template Parameters:
Idx – Index to dimension being unmerged.
- Parameters:
layout – Layout to unmerge.
new_lengths – Dimensions into which the indicated dimension will be divided.
new_indexes – Indexes to shuffle dims. Dims for unmerged dim should be nested.
- Returns:
Unmerged layout.
Tensor#
-
template<MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
struct Tensor# Tensor wrapper that performs static and dynamic buffer logic. The tensor is based on a descriptor stored in the Layout. Additionally, tensor can be sliced or shifted using multi-index offset.
- Template Parameters:
BufferAddressSpace – Memory type (Generic, Global, LDS, VGPR, SGPR).
ElementType – Element data type.
Shape – Tensor shape (layout component).
UnrolledDescriptorType – Flatten descriptor (layout component).
Tensor helpers#
Typedefs
-
using MemoryTypeEnum = AddressSpaceEnum#
Memory type, allowed members:
Generic,
Global,
Lds,
Sgpr,
Vgpr,
Functions
-
template<MemoryTypeEnum MemoryType, typename ElementType, typename Shape, typename UnrolledDescriptorType>
constexpr auto make_tensor(ElementType *pointer, const Layout<Shape, UnrolledDescriptorType> &layout)# Make tensor function.
- Template Parameters:
MemoryType – Type of memory.
- Parameters:
pointer – Pointer to the memory.
layout – Tensor layout.
- Returns:
Constructed tensor.
-
template<MemoryTypeEnum MemoryType, typename ElementType, typename Shape, typename UnrolledDescriptorType>
constexpr auto make_register_tensor(const Layout<Shape, UnrolledDescriptorType> &layout)# Make SGPR or VGPR tensor function.
- Template Parameters:
MemoryType – Type of memory.
ElementType – Memory data type.
- Returns:
Constructed tensor.
-
template<MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
__host__ __device__ void clear(Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType> &tensor)# Clear tensor. (Only for Vpgr/Sgpr)
- Parameters:
tensor – Tensor to be cleared.
-
template<MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr const auto &layout(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType> &tensor)# Get Tensor layout.
- Parameters:
tensor – Tensor to get layout of.
- Returns:
Requested layout.
-
template<index_t... Idxs, MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto size(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType> &tensor)# Product of tensor shape dims.
- Template Parameters:
Idxs – Indexes to access specific shape dim (optional).
- Parameters:
tensor – Tensor to get Shape of.
- Returns:
Requested size.
-
template<index_t... Idxs, MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto rank(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType> &tensor)# Rank of Shape tuple.
- Template Parameters:
Idxs – Indexes to access specific shape dim (optional).
- Parameters:
tensor – Tensor to get rank of.
- Returns:
Requested rank.
-
template<index_t... Idxs, MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType> &tensor)# Depth of Shape tuple.
- Template Parameters:
Idxs – Indexes to access specific shape dim (optional).
- Parameters:
tensor – Tensor to get depth of.
- Returns:
Requested depth.
-
template<MemoryTypeEnum BufferAddressSpace, typename ElementType, typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr const auto &shape(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType> &tensor)# Get Tensor shape.
- Parameters:
tensor – Tensor to get shape from.
- Returns:
Requested shape.
-
template<typename FromType, typename ToType>
constexpr auto slice(const FromType from, const ToType to)# Get dim slice.
- Parameters:
from – Beginning of the interval.
to – End of the interval. (can also be negative, to index from the end)
- Returns:
Requested slice. Could be used to create sliced tensor from other tensor.
-
template<typename ToType>
constexpr auto slice(const ToType to)# Get dim slice. (It is assumed that from is equal to 1.)
- Parameters:
to – End of the interval. (can also be negative, to index from the end)
- Returns:
Requested slice. Could be used to create sliced tensor from other tensor.
-
constexpr auto slice()#
Get whole dim slice (from = 0, to = -1).
- Returns:
Requested slice. Could be used to create sliced tensor from other tensor.
Functions
-
template<typename TensorType, typename ThreadShape, typename ThreadUnrolledDesc, typename ProjectionTuple>
__host__ __device__ constexpr auto make_local_partition(TensorType &tensor, [[maybe_unused]] const Layout<ThreadShape, ThreadUnrolledDesc> &thread_layout, const index_t thread_id, const ProjectionTuple &projection)# Create local partition for a thread (currently only a packed partition is supported).
- Parameters:
tensor – Tensor for partition.
thread_layout – Thread layout per each dimension.
thread_id – Thread index represented as integer.
projection – Projection is used to remove selected dims from partitioning. Use slice(X) to remove a dimension, where X is the dim size. Use Number<1>{} to keep it.
- Returns:
Partition tensor.
-
template<typename TensorType, typename ThreadShape, typename ThreadUnrolledDesc>
__host__ __device__ constexpr auto make_local_partition(TensorType &tensor, const Layout<ThreadShape, ThreadUnrolledDesc> &thread_lengths, const index_t thread_id)# Create local partition for a thread (currently only a packed partition is supported).
-
template<typename TensorType, typename BlockShapeTuple, typename BlockIdxs, typename ProjectionTuple>
__host__ __device__ constexpr auto make_local_tile(const TensorType &tensor, const BlockShapeTuple &tile_shape, const BlockIdxs &block_idxs, const ProjectionTuple &projection)# Create local tile for a thread block (currently only a packed tile is supported).
Note
Currently, to get the best performance, please use a 2d tile_shape.
- Parameters:
tensor – Tensor for partition.
tile_shape – Shapes of requested tile.
block_idxs – Tuple of block indexes represented as integer. If slice, then get whole dim.
projection – Projection is used to remove selected dims from partitioning. Use slice(X) to remove a dimension, where X is the dim size. Use Number<1>{} to keep it.
- Returns:
Tile tensor.
-
template<typename TensorType, typename BlockShapeTuple, typename BlockIdxs>
__host__ __device__ constexpr auto make_local_tile(const TensorType &tensor, const BlockShapeTuple &tile_shape, const BlockIdxs &block_idxs)# Create local tile for a thread block (currently only a packed tile is supported).
Note
Currently, to get the best performance, please use a 2d shape.
- Parameters:
tensor – Tensor for partition.
tile_shape – Shapes of requested tile.
block_idxs – Tuple of block indexes represented as integer. If slice, then get whole dim.
- Returns:
Tile tensor.
Operations#
Functions
-
template<typename DimAccessOrderTuple, index_t VectorDim, index_t ScalarPerVector, typename SrcTensorType, typename DstTensorType>
__device__ void copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor)# Perform optimized copy between two tensors partitions (threadwise copy). Tensors must have the same size.
- Template Parameters:
DimAccessOrderTuple – Tuple with dimension access order.
VectorDim – Dimension for vectorized read and write.
ScalarPerVector – Number of scalar per vectorized read and write.
- Parameters:
src_tensor – Source tensor.
dst_tensor – Destination tensor.
-
template<typename SrcTensorType, typename DstTensorType>
__host__ __device__ void copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor)# Perform generic copy between two tensors partitions (threadwise copy). Tensors must have the same size.
- Parameters:
src_tensor – Source tensor.
dst_tensor – Destination tensor.
-
template<typename DimAccessOrderTuple, index_t VectorDim, index_t ScalarPerVector, typename SrcTensorType, typename DstTensorType, typename ThreadShape, typename ThreadUnrolledDesc>
__device__ void blockwise_copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor, [[maybe_unused]] const Layout<ThreadShape, ThreadUnrolledDesc> &thread_layout)# Perform optimized blockwise copy between two tensors. Tensors must have the same size.
Note
Currently Vgpr and Sgpr are not supported.
- Template Parameters:
DimAccessOrderTuple – Tuple with dimension access order.
VectorDim – Dimension for vectorized read and write.
ScalarPerVector – Number of scalars per vectorized read and write.
- Parameters:
src_tensor – Source tensor.
dst_tensor – Destination tensor.
thread_layout – Thread layout per each dimension for copy.
Functions
-
template<typename DataType, index_t BlockSize, typename GemmTraits, typename ATensorType, typename BTensorType, typename CTensorType>
__device__ void blockwise_gemm_xdl(const ATensorType &a_local_tile_tensor, const BTensorType &b_local_tile_tensor, CTensorType &c_reg_tensor)# Perform blockwise gemm xdl on tensors stored in lds. Result will be stored in Vgpr register. A data layout must be (MPerBlock, KPerBlock) or (K0PerBlock, MPerBlock, K1) and B data layout must be (NPerBlock, KPerBlock) or (K0PerBlock, NPerBlock, K1).
Note
C output Vgpr register layout (8D):
MXdlPerWave - The number of MFMA instructions run by single wave in M dimension per tile.
NXdlPerWave - The number of MFMA instructions run by single wave in N dimension per tile.
MWave - Equals to 1 since this is for single wave.
NWave - Equals to 1 since this is for single wave.
NumGroupsPerBlock - Mfma instruction internal layout (depends on the instruction size).
NumInputsBlock - Mfma instruction internal layout (depends on the instruction size).
GroupSize - Mfma instruction internal layout (depends on the instruction size).
NumThreadsPerBlock - Mfma instruction internal layout (depends on the instruction size).
- Template Parameters:
DataType – Input data types.
BlockSize – Number of threads in block.
GemmTraits – Traits of gemm xdl operation.
- Parameters:
a_local_tile_tensor – A tensor in LDS memory for blockwise gemm (MPerBlock, KPerBlock) or (K0PerBlock, MPerBlock, K1) layout.
b_local_tile_tensor – B tensor in LDS memory for blockwise gemm (NPerBlock, KPerBlock) or (K0PerBlock, NPerBlock, K1) layout.
c_reg_tensor – C tensor VGPR memory for blockwise gemm.
-
template<typename DataType, typename ATileLayout, typename BTileLayout, index_t BlockSize, typename GemmTraits, typename CTensorType>
__host__ __device__ constexpr auto make_blockwise_gemm_xdl_c_local_partition(CTensorType &c_local_tile_tensor)# Create local partition per thread for C tensor.
Note
C output global memory layout (8D):
MXdlPerWave - The number of MFMA instructions run by single wave in M dimension.
NXdlPerWave - The number of MFMA instructions run by single wave in N dimension.
MWave - The number of waves in single tile M dimension per tile.
NWave - The number of waves in single tile N dimension per tile.
NumGroupsPerBlock - Mfma instruction internal layout (depends on the instruction size).
NumInputsBlock - Mfma instruction internal layout (depends on the instruction size).
GroupSize - Mfma instruction internal layout (depends on the instruction size).
NumThreadsPerBlock - Mfma instruction internal layout (depends on the instruction size).
- Template Parameters:
DataType – Input data types.
ATileLayout – A tensor layout.
BTileLayout – B tensor layout.
BlockSize – Number of threads in block.
GemmTraits – Traits of gemm xdl operation.
- Parameters:
c_local_tile_tensor – C tensor in LDS memory for blockwise gemm (MPerBlock, NPerBlock) layout.
- Returns:
Partition c tensor for blockwise gemm.
-
template<typename DataType, typename ATileLayout, typename BTileLayout, index_t BlockSize, typename GemmTraits>
__host__ __device__ constexpr auto make_blockwise_gemm_xdl_c_vgpr()# Create local partition per thread for C tensor.
Note
C output Vgpr register layout (8D):
MXdlPerWave - The number of MFMA instructions run by single wave in M dimension per tile.
NXdlPerWave - The number of MFMA instructions run by single wave in N dimension per tile.
MWave - Equals to 1 since this is for single wave.
NWave - Equals to 1 since this is for single wave.
NumGroupsPerBlock - Mfma instruction internal layout (depends on the instruction size).
NumInputsBlock - Mfma instruction internal layout (depends on the instruction size).
GroupSize - Mfma instruction internal layout (depends on the instruction size).
NumThreadsPerBlock - Mfma instruction internal layout (depends on the instruction size).
- Template Parameters:
DataType – Input data types.
ATileLayout – A tensor layout.
BTileLayout – B tensor layout.
BlockSize – Number of threads in block.
GemmTraits – Traits of gemm xdl operation.
- Returns:
Vgpr c tensor for blockwise gemm.