GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ > Struct Template Reference

GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ > Struct Template Reference

Composable Kernel: ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ > Struct Template Reference
ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ > Struct Template Reference

The GEMM kernel template. More...

#include <gemm_kernel.hpp>

Inheritance diagram for ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >:
ck_tile::BatchedGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ > ck_tile::GroupedGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >

Classes

struct  has_persistent_kernel
 
struct  SplitKBatchOffset
 

Public Types

using TilePartitioner = remove_cvref_t< TilePartitioner_ >
 
using GemmPipeline = remove_cvref_t< GemmPipeline_ >
 
using EpiloguePipeline = remove_cvref_t< EpiloguePipeline_ >
 
using ALayout = remove_cvref_t< typename GemmPipeline::ALayout >
 
using BLayout = remove_cvref_t< typename GemmPipeline::BLayout >
 
using ELayout = remove_cvref_t< typename GemmPipeline::CLayout >
 
using DsLayout = remove_cvref_t< typename EpiloguePipeline::DsLayout >
 
using DsDataType = remove_cvref_t< typename EpiloguePipeline::DsDataType >
 
using ADataType = remove_cvref_t< typename GemmPipeline::ADataType >
 
using BDataType = remove_cvref_t< typename GemmPipeline::BDataType >
 
using EDataType = remove_cvref_t< typename EpiloguePipeline::ODataType >
 
using KernelArgs = GemmKernelArgs< DsLayout::size()>
 

Public Member Functions

template<bool U = !PersistentKernel, typename = std::enable_if_t<U>>
CK_TILE_DEVICE void operator() (KernelArgs kargs) const
 
template<bool U = PersistentKernel, typename = std::enable_if_t<U>, typename = void>
CK_TILE_DEVICE void operator() (KernelArgs kargs) const
 

Static Public Member Functions

static CK_TILE_HOST const std::string GetName ()
 
static constexpr CK_TILE_HOST auto GridSize (index_t M, index_t N, index_t KBatch)
 
static CK_TILE_HOST auto MaxOccupancyGridSize (const stream_config &s) -> dim3
 Get the maximum occupancy grid size for the persistent kernel on the current device. More...
 
static constexpr CK_TILE_HOST auto BlockSize ()
 
static constexpr CK_TILE_HOST KernelArgs MakeKernelArgs (const GemmHostArgs< NumDTensor > &hostArgs)
 
static constexpr CK_TILE_HOST_DEVICE index_t GetSmemSize ()
 
static CK_TILE_HOST bool IsSupportedArgument (const KernelArgs &kargs)
 
template<memory_operation_enum DstInMemOp = memory_operation_enum::set>
static CK_TILE_DEVICE auto MakeGemmTensorViews (const ADataType *a_ptr, const BDataType *b_ptr, const std::array< const void *, NumDTensor > &ds_ptr, EDataType *e_ptr, const KernelArgs &kargs, const SplitKBatchOffset &splitk_batch_offset)
 
template<typename TensorView >
static CK_TILE_DEVICE auto MakeGemmPadViews (const TensorView &views)
 
template<typename PadView >
static CK_TILE_DEVICE auto MakeGemmTileWindows (const PadView &views, const index_t i_m, const index_t i_n)
 
template<bool UseDefaultScheduler = true>
static CK_TILE_DEVICE void RunGemm (const ADataType *a_ptr, const BDataType *b_ptr, const std::array< const void *, NumDTensor > &ds_ptr, EDataType *e_ptr, void *smem_ptr_0, const KernelArgs &kargs, const SplitKBatchOffset &splitk_batch_offset, const index_t block_idx_m, const index_t block_idx_n)
 Runs single GEMM problem cooperatively by whole workgroup. More...
 
static CK_TILE_DEVICE void RunGemm2LDS (const ADataType *a_ptr, const BDataType *b_ptr, const std::array< const void *, NumDTensor > &ds_ptr, EDataType *e_ptr, void *__restrict__ smem_ptr_0, void *__restrict__ smem_ptr_1, const KernelArgs &kargs, const SplitKBatchOffset &splitk_batch_offset, const index_t block_idx_m, const index_t block_idx_n)
 Runs single GEMM problem cooperatively by whole workgroup. More...
 

Static Public Attributes

static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize
 
static constexpr bool PersistentKernel = has_persistent_kernel::value
 
static constexpr index_t NumDTensor = DsDataType::size()
 
static constexpr auto I0 = number<0>()
 
static constexpr auto I1 = number<1>()
 
static constexpr auto I2 = number<2>()
 
static constexpr auto I3 = number<3>{}
 

Detailed Description

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
struct ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >

The GEMM kernel template.

Overview

This class provides the generic matrix multiplication kernel template. By semantically dividing the GEMM algorithm into the following parts, we achieve a flexible, versatile and robust kernel implementation.

  • Prolog - The start of the GEMM kernel implementation in the operator() function call operator, which determines the work scope of each workgroup.
  • GemmPipeline - The core part, the "heart", of the matrix multiplication algorithm. This is the place where each workgroup loads data from global memory and carries out dot products.
  • Epilogue - The "final" part of matrix multiplication implementation responsible for storing results to global memory. This is also the place where any additional operator fusion may take place.

Additionally, both GemmPipeline and EpiloguePipeline are parameterized with a so-called Policy, which determines all internal details of those functional parts. You can think of it this way: both the GEMM and epilogue pipelines provide the control-flow logic, which is controlled by policies. Moreover, the policy is responsible for the definition of all necessary data layouts and the threads' work distribution.

Template Parameters
TilePartitioner_ - The type of class providing the mapping of a workgroup index into the output data tile to be calculated. It determines the workgroup-to-data relationship (in other words, which data will be processed and calculated by which workgroup).
GemmPipeline_ - The type of class which provides the core part of matrix multiplication. This class should provide the implementation of data loading from global memory and of block-wise matrix multiplication. You can think of it as the work done from a single workgroup's point of view.
EpiloguePipeline_ - The type of class providing the final part of the matrix multiplication implementation. It is responsible for storing results calculated by GemmPipeline to the output E tensor in global memory.

Member Typedef Documentation

◆ ADataType

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
using ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::ADataType = remove_cvref_t<typename GemmPipeline::ADataType>

◆ ALayout

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
using ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::ALayout = remove_cvref_t<typename GemmPipeline::ALayout>

◆ BDataType

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
using ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::BDataType = remove_cvref_t<typename GemmPipeline::BDataType>

◆ BLayout

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
using ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::BLayout = remove_cvref_t<typename GemmPipeline::BLayout>

◆ DsDataType

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
using ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::DsDataType = remove_cvref_t<typename EpiloguePipeline::DsDataType>

◆ DsLayout

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
using ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::DsLayout = remove_cvref_t<typename EpiloguePipeline::DsLayout>

◆ EDataType

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
using ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::EDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>

◆ ELayout

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
using ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::ELayout = remove_cvref_t<typename GemmPipeline::CLayout>

◆ EpiloguePipeline

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
using ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>

◆ GemmPipeline

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
using ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::GemmPipeline = remove_cvref_t<GemmPipeline_>

◆ KernelArgs

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
using ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::KernelArgs = GemmKernelArgs<DsLayout::size()>

◆ TilePartitioner

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
using ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::TilePartitioner = remove_cvref_t<TilePartitioner_>

Member Function Documentation

◆ BlockSize()

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
static constexpr CK_TILE_HOST auto ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::BlockSize ( )
inlinestaticconstexpr

◆ GetName()

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
static CK_TILE_HOST const std::string ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::GetName ( )
inlinestatic

◆ GetSmemSize()

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
static constexpr CK_TILE_HOST_DEVICE index_t ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::GetSmemSize ( )
inlinestaticconstexpr

◆ GridSize()

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
static constexpr CK_TILE_HOST auto ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::GridSize ( index_t  M,
index_t  N,
index_t  KBatch 
)
inlinestaticconstexpr

◆ IsSupportedArgument()

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
static CK_TILE_HOST bool ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::IsSupportedArgument ( const KernelArgs &  kargs)
inlinestatic

◆ MakeGemmPadViews()

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
template<typename TensorView >
static CK_TILE_DEVICE auto ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::MakeGemmPadViews ( const TensorView &  views)
inlinestatic

◆ MakeGemmTensorViews()

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
template<memory_operation_enum DstInMemOp = memory_operation_enum::set>
static CK_TILE_DEVICE auto ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::MakeGemmTensorViews ( const ADataType *  a_ptr,
const BDataType *  b_ptr,
const std::array< const void *, NumDTensor > &  ds_ptr,
EDataType *  e_ptr,
const KernelArgs &  kargs,
const SplitKBatchOffset &  splitk_batch_offset 
)
inlinestatic

◆ MakeGemmTileWindows()

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
template<typename PadView >
static CK_TILE_DEVICE auto ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::MakeGemmTileWindows ( const PadView &  views,
const index_t  i_m,
const index_t  i_n 
)
inlinestatic

◆ MakeKernelArgs()

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
static constexpr CK_TILE_HOST KernelArgs ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::MakeKernelArgs ( const GemmHostArgs< NumDTensor > &  hostArgs)
inlinestaticconstexpr

◆ MaxOccupancyGridSize()

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
static CK_TILE_HOST auto ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::MaxOccupancyGridSize ( const stream_config &  s) -> dim3
inlinestatic

Get the maximum occupancy grid size for the persistent kernel on the current device.

Returns
The maximum occupancy grid size.
Note
This function queries the maximum occupancy of the kernel using hipOccupancyMaxActiveBlocksPerMultiprocessor.

◆ operator()() [1/2]

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
template<bool U = !PersistentKernel, typename = std::enable_if_t<U>>
CK_TILE_DEVICE void ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::operator() ( KernelArgs  kargs) const
inline

◆ operator()() [2/2]

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
template<bool U = PersistentKernel, typename = std::enable_if_t<U>, typename = void>
CK_TILE_DEVICE void ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::operator() ( KernelArgs  kargs) const
inline

◆ RunGemm()

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
template<bool UseDefaultScheduler = true>
static CK_TILE_DEVICE void ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::RunGemm ( const ADataType *  a_ptr,
const BDataType *  b_ptr,
const std::array< const void *, NumDTensor > &  ds_ptr,
EDataType *  e_ptr,
void *  smem_ptr_0,
const KernelArgs &  kargs,
const SplitKBatchOffset &  splitk_batch_offset,
const index_t  block_idx_m,
const index_t  block_idx_n 
)
inlinestatic

Runs single GEMM problem cooperatively by whole workgroup.

Parameters
a_ptr - input A pointer
b_ptr - input B pointer
ds_ptr - input Ds pointer
e_ptr - output E pointer
smem_ptr_0 - The start memory pointer of the shared memory block.
kargs - GEMM kernel arguments
splitk_batch_offset - Utility structure used to calculate k batch.
block_idx_m - The GEMM's output M dimension tile index processed by this workgroup.
block_idx_n - The GEMM's output N dimension tile index processed by this workgroup.

◆ RunGemm2LDS()

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
static CK_TILE_DEVICE void ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::RunGemm2LDS ( const ADataType *  a_ptr,
const BDataType *  b_ptr,
const std::array< const void *, NumDTensor > &  ds_ptr,
EDataType *  e_ptr,
void *__restrict__  smem_ptr_0,
void *__restrict__  smem_ptr_1,
const KernelArgs &  kargs,
const SplitKBatchOffset &  splitk_batch_offset,
const index_t  block_idx_m,
const index_t  block_idx_n 
)
inlinestatic

Runs single GEMM problem cooperatively by whole workgroup.

Note
RunGemm2LDS runs with two shared memory buffers using the ping-pong buffer mechanism.
Parameters
a_ptr - input A pointer
b_ptr - input B pointer
ds_ptr - input Ds pointer
e_ptr - output E pointer
smem_ptr_0 - The starting pointer of 1st shared memory block.
smem_ptr_1 - The starting pointer of 2nd shared memory block.
kargs - GEMM kernel arguments
splitk_batch_offset - Utility structure used to calculate k batch.
block_idx_m - The GEMM's output M dimension tile index processed by this workgroup.
block_idx_n - The GEMM's output N dimension tile index processed by this workgroup.

Member Data Documentation

◆ I0

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
constexpr auto ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::I0 = number<0>()
staticconstexpr

◆ I1

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
constexpr auto ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::I1 = number<1>()
staticconstexpr

◆ I2

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
constexpr auto ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::I2 = number<2>()
staticconstexpr

◆ I3

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
constexpr auto ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::I3 = number<3>{}
staticconstexpr

◆ KernelBlockSize

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
constexpr index_t ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::KernelBlockSize = GemmPipeline::BlockSize
staticconstexpr

◆ NumDTensor

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
constexpr index_t ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::NumDTensor = DsDataType::size()
staticconstexpr

◆ PersistentKernel

template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
constexpr bool ck_tile::GemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::PersistentKernel = has_persistent_kernel::value
staticconstexpr

The documentation for this struct was generated from the following file:
  • /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-7.0.0/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp