/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-7.0.2/include/ck/wrapper/operations/gemm.hpp File Reference#
gemm.hpp File Reference
  #include "ck/wrapper/utils/tensor_utils.hpp"
  #include "ck/wrapper/traits/blockwise_gemm_xdl_traits.hpp"
  #include "ck/host_utility/device_prop.hpp"
  #include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
Go to the source code of this file.
Functions | |
| template<typename DataType , index_t BlockSize, typename GemmTraits , typename ATensorType , typename BTensorType , typename CTensorType > | |
| __device__ void | blockwise_gemm_xdl (const ATensorType &a_local_tile_tensor, const BTensorType &b_local_tile_tensor, CTensorType &c_reg_tensor) | 
| Perform blockwise gemm xdl on tensors stored in lds. Result will be stored in Vgpr register. A data layout must be (MPerBlock, KPerBlock) or (K0PerBlock, MPerBlock, K1) and B data layout must be (NPerBlock, KPerBlock) or (K0PerBlock, NPerBlock, K1).  More... | |
| template<typename DataType , typename ATileLayout , typename BTileLayout , index_t BlockSize, typename GemmTraits , typename CTensorType > | |
| __host__ constexpr __device__ auto | make_blockwise_gemm_xdl_c_local_partition (CTensorType &c_local_tile_tensor) | 
| Create local partition per thread for C tensor.  More... | |
| template<typename DataType , typename ATileLayout , typename BTileLayout , index_t BlockSize, typename GemmTraits > | |
| __host__ constexpr __device__ auto | make_blockwise_gemm_xdl_c_vgpr () | 
| Create the per-thread C tensor in VGPR registers.  More... | |
Function Documentation
◆ blockwise_gemm_xdl()
template<typename DataType , index_t BlockSize, typename GemmTraits , typename ATensorType , typename BTensorType , typename CTensorType > 
      | __device__ void blockwise_gemm_xdl | ( | const ATensorType & | a_local_tile_tensor, | 
| const BTensorType & | b_local_tile_tensor, | ||
| CTensorType & | c_reg_tensor | ||
| ) | 
Perform blockwise GEMM XDL on tensors stored in LDS. The result is stored in VGPR registers. The A data layout must be (MPerBlock, KPerBlock) or (K0PerBlock, MPerBlock, K1), and the B data layout must be (NPerBlock, KPerBlock) or (K0PerBlock, NPerBlock, K1).
- Note
 - C output Vgpr register layout (8D):
- MXdlPerWave - The number of MFMA instructions run by single wave in M dimension per tile.
 - NXdlPerWave - The number of MFMA instructions run by single wave in N dimension per tile.
 - MWave - Equal to 1 since this is for a single wave.
 - NWave - Equal to 1 since this is for a single wave.
 - NumGroupsPerBlock - Mfma instruction internal layout (depends on the instruction size).
 - NumInputsBlock - Mfma instruction internal layout (depends on the instruction size).
 - GroupSize - Mfma instruction internal layout (depends on the instruction size).
 - NumThreadsPerBlock - Mfma instruction internal layout (depends on the instruction size).
 
 
- Template Parameters
 - 
  
DataType Input data types. BlockSize Number of threads in block. GemmTraits Traits of gemm xdl operation.  
- Parameters
 - 
  
a_local_tile_tensor A tensor in LDS memory for blockwise gemm (MPerBlock, KPerBlock) or (K0PerBlock, MPerBlock, K1) layout. b_local_tile_tensor B tensor in LDS memory for blockwise gemm (NPerBlock, KPerBlock) or (K0PerBlock, NPerBlock, K1) layout. c_reg_tensor C tensor in VGPR memory for blockwise gemm.  
◆ make_blockwise_gemm_xdl_c_local_partition()
template<typename DataType , typename ATileLayout , typename BTileLayout , index_t BlockSize, typename GemmTraits , typename CTensorType > 
      
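  | __host__ constexpr __device__ auto make_blockwise_gemm_xdl_c_local_partition | ( | CTensorType & | c_local_tile_tensor | ) |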
Create local partition per thread for C tensor.
- Note
 - C output global memory layout (8D):
- MXdlPerWave - The number of MFMA instructions run by single wave in M dimension.
 - NXdlPerWave - The number of MFMA instructions run by single wave in N dimension.
 - MWave - The number of waves in the M dimension per tile.
 - NWave - The number of waves in the N dimension per tile.
 - NumGroupsPerBlock - Mfma instruction internal layout (depends on the instruction size).
 - NumInputsBlock - Mfma instruction internal layout (depends on the instruction size).
 - GroupSize - Mfma instruction internal layout (depends on the instruction size).
 - NumThreadsPerBlock - Mfma instruction internal layout (depends on the instruction size).
 
 
- Template Parameters
 - 
  
DataType Input data types. ATileLayout A tensor layout. BTileLayout B tensor layout. BlockSize Number of threads in block. GemmTraits Traits of gemm xdl operation.  
- Parameters
 - 
  
c_local_tile_tensor C tensor in LDS memory for blockwise gemm (MPerBlock, NPerBlock) layout.  
- Returns
 - Partition c tensor for blockwise gemm.
 
◆ make_blockwise_gemm_xdl_c_vgpr()
template<typename DataType , typename ATileLayout , typename BTileLayout , index_t BlockSize, typename GemmTraits > 
      
  | __host__ constexpr __device__ auto make_blockwise_gemm_xdl_c_vgpr | ( | ) |
Create the per-thread C tensor in VGPR registers.
- Note
 - C output Vgpr register layout (8D):
- MXdlPerWave - The number of MFMA instructions run by single wave in M dimension per tile.
 - NXdlPerWave - The number of MFMA instructions run by single wave in N dimension per tile.
 - MWave - Equal to 1 since this is for a single wave.
 - NWave - Equal to 1 since this is for a single wave.
 - NumGroupsPerBlock - Mfma instruction internal layout (depends on the instruction size).
 - NumInputsBlock - Mfma instruction internal layout (depends on the instruction size).
 - GroupSize - Mfma instruction internal layout (depends on the instruction size).
 - NumThreadsPerBlock - Mfma instruction internal layout (depends on the instruction size).
 
 
- Template Parameters
 - 
  
DataType Input data types. ATileLayout A tensor layout. BTileLayout B tensor layout. BlockSize Number of threads in block. GemmTraits Traits of gemm xdl operation.  
- Returns
 - Vgpr c tensor for blockwise gemm.