#include <grouped_gemm_kernel.hpp>
 | 
| CK_TILE_DEVICE void  | Run (const UniversalGemmKernelArgs<> &kargs, const tuple< index_t, index_t > &block_idx_2d, const index_t block_idx_z) const | 
|   | 
| CK_TILE_DEVICE index_t  | FindGroupId (const GemmTransKernelArg *gemm_desc_ptr, index_t block_id, index_t group_count) const | 
|   | 
| template<bool U = UsePersistentKernel, typename  = std::enable_if_t<!U>>  | 
| CK_TILE_DEVICE void  | operator() (const void CK_CONSTANT_ADDRESS_SPACE *gemm_descs_const, index_t group_count) const | 
|   | 
| template<bool U = UsePersistentKernel, typename  = std::enable_if_t<U>, typename  = void>  | 
| CK_TILE_DEVICE void  | operator() (const void CK_CONSTANT_ADDRESS_SPACE *gemm_descs_const, const index_t group_count) const | 
|   | 
 | 
| static CK_TILE_HOST const std::string  | GetName () | 
|   | 
| static CK_TILE_HOST auto  | GetWorkSpaceSize (const std::vector< GroupedGemmHostArgs > &gemm_descs) -> std::size_t | 
|   | 
| static CK_TILE_HOST auto  | GetWorkSpaceSize (index_t group_count) -> std::size_t | 
|   | 
| static constexpr CK_TILE_HOST auto  | BlockSize () -> dim3 | 
|   | 
| static CK_TILE_HOST auto  | MaxOccupancyGridSize (const stream_config &s) -> dim3 | 
|   | Get the maximum occupancy grid size for the persistent kernel on the current device.  More...
  | 
|   | 
| static CK_TILE_HOST auto  | GridSize (const std::vector< GroupedGemmHostArgs > &gemm_descs) | 
|   | 
| static CK_TILE_HOST auto  | MakeKargs (const std::vector< GroupedGemmHostArgs > &gemm_descs) -> std::vector< GemmTransKernelArg > | 
|   | 
| static CK_TILE_HOST bool  | IsSupportedArgument (const std::vector< GemmTransKernelArg > &kargs) | 
|   | 
| static constexpr CK_TILE_HOST_DEVICE auto  | GetSmemSize () -> index_t | 
|   | 
| static CK_TILE_DEVICE void  | RunGemmWithPipelineSelection (const ADataType *a_ptr, const BDataType *b_ptr, CDataType *c_ptr, void *smem_ptr_0, const UniversalGemmKernelArgs<> &kargs, const typename Base::SplitKBatchOffset &splitk_batch_offset, const index_t block_idx_m, const index_t block_idx_n) | 
|   | Runs single GEMM problem cooperatively by whole workgroup.  More...
  | 
|   | 
| static CK_TILE_DEVICE void  | RunGemmWithPipelineSelection2LDS (const ADataType *a_ptr, const BDataType *b_ptr, CDataType *c_ptr, void *__restrict__ smem_ptr_0, void *__restrict__ smem_ptr_1, const UniversalGemmKernelArgs<> &kargs, const typename Base::SplitKBatchOffset &splitk_batch_offset, const index_t block_idx_m, const index_t block_idx_n) | 
|   | Runs single GEMM problem cooperatively by whole workgroup.  More...
  | 
|   | 
◆ ADataType
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
      
 
Specify the data type configurations for A, B, C/E. 
 
 
◆ ALayout
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
      
 
 
◆ Base
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
      
 
 
◆ BDataType
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
      
 
 
◆ BLayout
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
      
 
 
◆ CDataType
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
      
 
 
◆ CLayout
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
      
 
 
◆ EpiloguePipeline
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
      
 
 
◆ GemmPipeline
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
      
 
 
◆ Kernel
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
      
 
 
◆ OffsetTile1DPartitioner
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
      
 
ALayout and ADataType are expected to be scalars, not a tuple. 
BLayout and BDataType are expected to be scalars, not a tuple.
C/ELayout and C/EDataType are expected to be scalars, not a tuple. 
 
 
◆ TilePartitioner
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
      
 
 
◆ BlockSize()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
 
 
◆ FindGroupId()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
 
 
◆ GetName()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
 
 
◆ GetSmemSize()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
 
 
◆ GetWorkSpaceSize() [1/2]
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
 
 
◆ GetWorkSpaceSize() [2/2]
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
 
 
◆ GridSize()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
 
 
◆ IsSupportedArgument()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
 
 
◆ MakeKargs()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
 
 
◆ MaxOccupancyGridSize()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
 
Get the maximum occupancy grid size for the persistent kernel on the current device. 
- Returns
 - The maximum occupancy grid size. 
 
- Note
 - This function queries the maximum occupancy of the kernel using 
hipOccupancyMaxActiveBlocksPerMultiprocessor.  
 
 
◆ operator()() [1/2]
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
template<bool U = UsePersistentKernel, typename  = std::enable_if_t<U>, typename  = void> 
 
 
◆ operator()() [2/2]
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
template<bool U = UsePersistentKernel, typename  = std::enable_if_t<!U>> 
 
 
◆ Run()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
 
 
◆ RunGemmWithPipelineSelection()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
 
Runs single GEMM problem cooperatively by whole workgroup. 
- Note
 - The GEMM pipeline is selected in-kernel based on the number of K-loops and the tail-number. This is needed for the persistent tile-loop when we do not have access to the K dimension on the host.
 
- Parameters
 - 
  
    | a_ptr | input A pointer  | 
    | b_ptr | input B pointer  | 
    | c_ptr | output C pointer  | 
    | smem_ptr_0 | The start memory pointer of the shared memory block.  | 
    | kargs | GEMM kernel arguments  | 
    | splitk_batch_offset | Utility structure used to calculate the k-batch offset.  | 
    | block_idx_m | The GEMM's output M dimension tile index processed by this workgroup.  | 
    | block_idx_n | The GEMM's output N dimension tile index processed by this workgroup.  | 
  
   
 
 
◆ RunGemmWithPipelineSelection2LDS()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
  
  
      
        
          | static CK_TILE_DEVICE void ck_tile::GroupedGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::RunGemmWithPipelineSelection2LDS  | 
          ( | 
          const ADataType *  | 
          a_ptr,  | 
         
        
           | 
           | 
          const BDataType *  | 
          b_ptr,  | 
         
        
           | 
           | 
          CDataType *  | 
          c_ptr,  | 
         
        
           | 
           | 
          void *__restrict__  | 
          smem_ptr_0,  | 
         
        
           | 
           | 
          void *__restrict__  | 
          smem_ptr_1,  | 
         
        
           | 
           | 
          const UniversalGemmKernelArgs<> &  | 
          kargs,  | 
         
        
           | 
           | 
          const typename Base::SplitKBatchOffset &  | 
          splitk_batch_offset,  | 
         
        
           | 
           | 
          const index_t  | 
          block_idx_m,  | 
         
        
           | 
           | 
          const index_t  | 
          block_idx_n  | 
         
        
           | 
          ) | 
           |  | 
         
       
   | 
  
inline static   | 
  
 
Runs single GEMM problem cooperatively by whole workgroup. 
- Note
 - The GEMM pipeline is selected in-kernel based on the number of K-loops and the tail-number. This is needed for the persistent tile-loop when we do not have access to the K dimension on the host.
 
- Parameters
 - 
  
    | a_ptr | input A pointer  | 
    | b_ptr | input B pointer  | 
    | c_ptr | output C pointer  | 
    | smem_ptr_0 | The start memory pointer of the shared memory block.  | 
    | smem_ptr_1 | The second start memory pointer of the shared memory block.  | 
    | kargs | GEMM kernel arguments  | 
    | splitk_batch_offset | Utility structure used to calculate the k-batch offset.  | 
    | block_idx_m | The GEMM's output M dimension tile index processed by this workgroup.  | 
    | block_idx_n | The GEMM's output N dimension tile index processed by this workgroup.  | 
  
   
 
 
◆ KernelBlockSize
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
 
 
◆ UsePersistentKernel
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ > 
  
  
      
        
          | constexpr bool ck_tile::GroupedGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::UsePersistentKernel = GemmPipeline::UsePersistentKernel | 
         
       
   | 
  
static constexpr   | 
  
 
 
The documentation for this struct was generated from the following file:
- /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-7.1.0/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp