BlockwiseGemmXdlops_pipeline_v4< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride > Struct Template Reference#
Public Types |
Public Member Functions |
Static Public Member Functions |
Public Attributes |
Static Public Attributes |
Protected Types |
Protected Attributes |
Static Protected Attributes |
List of all members
ck::BlockwiseGemmXdlops_pipeline_v4< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride > Struct Template Reference
#include <blockwise_gemm_pipeline_xdlops.hpp>
Public Types | |
| using | ThisThreadBlock = ThisThreadBlock< BlockSize > |
| using | HotLoopInstList = BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, A_K1, B_K1, A_K1, B_K1, KPack, KPack, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops > |
| using | Tuple4 = decltype(CalculateAThreadOriginDataIndex()) |
Public Member Functions | |
| __host__ constexpr __device__ auto & | GetCThreadBuffer () |
| __host__ __device__ | BlockwiseGemmXdlops_pipeline_v4 (Tuple4 a_origin=CalculateAThreadOriginDataIndex(), Tuple4 b_origin=CalculateBThreadOriginDataIndex()) |
| template<> | |
| constexpr __device__ auto | TailScheduler () |
| template<> | |
| constexpr __device__ auto | TailScheduler () |
| template<bool HasMainLoop, index_t TailNum, typename AGridDesc , typename ABlockDesc , typename ABlockTransfer , typename AGridBuffer , typename ABlockBuffer , typename ABlockTransferStep , typename BGridDesc , typename BBlockDesc , typename BBlockTransfer , typename BGridBuffer , typename BBlockBuffer , typename BBlockTransferStep , typename CThreadBuffer > | |
| __device__ void | Run (const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, index_t num_loop) const |
Static Public Member Functions | |
| static __device__ auto | GetWaveIdx () |
| static __device__ auto | CalculateAThreadOriginDataIndex () |
| static __device__ auto | CalculateBThreadOriginDataIndex () |
| template<index_t m0, index_t n0, index_t xdlops_i, index_t blk_i> | |
| static __device__ auto | CalculateCThreadOriginDataIndex (Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >) |
| template<index_t m0, index_t n0, index_t xdlops_i, index_t blk_i> | |
| static __device__ auto | CalculateCThreadOriginDataIndex8D (Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >) |
| __host__ static constexpr __device__ auto | GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 () |
| __host__ static constexpr __device__ auto | GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 () |
| __host__ static constexpr __device__ auto | GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 () |
| __host__ static constexpr __device__ auto | GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 () |
| __host__ static constexpr __device__ auto | GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 () |
| __host__ static constexpr __device__ auto | GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 () |
| template<typename CGridDesc_M_N > | |
| __host__ static constexpr __device__ auto | MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 (const CGridDesc_M_N &c_grid_desc_m_n) |
| template<typename CGridDesc_G_M_N > | |
| __host__ static constexpr __device__ auto | MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 (const CGridDesc_G_M_N &c_grid_desc_g_m_n) |
| static constexpr __device__ auto | HotLoopScheduler () |
| template<index_t stage> | |
| static constexpr __device__ auto | TailScheduler () |
Public Attributes | |
| StaticBufferTupleOfVector< AddressSpaceEnum::Vgpr, FloatAcc, MRepeat *NRepeat, xdlops_gemm.GetRegSizePerXdlops(), true > | c_thread_buf_ |
Static Public Attributes | |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr auto | I2 = Number<2>{} |
| static constexpr auto | I3 = Number<3>{} |
| static constexpr index_t | WaveSize = get_warp_size() |
| static constexpr index_t | A_K0 = ATileDesc{}.GetLength(I0) |
| static constexpr index_t | B_K0 = BTileDesc{}.GetLength(I0) |
| static constexpr index_t | A_K1 = ATileDesc{}.GetLength(I2) |
| static constexpr index_t | B_K1 = BTileDesc{}.GetLength(I2) |
| static constexpr auto | xdlops_gemm |
| static constexpr index_t | KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops |
| static constexpr index_t | KRepeat = KPerThread / KPack |
| static constexpr index_t | MWaves = MPerBlock / (MRepeat * MPerXDL) |
| static constexpr index_t | NWaves = NPerBlock / (NRepeat * NPerXDL) |
| static constexpr AMmaTileDesc | a_block_desc_m0_m1_m2_k |
| static constexpr BMmaTileDesc | b_block_desc_n0_n1_n2_k |
Protected Types | |
| using | AThreadCopy = ThreadwiseTensorSliceTransfer_v4< FloatAB, FloatAB, decltype(a_block_desc_m0_m1_m2_k), decltype(a_thread_desc_), Sequence< 1, 1, 1, KPack >, Sequence< 0, 1, 2, 3 >, 3, A_K1, A_K1 > |
| using | BThreadCopy = ThreadwiseTensorSliceTransfer_v4< FloatAB, FloatAB, decltype(b_block_desc_n0_n1_n2_k), decltype(b_thread_desc_), Sequence< 1, 1, 1, KPack >, Sequence< 0, 1, 2, 3 >, 3, B_K1, B_K1 > |
Protected Attributes | |
| AThreadCopy | a_thread_copy_ |
| BThreadCopy | b_thread_copy_ |
Static Protected Attributes | |
| static constexpr auto | a_thread_desc_ |
| static constexpr auto | b_thread_desc_ |
| static constexpr auto | c_thread_desc_ |
Member Typedef Documentation
◆ AThreadCopy
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
protected |
◆ BThreadCopy
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
protected |
◆ HotLoopInstList
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
| using ck::BlockwiseGemmXdlops_pipeline_v4< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::HotLoopInstList = BlockwiseGemmXdlops_pipeline_hotloop_inst<BlockSize, MPerBlock, NPerBlock, KPerBlock, A_K1, B_K1, A_K1, B_K1, KPack, KPack, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops> |
◆ ThisThreadBlock
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
| using ck::BlockwiseGemmXdlops_pipeline_v4< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::ThisThreadBlock = ThisThreadBlock<BlockSize> |
◆ Tuple4
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
| using ck::BlockwiseGemmXdlops_pipeline_v4< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::Tuple4 = decltype(CalculateAThreadOriginDataIndex()) |
Constructor & Destructor Documentation
◆ BlockwiseGemmXdlops_pipeline_v4()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
inline |
Member Function Documentation
◆ CalculateAThreadOriginDataIndex()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
inlinestatic |
◆ CalculateBThreadOriginDataIndex()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
inlinestatic |
◆ CalculateCThreadOriginDataIndex()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
template<index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
|
inlinestatic |
◆ CalculateCThreadOriginDataIndex8D()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
template<index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
|
inlinestatic |
◆ GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
inlinestaticconstexpr |
◆ GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
inlinestaticconstexpr |
◆ GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
inlinestaticconstexpr |
◆ GetCThreadBuffer()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
inlineconstexpr |
◆ GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
inlinestaticconstexpr |
◆ GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
inlinestaticconstexpr |
◆ GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
inlinestaticconstexpr |
◆ GetWaveIdx()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
inlinestatic |
◆ HotLoopScheduler()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
inlinestaticconstexpr |
◆ MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
template<typename CGridDesc_G_M_N >
|
inlinestaticconstexpr |
◆ MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
template<typename CGridDesc_M_N >
|
inlinestaticconstexpr |
◆ Run()
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
template<bool HasMainLoop, index_t TailNum, typename AGridDesc , typename ABlockDesc , typename ABlockTransfer , typename AGridBuffer , typename ABlockBuffer , typename ABlockTransferStep , typename BGridDesc , typename BBlockDesc , typename BBlockTransfer , typename BGridBuffer , typename BBlockBuffer , typename BBlockTransferStep , typename CThreadBuffer >
|
inline |
◆ TailScheduler() [1/3]
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
template<index_t stage>
|
inlinestaticconstexpr |
◆ TailScheduler() [2/3]
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
template<>
|
inlineconstexpr |
◆ TailScheduler() [3/3]
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
template<>
|
inlineconstexpr |
Member Data Documentation
◆ a_block_desc_m0_m1_m2_k
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ A_K0
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ A_K1
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ a_thread_copy_
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
protected |
◆ a_thread_desc_
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexprprotected |
Initial value:
make_tuple(Number<MRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}),
Number<KPack>{}, Number<KRepeat * MRepeat * KPack>{}, Number<MRepeat * KPack>{}, I1))
__host__ constexpr __device__ auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition: tensor_descriptor_helper.hpp:49
static constexpr auto I1
Definition: blockwise_gemm_pipeline_xdlops.hpp:105
◆ b_block_desc_n0_n1_n2_k
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ B_K0
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ B_K1
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ b_thread_copy_
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
protected |
◆ b_thread_desc_
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexprprotected |
Initial value:
make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}),
Number<KPack>{}, Number<KRepeat * NRepeat * KPack>{}, Number<NRepeat * KPack>{}, I1))
◆ c_thread_buf_
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
| StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, FloatAcc, MRepeat * NRepeat, xdlops_gemm.GetRegSizePerXdlops(), true> ck::BlockwiseGemmXdlops_pipeline_v4< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::c_thread_buf_ |
◆ c_thread_desc_
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexprprotected |
Initial value:
make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()))
__host__ constexpr __device__ auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition: tensor_descriptor_helper.hpp:101
static constexpr auto xdlops_gemm
Definition: blockwise_gemm_pipeline_xdlops.hpp:118
◆ I0
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ I1
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ I2
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ I3
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ KPerThread
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ KRepeat
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ MWaves
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ NWaves
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ WaveSize
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
◆ xdlops_gemm
template<index_t BlockSize, typename FloatAB , typename FloatAcc , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
|
staticconstexpr |
Initial value:
=
XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}
The documentation for this struct was generated from the following file:
- /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp