Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Struct Template Reference

Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Struct Template Reference#

Composable Kernel: ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Struct Template Reference
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Struct Template Reference

#include <blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_gufusion_v3.hpp>

Inheritance diagram for ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >:
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >

Public Types

using Base = BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
using AccType = typename Base::AccType
 
using Tuple5 = typename Base::Tuple5
 
using ComputeTypeA = typename Base::ComputeTypeA
 
using ComputeTypeB = typename Base::ComputeTypeB
 
using mx_scale_t = e8m0_bexp_t
 
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4< ADataType, ComputeTypeA, decltype(a_block_desc_m0_m1_m2_m3_k), decltype(a_thread_desc_), Sequence< 1, 1, 1, 1, KThreadChunk >, Sequence< 0, 1, 2, 3, 4 >, 4, A_K1, A_K1 >
 
- Public Types inherited from ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
using ComputeTypeA = ADataType
 
using ComputeTypeB = BDataType
 
using AccType = float
 
using ThisThreadBlock = ThisThreadBlock< BlockSize >
 
using HotLoopInstList = ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>
 
using Tuple5 = decltype(CalculateAThreadOriginDataIndex())
 

Public Member Functions

template<bool HasMainLoop, TailNumber TailNum, typename AGridDesc , typename ABlockDesc , typename ABlockTransfer , typename AGridBuffer , typename ABlockBuffer , typename ABlockTransferStep , typename BGridDesc , typename BBlockDesc , typename BBlockTransfer , typename BGridBuffer , typename BBlockBuffer , typename BBlockTransferStep , typename CThreadBuffer , typename AScaleGridBuffer , typename AScaleGridDesc , typename AScaleThreadTransfer , typename BScaleGridBuffer , typename BScaleGridDesc , typename BScaleThreadTransfer >
__device__ void Run (const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_bufs, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, BBlockTransfer &b_blockwise_copy_up, const BGridBuffer &b_grid_buf, const BGridBuffer &b_grid_buf_up, BBlockBuffer &b_block_bufs, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, CThreadBuffer &c_thread_buf_up, const AScaleGridDesc &a_scale_grid_desc, AScaleThreadTransfer &a_scale_thread_copy, const AScaleGridBuffer &a_scale_grid_buf, const BScaleGridDesc &b_scale_grid_desc, BScaleThreadTransfer &b_scale_thread_copy, BScaleThreadTransfer &b_scale_thread_copy_up, const BScaleGridBuffer &b_scale_grid_buf, const BScaleGridBuffer &b_scale_grid_buf_up, index_t num_loop) const
 
- Public Member Functions inherited from ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
__host__ constexpr __device__ auto & GetCThreadBuffer ()
 
__host__ __device__ BlockwiseGemmXdlops_mx_pipeline_base (Tuple5 a_origin=CalculateAThreadOriginDataIndex(), Tuple5 b_origin=CalculateBThreadOriginDataIndex())
 Constructor for BlockwiseGemmXdlops_mx_pipeline_base. More...
 

Static Public Member Functions

static constexpr __host__ bool BlockHasHotloop (index_t num_loop)
 
static constexpr __host__ TailNumber BlockLoopTailNum (index_t num_loop)
 
static constexpr __device__ auto HotLoopScheduler ()
 
- Static Public Member Functions inherited from ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
static __device__ auto GetWaveIdx ()
 
static __device__ auto CalculateAThreadOriginDataIndex ()
 
static __device__ auto CalculateBThreadOriginDataIndex ()
 
static __device__ auto CalculateCThreadOriginDataIndex (Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
 
__host__ static constexpr __device__ auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 ()
 
__host__ static constexpr __device__ auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 ()
 
__host__ static constexpr __device__ auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3 ()
 
__host__ static constexpr __device__ auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 ()
 
__host__ static constexpr __device__ auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 ()
 
__host__ static constexpr __device__ auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 ()
 
__host__ static constexpr __device__ auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3 ()
 
__host__ static constexpr __device__ auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 ()
 
__host__ static constexpr __device__ auto MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 (const CGridDesc_M_N &c_grid_desc_m_n)
 
__host__ static constexpr __device__ auto MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 (const CGridDesc_G_M_N &c_grid_desc_g_m_n)
 
__host__ static constexpr __device__ auto GetCThreadDesc ()
 

Public Attributes

AThreadCopy a_thread_copy_ {Base::CalculateAThreadOriginDataIndex()}
 
- Public Attributes inherited from ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
StaticBufferTupleOfVector< AddressSpaceEnum::Vgpr, AccType, MRepeat *NRepeat, xdlops_gemm.GetRegSizePerXdlops(), true > c_thread_buf_
 

Static Public Attributes

static constexpr index_t PrefetchStages = 2
 
static constexpr index_t LocalPrefetchStages = 2
 
static constexpr index_t PrefillStages = 1
 
static constexpr index_t GlobalBufferNum = 1
 
static constexpr index_t HotloopLocalBufSwitch = MRepeat % 2 == 0 ? 0 : 1
 
static constexpr auto num_buffer_load_a_scale = MRepeat / MXdlPack * KRepeat / KXdlPack
 
static constexpr auto num_buffer_load_b_scale = NRepeat / NXdlPack * KRepeat / KXdlPack * 2
 
static constexpr auto async_vmcnt
 
static constexpr auto async_vmcnt_encoding = 3952 + async_vmcnt % 16 + async_vmcnt / 16 * 16384
 
static constexpr auto ScalesPerKBlockSize
 
static constexpr auto ScalesPerXdlopsRun
 
static constexpr auto ScalesPerXdlopsRunPerThread
 
static constexpr auto scale_pack_size_a = sizeof(AScaleDataType) / sizeof(mx_scale_t)
 
static constexpr auto scale_pack_size_b = sizeof(BScaleDataType) / sizeof(mx_scale_t)
 
static constexpr auto a_scale_thread_vec_size = KXdlPack * MXdlPack / scale_pack_size_a
 
static constexpr auto b_scale_thread_vec_size = KXdlPack * NXdlPack / scale_pack_size_b
 
static constexpr auto ARegBuf = 2
 
static constexpr auto a_thread_desc_
 
static constexpr auto a_scale_thread_desc
 
static constexpr auto b_scale_thread_desc
 
- Static Public Attributes inherited from ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
static constexpr index_t APackedSize
 
static constexpr index_t BPackedSize
 
static constexpr auto I0
 
static constexpr auto I1
 
static constexpr auto I2
 
static constexpr auto I3
 
static constexpr index_t WaveSize
 
static constexpr index_t A_K0
 
static constexpr index_t B_K0
 
static constexpr index_t A_K1
 
static constexpr index_t B_K1
 
static constexpr auto xdlops_gemm
 
static constexpr index_t AMmaKStride
 
static constexpr index_t BMmaKStride
 
static constexpr index_t KThreadChunk
 
static constexpr index_t KPerThread
 
static constexpr index_t KRepeat
 
static constexpr index_t KPerInnerLoop
 
static constexpr index_t MWaves
 
static constexpr index_t NWaves
 
static constexpr index_t MXdlPack
 
static constexpr index_t NXdlPack
 
static constexpr index_t KXdlPack
 
static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_m3_k
 
static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_n3_k
 

Additional Inherited Members

- Protected Types inherited from ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4< ADataType, ComputeTypeA, decltype(a_block_desc_m0_m1_m2_m3_k), decltype(a_thread_desc_), Sequence< 1, 1, 1, 1, KThreadChunk >, Sequence< 0, 1, 2, 3, 4 >, 4, A_K1, A_K1 >
 
using BThreadCopy = ThreadwiseTensorSliceTransfer_v4< BDataType, ComputeTypeB, decltype(b_block_desc_n0_n1_n2_n3_k), decltype(b_thread_desc_), Sequence< 1, 1, 1, 1, KThreadChunk >, Sequence< 0, 1, 2, 3, 4 >, 4, B_K1, B_K1 >
 
- Protected Attributes inherited from ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
AThreadCopy a_thread_copy_
 
BThreadCopy b_thread_copy_
 
- Static Protected Attributes inherited from ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
static constexpr auto a_thread_desc_
 
static constexpr auto b_thread_desc_
 
static constexpr auto c_thread_desc_
 

Member Typedef Documentation

◆ AccType

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
using ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AccType = typename Base::AccType

◆ AThreadCopy

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
using ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType, ComputeTypeA, decltype(a_block_desc_m0_m1_m2_m3_k), decltype(a_thread_desc_), Sequence<1, 1, 1, 1, KThreadChunk>, Sequence<0, 1, 2, 3, 4>, 4, A_K1, A_K1>

◆ Base

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
using ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Base = BlockwiseGemmXdlops_mx_pipeline_base<ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack>

◆ ComputeTypeA

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
using ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeA = typename Base::ComputeTypeA

◆ ComputeTypeB

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
using ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeB = typename Base::ComputeTypeB

◆ mx_scale_t

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
using ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::mx_scale_t = e8m0_bexp_t

◆ Tuple5

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
using ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Tuple5 = typename Base::Tuple5

Member Function Documentation

◆ BlockHasHotloop()

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
static constexpr __host__ bool ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockHasHotloop ( index_t  num_loop)
inlinestaticconstexpr

◆ BlockLoopTailNum()

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
static constexpr __host__ TailNumber ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockLoopTailNum ( index_t  num_loop)
inlinestaticconstexpr

◆ HotLoopScheduler()

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
static constexpr __device__ auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotLoopScheduler ( )
inlinestaticconstexpr

◆ Run()

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
template<bool HasMainLoop, TailNumber TailNum, typename AGridDesc , typename ABlockDesc , typename ABlockTransfer , typename AGridBuffer , typename ABlockBuffer , typename ABlockTransferStep , typename BGridDesc , typename BBlockDesc , typename BBlockTransfer , typename BGridBuffer , typename BBlockBuffer , typename BBlockTransferStep , typename CThreadBuffer , typename AScaleGridBuffer , typename AScaleGridDesc , typename AScaleThreadTransfer , typename BScaleGridBuffer , typename BScaleGridDesc , typename BScaleThreadTransfer >
__device__ void ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Run ( const AGridDesc &  a_grid_desc,
const ABlockDesc &  a_block_desc,
ABlockTransfer &  a_blockwise_copy,
const AGridBuffer &  a_grid_buf,
ABlockBuffer &  a_block_bufs,
const ABlockTransferStep &  a_block_copy_step,
const BGridDesc &  b_grid_desc,
const BBlockDesc &  b_block_desc,
BBlockTransfer &  b_blockwise_copy,
BBlockTransfer &  b_blockwise_copy_up,
const BGridBuffer &  b_grid_buf,
const BGridBuffer &  b_grid_buf_up,
BBlockBuffer &  b_block_bufs,
const BBlockTransferStep &  b_block_copy_step,
CThreadBuffer &  c_thread_buf,
CThreadBuffer &  c_thread_buf_up,
const AScaleGridDesc &  a_scale_grid_desc,
AScaleThreadTransfer &  a_scale_thread_copy,
const AScaleGridBuffer &  a_scale_grid_buf,
const BScaleGridDesc &  b_scale_grid_desc,
BScaleThreadTransfer &  b_scale_thread_copy,
BScaleThreadTransfer &  b_scale_thread_copy_up,
const BScaleGridBuffer &  b_scale_grid_buf,
const BScaleGridBuffer &  b_scale_grid_buf_up,
index_t  num_loop 
) const
inline

Member Data Documentation

◆ a_scale_thread_desc

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_scale_thread_desc
staticconstexpr
Initial value:
make_tuple(Number<MRepeat / MXdlPack>{},
Number<ScalesPerXdlopsRunPerThread * a_scale_thread_vec_size>{}))
__host__ constexpr __device__ auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition: tensor_descriptor_helper.hpp:101
__host__ constexpr __device__ auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:211
integral_constant< index_t, N > Number
Definition: number.hpp:12

◆ a_scale_thread_vec_size

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_scale_thread_vec_size = KXdlPack * MXdlPack / scale_pack_size_a
staticconstexpr

◆ a_thread_copy_

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
AThreadCopy ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_copy_ {Base::CalculateAThreadOriginDataIndex()}

◆ a_thread_desc_

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_desc_
staticconstexpr

◆ ARegBuf

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ARegBuf = 2
staticconstexpr

◆ async_vmcnt

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::async_vmcnt
staticconstexpr

◆ async_vmcnt_encoding

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::async_vmcnt_encoding = 3952 + async_vmcnt % 16 + async_vmcnt / 16 * 16384
staticconstexpr

◆ b_scale_thread_desc

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_scale_thread_desc
staticconstexpr
Initial value:
make_tuple(Number<NRepeat / NXdlPack>{},
Number<ScalesPerXdlopsRunPerThread * b_scale_thread_vec_size>{}))

◆ b_scale_thread_vec_size

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_scale_thread_vec_size = KXdlPack * NXdlPack / scale_pack_size_b
staticconstexpr

◆ GlobalBufferNum

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr index_t ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GlobalBufferNum = 1
staticconstexpr

◆ HotloopLocalBufSwitch

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr index_t ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotloopLocalBufSwitch = MRepeat % 2 == 0 ? 0 : 1
staticconstexpr

◆ LocalPrefetchStages

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr index_t ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::LocalPrefetchStages = 2
staticconstexpr

◆ num_buffer_load_a_scale

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::num_buffer_load_a_scale = MRepeat / MXdlPack * KRepeat / KXdlPack
staticconstexpr

◆ num_buffer_load_b_scale

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::num_buffer_load_b_scale = NRepeat / NXdlPack * KRepeat / KXdlPack * 2
staticconstexpr

◆ PrefetchStages

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr index_t ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefetchStages = 2
staticconstexpr

◆ PrefillStages

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr index_t ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefillStages = 1
staticconstexpr

◆ scale_pack_size_a

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::scale_pack_size_a = sizeof(AScaleDataType) / sizeof(mx_scale_t)
staticconstexpr

◆ scale_pack_size_b

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::scale_pack_size_b = sizeof(BScaleDataType) / sizeof(mx_scale_t)
staticconstexpr

◆ ScalesPerKBlockSize

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ScalesPerKBlockSize
staticconstexpr
Initial value:
=
KPerBlock / ScaleBlockSize

◆ ScalesPerXdlopsRun

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ScalesPerXdlopsRun
staticconstexpr

◆ ScalesPerXdlopsRunPerThread

template<index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ScalesPerXdlopsRunPerThread
staticconstexpr

The documentation for this struct was generated from the following file: