DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer > Struct Template Reference#
Classes |
Public Types |
Public Member Functions |
Static Public Member Functions |
Static Public Attributes |
List of all members
ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer > Struct Template Reference
#include <device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp>
Inheritance diagram for ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >:
Classes | |
| struct | Argument |
| struct | ComputeBasePtrOfStridedBatch |
| struct | CrossAttnArg |
| struct | CrossAttnInvoker |
| struct | Invoker |
| struct | RawArg |
| struct | SelfAttnArg |
| struct | SelfAttnInvoker |
Public Types | |
| using | DeviceOp = DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle |
| using | Transform = TransformBatchedContractionContractionToBatchedGemmGemm_Wmma< Sequence< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN >, Sequence< MPerBlock, LPerBlock, KPerBlock, NPerBlock >, GemmSpec, ASpec, B0Spec, B1Spec, CSpec > |
| using | AGridDesc = decltype(MakeAGridDescriptor({}, {})) |
| using | B0GridDesc = decltype(MakeB0GridDescriptor({}, {})) |
| using | B1GridDesc = decltype(MakeB1GridDescriptor({}, {})) |
| using | CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})) |
| using | AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})) |
| using | B0GridDesc_G_L_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})) |
| using | B1GridDesc_G_N_L = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})) |
| using | CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})) |
| using | C0MatrixMask = C0MatrixMask_impl< decltype(make_MaskOutPredicate())> |
| using | GridwiseOp = GridwiseBatchedGemmSoftmaxGemm_Wmma< ADataType, B0DataType, Acc0DataType, B1DataType, Acc1DataType, CShuffleDataType, CDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, InMemoryDataOperationEnum::Set, AGridDesc, B0GridDesc, B1GridDesc, CGridDesc_M_N, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, true, AEnableLds, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, true, B0EnableLds, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, false, B1EnableLds, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, Transform::matrix_padder.PadN, MaskingSpec==MaskingSpecialization::MaskOutUpperTriangle, NumPrefetch, LoopSched, PipelineVer > |
Public Member Functions | |
| bool | IsSupportedArgument (const BaseArgument *p_arg) override |
| std::unique_ptr< BaseArgument > | MakeArgumentPointer (const void *p_a, const void *p_b0, const void *p_b1, void *p_c, const std::array< void *, NumAcc0Bias > p_acc0_biases, const std::array< void *, NumAcc1Bias > p_acc1_biases, const std::vector< index_t > &a_gs_ms_ks_lengths, const std::vector< index_t > &a_gs_ms_ks_strides, const std::vector< index_t > &b0_gs_ls_ks_lengths, const std::vector< index_t > &b0_gs_ls_ks_strides, const std::vector< index_t > &b1_gs_ns_ls_lengths, const std::vector< index_t > &b1_gs_ns_ls_strides, const std::vector< index_t > &c_gs_ms_ns_lengths, const std::vector< index_t > &c_gs_ms_ns_strides, const std::array< std::vector< ck::index_t >, NumAcc0Bias > acc0_biases_gs_ms_ls_lengths, const std::array< std::vector< ck::index_t >, NumAcc0Bias > acc0_biases_gs_ms_ls_strides, const std::array< std::vector< ck::index_t >, NumAcc1Bias > acc1_biases_gs_ms_ns_lengths, const std::array< std::vector< ck::index_t >, NumAcc1Bias > acc1_biases_gs_ms_ns_strides, AElementwiseOperation a_element_op, B0ElementwiseOperation b0_element_op, AccElementwiseOperation acc_element_op, B1ElementwiseOperation b1_element_op, CElementwiseOperation c_element_op) override |
| std::unique_ptr< BaseInvoker > | MakeInvokerPointer () override |
| std::string | GetTypeString () const override |
Public Member Functions inherited from ck::tensor_operation::device::BaseOperator | |
| BaseOperator ()=default | |
| BaseOperator (const BaseOperator &)=default | |
| BaseOperator & | operator= (const BaseOperator &)=default |
| virtual std::string | GetTypeIdName () const |
| virtual std::optional< std::string > | GetObjectName () const |
| virtual std::optional< std::string > | GetTemplateInfo () const |
| virtual std::string | GetTypeIdHashCode () const |
| virtual size_t | GetWorkSpaceSize (const BaseArgument *) const |
| virtual void | SetWorkSpacePointer (BaseArgument *p_arg, void *p_workspace, const StreamConfig &=StreamConfig{}) const |
| virtual | ~BaseOperator () |
Static Public Member Functions | |
| __host__ static __device__ auto | MakeAGridDescriptor (const std::array< index_t, NumDimG+NumDimM+NumDimN > &a_gs_ms_ks_lengths_vec, const std::array< index_t, NumDimG+NumDimM+NumDimN > &a_gs_ms_ks_strides_vec) |
| __host__ static __device__ auto | MakeB0GridDescriptor (const std::array< index_t, NumDimG+NumDimM+NumDimN > &b0_gs_ls_ks_lengths_vec, const std::array< index_t, NumDimG+NumDimM+NumDimN > &b0_gs_ls_ks_strides_vec) |
| __host__ static __device__ auto | MakeB1GridDescriptor (const std::array< index_t, NumDimG+NumDimM+NumDimN > &b1_gs_ns_ls_lengths_vec, const std::array< index_t, NumDimG+NumDimM+NumDimN > &b1_gs_ns_ls_strides_vec) |
| __host__ constexpr static __device__ auto | make_MaskOutPredicate () |
| static auto | MakeArgument (const ADataType *p_a, const B0DataType *p_b0, const B1DataType *p_b1, CDataType *p_c, index_t M, index_t N, index_t K, index_t O, index_t G0, index_t G1, float alpha, bool input_permute, bool output_permute) |
| static bool | IsSupportedArgument (const RawArg &arg) |
| static auto | MakeSelfAttnArgument (const ADataType *p_qkv_grid, CDataType *p_out_grid, index_t batch_size, index_t sequence_length, index_t head_count, index_t head_size, float alpha) |
| static auto | MakeCrossAttnArgument (const ADataType *p_q_grid, const B0DataType *p_kv_grid, CDataType *p_out_grid, index_t batch_size, index_t q_sequence_length, index_t kv_sequence_length, index_t head_count, index_t head_size, float alpha) |
| static auto | MakeSelfAttnInvoker () |
| static auto | MakeCrossAttnInvoker () |
| static constexpr bool | IsValidCompilationParameter () |
| static auto | MakeInvoker () |
Static Public Attributes | |
| static constexpr index_t | NumAcc0Bias = Acc0BiasDataType::Size() |
| static constexpr index_t | NumAcc1Bias = Acc1BiasDataType::Size() |
| static constexpr index_t | NumDimGemm0M = NumDimM |
| static constexpr index_t | NumDimGemm0N = NumDimL |
| static constexpr index_t | NumDimGemm0K = NumDimK |
| static constexpr index_t | NumDimGemm1M = NumDimM |
| static constexpr index_t | NumDimGemm1N = NumDimN |
| static constexpr index_t | NumDimGemm1K = NumDimL |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr auto | I2 = Number<2>{} |
| static constexpr auto | I3 = Number<3>{} |
| static constexpr auto | I4 = Number<4>{} |
| static constexpr auto | I5 = Number<5>{} |
| static constexpr auto | I6 = Number<6>{} |
| static constexpr auto | WmmaK = 16 |
| static constexpr auto | MWaves = MPerBlock / (MRepeat * MPerWmma) |
| static constexpr auto | LWaves = LPerBlock / (LRepeat * LPerWmma) |
| static constexpr auto | NWaves = NPerBlock / (NRepeat * NPerWmma) |
| static constexpr auto | AEnableLds_auto = LWaves == 1 ? false : true |
| static constexpr auto | B0EnableLds_auto = MWaves == 1 ? false : true |
| static constexpr auto | B1EnableLds_auto = MWaves == 1 ? false : true |
| static constexpr auto | AEnableLds_manu = false |
| static constexpr auto | B0EnableLds_manu = true |
| static constexpr auto | B1EnableLds_manu = true |
| static constexpr auto | AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1) |
| static constexpr auto | B0EnableLds = B0EnableLds_auto || B0EnableLds_manu || (NumPrefetch > 1) |
| static constexpr auto | B1EnableLds = B1EnableLds_auto || B1EnableLds_manu || (NumPrefetch > 1) |
Static Public Attributes inherited from ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc1BiasDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, MaskingSpec > | |
| static constexpr index_t | NumAcc0Bias |
| static constexpr index_t | NumAcc1Bias |
Member Typedef Documentation
◆ AGridDesc
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::AGridDesc = decltype(MakeAGridDescriptor({}, {})) |
◆ AGridDesc_G_M_K
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})) |
◆ B0GridDesc
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B0GridDesc = decltype(MakeB0GridDescriptor({}, {})) |
◆ B0GridDesc_G_L_K
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B0GridDesc_G_L_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})) |
◆ B1GridDesc
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B1GridDesc = decltype(MakeB1GridDescriptor({}, {})) |
◆ B1GridDesc_G_N_L
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B1GridDesc_G_N_L = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})) |
◆ C0MatrixMask
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::C0MatrixMask = C0MatrixMask_impl<decltype(make_MaskOutPredicate())> |
◆ CGridDesc_G_M_N
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})) |
◆ CGridDesc_M_N
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})) |
◆ DeviceOp
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::DeviceOp = DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle |
◆ GridwiseOp
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::GridwiseOp = GridwiseBatchedGemmSoftmaxGemm_Wmma< ADataType, B0DataType, Acc0DataType, B1DataType, Acc1DataType, CShuffleDataType, CDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, InMemoryDataOperationEnum::Set, AGridDesc, B0GridDesc, B1GridDesc, CGridDesc_M_N, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, true, AEnableLds, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, true, B0EnableLds, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, false, B1EnableLds, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, Transform::matrix_padder.PadN, MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle, NumPrefetch, LoopSched, PipelineVer> |
◆ Transform
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::Transform = TransformBatchedContractionContractionToBatchedGemmGemm_Wmma< Sequence<NumDimG, NumDimM, NumDimL, NumDimK, NumDimN>, Sequence<MPerBlock, LPerBlock, KPerBlock, NPerBlock>, GemmSpec, ASpec, B0Spec, B1Spec, CSpec> |
Member Function Documentation
◆ GetTypeString()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlineoverridevirtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
◆ IsSupportedArgument() [1/2]
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlineoverridevirtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
◆ IsSupportedArgument() [2/2]
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ IsValidCompilationParameter()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestaticconstexpr |
◆ make_MaskOutPredicate()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestaticconstexpr |
◆ MakeAGridDescriptor()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeArgument()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeArgumentPointer()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlineoverridevirtual |
◆ MakeB0GridDescriptor()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeB1GridDescriptor()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeCrossAttnArgument()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeCrossAttnInvoker()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeInvoker()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeInvokerPointer()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlineoverridevirtual |
◆ MakeSelfAttnArgument()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeSelfAttnInvoker()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
Member Data Documentation
◆ AEnableLds
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ AEnableLds_auto
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ AEnableLds_manu
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ B0EnableLds
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ B0EnableLds_auto
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ B0EnableLds_manu
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ B1EnableLds
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ B1EnableLds_auto
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ B1EnableLds_manu
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I0
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I1
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I2
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I3
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I4
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I5
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I6
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ LWaves
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ MWaves
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumAcc0Bias
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumAcc1Bias
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumDimGemm0K
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumDimGemm0M
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumDimGemm0N
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumDimGemm1K
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumDimGemm1M
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumDimGemm1N
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NWaves
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ WmmaK
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename Acc0BiasDataType , typename Acc0DataType , typename Acc1BiasDataType , typename Acc1DataType , typename CShuffleDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1 , typename ABlockTransferThreadClusterArrangeOrder , typename ABlockTransferSrcAccessOrder , ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1 , typename B0BlockTransferThreadClusterArrangeOrder , typename B0BlockTransferSrcAccessOrder , ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1 , typename B1BlockTransferThreadClusterArrangeOrder , typename B1BlockTransferSrcAccessOrder , ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock , index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
The documentation for this struct was generated from the following file:
- /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-7.0.1/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp
Public Member Functions inherited from