ck Namespace Reference

Namespaces

 conv_tensor_rearrange_op
 
 debug
 
 detail
 
 details
 
 dpp8
 
 fp8_impl
 
 host_common
 
 impl
 
 internal
 
 lds_utils
 
 literals
 
 math
 
 mathext
 
 ranges
 
 reduce
 
 tensor_layout
 
 tensor_operation
 
 util
 
 utility
 
 utils
 

Classes

struct  InMemoryDataOperationEnumSequence
 
struct  StaticTensor
 
struct  StaticTensorTupleOfVectorBuffer
 
struct  PassThrough
 
struct  Pad
 
struct  LeftPad
 
struct  RightPad
 
struct  Embed
 
struct  Merge_v1_carry_check
 
struct  lambda_merge_generate_MagicDivision_calculate_magic_multiplier
 
struct  lambda_merge_generate_MagicDivision_calculate_magic_shift
 
struct  Merge_v2_magic_division
 
struct  Merge_v2r2_magic_division
 
struct  Merge_v3_division_mod
 
struct  UnMerge
 
struct  ConvBwdDataImplicitGemmOutTransform
 Transformation struct mapping convolution backward-data output indices to GEMM indices.
 
struct  Freeze
 
struct  Insert
 
struct  Vectorize
 
struct  Slice
 
struct  Modulo
 
struct  Xor
 
struct  TensorAdaptor
 
struct  TensorCoordinate
 
struct  TensorCoordinateStep
 
struct  TensorDescriptor
 
struct  lambda_get_up_dim_num
 
struct  SpaceFillingCurve
 
struct  BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
 
struct  BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
 
struct  BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
 
struct  BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2
 
struct  BlockwiseGemmXdlops_mx_pipeline_base
 
struct  BlockwiseGemmWmmaops_pipeline_hotloop_inst
 
struct  BlockwiseGemmWmmaops_pipeline_base
 
struct  BlockwiseGemmWmmaops_pipeline_v1
 
struct  BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >
 
struct  BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >
 
struct  BlockwiseGemmWmmaops_pipeline_v3
 
struct  BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >
 
struct  BlockwiseGemmXdlops_pipeline_hotloop_inst
 
struct  BlockwiseGemmXdlops_pipeline_v4
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v1
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v2
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v2< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v3
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_base
 
struct  BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v1
 
struct  BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3
 
struct  BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v3
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v3
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_mx_moe_bns_gufusion_v3
 
struct  BlockwiseGemmXdlops_pipeline_mx_moe_bns_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v1
 
struct  BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v1< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3
 
struct  BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v1
 
struct  BlockwiseGemmXdlops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v1_ab_scale
 
struct  BlockwiseGemmXdlops_pipeline_v1_ab_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v1_b_scale
 
struct  BlockwiseGemmXdlops_pipeline_v1_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v1_mx
 
struct  BlockwiseGemmXdlops_pipeline_v1_mx< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v2
 
struct  BlockwiseGemmXdlops_pipeline_v2< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v2< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v2_ab_scale
 
struct  BlockwiseGemmXdlops_pipeline_v2_ab_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v2_b_scale
 
struct  BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v3
 
struct  BlockwiseGemmXdlops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v3_ab_scale
 
struct  BlockwiseGemmXdlops_pipeline_v3_ab_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v3_b_scale
 
struct  BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v3_mx
 
struct  BlockwiseGemmXdlops_pipeline_v3_mx< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle
 
struct  BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v4_b_scale
 
struct  BlockwiseGemmXdlops_pipeline_v4_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v5
 
struct  BlockwiseGemmXdlops_pipeline_v5< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 
struct  BlockwiseGemmWMMA
 
struct  BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 
struct  BlockwiseGemmXdlops_v2
 Blockwise gemm.
 
struct  BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1r1
 
struct  BlockwiseSoftmax
 Blockwise softmax.
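For reference, blockwise softmax computes the numerically stable form softmax(x)_i = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j) over each row. A minimal scalar sketch of the same computation (plain C++, not the CK API; the struct itself performs the max- and sum-reductions cooperatively across a thread block):

    #include <cmath>
    #include <vector>

    // Scalar sketch of the per-row reduction sequence that a blockwise
    // softmax performs: max-reduce, exponentiate, sum-reduce, normalize.
    void softmax_row(std::vector<float>& x) // assumes x is non-empty
    {
        float m = x[0];
        for(float v : x) m = std::fmax(m, v);              // max-reduce
        float s = 0.f;
        for(float& v : x) { v = std::exp(v - m); s += v; } // exp + sum
        for(float& v : x) v /= s;                          // normalize
    }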
 
struct  BlockwiseTensorSliceTransfer_v5r1
 
struct  BlockwiseWelford
 
struct  PartitionedBlockwiseReduction
 
struct  PartitionedBlockwiseReduction_v2
 
struct  PartitionedBlockwiseReductionWithIndex
 
struct  ThreadGroupTensorSliceTransfer_DirectLoad
 
struct  ThreadGroupTensorSliceTransfer_Gather_DirectLoad
 
struct  ThreadGroupTensorSliceTransfer_v4r1
 Blockwise data transfer.
 
struct  ThreadGroupTensorSliceTransfer_v4r1_dequant
 Blockwise data transfer with dequantization.
 
struct  ThreadGroupTensorSliceTransfer_v4r1_gather
 Blockwise data transfer with gather.
 
struct  ThreadGroupTensorSliceTransfer_v4r2
 Blockwise data transfer.
 
struct  ThreadGroupTensorSliceTransfer_v6r1
 
struct  ThreadGroupTensorSliceTransfer_v6r1r2
 
struct  ThreadGroupTensorSliceTransfer_v6r2
 
struct  ThreadGroupTensorSliceTransfer_v6r3
 
struct  ThreadGroupTensorSliceTransfer_v7
 
struct  ThreadGroupTensorSliceTransfer_v7r2
 
struct  ThreadGroupTensorSliceTransfer_v7r3
 
struct  ThreadGroupTensorSliceTransfer_v7r3_scatter
 
struct  reduce_binary_operator
 
struct  reduce_binary_operator< ReduceTensorOp::ADD >
 
struct  reduce_binary_operator< ReduceTensorOp::MUL >
 
struct  reduce_binary_operator< ReduceTensorOp::MIN >
 
struct  reduce_binary_operator< ReduceTensorOp::MAX >
 
struct  reduce_binary_operator< ReduceTensorOp::AMAX >
 
struct  reduce_binary_operator< ReduceTensorOp::AVG >
 
struct  reduce_binary_operator< ReduceTensorOp::NORM1 >
 
struct  reduce_binary_operator< ReduceTensorOp::NORM2 >
 
struct  reduce_unary_operator
 
struct  reduce_unary_operator< ReduceTensorOp::AVG, IsFirstReduce, true >
 
struct  reduce_unary_operator< ReduceTensorOp::NORM1, true, IsLastReduce >
 
struct  reduce_unary_operator< ReduceTensorOp::AMAX, true, IsLastReduce >
 
struct  reduce_unary_operator< ReduceTensorOp::NORM2, true, false >
 
struct  reduce_unary_operator< ReduceTensorOp::NORM2, true, true >
 
struct  reduce_unary_operator< ReduceTensorOp::NORM2, false, true >
 
struct  GridwiseMultiblockBatchNormForward
 
struct  GridwiseReduceSecondHalfBatchNormBackwardFinal
 
struct  GridwiseMultiblockWelfordFirstHalf
 
struct  GridwiseWelfordSecondHalfBatchNormForwardFinal
 
struct  GridwiseWelfordSecondHalfReduceFirstHalf
 
struct  BlockToCTileMap_M00_N0_M01
 
struct  BlockToCTileMap_M00_N0_M01Adapt
 
struct  BlockToCTileMap_M00_N0_M01Adapt< MPerBlock, NPerBlock, void >
 
struct  BlockToCTileMap_Grouped_M00_N0_M01Adapt
 
struct  BlockToCTileMap_N00_M0_N01Adapt
 
struct  BlockToCTileMap_N00_M0_N01Adapt< MPerBlock, NPerBlock, void >
 
struct  BlockToCTileMap_KSplit_M00_N0_M01Adapt
 
struct  BlockToCTileMap_M00_N00_M01_N01
 
struct  BlockToCTileMap_KSplit_M00_N00_M01_N01
 
struct  OffsettedBlockToCTileMap
 
struct  OffsettedBlockToCTileMap2
 
struct  BlockToCTileMap_3DGrid_KSplit
 Simple tile mapping that creates a 3D grid of thread blocks.
 
struct  BlockToCTileMap_GemmStreamK
 
struct  BlockToCTileMap_GemmStreamK_v2
 
struct  GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle
 
struct  GridwiseWelfordSecondHalfLayernorm2d
 
struct  GridwiseMultipleReduction_mk_to_m_multiblock
 
struct  GridwiseMultipleReduction_mk_to_m_threadwise
 
struct  GridwiseReduction_mk_to_m_multiblock
 
struct  GridwiseReduction_mk_to_m_threadwise
 
struct  GridwiseReduction_mk_to_m_threadwise_multi_d
 
struct  GridwiseBatchedGemmGemm_wmma_cshuffle_v3
 
struct  GridwiseBatchedGemmGemm_Xdl_CShuffle
 
struct  GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
 
struct  GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
 
struct  GridwiseBatchedGemmSoftmaxGemm_Wmma
 
struct  GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
 Gridwise gemm + softmax + gemm fusion.
 
struct  GridwiseBatchNormBackwardWithBlockwiseWelford
 
struct  GridwiseBatchNormForwardWithBlockwiseWelford
 
struct  GridwiseElementwise_1D
 
struct  GridwiseElementwise
 
struct  GridwiseElementwiseLayernormWelfordVariance_mk_to_mk
 
struct  GridwiseFpAintBGemm_Wmma
 
struct  GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 
struct  GridwiseGemmDlMultipleD_km_kn_mn
 
struct  GridwiseGemmDl_km_kn_mn_v1r3
 
struct  GridwiseGemmDl_bkm_bkn_mn_v1r3
 
struct  GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp
 
struct  GridwiseGemmMultipleABD_xdl_cshuffle
 
struct  GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 
struct  GridwiseGemmMultipleD_Wmma
 
struct  GridwiseGemmMultipleD_xdl_cshuffle
 
struct  GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
 
struct  GridwiseGemmMultipleD_xdl_splitk_cshuffle
 
struct  GridwiseGemmPipeline_v1
 
struct  GridwiseGemmPipeline_v1< 1, true, true >
 
struct  GridwiseGemmPipeline_v1< 2, true, true >
 
struct  GridwiseGemmPipeline_v1< 1, false, true >
 
struct  GridwiseGemmPipeline_v1< 1, true, false >
 
struct  GridwiseGemmPipeline_v1< 1, false, false >
 
struct  GridwiseGemmPipeline_v1_WeightOnly
 
struct  GridwiseGemmPipeline_v1_WeightOnly< 1, true, true >
 
struct  GridwiseGemmPipelineInterwave_v1
 
struct  GridwiseGemmPipelineInterwave_v1< 1 >
 
struct  GridwiseGemmPipelineInterwave_v1< 2 >
 
struct  GridwiseGemmPipeline_v2
 
struct  GridwiseGemmPipeline_v3
 
struct  GridwiseGemmPipeline_v4
 
struct  GridwiseGemmPipeline_v4< 1 >
 
struct  GridwiseGemmPipeline_v4< 2 >
 
struct  GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 
struct  GridwiseGemmSplitKMultipleD_xdl_cshuffle
 
struct  GridwiseGemmLoadWave
 
struct  GridwiseGemmLoadWave< TileLoadThreadGroup, 1 >
 
struct  GridwiseGemmMathWave
 
struct  GridwiseGemmMathWave< TileMathThreadGroup, 1 >
 
struct  GridwiseGemm_Wmma
 
struct  GridwiseGemm_wmma_cshuffle_v3
 "Universal" GEMM kernel with SplitK support. More...
 
struct  GridwiseGemm_wmma_cshuffle_v3_b_scale
 
struct  GridwiseGemm_wmma_cshuffle_v3_base
 
struct  GridwiseGemm_xdl_cshuffle_conv_v3
 
struct  GridwiseGemm_xdl_cshuffle_streamk_v3
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 
struct  GridwiseGemm_xdl_cshuffle_v2
 
struct  GridwiseGemm_xdl_cshuffle_v3
 "Universal" GEMM kernel with SplitK support. More...
 
struct  GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
 
struct  GridwiseGemmMultiD_xdl_cshuffle_v3
 
struct  GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
 
struct  GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
 
struct  GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
 
struct  GridwiseGemmMX_xdl_cshuffle_v3
 
struct  GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle
 
struct  GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle
 
struct  Merge_v4_no_carry
 
struct  GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
 
struct  GridwiseGemm_xdlops_splitk_lds_direct_load
 
struct  GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext
 
struct  GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
 
struct  GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
 
struct  GridwiseMoeGemm
 
struct  GridwiseMoeGemmBlockScale
 
struct  GridwiseMoeGemmMX
 
struct  GridwiseMoeGemmMXBNS
 
struct  GridwiseMoeGemmMX_BPreshuffle
 
struct  GridwisePermute
 
struct  GridwisePutElement_1D
 
struct  GridwiseSoftmax_mk_to_mk
 
struct  GridwiseSparseEmbeddingsForwardLayernorm
 
struct  GridwiseTensorRearrange
 
struct  GridwiseNormalizationBwdData_mk_to_mk
 
struct  GridwiseNormalizationBwdGammaBeta_mk_to_k
 
struct  GridwiseNormalizationNaiveVariance_mk_to_mk
 
struct  GridwiseNormalizationSplitK1st
 
struct  GridwiseNormalizationSplitK2nd
 
struct  GridwiseNormalizationWelfordVariance_mk_to_mk
 
struct  ThreadwiseReduction
 
struct  ThreadwiseReductionWithIndex
 
struct  ThreadwiseGemmDl_km0m1_kn0n1_m0m1n0n1
 
struct  ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
 
struct  ThreadwiseGemmDlops_km_kn_mn_v3
 
struct  ThreadwiseTensorSliceSet_v1
 
struct  ThreadwiseTensorSliceTransfer_v1r3
 
struct  ThreadwiseTensorSliceTransfer_v2
 Helper structure that facilitates transfer of source (grid) data to destination threads.
 
struct  ThreadwiseTensorSliceTransfer_v2_gather
 
struct  ThreadwiseTensorSliceTransfer_v3
 
struct  ThreadwiseTensorSliceTransfer_v4
 
struct  ThreadwiseTensorSliceTransfer_StaticToStatic
 Threadwise data transfer.
 
struct  ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
 
struct  ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow
 
struct  ThreadwiseTensorSliceTransfer_v3r1
 
struct  ThreadwiseTensorSliceTransfer_v3r1_dequant
 
struct  ThreadwiseTensorSliceTransfer_v3r1_gather
 
struct  ThreadwiseTensorSliceTransfer_v3r2
 
struct  ThreadwiseTensorSliceTransfer_v4r1
 
struct  ThreadwiseTensorSliceTransfer_v5r1
 
struct  ThreadwiseTensorSliceTransfer_v6r1
 
struct  ThreadwiseTensorSliceTransfer_v6r1r2
 
struct  ThreadwiseTensorSliceTransfer_v6r2
 
struct  ThreadwiseTensorSliceTransfer_v6r3
 
struct  ThreadwiseTensorSliceTransfer_v7
 
struct  ThreadwiseTensorSliceTransfer_v7r2
 
struct  ThreadwiseTensorSliceTransfer_v7r3
 
struct  ThreadwiseTensorSliceTransfer_v7r3_scatter
 
struct  ThreadwiseWelford
 
struct  ThreadwiseWelfordMerge
 
struct  dpp_type
 
struct  dpp_type< DppInstr::dpp8_f16_32x8x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_8x32x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_8x16x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_16x16x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_4x32x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_4x16x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_1x32x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_2x32x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_2x16x2 >
 
struct  DppSelector
 
struct  DppGemm
 
struct  smfmac_type
 
struct  smfmac_type< SmfmacInstr::smfmac_f32_16x16x32f16 >
 
struct  smfmac_type< SmfmacInstr::smfmac_f32_32x32x16f16 >
 
struct  smfmac_type< SmfmacInstr::smfmac_f32_16x16x32bf16 >
 
struct  smfmac_type< SmfmacInstr::smfmac_f32_32x32x16bf16 >
 
struct  SmfmacSelector
 
struct  SparseXdlopsGemm
 
struct  wmma_type
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_f16, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_bf16, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f16_16x16x16_f16, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_bf16_16x16x16_bf16, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_i32_16x16x16_iu8, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_f16_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_bf16_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_i32_16x16x16_iu8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_f8f8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_f8bf8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_bf8f8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_bf8bf8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  WmmaSelector
 
struct  WmmaGemm
 
struct  mfma_type
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x1f32 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x2f32 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x4f32 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x1f32 >
 
struct  mfma_type< MfmaInstr::mfma_f32_4x4x1f32 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x4f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x8f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x16f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x4f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_4x4x4f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16bf16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x8bf16_1k >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32bf16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x16bf16_1k >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x4bf16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x8bf16 >
 
struct  mfma_type< MfmaInstr::mfma_i32_32x32x8i8 >
 
struct  mfma_type< MfmaInstr::mfma_i32_16x16x16i8 >
 
struct  mfma_type< MfmaInstr::mfma_i32_32x32x16i8 >
 
struct  mfma_type< MfmaInstr::mfma_i32_16x16x32i8 >
 
struct  mfma_type< MfmaInstr::mfma_i32_32x32x32i8 >
 
struct  mfma_type< MfmaInstr::mfma_i32_16x16x64i8 >
 
struct  mfma_type< MfmaInstr::mfma_f64_16x16x4f64 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16f8f8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32f8f8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16bf8bf8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32bf8bf8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16f8bf8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32f8bf8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16bf8f8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32bf8f8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x64f8f6f4 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x128f8f6f4 >
 
struct  mfma_type< MfmaInstr::mfma_scale_f32_32x32x64f8f6f4 >
 
struct  mfma_type< MfmaInstr::mfma_scale_f32_16x16x128f8f6f4 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x4xf32 >
 
struct  mfma_type_gfx11_base
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_f16 >
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_bf16 >
 
struct  mfma_type< MfmaInstr::wmma_i32_16x16x16_iu8 >
 
struct  mfma_type< MfmaInstr::wmma_unsupport_16x16_gfx11 >
 
struct  mfma_type_gfx12_base
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_f16_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_bf16_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_i32_16x16x16_iu8_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_f8f8_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_f8bf8_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_bf8f8_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_bf8bf8_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_unsupport_16x16_gfx12 >
 
class  MfmaSelector
 Selects the appropriate MFMA instruction type and configuration for given data types and tile sizes on AMD GPUs.
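The selection follows the usual compile-time pattern of specializing on data type and tile shape. A toy illustration of that pattern (the name toy_mfma_selector and its parameters are hypothetical, not the actual MfmaSelector interface; the MfmaInstr values come from the enumeration listed below):

    // Specializations map (element type, MPerXdlops, NPerXdlops) to an
    // instruction tag; an unsupported combination fails to compile.
    template <typename T, int M, int N>
    struct toy_mfma_selector; // primary template intentionally undefined

    template <>
    struct toy_mfma_selector<float, 32, 32>
    {
        static constexpr auto value = MfmaInstr::mfma_f32_32x32x2f32;
    };

    template <>
    struct toy_mfma_selector<float, 16, 16>
    {
        static constexpr auto value = MfmaInstr::mfma_f32_16x16x4f32;
    };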
 
struct  XdlopsGemm
 
union  BufferResource
 
struct  f8_fnuz_t
 
struct  bf8_fnuz_t
 
struct  f8_ocp_t
 
struct  bf8_ocp_t
 
struct  intrin_smfmac_f32_16x16x32f16
 
struct  intrin_smfmac_f32_16x16x32f16< 16, 16 >
 
struct  intrin_smfmac_f32_16x16x32bf16
 
struct  intrin_smfmac_f32_16x16x32bf16< 16, 16 >
 
struct  intrin_smfmac_f32_32x32x16f16
 
struct  intrin_smfmac_f32_32x32x16f16< 32, 32 >
 
struct  intrin_smfmac_f32_32x32x16bf16
 
struct  intrin_smfmac_f32_32x32x16bf16< 32, 32 >
 
struct  intrin_wmma_f32_16x16x16_f16_w32
 
struct  intrin_wmma_f32_16x16x16_f16_w32< 16, 16 >
 
struct  intrin_wmma_f32_16x16x16_bf16_w32
 
struct  intrin_wmma_f32_16x16x16_bf16_w32< 16, 16 >
 
struct  intrin_wmma_f16_16x16x16_f16_w32
 
struct  intrin_wmma_f16_16x16x16_f16_w32< 16, 16, Opsel >
 
struct  intrin_wmma_bf16_16x16x16_bf16_w32
 
struct  intrin_wmma_bf16_16x16x16_bf16_w32< 16, 16, Opsel >
 
struct  intrin_wmma_i32_16x16x16_iu8_w32
 
struct  intrin_wmma_i32_16x16x16_iu8_w32< 16, 16, neg_a, neg_b, clamp >
 
struct  intrin_wmma_f32_16x16x16_f16_w64
 
struct  intrin_wmma_f32_16x16x16_f16_w64< 16, 16 >
 
struct  intrin_wmma_f32_16x16x16_bf16_w64
 
struct  intrin_wmma_f32_16x16x16_bf16_w64< 16, 16 >
 
struct  intrin_wmma_f16_16x16x16_f16_w64
 
struct  intrin_wmma_f16_16x16x16_f16_w64< 16, 16, Opsel >
 
struct  intrin_wmma_bf16_16x16x16_bf16_w64
 
struct  intrin_wmma_bf16_16x16x16_bf16_w64< 16, 16, Opsel >
 
struct  intrin_wmma_i32_16x16x16_iu8_w64
 
struct  intrin_wmma_i32_16x16x16_iu8_w64< 16, 16, neg_a, neg_b, clamp >
 
struct  intrin_wmma_f32_16x16x16_f16_w32_gfx12
 
struct  intrin_wmma_f32_16x16x16_f16_w32_gfx12< 16, 16 >
 
struct  intrin_wmma_f32_16x16x16_bf16_w32_gfx12
 
struct  intrin_wmma_f32_16x16x16_bf16_w32_gfx12< 16, 16 >
 
struct  intrin_wmma_i32_16x16x16_iu8_w32_gfx12
 
struct  intrin_wmma_i32_16x16x16_iu8_w32_gfx12< 16, 16, neg_a, neg_b, clamp >
 
struct  intrin_wmma_f32_16x16x16_f8f8_w32_gfx12
 
struct  intrin_wmma_f32_16x16x16_f8f8_w32_gfx12< 16, 16 >
 
struct  intrin_wmma_f32_16x16x16_f8bf8_w32_gfx12
 
struct  intrin_wmma_f32_16x16x16_f8bf8_w32_gfx12< 16, 16 >
 
struct  intrin_wmma_f32_16x16x16_bf8f8_w32_gfx12
 
struct  intrin_wmma_f32_16x16x16_bf8f8_w32_gfx12< 16, 16 >
 
struct  intrin_wmma_f32_16x16x16_bf8bf8_w32_gfx12
 
struct  intrin_wmma_f32_16x16x16_bf8bf8_w32_gfx12< 16, 16 >
 
struct  intrin_mfma_f32_32x32x1f32
 
struct  intrin_mfma_f32_32x32x1f32< 64, 64 >
 
struct  intrin_mfma_f32_32x32x1f32< 32, 64 >
 
struct  intrin_mfma_f32_32x32x2f32
 
struct  intrin_mfma_f32_32x32x2f32< 32, 32 >
 
struct  intrin_mfma_f32_16x16x4f32
 
struct  intrin_mfma_f32_16x16x4f32< 16, 16 >
 
struct  intrin_mfma_f32_16x16x1f32
 
struct  intrin_mfma_f32_16x16x1f32< 16, 64 >
 
struct  intrin_mfma_f32_4x4x1f32
 
struct  intrin_mfma_f32_4x4x1f32< 4, 64 >
 
struct  intrin_mfma_f32_4x4x1f32< 8, 64 >
 
struct  intrin_mfma_f32_32x32x4f16
 
struct  intrin_mfma_f32_32x32x4f16< 64, 64 >
 
struct  intrin_mfma_f32_32x32x4f16< 32, 64 >
 
struct  intrin_mfma_f32_32x32x16f16
 
struct  intrin_mfma_f32_32x32x16f16< 32, 32 >
 
struct  intrin_mfma_f32_16x16x32f16
 
struct  intrin_mfma_f32_16x16x32f16< 16, 16 >
 
struct  intrin_mfma_f32_32x32x8f16
 
struct  intrin_mfma_f32_32x32x8f16< 32, 32 >
 
struct  intrin_mfma_f32_16x16x16f16
 
struct  intrin_mfma_f32_16x16x16f16< 16, 16 >
 
struct  intrin_mfma_f32_16x16x4f16
 
struct  intrin_mfma_f32_16x16x4f16< 16, 64 >
 
struct  intrin_mfma_f32_4x4x4f16
 
struct  intrin_mfma_f32_4x4x4f16< 4, 64 >
 
struct  intrin_mfma_f32_4x4x4f16< 8, 64 >
 
struct  intrin_mfma_f32_32x32x16bf16
 
struct  intrin_mfma_f32_32x32x16bf16< 32, 32 >
 
struct  intrin_mfma_f32_16x16x32bf16
 
struct  intrin_mfma_f32_16x16x32bf16< 16, 16 >
 
struct  intrin_mfma_f32_32x32x8bf16_1k
 
struct  intrin_mfma_f32_32x32x8bf16_1k< 32, 32 >
 
struct  intrin_mfma_f32_16x16x16bf16_1k
 
struct  intrin_mfma_f32_16x16x16bf16_1k< 16, 16 >
 
struct  intrin_mfma_f32_32x32x4bf16
 
struct  intrin_mfma_f32_32x32x4bf16< 32, 32 >
 
struct  intrin_mfma_f32_16x16x8bf16
 
struct  intrin_mfma_f32_16x16x8bf16< 16, 16 >
 
struct  intrin_mfma_i32_32x32x8i8
 
struct  intrin_mfma_i32_32x32x8i8< 32, 32 >
 
struct  intrin_mfma_i32_16x16x16i8
 
struct  intrin_mfma_i32_16x16x16i8< 16, 16 >
 
struct  intrin_mfma_i32_32x32x32i8
 
struct  intrin_mfma_i32_32x32x32i8< 32, 32 >
 
struct  intrin_mfma_i32_16x16x64i8
 
struct  intrin_mfma_i32_16x16x64i8< 16, 16 >
 
struct  intrin_mfma_i32_32x32x16i8
 
struct  intrin_mfma_i32_32x32x16i8< 32, 32 >
 
struct  intrin_mfma_i32_16x16x32i8
 
struct  intrin_mfma_i32_16x16x32i8< 16, 16 >
 
struct  intrin_mfma_f64_16x16x4f64
 
struct  intrin_mfma_f64_16x16x4f64< 16, 16 >
 
struct  intrin_mfma_f32_32x32x64f8f6f4
 
struct  intrin_mfma_f32_32x32x64f8f6f4< 32, 32 >
 Performs a matrix fused multiply-accumulate operation on 32x32x64 submatrices for f8, f6, and f4 data types.
 
struct  intrin_mfma_scale_f32_32x32x64f8f6f4
 
struct  intrin_mfma_scale_f32_32x32x64f8f6f4< 32, 32, OpselA, OpselB >
 
struct  intrin_mfma_scale_f32_16x16x128f8f6f4
 
struct  intrin_mfma_scale_f32_16x16x128f8f6f4< 16, 16, OpselA, OpselB >
 
struct  intrin_mfma_f32_16x16x128f8f6f4
 
struct  intrin_mfma_f32_16x16x128f8f6f4< 16, 16 >
 Performs a matrix fused multiply-accumulate operation on 16x16x128 submatrices for f8f6f4 data types.
 
struct  intrin_mfma_f32_32x32x16f8f8
 
struct  intrin_mfma_f32_32x32x16f8f8< 32, 32 >
 
struct  intrin_mfma_f32_16x16x32f8f8
 
struct  intrin_mfma_f32_16x16x32f8f8< 16, 16 >
 
struct  intrin_mfma_f32_32x32x16bf8bf8
 
struct  intrin_mfma_f32_32x32x16bf8bf8< 32, 32 >
 
struct  intrin_mfma_f32_16x16x32bf8bf8
 
struct  intrin_mfma_f32_16x16x32bf8bf8< 16, 16 >
 
struct  intrin_mfma_f32_32x32x16f8bf8
 
struct  intrin_mfma_f32_32x32x16f8bf8< 32, 32 >
 
struct  intrin_mfma_f32_16x16x32f8bf8
 
struct  intrin_mfma_f32_16x16x32f8bf8< 16, 16 >
 
struct  intrin_mfma_f32_32x32x16bf8f8
 
struct  intrin_mfma_f32_32x32x16bf8f8< 32, 32 >
 
struct  intrin_mfma_f32_16x16x32bf8f8
 
struct  intrin_mfma_f32_16x16x32bf8f8< 16, 16 >
 
struct  intrin_mfma_f32_16x16x8xf32
 
struct  intrin_mfma_f32_16x16x8xf32< 16, 16 >
 
struct  intrin_mfma_f32_32x32x4xf32
 
struct  intrin_mfma_f32_32x32x4xf32< 32, 32 >
 
struct  Array
 
struct  Array< TData, 0 >
 
struct  ContainerElementPicker
 
struct  ConstantContainerElementPicker
 
struct  scalar_type
 
struct  f4x2_pk_t
 
struct  f6_pk_t
 
struct  pk_i4_t
 
struct  is_scalar_type
 
struct  scalar_type< T >
 
struct  scalar_type< double >
 
struct  scalar_type< float >
 
struct  scalar_type< half_t >
 
struct  scalar_type< bhalf_t >
 
struct  scalar_type< int32_t >
 
struct  scalar_type< int8_t >
 
struct  scalar_type< uint8_t >
 
struct  scalar_type< pk_i4_t >
 
struct  scalar_type< f8_fnuz_t >
 
struct  scalar_type< bf8_fnuz_t >
 
struct  scalar_type< f8_ocp_t >
 
struct  scalar_type< bf8_ocp_t >
 
struct  scalar_type< e8m0_bexp_t >
 
struct  scalar_type< f4x2_pk_t >
 
struct  scalar_type< f6x32_pk_t >
 
struct  scalar_type< bf6x32_pk_t >
 
struct  scalar_type< f6x16_pk_t >
 
struct  scalar_type< bf6x16_pk_t >
 
struct  scalar_type< bool >
 
struct  packed_type_info
 
struct  packed_type_maker
 
struct  vector_type
 
struct  vector_type_maker
 
struct  scalar_type< vector_type< T, N > >
 
struct  vector_type_maker< T, N0 >
 
struct  vector_type_maker< vector_type< T, N1 >, N0 >
 
struct  non_native_vector_base
 
struct  nnvb_data_t_selector
 
struct  nnvb_data_t_selector< f8_ocp_t >
 
struct  nnvb_data_t_selector< bf8_ocp_t >
 
struct  nnvb_data_t_selector< f8_fnuz_t >
 
struct  nnvb_data_t_selector< bf8_fnuz_t >
 
struct  nnvb_data_t_selector< e8m0_bexp_t >
 
struct  nnvb_data_t_selector< f6x16_pk_t >
 
struct  nnvb_data_t_selector< f6x32_pk_t >
 
struct  nnvb_data_t_selector< bf6x16_pk_t >
 
struct  nnvb_data_t_selector< bf6x32_pk_t >
 
struct  nnvb_data_t_selector< pk_i4_t >
 
struct  nnvb_data_t_selector< f4x2_pk_t >
 
struct  non_native_vector_base< T, N, ck::enable_if_t< sizeof(T)==1||sizeof(T)==2||sizeof(T)==4||sizeof(T)==8 > >
 
struct  non_native_vector_base< T, N, ck::enable_if_t< sizeof(T)==12||sizeof(T)==16||sizeof(T)==24||sizeof(T)==32 > >
 
struct  scalar_type< non_native_vector_base< T, N, ck::enable_if_t< sizeof(T)==1||sizeof(T)==2||sizeof(T)==4||sizeof(T)==8 > > >
 
struct  scalar_type< non_native_vector_base< T, N, ck::enable_if_t< sizeof(T)==12||sizeof(T)==16||sizeof(T)==24||sizeof(T)==32 > > >
 
struct  DynamicBuffer
 
struct  e8m0_bexp_t
 Unsigned representation of a conventional biased Float32 exponent.
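Under the OCP MX convention such a value encodes a power-of-two scale, scale = 2^(e - 127), with the same bias of 127 as Float32. A decoding sketch (an assumption about the encoding, not a member of the struct; the special NaN bit pattern is ignored here):

    #include <cmath>
    #include <cstdint>

    // Decode an E8M0 biased exponent into the scale it represents.
    float decode_e8m0(std::uint8_t e)
    {
        return std::ldexp(1.0f, static_cast<int>(e) - 127);
    }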
 
struct  forwarder
 
struct  swallow
 
struct  logical_and
 
struct  logical_or
 
struct  logical_not
 
struct  static_if
 
struct  static_if< true >
 
struct  static_if< false >
 
struct  conditional
 
struct  conditional< true, X, Y >
 
struct  conditional< false, X, Y >
 
struct  static_for
 
struct  static_for< 0, N, 1 >
 
struct  static_for_range
 
struct  static_for_product
 
struct  static_for_product< Tuple< Is... > >
 
struct  static_for_product< Tuple< Is... >, Rest... >
 
struct  identity
 
struct  static_ford
 
struct  ford
 
struct  constant
 
struct  integral_constant
 
struct  nonesuch
 
struct  is_known_at_compile_time
 
struct  is_known_at_compile_time< index_t >
 
struct  is_known_at_compile_time< unsigned int >
 
struct  is_known_at_compile_time< long_index_t >
 
struct  is_known_at_compile_time< integral_constant< T, X > >
 
struct  is_known_at_compile_time< Sequence< Is... > >
 
struct  is_known_at_compile_time< Tuple< Ts... > >
 
struct  MagicDivision
 
struct  MDiv
 
struct  MDiv2
 
struct  NumericLimits
 
struct  NumericLimits< half_t >
 
struct  NumericLimits< f8_fnuz_t >
 
struct  NumericLimits< bf8_fnuz_t >
 
struct  NumericLimits< f8_ocp_t >
 
struct  NumericLimits< bf8_ocp_t >
 
struct  NumericLimits< f4_t >
 
struct  NumericLimits< f6_t >
 
struct  NumericLimits< bf6_t >
 
struct  NumericLimits< e8m0_bexp_t >
 
struct  NumericUtils
 
struct  NumericUtils< e8m0_bexp_t >
 
struct  NumericUtils< float >
 
struct  NumericUtils< ck::tf32_t >
 
struct  NumericUtils< half_t >
 
struct  NumericUtils< bhalf_t >
 
struct  NumericUtils< f8_fnuz_t >
 
struct  NumericUtils< bf8_fnuz_t >
 
struct  NumericUtils< f8_ocp_t >
 
struct  NumericUtils< bf8_ocp_t >
 
struct  NumericUtils< f4_t >
 
struct  NumericUtils< f6_t >
 
struct  NumericUtils< bf6_t >
 
struct  float_equal_one
 
struct  float_equal_zero
 
struct  Sequence
 
struct  sequence_split
 
struct  sequence_reverse
 
struct  sequence_map_inverse
 
struct  is_valid_sequence_map
 
struct  sequence_merge
 
struct  sequence_merge< Sequence< Xs... >, Sequence< Ys... > >
 
struct  sequence_merge< Seq >
 
struct  sequence_gen
 
struct  arithmetic_sequence_gen
 
struct  arithmetic_sequence_gen< 0, IEnd, 1 >
 
struct  uniform_sequence_gen
 
struct  sequence_reverse_inclusive_scan
 
struct  sequence_reverse_inclusive_scan< Sequence< I, Is... >, Reduce, Init >
 
struct  sequence_reverse_inclusive_scan< Sequence< I >, Reduce, Init >
 
struct  sequence_reverse_inclusive_scan< Sequence<>, Reduce, Init >
 
struct  sequence_reverse< Sequence< I > >
 
struct  sequence_reverse< Sequence< I0, I1 > >
 
struct  sequence_reduce
 
struct  sequence_reduce< Reduce, Sequence< Xs... >, Sequence< Ys... > >
 
struct  sequence_reduce< Reduce, Seq >
 
struct  sequence_sort_impl
 
struct  sequence_sort_impl< Sequence< ValueX, ValueY >, Sequence< IdX, IdY >, Compare >
 
struct  sequence_sort_impl< Sequence< Value >, Sequence< Id >, Compare >
 
struct  sequence_sort_impl< Sequence<>, Sequence<>, Compare >
 
struct  sequence_sort
 
struct  sequence_unique_sort
 
class  span
 
struct  StaticBuffer
 
struct  StaticBufferTupleOfVector
 
struct  StaticallyIndexedArray_v2
 
struct  ThisThreadBlock
 
struct  transpose_vectors
 
struct  transpose_vectors< half_t, NX, NY >
 
struct  transpose_vectors< int8_t, NX, NY >
 
struct  transpose_vectors< f8_t, NX, NY >
 
struct  Tuple
 
struct  Tuple<>
 
struct  tuple_element
 
struct  is_same
 
struct  is_same< X, X >
 
struct  is_floating_point
 
struct  is_floating_point< float >
 
struct  is_floating_point< double >
 
struct  is_floating_point< long double >
 
struct  is_integral
 
struct  is_integral< int >
 
struct  is_integral< unsigned int >
 
struct  is_integral< long >
 
struct  is_integral< unsigned long >
 
struct  is_integral< short >
 
struct  is_integral< unsigned short >
 
struct  is_integral< long long >
 
struct  is_integral< unsigned long long >
 
struct  is_integral< char >
 
struct  is_integral< signed char >
 
struct  is_integral< unsigned char >
 
struct  is_integral< wchar_t >
 
struct  is_integral< char16_t >
 
struct  is_integral< char32_t >
 
struct  is_integral< bool >
 
struct  workgroup_barrier
 

Typedefs

using index_t = int32_t
 
using long_index_t = int64_t
 
template<typename T >
using iter_value_t = typename std::iterator_traits< remove_cvref_t< T > >::value_type
 
template<typename T >
using iter_reference_t = decltype(*std::declval< T & >())
 
template<typename T >
using iter_difference_t = typename std::iterator_traits< remove_cvref_t< T > >::difference_type
 
template<typename TensorDesc >
using TensorCoordinate_t = decltype(make_tensor_coordinate(TensorDesc{}, MultiIndex< remove_cvref_t< TensorDesc >::GetNumOfDimension()>{}))
 
template<typename TensorDesc >
using TensorCoordinateStep_t = decltype(make_tensor_coordinate_step(TensorDesc{}, MultiIndex< remove_cvref_t< TensorDesc >::GetNumOfDimension()>{}))
 
typedef unsigned char fp8_storage_t
 
using f8_t = f8_fnuz_t
 
using bf8_t = bf8_fnuz_t
 
template<index_t N>
using MultiIndex = Array< index_t, N >
 
using tf32_t = _BitInt(19)
 
using bhalf_t = ushort
 
using half_t = _Float16
 
using int4_t = _BitInt(4)
 
using f4_t = unsigned _BitInt(4)
 
using f6_t = _BitInt(6)
 
using bf6_t = unsigned _BitInt(6)
 
using f6x16_pk_t = f6_pk_t< f6_t, 16 >
 
using f6x32_pk_t = f6_pk_t< f6_t, 32 >
 
using bf6x16_pk_t = f6_pk_t< bf6_t, 16 >
 
using bf6x32_pk_t = f6_pk_t< bf6_t, 32 >
 
template<typename X , typename Y >
using has_same_scalar_type = is_same< typename scalar_type< remove_cvref_t< X > >::type, typename scalar_type< remove_cvref_t< Y > >::type >
 
template<typename T >
using element_type_t = typename packed_type_info< T >::element_type
 
template<typename T , index_t N = 0>
using packed_type_t = typename packed_type_maker< T, N >::packed_type
 
using int64_t = long
 
using double2_t = typename vector_type< double, 2 >::type
 
using double4_t = typename vector_type< double, 4 >::type
 
template<typename T , index_t N>
using vector_type_maker_t = typename vector_type_maker< T, N >::type
 
using float2_t = typename vector_type< float, 2 >::type
 
using float4_t = typename vector_type< float, 4 >::type
 
using float8_t = typename vector_type< float, 8 >::type
 
using float16_t = typename vector_type< float, 16 >::type
 
using float32_t = typename vector_type< float, 32 >::type
 
using float64_t = typename vector_type< float, 64 >::type
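These aliases name vector_type<T, N>::type, which for native element types is typically a Clang extended vector, so element access and element-wise arithmetic work directly. A usage sketch (assuming the ext-vector representation):

    // float4_t behaves like a native 4-wide vector register type.
    float reduce4(float4_t v)
    {
        float4_t w = v + v;               // element-wise arithmetic
        return w[0] + w[1] + w[2] + w[3]; // per-lane access via operator[]
    }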
 
using half2_t = typename vector_type< half_t, 2 >::type
 
using half4_t = typename vector_type< half_t, 4 >::type
 
using half8_t = typename vector_type< half_t, 8 >::type
 
using half16_t = typename vector_type< half_t, 16 >::type
 
using half32_t = typename vector_type< half_t, 32 >::type
 
using bhalf2_t = typename vector_type< bhalf_t, 2 >::type
 
using bhalf4_t = typename vector_type< bhalf_t, 4 >::type
 
using bhalf8_t = typename vector_type< bhalf_t, 8 >::type
 
using bhalf16_t = typename vector_type< bhalf_t, 16 >::type
 
using bhalf32_t = typename vector_type< bhalf_t, 32 >::type
 
using int32x2_t = typename vector_type< int32_t, 2 >::type
 
using int32x4_t = typename vector_type< int32_t, 4 >::type
 
using int32x6_t = typename vector_type< int32_t, 6 >::type
 
using int32x8_t = typename vector_type< int32_t, 8 >::type
 
using int32x16_t = typename vector_type< int32_t, 16 >::type
 
using int32x32_t = typename vector_type< int32_t, 32 >::type
 
using int32x64_t = typename vector_type< int32_t, 64 >::type
 
using int8x2_t = typename vector_type< int8_t, 2 >::type
 
using int8x4_t = typename vector_type< int8_t, 4 >::type
 
using int8x8_t = typename vector_type< int8_t, 8 >::type
 
using int8x16_t = typename vector_type< int8_t, 16 >::type
 
using int8x32_t = typename vector_type< int8_t, 32 >::type
 
using int8x64_t = typename vector_type< int8_t, 64 >::type
 
using f8x2_fnuz_t = typename vector_type< f8_fnuz_t, 2 >::type
 
using f8x4_fnuz_t = typename vector_type< f8_fnuz_t, 4 >::type
 
using f8x8_fnuz_t = typename vector_type< f8_fnuz_t, 8 >::type
 
using f8x16_fnuz_t = typename vector_type< f8_fnuz_t, 16 >::type
 
using f8x32_fnuz_t = typename vector_type< f8_fnuz_t, 32 >::type
 
using f8x64_fnuz_t = typename vector_type< f8_fnuz_t, 64 >::type
 
using bf8x2_fnuz_t = typename vector_type< bf8_fnuz_t, 2 >::type
 
using bf8x4_fnuz_t = typename vector_type< bf8_fnuz_t, 4 >::type
 
using bf8x8_fnuz_t = typename vector_type< bf8_fnuz_t, 8 >::type
 
using bf8x16_fnuz_t = typename vector_type< bf8_fnuz_t, 16 >::type
 
using bf8x32_fnuz_t = typename vector_type< bf8_fnuz_t, 32 >::type
 
using bf8x64_fnuz_t = typename vector_type< bf8_fnuz_t, 64 >::type
 
using f8x2_ocp_t = typename vector_type< f8_ocp_t, 2 >::type
 
using f8x4_ocp_t = typename vector_type< f8_ocp_t, 4 >::type
 
using f8x8_ocp_t = typename vector_type< f8_ocp_t, 8 >::type
 
using f8x16_ocp_t = typename vector_type< f8_ocp_t, 16 >::type
 
using f8x32_ocp_t = typename vector_type< f8_ocp_t, 32 >::type
 
using f8x64_ocp_t = typename vector_type< f8_ocp_t, 64 >::type
 
using bf8x2_ocp_t = typename vector_type< bf8_ocp_t, 2 >::type
 
using bf8x4_ocp_t = typename vector_type< bf8_ocp_t, 4 >::type
 
using bf8x8_ocp_t = typename vector_type< bf8_ocp_t, 8 >::type
 
using bf8x16_ocp_t = typename vector_type< bf8_ocp_t, 16 >::type
 
using bf8x32_ocp_t = typename vector_type< bf8_ocp_t, 32 >::type
 
using bf8x64_ocp_t = typename vector_type< bf8_ocp_t, 64 >::type
 
using uint8x2_t = typename vector_type< uint8_t, 2 >::type
 
using uint8x4_t = typename vector_type< uint8_t, 4 >::type
 
using uint8x8_t = typename vector_type< uint8_t, 8 >::type
 
using uint8x16_t = typename vector_type< uint8_t, 16 >::type
 
using uint8x32_t = typename vector_type< uint8_t, 32 >::type
 
using uint8x64_t = typename vector_type< uint8_t, 64 >::type
 
using f4x2_t = typename vector_type< f4x2_pk_t, 1 >::type
 
using f4x4_t = typename vector_type< f4x2_pk_t, 2 >::type
 
using f4x8_t = typename vector_type< f4x2_pk_t, 4 >::type
 
using f4x16_t = typename vector_type< f4x2_pk_t, 8 >::type
 
using f4x32_t = typename vector_type< f4x2_pk_t, 16 >::type
 
using f4x64_t = typename vector_type< f4x2_pk_t, 32 >::type
 
using f6x16_t = typename vector_type< f6x16_pk_t, 1 >::type
 
using f6x16x2_t = typename vector_type< f6x16_pk_t, 2 >::type
 
using f6x32_t = typename vector_type< f6x32_pk_t, 1 >::type
 
using bf6x16_t = typename vector_type< bf6x16_pk_t, 1 >::type
 
using bf6x16x2_t = typename vector_type< bf6x16_pk_t, 2 >::type
 
using bf6x32_t = typename vector_type< bf6x32_pk_t, 1 >::type
 
using e8m0x4_bexp_t = typename vector_type< e8m0_bexp_t, 4 >::type
 
using pk_i4x2_t = typename vector_type< pk_i4_t, 2 >::type
 
using pk_i4x4_t = typename vector_type< pk_i4_t, 4 >::type
 
using pk_i4x8_t = typename vector_type< pk_i4_t, 8 >::type
 
template<bool B, typename T = void>
using enable_if = std::enable_if< B, T >
 
template<bool B, typename T = void>
using enable_if_t = typename std::enable_if< B, T >::type
 
template<bool predicate, class X , class Y >
using conditional_t = typename conditional< predicate, X, Y >::type
 
template<bool B>
using bool_constant = integral_constant< bool, B >
 
using true_type = bool_constant< true >
 
using false_type = bool_constant< false >
 
template<template< class... > class Op, class... Args>
using is_detected = typename detail::detector< nonesuch, void, Op, Args... >::value_t
 
template<typename T >
using is_pack2_invocable_t = decltype(ck::declval< T & >().is_pack2_invocable)
 
template<typename T >
using is_pack4_invocable_t = decltype(ck::declval< T & >().is_pack4_invocable)
 
template<typename T >
using is_pack8_invocable_t = decltype(ck::declval< T & >().is_pack8_invocable)
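These aliases are detection-idiom probes: paired with is_detected (above), they test at compile time whether an element-wise operator type declares a packed-invocation flag. A sketch of the pattern (MyOp is a hypothetical operator type):

    struct MyOp
    {
        static constexpr bool is_pack2_invocable = true;
        // ... packed and scalar operator() overloads ...
    };

    // Detection checks only that the member exists; its value can then
    // be inspected separately once existence is established.
    static_assert(is_detected<is_pack2_invocable_t, MyOp>::value,
                  "MyOp declares is_pack2_invocable");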
 
template<index_t N>
using Number = integral_constant< index_t, N >
 
template<index_t N>
using LongNumber = integral_constant< long_index_t, N >
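Number<N> is the integral_constant used for compile-time indexing throughout CK, typically consumed by static_for (listed above) so the loop index stays a constant expression. A sketch of the common idiom:

    // Unrolled compile-time loop over i = 0..3; the callable receives a
    // Number<I>, so i.value is usable where a constant expression is needed.
    int sum_of_squares()
    {
        int acc = 0;
        static_for<0, 4, 1>{}([&](auto i) {
            constexpr index_t v = i.value;
            acc += v * v;
        });
        return acc; // 0 + 1 + 4 + 9 = 14
    }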
 
template<index_t N>
using make_index_sequence = typename __make_integer_seq< impl::__integer_sequence, index_t, N >::seq_type
 
template<typename Sx , typename Sy >
using sequence_merge_t = typename sequence_merge< Sx, Sy >::type
 
template<index_t NSize, index_t I>
using uniform_sequence_gen_t = typename uniform_sequence_gen< NSize, I >::type
 
template<typename T , index_t N>
using StaticallyIndexedArray = typename detail::StaticallyIndexedArrayImpl< T, N >::type
 
template<index_t I, typename TTuple >
using tuple_element_t = typename tuple_element< I, TTuple >::type
 
template<typename T >
using is_tuple = decltype(ck::declval< T & >().IsTuple())
 
template<typename T >
using remove_reference_t = typename remove_reference< T >::type
 
template<typename T >
using remove_cv_t = typename remove_cv< T >::type
 
template<typename T >
using remove_cvref_t = remove_cv_t< remove_reference_t< T > >
 
template<typename T >
using remove_pointer_t = typename remove_pointer< T >::type
 

Enumerations

enum class  InMemoryDataOperationEnum {
  Set ,
  AtomicAdd ,
  AtomicMax ,
  Add
}
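The enum selects how a kernel epilogue commits results: Set overwrites the destination, while AtomicAdd accumulates (used, for example, to combine SplitK or stream-K partial results). An illustrative compile-time dispatch (a sketch, not the CK buffer API; the dst += v lines stand in for real stores and atomics):

    template <InMemoryDataOperationEnum Op>
    void commit(float& dst, float v)
    {
        if constexpr(Op == InMemoryDataOperationEnum::Set)
            dst = v;   // plain overwrite
        else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd)
            dst += v;  // placeholder for an atomic add to global memory
        else if constexpr(Op == InMemoryDataOperationEnum::Add)
            dst += v;  // read-modify-write accumulate
    }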
 
enum  StreamKReductionStrategy {
  Atomic = 0 ,
  Reduction
}
 
enum class  PipelineVersion {
  v1 ,
  v2 ,
  v4 ,
  weight_only
}
 
enum  Activation {
  gelu_and_mul = 0 ,
  silu_and_mul = 1
}
 
enum class  DppInstr {
  dpp8_f16_1x32x2 = 0 ,
  dpp8_f16_2x16x2 ,
  dpp8_f16_2x32x2 ,
  dpp8_f16_4x16x2 ,
  dpp8_f16_4x32x2 ,
  dpp8_f16_8x16x2 ,
  dpp8_f16_8x32x2 ,
  dpp8_f16_16x16x2 ,
  dpp8_f16_32x8x2
}
 
enum class  SmfmacInstr {
  smfmac_f32_16x16x32f16 = 0 ,
  smfmac_f32_32x32x16f16 ,
  smfmac_f32_16x16x32bf16 ,
  smfmac_f32_32x32x16bf16
}
 
enum class  WmmaInstr {
  wmma_f32_16x16x16_f16 = 0 ,
  wmma_f32_16x16x16_bf16 ,
  wmma_f16_16x16x16_f16 ,
  wmma_bf16_16x16x16_bf16 ,
  wmma_i32_16x16x16_iu8 ,
  wmma_i32_16x16x16_iu4 ,
  wmma_f32_16x16x16_f16_gfx12 ,
  wmma_f32_16x16x16_bf16_gfx12 ,
  wmma_i32_16x16x16_iu8_gfx12 ,
  wmma_f32_16x16x16_f8f8_gfx12 ,
  wmma_f32_16x16x16_f8bf8_gfx12 ,
  wmma_f32_16x16x16_bf8f8_gfx12 ,
  wmma_f32_16x16x16_bf8bf8_gfx12
}
 
enum class  MfmaInstr {
  mfma_f32_32x32x1f32 = 0 ,
  mfma_f32_16x16x1f32 ,
  mfma_f32_4x4x1f32 ,
  mfma_f32_32x32x2f32 ,
  mfma_f32_16x16x4f32 ,
  mfma_f32_32x32x4f16 ,
  mfma_f32_16x16x4f16 ,
  mfma_f32_4x4x4f16 ,
  mfma_f32_32x32x8f16 ,
  mfma_f32_16x16x16f16 ,
  mfma_f32_32x32x8bf16_1k ,
  mfma_f32_16x16x16bf16_1k ,
  mfma_f32_32x32x4bf16 ,
  mfma_f32_16x16x8bf16 ,
  mfma_i32_32x32x8i8 ,
  mfma_i32_16x16x16i8 ,
  mfma_i32_32x32x16i8 ,
  mfma_i32_16x16x32i8 ,
  mfma_f64_16x16x4f64 ,
  mfma_f32_32x32x16f8f8 ,
  mfma_f32_16x16x32f8f8 ,
  mfma_f32_32x32x16bf8bf8 ,
  mfma_f32_16x16x32bf8bf8 ,
  mfma_f32_32x32x16f8bf8 ,
  mfma_f32_16x16x32f8bf8 ,
  mfma_f32_32x32x16bf8f8 ,
  mfma_f32_16x16x32bf8f8 ,
  mfma_f32_32x32x16f16 ,
  mfma_f32_16x16x32f16 ,
  mfma_f32_32x32x16bf16 ,
  mfma_f32_16x16x32bf16 ,
  mfma_i32_32x32x32i8 ,
  mfma_i32_16x16x64i8 ,
  mfma_f32_32x32x64f8f6f4 ,
  mfma_f32_16x16x128f8f6f4 ,
  mfma_scale_f32_32x32x64f8f6f4 ,
  mfma_scale_f32_16x16x128f8f6f4 ,
  mfma_f32_16x16x8xf32 ,
  mfma_f32_32x32x4xf32 ,
  wmma_f32_16x16x16_f16 ,
  wmma_f32_16x16x16_bf16 ,
  wmma_i32_16x16x16_iu8 ,
  wmma_unsupport_16x16_gfx11 ,
  wmma_f32_16x16x16_f16_gfx12 ,
  wmma_f32_16x16x16_bf16_gfx12 ,
  wmma_i32_16x16x16_iu8_gfx12 ,
  wmma_f32_16x16x16_f8f8_gfx12 ,
  wmma_f32_16x16x16_f8bf8_gfx12 ,
  wmma_f32_16x16x16_bf8f8_gfx12 ,
  wmma_f32_16x16x16_bf8bf8_gfx12 ,
  wmma_unsupport_16x16_gfx12
}
 
enum class  AddressSpaceEnum {
  Generic ,
  Global ,
  Lds ,
  Sgpr ,
  Vgpr
}
 
enum class  AmdBufferCoherenceEnum {
  DefaultCoherence = 0 ,
  GLC = 1 ,
  SLC = 2 ,
  GLC_SLC = 3 ,
  WAVE_NT0 = 0 ,
  WAVE_NT1 = 2 ,
  GROUP_NT0 = 1 ,
  GROUP_NT1 = 3 ,
  DEVICE_NT0 = 8 ,
  DEVICE_NT1 = 10 ,
  SYSTEM_NT0 = 9 ,
  SYSTEM_NT1 = 11
}
 
enum class  ck_fp8_interpretation_t {
  CK_E4M3_OCP = 0 ,
  CK_E5M2_OCP = 1 ,
  CK_E4M3_FNUZ = 2 ,
  CK_E5M2_FNUZ = 3
}
 Describes FP8 interpretation. More...
 
enum class  ck_saturation_t {
  CK_NOSAT = 0 ,
  CK_SATFINITE = 1
}
 Describes saturation behavior. More...
 
enum class  BlockGemmPipelineVersion {
  v1 ,
  v2 ,
  v3 ,
  v4 ,
  v5
}
 
enum class  BlockGemmPipelineScheduler {
  Intrawave ,
  Interwave
}
 
enum class  TailNumber {
  Odd ,
  Even ,
  One ,
  Two ,
  Three ,
  Four ,
  Five ,
  Six ,
  Seven ,
  Empty ,
  Full
}
 
enum  SchedulerGroup : uint32_t {
  SCHED_GROUP_MFMA = 0x008 ,
  SCHED_GROUP_VMEM = 0x020 ,
  SCHED_GROUP_LDS_READ = 0x100 ,
  SCHED_GROUP_LDS_WRITE = 0x200
}
 
enum class  f8_rounding_mode {
  standard ,
  stochastic
}
 
enum class  LoopScheduler {
  Default ,
  Interwave
}
 
enum class  ReduceTensorOp {
  ADD = 0 ,
  MUL = 1 ,
  MIN = 2 ,
  MAX = 3 ,
  AMAX = 4 ,
  AVG = 5 ,
  NORM1 = 6 ,
  NORM2 = 7
}
 
enum class  NanPropagation {
  NOT_PROPAGATE_NAN = 0 ,
  PROPAGATE_NAN = 1
}
 
enum class  ReduceTensorIndices {
  NO_INDICES = 0 ,
  FLATTENED_INDICES = 1
}
 
enum class  IndicesType {
  INDICES_32BIT = 0 ,
  INDICES_64BIT = 1 ,
  INDICES_16BIT = 2 ,
  INDICES_8BIT = 3
}
 

Functions

constexpr unsigned int fnv1a_hash (std::string_view str, unsigned int h=2166136261u)
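
fnv1a_hash folds a string into a 32-bit FNV-1a digest; the default seed 2166136261u is the FNV offset basis, and because the function is constexpr the digest can be computed at compile time, e.g. to key kernel variants. A hedged usage sketch (the string is illustrative, not a real instance name):

    #include <string_view>

    constexpr unsigned int kKernelId = ck::fnv1a_hash("gemm_xdl_f16_256x128");
    static_assert(kKernelId == ck::fnv1a_hash("gemm_xdl_f16_256x128"),
                  "same input, same digest");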
 
std::string get_device_name ()
 
bool is_gfx12_supported ()
 
bool is_gfx11_supported ()
 
bool is_xdl_supported ()
 
template<typename ADataType , typename BDataType , index_t MPerXDL, index_t NPerXDL>
bool is_xdl_wmma_supported ()
 
bool is_lds_direct_load_supported ()
 
bool is_bf16_atomic_supported ()
 
bool is_gfx101_supported ()
 
bool is_gfx103_supported ()
 
bool is_wmma_supported ()
 
bool is_tf32_supported ()
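
The capability queries above are host-side helpers around the detected device name; a typical dispatch (illustrative, not library-provided) looks like:

    #include <iostream>

    void report_backend()
    {
        std::cout << "device: " << ck::get_device_name() << '\n';
        if(ck::is_xdl_supported())
            std::cout << "MFMA/XDL pipelines available\n";
        else if(ck::is_wmma_supported())
            std::cout << "WMMA pipelines available\n";
        else
            std::cout << "falling back to DL pipelines\n";
    }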
 
template<typename T , typename ForwardIterator , typename Size , typename BinaryOperation >
auto accumulate_n (ForwardIterator first, Size count, T init, BinaryOperation op) -> decltype(std::accumulate(first, std::next(first, count), init, op))
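
accumulate_n is a bounded std::accumulate: it folds exactly count elements starting at first, which is handy for multiplying a prefix of a length vector. Sketch:

    #include <functional>
    #include <vector>

    int prefix_product_example()
    {
        std::vector<int> lens{4, 8, 16, 2};
        // Fold only the first three lengths: 4 * 8 * 16 = 512.
        return ck::accumulate_n<int>(lens.begin(), 3, 1, std::multiplies<>{});
    }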
 
unsigned int get_available_cpu_cores ()
 
template<typename... In, typename... Wei, typename... Out, typename ConvStrides , typename ConvDilations , typename InLeftPads , typename InRightPads , index_t GemmK1Value>
__host__ constexpr __device__ auto transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad (const TensorDescriptor< In... > &in_grid_desc_n_di_hi_wi_c, const TensorDescriptor< Wei... > &wei_k_z_y_x_c_grid_desc, const TensorDescriptor< Out... > &out_n_do_ho_wo_k_grid_desc, const ConvStrides &conv_strides, const ConvDilations &conv_dilations, const InLeftPads &in_left_pads, const InRightPads &in_right_pads, Number< GemmK1Value >)
 
template<AddressSpaceEnum AddressSpace, typename T , typename TensorDesc , typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false>
__host__ constexpr __device__ auto make_static_tensor (TensorDesc)
 
template<AddressSpaceEnum AddressSpace, typename T , typename TensorDesc , typename X , typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false, typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type = false>
__host__ constexpr __device__ auto make_static_tensor (TensorDesc, X invalid_element_value)
 
template<typename Lengths , typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
__host__ constexpr __device__ auto make_cluster_descriptor (const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::Size(), 1 >::type{})
 
template<typename LowLength >
__host__ constexpr __device__ auto make_pass_through_transform (const LowLength &low_length)
 
template<typename LowLength , typename LeftPad , typename RightPad , bool SkipIsValidCheck = false>
__host__ constexpr __device__ auto make_pad_transform (const LowLength &low_length, const LeftPad &left_pad, const RightPad &right_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{})
 
template<typename LowLength , typename LeftPadLength , bool SkipIsValidCheck = false>
__host__ constexpr __device__ auto make_left_pad_transform (const LowLength &low_length, const LeftPadLength &left_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{})
 
template<typename LowLength , typename RightPadLength , bool SkipIsValidCheck = false>
__host__ constexpr __device__ auto make_right_pad_transform (const LowLength &low_length, const RightPadLength &right_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{})
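
The pad family wraps a lower length with left/right margins, so the upper (padded) length is low + left + right; SkipIsValidCheck drops the out-of-bounds test when the caller guarantees validity. Sketch, assuming the usual transform helper header:

    #include "ck/tensor_description/multi_index_transform_helper.hpp"

    using ck::Number;

    constexpr auto pass = ck::make_pass_through_transform(Number<16>{});
    // Upper length 1 + 16 + 3 = 20; upper indices map back onto the unpadded 16.
    constexpr auto pad = ck::make_pad_transform(Number<16>{}, Number<1>{}, Number<3>{});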
 
template<typename UpLengths , typename Coefficients , typename enable_if< UpLengths::Size()==Coefficients::Size(), bool >::type = false>
__host__ constexpr __device__ auto make_embed_transform (const UpLengths &up_lengths, const Coefficients &coefficients)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_merge_transform (const LowLengths &low_lengths)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_merge_transform_v1_carry_check (const LowLengths &low_lengths)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_merge_transform_v2_magic_division (const LowLengths &low_lengths)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_merge_transform_v3_division_mod (const LowLengths &low_lengths)
 
template<typename UpLengths , bool Use24BitIntegerCalculation = false>
__host__ constexpr __device__ auto make_unmerge_transform (const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
 
__host__ constexpr __device__ auto make_conv_bwd_data_out_transform (index_t N, index_t Ho, index_t Wo, index_t K, [[maybe_unused]] index_t YDot, index_t XDot, index_t HTilde, index_t WTilde, index_t ConvDilationH, index_t ConvDilationW, index_t HTildeSlice, index_t WTildeSlice, index_t YDotSlice, index_t XDotSlice, index_t IHTildeSliceBegin, index_t IWTildeSliceBegin, index_t GcdStrideDilationH, index_t GcdStrideDilationW, index_t K0, index_t K1, index_t MPerBlock, index_t GemmKPerBlock)
 
template<typename LowerIndex >
__host__ constexpr __device__ auto make_freeze_transform (const LowerIndex &low_idx)
 
template<typename UpperIndex >
__host__ constexpr __device__ auto make_insert_transform (const UpperIndex &up_idx)
 
template<typename LowLength , typename SliceBegin , typename SliceEnd >
__host__ constexpr __device__ auto make_slice_transform (const LowLength &low_length, const SliceBegin &slice_begin, const SliceEnd &slice_end)
 
template<typename VectorSize , typename UpLength >
__host__ constexpr __device__ auto make_vectorize_transform (const VectorSize &vector_size, const UpLength &up_length)
 
template<typename Modulus , typename UpLength >
__host__ constexpr __device__ auto make_modulo_transform (const Modulus &modulus, const UpLength &up_length)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_xor_with_modulo_transform (const LowLengths &low_lengths)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_xor_transform (const LowLengths &low_lengths)
 
template<typename TensorAdaptor0 , typename TensorAdaptor1 >
__host__ constexpr __device__ auto chain_tensor_adaptors (const TensorAdaptor0 &adaptor0, const TensorAdaptor1 &adaptor1)
 
template<typename Transforms , typename LowerDimensionOldTopIdss , typename UpperDimensionNewTopIdss >
__host__ constexpr __device__ auto make_single_stage_tensor_adaptor (const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
 
template<typename OldTensorDescriptor , typename NewTransforms , typename NewLowerDimensionOldVisibleIdss , typename NewUpperDimensionNewVisibleIdss >
__host__ constexpr __device__ auto transform_tensor_descriptor (const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
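
transform_tensor_descriptor rewrites an existing descriptor through a tuple of transforms plus the lower/upper dimension bookkeeping; the canonical example is flattening a 2-D view with a merge transform. Hedged sketch (header path assumed):

    #include "ck/tensor_description/tensor_descriptor_helper.hpp"

    using ck::Number;
    using ck::Sequence;

    constexpr auto desc_2d = ck::make_naive_tensor_descriptor_packed(
        ck::make_tuple(Number<4>{}, Number<8>{}));

    constexpr auto desc_1d = ck::transform_tensor_descriptor(
        desc_2d,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(Number<4>{}, Number<8>{}))),
        ck::make_tuple(Sequence<0, 1>{}),  // old dims consumed by the transform
        ck::make_tuple(Sequence<0>{}));    // new dim it produces
    // desc_1d has a single length of 32 with the same element order.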
 
template<typename TensorDesc , typename VisibleIndex >
__host__ constexpr __device__ auto make_tensor_coordinate (const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
 
template<typename TensorDesc , typename VisibleIndex , typename UpdateLowerIndexHack >
__host__ constexpr __device__ auto make_tensor_coordinate_step (const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
 
template<typename TensorDesc , typename VisibleIndex >
__host__ constexpr __device__ auto make_tensor_coordinate_step (const TensorDesc &, const VisibleIndex &idx_diff_visible)
 
template<typename TensorDesc , typename TensorCoord , typename TensorCoordStep >
__host__ constexpr __device__ void move_tensor_coordinate (const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
 
template<typename TensorDesc , typename TensorCoord >
__host__ constexpr __device__ bool coordinate_has_valid_offset_assuming_visible_index_is_valid (const TensorDesc &tensor_desc, const TensorCoord &coord)
 
template<typename TensorDesc , typename TensorCoord >
__host__ constexpr __device__ bool coordinate_has_valid_offset (const TensorDesc &tensor_desc, const TensorCoord &coord)
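
A coordinate caches the partially evaluated offset computation, and steps are precomputed per index delta so inner loops avoid re-evaluating the full descriptor chain. An illustrative device-side walk, using the functions above (make_multi_index is assumed from the utility headers):

    #include "ck/tensor_description/tensor_descriptor_helper.hpp"

    using ck::Number;

    __device__ void coordinate_walk_sketch()
    {
        constexpr auto desc = ck::make_naive_tensor_descriptor_packed(
            ck::make_tuple(Number<3>{}, Number<5>{}));

        auto coord = ck::make_tensor_coordinate(desc, ck::make_multi_index(1, 2)); // offset 7
        const auto step = ck::make_tensor_coordinate_step(desc, ck::make_multi_index(0, 1));

        ck::move_tensor_coordinate(desc, coord, step); // now (1, 3), offset 8
        (void)ck::coordinate_has_valid_offset(desc, coord);
    }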
 
template<typename... Lengths, typename... Strides, typename enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
__host__ constexpr __device__ auto make_naive_tensor_descriptor (const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
 
template<typename... Lengths>
__host__ constexpr __device__ auto make_naive_tensor_descriptor_packed (const Tuple< Lengths... > &lengths)
 
template<typename... Lengths, typename Align >
__host__ constexpr __device__ auto make_naive_tensor_descriptor_aligned (const Tuple< Lengths... > &lengths, Align align)
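
The three naive makers differ only in how strides are derived: given explicitly, dense-packed, or packed with the leading stride rounded up to an alignment (the exact alignment semantics here are an assumption to verify). Sketch:

    #include "ck/tensor_description/tensor_descriptor_helper.hpp"

    using ck::Number;

    // 4x8 row-major view whose rows are spaced 10 elements apart.
    constexpr auto strided = ck::make_naive_tensor_descriptor(
        ck::make_tuple(Number<4>{}, Number<8>{}),
        ck::make_tuple(Number<10>{}, Number<1>{}));

    // Dense packing of the same lengths: strides (8, 1).
    constexpr auto packed = ck::make_naive_tensor_descriptor_packed(
        ck::make_tuple(Number<4>{}, Number<8>{}));

    // Packed except the leading stride is padded up to a multiple of 16,
    // a common layout for LDS staging buffers.
    constexpr auto aligned = ck::make_naive_tensor_descriptor_aligned(
        ck::make_tuple(Number<4>{}, Number<8>{}), Number<16>{});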
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeTypeA , typename ComputeTypeB , typename AccDataType , typename AWmmaTileDesc , typename BWmmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false>
constexpr auto BlockGemmPipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmABScalePipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmMXBPreshufflePipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmBPreshufflePipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmBlockScaleBPreshufflePipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmBlockMoeScaleBPreshufflePipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmMXBPreshufflePipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmMXNBSPipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmMXPipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmMXPipeline_Selector ()
 
template<index_t BlockSize, typename FloatA , typename FloatB , typename FloatAcc , typename AK0MK1BlockDesc , typename BK0NK1BlockDesc , index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, LoopScheduler LoopSched, typename ComputeTypeA = FloatA, typename ComputeTypeB = FloatB>
constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector ()
 
template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ComputePtrOffsetOfBatch , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const index_t batch_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2ETileMap block_2_etile_map)
 
template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_batched_gemm_xdl_cshuffle_v3_multi_d (BatchedGemmArg karg)
 
template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds (BatchedGemmArg karg)
 
template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_batched_gemm_b_scale_xdl_cshuffle_v3 (BatchedGemmArg karg)
 
template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds (BatchedGemmArg karg)
 
template<typename GridwiseGemm , typename AsPointer , typename BsPointer , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AsGridDesc_AK0_M_AK1 , typename BsGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_abd_xdl_cshuffle (AsPointer p_as_grid, BsPointer p_bs_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AsGridDesc_AK0_M_AK1 as_grid_desc_ak0_m_ak1, const BsGridDesc_BK0_N_BK1 bs_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
 
template<typename GridwiseElementwiseReduction , typename InDataTypePointerTuple , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename AccDataType , typename XElementwiseOperation , typename YElementwiseOperation , typename InGrid2dDescTuple , typename GridDesc_M_K >
__global__ void kernel_elementwise_layernorm (const InGrid2dDescTuple in_grid_2d_desc_tuple, const GridDesc_M_K x_grid_desc_m_k, const GridDesc_M_K gamma_grid_desc_m_k, const GridDesc_M_K beta_grid_desc_m_k, const GridDesc_M_K y_grid_desc_m_k, index_t num_k_block_tile_iteration, AccDataType epsilon, const InDataTypePointerTuple p_in_global_tuple, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, const XElementwiseOperation x_elementwise_op, const YElementwiseOperation y_elementwise_op)
 
template<typename GridwiseGemm , typename ABDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_K0_M0_M1_K1 , typename BGridDesc_K0_N0_N1_K1 , typename DsGridDesc_M0_M10_M11_N0_N10_N11 , typename CGridDesc_M0_M10_M11_N0_N10_N11 , typename Block2CTileMap , bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__global__ void kernel_gemm_dl_multiple_d (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1, const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1, const DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11, const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemmWelford , typename ABDataType , typename DsPointer , typename EMeanVarDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename MeanVarGridDescriptor_MBlock_MPerBlock_NBlock , typename CountGridDescriptor_MBlock_MPerBlock_NBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_multiple_d_welford_first_half_xdl_cshuffle (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EMeanVarDataType *__restrict__ p_e_grid, EMeanVarDataType *__restrict__ p_welford_mean_grid, EMeanVarDataType *__restrict__ p_welford_var_grid, int32_t *__restrict__ p_welford_count_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const MeanVarGridDescriptor_MBlock_MPerBlock_NBlock mean_var_grid_desc_mblock_mperblock_nblock, const CountGridDescriptor_MBlock_MPerBlock_NBlock count_grid_desc_mblock_mperblock_nblock, const Block2ETileMap block_2_etile_map, index_t NRaw)
 
template<typename GridwiseWelfordLayernorm , typename EMeanVarDataType , typename HDataType , typename GammaDataType , typename BetaDataType , typename ComputeDataType , typename EHGridDesc_M_N , typename LayernormMeanVarGridDesc_M_NBlock , typename LayernormCountGridDesc_M_NBlock , typename GammaBetaGridDesc_N , typename HElementwiseOperation >
__global__ void kernel_welford_layernorm2d_second_half (const EMeanVarDataType *__restrict__ p_e_grid, const EMeanVarDataType *__restrict__ p_in_welford_mean_grid, const EMeanVarDataType *__restrict__ p_in_welford_var_grid, const int32_t *__restrict__ p_in_welford_count_grid, const GammaDataType *__restrict__ p_gamma_grid, const BetaDataType *__restrict__ p_beta_grid, HDataType *__restrict__ p_h_grid, const EHGridDesc_M_N e_grid_desc_m_n, const EHGridDesc_M_N h_grid_desc_m_n, const LayernormMeanVarGridDesc_M_NBlock mean_var_grid_desc_m_nblock, const LayernormCountGridDesc_M_NBlock count_grid_desc_m_nblock, const GammaBetaGridDesc_N gamma_grid_desc_n, const GammaBetaGridDesc_N beta_grid_desc_n, index_t numMeanVarCountBlockTileIteration_N, index_t NBlockClusterLength, ComputeDataType epsilon, HElementwiseOperation h_element_op)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename FloatRsPointer , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename QsElementwiseOperation , typename RsElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename RsGridDescriptor_MBlock_MPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_multiple_d_multiple_r_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, FloatRsPointer p_rs_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const QsElementwiseOperation qs_element_op, const RsElementwiseOperation rs_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const RsGridDescriptor_MBlock_MPerBlock rs_grid_desc_mblock_mperblock, const Block2ETileMap block_2_etile_map)
 
template<typename GridwiseGemm , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_multiple_d_xdl_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
 
template<typename GridwiseGemm , typename ABDataType , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename EElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdl_waveletmodel_cshuffle (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const EElementwiseOperation e_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
 
template<typename GridwiseGemm , typename ContractionMultiDKernelArg , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , bool HasMainKBlockLoop>
__global__ void kernel_grouped_contraction_multiple_d_xdl_cshuffle (const void CK_CONSTANT_ADDRESS_SPACE *contraction_args, const index_t group_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op)
 
template<typename GridwiseWelford , typename XDataType , typename WorkspaceMeanVarDataType , typename ComputeDataType , typename XGridDesc_M_K , typename MeanVarGridDesc_M_KBlock >
__global__ void kernel_normalizationSplitK1st (const XGridDesc_M_K x_grid_desc_m_k, const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock, index_t num_k_block_tile_iteration, const XDataType *const __restrict__ p_x_global, WorkspaceMeanVarDataType *const __restrict__ p_welford_mean, WorkspaceMeanVarDataType *const __restrict__ p_welford_variance, int32_t *const __restrict__ p_welford_count)
 
template<typename GridwiseWelfordNormalization , typename WorkspaceMeanVarDataType , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename MeanVarGridDesc_M_KBlock , typename CountGridDesc_M_KBlock , typename XYGammaBetaGridDesc_M_K , typename SaveMeanInvStdGridDesc_M >
__global__ void kernel_normalizationSplitK2nd (const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock, const CountGridDesc_M_KBlock count_grid_desc_m_kblock, const XYGammaBetaGridDesc_M_K x_grid_desc_m_k, const XYGammaBetaGridDesc_M_K gamma_grid_desc_m_k, const XYGammaBetaGridDesc_M_K beta_grid_desc_m_k, const XYGammaBetaGridDesc_M_K y_grid_desc_m_k, const SaveMeanInvStdGridDesc_M save_mean_grid_desc_m, const SaveMeanInvStdGridDesc_M save_inv_std_grid_desc_m, index_t num_k_mean_var_count_iteration, index_t num_k_block_tile_iteration, index_t k_grid_size, ComputeDataType epsilon, const WorkspaceMeanVarDataType *const p_mean_global, const WorkspaceMeanVarDataType *const p_variance_global, const int32_t *const p_welford_count_global, const XDataType *const __restrict__ p_x_global, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, SaveMeanInvStdDataType *const __restrict__ p_save_mean_global, SaveMeanInvStdDataType *const __restrict__ p_save_inv_std_global, const YElementwiseOperation y_elementwise_op)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AKB_AK0_M_AK1 , typename BGridDesc_BKB_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ComputePtrOffsetOfBatch , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const index_t batch_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AKB_AK0_M_AK1 a_grid_desc_akb_ak0_m_ak1, const BGridDesc_BKB_BK0_N_BK1 b_grid_desc_bkb_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2ETileMap block_2_etile_map)
 
__device__ half4_t i4_to_half4 (int q)
 
__device__ half4_t i4_to_half4_scale (int q, const ck::half2_t &scale)
 
__device__ f8x4_t i4_to_f8x4 (int q)
 
__device__ f8x8_t i4_to_fp8x8 (int q)
 
__device__ bhalf4_t i4_to_bhalf4 (int q)
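
These device helpers unpack 4-bit integers (several packed per 32-bit int) into vectors of half / fp8 / bf16 values, with i4_to_half4_scale additionally fusing a scale. A plain reference of the intended result, not the library's bit-twiddled device code (nibble order and sign convention are assumptions):

    #include <cstdint>

    struct half4_ref { float v[4]; }; // stand-in for ck::half4_t

    inline half4_ref i4_to_half4_reference(int q)
    {
        half4_ref r{};
        for(int i = 0; i < 4; ++i)
        {
            int n = (q >> (4 * i)) & 0xF;  // extract nibble i (LSB-first assumed)
            r.v[i] = static_cast<float>(n >= 8 ? n - 16 : n); // sign-extend int4
        }
        return r;
    }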
 
template<typename GridwiseMultiblockBatchNormForward_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename MeanVarCountGridDesc_M_G , typename MeanVarCountGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor >
__global__ void kernel_multiblock_batchnorm_forward (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, MeanVarDataType *const __restrict__ p_welford_mean, MeanVarDataType *const __restrict__ p_welford_variance, int32_t *const __restrict__ p_welford_count, int32_t *const __restrict__ p_control, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance)
 
template<typename GridwiseReduceSecondHalfBatchNormBackwardFinal_ , typename XDataType , typename DyDataType , typename DxDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename DscaleDbiasGridDesc_M_K , typename MeanVarGridDesc_M , typename ScaleBiasGridDesc_M >
__global__ void kernel_reduce_second_half_batchnorm_backward_final (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const XYGridDesc_M_K dx_grid_desc_m_k, const DscaleDbiasGridDesc_M_K dscale_dbias_grid_desc_m_k, const MeanVarGridDesc_M mean_var_grid_desc_m, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, index_t blkgroup_size, long_index_t reduce_size, index_t num_xy_k_block_tile_iteration, index_t num_dscale_dbias_k_block_tile_iteration, const DscaleDbiasDataType *const __restrict__ p_reduce_dscale, const DscaleDbiasDataType *const __restrict__ p_reduce_dbias, const MeanVarDataType *const __restrict__ p_mean, const MeanVarDataType *const __restrict__ p_inv_var, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, const ScaleDataType *const __restrict__ p_scale, const DyElementwiseOp dy_elementwise_op, DxDataType *const __restrict__ p_dx, DscaleDbiasDataType *const __restrict__ p_dscale, DscaleDbiasDataType *const __restrict__ p_dbias)
 
template<typename GridwiseMultiblockWelfordFirstHalf_ , typename XDataType , typename MeanVarDataType , typename XGridDesc_M_K , typename MeanVarCountGridDesc_M_G , typename GetReduceCountPerThreadFunctor >
__global__ void kernel_multiblock_welford_first_half (const XGridDesc_M_K x_grid_desc_m_k, const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, const XDataType *const __restrict__ p_x, MeanVarDataType *const p_welford_mean, MeanVarDataType *const p_welford_variance, int32_t *const p_welford_count)
 
template<typename GridwiseWelfordSecondHalfBatchNormForwardFinal_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename MeanVarCountGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M >
__global__ void kernel_welford_second_half_batchnorm_forward_final (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, index_t blkgroup_size, index_t num_xy_k_block_tile_iteration, AccDataType epsilon, const MeanVarDataType *const __restrict__ p_in_welford_mean, const MeanVarDataType *const __restrict__ p_in_welford_variance, const int32_t *const __restrict__ p_in_welford_count, const XDataType *const __restrict__ p_x, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance)
 
template<typename GridwiseWelfordSecondHalfReduceFirstHalf_ , typename XDataType , typename DyDataType , typename AccDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename MeanVarGridDesc_M , typename MeanVarCountGridDesc_M_K , typename DscaleDbiasGridDesc_M_G >
__global__ void kernel_welford_second_half_reduce_first_half (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const MeanVarGridDesc_M mean_var_grid_desc_m, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const DscaleDbiasGridDesc_M_G dscale_dbias_grid_desc_m_g, index_t blkgroup_size, index_t num_xy_k_block_tile_iteration, index_t num_mean_var_count_k_block_tile_iteration, AccDataType epsilon, bool haveSavedMeanInvVar, const MeanVarDataType *const __restrict__ p_savedMean, const MeanVarDataType *const __restrict__ p_savedInvVar, const MeanVarDataType *const __restrict__ p_in_welford_mean, const MeanVarDataType *const __restrict__ p_in_welford_variance, const int32_t *const __restrict__ p_in_welford_count, const DyElementwiseOp dy_elementwise_op, MeanVarDataType *const __restrict__ p_out_welford_mean, MeanVarDataType *const __restrict__ p_out_welford_inv_variance, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, DscaleDbiasDataType *const __restrict__ p_reduce_dscale, DscaleDbiasDataType *const __restrict__ p_reduce_dbias)
 
template<typename CTileIdx , typename CTileDim >
__host__ __device__ bool DefaultValidCTileIndex (const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim)
 
template<typename GridwiseMultipleReduction , index_t NumReduction, typename InDataType , typename OutDataTypePointerTuple , typename AccDataType , typename InGridDesc_M_K , typename OutGridDesc_M_Tuple , typename InElementwiseOperationTuple , typename AccElementwiseOperationTuple >
__global__ void kernel_multiple_reduce_multiblock (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M_Tuple out_grid_desc_m_tuple, const InElementwiseOperationTuple in_elementwise_op_tuple, const AccElementwiseOperationTuple acc_elementwise_op_tuple, index_t block_group_size, index_t num_k_block_tile_iteration, Array< AccDataType, NumReduction > alpha_values, const InDataType *const __restrict__ p_in_value_global, Array< AccDataType, NumReduction > beta_values, OutDataTypePointerTuple p_out_value_global_tuple)
 
template<typename GridwiseMultipleReduction , index_t NumReduction, typename InDataType , typename OutDataTypePointerTuple , typename AccDataType , typename InGridDesc_M_K , typename OutGridDesc_M_Tuple , typename InElementwiseOperationTuple , typename AccElementwiseOperationTuple >
__global__ void kernel_multiple_reduce_threadwise (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M_Tuple out_grid_desc_m_tuple, const InElementwiseOperationTuple in_elementwise_op_tuple, const AccElementwiseOperationTuple acc_elementwise_op_tuple, Array< AccDataType, NumReduction > alpha_values, const InDataType *const __restrict__ p_in_value_global, Array< AccDataType, NumReduction > beta_values, OutDataTypePointerTuple p_out_value_global_tuple)
 
template<typename GridwiseReduction , bool OutputIndex, bool HaveIndexInput, typename InDataType , typename OutDataType , typename AccDataType , typename IndexDataType , typename InGridDesc_M_K , typename OutGridDesc_M , typename InElementwiseOperation , typename AccElementwiseOperation >
__global__ void kernel_reduce_multiblock (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op, index_t block_group_size, index_t num_k_block_tile_iteration, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, const IndexDataType *const __restrict__ p_in_index_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global, IndexDataType *const __restrict__ p_out_index_global)
 
template<typename GridwiseReduction , bool OutputIndex, bool TransformIndexKtoGlobal, bool HaveIndexInput, typename InDataType , typename OutDataType , typename AccDataType , typename IndexDataType , typename InGridDesc_M_K , typename OutGridDesc_M , typename InElementwiseOperation , typename AccElementwiseOperation >
__global__ void kernel_reduce_threadwise (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, const IndexDataType *const __restrict__ p_in_index_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global, IndexDataType *const __restrict__ p_out_index_global)
 
template<typename GridwiseReduction , typename InDataType , typename OutDataType , typename AccDataType , typename InGridDesc_M_K , typename DsGridDesc_M , typename OutGridDesc_M , typename InElementwiseOperation , typename OutElementwiseOperation , typename DsGridPointer >
__global__ void kernel_reduce_threadwise_multi_d (const InGridDesc_M_K in_grid_desc_m_k, const DsGridDesc_M ds_grid_desc_m, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const OutElementwiseOperation out_elementwise_op, const InDataType *const __restrict__ p_in_value_global, const DsGridPointer p_ds_value_global, OutDataType *const __restrict__ p_out_value_global)
 
template<typename GridwiseBatchrNormBackwardWithBlockwiseWelford_ , typename XDataType , typename DyDataType , typename DxDataType , typename AccDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor >
__global__ void kernel_batchnorm_backward_with_blockwise_welford (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const XYGridDesc_M_K dx_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M dscale_dbias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, long_index_t reduce_size, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, const ScaleDataType *const __restrict__ p_scale, bool haveSavedMeanInvVar, const MeanVarDataType *const __restrict__ p_savedMean, const MeanVarDataType *const __restrict__ p_savedInvVar, const DyElementwiseOp dy_elementwise_op, DxDataType *const __restrict__ p_dx, DscaleDbiasDataType *const __restrict__ p_dscale, DscaleDbiasDataType *const __restrict__ p_dbias)
 
template<typename GridwiseBatchrNormForwardWithBlockwiseWelford_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor >
__global__ void kernel_batchnorm_forward_with_blockwise_welford (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance)
 
template<typename GridwiseElementwise1dFunctor , typename InGrid1dDescTuple , typename OutGrid1dDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename ElementwiseOperation , typename UnaryOperation , typename Scale >
__global__ void kernel_elementwise_1d (const InGrid1dDescTuple in_grid_1d_desc_tuple, const OutGrid1dDescTuple out_grid_1d_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const ElementwiseOperation elementwise_op, const UnaryOperation unary_op, const Scale scale_op)
 
template<typename GridwiseElementwiseFunctor , typename InGridDescTuple , typename OutGridDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename Block2TileMap , typename ElementwiseOperation >
__global__ void kernel_elementwise (const InGridDescTuple in_grid_desc_tuple, const OutGridDescTuple out_grid_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const Block2TileMap block_2_tile_map, const ElementwiseOperation elementwise_op)
 
template<typename GridwiseElementwiseFunctorA , typename GridwiseElementwiseFunctorB , typename InAGridDescTuple , typename InBGridDescTuple , typename OutAGridDescTuple , typename OutBGridDescTuple , typename InADataTypePointerTuple , typename InBDataTypePointerTuple , typename OutADataTypePointerTuple , typename OutBDataTypePointerTuple , typename Block2TileMapA , typename Block2TileMapB , typename ElementwiseOperation >
__global__ void kernel_elementwise_dual (const InAGridDescTuple in_grid_desc_tuple_a, const InBGridDescTuple in_grid_desc_tuple_b, const OutAGridDescTuple out_grid_desc_tuple_a, const OutBGridDescTuple out_grid_desc_tuple_b, const InADataTypePointerTuple p_in_global_tuple_a, const InBDataTypePointerTuple p_in_global_tuple_b, const OutADataTypePointerTuple p_out_global_tuple_a, const OutBDataTypePointerTuple p_out_global_tuple_b, const Block2TileMapA block_2_tile_map_a, const Block2TileMapB block_2_tile_map_b, const ElementwiseOperation elementwise_op, const index_t a_grid_size)
 
template<typename GridwiseElementwiseFunctorA , typename GridwiseElementwiseFunctorB , typename InAGridDescTuple , typename InBGridDescTuple , typename OutAGridDescTuple , typename OutBGridDescTuple , typename InADataTypePointerTuple , typename InBDataTypePointerTuple , typename OutADataTypePointerTuple , typename OutBDataTypePointerTuple , typename Block2TileMapA , typename Block2TileMapB , typename ElementwiseOperation , index_t NumInputsA, index_t NumInputsB, index_t NumOutputsA, index_t NumOutputsB>
__global__ void kernel_elementwise_batched_dual (const InAGridDescTuple in_grid_desc_tuple_a, const InBGridDescTuple in_grid_desc_tuple_b, const OutAGridDescTuple out_grid_desc_tuple_a, const OutBGridDescTuple out_grid_desc_tuple_b, const InADataTypePointerTuple p_in_global_tuple_a, const InBDataTypePointerTuple p_in_global_tuple_b, const OutADataTypePointerTuple p_out_global_tuple_a, const OutBDataTypePointerTuple p_out_global_tuple_b, const Block2TileMapA block_2_tile_map_a, const Block2TileMapB block_2_tile_map_b, const ElementwiseOperation elementwise_op, const index_t a_grid_size, const index_t batch_count_a, const index_t batch_count_b, const std::array< index_t, NumInputsA > input_batch_strides_a, const std::array< index_t, NumInputsB > input_batch_strides_b, const std::array< index_t, NumOutputsA > output_batch_strides_a, const std::array< index_t, NumOutputsB > output_batch_strides_b)
 
template<typename GridwiseElementwiseFunctor , typename InGridDescTuple , typename OutGridDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename Block2TileMap , typename ElementwiseOperation , index_t NumInputs, index_t NumOutputs>
__global__ void kernel_batched_elementwise (const InGridDescTuple in_grid_desc_tuple, const OutGridDescTuple out_grid_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const Block2TileMap block_2_tile_map, const ElementwiseOperation elementwise_op, const index_t batch_count, const std::array< index_t, NumInputs > input_batch_strides, const std::array< index_t, NumOutputs > output_batch_strides)
 
template<typename GridwiseGemm , typename ADataType , typename BDataType , typename ScaleDataType , typename CDataType , typename AGridDesc , typename BGridDesc , typename ScaleGridDesc , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_fpAintB_gemm_wmma (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, const ScaleDataType *__restrict__ p_scale_grid, CDataType *__restrict__ p_c_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const ScaleGridDesc scale_grid_desc, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename FloatC0 , typename FloatC1 , typename ReducePtrsGlobal , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename C1ElementwiseOperation , typename ReduceInElementwiseOperations , typename ReduceAccElementwiseOperations , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ReduceGridDescriptor_MBlock_MPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_bias_add_reduce_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC0 *__restrict__ p_bias_grid, const FloatC1 *__restrict__ p_d0_grid, ReducePtrsGlobal p_reduces_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const C1ElementwiseOperation c1_element_op, const ReduceInElementwiseOperations reduce_in_element_ops, const ReduceAccElementwiseOperations reduce_out_element_ops, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c0_grid_desc_mblock_mperblock_nblock_nperblock, const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c1_grid_desc_mblock_mperblock_nblock_nperblock, const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M0_M1_K1 , typename BGridDesc_K0_N0_N1_K1 , typename CGridDesc_M0_M10_M11_N0_N10_N11 , typename Block2CTileMap , bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__global__ void kernel_gemm_dl_v1r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1, const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1, const CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop>
__global__ void kernel_gemm_dpp (const typename GridwiseGemm::Argument karg)
 
template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2CTileMap , typename ComputePtrOffsetOfBatch , bool HasMainKBlockLoop>
__global__ void kernel_grouped_conv_multiple_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const index_t batch_count, const AGridDesc_AK0_M_AK1 a_grid_desc, const BGridDesc_BK0_N_BK1 b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_, const Block2CTileMap block_2_ctile_map, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 
template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AGridDesc , typename BGridDesc , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename ComputePtrOffsetOfBatch , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const index_t batch_count, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2CTileMap block_2_etile_map)
 
template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AGridDesc , typename BGridDesc , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_mupltipe_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_multiple_d_xdl_cshuffle_lds_direct_load (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
 
template<PipelineVersion PipelineVer, index_t NumPrefetch = 1, LoopScheduler LoopSched = LoopScheduler::Default, bool AEnableLds = true, bool BEnableLds = true>
constexpr auto GridwiseGemmPipeline_Selector ()
 
template<index_t NumPrefetch, LoopScheduler LoopSched>
constexpr auto GridwiseGemmPipeline_v1_Selector ()
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename ReducePtrsGlobal , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename ReduceInElementwiseOperations , typename ReduceAccElementwiseOperations , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ReduceGridDescriptor_MBlock_MPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_reduce_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, ReducePtrsGlobal p_reduces_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const ReduceInElementwiseOperations reduce_in_element_ops, const ReduceAccElementwiseOperations reduce_out_element_ops, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , typename ADataType , typename BDataType , typename CDataType , typename AGridDesc , typename BGridDesc , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_wmma (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, CDataType *__restrict__ p_c_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum EGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_wmma_cshuffle_v3 (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_xdl_cshuffle_v3 (typename GridwiseGemm::Argument karg)
 
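These Argument-only kernels are launched through the device-op layer in practice. A minimal illustrative launch (the GridwiseGemm instantiation, karg, grid_size, block_size, and stream are all assumed to be supplied by that layer):

    // Sketch only: the concrete GridwiseGemm type and its Argument come from
    // the surrounding device-op code; MinimumOccupancy/TailNum use defaults.
    hipLaunchKernelGGL((ck::kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
                                                        true, // HasMainKBlockLoop
                                                        ck::InMemoryDataOperationEnum::Set>),
                       dim3(grid_size), dim3(block_size), 0, stream, karg);
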
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_xdl_cshuffle_v3_2lds (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdl_cshuffle_v1 (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdl_cshuffle_v1 (const FloatA *__restrict__ p_a_grid, const FloatB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, typename GridwiseGemm::Problem problem)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, index_t TailNum = 3>
__global__ void kernel_gemm_xdl_cshuffle_v2 (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdl_cshuffle_v2 (const FloatA *p_a_grid, const FloatB *p_b_grid, FloatC *p_c_grid, typename GridwiseGemm::Problem problem)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_b_preshuffle (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_2lds (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle_2lds (typename GridwiseGemm::Argument karg)
 
template<bool Use2LDS, typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ enable_if_t<!Use2LDS, void > kernel_gemm_xdl_cshuffle_v3_mx (typename GridwiseGemm::Argument karg)
 
template<bool Use2LDS, typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ enable_if_t< Use2LDS, void > kernel_gemm_xdl_cshuffle_v3_mx (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename FloatC0 , typename AElementwiseOperation , typename BElementwiseOperation , typename AccElementwiseOperation , typename CElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C0GridDescriptor_NBlock_NPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_layernorm_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC0 *__restrict__ p_c0_bias_grid, const FloatC0 *__restrict__ p_c0_add_grid, const FloatC0 *__restrict__ p_c0_gamma_grid, const FloatC0 *__restrict__ p_c0_beta_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const AccElementwiseOperation acc_element_op, const CElementwiseOperation c_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const C0GridDescriptor_NBlock_NPerBlock c0_grid_desc_nblock_nperblock, const Block2CTileMap block_2_ctile_map)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_merge_transform_v4_no_carry (const LowLengths &low_lengths)
 
template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , typename AGridDesc_B_K0_M_K1 , typename BGridDesc_B_K0_N_K1 , typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename CBlockClusterAdaptor , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_bwd_weight (const FloatA *__restrict__ p_a_grid, const FloatB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc, const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc, const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const CBlockClusterAdaptor c_block_cluster_adaptor)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDesc_M_N , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainK0BlockLoop>
__global__ void kernel_gemm_xdlops_skip_b_lds_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDesc_M_N c_grid_desc_m_n, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation >
__global__ void kernel_gemm_xdlops_splitk_lds_direct_load (typename GridwiseGemm::Argument karg, const Block2CTileMap &b2c_map, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op)
 
template<typename GridwiseGemm >
__global__ void kernel_gemm_xdlops_streamk (const typename GridwiseGemm::FloatAB *p_a_grid, const typename GridwiseGemm::FloatAB *p_b_grid, typename GridwiseGemm::FloatC *p_c_grid, void *p_workspace, index_t M, index_t N, index_t K, index_t StrideA, index_t StrideB, index_t StrideC, typename GridwiseGemm::Block2CTileMap block_mapping)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDesc_M_N , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v2r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDesc_M_N c_grid_desc_m_n)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v2r3 (const typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename ABK0MK1GridDesc , typename BBK0NK1GridDesc , typename CM0N0M1N1M2M3M4N2GridDesc , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename CBlockClusterAdaptor , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v2r4 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const ABK0MK1GridDesc a_b_k0_m_k1_grid_desc, const BBK0NK1GridDesc b_b_k0_n_k1_grid_desc, const CM0N0M1N1M2M3M4N2GridDesc c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const CBlockClusterAdaptor c_block_cluster_adaptor)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation >
__global__ void kernel_gemm_xdlops_v2r4r2_simplified (typename GridwiseGemm::Argument karg, const Block2CTileMap &b2c_map, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainK0BlockLoop>
__global__ void kernel_gemm_xdlops_v3r1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v3r2 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC *__restrict__ p_c0_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v3r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC *__restrict__ p_c0_grid, const FloatC *__restrict__ p_c1_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_moe_gemm (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_moe_gemm_2lds (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_moe_mxgemm_2lds (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_moe_mxgemm (typename GridwiseGemm::Argument karg)
 
template<typename GridwisePermute , typename InGridDesc , typename OutGridDesc , typename InDataType , typename OutDataType , typename ElementwiseOperation , typename Block2TileMap >
__global__ void kernel_nd_permute (const InGridDesc in_grid_desc, const OutGridDesc out_grid_desc, const InDataType *p_in_global, OutDataType *p_out_global, const ElementwiseOperation elementwise_op, const Block2TileMap block_2_tile_map)
 
template<typename GridwisePutElementwise1dFunctor , typename InGrid1dDesc , typename InDataType , typename IndexDataType , typename OutDataType , typename ElementwiseOperation >
__global__ void kernel_put_element_1d (const InGrid1dDesc in_grid_1d_desc, const InDataType *__restrict__ p_in_global, const IndexDataType *__restrict__ p_indices_global, OutDataType *__restrict__ p_out_global, const ElementwiseOperation elementwise_op)
 
template<index_t BlockSize, typename DataType , typename Grid1dBufferDescType >
__global__ void kernel_buffer_set_value (const Grid1dBufferDescType grid_1d_buffer_desc, DataType *const __restrict__ p_global, DataType value)
 
template<typename Grid1dBufferDescTuple , index_t NumBuffer, index_t BlockSize, typename DataTypePointerTuple , typename DataTypeTuple >
__global__ void kernel_multiple_buffer_set_value (const Grid1dBufferDescTuple grid_1d_buffer_desc_tuple, DataTypePointerTuple p_global_tuple, DataTypeTuple value_tuple)
 
template<typename GridwiseReduction , typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K >
__global__ void kernel_softmax (const GridDesc_M_K in_grid_desc_m_k, const GridDesc_M_K out_grid_desc_m_k, index_t block_group_size, index_t num_k_block_tile_iteration, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global)
 
template<typename GridwiseSparseEmbedding , typename EmbType , typename IndexType , typename GammaDataType , typename BetaDataType , typename AccDataType , typename OutType , typename OutGridDesc , typename EmbElementwiseOperation , ck::index_t NumEmbeddings>
__global__ void kernel_sparse_embeddings_forward_layernorm (OutType *p_out, const ck::Array< EmbType *, NumEmbeddings > p_embs, const ck::Array< IndexType *, NumEmbeddings > p_indexes, const GammaDataType *p_gamma, const BetaDataType *p_beta, const OutGridDesc out_grid_desc, const AccDataType epsilon, const EmbElementwiseOperation emb_elementwise_op)
 
template<typename InputGridDesc , typename InputDataType , typename OutputGridDesc , typename OutputDataType , typename Block2ETileMap , typename ComputePtrOffsetOfStridedBatch , typename GridwiseTensorRearrangeKernel >
__global__ void kernel_tensor_rearrange (const InputGridDesc in_grid_desc, const InputDataType *__restrict__ p_in_global, const OutputGridDesc out_grid_desc, OutputDataType *__restrict__ p_out_global, const index_t batch_count, const Block2ETileMap block_2_tile_map, const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
 
template<typename GridwiseReduction , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename GridDesc_M_K , typename GridDesc_M >
__global__ void kernel_normalization (const GridDesc_M_K x_grid_desc_m_k, const GridDesc_M_K gamma_grid_desc_m_k, const GridDesc_M_K beta_grid_desc_m_k, const GridDesc_M_K y_grid_desc_m_k, const GridDesc_M save_mean_grid_desc_m, const GridDesc_M save_inv_std_grid_desc_m, index_t num_k_block_tile_iteration, ComputeDataType epsilon, const XDataType *const __restrict__ p_x_global, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, SaveMeanInvStdDataType *const __restrict__ p_save_mean_global, SaveMeanInvStdDataType *const __restrict__ p_save_inv_std_global, const YElementwiseOperation y_elementwise_op)
 
template<typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename GridDesc_M_K , typename GridDesc_M , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t XSrcVectorDim, index_t XSrcVectorSize, index_t GammaSrcVectorDim, index_t GammaSrcVectorSize, index_t BetaSrcVectorDim, index_t BetaSrcVectorSize, index_t YDstVectorDim, index_t YDstVectorSize, index_t SaveMeanInvStdDstVectorSize, bool UseWelford>
auto NormalizationKernelSelector (bool isSweepOnce)
 
template<typename T >
__device__ T * cast_pointer_to_generic_address_space (T CK_CONSTANT_ADDRESS_SPACE *p)
 
template<typename T >
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * cast_pointer_to_constant_address_space (T *p)
 
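A minimal round-trip sketch for the two address-space casts above:

    __device__ float read_via_constant_space(float* p)
    {
        auto p_const   = ck::cast_pointer_to_constant_address_space(p);
        auto p_generic = ck::cast_pointer_to_generic_address_space(p_const);
        return *p_generic; // same object, now addressed through a generic pointer
    }
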
template<typename T >
__device__ int32x4_t make_wave_buffer_resource (T *p_wave, index_t element_space_size)
 
template<typename T >
__device__ int32x4_t make_wave_buffer_resource_with_default_range (T *p_wave)
 
__device__ int8_t llvm_amdgcn_raw_buffer_load_i8 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
 
__device__ int8x2_t llvm_amdgcn_raw_buffer_load_i8x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
 
__device__ int8x4_t llvm_amdgcn_raw_buffer_load_i8x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
 
__device__ bhalf_t llvm_amdgcn_raw_buffer_load_i16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
 
__device__ bhalf2_t llvm_amdgcn_raw_buffer_load_i16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
 
__device__ bhalf4_t llvm_amdgcn_raw_buffer_load_i16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
 
__device__ int32_t llvm_amdgcn_raw_buffer_load_i32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
 
__device__ int32x2_t llvm_amdgcn_raw_buffer_load_i32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
 
__device__ int32x4_t llvm_amdgcn_raw_buffer_load_i32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
 
__device__ half_t llvm_amdgcn_raw_buffer_load_fp16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
 
__device__ half2_t llvm_amdgcn_raw_buffer_load_fp16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
 
__device__ half4_t llvm_amdgcn_raw_buffer_load_fp16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
 
__device__ float llvm_amdgcn_raw_buffer_load_fp32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
 
__device__ float2_t llvm_amdgcn_raw_buffer_load_fp32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
 
__device__ float4_t llvm_amdgcn_raw_buffer_load_fp32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
 
__device__ void llvm_amdgcn_raw_buffer_store_i8 (int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
 
__device__ void llvm_amdgcn_raw_buffer_store_i8x2 (int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
 
__device__ void llvm_amdgcn_raw_buffer_store_i8x4 (int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
 
__device__ void llvm_amdgcn_raw_buffer_store_i16 (bhalf_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
 
__device__ void llvm_amdgcn_raw_buffer_store_i16x2 (bhalf2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
 
__device__ void llvm_amdgcn_raw_buffer_store_i16x4 (bhalf4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
 
__device__ void llvm_amdgcn_raw_buffer_store_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
 
__device__ void llvm_amdgcn_raw_buffer_store_i32x2 (int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
 
__device__ void llvm_amdgcn_raw_buffer_store_i32x4 (int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
 
__device__ void llvm_amdgcn_raw_buffer_store_fp16 (half_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
 
__device__ void llvm_amdgcn_raw_buffer_store_fp16x2 (half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
 
__device__ void llvm_amdgcn_raw_buffer_store_fp16x4 (half4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
 
__device__ void llvm_amdgcn_raw_buffer_store_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
 
__device__ void llvm_amdgcn_raw_buffer_store_fp32x2 (float2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
 
__device__ void llvm_amdgcn_raw_buffer_store_fp32x4 (float4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
 
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2 (half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
 
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
 
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
 
__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64 (double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
 
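A minimal device-side sketch composing make_wave_buffer_resource with one of the raw buffer-load intrinsics above (the wrapper name is hypothetical; voffset/soffset are byte offsets):

    __device__ float buffer_load_one(const float* p_wave,
                                     ck::index_t element_space_size,
                                     ck::index_t element_offset)
    {
        // 128-bit buffer resource descriptor covering the wave's window.
        const ck::int32x4_t rsrc = ck::make_wave_buffer_resource(
            const_cast<float*>(p_wave), element_space_size);
        // glc_slc = 0 selects the default cache behavior.
        return ck::llvm_amdgcn_raw_buffer_load_fp32(
            rsrc, static_cast<ck::index_t>(element_offset * sizeof(float)), 0, 0);
    }
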
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< T, N >::type amd_buffer_load_impl (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl_raw (const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N>
__device__ void amd_global_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, T *addr)
 
template<typename T , index_t N>
__device__ void amd_buffer_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N>
__device__ void amd_buffer_atomic_max_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_zero (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_customized_value (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
template<typename T , index_t N>
__device__ void amd_buffer_atomic_add (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
template<typename T , index_t N>
__device__ void amd_buffer_atomic_max (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
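The amd_buffer_* wrappers add per-element validity handling on top of the raw intrinsics; a sketch of a guarded vector load (sizes illustrative):

    // Loads 4 floats; a lane whose element is flagged invalid receives zeros
    // instead of reading out of range.
    __device__ ck::float4_t guarded_load4(const float* p_src_wave,
                                          ck::index_t element_offset,
                                          bool element_valid,
                                          ck::index_t element_space_size)
    {
        return ck::amd_buffer_load_invalid_element_return_zero<float, 4>(
            p_src_wave, element_offset, element_valid, element_space_size);
    }
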
__device__ void llvm_amdgcn_raw_buffer_load_lds (int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
 
template<typename T , index_t NumElemsPerThread>
__device__ void amd_direct_load_global_to_lds (const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)
 
template<typename T >
__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_new (T *p_wave, index_t element_space_size)
 
template<typename T >
__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_with_default_range_new (T *p_wave)
 
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw (__amdgpu_buffer_rsrc_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< T, N >::type amd_buffer_load_impl (__amdgpu_buffer_rsrc_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl_raw (const typename vector_type< int8_t, N >::type src_thread_data, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl (const typename vector_type< T, N >::type src_thread_data, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<>
__host__ constexpr __device__ bool fp8_is_nan (f8_ocp_t a)
 
template<>
__host__ constexpr __device__ bool fp8_is_nan (bf8_ocp_t a)
 
template<>
__host__ constexpr __device__ bool fp8_is_nan (f8_fnuz_t a)
 
template<>
__host__ constexpr __device__ bool fp8_is_nan (bf8_fnuz_t a)
 
template<>
__host__ constexpr __device__ bool fp8_is_inf (bf8_ocp_t a)
 
__device__ int amd_assembly_and_b32 (int a, int b)
 
__device__ int amd_assembly_and_or_b32 (int a, int b, int d)
 
__device__ half2_t amd_assembly_pk_fma_f16 (half2_t a, half2_t b, half2_t c)
 
__device__ half2_t amd_assembly_pk_add_f16 (half2_t a, half2_t b)
 
__device__ float amd_assemble_cvt_f32_i4 (int b)
 
__device__ f8x4_t amd_assembly_cvt_f8_to_f32 (float b0, float b1, float b2, float b3)
 
__device__ f8x8_t amd_assembly_i4_to_fp8x8 (int a)
 
__device__ void amd_assembly_outer_product_1x2 (float a, float b0, float b1, float &c0, float &c1)
 
__device__ void amd_assembly_outer_product_1x4 (float a, float b0, float b1, float b2, float b3, float &c0, float &c1, float &c2, float &c3)
 
__device__ void amd_assembly_outer_product_1x2 (half2_t a, half2_t b0, half2_t b1, float &c0, float &c1)
 
__device__ void amd_assembly_outer_product_1x4 (half2_t a, half2_t b0, half2_t b1, half2_t b2, half2_t b3, float &c0, float &c1, float &c2, float &c3)
 
__device__ void amd_assembly_outer_product_1x2 (int8x4_t a, int8x4_t b0, int8x4_t b1, int32_t &c0, int32_t &c1)
 
__device__ void amd_assembly_outer_product_1x4 (int8x4_t a, int8x4_t b0, int8x4_t b1, int8x4_t b2, int8x4_t b3, int32_t &c0, int32_t &c1, int32_t &c2, int32_t &c3)
 
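A sketch for the 1x4 float variant, assuming the usual FMA accumulate-into-c convention:

    // c[j] += a * b[j] for j = 0..3, using packed FMA instructions.
    __device__ void outer_1x4(float a, const float (&b)[4], float (&c)[4])
    {
        ck::amd_assembly_outer_product_1x4(a, b[0], b[1], b[2], b[3],
                                           c[0], c[1], c[2], c[3]);
    }
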
__device__ uint32_t amd_wave_read_first_lane (uint32_t value)
 
__device__ int32_t amd_wave_read_first_lane (int32_t value)
 
__device__ int64_t amd_wave_read_first_lane (int64_t value)
 
template<typename Object , typename = ck::enable_if_t<ck::is_class_v<Object> && ck::is_trivially_copyable_v<Object>>>
__device__ auto amd_wave_read_first_lane (const Object &obj)
 
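A typical use of amd_wave_read_first_lane is forcing a wave-uniform value into scalar registers; a minimal sketch:

    // Every lane receives lane 0's copy, so downstream address arithmetic can
    // stay in SGPRs (the input should already be wave-uniform for the result
    // to be meaningful).
    __device__ ck::index_t make_uniform(ck::index_t v)
    {
        return ck::amd_wave_read_first_lane(v);
    }
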
template<typename X , typename... Xs>
__host__ constexpr __device__ auto make_array (X &&x, Xs &&... xs)
 
template<typename X >
__host__ constexpr __device__ auto make_array ()
 
template<typename... Xs>
__host__ constexpr __device__ auto make_multi_index (Xs &&... xs)
 
template<index_t NSize>
__host__ constexpr __device__ auto make_zero_multi_index ()
 
template<typename T >
__host__ constexpr __device__ auto to_multi_index (const T &x)
 
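A minimal sketch of the MultiIndex helpers (values illustrative); the element-wise operators declared below apply to the results:

    constexpr auto idx  = ck::make_multi_index(2, 0, 5);  // MultiIndex<3>
    constexpr auto zero = ck::make_zero_multi_index<3>(); // {0, 0, 0}
    constexpr auto sum  = idx + zero;                     // element-wise: {2, 0, 5}
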
template<index_t NSize, typename X >
__host__ constexpr __device__ auto operator+= (MultiIndex< NSize > &y, const X &x)
 
template<index_t NSize, typename X >
__host__ constexpr __device__ auto operator-= (MultiIndex< NSize > &y, const X &x)
 
template<index_t NSize, typename T >
__host__ constexpr __device__ auto operator+ (const MultiIndex< NSize > &a, const T &b)
 
template<index_t NSize, typename T >
__host__ constexpr __device__ auto operator- (const MultiIndex< NSize > &a, const T &b)
 
template<index_t NSize, typename T >
__host__ constexpr __device__ auto operator* (const MultiIndex< NSize > &a, const T &b)
 
template<typename PY , typename PX , typename enable_if< is_pointer_v< PY > && is_pointer_v< PX >, bool >::type = false>
__host__ __device__ PY c_style_pointer_cast (PX p_x)
 
template<typename Arr , typename Picks , typename X >
__host__ constexpr __device__ auto operator+= (ContainerElementPicker< Arr, Picks > &y, const X &x)
 
template<typename Arr , typename Picks , typename X >
__host__ constexpr __device__ auto operator-= (ContainerElementPicker< Arr, Picks > &y, const X &x)
 
template<typename Arr , typename Picks >
__host__ constexpr __device__ auto pick_container_element (Arr &a, Picks)
 
template<typename Arr , typename Picks >
__host__ constexpr __device__ auto pick_container_element (const Arr &a, Picks)
 
template<typename TData , index_t NSize>
__host__ constexpr __device__ auto container_push_back (const Array< TData, NSize > &a, const TData &x)
 
template<typename... Ts, typename T >
__host__ constexpr __device__ auto container_push_front (const Tuple< Ts... > &a, const T &x)
 
template<typename... Ts, typename T >
__host__ constexpr __device__ auto container_push_back (const Tuple< Ts... > &a, const T &x)
 
template<typename TData , index_t NSize, index_t... IRs>
__host__ constexpr __device__ auto container_reorder_given_new2old (const Array< TData, NSize > &old_array, Sequence< IRs... >)
 
template<typename TData , index_t NSize, index_t... IRs>
__host__ constexpr __device__ auto container_reorder_given_old2new (const Array< TData, NSize > &old_array, Sequence< IRs... > old2new)
 
template<typename... Ts, index_t... IRs>
__host__ constexpr __device__ auto container_reorder_given_new2old (const Tuple< Ts... > &old_tuple, Sequence< IRs... >)
 
template<typename... Ts, index_t... IRs>
__host__ constexpr __device__ auto container_reorder_given_old2new (const Tuple< Ts... > &old_tuple, Sequence< IRs... > old2new)
 
template<index_t... Is, index_t... IRs>
__host__ constexpr __device__ auto container_reorder_given_new2old (Sequence< Is... >, Sequence< IRs... >)
 
template<index_t... Is, index_t... IRs>
__host__ constexpr __device__ auto container_reorder_given_old2new (Sequence< Is... > old_seq, Sequence< IRs... >)
 
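A sketch of the reorder semantics, assuming new2old maps each new position to the old position it reads from:

    constexpr auto a = ck::make_array(10, 20, 30);
    // Result element i is a[IRs[i]]:
    constexpr auto b = ck::container_reorder_given_new2old(a, ck::Sequence<2, 0, 1>{});
    // b == {30, 10, 20}
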
template<typename Container , typename Reduce , typename Init , index_t IBegin = 0, index_t IEnd = Container::Size(), index_t IStep = 1>
__host__ constexpr __device__ auto container_reduce (const Container &x, Reduce reduce, Init init, Number< IBegin > = Number< 0 >{}, Number< IEnd > = Number< Container::Size()>{}, Number< IStep > = Number< 1 >{})
 
template<typename TData , index_t NSize, typename Reduce >
__host__ constexpr __device__ auto container_reverse_inclusive_scan (const Array< TData, NSize > &x, Reduce f, TData init)
 
template<typename TData , index_t NSize, typename Reduce >
__host__ constexpr __device__ auto container_reverse_exclusive_scan (const Array< TData, NSize > &x, Reduce f, TData init)
 
template<index_t... Is, typename Reduce , index_t Init>
__host__ constexpr __device__ auto container_reverse_exclusive_scan (const Sequence< Is... > &seq, Reduce f, Number< Init >)
 
template<typename... Xs, typename Reduce , typename Init >
__host__ constexpr __device__ auto container_reverse_exclusive_scan (const Tuple< Xs... > &x, Reduce reduce, Init init)
 
template<typename... Xs, typename Reduce , typename TData >
__host__ constexpr __device__ auto container_reverse_inclusive_scan (const Tuple< Xs... > &x, Reduce f, TData init)
 
template<typename X , typename... Ys>
__host__ constexpr __device__ auto container_concat (const X &x, const Ys &... ys)
 
template<typename T , index_t NX, index_t NY>
__host__ constexpr __device__ auto container_concat (const Array< T, NX > &ax, const Array< T, NY > &ay)
 
template<typename... X, typename... Y>
__host__ constexpr __device__ auto container_concat (const Tuple< X... > &tx, const Tuple< Y... > &ty)
 
template<typename Container >
__host__ constexpr __device__ auto container_concat (const Container &x)
 
template<typename T , index_t N, index_t... Is>
__host__ constexpr __device__ auto get_container_subset (const Array< T, N > &arr, Sequence< Is... >)
 
template<typename... Ts, index_t... Is>
__host__ constexpr __device__ auto get_container_subset (const Tuple< Ts... > &tup, Sequence< Is... >)
 
template<typename T , index_t N, index_t... Is>
__host__ constexpr __device__ void set_container_subset (Array< T, N > &y, Sequence< Is... > picks, const Array< T, sizeof...(Is)> &x)
 
template<typename... Ys, index_t... Is, typename... Xs>
__host__ constexpr __device__ void set_container_subset (Tuple< Ys... > &y, Sequence< Is... > picks, const Tuple< Xs... > &x)
 
template<index_t... Is>
__host__ constexpr __device__ auto sequence_to_tuple_of_number (Sequence< Is... >)
 
constexpr auto next_pow2 (uint32_t x)
 
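next_pow2 rounds a 32-bit value up to the nearest power of two, with powers of two mapping to themselves; for example:

    constexpr uint32_t a = ck::next_pow2(5);  // 8
    constexpr uint32_t b = ck::next_pow2(64); // 64
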
template<typename T >
constexpr bool is_native_type ()
 
template<typename T >
const char * get_type_name ()
 
template<typename T , index_t N>
__host__ constexpr __device__ auto make_vector_type (Number< N >)
 
template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize >
__host__ constexpr __device__ auto make_dynamic_buffer (T *p, ElementSpaceSize element_space_size)
 
template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize >
__host__ constexpr __device__ auto make_long_dynamic_buffer (T *p, ElementSpaceSize element_space_size)
 
template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize , typename X , typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type = false>
__host__ constexpr __device__ auto make_dynamic_buffer (T *p, ElementSpaceSize element_space_size, X invalid_element_value)
 
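A minimal sketch of wrapping a raw global pointer (the address-space template argument selects the addressing path, e.g. buffer instructions for Global):

    __device__ void consume(float* p_global, ck::index_t n)
    {
        auto buf = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(p_global, n);
        // Accesses through buf use n as the buffer's element-space size.
        (void)buf;
    }
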
template<class EnvVar >
const std::string & EnvGetString (EnvVar)
 
template<class EnvVar >
bool EnvIsEnabled (EnvVar)
 
template<class EnvVar >
bool EnvIsDisabled (EnvVar)
 
template<class EnvVar >
uint64_t EnvValue (EnvVar)
 
template<class EnvVar >
bool EnvIsUnset (EnvVar)
 
template<class EnvVar >
void EnvUnset (EnvVar)
 
template<typename EnvVar , typename ValueType >
void UpdateEnvVar (EnvVar, const ValueType &val)
 Updates the cached value of an environment variable. More...
 
template<typename EnvVar >
void UpdateEnvVar (EnvVar, const std::string_view &val)
 
__host__ int clz (uint32_t x)
 
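clz counts the leading zero bits of its 32-bit argument on the host; for example:

    const int a = ck::clz(1u);          // 31
    const int b = ck::clz(0x80000000u); // 0
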
template<bool predicate, typename X , typename Y >
constexpr auto conditional_expr (X &&x, Y &&y)
 
template<typename F , typename X >
__host__ constexpr __device__ auto unpack (F &&f, X &&x)
 
template<typename F , typename X , typename Y >
__host__ constexpr __device__ auto unpack2 (F &&f, X &&x, Y &&y)
 
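unpack applies a callable to the unpacked elements of a tuple-like container (unpack2 does the same over two containers); a minimal sketch:

    constexpr auto sum3 = [](int a, int b, int c) { return a + b + c; };
    constexpr auto s    = ck::unpack(sum3, ck::make_tuple(1, 2, 3)); // s == 6
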
template<typename X >
__device__ X atomic_add (X *p_dst, const X &x)
 
template<>
__device__ int32_t atomic_add< int32_t > (int32_t *p_dst, const int32_t &x)
 
template<>
__device__ uint32_t atomic_add< uint32_t > (uint32_t *p_dst, const uint32_t &x)
 
template<>
__device__ float atomic_add< float > (float *p_dst, const float &x)
 
template<>
__device__ unsigned short atomic_add< unsigned short > (unsigned short *p_dst, const unsigned short &x)
 
template<>
__device__ _Float16 atomic_add< _Float16 > (_Float16 *p_dst, const _Float16 &x)
 
template<>
__device__ double atomic_add< double > (double *p_dst, const double &x)
 
template<>
__device__ float2_t atomic_add< float2_t > (float2_t *p_dst, const float2_t &x)
 
template<>
__device__ double2_t atomic_add< double2_t > (double2_t *p_dst, const double2_t &x)
 
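The atomic_add specializations map to the native atomic for each type; a sketch, assuming the usual fetch-add semantics:

    __device__ void accumulate(float* p_dst, float v)
    {
        // Atomically performs *p_dst += v.
        (void)ck::atomic_add(p_dst, v);
    }
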
template<typename X >
__device__ X atomic_max (X *p_dst, const X &x)
 
template<>
__device__ int32_t atomic_max< int32_t > (int32_t *p_dst, const int32_t &x)
 
template<>
__device__ uint32_t atomic_max< uint32_t > (uint32_t *p_dst, const uint32_t &x)
 
template<>
__device__ float atomic_max< float > (float *p_dst, const float &x)
 
template<>
__device__ double atomic_max< double > (double *p_dst, const double &x)
 
template<>
__device__ float2_t atomic_max< float2_t > (float2_t *p_dst, const float2_t &x)
 
constexpr __device__ index_t get_warp_size ()
 
__device__ index_t get_thread_local_1d_id ()
 
__device__ index_t get_thread_global_1d_id ()
 
__device__ index_t get_warp_local_1d_id ()
 
__device__ index_t get_block_1d_id ()
 
__device__ index_t get_grid_size ()
 
__device__ index_t get_block_size ()
 
template<>
constexpr __device__ index_t get_shift< 1 > ()
 
template<typename TA , typename TB , typename TC >
__device__ void inner_product (const TA &a, const TB &b, TC &c)
 
template<>
__device__ void inner_product< float, float, float > (const float &a, const float &b, float &c)
 
template<>
__device__ void inner_product< float2_t, float2_t, float > (const float2_t &a, const float2_t &b, float &c)
 
template<>
__device__ void inner_product< float4_t, float4_t, float > (const float4_t &a, const float4_t &b, float &c)
 
template<>
__device__ void inner_product< bhalf_t, bhalf_t, float > (const bhalf_t &a, const bhalf_t &b, float &c)
 
template<>
__device__ void inner_product< half_t, half_t, float > (const half_t &a, const half_t &b, float &c)
 
template<>
__device__ void inner_product< half2_t, half2_t, float > (const half2_t &a, const half2_t &b, float &c)
 
template<>
__device__ void inner_product< half4_t, half4_t, float > (const half4_t &a, const half4_t &b, float &c)
 
template<>
__device__ void inner_product< half8_t, half8_t, float > (const half8_t &a, const half8_t &b, float &c)
 
template<>
__device__ void inner_product< int8_t, int8_t, int32_t > (const int8_t &a, const int8_t &b, int32_t &c)
 
template<>
__device__ void inner_product< int8x2_t, int8x2_t, int32_t > (const int8x2_t &a, const int8x2_t &b, int32_t &c)
 
template<>
__device__ void inner_product< int8x4_t, int8x4_t, int32_t > (const int8x4_t &a, const int8x4_t &b, int32_t &c)
 
template<>
__device__ void inner_product< int8x8_t, int8x8_t, int32_t > (const int8x8_t &a, const int8x8_t &b, int32_t &c)
 
template<>
__device__ void inner_product< int8x16_t, int8x16_t, int32_t > (const int8x16_t &a, const int8x16_t &b, int32_t &c)
 
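inner_product accumulates the dot product of its two vector operands into c; a sketch for the packed-half variant (assuming the accumulate-into-c convention):

    __device__ void dot_acc(const ck::half4_t& a, const ck::half4_t& b, float& c)
    {
        // c += a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]
        ck::inner_product<ck::half4_t, ck::half4_t, float>(a, b, c);
    }
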
template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto operator+ (integral_constant< TX, X >, integral_constant< TY, Y >)
 
template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto operator- (integral_constant< TX, X >, integral_constant< TY, Y >)
 
template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto operator* (integral_constant< TX, X >, integral_constant< TY, Y >)
 
template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto operator/ (integral_constant< TX, X >, integral_constant< TY, Y >)
 
template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto operator% (integral_constant< TX, X >, integral_constant< TY, Y >)
 
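Arithmetic on integral_constant stays entirely in the type system; for example:

    constexpr auto n = ck::Number<6>{} * ck::Number<7>{};
    static_assert(n.value == 42, "product is computed at compile time");
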
constexpr LoopScheduler make_default_loop_scheduler ()
 
template<typename Y , typename X >
__host__ constexpr __device__ Y mxf8_convert_sr (X x, float scale)
 
template<typename Y , typename X >
__host__ constexpr __device__ Y mxf8_convert_rne (X x, float scale)
 
template<>
__host__ __device__ f8_ocp_t mxf8_convert_rne< f8_ocp_t, float > (float x, float scale)
 
template<>
__host__ __device__ bf8_ocp_t mxf8_convert_rne< bf8_ocp_t, float > (float x, float scale)
 
template<>
__host__ __device__ f8x2_ocp_t mxf8_convert_rne< f8x2_ocp_t, float2_t > (float2_t x, float scale)
 
template<>
__host__ __device__ bf8x2_ocp_t mxf8_convert_rne< bf8x2_ocp_t, float2_t > (float2_t x, float scale)
 
template<>
__host__ __device__ f8x16_ocp_t mxf8_convert_rne< f8x16_ocp_t, float16_t > (float16_t x, float scale)
 
template<>
__host__ __device__ bf8x16_ocp_t mxf8_convert_rne< bf8x16_ocp_t, float16_t > (float16_t x, float scale)
 
template<>
__host__ __device__ f8x32_ocp_t mxf8_convert_rne< f8x32_ocp_t, float32_t > (float32_t x, float scale)
 
template<>
__host__ __device__ bf8x32_ocp_t mxf8_convert_rne< bf8x32_ocp_t, float32_t > (float32_t x, float scale)
 
template<>
__host__ __device__ f8_ocp_t mxf8_convert_sr< f8_ocp_t, float > (float x, float scale)
 
template<>
__host__ __device__ bf8_ocp_t mxf8_convert_sr< bf8_ocp_t, float > (float x, float scale)
 
template<>
__host__ __device__ f8x2_ocp_t mxf8_convert_sr< f8x2_ocp_t, float2_t > (float2_t x, float scale)
 
template<>
__host__ __device__ bf8x2_ocp_t mxf8_convert_sr< bf8x2_ocp_t, float2_t > (float2_t x, float scale)
 
template<>
__host__ __device__ f8x16_ocp_t mxf8_convert_sr< f8x16_ocp_t, float16_t > (float16_t x, float scale)
 
template<>
__host__ __device__ bf8x16_ocp_t mxf8_convert_sr< bf8x16_ocp_t, float16_t > (float16_t x, float scale)
 
template<>
__host__ __device__ f8x32_ocp_t mxf8_convert_sr< f8x32_ocp_t, float32_t > (float32_t x, float scale)
 
template<>
__host__ __device__ bf8x32_ocp_t mxf8_convert_sr< bf8x32_ocp_t, float32_t > (float32_t x, float scale)
 
template<typename T , uint32_t seed_t, ck::enable_if_t< is_same< float, T >{}, bool > = false>
__host__ __device__ uint32_t prand_generator (index_t id, T val, uint32_t seed=seed_t)
 
template<typename T , uint32_t seed_t, ck::enable_if_t<!(is_same< float, T >{} || is_same< _Float16, T >{}), bool > = false>
__host__ __device__ uint32_t prand_generator (int id, T val, uint32_t seed=seed_t)
 
template<typename Y , typename X >
constexpr __host__ Y scaled_type_convert (e8m0_bexp_t scale, X x)
 
template<>
__host__ float scaled_type_convert< float, f8_ocp_t > (e8m0_bexp_t scale, f8_ocp_t x)
 
template<>
__host__ float scaled_type_convert< float, bf8_ocp_t > (e8m0_bexp_t scale, bf8_ocp_t x)
 
template<>
__host__ float2_t scaled_type_convert< float2_t, f8x2_ocp_t > (e8m0_bexp_t scale, f8x2_ocp_t x)
 
template<>
__host__ float2_t scaled_type_convert< float2_t, bf8x2_ocp_t > (e8m0_bexp_t scale, bf8x2_ocp_t x)
 
template<>
__host__ float16_t scaled_type_convert< float16_t, f8x16_ocp_t > (e8m0_bexp_t scale, f8x16_ocp_t x)
 
template<>
__host__ float16_t scaled_type_convert< float16_t, bf8x16_ocp_t > (e8m0_bexp_t scale, bf8x16_ocp_t x)
 
template<>
__host__ float32_t scaled_type_convert< float32_t, f8x32_ocp_t > (e8m0_bexp_t scale, f8x32_ocp_t x)
 
template<>
__host__ float32_t scaled_type_convert< float32_t, bf8x32_ocp_t > (e8m0_bexp_t scale, bf8x32_ocp_t x)
 
template<>
__host__ f8_ocp_t scaled_type_convert< f8_ocp_t, float > (e8m0_bexp_t scale, float x)
 
template<>
__host__ bf8_ocp_t scaled_type_convert< bf8_ocp_t, float > (e8m0_bexp_t scale, float x)
 
template<>
__host__ f8x2_ocp_t scaled_type_convert< f8x2_ocp_t, float2_t > (e8m0_bexp_t scale, float2_t x)
 
template<>
__host__ bf8x2_ocp_t scaled_type_convert< bf8x2_ocp_t, float2_t > (e8m0_bexp_t scale, float2_t x)
 
template<>
__host__ f8x16_ocp_t scaled_type_convert< f8x16_ocp_t, float16_t > (e8m0_bexp_t scale, float16_t x)
 
template<>
__host__ bf8x16_ocp_t scaled_type_convert< bf8x16_ocp_t, float16_t > (e8m0_bexp_t scale, float16_t x)
 
template<>
__host__ f8x32_ocp_t scaled_type_convert< f8x32_ocp_t, float32_t > (e8m0_bexp_t scale, float32_t x)
 
template<>
__host__ bf8x32_ocp_t scaled_type_convert< bf8x32_ocp_t, float32_t > (e8m0_bexp_t scale, float32_t x)
 
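A host-side sketch of the scaled conversions, assuming the OCP MX convention in which values are quantized relative to a shared e8m0 block scale (rounding and saturation details are implementation-defined):

    __host__ float quantize_roundtrip(float x, ck::e8m0_bexp_t scale)
    {
        const auto q = ck::scaled_type_convert<ck::f8_ocp_t, float>(scale, x); // float -> f8
        return ck::scaled_type_convert<float, ck::f8_ocp_t>(scale, q);        // f8 -> float
    }
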
template<index_t I, index_t... Is>
__host__ constexpr __device__ auto sequence_pop_front (Sequence< I, Is... >)
 
template<typename Seq >
__host__ constexpr __device__ auto sequence_pop_back (Seq)
 
template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ bool operator== (Sequence< Xs... >, Sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto operator+ (Sequence< Xs... >, Sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto operator- (Sequence< Xs... >, Sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto operator* (Sequence< Xs... >, Sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto operator/ (Sequence< Xs... >, Sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto operator% (Sequence< Xs... >, Sequence< Ys... >)
 
template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto operator+ (Sequence< Xs... >, Number< Y >)
 
template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto operator- (Sequence< Xs... >, Number< Y >)
 
template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto operator* (Sequence< Xs... >, Number< Y >)
 
template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto operator/ (Sequence< Xs... >, Number< Y >)
 
template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto operator% (Sequence< Xs... >, Number< Y >)
 
template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto operator+ (Number< Y >, Sequence< Xs... >)
 
template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto operator- (Number< Y >, Sequence< Xs... >)
 
template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto operator* (Number< Y >, Sequence< Xs... >)
 
template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto operator/ (Number< Y >, Sequence< Xs... >)
 
template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto operator% (Number< Y >, Sequence< Xs... >)
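
The operator overloads above perform element-wise arithmetic between two Sequences, or broadcast a Number constant against every element of a Sequence. A minimal standalone analog (using a toy Seq type rather than ck::Sequence) illustrating the element-wise semantics:

#include <type_traits>

template <int... Is>
struct Seq {};

// Element-wise sum of two equal-length sequences.
template <int... Xs, int... Ys>
constexpr auto operator+(Seq<Xs...>, Seq<Ys...>) { return Seq<(Xs + Ys)...>{}; }

// Broadcast: add the same constant to every element.
template <int Y, int... Xs>
constexpr auto broadcast_add(Seq<Xs...>) { return Seq<(Xs + Y)...>{}; }

static_assert(std::is_same_v<decltype(Seq<1, 2, 3>{} + Seq<10, 20, 30>{}), Seq<11, 22, 33>>);
static_assert(std::is_same_v<decltype(broadcast_add<5>(Seq<1, 2>{})), Seq<6, 7>>);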
 
template<typename... Seqs>
__host__ constexpr __device__ auto merge_sequences (Seqs...)
 
template<typename F , index_t... Xs>
__host__ constexpr __device__ auto transform_sequences (F f, Sequence< Xs... >)
 
template<typename F , index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto transform_sequences (F f, Sequence< Xs... >, Sequence< Ys... >)
 
template<typename F , index_t... Xs, index_t... Ys, index_t... Zs>
__host__ constexpr __device__ auto transform_sequences (F f, Sequence< Xs... >, Sequence< Ys... >, Sequence< Zs... >)
 
template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ auto reverse_inclusive_scan_sequence (Seq, Reduce, Number< Init >)
 
template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ auto reverse_exclusive_scan_sequence (Seq, Reduce, Number< Init >)
 
template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ auto inclusive_scan_sequence (Seq, Reduce, Number< Init >)
 
template<typename Seq , index_t... Is>
__host__ constexpr __device__ auto pick_sequence_elements_by_ids (Seq, Sequence< Is... >)
 
template<typename Seq , typename Mask >
__host__ constexpr __device__ auto pick_sequence_elements_by_mask (Seq, Mask)
 
template<typename Seq , typename Values , typename Ids >
__host__ constexpr __device__ auto modify_sequence_elements_by_ids (Seq, Values, Ids)
 
template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ index_t reduce_on_sequence (Seq, Reduce f, Number< Init >)
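
reduce_on_sequence folds every element of a compile-time Sequence with a binary functor, starting from an initial Number. A standalone sketch of the same fold over a parameter pack (not the CK implementation):

template <int... Is>
struct Seq {};

// Left fold with +, starting from Init; mirrors the reduce contract above.
template <int Init, int... Is>
constexpr int reduce_add(Seq<Is...>) { return (Init + ... + Is); }

static_assert(reduce_add<100>(Seq<1, 2, 3>{}) == 106); // ((100 + 1) + 2) + 3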
 
template<typename Seq , typename F >
__host__ constexpr __device__ bool sequence_any_of (Seq, F f)
 
template<typename Seq , typename F >
__host__ constexpr __device__ bool sequence_all_of (Seq, F f)
 
template<index_t... Is>
__host__ constexpr __device__ auto make_sequence (Number< Is >...)
 
template<typename F , index_t N>
__host__ constexpr __device__ auto generate_sequence (F, Number< N >)
 
template<typename F , index_t N>
__host__ constexpr __device__ auto generate_sequence_v2 (F &&f, Number< N >)
 
template<index_t... Is>
__host__ constexpr __device__ auto to_sequence (Tuple< Number< Is >... >)
 
template<AddressSpaceEnum AddressSpace, typename T , index_t N>
__host__ constexpr __device__ auto make_static_buffer (Number< N >)
 
template<AddressSpaceEnum AddressSpace, typename T , long_index_t N>
__host__ constexpr __device__ auto make_static_buffer (LongNumber< N >)
 
template<typename X , typename... Xs>
__host__ constexpr __device__ auto make_statically_indexed_array (const X &x, const Xs &... xs)
 
template<typename X >
__host__ constexpr __device__ auto make_statically_indexed_array ()
 
template<typename... Ys, typename X , enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false>
__host__ constexpr __device__ auto operator+= (Tuple< Ys... > &y, const X &x)
 
template<typename... Ys, typename X , enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false>
__host__ constexpr __device__ auto operator-= (Tuple< Ys... > &y, const X &x)
 
template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto operator+ (const Tuple< Xs... > &x, const Y &y)
 
template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto operator- (const Tuple< Xs... > &x, const Y &y)
 
template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto operator* (const Tuple< Xs... > &x, const Y &y)
 
template<typename... Xs, typename Y , enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto operator* (Y a, const Tuple< Xs... > &x)
 
template<typename... Xs, typename Y , enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto operator* (const Tuple< Xs... > &x, Y a)
 
template<typename... Xs>
__host__ __device__ void print_multi_index (const Tuple< Xs... > &x)
 
__device__ void block_sync_lds ()
 
__device__ void block_sync_lds_direct_load ()
 
__device__ void s_nop ()
 
__device__ void transpose_fp16_2x2 (const half2_t &x0, const half2_t &x1, half2_t &y0, half2_t &y1)
 
__device__ void transpose_int8_4x4 (const int8x4_t &x0, const int8x4_t &x1, const int8x4_t &x2, const int8x4_t &x3, int8x4_t &y0, int8x4_t &y1, int8x4_t &y2, int8x4_t &y3)
 
__device__ void transpose_f8_4x4 (const f8x4_t &x0, const f8x4_t &x1, const f8x4_t &x2, const f8x4_t &x3, f8x4_t &y0, f8x4_t &y1, f8x4_t &y2, f8x4_t &y3)
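
The transpose helpers above exchange elements between packed registers: transpose_fp16_2x2 treats its two half2_t inputs as the rows of a 2x2 matrix and writes out its columns, and the 4x4 variants generalize this to four 4-element rows. A standalone sketch of the 2x2 contract, using std::array<float, 2> as a stand-in for half2_t:

#include <array>
#include <cassert>

using pair_t = std::array<float, 2>; // stand-in for half2_t

// Given rows x0 = {a0, a1} and x1 = {b0, b1}, produce columns y0 = {a0, b0} and y1 = {a1, b1}.
void transpose_2x2(const pair_t& x0, const pair_t& x1, pair_t& y0, pair_t& y1)
{
    y0 = {x0[0], x1[0]};
    y1 = {x0[1], x1[1]};
}

int main()
{
    pair_t x0{1, 2}, x1{3, 4}, y0{}, y1{};
    transpose_2x2(x0, x1, y0, y1);
    assert(y0[1] == 3 && y1[0] == 2);
    return 0;
}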
 
template<typename... Xs>
__host__ constexpr __device__ auto make_tuple (Xs &&... xs)
 
template<typename... Args>
constexpr Tuple< Args &... > tie (Args &... args) noexcept
 
template<typename F , index_t... ids>
__host__ constexpr __device__ auto generate_tuple_for (F &&f, Sequence< ids... >)
 
template<typename F , index_t N>
__host__ constexpr __device__ auto generate_tuple (F &&f, Number< N >)
 
template<typename F , index_t N>
__host__ constexpr __device__ auto generate_tuple (F &&f, LongNumber< N >)
 
template<typename F , index_t N>
__host__ constexpr __device__ auto generate_tie (F &&f, Number< N >)
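
generate_tuple builds a Tuple whose I-th element is f(Number<I>{}) for I in [0, N). A standalone analog built on std::tuple and std::index_sequence (not the CK types), shown to make the index-driven construction concrete:

#include <cstddef>
#include <tuple>
#include <type_traits>
#include <utility>

template <typename F, std::size_t... Is>
constexpr auto generate_tuple_impl(F&& f, std::index_sequence<Is...>)
{
    // Element I is f(integral_constant<size_t, I>{}), analogous to f(Number<I>{}).
    return std::make_tuple(f(std::integral_constant<std::size_t, Is>{})...);
}

template <std::size_t N, typename F>
constexpr auto generate_tuple_analog(F&& f)
{
    return generate_tuple_impl(std::forward<F>(f), std::make_index_sequence<N>{});
}

static_assert(std::get<2>(generate_tuple_analog<4>([](auto i) { return i() * 10; })) == 20);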
 
template<typename... X, typename... Y>
__host__ constexpr __device__ auto concat_tuple_of_reference (const Tuple< X &... > &tx, const Tuple< Y &... > &ty)
 
template<typename... X, typename... Y>
__host__ constexpr __device__ auto concat_tuple (const Tuple< X... > &tx, const Tuple< Y... > &ty)
 
template<typename... X>
__host__ constexpr __device__ auto concat_tuple (const Tuple< X... > &tx)
 
template<typename... X, typename... Tuples>
__host__ constexpr __device__ auto concat_tuple (const Tuple< X... > &tx, const Tuples &... tuples)
 
template<typename F , typename X >
__host__ constexpr __device__ auto transform_tuples (F f, const X &x)
 
template<typename F , typename X , typename Y >
__host__ constexpr __device__ auto transform_tuples (F f, const X &x, const Y &y)
 
template<typename F , typename X , typename Y , typename Z >
__host__ constexpr __device__ auto transform_tuples (F f, const X &x, const Y &y, const Z &z)
 
template<index_t Depth = 0, index_t MaxDepth = -1>
__host__ constexpr __device__ auto UnrollNestedTuple (const Tuple<> &element)
 
template<index_t Depth = 0, index_t MaxDepth = -1, typename T >
__host__ constexpr __device__ auto UnrollNestedTuple (const T &element)
 
template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts>
__host__ constexpr __device__ auto UnrollNestedTuple (const Tuple< Ts... > &tuple)
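
The three UnrollNestedTuple overloads flatten an arbitrarily nested Tuple into a single flat one (with Depth/MaxDepth bounding the recursion). A standalone analog for std::tuple that captures the recursive flattening, without the depth controls:

#include <tuple>
#include <type_traits>

template <typename T>
auto unroll(const T& t) { return std::make_tuple(t); } // non-tuple leaf

template <typename... Ts>
auto unroll(const std::tuple<Ts...>& t)
{
    // Flatten each element, then concatenate the results.
    return std::apply([](const auto&... es) { return std::tuple_cat(unroll(es)...); }, t);
}

static_assert(std::is_same_v<
    decltype(unroll(std::make_tuple(1, std::make_tuple(2.0, 'c')))),
    std::tuple<int, double, char>>);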
 
template<typename... Ts>
__host__ constexpr __device__ auto TupleReverse (const Tuple< Ts... > &tuple)
 
template<index_t Idx, index_t End, typename F , typename... Ts>
__host__ constexpr __device__ auto TupleReduce (F &&f, const Tuple< Ts... > &tuple)
 
template<typename... Ts>
__host__ constexpr __device__ auto IsNestedTuple (const Tuple< Ts... > &)
 
template<index_t depth = 0, typename T >
__host__ constexpr __device__ auto TupleDepth (const T &)
 
template<index_t depth = 0, typename... Ts>
__host__ constexpr __device__ auto TupleDepth (const Tuple< Ts... > &)
 
template<index_t from, index_t to, typename... Ts>
__host__ constexpr __device__ auto TupleSlice (const Tuple< Ts... > &tuple)
 
template<typename Y , typename X , typename enable_if< sizeof(X)==sizeof(Y), bool >::type = false>
__host__ constexpr __device__ Y bit_cast (const X &x)
 
template<typename Y , typename X >
__host__ constexpr __device__ Y bf16_convert_rtn (X x)
 
template<>
__host__ constexpr __device__ bhalf_t bf16_convert_rtn< bhalf_t, float > (float x)
 
template<>
__host__ constexpr __device__ bhalf_t bf16_convert_rtn< bhalf_t, half_t > (half_t x)
 
template<typename Y , typename X , ck::enable_if_t<!(ck::is_const_v< Y >||ck::is_const_v< X >), bool > = false>
__host__ constexpr __device__ Y type_convert (X x)
 
template<>
__host__ constexpr __device__ float type_convert< float, bhalf_t > (bhalf_t x)
 
template<>
__host__ constexpr __device__ bhalf_t type_convert< bhalf_t, float > (float x)
 
template<>
__host__ constexpr __device__ half_t type_convert< half_t, bhalf_t > (bhalf_t x)
 
template<>
__host__ constexpr __device__ bhalf_t type_convert< bhalf_t, half_t > (half_t x)
 
template<>
__host__ constexpr __device__ int8_t type_convert< int8_t, bhalf_t > (bhalf_t x)
 
template<>
__host__ constexpr __device__ bhalf_t type_convert< bhalf_t, int8_t > (int8_t x)
 
template<>
__host__ constexpr __device__ f8_ocp_t type_convert< f8_ocp_t, int > (int x)
 
template<>
__host__ constexpr __device__ bf8_ocp_t type_convert< bf8_ocp_t, int > (int x)
 
template<typename Y , enable_if_t< is_same_v< Y, ck::tf32_t >, bool > = false>
__host__ constexpr __device__ float type_convert (float x)
 
template<typename Y , typename X >
__host__ constexpr __device__ Y type_convert_sp (X x)
 
template<>
__host__ constexpr __device__ int type_convert_sp< int, float > (float x)
 
template<>
__host__ constexpr __device__ float type_convert_sp< float, int > (int x)
 
template<>
__host__ constexpr __device__ int type_convert_sp< int, half_t > (half_t x)
 
template<>
__host__ constexpr __device__ half_t type_convert_sp< half_t, int > (int x)
 
template<>
__host__ constexpr __device__ int type_convert_sp< int, f8_t > (f8_t x)
 
template<>
__host__ constexpr __device__ f8_t type_convert_sp< f8_t, int > (int x)
 
template<>
__host__ constexpr __device__ int type_convert_sp< int, bhalf_t > (bhalf_t x)
 
template<>
__host__ constexpr __device__ bhalf_t type_convert_sp< bhalf_t, int > (int x)
 
template<>
__host__ constexpr __device__ bhalf_t type_convert_sp< bhalf_t, float > (float x)
 
template<>
__host__ constexpr __device__ half_t type_convert_sp< half_t, float > (float x)
 
template<typename Y , typename X >
__host__ constexpr __device__ Y f8_convert_sr (X x)
 
template<>
__host__ __device__ f8_fnuz_t f8_convert_sr< f8_fnuz_t, float > (float x)
 
template<>
__host__ __device__ f8_fnuz_t f8_convert_sr< f8_fnuz_t, half_t > (half_t x)
 
template<>
__host__ __device__ bf8_fnuz_t f8_convert_sr< bf8_fnuz_t, float > (float x)
 
template<>
__host__ __device__ bf8_fnuz_t f8_convert_sr< bf8_fnuz_t, half_t > (half_t x)
 
template<>
__host__ __device__ f8_ocp_t f8_convert_sr< f8_ocp_t, float > (float x)
 Converts a float to an 8-bit float type (f8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ f8x2_ocp_t f8_convert_sr< f8x2_ocp_t, float2_t > (float2_t x)
 Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ bf8_ocp_t f8_convert_sr< bf8_ocp_t, float > (float x)
 Converts a float to an 8-bit float type (bf8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_sr< bf8x2_ocp_t, float2_t > (float2_t x)
 Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ f8_ocp_t f8_convert_sr< f8_ocp_t, half_t > (half_t x)
 Converts a half_t to an 8-bit float type (f8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ f8x2_ocp_t f8_convert_sr< f8x2_ocp_t, half2_t > (half2_t x)
 Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ bf8_ocp_t f8_convert_sr< bf8_ocp_t, half_t > (half_t x)
 Converts a half_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_sr< bf8x2_ocp_t, half2_t > (half2_t x)
 Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ f8_ocp_t f8_convert_sr< f8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ f8x2_ocp_t f8_convert_sr< f8x2_ocp_t, bhalf2_t > (bhalf2_t x)
 Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ bf8_ocp_t f8_convert_sr< bf8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_sr< bf8x2_ocp_t, bhalf2_t > (bhalf2_t x)
 Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding. More...
 
template<typename Y , typename X >
__host__ constexpr __device__ Y f8_convert_rne (X x)
 
template<>
__host__ __device__ f8_fnuz_t f8_convert_rne< f8_fnuz_t, float > (float x)
 
template<>
__host__ __device__ f8_fnuz_t f8_convert_rne< f8_fnuz_t, half_t > (half_t x)
 
template<>
__host__ __device__ bf8_fnuz_t f8_convert_rne< bf8_fnuz_t, float > (float x)
 
template<>
__host__ __device__ bf8_fnuz_t f8_convert_rne< bf8_fnuz_t, half_t > (half_t x)
 
template<>
__host__ __device__ f8_ocp_t f8_convert_rne< f8_ocp_t, float > (float x)
 Converts a float to an 8-bit float type (f8_ocp_t) using rounding to nearest/even. More...
 
template<>
__host__ __device__ f8x2_ocp_t f8_convert_rne< f8x2_ocp_t, float2_t > (float2_t x)
 Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even. More...
 
template<>
__host__ __device__ bf8_ocp_t f8_convert_rne< bf8_ocp_t, float > (float x)
 Converts a float to an 8-bit float type (bf8_ocp_t) using rounding to nearest/even. More...
 
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_rne< bf8x2_ocp_t, float2_t > (float2_t x)
 Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even. More...
 
template<>
__host__ __device__ f8_ocp_t f8_convert_rne< f8_ocp_t, half_t > (half_t x)
 Converts a half_t to an 8-bit float type (f8_ocp_t) using rounding to nearest/even. More...
 
template<>
__host__ __device__ f8x2_ocp_t f8_convert_rne< f8x2_ocp_t, half2_t > (half2_t x)
 Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even. More...
 
template<>
__host__ __device__ bf8_ocp_t f8_convert_rne< bf8_ocp_t, half_t > (half_t x)
 Converts a half_t to an 8-bit float type (bf8_ocp_t) using rounding to nearest/even. More...
 
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_rne< bf8x2_ocp_t, half2_t > (half2_t x)
 Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even. More...
 
template<>
__host__ __device__ f8_ocp_t f8_convert_rne< f8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using rounding to nearest/even. More...
 
template<>
__host__ __device__ f8x2_ocp_t f8_convert_rne< f8x2_ocp_t, bhalf2_t > (bhalf2_t x)
 Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even. More...
 
template<>
__host__ __device__ bf8_ocp_t f8_convert_rne< bf8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using rounding to nearest/even. More...
 
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_rne< bf8x2_ocp_t, bhalf2_t > (bhalf2_t x)
 Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even. More...
 
template<>
__host__ __device__ f8_fnuz_t type_convert< f8_fnuz_t, float > (float x)
 
template<>
__host__ __device__ float type_convert< float, f8_fnuz_t > (f8_fnuz_t x)
 
template<>
__host__ __device__ float2_t type_convert< float2_t, f8x2_fnuz_t > (f8x2_fnuz_t x)
 
template<>
__host__ __device__ float type_convert< float, f8_ocp_t > (f8_ocp_t x)
 Converts a f8_ocp_t value to a float value. More...
 
template<>
__host__ __device__ float2_t type_convert< float2_t, f8x2_ocp_t > (f8x2_ocp_t x)
 Converts a vector of 2 f8_ocp_t values to a vector of 2 float values. More...
 
template<>
__host__ __device__ half_t type_convert< half_t, f8_ocp_t > (f8_ocp_t x)
 Converts a f8_ocp_t value to a half_t value. More...
 
template<>
__host__ __device__ half2_t type_convert< half2_t, f8x2_ocp_t > (f8x2_ocp_t x)
 Converts a vector of 2 f8_ocp_t values to a vector of 2 half_t values. More...
 
template<>
__host__ __device__ bhalf_t type_convert< bhalf_t, f8_ocp_t > (f8_ocp_t x)
 Converts a f8_ocp_t value to a bhalf_t value. More...
 
template<>
__host__ __device__ bhalf2_t type_convert< bhalf2_t, f8x2_ocp_t > (f8x2_ocp_t x)
 Converts a vector of 2 f8_ocp_t values to a vector of 2 bhalf_t values. More...
 
template<>
__host__ __device__ float type_convert< float, bf8_ocp_t > (bf8_ocp_t x)
 Converts a bf8_ocp_t value to a float value. More...
 
template<>
__host__ __device__ float2_t type_convert< float2_t, bf8x2_ocp_t > (bf8x2_ocp_t x)
 Converts a vector of 2 bf8_ocp_t values to a vector of 2 float values. More...
 
template<>
__host__ __device__ half_t type_convert< half_t, bf8_ocp_t > (bf8_ocp_t x)
 Converts a bf8_ocp_t value to a half_t value. More...
 
template<>
__host__ __device__ half2_t type_convert< half2_t, bf8x2_ocp_t > (bf8x2_ocp_t x)
 Converts a vector of 2 bf8_ocp_t values to a vector of 2 half_t values. More...
 
template<>
__host__ __device__ bhalf_t type_convert< bhalf_t, bf8_ocp_t > (bf8_ocp_t x)
 Converts a bf8_ocp_t value to a bhalf_t value. More...
 
template<>
__host__ __device__ bhalf2_t type_convert< bhalf2_t, bf8x2_ocp_t > (bf8x2_ocp_t x)
 Converts a vector of 2 bf8_ocp_t values to a vector of 2 bhalf_t values. More...
 
template<>
__host__ __device__ float2_t type_convert< float2_t, pk_i4_t > (pk_i4_t x)
 
template<>
__host__ __device__ half2_t type_convert< half2_t, pk_i4_t > (pk_i4_t x)
 
template<>
__host__ __device__ bhalf2_t type_convert< bhalf2_t, pk_i4_t > (pk_i4_t x)
 
template<>
__host__ __device__ half2_t type_convert< half2_t, float2_t > (float2_t x)
 
template<>
__host__ __device__ f8_fnuz_t type_convert< f8_fnuz_t, half_t > (half_t x)
 
template<>
__host__ __device__ f8_ocp_t type_convert< f8_ocp_t, half_t > (half_t x)
 Converts a half_t value to a f8_ocp_t value with rounding determined by a flag. More...
 
template<>
__host__ __device__ bf8_ocp_t type_convert< bf8_ocp_t, half_t > (half_t x)
 Converts a half_t value to a bf8_ocp_t value with rounding determined by a flag. More...
 
template<>
__host__ __device__ half_t type_convert< half_t, f8_fnuz_t > (f8_fnuz_t x)
 
template<>
__host__ __device__ bf8_fnuz_t type_convert< bf8_fnuz_t, float > (float x)
 
template<>
__host__ __device__ f8_ocp_t type_convert< f8_ocp_t, float > (float x)
 Converts a float value to a f8_ocp_t value with rounding determined by a flag. More...
 
template<>
__host__ __device__ bf8_ocp_t type_convert< bf8_ocp_t, float > (float x)
 Converts a float value to a bf8_ocp_t value with rounding determined by a flag. More...
 
template<>
__host__ __device__ f8_ocp_t type_convert< f8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t value to a f8_ocp_t value with rounding determined by a flag. More...
 
template<>
__host__ __device__ bf8_ocp_t type_convert< bf8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t value to a bf8_ocp_t value with rounding determined by a flag. More...
 
template<>
__host__ __device__ float type_convert< float, bf8_fnuz_t > (bf8_fnuz_t x)
 
template<>
__host__ __device__ bf8_fnuz_t type_convert< bf8_fnuz_t, half_t > (half_t x)
 
template<>
__host__ __device__ half_t type_convert< half_t, bf8_fnuz_t > (bf8_fnuz_t x)
 
__host__ __device__ f4_t f4_convert_rne (float x, float scale=1.0f)
 
__host__ __device__ f4x2_t f4_convert_rne (float2_t x, float scale=1.0f)
 
__host__ __device__ f4_t f4_convert_sr (float x, float scale=1.0f)
 
__host__ __device__ f4x2_t f4_convert_sr (float2_t x, float scale=1.0f)
 
template<>
__host__ __device__ f4_t type_convert< f4_t, float > (float x)
 
template<>
__host__ __device__ f4x2_t type_convert< f4x2_t, float2_t > (float2_t x)
 
template<>
__host__ __device__ f4x2_pk_t type_convert< f4x2_pk_t, float2_t > (float2_t x)
 
template<>
__host__ __device__ f4x32_t type_convert< f4x32_t, float32_t > (float32_t x)
 
template<>
__host__ __device__ float type_convert< float, f4_t > (f4_t x)
 
template<>
__host__ __device__ float2_t type_convert< float2_t, f4x2_t > (f4x2_t x)
 
template<>
__host__ __device__ float32_t type_convert< float32_t, f4x32_t > (f4x32_t x)
 
__host__ __device__ f6_t f6_convert_rne (float x, float scale=1.0f)
 Converts a float to a 6-bit float type (f6_t) using round-to-nearest-even. More...
 
__host__ __device__ f6x32_t f6_convert_rne (float32_t x, float scale=1.0f)
 Converts a 32-element single-precision float array into a packed 6-bit representation. More...
 
__host__ __device__ f6_t f6_convert_sr (float x, float scale=1.0f)
 Converts a float to the 6-bit floating-point type (f6_t) using stochastic rounding. More...
 
__host__ __device__ f6x32_t f6_convert_sr (float32_t x, float scale=1.0f)
 Converts a 32-element single-precision float array into a packed 6-bit representation. More...
 
template<>
__host__ __device__ f6_t type_convert< f6_t, float > (float x)
 Specializes the type conversion template for converting a float into the 6-bit float type (f6_t). More...
 
template<>
__host__ __device__ f6x32_t type_convert< f6x32_t, float32_t > (float32_t x)
 Specializes the type conversion template for converting a vector of 32 floats into the vector of 32 6-bit float types (f6x32_t). More...
 
template<>
__host__ __device__ f6x32_pk_t type_convert< f6x32_pk_t, float32_t > (float32_t x)
 
template<>
__host__ __device__ f6x16_t type_convert< f6x16_t, float16_t > (float16_t x)
 
template<>
__host__ __device__ f6x16_pk_t type_convert< f6x16_pk_t, float16_t > (float16_t x)
 
template<>
__host__ __device__ float type_convert< float, f6_t > (f6_t x)
 Specializes the type conversion template for converting the 6-bit float type (f6_t) to float. More...
 
template<>
__host__ __device__ float32_t type_convert< float32_t, f6x32_t > (f6x32_t x)
 Specializes the type conversion template for converting the vector of 32 6-bit float types (f6x32_t) to vector of 32 floats. More...
 
template<>
__host__ __device__ float16_t type_convert< float16_t, f6x16_t > (f6x16_t x)
 
template<>
__host__ __device__ float16_t type_convert< float16_t, f6x16_pk_t > (f6x16_pk_t x)
 
__host__ __device__ bf6_t bf6_convert_rne (float x, float scale=1.0f)
 Converts a float to the 6-bit BF6 type using round-to-nearest-even. More...
 
__host__ __device__ bf6x32_t bf6_convert_rne (float32_t x, float scale=1.0f)
 Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using round-to-nearest-even. More...
 
__host__ __device__ bf6_t bf6_convert_sr (float x, float scale=1.0f)
 Converts a float to the 6-bit BF6 type using stochastic rounding. More...
 
__host__ __device__ bf6x32_t bf6_convert_sr (float32_t x, float scale=1.0f)
 Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using stochastic rounding. More...
 
template<>
__host__ __device__ bf6_t type_convert< bf6_t, float > (float x)
 Specializes float-to-bf6_t conversion. More...
 
template<>
__host__ __device__ bf6x32_t type_convert< bf6x32_t, float32_t > (float32_t x)
 Specializes vector of 32 float-to-bf6_t conversion. More...
 
template<>
__host__ __device__ bf6x32_pk_t type_convert< bf6x32_pk_t, float32_t > (float32_t x)
 
template<>
__host__ __device__ bf6x16_t type_convert< bf6x16_t, float16_t > (float16_t x)
 
template<>
__host__ __device__ bf6x16_pk_t type_convert< bf6x16_pk_t, float16_t > (float16_t x)
 
template<>
__host__ __device__ float type_convert< float, bf6_t > (bf6_t x)
 Specializes the type conversion template for converting a bf6_t value to float. More...
 
template<>
__host__ __device__ float32_t type_convert< float32_t, bf6x32_t > (bf6x32_t x)
 Specializes the type conversion template for converting a vector of 32 bf6_t values to vector of 32 floats. More...
 
template<>
__host__ __device__ float16_t type_convert< float16_t, bf6x16_t > (bf6x16_t x)
 
template<>
__host__ __device__ float16_t type_convert< float16_t, bf6x16_pk_t > (bf6x16_pk_t x)
 
template<typename Y , typename X , size_t NumElems>
__host__ __device__ void array_convert (std::array< Y, NumElems > &y, const std::array< X, NumElems > &x)
 
template<typename Y , typename X , index_t NumElems>
__host__ __device__ void array_convert (Array< Y, NumElems > &y, const Array< X, NumElems > &x)
 

Variables

template<typename T >
constexpr index_t packed_size_v = packed_type_info<T>::packed_size
 
template<typename T >
constexpr bool is_packed_type_v = packed_size_v<T> > 1
 
constexpr detail::ignore_t ignore
 
template<typename X , typename Y >
constexpr bool is_same_v = is_same<X, Y>::value
 
template<typename X , typename Y >
constexpr bool is_base_of_v = is_base_of<X, Y>::value
 
template<typename T >
constexpr bool is_unsigned_v = is_unsigned<T>::value
 
template<typename T >
constexpr bool is_pointer_v = is_pointer<T>::value
 

Detailed Description

Definitions from <cstdint> and <cmath> conflict with /opt/rocm/include/hip/amd_detail/amd_hip_vector_types.h.

Typedef Documentation

◆ bf6_t

using ck::bf6_t = typedef unsigned _BitInt(6)

◆ bf6x16_pk_t

using ck::bf6x16_pk_t = typedef f6_pk_t<bf6_t, 16>

◆ bf6x16_t

using ck::bf6x16_t = typedef typename vector_type<bf6x16_pk_t, 1>::type

◆ bf6x16x2_t

using ck::bf6x16x2_t = typedef typename vector_type<bf6x16_pk_t, 2>::type

◆ bf6x32_pk_t

using ck::bf6x32_pk_t = typedef f6_pk_t<bf6_t, 32>

◆ bf6x32_t

using ck::bf6x32_t = typedef typename vector_type<bf6x32_pk_t, 1>::type

◆ bf8_t

using ck::bf8_t = typedef bf8_fnuz_t

◆ bf8x16_fnuz_t

using ck::bf8x16_fnuz_t = typedef typename vector_type<bf8_fnuz_t, 16>::type

◆ bf8x16_ocp_t

using ck::bf8x16_ocp_t = typedef typename vector_type<bf8_ocp_t, 16>::type

◆ bf8x2_fnuz_t

using ck::bf8x2_fnuz_t = typedef typename vector_type<bf8_fnuz_t, 2>::type

◆ bf8x2_ocp_t

using ck::bf8x2_ocp_t = typedef typename vector_type<bf8_ocp_t, 2>::type

◆ bf8x32_fnuz_t

using ck::bf8x32_fnuz_t = typedef typename vector_type<bf8_fnuz_t, 32>::type

◆ bf8x32_ocp_t

using ck::bf8x32_ocp_t = typedef typename vector_type<bf8_ocp_t, 32>::type

◆ bf8x4_fnuz_t

using ck::bf8x4_fnuz_t = typedef typename vector_type<bf8_fnuz_t, 4>::type

◆ bf8x4_ocp_t

using ck::bf8x4_ocp_t = typedef typename vector_type<bf8_ocp_t, 4>::type

◆ bf8x64_fnuz_t

using ck::bf8x64_fnuz_t = typedef typename vector_type<bf8_fnuz_t, 64>::type

◆ bf8x64_ocp_t

using ck::bf8x64_ocp_t = typedef typename vector_type<bf8_ocp_t, 64>::type

◆ bf8x8_fnuz_t

using ck::bf8x8_fnuz_t = typedef typename vector_type<bf8_fnuz_t, 8>::type

◆ bf8x8_ocp_t

using ck::bf8x8_ocp_t = typedef typename vector_type<bf8_ocp_t, 8>::type

◆ bhalf16_t

using ck::bhalf16_t = typedef typename vector_type<bhalf_t, 16>::type

◆ bhalf2_t

using ck::bhalf2_t = typedef typename vector_type<bhalf_t, 2>::type

◆ bhalf32_t

using ck::bhalf32_t = typedef typename vector_type<bhalf_t, 32>::type

◆ bhalf4_t

using ck::bhalf4_t = typedef typename vector_type<bhalf_t, 4>::type

◆ bhalf8_t

using ck::bhalf8_t = typedef typename vector_type<bhalf_t, 8>::type

◆ bhalf_t

using ck::bhalf_t = typedef ushort

◆ bool_constant

template<bool B>
using ck::bool_constant = typedef integral_constant<bool, B>

◆ conditional_t

template<bool predicate, class X , class Y >
using ck::conditional_t = typedef typename conditional<predicate, X, Y>::type

◆ double2_t

using ck::double2_t = typedef typename vector_type<double, 2>::type

◆ double4_t

using ck::double4_t = typedef typename vector_type<double, 4>::type

◆ e8m0x4_bexp_t

using ck::e8m0x4_bexp_t = typedef typename vector_type<e8m0_bexp_t, 4>::type

◆ element_type_t

template<typename T >
using ck::element_type_t = typedef typename packed_type_info<T>::element_type

◆ enable_if

template<bool B, typename T = void>
using ck::enable_if = typedef std::enable_if<B, T>

◆ enable_if_t

template<bool B, typename T = void>
using ck::enable_if_t = typedef typename std::enable_if<B, T>::type

◆ f4_t

using ck::f4_t = typedef unsigned _BitInt(4)

◆ f4x16_t

using ck::f4x16_t = typedef typename vector_type<f4x2_pk_t, 8>::type

◆ f4x2_t

using ck::f4x2_t = typedef typename vector_type<f4x2_pk_t, 1>::type

◆ f4x32_t

using ck::f4x32_t = typedef typename vector_type<f4x2_pk_t, 16>::type

◆ f4x4_t

using ck::f4x4_t = typedef typename vector_type<f4x2_pk_t, 2>::type

◆ f4x64_t

using ck::f4x64_t = typedef typename vector_type<f4x2_pk_t, 32>::type

◆ f4x8_t

using ck::f4x8_t = typedef typename vector_type<f4x2_pk_t, 4>::type

◆ f6_t

using ck::f6_t = typedef _BitInt(6)

◆ f6x16_pk_t

using ck::f6x16_pk_t = typedef f6_pk_t<f6_t, 16>

◆ f6x16_t

using ck::f6x16_t = typedef typename vector_type<f6x16_pk_t, 1>::type

◆ f6x16x2_t

using ck::f6x16x2_t = typedef typename vector_type<f6x16_pk_t, 2>::type

◆ f6x32_pk_t

using ck::f6x32_pk_t = typedef f6_pk_t<f6_t, 32>

◆ f6x32_t

using ck::f6x32_t = typedef typename vector_type<f6x32_pk_t, 1>::type

◆ f8_t

using ck::f8_t = typedef f8_fnuz_t

◆ f8x16_fnuz_t

using ck::f8x16_fnuz_t = typedef typename vector_type<f8_fnuz_t, 16>::type

◆ f8x16_ocp_t

using ck::f8x16_ocp_t = typedef typename vector_type<f8_ocp_t, 16>::type

◆ f8x2_fnuz_t

using ck::f8x2_fnuz_t = typedef typename vector_type<f8_fnuz_t, 2>::type

◆ f8x2_ocp_t

using ck::f8x2_ocp_t = typedef typename vector_type<f8_ocp_t, 2>::type

◆ f8x32_fnuz_t

using ck::f8x32_fnuz_t = typedef typename vector_type<f8_fnuz_t, 32>::type

◆ f8x32_ocp_t

using ck::f8x32_ocp_t = typedef typename vector_type<f8_ocp_t, 32>::type

◆ f8x4_fnuz_t

using ck::f8x4_fnuz_t = typedef typename vector_type<f8_fnuz_t, 4>::type

◆ f8x4_ocp_t

using ck::f8x4_ocp_t = typedef typename vector_type<f8_ocp_t, 4>::type

◆ f8x64_fnuz_t

using ck::f8x64_fnuz_t = typedef typename vector_type<f8_fnuz_t, 64>::type

◆ f8x64_ocp_t

using ck::f8x64_ocp_t = typedef typename vector_type<f8_ocp_t, 64>::type

◆ f8x8_fnuz_t

using ck::f8x8_fnuz_t = typedef typename vector_type<f8_fnuz_t, 8>::type

◆ f8x8_ocp_t

using ck::f8x8_ocp_t = typedef typename vector_type<f8_ocp_t, 8>::type

◆ false_type

using ck::false_type = typedef bool_constant<false>

◆ float16_t

using ck::float16_t = typedef typename vector_type<float, 16>::type

◆ float2_t

using ck::float2_t = typedef typename vector_type<float, 2>::type

◆ float32_t

using ck::float32_t = typedef typename vector_type<float, 32>::type

◆ float4_t

using ck::float4_t = typedef typename vector_type<float, 4>::type

◆ float64_t

using ck::float64_t = typedef typename vector_type<float, 64>::type

◆ float8_t

using ck::float8_t = typedef typename vector_type<float, 8>::type

◆ fp8_storage_t

typedef unsigned char ck::fp8_storage_t

◆ half16_t

using ck::half16_t = typedef typename vector_type<half_t, 16>::type

◆ half2_t

using ck::half2_t = typedef typename vector_type<half_t, 2>::type

◆ half32_t

using ck::half32_t = typedef typename vector_type<half_t, 32>::type

◆ half4_t

using ck::half4_t = typedef typename vector_type<half_t, 4>::type

◆ half8_t

using ck::half8_t = typedef typename vector_type<half_t, 8>::type

◆ half_t

using ck::half_t = typedef _Float16

◆ has_same_scalar_type

template<typename X , typename Y >
using ck::has_same_scalar_type = typedef is_same<typename scalar_type<remove_cvref_t<X> >::type, typename scalar_type<remove_cvref_t<Y> >::type>

◆ index_t

using ck::index_t = typedef int32_t

◆ int32x16_t

using ck::int32x16_t = typedef typename vector_type<int32_t, 16>::type

◆ int32x2_t

using ck::int32x2_t = typedef typename vector_type<int32_t, 2>::type

◆ int32x32_t

using ck::int32x32_t = typedef typename vector_type<int32_t, 32>::type

◆ int32x4_t

using ck::int32x4_t = typedef typename vector_type<int32_t, 4>::type

◆ int32x64_t

using ck::int32x64_t = typedef typename vector_type<int32_t, 64>::type

◆ int32x6_t

using ck::int32x6_t = typedef typename vector_type<int32_t, 6>::type

◆ int32x8_t

using ck::int32x8_t = typedef typename vector_type<int32_t, 8>::type

◆ int4_t

using ck::int4_t = typedef _BitInt(4)

◆ int64_t

using ck::int64_t = typedef long

◆ int8x16_t

using ck::int8x16_t = typedef typename vector_type<int8_t, 16>::type

◆ int8x2_t

using ck::int8x2_t = typedef typename vector_type<int8_t, 2>::type

◆ int8x32_t

using ck::int8x32_t = typedef typename vector_type<int8_t, 32>::type

◆ int8x4_t

using ck::int8x4_t = typedef typename vector_type<int8_t, 4>::type

◆ int8x64_t

using ck::int8x64_t = typedef typename vector_type<int8_t, 64>::type

◆ int8x8_t

using ck::int8x8_t = typedef typename vector_type<int8_t, 8>::type

◆ is_detected

template<template< class... > class Op, class... Args>
using ck::is_detected = typedef typename detail::detector<nonesuch, void, Op, Args...>::value_t

◆ is_pack2_invocable_t

template<typename T >
using ck::is_pack2_invocable_t = typedef decltype(ck::declval<T&>().is_pack2_invocable)

◆ is_pack4_invocable_t

template<typename T >
using ck::is_pack4_invocable_t = typedef decltype(ck::declval<T&>().is_pack4_invocable)

◆ is_pack8_invocable_t

template<typename T >
using ck::is_pack8_invocable_t = typedef decltype(ck::declval<T&>().is_pack8_invocable)

◆ is_tuple

template<typename T >
using ck::is_tuple = typedef decltype(ck::declval<T&>().IsTuple())

◆ iter_difference_t

template<typename T >
using ck::iter_difference_t = typedef typename std::iterator_traits<remove_cvref_t<T> >::difference_type

◆ iter_reference_t

template<typename T >
using ck::iter_reference_t = typedef decltype(*std::declval<T&>())

◆ iter_value_t

template<typename T >
using ck::iter_value_t = typedef typename std::iterator_traits<remove_cvref_t<T> >::value_type

◆ long_index_t

using ck::long_index_t = typedef int64_t

◆ LongNumber

template<index_t N>
using ck::LongNumber = typedef integral_constant<long_index_t, N>

◆ make_index_sequence

template<index_t N>
using ck::make_index_sequence = typedef typename __make_integer_seq<impl::__integer_sequence, index_t, N>::seq_type

◆ MultiIndex

template<index_t N>
using ck::MultiIndex = typedef StaticallyIndexedArray<index_t, N>

◆ Number

template<index_t N>
using ck::Number = typedef integral_constant<index_t, N>

◆ packed_type_t

template<typename T , index_t N = 0>
using ck::packed_type_t = typedef typename packed_type_maker<T, N>::packed_type

◆ pk_i4x2_t

using ck::pk_i4x2_t = typedef typename vector_type<pk_i4_t, 2>::type

◆ pk_i4x4_t

using ck::pk_i4x4_t = typedef typename vector_type<pk_i4_t, 4>::type

◆ pk_i4x8_t

using ck::pk_i4x8_t = typedef typename vector_type<pk_i4_t, 8>::type

◆ remove_cv_t

template<typename T >
using ck::remove_cv_t = typedef typename remove_cv<T>::type

◆ remove_cvref_t

template<typename T >
using ck::remove_cvref_t = typedef remove_cv_t<remove_reference_t<T> >

◆ remove_pointer_t

template<typename T >
using ck::remove_pointer_t = typedef typename remove_pointer<T>::type

◆ remove_reference_t

template<typename T >
using ck::remove_reference_t = typedef typename remove_reference<T>::type

◆ sequence_merge_t

template<typename Sx , typename Sy >
using ck::sequence_merge_t = typedef typename sequence_merge<Sx, Sy>::type

◆ StaticallyIndexedArray

template<typename T , index_t N>
using ck::StaticallyIndexedArray = typedef typename detail::StaticallyIndexedArrayImpl<T, N>::type

◆ TensorCoordinate_t

template<typename TensorDesc >
using ck::TensorCoordinate_t = typedef decltype(make_tensor_coordinate( TensorDesc{}, MultiIndex<remove_cvref_t<TensorDesc>::GetNumOfDimension()>{}))

◆ TensorCoordinateStep_t

template<typename TensorDesc >
using ck::TensorCoordinateStep_t = typedef decltype(make_tensor_coordinate_step( TensorDesc{}, MultiIndex<remove_cvref_t<TensorDesc>::GetNumOfDimension()>{}))

◆ tf32_t

using ck::tf32_t = typedef _BitInt(19)

◆ true_type

using ck::true_type = typedef bool_constant<true>

◆ tuple_element_t

template<index_t I, typename TTuple >
using ck::tuple_element_t = typedef typename tuple_element<I, TTuple>::type

◆ uint8x16_t

using ck::uint8x16_t = typedef typename vector_type<uint8_t, 16>::type

◆ uint8x2_t

using ck::uint8x2_t = typedef typename vector_type<uint8_t, 2>::type

◆ uint8x32_t

using ck::uint8x32_t = typedef typename vector_type<uint8_t, 32>::type

◆ uint8x4_t

using ck::uint8x4_t = typedef typename vector_type<uint8_t, 4>::type

◆ uint8x64_t

using ck::uint8x64_t = typedef typename vector_type<uint8_t, 64>::type

◆ uint8x8_t

using ck::uint8x8_t = typedef typename vector_type<uint8_t, 8>::type

◆ uniform_sequence_gen_t

template<index_t NSize, index_t I>
using ck::uniform_sequence_gen_t = typedef typename uniform_sequence_gen<NSize, I>::type

◆ vector_type_maker_t

template<typename T , index_t N>
using ck::vector_type_maker_t = typedef typename vector_type_maker<T, N>::type

Enumeration Type Documentation

◆ Activation [1/5]

Enumerator
gelu_and_mul 
silu_and_mul 

◆ Activation [2/5]

Enumerator
gelu_and_mul 
silu_and_mul 

◆ Activation [3/5]

Enumerator
gelu_and_mul 
silu_and_mul 

◆ Activation [4/5]

Enumerator
gelu_and_mul 
silu_and_mul 

◆ Activation [5/5]

Enumerator
gelu_and_mul 
silu_and_mul 

◆ AddressSpaceEnum

enum ck::AddressSpaceEnum
strong
Enumerator
Generic 
Global 
Lds 
Sgpr 
Vgpr 

◆ AmdBufferCoherenceEnum [1/2]

Enumerator
DefaultCoherence 
GLC 
SLC 
GLC_SLC 
WAVE_NT0 
WAVE_NT1 
GROUP_NT0 
GROUP_NT1 
DEVICE_NT0 
DEVICE_NT1 
SYSTEM_NT0 
SYSTEM_NT1 

◆ AmdBufferCoherenceEnum [2/2]

Enumerator
DefaultCoherence 
GLC 
SLC 
GLC_SLC 
WAVE_NT0 
WAVE_NT1 
GROUP_NT0 
GROUP_NT1 
DEVICE_NT0 
DEVICE_NT1 
SYSTEM_NT0 
SYSTEM_NT1 

◆ BlockGemmPipelineScheduler

Enumerator
Intrawave 
Interwave 

◆ BlockGemmPipelineVersion

Enumerator
v1 
v2 
v3 
v4 
v5 

◆ ck_fp8_interpretation_t

Describes FP8 interpretation.

Enumerator
CK_E4M3_OCP 
CK_E5M2_OCP 
CK_E4M3_FNUZ 
CK_E5M2_FNUZ 

◆ ck_saturation_t

enum ck::ck_saturation_t
strong

Describes saturation behavior.

Enumerator
CK_NOSAT 
CK_SATFINITE 

◆ DppInstr

enum ck::DppInstr
strong
Enumerator
dpp8_f16_1x32x2 
dpp8_f16_2x16x2 
dpp8_f16_2x32x2 
dpp8_f16_4x16x2 
dpp8_f16_4x32x2 
dpp8_f16_8x16x2 
dpp8_f16_8x32x2 
dpp8_f16_16x16x2 
dpp8_f16_32x8x2 

◆ f8_rounding_mode

enum ck::f8_rounding_mode
strong
Enumerator
standard 
stochastic 

◆ IndicesType

enum ck::IndicesType
strong
Enumerator
INDICES_32BIT 
INDICES_64BIT 
INDICES_16BIT 
INDICES_8BIT 

◆ InMemoryDataOperationEnum

Enumerator
Set 
AtomicAdd 
AtomicMax 
Add 

◆ LoopScheduler

enum ck::LoopScheduler
strong
Enumerator
Default 
Interwave 

◆ MfmaInstr

enum ck::MfmaInstr
strong
Enumerator
mfma_f32_32x32x1f32 
mfma_f32_16x16x1f32 
mfma_f32_4x4x1f32 
mfma_f32_32x32x2f32 
mfma_f32_16x16x4f32 
mfma_f32_32x32x4f16 
mfma_f32_16x16x4f16 
mfma_f32_4x4x4f16 
mfma_f32_32x32x8f16 
mfma_f32_16x16x16f16 
mfma_f32_32x32x8bf16_1k 
mfma_f32_16x16x16bf16_1k 
mfma_f32_32x32x4bf16 
mfma_f32_16x16x8bf16 
mfma_i32_32x32x8i8 
mfma_i32_16x16x16i8 
mfma_i32_32x32x16i8 
mfma_i32_16x16x32i8 
mfma_f64_16x16x4f64 
mfma_f32_32x32x16f8f8 
mfma_f32_16x16x32f8f8 
mfma_f32_32x32x16bf8bf8 
mfma_f32_16x16x32bf8bf8 
mfma_f32_32x32x16f8bf8 
mfma_f32_16x16x32f8bf8 
mfma_f32_32x32x16bf8f8 
mfma_f32_16x16x32bf8f8 
mfma_f32_32x32x16f16 
mfma_f32_16x16x32f16 
mfma_f32_32x32x16bf16 
mfma_f32_16x16x32bf16 
mfma_i32_32x32x32i8 
mfma_i32_16x16x64i8 
mfma_f32_32x32x64f8f6f4 
mfma_f32_16x16x128f8f6f4 
mfma_scale_f32_32x32x64f8f6f4 
mfma_scale_f32_16x16x128f8f6f4 
mfma_f32_16x16x8xf32 
mfma_f32_32x32x4xf32 
wmma_f32_16x16x16_f16 
wmma_f32_16x16x16_bf16 
wmma_i32_16x16x16_iu8 
wmma_unsupport_16x16_gfx11 
wmma_f32_16x16x16_f16_gfx12 
wmma_f32_16x16x16_bf16_gfx12 
wmma_i32_16x16x16_iu8_gfx12 
wmma_f32_16x16x16_f8f8_gfx12 
wmma_f32_16x16x16_f8bf8_gfx12 
wmma_f32_16x16x16_bf8f8_gfx12 
wmma_f32_16x16x16_bf8bf8_gfx12 
wmma_unsupport_16x16_gfx12 

◆ NanPropagation

enum ck::NanPropagation
strong
Enumerator
NOT_PROPAGATE_NAN 
PROPAGATE_NAN 

◆ PipelineVersion

enum ck::PipelineVersion
strong
Enumerator
v1 
v2 
v4 
weight_only 

◆ ReduceTensorIndices

Enumerator
NO_INDICES 
FLATTENED_INDICES 

◆ ReduceTensorOp

enum ck::ReduceTensorOp
strong
Enumerator
ADD 
MUL 
MIN 
MAX 
AMAX 
AVG 
NORM1 
NORM2 

◆ SchedulerGroup

Enumerator
SCHED_GROUP_MFMA 
SCHED_GROUP_VMEM 
SCHED_GROUP_LDS_READ 
SCHED_GROUP_LDS_WRITE 

◆ SmfmacInstr

enum ck::SmfmacInstr
strong
Enumerator
smfmac_f32_16x16x32f16 
smfmac_f32_32x32x16f16 
smfmac_f32_16x16x32bf16 
smfmac_f32_32x32x16bf16 

◆ StreamKReductionStrategy

Enumerator
Atomic 
Reduction 

◆ TailNumber

enum ck::TailNumber
strong
Enumerator
Odd 
Even 
One 
Two 
Three 
Four 
Five 
Six 
Seven 
Empty 
Full 

◆ WmmaInstr

enum ck::WmmaInstr
strong
Enumerator
wmma_f32_16x16x16_f16 
wmma_f32_16x16x16_bf16 
wmma_f16_16x16x16_f16 
wmma_bf16_16x16x16_bf16 
wmma_i32_16x16x16_iu8 
wmma_i32_16x16x16_iu4 
wmma_f32_16x16x16_f16_gfx12 
wmma_f32_16x16x16_bf16_gfx12 
wmma_i32_16x16x16_iu8_gfx12 
wmma_f32_16x16x16_f8f8_gfx12 
wmma_f32_16x16x16_f8bf8_gfx12 
wmma_f32_16x16x16_bf8f8_gfx12 
wmma_f32_16x16x16_bf8bf8_gfx12 

Function Documentation

◆ accumulate_n()

template<typename T , typename ForwardIterator , typename Size , typename BinaryOperation >
auto ck::accumulate_n ( ForwardIterator  first,
Size  count,
T  init,
BinaryOperation  op 
) -> decltype(std::accumulate(first, std::next(first, count), init, op))
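
As the trailing return type shows, accumulate_n folds count elements starting at first, equivalent to std::accumulate over [first, first + count). A self-contained analog with the same contract:

#include <cassert>
#include <functional>
#include <iterator>
#include <numeric>
#include <vector>

template <typename T, typename It, typename Size, typename Op>
auto accumulate_n_analog(It first, Size count, T init, Op op)
{
    return std::accumulate(first, std::next(first, count), init, op);
}

int main()
{
    std::vector<int> v{1, 2, 3, 4};
    assert(accumulate_n_analog(v.begin(), 3, 0, std::plus<>{}) == 6); // 1 + 2 + 3
    return 0;
}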

◆ amd_assemble_cvt_f32_i4()

__device__ float ck::amd_assemble_cvt_f32_i4 ( int  b)
inline

◆ amd_assembly_and_b32()

__device__ int ck::amd_assembly_and_b32 ( int  a,
int  b 
)
inline

◆ amd_assembly_and_or_b32()

__device__ int ck::amd_assembly_and_or_b32 ( int  a,
int  b,
int  d 
)
inline

◆ amd_assembly_cvt_f8_to_f32()

__device__ f8x4_t ck::amd_assembly_cvt_f8_to_f32 ( float  b0,
float  b1,
float  b2,
float  b3 
)
inline

◆ amd_assembly_i4_to_fp8x8()

__device__ f8x8_t ck::amd_assembly_i4_to_fp8x8 ( int  a)
inline

◆ amd_assembly_outer_product_1x2() [1/3]

__device__ void ck::amd_assembly_outer_product_1x2 ( float  a,
float  b0,
float  b1,
float &  c0,
float &  c1 
)

◆ amd_assembly_outer_product_1x2() [2/3]

__device__ void ck::amd_assembly_outer_product_1x2 ( half2_t  a,
half2_t  b0,
half2_t  b1,
float &  c0,
float &  c1 
)

◆ amd_assembly_outer_product_1x2() [3/3]

__device__ void ck::amd_assembly_outer_product_1x2 ( int8x4_t  a,
int8x4_t  b0,
int8x4_t  b1,
int32_t &  c0,
int32_t &  c1 
)

◆ amd_assembly_outer_product_1x4() [1/3]

__device__ void ck::amd_assembly_outer_product_1x4 ( float  a,
float  b0,
float  b1,
float  b2,
float  b3,
float &  c0,
float &  c1,
float &  c2,
float &  c3 
)

◆ amd_assembly_outer_product_1x4() [2/3]

__device__ void ck::amd_assembly_outer_product_1x4 ( half2_t  a,
half2_t  b0,
half2_t  b1,
half2_t  b2,
half2_t  b3,
float &  c0,
float &  c1,
float &  c2,
float &  c3 
)

◆ amd_assembly_outer_product_1x4() [3/3]

__device__ void ck::amd_assembly_outer_product_1x4 ( int8x4_t  a,
int8x4_t  b0,
int8x4_t  b1,
int8x4_t  b2,
int8x4_t  b3,
int32_t &  c0,
int32_t &  c1,
int32_t &  c2,
int32_t &  c3 
)

◆ amd_assembly_pk_add_f16()

__device__ half2_t ck::amd_assembly_pk_add_f16 ( half2_t  a,
half2_t  b 
)
inline

◆ amd_assembly_pk_fma_f16()

__device__ half2_t ck::amd_assembly_pk_fma_f16 ( half2_t  a,
half2_t  b,
half2_t  c 
)
inline

◆ amd_buffer_atomic_add()

template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_add ( const typename vector_type_maker< T, N >::type::type  src_thread_data,
T *  p_dst_wave,
const index_t  dst_thread_element_offset,
const bool  dst_thread_element_valid,
const index_t  dst_element_space_size 
)

◆ amd_buffer_atomic_add_impl()

template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_add_impl ( const typename vector_type< T, N >::type  src_thread_data,
int32x4_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_atomic_max()

template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_max ( const typename vector_type_maker< T, N >::type::type  src_thread_data,
T *  p_dst_wave,
const index_t  dst_thread_element_offset,
const bool  dst_thread_element_valid,
const index_t  dst_element_space_size 
)

◆ amd_buffer_atomic_max_impl()

template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_max_impl ( const typename vector_type< T, N >::type  src_thread_data,
int32x4_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_load_impl() [1/2]

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type<T, N>::type ck::amd_buffer_load_impl ( __amdgpu_buffer_rsrc_t  src_wave_buffer_resource,
index_t  src_thread_addr_offset,
index_t  src_wave_addr_offset 
)

◆ amd_buffer_load_impl() [2/2]

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type<T, N>::type ck::amd_buffer_load_impl ( int32x4_t  src_wave_buffer_resource,
index_t  src_thread_addr_offset,
index_t  src_wave_addr_offset 
)

◆ amd_buffer_load_impl_raw() [1/2]

template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type<int8_t, N>::type ck::amd_buffer_load_impl_raw ( __amdgpu_buffer_rsrc_t  src_wave_buffer_resource,
index_t  src_thread_addr_offset,
index_t  src_wave_addr_offset 
)

◆ amd_buffer_load_impl_raw() [2/2]

template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type<int8_t, N>::type ck::amd_buffer_load_impl_raw ( int32x4_t  src_wave_buffer_resource,
index_t  src_thread_addr_offset,
index_t  src_wave_addr_offset 
)

◆ amd_buffer_load_invalid_element_return_customized_value()

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type ck::amd_buffer_load_invalid_element_return_customized_value ( const T *  p_src_wave,
index_t  src_thread_element_offset,
bool  src_thread_element_valid,
index_t  src_element_space_size,
T  customized_value 
)

◆ amd_buffer_load_invalid_element_return_zero()

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type ck::amd_buffer_load_invalid_element_return_zero ( const T *  p_src_wave,
index_t  src_thread_element_offset,
bool  src_thread_element_valid,
index_t  src_element_space_size 
)

◆ amd_buffer_store()

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store ( const typename vector_type_maker< T, N >::type::type  src_thread_data,
T *  p_dst_wave,
const index_t  dst_thread_element_offset,
const bool  dst_thread_element_valid,
const index_t  dst_element_space_size 
)

◆ amd_buffer_store_impl() [1/2]

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl ( const typename vector_type< T, N >::type  src_thread_data,
__amdgpu_buffer_rsrc_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_store_impl() [2/2]

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl ( const typename vector_type< T, N >::type  src_thread_data,
int32x4_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_store_impl_raw() [1/2]

template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl_raw ( const typename vector_type< int8_t, N >::type  src_thread_data,
__amdgpu_buffer_rsrc_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_store_impl_raw() [2/2]

template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl_raw ( const typename vector_type< int8_t, N >::type  src_thread_data,
int32x4_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_direct_load_global_to_lds()

template<typename T , index_t NumElemsPerThread>
__device__ void ck::amd_direct_load_global_to_lds ( const T *  global_base_ptr,
const index_t  global_offset,
T *  lds_base_ptr,
const index_t  lds_offset,
const bool  is_valid,
const index_t  src_element_space_size 
)

◆ amd_global_atomic_add_impl()

template<typename T , index_t N>
__device__ void ck::amd_global_atomic_add_impl ( const typename vector_type< T, N >::type  src_thread_data,
T *  addr 
)

◆ amd_wave_read_first_lane() [1/4]

template<typename Object , typename = ck::enable_if_t<ck::is_class_v<Object> && ck::is_trivially_copyable_v<Object>>>
__device__ auto ck::amd_wave_read_first_lane ( const Object &  obj)

NOTE: Implicitly starts the object's lifetime. It would be better to use std::start_lifetime_at() in this scenario.

◆ amd_wave_read_first_lane() [2/4]

__device__ int32_t ck::amd_wave_read_first_lane ( int32_t  value)
inline

◆ amd_wave_read_first_lane() [3/4]

__device__ int64_t ck::amd_wave_read_first_lane ( int64_t  value)
inline

◆ amd_wave_read_first_lane() [4/4]

__device__ uint32_t ck::amd_wave_read_first_lane ( uint32_t  value)
inline

◆ array_convert() [1/2]

template<typename Y , typename X , index_t NumElems>
__host__ __device__ void ck::array_convert ( Array< Y, NumElems > &  y,
const Array< X, NumElems > &  x 
)
inline

◆ array_convert() [2/2]

template<typename Y , typename X , size_t NumElems>
__host__ __device__ void ck::array_convert ( std::array< Y, NumElems > &  y,
const std::array< X, NumElems > &  x 
)
inline
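
Usage sketch for the std::array overload: both arrays have the same element count NumElems, and each element of x is converted to Y (the CK version applies its element-wise type conversion; the static_cast below is a simplification):

#include <array>
#include <cassert>

int main()
{
    std::array<float, 3> x{1.25f, 2.5f, 3.75f};
    std::array<int, 3>   y{};
    // Element-wise analog of ck::array_convert(y, x).
    for (std::size_t i = 0; i < x.size(); ++i)
        y[i] = static_cast<int>(x[i]);
    assert(y[0] == 1 && y[1] == 2 && y[2] == 3);
    return 0;
}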

◆ atomic_add()

template<typename X >
__device__ X ck::atomic_add ( X *  p_dst,
const X &  x 
)

◆ atomic_add< _Float16 >()

template<>
__device__ _Float16 ck::atomic_add< _Float16 > ( _Float16 *  p_dst,
const _Float16 &  x 
)

◆ atomic_add< double >()

template<>
__device__ double ck::atomic_add< double > ( double *  p_dst,
const double &  x 
)

◆ atomic_add< double2_t >()

template<>
__device__ double2_t ck::atomic_add< double2_t > ( double2_t *  p_dst,
const double2_t &  x 
)

◆ atomic_add< float >()

template<>
__device__ float ck::atomic_add< float > ( float *  p_dst,
const float &  x 
)

◆ atomic_add< float2_t >()

template<>
__device__ float2_t ck::atomic_add< float2_t > ( float2_t *  p_dst,
const float2_t &  x 
)

◆ atomic_add< int32_t >()

template<>
__device__ int32_t ck::atomic_add< int32_t > ( int32_t *  p_dst,
const int32_t &  x 
)

◆ atomic_add< uint32_t >()

template<>
__device__ uint32_t ck::atomic_add< uint32_t > ( uint32_t *  p_dst,
const uint32_t &  x 
)

◆ atomic_add< unsigned short >()

template<>
__device__ unsigned short ck::atomic_add< unsigned short > ( unsigned short *  p_dst,
const unsigned short &  x 
)
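
The atomic_add specializations perform an atomic read-modify-write on *p_dst and return a value of the destination type; the usual fetch-and-add contract (returning the value observed before the update) is assumed here, so consult each specialization in the source. Illustrated with std::atomic rather than the CK device intrinsics:

#include <atomic>
#include <cassert>

int main()
{
    std::atomic<int> dst{5};
    int old = dst.fetch_add(3); // fetch-and-add: returns the prior value
    assert(old == 5 && dst.load() == 8);
    return 0;
}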

◆ atomic_max()

template<typename X >
__device__ X ck::atomic_max ( X *  p_dst,
const X &  x 
)

◆ atomic_max< double >()

template<>
__device__ double ck::atomic_max< double > ( double *  p_dst,
const double &  x 
)

◆ atomic_max< float >()

template<>
__device__ float ck::atomic_max< float > ( float *  p_dst,
const float &  x 
)

◆ atomic_max< float2_t >()

template<>
__device__ float2_t ck::atomic_max< float2_t > ( float2_t *  p_dst,
const float2_t &  x 
)

◆ atomic_max< int32_t >()

template<>
__device__ int32_t ck::atomic_max< int32_t > ( int32_t *  p_dst,
const int32_t &  x 
)

◆ atomic_max< uint32_t >()

template<>
__device__ uint32_t ck::atomic_max< uint32_t > ( uint32_t *  p_dst,
const uint32_t &  x 
)

◆ bf16_convert_rtn()

template<typename Y , typename X >
__host__ constexpr __device__ Y ck::bf16_convert_rtn ( X  x)
constexpr

◆ bf16_convert_rtn< bhalf_t, float >()

template<>
__host__ constexpr __device__ bhalf_t ck::bf16_convert_rtn< bhalf_t, float > ( float  x)
inline constexpr

◆ bf16_convert_rtn< bhalf_t, half_t >()

template<>
__host__ constexpr __device__ bhalf_t ck::bf16_convert_rtn< bhalf_t, half_t > ( half_t  x)
inline constexpr

◆ bf6_convert_rne() [1/2]

__host__ __device__ bf6_t ck::bf6_convert_rne ( float  x,
float  scale = 1.0f 
)
inline

Converts a float to the 6-bit BF6 type using round-to-nearest-even.

Divides the input by the specified scale, then saturates and converts it to a 6-bit BF6 floating-point format.

Parameters
x: The float value to be converted.
scale: The scaling factor applied to the input before conversion.
Returns
The converted bf6_t value.
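
A toy round-to-nearest-even quantizer on a fixed grid (step 0.25), illustrating only the divide-by-scale-then-round flow described above; it does not reproduce the actual BF6 value grid:

#include <cassert>
#include <cmath>

float convert_rne_toy(float x, float scale = 1.0f)
{
    float v = x / scale;              // pre-scale, as bf6_convert_rne does
    float q = v / 0.25f;              // express in grid steps
    return std::nearbyint(q) * 0.25f; // ties round to even under the default rounding mode
}

int main()
{
    assert(convert_rne_toy(0.125f) == 0.0f);     // 0.5 steps: tie rounds to even (0)
    assert(convert_rne_toy(0.375f) == 0.5f);     // 1.5 steps: tie rounds to even (2)
    assert(convert_rne_toy(1.0f, 2.0f) == 0.5f); // the scale divides the input first
    return 0;
}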

◆ bf6_convert_rne() [2/2]

__host__ __device__ bf6x32_t ck::bf6_convert_rne ( float32_t  x,
float  scale = 1.0f 
)
inline

Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using round-to-nearest-even.

Divides the input by the specified scale, then saturates and converts it to a 6-bit BF6 floating-point format.

Parameters
x: The float vector to be converted.
scale: The scaling factor applied to the input before conversion.
Returns
The converted bf6x32_t vector.

◆ bf6_convert_sr() [1/2]

__host__ __device__ bf6_t ck::bf6_convert_sr ( float  x,
float  scale = 1.0f 
)
inline

Converts a float to the 6-bit BF6 type using stochastic rounding.

Divides the input by the specified scale, and converts the result to a 6-bit BF6 floating-point format with stochastic rounding.

Parameters
x: The float value to be converted.
scale: The scaling factor applied to the input before conversion.
Returns
The converted bf6_t value.

◆ bf6_convert_sr() [2/2]

__host__ __device__ bf6x32_t ck::bf6_convert_sr ( float32_t  x,
float  scale = 1.0f 
)
inline

Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using stochastic rounding.

Divides the input by the specified scale, and converts the result to a 6-bit BF6 floating-point format with stochastic rounding.

Parameters
x: The float vector to be converted.
scale: The scaling factor applied to the input before conversion.
Returns
The converted bf6x32_t vector.

◆ bit_cast()

template<typename Y , typename X , typename enable_if< sizeof(X)==sizeof(Y), bool >::type = false>
__host__ constexpr __device__ Y ck::bit_cast ( const X &  x)
constexpr
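
bit_cast reinterprets the object representation of X as a Y of equal size, as the sizeof(X) == sizeof(Y) constraint above requires. A std::memcpy-based analog of the semantics (the CK version is additionally declared constexpr):

#include <cstdint>
#include <cstring>

template <typename Y, typename X>
Y bit_cast_analog(const X& x)
{
    static_assert(sizeof(X) == sizeof(Y), "sizes must match");
    Y y;
    std::memcpy(&y, &x, sizeof(Y)); // copy the raw object representation
    return y;
}

int main()
{
    float f = 1.0f;
    // IEEE-754 single precision: 1.0f has the bit pattern 0x3f800000.
    return bit_cast_analog<std::uint32_t>(f) == 0x3f800000u ? 0 : 1;
}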

◆ block_sync_lds()

__device__ void ck::block_sync_lds ( )

◆ block_sync_lds_direct_load()

__device__ void ck::block_sync_lds_direct_load ( )

◆ BlockGemmABScalePipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockGemmABScalePipeline_Selector ( )
constexpr

◆ BlockGemmBlockMoeScaleBPreshufflePipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto ck::BlockGemmBlockMoeScaleBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmBlockScaleBPreshufflePipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockGemmBlockScaleBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmBPreshufflePipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto ck::BlockGemmBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmMXBPreshufflePipeline_Selector() [1/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto ck::BlockGemmMXBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmMXBPreshufflePipeline_Selector() [2/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockGemmMXBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmMXNBSPipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto ck::BlockGemmMXNBSPipeline_Selector ( )
constexpr

◆ BlockGemmMXPipeline_Selector() [1/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto ck::BlockGemmMXPipeline_Selector ( )
constexpr

◆ BlockGemmMXPipeline_Selector() [2/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockGemmMXPipeline_Selector ( )
constexpr

◆ BlockGemmPipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeTypeA , typename ComputeTypeB , typename AccDataType , typename AWmmaTileDesc , typename BWmmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false>
constexpr auto ck::BlockGemmPipeline_Selector ( )
constexpr

◆ BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()

template<index_t BlockSize, typename FloatA , typename FloatB , typename FloatAcc , typename AK0MK1BlockDesc , typename BK0NK1BlockDesc , index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, LoopScheduler LoopSched, typename ComputeTypeA = FloatA, typename ComputeTypeB = FloatB>
constexpr auto ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector ( )
constexpr

◆ c_style_pointer_cast()

template<typename PY , typename PX , typename enable_if< is_pointer_v< PY > && is_pointer_v< PX >, bool >::type = false>
__host__ __device__ PY ck::c_style_pointer_cast ( PX  p_x)

◆ cast_pointer_to_constant_address_space()

template<typename T >
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* ck::cast_pointer_to_constant_address_space ( T *  p)

◆ cast_pointer_to_generic_address_space()

template<typename T >
__device__ T* ck::cast_pointer_to_generic_address_space ( T CK_CONSTANT_ADDRESS_SPACE *  p)

◆ chain_tensor_adaptors()

template<typename TensorAdaptor0 , typename TensorAdaptor1 >
__host__ constexpr __device__ auto ck::chain_tensor_adaptors ( const TensorAdaptor0 &  adaptor0,
const TensorAdaptor1 &  adaptor1 
)
constexpr

◆ clz()

__device__ int ck::clz ( uint32_t  x)
inline

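Since clz counts the leading zero bits of a 32-bit value, an integer log2 follows directly (sketch, valid for x > 0):

    __device__ int ilog2(uint32_t x) { return 31 - ck::clz(x); }
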
◆ concat_tuple() [1/3]

template<typename... X>
__host__ constexpr __device__ auto ck::concat_tuple ( const Tuple< X... > &  tx)
constexpr

◆ concat_tuple() [2/3]

template<typename... X, typename... Y>
__host__ constexpr __device__ auto ck::concat_tuple ( const Tuple< X... > &  tx,
const Tuple< Y... > &  ty 
)
constexpr

◆ concat_tuple() [3/3]

template<typename... X, typename... Tuples>
__host__ constexpr __device__ auto ck::concat_tuple ( const Tuple< X... > &  tx,
const Tuples &...  tuples 
)
constexpr

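Sketch (ck::make_tuple is assumed as the usual tuple factory):

    constexpr auto t0 = ck::make_tuple(1, 2);
    constexpr auto t1 = ck::make_tuple(3);
    // Element order and types are preserved: Tuple<int, int, int>.
    constexpr auto t  = ck::concat_tuple(t0, t1);
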
◆ concat_tuple_of_reference()

template<typename... X, typename... Y>
__host__ constexpr __device__ auto ck::concat_tuple_of_reference ( const Tuple< X &... > &  tx,
const Tuple< Y &... > &  ty 
)
constexpr

◆ conditional_expr()

template<bool predicate, typename X , typename Y >
constexpr auto ck::conditional_expr ( X &&  x,
Y &&  y 
)
constexpr

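The predicate is a template parameter, so the selection happens at compile time and the two branches may have different types (a sketch of the intended behavior):

    auto a = ck::conditional_expr<true>(1, 2.0f);  // yields the int branch
    auto b = ck::conditional_expr<false>(1, 2.0f); // yields the float branch
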
◆ container_concat() [1/4]

template<typename T , index_t NX, index_t NY>
__host__ constexpr __device__ auto ck::container_concat ( const Array< T, NX > &  ax,
const Array< T, NY > &  ay 
)
constexpr

◆ container_concat() [2/4]

template<typename Container >
__host__ constexpr __device__ auto ck::container_concat ( const Container &  x)
constexpr

◆ container_concat() [3/4]

template<typename... X, typename... Y>
__host__ constexpr __device__ auto ck::container_concat ( const Tuple< X... > &  tx,
const Tuple< Y... > &  ty 
)
constexpr

◆ container_concat() [4/4]

template<typename X , typename... Ys>
__host__ constexpr __device__ auto ck::container_concat ( const X &  x,
const Ys &...  ys 
)
constexpr

◆ container_push_back() [1/2]

template<typename TData , index_t NSize>
__host__ constexpr __device__ auto ck::container_push_back ( const Array< TData, NSize > &  a,
const TData &  x 
)
constexpr

◆ container_push_back() [2/2]

template<typename... Ts, typename T >
__host__ constexpr __device__ auto ck::container_push_back ( const Tuple< Ts... > &  a,
const T &  x 
)
constexpr

◆ container_push_front()

template<typename... Ts, typename T >
__host__ constexpr __device__ auto ck::container_push_front ( const Tuple< Ts... > &  a,
const T &  x 
)
constexpr

◆ container_reduce()

template<typename Container , typename Reduce , typename Init , index_t IBegin = 0, index_t IEnd = Container::Size(), index_t IStep = 1>
__host__ constexpr __device__ auto ck::container_reduce ( const Container &  x,
Reduce  reduce,
Init  init,
Number< IBegin >  = Number<0>{},
Number< IEnd >  = Number<Container::Size()>{},
Number< IStep >  = Number<1>{} 
)
constexpr

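A compile-time fold sketch (ck::math::multiplies and ck::Number are assumed here from the library's math and utility headers):

    // Folds 2 * 3 * 4 * 5 with initial value Number<1>, yielding 120.
    constexpr auto n = ck::container_reduce(
        ck::Sequence<2, 3, 4, 5>{}, ck::math::multiplies{}, ck::Number<1>{});
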
◆ container_reorder_given_new2old() [1/3]

template<typename TData , index_t NSize, index_t... IRs>
__host__ constexpr __device__ auto ck::container_reorder_given_new2old ( const Array< TData, NSize > &  old_array,
Sequence< IRs... >   
)
constexpr

◆ container_reorder_given_new2old() [2/3]

template<typename... Ts, index_t... IRs>
__host__ constexpr __device__ auto ck::container_reorder_given_new2old ( const Tuple< Ts... > &  old_tuple,
Sequence< IRs... >   
)
constexpr

◆ container_reorder_given_new2old() [3/3]

template<index_t... Is, index_t... IRs>
__host__ constexpr __device__ auto ck::container_reorder_given_new2old ( Sequence< Is... >  ,
Sequence< IRs... >   
)
constexpr

◆ container_reorder_given_old2new() [1/3]

template<typename TData , index_t NSize, index_t... IRs>
__host__ constexpr __device__ auto ck::container_reorder_given_old2new ( const Array< TData, NSize > &  old_array,
Sequence< IRs... >  old2new 
)
constexpr

◆ container_reorder_given_old2new() [2/3]

template<typename... Ts, index_t... IRs>
__host__ constexpr __device__ auto ck::container_reorder_given_old2new ( const Tuple< Ts... > &  old_tuple,
Sequence< IRs... >  old2new 
)
constexpr

◆ container_reorder_given_old2new() [3/3]

template<index_t... Is, index_t... IRs>
__host__ constexpr __device__ auto ck::container_reorder_given_old2new ( Sequence< Is... >  old_seq,
Sequence< IRs... >   
)
constexpr

◆ container_reverse_exclusive_scan() [1/3]

template<typename TData , index_t NSize, typename Reduce >
__host__ constexpr __device__ auto ck::container_reverse_exclusive_scan ( const Array< TData, NSize > &  x,
Reduce  f,
TData  init 
)
constexpr

◆ container_reverse_exclusive_scan() [2/3]

template<index_t... Is, typename Reduce , index_t Init>
__host__ constexpr __device__ auto ck::container_reverse_exclusive_scan ( const Sequence< Is... > &  seq,
Reduce  f,
Number< Init >   
)
constexpr

◆ container_reverse_exclusive_scan() [3/3]

template<typename... Xs, typename Reduce , typename Init >
__host__ constexpr __device__ auto ck::container_reverse_exclusive_scan ( const Tuple< Xs... > &  x,
Reduce  reduce,
Init  init 
)
constexpr

◆ container_reverse_inclusive_scan() [1/2]

template<typename TData , index_t NSize, typename Reduce >
__host__ constexpr __device__ auto ck::container_reverse_inclusive_scan ( const Array< TData, NSize > &  x,
Reduce  f,
TData  init 
)
constexpr

◆ container_reverse_inclusive_scan() [2/2]

template<typename... Xs, typename Reduce , typename TData >
__host__ constexpr __device__ auto ck::container_reverse_inclusive_scan ( const Tuple< Xs... > &  x,
Reduce  f,
TData  init 
)
constexpr

◆ coordinate_has_valid_offset()

template<typename TensorDesc , typename TensorCoord >
__host__ constexpr __device__ bool ck::coordinate_has_valid_offset ( const TensorDesc &  tensor_desc,
const TensorCoord &  coord 
)
constexpr

◆ coordinate_has_valid_offset_assuming_visible_index_is_valid()

template<typename TensorDesc , typename TensorCoord >
__host__ constexpr __device__ bool ck::coordinate_has_valid_offset_assuming_visible_index_is_valid ( const TensorDesc &  tensor_desc,
const TensorCoord &  coord 
)
constexpr

◆ DefaultValidCTileIndex()

template<typename CTileIdx , typename CTileDim >
__host__ __device__ bool ck::DefaultValidCTileIndex ( const CTileIdx &  c_tile_idx,
const CTileDim &  c_tile_dim 
)

◆ EnvGetString()

template<class EnvVar >
const std::string& ck::EnvGetString ( EnvVar  )
inline

◆ EnvIsDisabled()

template<class EnvVar >
bool ck::EnvIsDisabled ( EnvVar  )
inline

◆ EnvIsEnabled()

template<class EnvVar >
bool ck::EnvIsEnabled ( EnvVar  )
inline

◆ EnvIsUnset()

template<class EnvVar >
bool ck::EnvIsUnset ( EnvVar  )
inline

◆ EnvUnset()

template<class EnvVar >
void ck::EnvUnset ( EnvVar  )

◆ EnvValue()

template<class EnvVar >
uint64_t ck::EnvValue ( EnvVar  )
inline

◆ f4_convert_rne() [1/2]

__host__ __device__ f4_t ck::f4_convert_rne ( float  x,
float  scale = 1.0f 
)
inline

◆ f4_convert_rne() [2/2]

__host__ __device__ f4x32_t ck::f4_convert_rne ( float2_t  x,
float  scale = 1.0f 
)
inline

◆ f4_convert_sr() [1/2]

__host__ __device__ f4_t ck::f4_convert_sr ( float  x,
float  scale = 1.0f 
)
inline

◆ f4_convert_sr() [2/2]

__host__ __device__ f4x32_t ck::f4_convert_sr ( float2_t  x,
float  scale = 1.0f 
)
inline

◆ f6_convert_rne() [1/2]

__host__ __device__ f6_t ck::f6_convert_rne ( float  x,
float  scale = 1.0f 
)
inline

Converts a float to a 6-bit float type (f6_t) using round-to-nearest-even.

Divides the input by the specified scale, then saturates and converts it to the 6-bit floating-point format (f6_t).

Parameters
x: The input float value.
scale: A scaling factor applied to x before conversion.
Returns
The converted f6_t value.

◆ f6_convert_rne() [2/2]

__host__ __device__ f6x32_t ck::f6_convert_rne ( float32_t  x,
float  scale = 1.0f 
)
inline

Converts a 32-element single-precision float array into a packed 6-bit representation.

This function divides each input float by the provided scale value, then performs conversion with round-to-nearest-even rounding to pack each element into 6 bits of precision.

Parameters
x: A vector of 32 floats stored in float32_t.
scale: A scaling factor for each float before conversion.
Returns
An f6x32_t object storing the compressed 6-bit representation.

◆ f6_convert_sr() [1/2]

__host__ __device__ f6_t ck::f6_convert_sr ( float  x,
float  scale = 1.0f 
)
inline

Converts a float to the 6-bit floating-point type (f6_t) using stochastic rounding.

Divides the input by the specified scale, then performs saturation and conversion to f6_t based on a pseudo-randomly generated seed.

Parameters
x: The input float value.
scale: A scaling factor applied to x before conversion.
Returns
The converted f6_t value.

◆ f6_convert_sr() [2/2]

__host__ __device__ f6x32_t ck::f6_convert_sr ( float32_t  x,
float  scale = 1.0f 
)
inline

Converts a 32-element single-precision float array into a packed 6-bit representation.

This function divides each input float by the provided scale value, then performs conversion with stochastic rounding to pack each element into 6 bits of precision.

Parameters
x: A vector of 32 floats stored in float32_t.
scale: A scaling factor for each float before conversion.
Returns
An f6x32_t object storing the compressed 6-bit representation.

◆ f8_convert_rne()

template<typename Y , typename X >
__host__ constexpr __device__ Y ck::f8_convert_rne ( X  x)
constexpr

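Scalar sketch for the OCP FP8 target (value illustrative):

    float x = 0.3f;
    // Saturating round-to-nearest-even narrowing to the 8-bit OCP format;
    // vector specializations below convert two lanes at once.
    ck::f8_ocp_t q = ck::f8_convert_rne<ck::f8_ocp_t, float>(x);
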
◆ f8_convert_rne< bf8_fnuz_t, float >()

template<>
__host__ __device__ bf8_fnuz_t ck::f8_convert_rne< bf8_fnuz_t, float > ( float  x)
inline

◆ f8_convert_rne< bf8_fnuz_t, half_t >()

template<>
__host__ __device__ bf8_fnuz_t ck::f8_convert_rne< bf8_fnuz_t, half_t > ( half_t  x)
inline

◆ f8_convert_rne< bf8_ocp_t, bhalf_t >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_rne< bf8_ocp_t, bhalf_t > ( bhalf_t  x)
inline

Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using round-to-nearest-even.

Parameters
x: The input bhalf_t value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_rne< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_rne< bf8_ocp_t, float > ( float  x)
inline

Converts a float to an 8-bit float type (bf8_ocp_t) using round-to-nearest-even.

Parameters
x: The input float value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_rne< bf8_ocp_t, half_t >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_rne< bf8_ocp_t, half_t > ( half_t  x)
inline

Converts a half_t to an 8-bit float type (bf8_ocp_t) using round-to-nearest-even.

Parameters
x: The input half_t value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_rne< bf8x2_ocp_t, bhalf2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_rne< bf8x2_ocp_t, bhalf2_t > ( bhalf2_t  x)
inline

Converts a vector of 2 bhalf_t to a vector of 2 8-bit float values (bf8_ocp_t) using round-to-nearest-even.

Parameters
x: The input vector of 2 bhalf_t.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_rne< bf8x2_ocp_t, float2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_rne< bf8x2_ocp_t, float2_t > ( float2_t  x)
inline

Converts a vector of 2 floats to a vector of 2 8-bit float values (bf8_ocp_t) using round-to-nearest-even.

Parameters
x: The input vector of 2 floats.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_rne< bf8x2_ocp_t, half2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_rne< bf8x2_ocp_t, half2_t > ( half2_t  x)
inline

Converts a vector of 2 half_t to a vector of 2 8-bit float values (bf8_ocp_t) using round-to-nearest-even.

Parameters
x: The input vector of 2 half_t.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_rne< f8_fnuz_t, float >()

template<>
__host__ __device__ f8_fnuz_t ck::f8_convert_rne< f8_fnuz_t, float > ( float  x)
inline

◆ f8_convert_rne< f8_fnuz_t, half_t >()

template<>
__host__ __device__ f8_fnuz_t ck::f8_convert_rne< f8_fnuz_t, half_t > ( half_t  x)
inline

◆ f8_convert_rne< f8_ocp_t, bhalf_t >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_rne< f8_ocp_t, bhalf_t > ( bhalf_t  x)
inline

Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using round-to-nearest-even.

Parameters
x: The input bhalf_t value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_rne< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_rne< f8_ocp_t, float > ( float  x)
inline

Converts a float to an 8-bit float type (f8_ocp_t) using round-to-nearest-even.

Parameters
x: The input float value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_rne< f8_ocp_t, half_t >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_rne< f8_ocp_t, half_t > ( half_t  x)
inline

Converts a half_t to an 8-bit float type (f8_ocp_t) using round-to-nearest-even.

Parameters
x: The input half_t value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_rne< f8x2_ocp_t, bhalf2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_rne< f8x2_ocp_t, bhalf2_t > ( bhalf2_t  x)
inline

Converts a vector of 2 bhalf_t to a vector of 2 8-bit float values (f8_ocp_t) using round-to-nearest-even.

Parameters
x: The input vector of 2 bhalf_t.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_rne< f8x2_ocp_t, float2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_rne< f8x2_ocp_t, float2_t > ( float2_t  x)
inline

Converts a vector of 2 floats to a vector of 2 8-bit float values (f8_ocp_t) using round-to-nearest-even.

Parameters
x: The input vector of 2 floats.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_rne< f8x2_ocp_t, half2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_rne< f8x2_ocp_t, half2_t > ( half2_t  x)
inline

Converts a vector of 2 half_t to a vector of 2 8-bit float values (f8_ocp_t) using round-to-nearest-even.

Parameters
x: The input vector of 2 half_t.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_sr()

template<typename Y , typename X >
__host__ constexpr __device__ Y ck::f8_convert_sr ( X  x)
constexpr

◆ f8_convert_sr< bf8_fnuz_t, float >()

template<>
__host__ __device__ bf8_fnuz_t ck::f8_convert_sr< bf8_fnuz_t, float > ( float  x)
inline

◆ f8_convert_sr< bf8_fnuz_t, half_t >()

template<>
__host__ __device__ bf8_fnuz_t ck::f8_convert_sr< bf8_fnuz_t, half_t > ( half_t  x)
inline

◆ f8_convert_sr< bf8_ocp_t, bhalf_t >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_sr< bf8_ocp_t, bhalf_t > ( bhalf_t  x)
inline

Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding.

Parameters
x: The input bhalf_t value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_sr< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_sr< bf8_ocp_t, float > ( float  x)
inline

Converts a float to an 8-bit float type (bf8_ocp_t) using stochastic rounding.

Parameters
x: The input float value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_sr< bf8_ocp_t, half_t >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_sr< bf8_ocp_t, half_t > ( half_t  x)
inline

Converts a half_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding.

Parameters
x: The input half_t value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_sr< bf8x2_ocp_t, bhalf2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_sr< bf8x2_ocp_t, bhalf2_t > ( bhalf2_t  x)
inline

Converts a vector of 2 bhalf_t to a vector of 2 8-bit float values (bf8_ocp_t) using stochastic rounding.

Parameters
x: The input vector of 2 bhalf_t.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_sr< bf8x2_ocp_t, float2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_sr< bf8x2_ocp_t, float2_t > ( float2_t  x)
inline

Converts a vector of 2 floats to a vector of 2 8-bit float values (bf8_ocp_t) using stochastic rounding.

Parameters
x: The input vector of 2 floats.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_sr< bf8x2_ocp_t, half2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_sr< bf8x2_ocp_t, half2_t > ( half2_t  x)
inline

Converts a vector of 2 half_t to a vector of 2 8-bit float values (bf8_ocp_t) using stochastic rounding.

Parameters
x: The input vector of 2 half_t.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_sr< f8_fnuz_t, float >()

template<>
__host__ __device__ f8_fnuz_t ck::f8_convert_sr< f8_fnuz_t, float > ( float  x)
inline

◆ f8_convert_sr< f8_fnuz_t, half_t >()

template<>
__host__ __device__ f8_fnuz_t ck::f8_convert_sr< f8_fnuz_t, half_t > ( half_t  x)
inline

◆ f8_convert_sr< f8_ocp_t, bhalf_t >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_sr< f8_ocp_t, bhalf_t > ( bhalf_t  x)
inline

Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using stochastic rounding.

Parameters
x: The input bhalf_t value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_sr< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_sr< f8_ocp_t, float > ( float  x)
inline

Converts a float to an 8-bit float type (f8_ocp_t) using stochastic rounding.

Parameters
x: The input float value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_sr< f8_ocp_t, half_t >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_sr< f8_ocp_t, half_t > ( half_t  x)
inline

Converts a half_t to an 8-bit float type (f8_ocp_t) using stochastic rounding.

Parameters
x: The input half_t value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_sr< f8x2_ocp_t, bhalf2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_sr< f8x2_ocp_t, bhalf2_t > ( bhalf2_t  x)
inline

Converts a vector of 2 bhalf_t to a vector of 2 8-bit float values (f8_ocp_t) using stochastic rounding.

Parameters
x: The input vector of 2 bhalf_t.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_sr< f8x2_ocp_t, float2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_sr< f8x2_ocp_t, float2_t > ( float2_t  x)
inline

Converts a vector of 2 floats to a vector of 2 8-bit float values (f8_ocp_t) using stochastic rounding.

Parameters
x: The input vector of 2 floats.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_sr< f8x2_ocp_t, half2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_sr< f8x2_ocp_t, half2_t > ( half2_t  x)
inline

Converts a vector of 2 half_t to a vector of 2 8-bit float values (f8_ocp_t) using stochastic rounding.

Parameters
x: The input vector of 2 half_t.
Returns
The converted vector of 2 f8_ocp_t.

◆ fnv1a_hash()

constexpr unsigned int ck::fnv1a_hash ( std::string_view  str,
unsigned int  h = 2166136261u 
)
constexpr

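This is the standard 32-bit FNV-1a recurrence (offset basis 2166136261, prime 16777619), usable in constant expressions, e.g. for hashing device names at compile time (sketch; the string is illustrative):

    // h_{i+1} = (h_i ^ byte_i) * 16777619
    constexpr unsigned int h = ck::fnv1a_hash("gfx942");
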
◆ fp8_is_inf()

template<>
__host__ constexpr __device__ bool ck::fp8_is_inf ( bf8_ocp_t  a)
inline constexpr

◆ fp8_is_nan() [1/4]

template<>
__host__ constexpr __device__ bool ck::fp8_is_nan ( bf8_fnuz_t  a)
inline constexpr

◆ fp8_is_nan() [2/4]

template<>
__host__ constexpr __device__ bool ck::fp8_is_nan ( bf8_ocp_t  a)
inline constexpr

◆ fp8_is_nan() [3/4]

template<>
__host__ constexpr __device__ bool ck::fp8_is_nan ( f8_fnuz_t  a)
inline constexpr

◆ fp8_is_nan() [4/4]

template<>
__host__ constexpr __device__ bool ck::fp8_is_nan ( f8_ocp_t  a)
inline constexpr

◆ generate_sequence()

template<typename F , index_t N>
__host__ constexpr __device__ auto ck::generate_sequence ( F  ,
Number< N >   
)
constexpr

◆ generate_sequence_v2()

template<typename F , index_t N>
__host__ constexpr __device__ auto ck::generate_sequence_v2 ( F &&  f,
Number< N >   
)
constexpr

◆ generate_tie()

template<typename F , index_t N>
__host__ constexpr __device__ auto ck::generate_tie ( F &&  f,
Number< N >   
)
constexpr

◆ generate_tuple() [1/2]

template<typename F , index_t N>
__host__ constexpr __device__ auto ck::generate_tuple ( F &&  f,
LongNumber< N >   
)
constexpr

◆ generate_tuple() [2/2]

template<typename F , index_t N>
__host__ constexpr __device__ auto ck::generate_tuple ( F &&  f,
Number< N >   
)
constexpr

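Sketch: f is invoked with Number<0> through Number<N-1> and the results are collected into a Tuple:

    // Yields Tuple<Number<0>, Number<2>, Number<4>>.
    constexpr auto t = ck::generate_tuple(
        [](auto i) { return ck::Number<2 * i.value>{}; }, ck::Number<3>{});
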
◆ generate_tuple_for()

template<typename F , index_t... ids>
__host__ constexpr __device__ auto ck::generate_tuple_for ( F &&  f,
Sequence< ids... >   
)
constexpr

◆ get_available_cpu_cores()

unsigned int ck::get_available_cpu_cores ( )
inline

◆ get_block_1d_id()

__device__ index_t ck::get_block_1d_id ( )

◆ get_block_size()

__device__ index_t ck::get_block_size ( )

◆ get_container_subset() [1/2]

template<typename T , index_t N, index_t... Is>
__host__ constexpr __device__ auto ck::get_container_subset ( const Array< T, N > &  arr,
Sequence< Is... >   
)
constexpr

◆ get_container_subset() [2/2]

template<typename... Ts, index_t... Is>
__host__ constexpr __device__ auto ck::get_container_subset ( const Tuple< Ts... > &  tup,
Sequence< Is... >   
)
constexpr

◆ get_device_name()

std::string ck::get_device_name ( )
inline

◆ get_grid_size()

__device__ index_t ck::get_grid_size ( )

◆ get_shift< 1 >()

template<>
constexpr __device__ index_t ck::get_shift< 1 > ( )
constexpr

◆ get_thread_global_1d_id()

__device__ index_t ck::get_thread_global_1d_id ( )

◆ get_thread_local_1d_id()

__device__ index_t ck::get_thread_local_1d_id ( )

◆ get_type_name()

template<typename T >
const char* ck::get_type_name ( )
inline

◆ get_warp_local_1d_id()

__device__ index_t ck::get_warp_local_1d_id ( )

◆ get_warp_size()

__host__ index_t ck::get_warp_size ( )
inline constexpr

◆ GridwiseGemmPipeline_Selector()

template<PipelineVersion PipelineVer, index_t NumPrefetch = 1, LoopScheduler LoopSched = LoopScheduler::Default, bool AEnableLds = true, bool BEnableLds = true>
constexpr auto ck::GridwiseGemmPipeline_Selector ( )
constexpr

◆ GridwiseGemmPipeline_v1_Selector()

template<index_t NumPrefetch, LoopScheduler LoopSched>
constexpr auto ck::GridwiseGemmPipeline_v1_Selector ( )
constexpr

◆ i4_to_bhalf4()

__device__ bhalf4_t ck::i4_to_bhalf4 ( int  q)
inline

◆ i4_to_f8x4()

__device__ f8x4_t ck::i4_to_f8x4 ( int  q)
inline

◆ i4_to_fp8x8()

__device__ f8x8_t ck::i4_to_fp8x8 ( int  q)
inline

◆ i4_to_half4()

__device__ half4_t ck::i4_to_half4 ( int  q)
inline

◆ i4_to_half4_scale()

__device__ half4_t ck::i4_to_half4_scale ( int  q,
const ck::half2_t &  scale 
)
inline

◆ inclusive_scan_sequence()

template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ auto ck::inclusive_scan_sequence ( Seq  ,
Reduce  ,
Number< Init >   
)
constexpr

◆ inner_product()

template<typename TA , typename TB , typename TC >
__device__ void ck::inner_product ( const TA &  a,
const TB &  b,
TC &  c 
)

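Sketch of the accumulate semantics (c is updated in place; the packed specializations let the compiler select dot-product instructions where the target provides them):

    __device__ float dot4(const ck::half4_t& a, const ck::half4_t& b)
    {
        float acc = 0.f;
        // Accumulates a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3] into acc.
        ck::inner_product<ck::half4_t, ck::half4_t, float>(a, b, acc);
        return acc;
    }
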
◆ inner_product< bhalf_t, bhalf_t, float >()

template<>
__device__ void ck::inner_product< bhalf_t, bhalf_t, float > ( const bhalf_t &  a,
const bhalf_t &  b,
float &  c 
)

◆ inner_product< float, float, float >()

template<>
__device__ void ck::inner_product< float, float, float > ( const float &  a,
const float &  b,
float &  c 
)

◆ inner_product< float2_t, float2_t, float >()

template<>
__device__ void ck::inner_product< float2_t, float2_t, float > ( const float2_t &  a,
const float2_t &  b,
float &  c 
)

◆ inner_product< float4_t, float4_t, float >()

template<>
__device__ void ck::inner_product< float4_t, float4_t, float > ( const float4_t &  a,
const float4_t &  b,
float &  c 
)

◆ inner_product< half2_t, half2_t, float >()

template<>
__device__ void ck::inner_product< half2_t, half2_t, float > ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

◆ inner_product< half4_t, half4_t, float >()

template<>
__device__ void ck::inner_product< half4_t, half4_t, float > ( const half4_t &  a,
const half4_t &  b,
float &  c 
)

◆ inner_product< half8_t, half8_t, float >()

template<>
__device__ void ck::inner_product< half8_t, half8_t, float > ( const half8_t &  a,
const half8_t &  b,
float &  c 
)

◆ inner_product< half_t, half_t, float >()

template<>
__device__ void ck::inner_product< half_t, half_t, float > ( const half_t &  a,
const half_t &  b,
float &  c 
)

◆ inner_product< int8_t, int8_t, int32_t >()

template<>
__device__ void ck::inner_product< int8_t, int8_t, int32_t > ( const int8_t &  a,
const int8_t &  b,
int32_t &  c 
)

◆ inner_product< int8x16_t, int8x16_t, int32_t >()

template<>
__device__ void ck::inner_product< int8x16_t, int8x16_t, int32_t > ( const int8x16_t &  a,
const int8x16_t &  b,
int32_t &  c 
)

◆ inner_product< int8x2_t, int8x2_t, int32_t >()

template<>
__device__ void ck::inner_product< int8x2_t, int8x2_t, int32_t > ( const int8x2_t &  a,
const int8x2_t &  b,
int32_t &  c 
)

◆ inner_product< int8x4_t, int8x4_t, int32_t >()

template<>
__device__ void ck::inner_product< int8x4_t, int8x4_t, int32_t > ( const int8x4_t &  a,
const int8x4_t &  b,
int32_t &  c 
)

◆ inner_product< int8x8_t, int8x8_t, int32_t >()

template<>
__device__ void ck::inner_product< int8x8_t, int8x8_t, int32_t > ( const int8x8_t &  a,
const int8x8_t &  b,
int32_t &  c 
)

◆ is_bf16_atomic_supported()

bool ck::is_bf16_atomic_supported ( )
inline

◆ is_gfx101_supported()

bool ck::is_gfx101_supported ( )
inline

◆ is_gfx103_supported()

bool ck::is_gfx103_supported ( )
inline

◆ is_gfx11_supported()

bool ck::is_gfx11_supported ( )
inline

◆ is_gfx12_supported()

bool ck::is_gfx12_supported ( )
inline

◆ is_lds_direct_load_supported()

bool ck::is_lds_direct_load_supported ( )
inline

◆ is_native_type()

template<typename T >
constexpr bool ck::is_native_type ( )
inline constexpr

◆ is_tf32_supported()

bool ck::is_tf32_supported ( )
inline

◆ is_wmma_supported()

bool ck::is_wmma_supported ( )
inline

◆ is_xdl_supported()

bool ck::is_xdl_supported ( )
inline

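Host-side sketch of capability-based dispatch using these queries (the branch bodies are placeholders):

    if(ck::is_xdl_supported())
    {
        // launch an XDL (MFMA) kernel
    }
    else if(ck::is_wmma_supported())
    {
        // fall back to a WMMA kernel
    }
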
◆ is_xdl_wmma_supported()

template<typename ADataType , typename BDataType , index_t MPerXDL, index_t NPerXDL>
bool ck::is_xdl_wmma_supported ( )
inline

◆ IsNestedTuple()

template<typename... Ts>
__host__ constexpr __device__ auto ck::IsNestedTuple ( const Tuple< Ts... > &  )
constexpr

◆ kernel_batched_elementwise()

template<typename GridwiseElementwiseFunctor , typename InGridDescTuple , typename OutGridDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename Block2TileMap , typename ElementwiseOperation , index_t NumInputs, index_t NumOutputs>
__global__ void ck::kernel_batched_elementwise ( const InGridDescTuple  in_grid_desc_tuple,
const OutGridDescTuple  out_grid_desc_tuple,
const InDataTypePointerTuple  p_in_global_tuple,
const OutDataTypePointerTuple  p_out_global_tuple,
const Block2TileMap  block_2_tile_map,
const ElementwiseOperation  elementwise_op,
const index_t  batch_count,
const std::array< index_t, NumInputs >  input_batch_strides,
const std::array< index_t, NumOutputs >  output_batch_strides 
)

◆ kernel_batched_gemm_b_scale_xdl_cshuffle_v3()

template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_batched_gemm_b_scale_xdl_cshuffle_v3 ( BatchedGemmArg  karg)

◆ kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds()

template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds ( BatchedGemmArg  karg)

◆ kernel_batched_gemm_xdl_cshuffle_v3_multi_d()

template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_batched_gemm_xdl_cshuffle_v3_multi_d ( BatchedGemmArg  karg)

◆ kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds()

template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds ( BatchedGemmArg  karg)

◆ kernel_batchnorm_backward_with_blockwise_welford()

template<typename GridwiseBatchrNormBackwardWithBlockwiseWelford_ , typename XDataType , typename DyDataType , typename DxDataType , typename AccDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor >
__global__ void ck::kernel_batchnorm_backward_with_blockwise_welford ( const XYGridDesc_M_K  x_grid_desc_m_k,
const XYGridDesc_M_K  dy_grid_desc_m_k,
const XYGridDesc_M_K  dx_grid_desc_m_k,
const ScaleBiasGridDesc_M  scale_grid_desc_m,
const ScaleBiasGridDesc_M  dscale_dbias_grid_desc_m,
const MeanVarGridDesc_M  mean_var_grid_desc_m,
const GetReduceCountPerThreadFunctor  get_reduce_count_per_thread,
long_index_t  reduce_size,
index_t  num_k_block_tile_iteration,
AccDataType  epsilon,
const XDataType *const __restrict__  p_x,
const DyDataType *const __restrict__  p_dy,
const ScaleDataType *const __restrict__  p_scale,
bool  haveSavedMeanInvVar,
const MeanVarDataType *const __restrict__  p_savedMean,
const MeanVarDataType *const __restrict__  p_savedInvVar,
const DyElementwiseOp  dy_elementwise_op,
DxDataType *const __restrict__  p_dx,
DscaleDbiasDataType *const __restrict__  p_dscale,
DscaleDbiasDataType *const __restrict__  p_dbias 
)

◆ kernel_batchnorm_forward_with_blockwise_welford()

template<typename GridwiseBatchrNormForwardWithBlockwiseWelford_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor >
__global__ void ck::kernel_batchnorm_forward_with_blockwise_welford ( const XYGridDesc_M_K  x_grid_desc_m_k,
const XYGridDesc_M_K  y_grid_desc_m_k,
const ScaleBiasGridDesc_M  scale_grid_desc_m,
const ScaleBiasGridDesc_M  bias_grid_desc_m,
const MeanVarGridDesc_M  mean_var_grid_desc_m,
const GetReduceCountPerThreadFunctor  get_reduce_count_per_thread,
index_t  num_k_block_tile_iteration,
AccDataType  epsilon,
const XDataType *const __restrict__  p_x,
const ScaleDataType *const __restrict__  p_scale,
const BiasDataType *const __restrict__  p_bias,
const YElementwiseOp  y_elementwise_op,
YDataType *const __restrict__  p_y,
bool  updateMovingAverage,
AccDataType  averageFactor,
MeanVarDataType *const __restrict__  resultRunningMean,
MeanVarDataType *const __restrict__  resultRunningVariance,
bool  saveMeanInvVariance,
MeanVarDataType *const __restrict__  resultSaveMean,
MeanVarDataType *const __restrict__  resultSaveInvVariance 
)

◆ kernel_buffer_set_value()

template<index_t BlockSize, typename DataType , typename Grid1dBufferDescType >
__global__ void ck::kernel_buffer_set_value ( const Grid1dBufferDescType  grid_1d_buffer_desc,
DataType *const __restrict__  p_global,
DataType  value 
)

◆ kernel_contraction_multiple_abd_xdl_cshuffle()

template<typename GridwiseGemm , typename AsPointer , typename BsPointer , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AsGridDesc_AK0_M_AK1 , typename BsGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_abd_xdl_cshuffle ( AsPointer  p_as_grid,
BsPointer  p_bs_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AsGridDesc_AK0_M_AK1  as_grid_desc_ak0_m_ak1,
const BsGridDesc_BK0_N_BK1  bs_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_contraction_multiple_d_wmma_cshuffle()

template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AGridDesc , typename BGridDesc , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename ComputePtrOffsetOfBatch , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_d_wmma_cshuffle ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const index_t  batch_count,
const AGridDesc  a_grid_desc,
const BGridDesc  b_grid_desc,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const ComputePtrOffsetOfBatch  compute_ptr_offset_of_batch,
const Block2CTileMap  block_2_etile_map 
)

◆ kernel_contraction_multiple_d_xdl_cshuffle() [1/3]

template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_d_xdl_cshuffle ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatDsPointer  p_ds_grid,
FloatE *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_contraction_multiple_d_xdl_cshuffle() [2/3]

template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ComputePtrOffsetOfBatch , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_d_xdl_cshuffle ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatDsPointer  p_ds_grid,
FloatE *__restrict__  p_e_grid,
const index_t  batch_count,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const ComputePtrOffsetOfBatch  compute_ptr_offset_of_batch,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_contraction_multiple_d_xdl_cshuffle() [3/3]

template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AKB_AK0_M_AK1 , typename BGridDesc_BKB_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ComputePtrOffsetOfBatch , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_d_xdl_cshuffle ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatDsPointer  p_ds_grid,
FloatE *__restrict__  p_e_grid,
const index_t  batch_count,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_AKB_AK0_M_AK1  a_grid_desc_akb_ak0_m_ak1,
const BGridDesc_BKB_BK0_N_BK1  b_grid_desc_bkb_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const ComputePtrOffsetOfBatch  compute_ptr_offset_of_batch,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_elementwise()

template<typename GridwiseElementwiseFunctor , typename InGridDescTuple , typename OutGridDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename Block2TileMap , typename ElementwiseOperation >
__global__ void ck::kernel_elementwise ( const InGridDescTuple  in_grid_desc_tuple,
const OutGridDescTuple  out_grid_desc_tuple,
const InDataTypePointerTuple  p_in_global_tuple,
const OutDataTypePointerTuple  p_out_global_tuple,
const Block2TileMap  block_2_tile_map,
const ElementwiseOperation  elementwise_op 
)

◆ kernel_elementwise_1d()

template<typename GridwiseElementwise1dFunctor , typename InGrid1dDescTuple , typename OutGrid1dDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename ElementwiseOperation , typename UnaryOperation , typename Scale >
__global__ void ck::kernel_elementwise_1d ( const InGrid1dDescTuple  in_grid_1d_desc_tuple,
const OutGrid1dDescTuple  out_grid_1d_desc_tuple,
const InDataTypePointerTuple  p_in_global_tuple,
const OutDataTypePointerTuple  p_out_global_tuple,
const ElementwiseOperation  elementwise_op,
const UnaryOperation  unary_op,
const Scale  scale_op 
)

◆ kernel_elementwise_batched_dual()

template<typename GridwiseElementwiseFunctorA , typename GridwiseElementwiseFunctorB , typename InAGridDescTuple , typename InBGridDescTuple , typename OutAGridDescTuple , typename OutBGridDescTuple , typename InADataTypePointerTuple , typename InBDataTypePointerTuple , typename OutADataTypePointerTuple , typename OutBDataTypePointerTuple , typename Block2TileMapA , typename Block2TileMapB , typename ElementwiseOperation , index_t NumInputsA, index_t NumInputsB, index_t NumOutputsA, index_t NumOutputsB>
__global__ void ck::kernel_elementwise_batched_dual ( const InAGridDescTuple  in_grid_desc_tuple_a,
const InBGridDescTuple  in_grid_desc_tuple_b,
const OutAGridDescTuple  out_grid_desc_tuple_a,
const OutBGridDescTuple  out_grid_desc_tuple_b,
const InADataTypePointerTuple  p_in_global_tuple_a,
const InBDataTypePointerTuple  p_in_global_tuple_b,
const OutADataTypePointerTuple  p_out_global_tuple_a,
const OutBDataTypePointerTuple  p_out_global_tuple_b,
const Block2TileMapA  block_2_tile_map_a,
const Block2TileMapB  block_2_tile_map_b,
const ElementwiseOperation  elementwise_op,
const index_t  a_grid_size,
const index_t  batch_count_a,
const index_t  batch_count_b,
const std::array< index_t, NumInputsA >  input_batch_strides_a,
const std::array< index_t, NumInputsB >  input_batch_strides_b,
const std::array< index_t, NumOutputsA >  output_batch_strides_a,
const std::array< index_t, NumOutputsB >  output_batch_strides_b 
)

◆ kernel_elementwise_dual()

template<typename GridwiseElementwiseFunctorA , typename GridwiseElementwiseFunctorB , typename InAGridDescTuple , typename InBGridDescTuple , typename OutAGridDescTuple , typename OutBGridDescTuple , typename InADataTypePointerTuple , typename InBDataTypePointerTuple , typename OutADataTypePointerTuple , typename OutBDataTypePointerTuple , typename Block2TileMapA , typename Block2TileMapB , typename ElementwiseOperation >
__global__ void ck::kernel_elementwise_dual ( const InAGridDescTuple  in_grid_desc_tuple_a,
const InBGridDescTuple  in_grid_desc_tuple_b,
const OutAGridDescTuple  out_grid_desc_tuple_a,
const OutBGridDescTuple  out_grid_desc_tuple_b,
const InADataTypePointerTuple  p_in_global_tuple_a,
const InBDataTypePointerTuple  p_in_global_tuple_b,
const OutADataTypePointerTuple  p_out_global_tuple_a,
const OutBDataTypePointerTuple  p_out_global_tuple_b,
const Block2TileMapA  block_2_tile_map_a,
const Block2TileMapB  block_2_tile_map_b,
const ElementwiseOperation  elementwise_op,
const index_t  a_grid_size 
)

◆ kernel_elementwise_layernorm()

template<typename GridwiseElementwiseReduction , typename InDataTypePointerTuple , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename AccDataType , typename XElementwiseOperation , typename YElementwiseOperation , typename InGrid2dDescTuple , typename GridDesc_M_K >
__global__ void ck::kernel_elementwise_layernorm ( const InGrid2dDescTuple  in_grid_2d_desc_tuple,
const GridDesc_M_K  x_grid_desc_m_k,
const GridDesc_M_K  gamma_grid_desc_m_k,
const GridDesc_M_K  beta_grid_desc_m_k,
const GridDesc_M_K  y_grid_desc_m_k,
index_t  num_k_block_tile_iteration,
AccDataType  epsilon,
const InDataTypePointerTuple  p_in_global_tuple,
const GammaDataType *const __restrict__  p_gamma_global,
const BetaDataType *const __restrict__  p_beta_global,
YDataType *const __restrict__  p_y_global,
const XElementwiseOperation  x_elementwise_op,
const YElementwiseOperation  y_elementwise_op 
)

◆ kernel_fpAintB_gemm_wmma()

template<typename GridwiseGemm , typename ADataType , typename BDataType , typename ScaleDataType , typename CDataType , typename AGridDesc , typename BGridDesc , typename ScaleGridDesc , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_fpAintB_gemm_wmma ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
const ScaleDataType *__restrict__  p_scale_grid,
CDataType *__restrict__  p_c_grid,
const AGridDesc  a_grid_desc,
const BGridDesc  b_grid_desc,
const ScaleGridDesc  scale_grid_desc,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_bias_add_reduce_xdl_cshuffle_v1()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename FloatC0 , typename FloatC1 , typename ReducePtrsGlobal , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename C1ElementwiseOperation , typename ReduceInElementwiseOperations , typename ReduceAccElementwiseOperations , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ReduceGridDescriptor_MBlock_MPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_bias_add_reduce_xdl_cshuffle_v1 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const FloatC0 *__restrict__  p_bias_grid,
const FloatC1 *__restrict__  p_d0_grid,
ReducePtrsGlobal  p_reduces_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const C1ElementwiseOperation  c1_element_op,
const ReduceInElementwiseOperations  reduce_in_element_ops,
const ReduceAccElementwiseOperations  reduce_out_element_ops,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c_grid_desc_mblock_mperblock_nblock_nperblock,
const C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c0_grid_desc_mblock_mperblock_nblock_nperblock,
const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c1_grid_desc_mblock_mperblock_nblock_nperblock,
const ReduceGridDescriptor_MBlock_MPerBlock  reduce_grid_desc_mblock_mperblock,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_dl_multiple_d()

template<typename GridwiseGemm , typename ABDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_K0_M0_M1_K1 , typename BGridDesc_K0_N0_N1_K1 , typename DsGridDesc_M0_M10_M11_N0_N10_N11 , typename CGridDesc_M0_M10_M11_N0_N10_N11 , typename Block2CTileMap , bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__global__ void ck::kernel_gemm_dl_multiple_d ( const ABDataType *__restrict__  p_a_grid,
const ABDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_K0_M0_M1_K1  a_grid_desc_k0_m0_m1_k1,
const BGridDesc_K0_N0_N1_K1  b_grid_desc_k0_n0_n1_k1,
const DsGridDesc_M0_M10_M11_N0_N10_N11  ds_grid_desc_m0_m10_m11_n0_n10_n11,
const CGridDesc_M0_M10_M11_N0_N10_N11  e_grid_desc_m0_m10_m11_n0_n10_n11,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_dl_v1r3()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M0_M1_K1 , typename BGridDesc_K0_N0_N1_K1 , typename CGridDesc_M0_M10_M11_N0_N10_N11 , typename Block2CTileMap , bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__global__ void ck::kernel_gemm_dl_v1r3 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const AGridDesc_K0_M0_M1_K1  a_grid_desc_k0_m0_m1_k1,
const BGridDesc_K0_N0_N1_K1  b_grid_desc_k0_n0_n1_k1,
const CGridDesc_M0_M10_M11_N0_N10_N11  c_grid_desc_m0_m10_m11_n0_n10_n11,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_dpp()

template<typename GridwiseGemm , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_dpp ( const typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_layernorm_xdl_cshuffle_v1()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename FloatC0 , typename AElementwiseOperation , typename BElementwiseOperation , typename AccElementwiseOperation , typename CElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C0GridDescriptor_NBlock_NPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_layernorm_xdl_cshuffle_v1 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const FloatC0 *__restrict__  p_c0_bias_grid,
const FloatC0 *__restrict__  p_c0_add_grid,
const FloatC0 *__restrict__  p_c0_gamma_grid,
const FloatC0 *__restrict__  p_c0_beta_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const AccElementwiseOperation  acc_element_op,
const CElementwiseOperation  c_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c_grid_desc_mblock_mperblock_nblock_nperblock,
const C0GridDescriptor_NBlock_NPerBlock  c0_grid_desc_nblock_nperblock,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_multiple_d_multiple_r_xdl_cshuffle()

template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename FloatRsPointer , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename QsElementwiseOperation , typename RsElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename RsGridDescriptor_MBlock_MPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_multiple_d_multiple_r_xdl_cshuffle ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatDsPointer  p_ds_grid,
FloatE *__restrict__  p_e_grid,
FloatRsPointer  p_rs_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const QsElementwiseOperation  qs_element_op,
const RsElementwiseOperation  rs_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const RsGridDescriptor_MBlock_MPerBlock  rs_grid_desc_mblock_mperblock,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_gemm_multiple_d_welford_first_half_xdl_cshuffle()

template<typename GridwiseGemmWelford , typename ABDataType , typename DsPointer , typename EMeanVarDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename MeanVarGridDescriptor_MBlock_MPerBlock_NBlock , typename CountGridDescriptor_MBlock_MPerBlock_NBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_multiple_d_welford_first_half_xdl_cshuffle ( const ABDataType *__restrict__  p_a_grid,
const ABDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EMeanVarDataType *__restrict__  p_e_grid,
EMeanVarDataType *__restrict__  p_welford_mean_grid,
EMeanVarDataType *__restrict__  p_welford_var_grid,
int32_t *__restrict__  p_welford_count_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const MeanVarGridDescriptor_MBlock_MPerBlock_NBlock  mean_var_grid_desc_mblock_mperblock_nblock,
const CountGridDescriptor_MBlock_MPerBlock_NBlock  count_grid_desc_mblock_mperblock_nblock,
const Block2ETileMap  block_2_etile_map,
index_t  NRaw 
)

◆ kernel_gemm_multiple_d_xdl_cshuffle()

template<typename GridwiseGemm , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_multiple_d_xdl_cshuffle ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_gemm_multiple_d_xdl_cshuffle_lds_direct_load()

template<typename GridwiseGemm , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_multiple_d_xdl_cshuffle_lds_direct_load ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_gemm_mupltipe_d_wmma_cshuffle()

template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AGridDesc , typename BGridDesc , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_mupltipe_d_wmma_cshuffle ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const AGridDesc  a_grid_desc,
const BGridDesc  b_grid_desc,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_reduce_xdl_cshuffle_v1()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename ReducePtrsGlobal , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename ReduceInElementwiseOperations , typename ReduceAccElementwiseOperations , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ReduceGridDescriptor_MBlock_MPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_reduce_xdl_cshuffle_v1 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
ReducePtrsGlobal  p_reduces_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const ReduceInElementwiseOperations  reduce_in_element_ops,
const ReduceAccElementwiseOperations  reduce_out_element_ops,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c_grid_desc_mblock_mperblock_nblock_nperblock,
const ReduceGridDescriptor_MBlock_MPerBlock  reduce_grid_desc_mblock_mperblock,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_wmma()

template<typename GridwiseGemm , typename ADataType , typename BDataType , typename CDataType , typename AGridDesc , typename BGridDesc , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_wmma ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
CDataType *__restrict__  p_c_grid,
const AGridDesc  a_grid_desc,
const BGridDesc  b_grid_desc,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_wmma_cshuffle_v3()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum EGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_wmma_cshuffle_v3 ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v1() [1/2]

template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdl_cshuffle_v1 ( const FloatA *__restrict__  p_a_grid,
const FloatB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
typename GridwiseGemm::Problem  problem 
)

◆ kernel_gemm_xdl_cshuffle_v1() [2/2]

template<typename GridwiseGemm , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdl_cshuffle_v1 ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v2() [1/2]

template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdl_cshuffle_v2 ( const FloatA *  p_a_grid,
const FloatB *  p_b_grid,
FloatC *  p_c_grid,
typename GridwiseGemm::Problem  problem 
)

◆ kernel_gemm_xdl_cshuffle_v2() [2/2]

template<typename GridwiseGemm , bool HasMainKBlockLoop, index_t TailNum = 3>
__global__ void ck::kernel_gemm_xdl_cshuffle_v2 ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3 ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_b_preshuffle()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_b_preshuffle ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_mx() [1/2]

template<bool Use2LDS, typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ enable_if_t<!Use2LDS, void > ck::kernel_gemm_xdl_cshuffle_v3_mx ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_mx() [2/2]

template<bool Use2LDS, typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ enable_if_t< Use2LDS, void > ck::kernel_gemm_xdl_cshuffle_v3_mx ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_waveletmodel_cshuffle()

template<typename GridwiseGemm , typename ABDataType , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename EElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdl_waveletmodel_cshuffle ( const ABDataType *__restrict__  p_a_grid,
const ABDataType *__restrict__  p_b_grid,
EDataType *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const EElementwiseOperation  e_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_gemm_xdlops_bwd_weight()

template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , typename AGridDesc_B_K0_M_K1 , typename BGridDesc_B_K0_N_K1 , typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename CBlockClusterAdaptor , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_bwd_weight ( const FloatA *__restrict__  p_a_grid,
const FloatB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const AGridDesc_B_K0_M_K1  a_b_k0_m_k1_grid_desc,
const BGridDesc_B_K0_N_K1  b_b_k0_n_k1_grid_desc,
const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock  c_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const CBlockClusterAdaptor  c_block_cluster_adaptor 
)

◆ kernel_gemm_xdlops_skip_b_lds_v1()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDesc_M_N , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainK0BlockLoop>
__global__ void ck::kernel_gemm_xdlops_skip_b_lds_v1 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const AGridDesc_K0_M_K1  a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1  b_grid_desc_k0_n_k1,
const CGridDesc_M_N  c_grid_desc_m_n,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_xdlops_splitk_lds_direct_load()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation >
__global__ void ck::kernel_gemm_xdlops_splitk_lds_direct_load ( typename GridwiseGemm::Argument  karg,
const Block2CTileMap &  b2c_map,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op 
)

◆ kernel_gemm_xdlops_streamk()

template<typename GridwiseGemm >
__global__ void ck::kernel_gemm_xdlops_streamk ( const typename GridwiseGemm::FloatAB *  p_a_grid,
const typename GridwiseGemm::FloatAB *  p_b_grid,
typename GridwiseGemm::FloatC *  p_c_grid,
void *  p_workspace,
index_t  M,
index_t  N,
index_t  K,
index_t  StrideA,
index_t  StrideB,
index_t  StrideC,
typename GridwiseGemm::Block2CTileMap  block_mapping 
)

◆ kernel_gemm_xdlops_v2r3() [1/2]

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDesc_M_N , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v2r3 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const AGridDesc_K0_M_K1  a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1  b_grid_desc_k0_n_k1,
const CGridDesc_M_N  c_grid_desc_m_n 
)

◆ kernel_gemm_xdlops_v2r3() [2/2]

template<typename GridwiseGemm , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v2r3 ( const typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdlops_v2r4()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename ABK0MK1GridDesc , typename BBK0NK1GridDesc , typename CM0N0M1N1M2M3M4N2GridDesc , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename CBlockClusterAdaptor , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v2r4 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const ABK0MK1GridDesc  a_b_k0_m_k1_grid_desc,
const BBK0NK1GridDesc  b_b_k0_n_k1_grid_desc,
const CM0N0M1N1M2M3M4N2GridDesc  c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const CBlockClusterAdaptor  c_block_cluster_adaptor 
)

◆ kernel_gemm_xdlops_v2r4r2_simplified()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation >
__global__ void ck::kernel_gemm_xdlops_v2r4r2_simplified ( typename GridwiseGemm::Argument  karg,
const Block2CTileMap &  b2c_map,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op 
)

◆ kernel_gemm_xdlops_v3r1()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainK0BlockLoop>
__global__ void ck::kernel_gemm_xdlops_v3r1 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl  c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_xdlops_v3r2()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v3r2 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const FloatC *__restrict__  p_c0_grid,
const AGridDesc_K0_M_K1  a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1  b_grid_desc_k0_n_k1,
const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl  c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl  c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_xdlops_v3r3()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v3r3 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const FloatC *__restrict__  p_c0_grid,
const FloatC *__restrict__  p_c1_grid,
const AGridDesc_K0_M_K1  a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1  b_grid_desc_k0_n_k1,
const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl  c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl  c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl  c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_grouped_contraction_multiple_d_xdl_cshuffle()

template<typename GridwiseGemm , typename ContractionMultiDKernelArg , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , bool HasMainKBlockLoop>
__global__ void ck::kernel_grouped_contraction_multiple_d_xdl_cshuffle ( const void CK_CONSTANT_ADDRESS_SPACE *  contraction_args,
const index_t  group_count,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op 
)

◆ kernel_grouped_conv_multiple_d_wmma_cshuffle()

template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2CTileMap , typename ComputePtrOffsetOfBatch , bool HasMainKBlockLoop>
__global__ void ck::kernel_grouped_conv_multiple_d_wmma_cshuffle ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const index_t  batch_count,
const AGridDesc_AK0_M_AK1  a_grid_desc,
const BGridDesc_BK0_N_BK1  b_grid_desc,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock_,
const Block2CTileMap  block_2_ctile_map,
const ComputePtrOffsetOfBatch  compute_ptr_offset_of_batch 
)

◆ kernel_moe_gemm()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_moe_gemm ( typename GridwiseGemm::Argument  karg)

◆ kernel_moe_gemm_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_moe_gemm_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_moe_mxgemm()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_moe_mxgemm ( typename GridwiseGemm::Argument  karg)

◆ kernel_moe_mxgemm_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_moe_mxgemm_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_multiblock_batchnorm_forward()

template<typename GridwiseMultiblockBatchNormForward_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename MeanVarCountGridDesc_M_G , typename MeanVarCountGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor >
__global__ void ck::kernel_multiblock_batchnorm_forward ( const XYGridDesc_M_K  x_grid_desc_m_k,
const XYGridDesc_M_K  y_grid_desc_m_k,
const MeanVarCountGridDesc_M_G  mean_var_count_grid_desc_m_g,
const MeanVarCountGridDesc_M_K  mean_var_count_grid_desc_m_k,
const ScaleBiasGridDesc_M  scale_grid_desc_m,
const ScaleBiasGridDesc_M  bias_grid_desc_m,
const MeanVarGridDesc_M  mean_var_grid_desc_m,
const GetReduceCountPerThreadFunctor  get_reduce_count_per_thread,
index_t  num_k_block_tile_iteration,
AccDataType  epsilon,
const XDataType *const __restrict__  p_x,
MeanVarDataType *const __restrict__  p_welford_mean,
MeanVarDataType *const __restrict__  p_welford_variance,
int32_t *const __restrict__  p_welford_count,
int32_t *const __restrict__  p_control,
const ScaleDataType *const __restrict__  p_scale,
const BiasDataType *const __restrict__  p_bias,
const YElementwiseOp  y_elementwise_op,
YDataType *const __restrict__  p_y,
bool  updateMovingAverage,
AccDataType  averageFactor,
MeanVarDataType *const __restrict__  resultRunningMean,
MeanVarDataType *const __restrict__  resultRunningVariance,
bool  saveMeanInvVariance,
MeanVarDataType *const __restrict__  resultSaveMean,
MeanVarDataType *const __restrict__  resultSaveInvVariance 
)

◆ kernel_multiblock_welford_first_half()

template<typename GridwiseMultiblockWelfordFirstHalf_ , typename XDataType , typename MeanVarDataType , typename XGridDesc_M_K , typename MeanVarCountGridDesc_M_G , typename GetReduceCountPerThreadFunctor >
__global__ void ck::kernel_multiblock_welford_first_half ( const XGridDesc_M_K  x_grid_desc_m_k,
const MeanVarCountGridDesc_M_G  mean_var_count_grid_desc_m_g,
const GetReduceCountPerThreadFunctor  get_reduce_count_per_thread,
index_t  num_k_block_tile_iteration,
const XDataType *const __restrict__  p_x,
MeanVarDataType *const  p_welford_mean,
MeanVarDataType *const  p_welford_variance,
int32_t *const  p_welford_count 
)

◆ kernel_multiple_buffer_set_value()

template<typename Grid1dBufferDescTuple , index_t NumBuffer, index_t BlockSize, typename DataTypePointerTuple , typename DataTypeTuple >
__global__ void ck::kernel_multiple_buffer_set_value ( const Grid1dBufferDescTuple  grid_1d_buffer_desc_tuple,
DataTypePointerTuple  p_global_tuple,
DataTypeTuple  value_tuple 
)

◆ kernel_multiple_reduce_multiblock()

template<typename GridwiseMultipleReduction , index_t NumReduction, typename InDataType , typename OutDataTypePointerTuple , typename AccDataType , typename InGridDesc_M_K , typename OutGridDesc_M_Tuple , typename InElementwiseOperationTuple , typename AccElementwiseOperationTuple >
__global__ void ck::kernel_multiple_reduce_multiblock ( const InGridDesc_M_K  in_grid_desc_m_k,
const OutGridDesc_M_Tuple  out_grid_desc_m_tuple,
const InElementwiseOperationTuple  in_elementwise_op_tuple,
const AccElementwiseOperationTuple  acc_elementwise_op_tuple,
index_t  block_group_size,
index_t  num_k_block_tile_iteration,
Array< AccDataType, NumReduction >  alpha_values,
const InDataType *const __restrict__  p_in_value_global,
Array< AccDataType, NumReduction >  beta_values,
OutDataTypePointerTuple  p_out_value_global_tuple 
)

◆ kernel_multiple_reduce_threadwise()

template<typename GridwiseMultipleReduction , index_t NumReduction, typename InDataType , typename OutDataTypePointerTuple , typename AccDataType , typename InGridDesc_M_K , typename OutGridDesc_M_Tuple , typename InElementwiseOperationTuple , typename AccElementwiseOperationTuple >
__global__ void ck::kernel_multiple_reduce_threadwise ( const InGridDesc_M_K  in_grid_desc_m_k,
const OutGridDesc_M_Tuple  out_grid_desc_m_tuple,
const InElementwiseOperationTuple  in_elementwise_op_tuple,
const AccElementwiseOperationTuple  acc_elementwise_op_tuple,
Array< AccDataType, NumReduction >  alpha_values,
const InDataType *const __restrict__  p_in_value_global,
Array< AccDataType, NumReduction >  beta_values,
OutDataTypePointerTuple  p_out_value_global_tuple 
)

◆ kernel_nd_permute()

template<typename GridwisePermute , typename InGridDesc , typename OutGridDesc , typename InDataType , typename OutDataType , typename ElementwiseOperation , typename Block2TileMap >
__global__ void ck::kernel_nd_permute ( const InGridDesc  in_grid_desc,
const OutGridDesc  out_grid_desc,
const InDataType *  p_in_global,
OutDataType *  p_out_global,
const ElementwiseOperation  elementwise_op,
const Block2TileMap  block_2_tile_map 
)

◆ kernel_normalization()

template<typename GridwiseReduction , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename GridDesc_M_K , typename GridDesc_M >
__global__ void ck::kernel_normalization ( const GridDesc_M_K  x_grid_desc_m_k,
const GridDesc_M_K  gamma_grid_desc_m_k,
const GridDesc_M_K  beta_grid_desc_m_k,
const GridDesc_M_K  y_grid_desc_m_k,
const GridDesc_M  save_mean_grid_desc_m,
const GridDesc_M  save_inv_std_grid_desc_m,
index_t  num_k_block_tile_iteration,
ComputeDataType  epsilon,
const XDataType *const __restrict__  p_x_global,
const GammaDataType *const __restrict__  p_gamma_global,
const BetaDataType *const __restrict__  p_beta_global,
YDataType *const __restrict__  p_y_global,
SaveMeanInvStdDataType *const __restrict__  p_save_mean_global,
SaveMeanInvStdDataType *const __restrict__  p_save_inv_std_global,
const YElementwiseOperation  y_elementwise_op 
)

◆ kernel_normalizationSplitK1st()

template<typename GridwiseWelford , typename XDataType , typename WorkspaceMeanVarDataType , typename ComputeDataType , typename XGridDesc_M_K , typename MeanVarGridDesc_M_KBlock >
__global__ void ck::kernel_normalizationSplitK1st ( const XGridDesc_M_K  x_grid_desc_m_k,
const MeanVarGridDesc_M_KBlock  mean_var_grid_desc_m_kblock,
index_t  num_k_block_tile_iteration,
const XDataType *const __restrict__  p_x_global,
WorkspaceMeanVarDataType *const __restrict__  p_welford_mean,
WorkspaceMeanVarDataType *const __restrict__  p_welford_variance,
int32_t *const __restrict__  p_welford_count 
)

◆ kernel_normalizationSplitK2nd()

template<typename GridwiseWelfordNormalization , typename WorkspaceMeanVarDataType , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename MeanVarGridDesc_M_KBlock , typename CountGridDesc_M_KBlock , typename XYGammaBetaGridDesc_M_K , typename SaveMeanInvStdGridDesc_M >
__global__ void ck::kernel_normalizationSplitK2nd ( const MeanVarGridDesc_M_KBlock  mean_var_grid_desc_m_kblock,
const CountGridDesc_M_KBlock  count_grid_desc_m_kblock,
const XYGammaBetaGridDesc_M_K  x_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K  gamma_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K  beta_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K  y_grid_desc_m_k,
const SaveMeanInvStdGridDesc_M  save_mean_grid_desc_m,
const SaveMeanInvStdGridDesc_M  save_inv_std_grid_desc_m,
index_t  num_k_mean_var_count_iteration,
index_t  num_k_block_tile_iteration,
index_t  k_grid_size,
ComputeDataType  epsilon,
const WorkspaceMeanVarDataType *const  p_mean_global,
const WorkspaceMeanVarDataType *const  p_variance_global,
const int32_t *const  p_welford_count_global,
const XDataType *const __restrict__  p_x_global,
const GammaDataType *const __restrict__  p_gamma_global,
const BetaDataType *const __restrict__  p_beta_global,
YDataType *const __restrict__  p_y_global,
SaveMeanInvStdDataType *const __restrict__  p_save_mean_global,
SaveMeanInvStdDataType *const __restrict__  p_save_inv_std_global,
const YElementwiseOperation  y_elementwise_op 
)

◆ kernel_put_element_1d()

template<typename GridwisePutElementwise1dFunctor , typename InGrid1dDesc , typename InDataType , typename IndexDataType , typename OutDataType , typename ElementwiseOperation >
__global__ void ck::kernel_put_element_1d ( const InGrid1dDesc  in_grid_1d_desc,
const InDataType *__restrict__  p_in_global,
const IndexDataType *__restrict__  p_indices_global,
OutDataType *__restrict__  p_out_global,
const ElementwiseOperation  elementwise_op 
)

◆ kernel_reduce_multiblock()

template<typename GridwiseReduction , bool OutputIndex, bool HaveIndexInput, typename InDataType , typename OutDataType , typename AccDataType , typename IndexDataType , typename InGridDesc_M_K , typename OutGridDesc_M , typename InElementwiseOperation , typename AccElementwiseOperation >
__global__ void ck::kernel_reduce_multiblock ( const InGridDesc_M_K  in_grid_desc_m_k,
const OutGridDesc_M  out_grid_desc_m,
const InElementwiseOperation  in_elementwise_op,
const AccElementwiseOperation  acc_elementwise_op,
index_t  block_group_size,
index_t  num_k_block_tile_iteration,
AccDataType  alpha,
const InDataType *const __restrict__  p_in_value_global,
const IndexDataType *const __restrict__  p_in_index_global,
AccDataType  beta,
OutDataType *const __restrict__  p_out_value_global,
IndexDataType *const __restrict__  p_out_index_global 
)

◆ kernel_reduce_second_half_batchnorm_backward_final()

template<typename GridwiseReduceSecondHalfBatchNormBackwardFinal_ , typename XDataType , typename DyDataType , typename DxDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename DscaleDbiasGridDesc_M_K , typename MeanVarGridDesc_M , typename ScaleBiasGridDesc_M >
__global__ void ck::kernel_reduce_second_half_batchnorm_backward_final ( const XYGridDesc_M_K  x_grid_desc_m_k,
const XYGridDesc_M_K  dy_grid_desc_m_k,
const XYGridDesc_M_K  dx_grid_desc_m_k,
const DscaleDbiasGridDesc_M_K  dscale_dbias_grid_desc_m_k,
const MeanVarGridDesc_M  mean_var_grid_desc_m,
const ScaleBiasGridDesc_M  scale_grid_desc_m,
const ScaleBiasGridDesc_M  bias_grid_desc_m,
index_t  blkgroup_size,
long_index_t  reduce_size,
index_t  num_xy_k_block_tile_iteration,
index_t  num_dscale_dbias_k_block_tile_iteration,
const DscaleDbiasDataType *const __restrict__  p_reduce_dscale,
const DscaleDbiasDataType *const __restrict__  p_reduce_dbias,
const MeanVarDataType *const __restrict__  p_mean,
const MeanVarDataType *const __restrict__  p_inv_var,
const XDataType *const __restrict__  p_x,
const DyDataType *const __restrict__  p_dy,
const ScaleDataType *const __restrict__  p_scale,
const DyElementwiseOp  dy_elementwise_op,
DxDataType *const __restrict__  p_dx,
DscaleDbiasDataType *const __restrict__  p_dscale,
DscaleDbiasDataType *const __restrict__  p_dbias 
)

◆ kernel_reduce_threadwise()

template<typename GridwiseReduction , bool OutputIndex, bool TransformIndexKtoGlobal, bool HaveIndexInput, typename InDataType , typename OutDataType , typename AccDataType , typename IndexDataType , typename InGridDesc_M_K , typename OutGridDesc_M , typename InElementwiseOperation , typename AccElementwiseOperation >
__global__ void ck::kernel_reduce_threadwise ( const InGridDesc_M_K  in_grid_desc_m_k,
const OutGridDesc_M  out_grid_desc_m,
const InElementwiseOperation  in_elementwise_op,
const AccElementwiseOperation  acc_elementwise_op,
AccDataType  alpha,
const InDataType *const __restrict__  p_in_value_global,
const IndexDataType *const __restrict__  p_in_index_global,
AccDataType  beta,
OutDataType *const __restrict__  p_out_value_global,
IndexDataType *const __restrict__  p_out_index_global 
)

◆ kernel_reduce_threadwise_multi_d()

template<typename GridwiseReduction , typename InDataType , typename OutDataType , typename AccDataType , typename InGridDesc_M_K , typename DsGridDesc_M , typename OutGridDesc_M , typename InElementwiseOperation , typename OutElementwiseOperation , typename DsGridPointer >
__global__ void ck::kernel_reduce_threadwise_multi_d ( const InGridDesc_M_K  in_grid_desc_m_k,
const DsGridDesc_M  ds_grid_desc_m,
const OutGridDesc_M  out_grid_desc_m,
const InElementwiseOperation  in_elementwise_op,
const OutElementwiseOperation  out_elementwise_op,
const InDataType *const __restrict__  p_in_value_global,
const DsGridPointer  p_ds_value_global,
OutDataType *const __restrict__  p_out_value_global 
)

◆ kernel_softmax()

template<typename GridwiseReduction , typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K >
__global__ void ck::kernel_softmax ( const GridDesc_M_K  in_grid_desc_m_k,
const GridDesc_M_K  out_grid_desc_m_k,
index_t  block_group_size,
index_t  num_k_block_tile_iteration,
AccDataType  alpha,
const InDataType *const __restrict__  p_in_value_global,
AccDataType  beta,
OutDataType *const __restrict__  p_out_value_global 
)

◆ kernel_sparse_embeddings_forward_layernorm()

template<typename GridwiseSparseEmbedding , typename EmbType , typename IndexType , typename GammaDataType , typename BetaDataType , typename AccDataType , typename OutType , typename OutGridDesc , typename EmbElementwiseOperation , ck::index_t NumEmbeddings>
__global__ void ck::kernel_sparse_embeddings_forward_layernorm ( OutType *  p_out,
const ck::Array< EmbType *, NumEmbeddings >  p_embs,
const ck::Array< IndexType *, NumEmbeddings >  p_indexes,
const GammaDataType *  p_gamma,
const BetaDataType *  p_beta,
const OutGridDesc  out_grid_desc,
const AccDataType  epsilon,
const EmbElementwiseOperation  emb_elementwise_op 
)

◆ kernel_tensor_rearrange()

template<typename InputGridDesc , typename InputDataType , typename OutputGridDesc , typename OutputDataType , typename Block2ETileMap , typename ComputePtrOffsetOfStridedBatch , typename GridwiseTensorRearrangeKernel >
__global__ void ck::kernel_tensor_rearrange ( const InputGridDesc  in_grid_desc,
const InputDataType *__restrict__  p_in_global,
const OutputGridDesc  out_grid_desc,
OutputDataType *__restrict__  p_out_global,
const index_t  batch_count,
const Block2ETileMap  block_2_tile_map,
const ComputePtrOffsetOfStridedBatch  compute_ptr_offset_of_batch 
)

◆ kernel_welford_layernorm2d_second_half()

template<typename GridwiseWelfordLayernorm , typename EMeanVarDataType , typename HDataType , typename GammaDataType , typename BetaDataType , typename ComputeDataType , typename EHGridDesc_M_N , typename LayernormMeanVarGridDesc_M_NBlock , typename LayernormCountGridDesc_M_NBlock , typename GammaBetaGridDesc_N , typename HElementwiseOperation >
__global__ void ck::kernel_welford_layernorm2d_second_half ( const EMeanVarDataType *__restrict__  p_e_grid,
const EMeanVarDataType *__restrict__  p_in_welford_mean_grid,
const EMeanVarDataType *__restrict__  p_in_welford_var_grid,
const int32_t *__restrict__  p_in_welford_count_grid,
const GammaDataType *__restrict__  p_gamma_grid,
const BetaDataType *__restrict__  p_beta_grid,
HDataType *__restrict__  p_h_grid,
const EHGridDesc_M_N  e_grid_desc_m_n,
const EHGridDesc_M_N  h_grid_desc_m_n,
const LayernormMeanVarGridDesc_M_NBlock  mean_var_grid_desc_m_nblock,
const LayernormCountGridDesc_M_NBlock  count_grid_desc_m_nblock,
const GammaBetaGridDesc_N  gamma_grid_desc_n,
const GammaBetaGridDesc_N  beta_grid_desc_n,
index_t  numMeanVarCountBlockTileIteration_N,
index_t  NBlockClusterLength,
ComputeDataType  epsilon,
HElementwiseOperation  h_element_op 
)

◆ kernel_welford_second_half_batchnorm_forward_final()

template<typename GridwiseWelfordSecondHalfBatchNormForwardFinal_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename MeanVarCountGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M >
__global__ void ck::kernel_welford_second_half_batchnorm_forward_final ( const XYGridDesc_M_K  x_grid_desc_m_k,
const XYGridDesc_M_K  y_grid_desc_m_k,
const MeanVarCountGridDesc_M_K  mean_var_count_grid_desc_m_k,
const ScaleBiasGridDesc_M  scale_grid_desc_m,
const ScaleBiasGridDesc_M  bias_grid_desc_m,
const MeanVarGridDesc_M  mean_var_grid_desc_m,
index_t  blkgroup_size,
index_t  num_xy_k_block_tile_iteration,
AccDataType  epsilon,
const MeanVarDataType *const __restrict__  p_in_welford_mean,
const MeanVarDataType *const __restrict__  p_in_welford_variance,
const int32_t *const __restrict__  p_in_welford_count,
const XDataType *const __restrict__  p_x,
const ScaleDataType *const __restrict__  p_scale,
const BiasDataType *const __restrict__  p_bias,
const YElementwiseOp  y_elementwise_op,
YDataType *const __restrict__  p_y,
bool  updateMovingAverage,
AccDataType  averageFactor,
MeanVarDataType *const __restrict__  resultRunningMean,
MeanVarDataType *const __restrict__  resultRunningVariance,
bool  saveMeanInvVariance,
MeanVarDataType *const __restrict__  resultSaveMean,
MeanVarDataType *const __restrict__  resultSaveInvVariance 
)

◆ kernel_welford_second_half_reduce_first_half()

template<typename GridwiseWelfordSecondHalfReduceFirstHalf_ , typename XDataType , typename DyDataType , typename AccDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename MeanVarGridDesc_M , typename MeanVarCountGridDesc_M_K , typename DscaleDbiasGridDesc_M_G >
__global__ void ck::kernel_welford_second_half_reduce_first_half ( const XYGridDesc_M_K  x_grid_desc_m_k,
const XYGridDesc_M_K  dy_grid_desc_m_k,
const MeanVarGridDesc_M  mean_var_grid_desc_m,
const MeanVarCountGridDesc_M_K  mean_var_count_grid_desc_m_k,
const DscaleDbiasGridDesc_M_G  dscale_dbias_grid_desc_m_g,
index_t  blkgroup_size,
index_t  num_xy_k_block_tile_iteration,
index_t  num_mean_var_count_k_block_tile_iteration,
AccDataType  epsilon,
bool  haveSavedMeanInvVar,
const MeanVarDataType *const __restrict__  p_savedMean,
const MeanVarDataType *const __restrict__  p_savedInvVar,
const MeanVarDataType *const __restrict__  p_in_welford_mean,
const MeanVarDataType *const __restrict__  p_in_welford_variance,
const int32_t *const __restrict__  p_in_welford_count,
const DyElementwiseOp  dy_elementwise_op,
MeanVarDataType *const __restrict__  p_out_welford_mean,
MeanVarDataType *const __restrict__  p_out_welford_inv_variance,
const XDataType *const __restrict__  p_x,
const DyDataType *const __restrict__  p_dy,
DscaleDbiasDataType *const __restrict__  p_reduce_dscale,
DscaleDbiasDataType *const __restrict__  p_reduce_dbias 
)

◆ llvm_amdgcn_raw_buffer_atomic_add_fp16x2()

__device__ half2_t ck::llvm_amdgcn_raw_buffer_atomic_add_fp16x2 ( half2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_atomic_add_fp32()

__device__ float ck::llvm_amdgcn_raw_buffer_atomic_add_fp32 ( float  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_atomic_add_i32()

__device__ int32_t ck::llvm_amdgcn_raw_buffer_atomic_add_i32 ( int32_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_atomic_max_fp64()

__device__ double ck::llvm_amdgcn_raw_buffer_atomic_max_fp64 ( double  vdata,
int32x4_t  rsrc,
int  voffset,
int  soffset,
int  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp16()

__device__ half_t ck::llvm_amdgcn_raw_buffer_load_fp16 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp16x2()

__device__ half2_t ck::llvm_amdgcn_raw_buffer_load_fp16x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp16x4()

__device__ half4_t ck::llvm_amdgcn_raw_buffer_load_fp16x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp32()

__device__ float ck::llvm_amdgcn_raw_buffer_load_fp32 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp32x2()

__device__ float2_t ck::llvm_amdgcn_raw_buffer_load_fp32x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp32x4()

__device__ float4_t ck::llvm_amdgcn_raw_buffer_load_fp32x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i16()

__device__ bhalf_t ck::llvm_amdgcn_raw_buffer_load_i16 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i16x2()

__device__ bhalf2_t ck::llvm_amdgcn_raw_buffer_load_i16x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i16x4()

__device__ bhalf4_t ck::llvm_amdgcn_raw_buffer_load_i16x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i32()

__device__ int32_t ck::llvm_amdgcn_raw_buffer_load_i32 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i32x2()

__device__ int32x2_t ck::llvm_amdgcn_raw_buffer_load_i32x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i32x4()

__device__ int32x4_t ck::llvm_amdgcn_raw_buffer_load_i32x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i8()

__device__ int8_t ck::llvm_amdgcn_raw_buffer_load_i8 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i8x2()

__device__ int8x2_t ck::llvm_amdgcn_raw_buffer_load_i8x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i8x4()

__device__ int8x4_t ck::llvm_amdgcn_raw_buffer_load_i8x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_lds()

__device__ void ck::llvm_amdgcn_raw_buffer_load_lds ( int32x4_t  rsrc,
uint32_t *  lds_ptr,
index_t  size,
index_t  voffset,
index_t  soffset,
index_t  offset,
index_t  aux 
)

◆ llvm_amdgcn_raw_buffer_store_fp16()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp16 ( half_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp16x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp16x2 ( half2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp16x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp16x4 ( half4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp32()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp32 ( float  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)
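
The raw-buffer loads and stores above are thin wrappers over the LLVM amdgcn buffer intrinsics: rsrc/srsrc is a 128-bit buffer resource descriptor, voffset/soffset are byte offsets, and glc_slc packs the cache-coherence bits. A minimal device-side sketch, assuming rsrc was built elsewhere and covers at least lane + 1 floats:

    __device__ void scale_one_float(ck::int32x4_t rsrc, ck::index_t lane)
    {
        // byte offset of this lane's element
        const ck::index_t byte_off = lane * static_cast<ck::index_t>(sizeof(float));
        // load through the buffer path, scale, store back (glc_slc = 0: default caching)
        const float x = ck::llvm_amdgcn_raw_buffer_load_fp32(rsrc, byte_off, 0, 0);
        ck::llvm_amdgcn_raw_buffer_store_fp32(2.0f * x, rsrc, byte_off, 0, 0);
    }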

◆ llvm_amdgcn_raw_buffer_store_fp32x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp32x2 ( float2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp32x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp32x4 ( float4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i16()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i16 ( bhalf_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i16x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i16x2 ( bhalf2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i16x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i16x4 ( bhalf4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i32()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i32 ( int32_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i32x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i32x2 ( int32x2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i32x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i32x4 ( int32x4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i8()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i8 ( int8_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i8x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i8x2 ( int8x2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i8x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i8x4 ( int8x4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ make_array() [1/2]

template<typename X >
__host__ constexpr __device__ auto ck::make_array ( )
constexpr

◆ make_array() [2/2]

template<typename X , typename... Xs>
__host__ constexpr __device__ auto ck::make_array ( X &&  x,
Xs &&...  xs 
)
constexpr
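
A minimal sketch; the element type is deduced from the first argument, so this is expected to yield a ck::Array<int, 3>:

    constexpr auto lengths = ck::make_array(2, 4, 8); // illustrative values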

◆ make_cluster_descriptor()

template<typename Lengths , typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
__host__ constexpr __device__ auto ck::make_cluster_descriptor ( const Lengths &  lengths,
ArrangeOrder  order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{} 
)
constexpr

◆ make_conv_bwd_data_out_transform()

__host__ constexpr __device__ auto ck::make_conv_bwd_data_out_transform ( index_t  N,
index_t  Ho,
index_t  Wo,
index_t  K,
[[maybe_unused]] index_t  YDot,
index_t  XDot,
index_t  HTilde,
index_t  WTilde,
index_t  ConvDilationH,
index_t  ConvDilationW,
index_t  HTildeSlice,
index_t  WTildeSlice,
index_t  YDotSlice,
index_t  XDotSlice,
index_t  IHTildeSliceBegin,
index_t  IWTildeSliceBegin,
index_t  GcdStrideDilationH,
index_t  GcdStrideDilationW,
index_t  K0,
index_t  K1,
index_t  MPerBlock,
index_t  GemmKPerBlock 
)
constexpr

◆ make_default_loop_scheduler()

constexpr LoopScheduler ck::make_default_loop_scheduler ( )
constexpr

◆ make_dynamic_buffer() [1/2]

template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize >
__host__ constexpr __device__ auto ck::make_dynamic_buffer ( T *  p,
ElementSpaceSize  element_space_size 
)
constexpr
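
A hedged device-side sketch, assuming p points to at least n valid floats in global memory; the address space is supplied as a template argument:

    __device__ void wrap_global(float* p, ck::index_t n)
    {
        // run-time-sized view over global memory with default coherence
        auto buf = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(p, n);
        (void)buf; // illustration only
    }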

◆ make_dynamic_buffer() [2/2]

template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize , typename X , typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type = false>
__host__ constexpr __device__ auto ck::make_dynamic_buffer ( T *  p,
ElementSpaceSize  element_space_size,
X  invalid_element_value 
)
constexpr

◆ make_embed_transform()

template<typename UpLengths , typename Coefficients , typename enable_if< UpLengths::Size()==Coefficients::Size(), bool >::type = false>
__host__ constexpr __device__ auto ck::make_embed_transform ( const UpLengths &  up_lengths,
const Coefficients &  coefficients 
)
constexpr
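
Sketch with illustrative values: the coefficients act like strides, so upper index (i, j) contributes i*8 + j*1 to the lower offset:

    const auto embed = ck::make_embed_transform(ck::make_tuple(4, 8),  // up lengths
                                                ck::make_tuple(8, 1)); // coefficients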

◆ make_freeze_transform()

template<typename LowerIndex >
__host__ constexpr __device__ auto ck::make_freeze_transform ( const LowerIndex &  low_idx)
constexpr

◆ make_insert_transform()

template<typename UpperIndex >
__host__ constexpr __device__ auto ck::make_insert_transform ( const UpperIndex &  up_idx)
constexpr

◆ make_left_pad_transform()

template<typename LowLength , typename LeftPadLength , bool SkipIsValidCheck = false>
__host__ constexpr __device__ auto ck::make_left_pad_transform ( const LowLength &  low_length,
const LeftPadLength &  left_pad,
integral_constant< bool, SkipIsValidCheck >  = integral_constant<bool, false>{} 
)
constexpr

◆ make_long_dynamic_buffer()

template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize >
__host__ constexpr __device__ auto ck::make_long_dynamic_buffer ( T *  p,
ElementSpaceSize  element_space_size 
)
constexpr

◆ make_merge_transform()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_merge_transform ( const LowLengths &  low_lengths)
constexpr
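
Sketch with illustrative lengths: folding lower dimensions of lengths 4 and 8 into a single length-32 upper dimension (the _v1/_v2/_v3 variants below select a specific carry-check or division strategy):

    const auto merge = ck::make_merge_transform(ck::make_tuple(4, 8));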

◆ make_merge_transform_v1_carry_check()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_merge_transform_v1_carry_check ( const LowLengths &  low_lengths)
constexpr

◆ make_merge_transform_v2_magic_division()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_merge_transform_v2_magic_division ( const LowLengths &  low_lengths)
constexpr

◆ make_merge_transform_v3_division_mod()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_merge_transform_v3_division_mod ( const LowLengths &  low_lengths)
constexpr

◆ make_merge_transform_v4_no_carry()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_merge_transform_v4_no_carry ( const LowLengths &  low_lengths)
constexpr

◆ make_modulo_transform()

template<typename Modulus , typename UpLength >
__host__ constexpr __device__ auto ck::make_modulo_transform ( const Modulus &  modulus,
const UpLength &  up_length 
)
constexpr

◆ make_multi_index()

template<typename... Xs>
__host__ constexpr __device__ auto ck::make_multi_index ( Xs &&...  xs)
constexpr

◆ make_naive_tensor_descriptor()

template<typename... Lengths, typename... Strides, typename enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
__host__ constexpr __device__ auto ck::make_naive_tensor_descriptor ( const Tuple< Lengths... > &  lengths,
const Tuple< Strides... > &  strides 
)
constexpr
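
A minimal sketch, assuming a row-major 16x8 view with leading dimension 8; run-time index_t values and compile-time Number<>s are both accepted:

    const auto desc = ck::make_naive_tensor_descriptor(
        ck::make_tuple(16, 8),  // lengths (M, N)
        ck::make_tuple(8, 1));  // strides (row-major)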

◆ make_naive_tensor_descriptor_aligned()

template<typename... Lengths, typename Align >
__host__ constexpr __device__ auto ck::make_naive_tensor_descriptor_aligned ( const Tuple< Lengths... > &  lengths,
Align  align 
)
constexpr

◆ make_naive_tensor_descriptor_packed()

template<typename... Lengths>
__host__ constexpr __device__ auto ck::make_naive_tensor_descriptor_packed ( const Tuple< Lengths... > &  lengths)
constexpr

◆ make_pad_transform()

template<typename LowLength , typename LeftPad , typename RightPad , bool SkipIsValidCheck = false>
__host__ constexpr __device__ auto ck::make_pad_transform ( const LowLength &  low_length,
const LeftPad &  left_pad,
const RightPad &  right_pad,
integral_constant< bool, SkipIsValidCheck >  = integral_constant<bool, false>{} 
)
constexpr
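
Sketch with illustrative values: padding a length-30 dimension by 1 on the left and 3 on the right yields an upper length of 34; pass integral_constant<bool, true>{} to skip the validity check when the padding is known to be in range:

    const auto pad = ck::make_pad_transform(30, 1, 3);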

◆ make_pass_through_transform()

template<typename LowLength >
__host__ constexpr __device__ auto ck::make_pass_through_transform ( const LowLength &  low_length)
constexpr

◆ make_right_pad_transform()

template<typename LowLength , typename RightPadLength , bool SkipIsValidCheck = false>
__host__ constexpr __device__ auto ck::make_right_pad_transform ( const LowLength &  low_length,
const RightPadLength &  right_pad,
integral_constant< bool, SkipIsValidCheck >  = integral_constant<bool, false>{} 
)
constexpr

◆ make_sequence()

template<index_t... Is>
__host__ constexpr __device__ auto ck::make_sequence ( Number< Is >  ...)
constexpr

◆ make_single_stage_tensor_adaptor()

template<typename Transforms , typename LowerDimensionOldTopIdss , typename UpperDimensionNewTopIdss >
__host__ constexpr __device__ auto ck::make_single_stage_tensor_adaptor ( const Transforms &  transforms,
LowerDimensionOldTopIdss  ,
UpperDimensionNewTopIdss   
)
constexpr

◆ make_slice_transform()

template<typename LowLength , typename SliceBegin , typename SliceEnd >
__host__ constexpr __device__ auto ck::make_slice_transform ( const LowLength &  low_length,
const SliceBegin &  slice_begin,
const SliceEnd &  slice_end 
)
constexpr

◆ make_static_buffer() [1/2]

template<AddressSpaceEnum AddressSpace, typename T , long_index_t N>
__host__ constexpr __device__ auto ck::make_static_buffer ( LongNumber< N >  )
constexpr

◆ make_static_buffer() [2/2]

template<AddressSpaceEnum AddressSpace, typename T , index_t N>
__host__ constexpr __device__ auto ck::make_static_buffer ( Number< N >  )
constexpr

◆ make_static_tensor() [1/2]

template<AddressSpaceEnum AddressSpace, typename T , typename TensorDesc , typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false>
__host__ constexpr __device__ auto ck::make_static_tensor ( TensorDesc  )
constexpr

◆ make_static_tensor() [2/2]

template<AddressSpaceEnum AddressSpace, typename T , typename TensorDesc , typename X , typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false, typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type = false>
__host__ constexpr __device__ auto ck::make_static_tensor ( TensorDesc  ,
X  invalid_element_value 
)
constexpr

◆ make_statically_indexed_array() [1/2]

template<typename X >
__host__ constexpr __device__ auto ck::make_statically_indexed_array ( )
constexpr

◆ make_statically_indexed_array() [2/2]

template<typename X , typename... Xs>
__host__ constexpr __device__ auto ck::make_statically_indexed_array ( const X &  x,
const Xs &...  xs 
)
constexpr

◆ make_tensor_coordinate()

template<typename TensorDesc , typename VisibleIndex >
__host__ constexpr __device__ auto ck::make_tensor_coordinate ( const TensorDesc &  tensor_desc,
const VisibleIndex &  idx_visible 
)
constexpr

◆ make_tensor_coordinate_step() [1/2]

template<typename TensorDesc , typename VisibleIndex >
__host__ constexpr __device__ auto ck::make_tensor_coordinate_step ( const TensorDesc &  ,
const VisibleIndex &  idx_diff_visible 
)
constexpr

◆ make_tensor_coordinate_step() [2/2]

template<typename TensorDesc , typename VisibleIndex , typename UpdateLowerIndexHack >
__host__ constexpr __device__ auto ck::make_tensor_coordinate_step ( const TensorDesc &  ,
const VisibleIndex &  idx_diff_visible,
UpdateLowerIndexHack   
)
constexpr

◆ make_tuple()

template<typename... Xs>
__host__ constexpr __device__ auto ck::make_tuple ( Xs &&...  xs)
constexpr
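
A minimal sketch of Tuple construction (assumes the CK utility headers; At is the element accessor used throughout CK):

    // Mix compile-time Numbers and run-time values in one Tuple.
    constexpr auto t = ck::make_tuple(ck::Number<2>{}, 3, 4.0f);
    static_assert(t.At(ck::Number<0>{}) == 2, "first element is Number<2>");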

◆ make_unmerge_transform()

template<typename UpLengths , bool Use24BitIntegerCalculation = false>
__host__ constexpr __device__ auto ck::make_unmerge_transform ( const UpLengths &  up_lengths,
integral_constant< bool, Use24BitIntegerCalculation >  = integral_constant<bool, false>{} 
)
constexpr

◆ make_vector_type()

template<typename T , index_t N>
__host__ constexpr __device__ auto ck::make_vector_type ( Number< N >  )
constexpr

◆ make_vectorize_transform()

template<typename VectorSize , typename UpLength >
__host__ constexpr __device__ auto ck::make_vectorize_transform ( const VectorSize &  vector_size,
const UpLength &  up_length 
)
constexpr

◆ make_wave_buffer_resource()

template<typename T >
__device__ int32x4_t ck::make_wave_buffer_resource ( T *  p_wave,
index_t  element_space_size 
)

◆ make_wave_buffer_resource_new()

template<typename T >
__device__ __amdgpu_buffer_rsrc_t ck::make_wave_buffer_resource_new ( T *  p_wave,
index_t  element_space_size 
)

◆ make_wave_buffer_resource_with_default_range()

template<typename T >
__device__ int32x4_t ck::make_wave_buffer_resource_with_default_range ( T *  p_wave)

◆ make_wave_buffer_resource_with_default_range_new()

template<typename T >
__device__ __amdgpu_buffer_rsrc_t ck::make_wave_buffer_resource_with_default_range_new ( T *  p_wave)

◆ make_xor_transform()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_xor_transform ( const LowLengths &  low_lengths)
constexpr

◆ make_xor_with_modulo_transform()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_xor_with_modulo_transform ( const LowLengths &  low_lengths)
constexpr

◆ make_zero_multi_index()

template<index_t NSize>
__host__ constexpr __device__ auto ck::make_zero_multi_index ( )
constexpr

◆ merge_sequences()

template<typename... Seqs>
__host__ constexpr __device__ auto ck::merge_sequences ( Seqs...  )
constexpr

◆ modify_sequence_elements_by_ids()

template<typename Seq , typename Values , typename Ids >
__host__ constexpr __device__ auto ck::modify_sequence_elements_by_ids ( Seq  ,
Values  ,
Ids   
)
constexpr

◆ move_tensor_coordinate()

template<typename TensorDesc , typename TensorCoord , typename TensorCoordStep >
__host__ constexpr __device__ void ck::move_tensor_coordinate ( const TensorDesc &  tensor_desc,
TensorCoord &  coord,
const TensorCoordStep &  coord_step 
)
constexpr
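
A minimal sketch tying the coordinate helpers together (desc is any 2-D descriptor, e.g. one built with make_naive_tensor_descriptor_packed; index values are illustrative):

    auto coord = ck::make_tensor_coordinate(desc, ck::make_multi_index(0, 0));
    // Precompute a step of +1 along the last dimension.
    const auto step = ck::make_tensor_coordinate_step(desc, ck::make_multi_index(0, 1));
    // Advance the coordinate; its hidden lower-level indices are updated for us.
    ck::move_tensor_coordinate(desc, coord, step); // coord now addresses element (0, 1)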

◆ mxf8_convert_rne()

template<typename Y , typename X >
__host__ constexpr __device__ Y ck::mxf8_convert_rne ( X  x,
float  scale 
)
constexpr
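
A minimal host-side sketch. The comment reflects an assumption, based on the MX convention, that the stored element approximates x divided by the scale:

    // Convert 3.0f with scale 2.0f to OCP FP8 using round-to-nearest-even;
    // the encoded element is assumed to represent approximately 3.0f / 2.0f.
    const ck::f8_ocp_t y = ck::mxf8_convert_rne<ck::f8_ocp_t, float>(3.0f, 2.0f);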

◆ mxf8_convert_rne< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::mxf8_convert_rne< bf8_ocp_t, float > ( float  x,
float  scale 
)
inline

◆ mxf8_convert_rne< bf8x16_ocp_t, float16_t >()

template<>
__host__ __device__ bf8x16_ocp_t ck::mxf8_convert_rne< bf8x16_ocp_t, float16_t > ( float16_t  x,
float  scale 
)
inline

◆ mxf8_convert_rne< bf8x2_ocp_t, float2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::mxf8_convert_rne< bf8x2_ocp_t, float2_t > ( float2_t  x,
float  scale 
)
inline

◆ mxf8_convert_rne< bf8x32_ocp_t, float32_t >()

template<>
__host__ __device__ bf8x32_ocp_t ck::mxf8_convert_rne< bf8x32_ocp_t, float32_t > ( float32_t  x,
float  scale 
)
inline

◆ mxf8_convert_rne< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::mxf8_convert_rne< f8_ocp_t, float > ( float  x,
float  scale 
)
inline

◆ mxf8_convert_rne< f8x16_ocp_t, float16_t >()

template<>
__host__ __device__ f8x16_ocp_t ck::mxf8_convert_rne< f8x16_ocp_t, float16_t > ( float16_t  x,
float  scale 
)
inline

◆ mxf8_convert_rne< f8x2_ocp_t, float2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::mxf8_convert_rne< f8x2_ocp_t, float2_t > ( float2_t  x,
float  scale 
)
inline

◆ mxf8_convert_rne< f8x32_ocp_t, float32_t >()

template<>
__host__ __device__ f8x32_ocp_t ck::mxf8_convert_rne< f8x32_ocp_t, float32_t > ( float32_t  x,
float  scale 
)
inline

◆ mxf8_convert_sr()

template<typename Y , typename X >
__host__ constexpr __device__ Y ck::mxf8_convert_sr ( X  x,
float  scale 
)
constexpr

◆ mxf8_convert_sr< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::mxf8_convert_sr< bf8_ocp_t, float > ( float  x,
float  scale 
)
inline

◆ mxf8_convert_sr< bf8x16_ocp_t, float16_t >()

template<>
__host__ __device__ bf8x16_ocp_t ck::mxf8_convert_sr< bf8x16_ocp_t, float16_t > ( float16_t  x,
float  scale 
)
inline

◆ mxf8_convert_sr< bf8x2_ocp_t, float2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::mxf8_convert_sr< bf8x2_ocp_t, float2_t > ( float2_t  x,
float  scale 
)
inline

◆ mxf8_convert_sr< bf8x32_ocp_t, float32_t >()

template<>
__host__ __device__ bf8x32_ocp_t ck::mxf8_convert_sr< bf8x32_ocp_t, float32_t > ( float32_t  x,
float  scale 
)
inline

◆ mxf8_convert_sr< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::mxf8_convert_sr< f8_ocp_t, float > ( float  x,
float  scale 
)
inline

◆ mxf8_convert_sr< f8x16_ocp_t, float16_t >()

template<>
__host__ __device__ f8x16_ocp_t ck::mxf8_convert_sr< f8x16_ocp_t, float16_t > ( float16_t  x,
float  scale 
)
inline

◆ mxf8_convert_sr< f8x2_ocp_t, float2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::mxf8_convert_sr< f8x2_ocp_t, float2_t > ( float2_t  x,
float  scale 
)
inline

◆ mxf8_convert_sr< f8x32_ocp_t, float32_t >()

template<>
__host__ __device__ f8x32_ocp_t ck::mxf8_convert_sr< f8x32_ocp_t, float32_t > ( float32_t  x,
float  scale 
)
inline

◆ next_pow2()

constexpr auto ck::next_pow2 ( uint32_t  x)
inline constexpr
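
A minimal sketch; the function is constexpr, so the result can be checked at compile time (behavior at exact powers of two is not asserted here):

    static_assert(ck::next_pow2(5u) == 8u, "rounds up to the next power of two");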

◆ NormalizationKernelSelector()

template<typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename GridDesc_M_K , typename GridDesc_M , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t XSrcVectorDim, index_t XSrcVectorSize, index_t GammaSrcVectorDim, index_t GammaSrcVectorSize, index_t BetaSrcVectorDim, index_t BetaSrcVectorSize, index_t YDstVectorDim, index_t YDstVectorSize, index_t SaveMeanInvStdDstVectorSize, bool UseWelford>
auto ck::NormalizationKernelSelector ( bool  isSweepOnce)

◆ operator%() [1/4]

template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto ck::operator% ( integral_constant< TX, X >  ,
integral_constant< TY, Y >   
)
constexpr

◆ operator%() [2/4]

template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto ck::operator% ( Number< Y >  ,
Sequence< Xs... >   
)
constexpr

◆ operator%() [3/4]

template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto ck::operator% ( Sequence< Xs... >  ,
Number< Y >   
)
constexpr

◆ operator%() [4/4]

template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto ck::operator% ( Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr

◆ operator*() [1/8]

template<index_t NSize, typename T >
__host__ constexpr __device__ auto ck::operator* ( const MultiIndex< NSize > &  a,
const T &  b 
)
constexpr

◆ operator*() [2/8]

template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator* ( const Tuple< Xs... > &  x,
const Y &  y 
)
constexpr

◆ operator*() [3/8]

template<typename... Xs, typename Y , enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator* ( const Tuple< Xs... > &  x,
Y  a 
)
constexpr

◆ operator*() [4/8]

template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto ck::operator* ( integral_constant< TX, X >  ,
integral_constant< TY, Y >   
)
constexpr

◆ operator*() [5/8]

template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto ck::operator* ( Number< Y >  ,
Sequence< Xs... >   
)
constexpr

◆ operator*() [6/8]

template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto ck::operator* ( Sequence< Xs... >  ,
Number< Y >   
)
constexpr

◆ operator*() [7/8]

template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto ck::operator* ( Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr

◆ operator*() [8/8]

template<typename... Xs, typename Y , enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator* ( Y  a,
const Tuple< Xs... > &  x 
)
constexpr

◆ operator+() [1/6]

template<index_t NSize, typename T >
__host__ constexpr __device__ auto ck::operator+ ( const MultiIndex< NSize > &  a,
const T &  b 
)
constexpr

◆ operator+() [2/6]

template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator+ ( const Tuple< Xs... > &  x,
const Y &  y 
)
constexpr

◆ operator+() [3/6]

template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto ck::operator+ ( integral_constant< TX, X >  ,
integral_constant< TY, Y >   
)
constexpr

◆ operator+() [4/6]

template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto ck::operator+ ( Number< Y >  ,
Sequence< Xs... >   
)
constexpr

◆ operator+() [5/6]

template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto ck::operator+ ( Sequence< Xs... >  ,
Number< Y >   
)
constexpr

◆ operator+() [6/6]

template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto ck::operator+ ( Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr
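
A minimal sketch of the Sequence overloads above; the Sequence-Sequence forms combine elementwise at compile time:

    using S = decltype(ck::Sequence<1, 2, 3>{} + ck::Sequence<10, 20, 30>{});
    static_assert(ck::is_same<S, ck::Sequence<11, 22, 33>>::value, "elementwise sum");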

◆ operator+=() [1/3]

template<typename Arr , typename Picks , typename X >
__host__ constexpr __device__ auto ck::operator+= ( ContainerElementPicker< Arr, Picks > &  y,
const X &  x 
)
constexpr

◆ operator+=() [2/3]

template<index_t NSize, typename X >
__host__ constexpr __device__ auto ck::operator+= ( MultiIndex< NSize > &  y,
const X &  x 
)
constexpr

◆ operator+=() [3/3]

template<typename... Ys, typename X , enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator+= ( Tuple< Ys... > &  y,
const X &  x 
)
constexpr

◆ operator-() [1/6]

template<index_t NSize, typename T >
__host__ constexpr __device__ auto ck::operator- ( const MultiIndex< NSize > &  a,
const T &  b 
)
constexpr

◆ operator-() [2/6]

template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator- ( const Tuple< Xs... > &  x,
const Y &  y 
)
constexpr

◆ operator-() [3/6]

template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto ck::operator- ( integral_constant< TX, X >  ,
integral_constant< TY, Y >   
)
constexpr

◆ operator-() [4/6]

template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto ck::operator- ( Number< Y >  ,
Sequence< Xs... >   
)
constexpr

◆ operator-() [5/6]

template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto ck::operator- ( Sequence< Xs... >  ,
Number< Y >   
)
constexpr

◆ operator-() [6/6]

template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto ck::operator- ( Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr

◆ operator-=() [1/3]

template<typename Arr , typename Picks , typename X >
__host__ constexpr __device__ auto ck::operator-= ( ContainerElementPicker< Arr, Picks > &  y,
const X &  x 
)
constexpr

◆ operator-=() [2/3]

template<index_t NSize, typename X >
__host__ constexpr __device__ auto ck::operator-= ( MultiIndex< NSize > &  y,
const X &  x 
)
constexpr

◆ operator-=() [3/3]

template<typename... Ys, typename X , enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator-= ( Tuple< Ys... > &  y,
const X &  x 
)
constexpr

◆ operator/() [1/4]

template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto ck::operator/ ( integral_constant< TX, X >  ,
integral_constant< TY, Y >   
)
constexpr

◆ operator/() [2/4]

template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto ck::operator/ ( Number< Y >  ,
Sequence< Xs... >   
)
constexpr

◆ operator/() [3/4]

template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto ck::operator/ ( Sequence< Xs... >  ,
Number< Y >   
)
constexpr

◆ operator/() [4/4]

template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto ck::operator/ ( Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr

◆ operator==()

template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ bool ck::operator== ( Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr

◆ pick_container_element() [1/2]

template<typename Arr , typename Picks >
__host__ constexpr __device__ auto ck::pick_container_element ( Arr &  a,
Picks   
)
constexpr

◆ pick_container_element() [2/2]

template<typename Arr , typename Picks >
__host__ constexpr __device__ auto ck::pick_container_element ( const Arr &  a,
Picks   
)
constexpr

◆ pick_sequence_elements_by_ids()

template<typename Seq , index_t... Is>
__host__ constexpr __device__ auto ck::pick_sequence_elements_by_ids ( Seq  ,
Sequence< Is... >   
)
constexpr

◆ pick_sequence_elements_by_mask()

template<typename Seq , typename Mask >
__host__ constexpr __device__ auto ck::pick_sequence_elements_by_mask ( Seq  ,
Mask   
)
constexpr

◆ prand_generator() [1/2]

template<typename T , uint32_t seed_t, ck::enable_if_t< is_same< float, T >{}, bool > = false>
__host__ __device__ uint32_t ck::prand_generator ( index_t  id,
T  val,
uint32_t  seed = seed_t 
)

◆ prand_generator() [2/2]

template<typename T , uint32_t seed_t, ck::enable_if_t<!(is_same< float, T >{}||is_same< _Float16, T >{}), bool > = false>
__host__ __device__ uint32_t ck::prand_generator ( int  id,
T  val,
uint32_t  seed = seed_t 
)

◆ print_multi_index()

template<typename... Xs>
__host__ __device__ void ck::print_multi_index ( const Tuple< Xs... > &  x)

◆ reduce_on_sequence()

template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ index_t ck::reduce_on_sequence ( Seq  ,
Reduce  f,
Number< Init >   
)
constexpr
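
A minimal sketch (assumes ck::math::multiplies, the functional multiply used elsewhere in CK, as the reducer):

    // Fold Sequence<2, 3, 4> with multiplication, starting from 1: 2 * 3 * 4 = 24.
    constexpr ck::index_t p =
        ck::reduce_on_sequence(ck::Sequence<2, 3, 4>{}, ck::math::multiplies{}, ck::Number<1>{});
    static_assert(p == 24, "product of the sequence elements");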

◆ reverse_exclusive_scan_sequence()

template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ auto ck::reverse_exclusive_scan_sequence ( Seq  ,
Reduce  ,
Number< Init >   
)
constexpr

◆ reverse_inclusive_scan_sequence()

template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ auto ck::reverse_inclusive_scan_sequence ( Seq  ,
Reduce  ,
Number< Init >   
)
constexpr

◆ s_nop()

__device__ void ck::s_nop ( )

◆ scaled_type_convert()

template<typename Y , typename X >
constexpr __host__ Y ck::scaled_type_convert ( e8m0_bexp_t  scale,
X  x 
)
constexpr
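
A minimal host-side sketch (assumptions: e8m0_bexp_t stores a biased power-of-two exponent with bias 127, so 127 encodes a scale of 1.0, and its raw-exponent constructor accepts an integer):

    const ck::f8_ocp_t q = ck::type_convert<ck::f8_ocp_t>(1.5f);
    // Decode q back to float, applying the (here neutral) e8m0 scale.
    const float y = ck::scaled_type_convert<float, ck::f8_ocp_t>(ck::e8m0_bexp_t{127}, q);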

◆ scaled_type_convert< bf8_ocp_t, float >()

template<>
__host__ bf8_ocp_t ck::scaled_type_convert< bf8_ocp_t, float > ( e8m0_bexp_t  scale,
float  x 
)
inline

◆ scaled_type_convert< bf8x16_ocp_t, float16_t >()

template<>
__host__ bf8x16_ocp_t ck::scaled_type_convert< bf8x16_ocp_t, float16_t > ( e8m0_bexp_t  scale,
float16_t  x 
)
inline

◆ scaled_type_convert< bf8x2_ocp_t, float2_t >()

template<>
__host__ bf8x2_ocp_t ck::scaled_type_convert< bf8x2_ocp_t, float2_t > ( e8m0_bexp_t  scale,
float2_t  x 
)
inline

◆ scaled_type_convert< bf8x32_ocp_t, float32_t >()

template<>
__host__ bf8x32_ocp_t ck::scaled_type_convert< bf8x32_ocp_t, float32_t > ( e8m0_bexp_t  scale,
float32_t  x 
)
inline

◆ scaled_type_convert< f8_ocp_t, float >()

template<>
__host__ f8_ocp_t ck::scaled_type_convert< f8_ocp_t, float > ( e8m0_bexp_t  scale,
float  x 
)
inline

◆ scaled_type_convert< f8x16_ocp_t, float16_t >()

template<>
__host__ f8x16_ocp_t ck::scaled_type_convert< f8x16_ocp_t, float16_t > ( e8m0_bexp_t  scale,
float16_t  x 
)
inline

◆ scaled_type_convert< f8x2_ocp_t, float2_t >()

template<>
__host__ f8x2_ocp_t ck::scaled_type_convert< f8x2_ocp_t, float2_t > ( e8m0_bexp_t  scale,
float2_t  x 
)
inline

◆ scaled_type_convert< f8x32_ocp_t, float32_t >()

template<>
__host__ f8x32_ocp_t ck::scaled_type_convert< f8x32_ocp_t, float32_t > ( e8m0_bexp_t  scale,
float32_t  x 
)
inline

◆ scaled_type_convert< float, bf8_ocp_t >()

template<>
__host__ float ck::scaled_type_convert< float, bf8_ocp_t > ( e8m0_bexp_t  scale,
bf8_ocp_t  x 
)
inline

◆ scaled_type_convert< float, f8_ocp_t >()

template<>
__host__ float ck::scaled_type_convert< float, f8_ocp_t > ( e8m0_bexp_t  scale,
f8_ocp_t  x 
)
inline

◆ scaled_type_convert< float16_t, bf8x16_ocp_t >()

template<>
__host__ float16_t ck::scaled_type_convert< float16_t, bf8x16_ocp_t > ( e8m0_bexp_t  scale,
bf8x16_ocp_t  x 
)
inline

◆ scaled_type_convert< float16_t, f8x16_ocp_t >()

template<>
__host__ float16_t ck::scaled_type_convert< float16_t, f8x16_ocp_t > ( e8m0_bexp_t  scale,
f8x16_ocp_t  x 
)
inline

◆ scaled_type_convert< float2_t, bf8x2_ocp_t >()

template<>
__host__ float2_t ck::scaled_type_convert< float2_t, bf8x2_ocp_t > ( e8m0_bexp_t  scale,
bf8x2_ocp_t  x 
)
inline

◆ scaled_type_convert< float2_t, f8x2_ocp_t >()

template<>
__host__ float2_t ck::scaled_type_convert< float2_t, f8x2_ocp_t > ( e8m0_bexp_t  scale,
f8x2_ocp_t  x 
)
inline

◆ scaled_type_convert< float32_t, bf8x32_ocp_t >()

template<>
__host__ float32_t ck::scaled_type_convert< float32_t, bf8x32_ocp_t > ( e8m0_bexp_t  scale,
bf8x32_ocp_t  x 
)
inline

◆ scaled_type_convert< float32_t, f8x32_ocp_t >()

template<>
__host__ float32_t ck::scaled_type_convert< float32_t, f8x32_ocp_t > ( e8m0_bexp_t  scale,
f8x32_ocp_t  x 
)
inline

◆ sequence_all_of()

template<typename Seq , typename F >
__host__ constexpr __device__ bool ck::sequence_all_of ( Seq  ,
F  f 
)
constexpr

◆ sequence_any_of()

template<typename Seq , typename F >
__host__ constexpr __device__ bool ck::sequence_any_of ( Seq  ,
F  f 
)
constexpr

◆ sequence_pop_back()

template<typename Seq >
__host__ constexpr __device__ auto ck::sequence_pop_back ( Seq  )
constexpr

◆ sequence_pop_front()

template<index_t I, index_t... Is>
__host__ constexpr __device__ auto ck::sequence_pop_front ( Sequence< I, Is... >  )
constexpr

◆ sequence_to_tuple_of_number()

template<index_t... Is>
__host__ constexpr __device__ auto ck::sequence_to_tuple_of_number ( Sequence< Is... >  )
constexpr

◆ set_container_subset() [1/2]

template<typename T , index_t N, index_t... Is>
__host__ constexpr __device__ void ck::set_container_subset ( Array< T, N > &  y,
Sequence< Is... >  picks,
const Array< T, sizeof...(Is)> &  x 
)
constexpr

◆ set_container_subset() [2/2]

template<typename... Ys, index_t... Is, typename... Xs>
__host__ constexpr __device__ void ck::set_container_subset ( Tuple< Ys... > &  y,
Sequence< Is... >  picks,
const Tuple< Xs... > &  x 
)
constexpr

◆ tie()

template<typename... Args>
constexpr Tuple<Args&...> ck::tie ( Args &...  args)
constexpr noexcept
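
A minimal sketch; tie builds a Tuple of lvalue references, so writes through the tuple update the original variables:

    void tie_example()
    {
        int a = 1, b = 2;
        auto t = ck::tie(a, b);       // Tuple<int&, int&>
        t.At(ck::Number<0>{}) = 10;   // writes through the reference: a == 10
    }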

◆ to_multi_index()

template<typename T >
__host__ constexpr __device__ auto ck::to_multi_index ( const T &  x)
constexpr

◆ to_sequence()

template<index_t... Is>
__host__ constexpr __device__ auto ck::to_sequence ( Tuple< Number< Is >... >  )
constexpr

◆ transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad()

template<typename... In, typename... Wei, typename... Out, typename ConvStrides , typename ConvDilations , typename InLeftPads , typename InRightPads , index_t GemmK1Value>
__host__ constexpr __device__ auto ck::transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad ( const TensorDescriptor< In... > &  in_grid_desc_n_di_hi_wi_c,
const TensorDescriptor< Wei... > &  wei_k_z_y_x_c_grid_desc,
const TensorDescriptor< Out... > &  out_n_do_ho_wo_k_grid_desc,
const ConvStrides &  conv_strides,
const ConvDilations &  conv_dilations,
const InLeftPads &  in_left_pads,
const InRightPads &  in_right_pads,
Number< GemmK1Value >   
)
constexpr

◆ transform_sequences() [1/3]

template<typename F , index_t... Xs>
__host__ constexpr __device__ auto ck::transform_sequences ( F  f,
Sequence< Xs... >   
)
constexpr

◆ transform_sequences() [2/3]

template<typename F , index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto ck::transform_sequences ( F  f,
Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr

◆ transform_sequences() [3/3]

template<typename F , index_t... Xs, index_t... Ys, index_t... Zs>
__host__ constexpr __device__ auto ck::transform_sequences ( F  f,
Sequence< Xs... >  ,
Sequence< Ys... >  ,
Sequence< Zs... >   
)
constexpr

◆ transform_tensor_descriptor()

template<typename OldTensorDescriptor , typename NewTransforms , typename NewLowerDimensionOldVisibleIdss , typename NewUpperDimensionNewVisibleIdss >
__host__ constexpr __device__ auto ck::transform_tensor_descriptor ( const OldTensorDescriptor &  old_tensor_desc,
const NewTransforms &  new_transforms,
NewLowerDimensionOldVisibleIdss  ,
NewUpperDimensionNewVisibleIdss   
)
constexpr
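
A minimal sketch merging the two leading dimensions of a packed (2, 3, 4) descriptor into one, which yields a (6, 4) view (illustrative shapes):

    constexpr auto desc_a_b_c = ck::make_naive_tensor_descriptor_packed(
        ck::make_tuple(ck::Number<2>{}, ck::Number<3>{}, ck::Number<4>{}));

    constexpr auto desc_ab_c = ck::transform_tensor_descriptor(
        desc_a_b_c,
        ck::make_tuple(
            ck::make_merge_transform(ck::make_tuple(ck::Number<2>{}, ck::Number<3>{})),
            ck::make_pass_through_transform(ck::Number<4>{})),
        ck::make_tuple(ck::Sequence<0, 1>{}, ck::Sequence<2>{}), // old (lower) dimension ids
        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));   // new (upper) dimension ids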

◆ transform_tuples() [1/3]

template<typename F , typename X >
__host__ constexpr __device__ auto ck::transform_tuples ( F  f,
const X &  x 
)
constexpr

◆ transform_tuples() [2/3]

template<typename F , typename X , typename Y >
__host__ constexpr __device__ auto ck::transform_tuples ( F  f,
const X &  x,
const Y &  y 
)
constexpr

◆ transform_tuples() [3/3]

template<typename F , typename X , typename Y , typename Z >
__host__ constexpr __device__ auto ck::transform_tuples ( F  f,
const X &  x,
const Y &  y,
const Z &  z 
)
constexpr

◆ transpose_f8_4x4()

__device__ void ck::transpose_f8_4x4 ( const f8x4_t &  x0,
const f8x4_t &  x1,
const f8x4_t &  x2,
const f8x4_t &  x3,
f8x4_t &  y0,
f8x4_t &  y1,
f8x4_t &  y2,
f8x4_t &  y3 
)

◆ transpose_fp16_2x2()

__device__ void ck::transpose_fp16_2x2 ( const half2_t &  x0,
const half2_t &  x1,
half2_t &  y0,
half2_t &  y1 
)
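
A minimal device-side sketch (x0/x1 hold the two rows of a 2x2 half tile; the element order stated in the comment is an assumption):

    __device__ void transpose_tile(const ck::half2_t& r0, const ck::half2_t& r1,
                                   ck::half2_t& c0, ck::half2_t& c1)
    {
        // Assumed layout: c0 = {r0[0], r1[0]}, c1 = {r0[1], r1[1]}.
        ck::transpose_fp16_2x2(r0, r1, c0, c1);
    }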

◆ transpose_int8_4x4()

__device__ void ck::transpose_int8_4x4 ( const int8x4_t &  x0,
const int8x4_t &  x1,
const int8x4_t &  x2,
const int8x4_t &  x3,
int8x4_t &  y0,
int8x4_t &  y1,
int8x4_t &  y2,
int8x4_t &  y3 
)

◆ TupleDepth() [1/2]

template<index_t depth = 0, typename T >
__host__ constexpr __device__ auto ck::TupleDepth ( const T &  )
constexpr

◆ TupleDepth() [2/2]

template<index_t depth = 0, typename... Ts>
__host__ constexpr __device__ auto ck::TupleDepth ( const Tuple< Ts... > &  )
constexpr

◆ TupleReduce()

template<index_t Idx, index_t End, typename F , typename... Ts>
__host__ constexpr __device__ auto ck::TupleReduce ( F &&  f,
const Tuple< Ts... > &  tuple 
)
constexpr

◆ TupleReverse()

template<typename... Ts>
__host__ constexpr __device__ auto ck::TupleReverse ( const Tuple< Ts... > &  tuple)
constexpr

◆ TupleSlice()

template<index_t from, index_t to, typename... Ts>
__host__ constexpr __device__ auto ck::TupleSlice ( const Tuple< Ts... > &  tuple)
constexpr

◆ type_convert() [1/2]

template<typename Y , enable_if_t< is_same_v< Y, ck::tf32_t >, bool > = false>
__host__ constexpr __device__ float ck::type_convert ( float  x)
inline constexpr

◆ type_convert() [2/2]

template<typename Y , typename X , ck::enable_if_t<!(ck::is_const_v< Y >||ck::is_const_v< X >), bool > = false>
__host__ constexpr __device__ Y ck::type_convert ( X  x)
constexpr
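
A minimal sketch: the destination type is given explicitly and the source type is deduced:

    const ck::half_t h = ck::type_convert<ck::half_t>(1.0f);
    const float f = ck::type_convert<float>(h);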

◆ type_convert< bf6_t, float >()

template<>
__host__ __device__ bf6_t ck::type_convert< bf6_t, float > ( float  x)
inline

Specializes float-to-bf6_t conversion.

Uses stochastic rounding if CK_USE_SR_F6_CONVERSION is defined, otherwise uses round-to-nearest-even.

Parameters
x: Input float value to convert.
Returns
Converted bf6_t value.
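
A minimal round-trip sketch (the rounding mode is fixed at build time by CK_USE_SR_F6_CONVERSION):

    const ck::bf6_t q = ck::type_convert<ck::bf6_t>(0.75f);
    const float back = ck::type_convert<float>(q); // decodes with the default scale of 1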

◆ type_convert< bf6x16_pk_t, float16_t >()

template<>
__host__ __device__ bf6x16_pk_t ck::type_convert< bf6x16_pk_t, float16_t > ( float16_t  x)
inline

◆ type_convert< bf6x16_t, float16_t >()

template<>
__host__ __device__ bf6x16_t ck::type_convert< bf6x16_t, float16_t > ( float16_t  x)
inline

◆ type_convert< bf6x32_pk_t, float32_t >()

template<>
__host__ __device__ bf6x32_pk_t ck::type_convert< bf6x32_pk_t, float32_t > ( float32_t  x)
inline

◆ type_convert< bf6x32_t, float32_t >()

template<>
__host__ __device__ bf6x32_t ck::type_convert< bf6x32_t, float32_t > ( float32_t  x)
inline

Specializes the conversion of a vector of 32 floats to bf6x32_t.

Uses stochastic rounding if CK_USE_SR_F6_CONVERSION is defined, otherwise uses round-to-nearest-even.

Parameters
x: Input float vector to convert.
Returns
Converted bf6x32_t vector.

◆ type_convert< bf8_fnuz_t, float >()

template<>
__host__ __device__ bf8_fnuz_t ck::type_convert< bf8_fnuz_t, float > ( float  x)
inline

◆ type_convert< bf8_fnuz_t, half_t >()

template<>
__host__ __device__ bf8_fnuz_t ck::type_convert< bf8_fnuz_t, half_t > ( half_t  x)
inline

◆ type_convert< bf8_ocp_t, bhalf_t >()

template<>
__host__ __device__ bf8_ocp_t ck::type_convert< bf8_ocp_t, bhalf_t > ( bhalf_t  x)
inline

Converts a bhalf_t value to a bf8_ocp_t value with rounding determined by a flag.

Parameters
x: The input bhalf_t value.
Returns
The converted bf8_ocp_t value.

◆ type_convert< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::type_convert< bf8_ocp_t, float > ( float  x)
inline

Converts a float value to a bf8_ocp_t value with rounding determined by a flag.

Parameters
x: The input float value.
Returns
The converted bf8_ocp_t value.
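
A minimal round-trip sketch (lossy: bf8_ocp_t has only 8 bits of storage):

    const ck::bf8_ocp_t q = ck::type_convert<ck::bf8_ocp_t>(0.5f);
    const float back = ck::type_convert<float>(q);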

◆ type_convert< bf8_ocp_t, half_t >()

template<>
__host__ __device__ bf8_ocp_t ck::type_convert< bf8_ocp_t, half_t > ( half_t  x)
inline

Converts a half_t value to a bf8_ocp_t value with rounding determined by a flag.

Parameters
x: The input half_t value.
Returns
The converted bf8_ocp_t value.

◆ type_convert< bf8_ocp_t, int >()

template<>
__host__ constexpr __device__ bf8_ocp_t ck::type_convert< bf8_ocp_t, int > ( int  x)
inline constexpr

◆ type_convert< bhalf2_t, bf8x2_ocp_t >()

template<>
__host__ __device__ bhalf2_t ck::type_convert< bhalf2_t, bf8x2_ocp_t > ( bf8x2_ocp_t  x)
inline

Converts a vector of 2 bf8_ocp_t values to a vector of 2 bhalf_t values.

Parameters
x: The input vector of 2 bf8_ocp_t values.
Returns
The converted vector of 2 bhalf_t values.

◆ type_convert< bhalf2_t, f8x2_ocp_t >()

template<>
__host__ __device__ bhalf2_t ck::type_convert< bhalf2_t, f8x2_ocp_t > ( f8x2_ocp_t  x)
inline

Converts a vector of 2 f8_ocp_t values to a vector of 2 bhalf_t values.

Parameters
x: The input vector of 2 f8_ocp_t values.
Returns
The converted vector of 2 bhalf_t values.

◆ type_convert< bhalf2_t, pk_i4_t >()

template<>
__host__ __device__ bhalf2_t ck::type_convert< bhalf2_t, pk_i4_t > ( pk_i4_t  x)
inline

◆ type_convert< bhalf_t, bf8_ocp_t >()

template<>
__host__ __device__ bhalf_t ck::type_convert< bhalf_t, bf8_ocp_t > ( bf8_ocp_t  x)
inline

Converts a bf8_ocp_t value to a bhalf_t value.

Parameters
x: The input bf8_ocp_t value.
Returns
The converted bhalf_t value.

◆ type_convert< bhalf_t, f8_ocp_t >()

template<>
__host__ __device__ bhalf_t ck::type_convert< bhalf_t, f8_ocp_t > ( f8_ocp_t  x)
inline

Converts a f8_ocp_t value to a bhalf_t value.

Parameters
x: The input f8_ocp_t value.
Returns
The converted bhalf_t value.

◆ type_convert< bhalf_t, float >()

template<>
__host__ constexpr __device__ bhalf_t ck::type_convert< bhalf_t, float > ( float  x)
inline constexpr

◆ type_convert< bhalf_t, half_t >()

template<>
__host__ constexpr __device__ bhalf_t ck::type_convert< bhalf_t, half_t > ( half_t  x)
inline constexpr

◆ type_convert< bhalf_t, int8_t >()

template<>
__host__ constexpr __device__ bhalf_t ck::type_convert< bhalf_t, int8_t > ( int8_t  x)
inline constexpr

◆ type_convert< f4_t, float >()

template<>
__host__ __device__ f4_t ck::type_convert< f4_t, float > ( float  x)
inline

◆ type_convert< f4x2_pk_t, float2_t >()

template<>
__host__ __device__ f4x2_pk_t ck::type_convert< f4x2_pk_t, float2_t > ( float2_t  x)
inline

◆ type_convert< f4x2_t, float2_t >()

template<>
__host__ __device__ f4x2_t ck::type_convert< f4x2_t, float2_t > ( float2_t  x)
inline

◆ type_convert< f4x32_t, float32_t >()

template<>
__host__ __device__ f4x32_t ck::type_convert< f4x32_t, float32_t > ( float32_t  x)
inline

◆ type_convert< f6_t, float >()

template<>
__host__ __device__ f6_t ck::type_convert< f6_t, float > ( float  x)
inline

Specializes the type conversion template for converting a float into the 6-bit float type (f6_t).

Depending on the CK_USE_SR_F6_CONVERSION flag, the conversion uses stochastic rounding or round-to-nearest-even.

Parameters
x: Input float value to be converted.
Returns
The converted f6_t value.

◆ type_convert< f6x16_pk_t, float16_t >()

template<>
__host__ __device__ f6x16_pk_t ck::type_convert< f6x16_pk_t, float16_t > ( float16_t  x)
inline

◆ type_convert< f6x16_t, float16_t >()

template<>
__host__ __device__ f6x16_t ck::type_convert< f6x16_t, float16_t > ( float16_t  x)
inline

◆ type_convert< f6x32_pk_t, float32_t >()

template<>
__host__ __device__ f6x32_pk_t ck::type_convert< f6x32_pk_t, float32_t > ( float32_t  x)
inline

◆ type_convert< f6x32_t, float32_t >()

template<>
__host__ __device__ f6x32_t ck::type_convert< f6x32_t, float32_t > ( float32_t  x)
inline

Specializes the type conversion template for converting a vector of 32 floats into a vector of 32 6-bit float values (f6x32_t).

Depending on the CK_USE_SR_F6_CONVERSION flag, the conversion uses stochastic rounding or round-to-nearest-even.

Parameters
x: Input float vector to be converted.
Returns
The converted f6x32_t vector.

◆ type_convert< f8_fnuz_t, float >()

template<>
__host__ __device__ f8_fnuz_t ck::type_convert< f8_fnuz_t, float > ( float  x)
inline

◆ type_convert< f8_fnuz_t, half_t >()

template<>
__host__ __device__ f8_fnuz_t ck::type_convert< f8_fnuz_t, half_t > ( half_t  x)
inline

◆ type_convert< f8_ocp_t, bhalf_t >()

template<>
__host__ __device__ f8_ocp_t ck::type_convert< f8_ocp_t, bhalf_t > ( bhalf_t  x)
inline

Converts a bhalf_t value to a f8_ocp_t value with rounding determined by a flag.

Parameters
x: The input bhalf_t value.
Returns
The converted f8_ocp_t value.

◆ type_convert< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::type_convert< f8_ocp_t, float > ( float  x)
inline

Converts a float value to a f8_ocp_t value with rounding determined by a flag.

Parameters
x: The input float value.
Returns
The converted f8_ocp_t value.

◆ type_convert< f8_ocp_t, half_t >()

template<>
__host__ __device__ f8_ocp_t ck::type_convert< f8_ocp_t, half_t > ( half_t  x)
inline

Converts a half_t value to a f8_ocp_t value with rounding determined by a flag.

Parameters
x: The input half_t value.
Returns
The converted f8_ocp_t value.

◆ type_convert< f8_ocp_t, int >()

template<>
__host__ constexpr __device__ f8_ocp_t ck::type_convert< f8_ocp_t, int > ( int  x)
inline constexpr

◆ type_convert< float, bf6_t >()

template<>
__host__ __device__ float ck::type_convert< float, bf6_t > ( bf6_t  x)
inline

Specializes the type conversion template for converting a bf6_t value to float.

Interprets the bf6_t value using the default scale factor of 1 and returns its floating-point representation.

Parameters
x: The bf6_t value to convert.
Returns
The float representation of the given bf6_t value.

◆ type_convert< float, bf8_fnuz_t >()

template<>
__host__ __device__ float ck::type_convert< float, bf8_fnuz_t > ( bf8_fnuz_t  x)
inline

◆ type_convert< float, bf8_ocp_t >()

template<>
__host__ __device__ float ck::type_convert< float, bf8_ocp_t > ( bf8_ocp_t  x)
inline

Converts a bf8_ocp_t value to a float value.

Parameters
x: The input bf8_ocp_t value.
Returns
The converted float value.

◆ type_convert< float, bhalf_t >()

template<>
__host__ constexpr __device__ float ck::type_convert< float, bhalf_t > ( bhalf_t  x)
inline constexpr

◆ type_convert< float, f4_t >()

template<>
__host__ __device__ float ck::type_convert< float, f4_t > ( f4_t  x)
inline

◆ type_convert< float, f6_t >()

template<>
__host__ __device__ float ck::type_convert< float, f6_t > ( f6_t  x)
inline

Specializes the type conversion template for converting the 6-bit float type (f6_t) to float.

Interprets an f6_t value as a float using the default scale factor of 1.

Parameters
x: The 6-bit float (f6_t) value to be converted.
Returns
The corresponding float representation.

◆ type_convert< float, f8_fnuz_t >()

template<>
__host__ __device__ float ck::type_convert< float, f8_fnuz_t > ( f8_fnuz_t  x)
inline

◆ type_convert< float, f8_ocp_t >()

template<>
__host__ __device__ float ck::type_convert< float, f8_ocp_t > ( f8_ocp_t  x)
inline

Converts a f8_ocp_t value to a float value.

Parameters
x: The input f8_ocp_t value.
Returns
The converted float value.

◆ type_convert< float16_t, bf6x16_pk_t >()

template<>
__host__ __device__ float16_t ck::type_convert< float16_t, bf6x16_pk_t > ( bf6x16_pk_t  x)
inline

◆ type_convert< float16_t, bf6x16_t >()

template<>
__host__ __device__ float16_t ck::type_convert< float16_t, bf6x16_t > ( bf6x16_t  x)
inline

◆ type_convert< float16_t, f6x16_pk_t >()

template<>
__host__ __device__ float16_t ck::type_convert< float16_t, f6x16_pk_t > ( f6x16_pk_t  x)
inline

◆ type_convert< float16_t, f6x16_t >()

template<>
__host__ __device__ float16_t ck::type_convert< float16_t, f6x16_t > ( f6x16_t  x)
inline

◆ type_convert< float2_t, bf8x2_ocp_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, bf8x2_ocp_t > ( bf8x2_ocp_t  x)
inline

Converts a vector of 2 bf8_ocp_t values to a vector of 2 float values.

Parameters
x: The input vector of 2 bf8_ocp_t values.
Returns
The converted vector of 2 float values.

◆ type_convert< float2_t, f4x2_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, f4x2_t > ( f4x2_t  x)
inline

◆ type_convert< float2_t, f8x2_fnuz_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, f8x2_fnuz_t > ( f8x2_fnuz_t  x)
inline

◆ type_convert< float2_t, f8x2_ocp_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, f8x2_ocp_t > ( f8x2_ocp_t  x)
inline

Converts a vector of 2 f8_ocp_t values to a vector of 2 float values.

Parameters
x: The input vector of 2 f8_ocp_t values.
Returns
The converted vector of 2 float values.

◆ type_convert< float2_t, pk_i4_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, pk_i4_t > ( pk_i4_t  x)
inline

◆ type_convert< float32_t, bf6x32_t >()

template<>
__host__ __device__ float32_t ck::type_convert< float32_t, bf6x32_t > ( bf6x32_t  x)
inline

Specializes the type conversion template for converting a vector of 32 bf6_t values to a vector of 32 floats.

Interprets the bf6x32_t value using the default scale factor of 1 and returns its floating-point representation.

Parameters
x: The bf6x32_t value to convert.
Returns
The float representation of the given vector.

◆ type_convert< float32_t, f4x32_t >()

template<>
__host__ __device__ float32_t ck::type_convert< float32_t, f4x32_t > ( f4x32_t  x)
inline

◆ type_convert< float32_t, f6x32_t >()

template<>
__host__ __device__ float32_t ck::type_convert< float32_t, f6x32_t > ( f6x32_t  x)
inline

Specializes the type conversion template for converting a vector of 32 6-bit float values (f6x32_t) to a vector of 32 floats.

Interprets the f6_t values as floats using the default scale factor of 1.

Parameters
x: The vector of 32 6-bit float (f6x32_t) values to be converted.
Returns
The corresponding float representation.

◆ type_convert< half2_t, bf8x2_ocp_t >()

template<>
__host__ __device__ half2_t ck::type_convert< half2_t, bf8x2_ocp_t > ( bf8x2_ocp_t  x)
inline

Converts a vector of 2 bf8_ocp_t values to a vector of 2 half_t values.

Parameters
x: The input vector of 2 bf8_ocp_t values.
Returns
The converted vector of 2 half_t values.

◆ type_convert< half2_t, f8x2_ocp_t >()

template<>
__host__ __device__ half2_t ck::type_convert< half2_t, f8x2_ocp_t > ( f8x2_ocp_t  x)
inline

Converts a vector of 2 f8_ocp_t values to a vector of 2 half_t values.

Parameters
x: The input vector of 2 f8_ocp_t values.
Returns
The converted vector of 2 half_t values.

◆ type_convert< half2_t, float2_t >()

template<>
__host__ __device__ half2_t ck::type_convert< half2_t, float2_t > ( float2_t  x)
inline

◆ type_convert< half2_t, pk_i4_t >()

template<>
__host__ __device__ half2_t ck::type_convert< half2_t, pk_i4_t > ( pk_i4_t  x)
inline

◆ type_convert< half_t, bf8_fnuz_t >()

template<>
__host__ __device__ half_t ck::type_convert< half_t, bf8_fnuz_t > ( bf8_fnuz_t  x)
inline

◆ type_convert< half_t, bf8_ocp_t >()

template<>
__host__ __device__ half_t ck::type_convert< half_t, bf8_ocp_t > ( bf8_ocp_t  x)
inline

Converts a bf8_ocp_t value to a half_t value.

Parameters
x: The input bf8_ocp_t value.
Returns
The converted half_t value.

◆ type_convert< half_t, bhalf_t >()

template<>
__host__ constexpr __device__ half_t ck::type_convert< half_t, bhalf_t > ( bhalf_t  x)
inline constexpr

◆ type_convert< half_t, f8_fnuz_t >()

template<>
__host__ __device__ half_t ck::type_convert< half_t, f8_fnuz_t > ( f8_fnuz_t  x)
inline

◆ type_convert< half_t, f8_ocp_t >()

template<>
__host__ __device__ half_t ck::type_convert< half_t, f8_ocp_t > ( f8_ocp_t  x)
inline

Converts a f8_ocp_t value to a half_t value.

Parameters
x: The input f8_ocp_t value.
Returns
The converted half_t value.

◆ type_convert< int8_t, bhalf_t >()

template<>
__host__ constexpr __device__ int8_t ck::type_convert< int8_t, bhalf_t > ( bhalf_t  x)
inline constexpr

◆ type_convert_sp()

template<typename Y , typename X >
__host__ constexpr __device__ Y ck::type_convert_sp ( X  x)
constexpr

◆ type_convert_sp< bhalf_t, float >()

template<>
__host__ constexpr __device__ bhalf_t ck::type_convert_sp< bhalf_t, float > ( float  x)
inline constexpr

◆ type_convert_sp< bhalf_t, int >()

template<>
__host__ constexpr __device__ bhalf_t ck::type_convert_sp< bhalf_t, int > ( int  x)
inline constexpr

◆ type_convert_sp< f8_t, int >()

template<>
__host__ constexpr __device__ f8_t ck::type_convert_sp< f8_t, int > ( int  x)
inline constexpr

◆ type_convert_sp< float, int >()

template<>
__host__ constexpr __device__ float ck::type_convert_sp< float, int > ( int  x)
inline constexpr

◆ type_convert_sp< half_t, float >()

template<>
__host__ constexpr __device__ half_t ck::type_convert_sp< half_t, float > ( float  x)
inline constexpr

◆ type_convert_sp< half_t, int >()

template<>
__host__ constexpr __device__ half_t ck::type_convert_sp< half_t, int > ( int  x)
inline constexpr

◆ type_convert_sp< int, bhalf_t >()

template<>
__host__ constexpr __device__ int ck::type_convert_sp< int, bhalf_t > ( bhalf_t  x)
inline constexpr

◆ type_convert_sp< int, f8_t >()

template<>
__host__ constexpr __device__ int ck::type_convert_sp< int, f8_t > ( f8_t  x)
inline constexpr

◆ type_convert_sp< int, float >()

template<>
__host__ constexpr __device__ int ck::type_convert_sp< int, float > ( float  x)
inline constexpr

◆ type_convert_sp< int, half_t >()

template<>
__host__ constexpr __device__ int ck::type_convert_sp< int, half_t > ( half_t  x)
inline constexpr

◆ unpack()

template<typename F , typename X >
__host__ constexpr __device__ auto ck::unpack ( F &&  f,
X &&  x 
)
constexpr

◆ unpack2()

template<typename F , typename X , typename Y >
__host__ constexpr __device__ auto ck::unpack2 ( F &&  f,
X &&  x,
Y &&  y 
)
constexpr

◆ UnrollNestedTuple() [1/3]

template<index_t Depth = 0, index_t MaxDepth = -1, typename T >
__host__ constexpr __device__ auto ck::UnrollNestedTuple ( const T &  element)
constexpr

◆ UnrollNestedTuple() [2/3]

template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts>
__host__ constexpr __device__ auto ck::UnrollNestedTuple ( const Tuple< Ts... > &  tuple)
constexpr

◆ UnrollNestedTuple() [3/3]

template<index_t Depth = 0, index_t MaxDepth = -1>
__host__ constexpr __device__ auto ck::UnrollNestedTuple ( const Tuple<> &  element)
constexpr
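
A minimal sketch; the expected flattened type is an assumption based on the function name:

    constexpr auto nested = ck::make_tuple(
        ck::make_tuple(ck::Number<1>{}, ck::Number<2>{}), ck::Number<3>{});
    // Assumed result: Tuple<Number<1>, Number<2>, Number<3>>.
    constexpr auto flat = ck::UnrollNestedTuple(nested);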

◆ UpdateEnvVar() [1/2]

template<typename EnvVar >
void ck::UpdateEnvVar ( EnvVar  ,
const std::string_view &  val 
)

◆ UpdateEnvVar() [2/2]

template<typename EnvVar , typename ValueType >
void ck::UpdateEnvVar ( EnvVar  ,
const ValueType &  val 
)

Updates the cached value of an environment variable.

Variable Documentation

◆ ignore

constexpr detail::ignore_t ck::ignore
inline constexpr

◆ is_base_of_v

template<typename X , typename Y >
constexpr bool ck::is_base_of_v = is_base_of<X, Y>::value
inline constexpr

◆ is_packed_type_v

template<typename T >
constexpr bool ck::is_packed_type_v = packed_size_v<T> > 1
inline constexpr

◆ is_pointer_v

template<typename T >
constexpr bool ck::is_pointer_v = is_pointer<T>::value
inline constexpr

◆ is_same_v

template<typename X , typename Y >
constexpr bool ck::is_same_v = is_same<X, Y>::value
inline constexpr

◆ is_unsigned_v

template<typename T >
constexpr bool ck::is_unsigned_v = is_unsigned<T>::value
inline constexpr

◆ packed_size_v

template<typename T >
constexpr index_t ck::packed_size_v = packed_type_info<T>::packed_size
inline constexpr