GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce > Struct Template Reference

GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce > Struct Template Reference

Composable Kernel: ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce > Struct Template Reference

#include <gridwise_softmax.hpp>

Public Types
using	ThreadClusterLengths_M_K = Sequence< MThreadClusterSize, KThreadClusterSize >

using	ThreadBufferDimAccessOrder = typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type

using	ThreadClusterArrangeOrder = typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type

using	ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(make_tuple(Number< MThreadSliceSize >{}, Number< KThreadSliceSize >{})))

using	ThreadReduceDstDesc_M = decltype(make_naive_tensor_descriptor_packed(make_tuple(Number< MThreadSliceSize >{})))

using	PassThroughOp = tensor_operation::element_wise::PassThrough

Static Public Member Functions
static __device__ void	Run (const GridDesc_M_K &in_grid_desc_m_k, const GridDesc_M_K &out_grid_desc_m_k, index_t block_group_size, index_t num_k_block_tile_iteration, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global)

Static Public Attributes
static constexpr bool	reorder_thread_cluster = (InSrcVectorDim == 0)

static constexpr auto	thread_cluster_desc

static constexpr auto	I0 = Number<0>{}

static constexpr auto	I1 = Number<1>{}

static constexpr index_t	M_BlockTileSize = MThreadClusterSize * MThreadSliceSize

static constexpr index_t	K_BlockTileSize = KThreadClusterSize * KThreadSliceSize

Member Typedef Documentation

◆ PassThroughOp

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>

using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::PassThroughOp = tensor_operation::element_wise::PassThrough

◆ ThreadBufferDimAccessOrder

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>

using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadBufferDimAccessOrder = typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1> >::type

◆ ThreadClusterArrangeOrder

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>

using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadClusterArrangeOrder = typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1> >::type

◆ ThreadClusterLengths_M_K

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>

using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>

◆ ThreadReduceDstDesc_M

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>

using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadReduceDstDesc_M = decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})))

◆ ThreadReduceSrcDesc_M_K

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>

using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})))

Member Function Documentation

◆ Run()

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>

static __device__ void ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::Run	(	const GridDesc_M_K &	in_grid_desc_m_k,
		const GridDesc_M_K &	out_grid_desc_m_k,
		index_t	block_group_size,
		index_t	num_k_block_tile_iteration,
		AccDataType	alpha,
		const InDataType *const __restrict__	p_in_value_global,
		AccDataType	beta,
		OutDataType *const __restrict__	p_out_value_global
	)

inline static

max(x)

sum(exp(x - max(x)))

softmax

Member Data Documentation

◆ I0

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>

constexpr auto ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::I0 = Number<0>{}

static constexpr

◆ I1

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>

constexpr auto ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::I1 = Number<1>{}

static constexpr

◆ K_BlockTileSize

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>

constexpr index_t ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::K_BlockTileSize = KThreadClusterSize * KThreadSliceSize

static constexpr

◆ M_BlockTileSize

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>

constexpr index_t ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::M_BlockTileSize = MThreadClusterSize * MThreadSliceSize

static constexpr

◆ reorder_thread_cluster

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>

constexpr bool ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::reorder_thread_cluster = (InSrcVectorDim == 0)

static constexpr

◆ thread_cluster_desc

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>

constexpr auto ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::thread_cluster_desc

static constexpr

Initial value:

= make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{})

ck::make_cluster_descriptor

__host__ constexpr __device__ auto make_cluster_descriptor(const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::Size(), 1 >::type{})

Definition: cluster_descriptor.hpp:13

ck::GridwiseSoftmax_mk_to_mk::ThreadClusterArrangeOrder

typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type ThreadClusterArrangeOrder

Definition: gridwise_softmax.hpp:69

ck::GridwiseSoftmax_mk_to_mk::ThreadClusterLengths_M_K

Sequence< MThreadClusterSize, KThreadClusterSize > ThreadClusterLengths_M_K

Definition: gridwise_softmax.hpp:63

The documentation for this struct was generated from the following file:

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp

GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce > Struct Template Reference

GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce > Struct Template Reference

Public Types

Static Public Member Functions

Static Public Attributes

Member Typedef Documentation

◆ PassThroughOp

◆ ThreadBufferDimAccessOrder

◆ ThreadClusterArrangeOrder

◆ ThreadClusterLengths_M_K

◆ ThreadReduceDstDesc_M

◆ ThreadReduceSrcDesc_M_K

Member Function Documentation

◆ Run()

Member Data Documentation

◆ I0

◆ I1

◆ K_BlockTileSize

◆ M_BlockTileSize

◆ reorder_thread_cluster

◆ thread_cluster_desc

GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce > Struct Template Reference