GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce > Struct Template Reference

GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce > Struct Template Reference

Composable Kernel: ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce > Struct Template Reference
ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce > Struct Template Reference

#include <gridwise_softmax.hpp>

Public Types

using ThreadClusterLengths_M_K = Sequence< MThreadClusterSize, KThreadClusterSize >
 
using ThreadBufferDimAccessOrder = typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type
 
using ThreadClusterArrangeOrder = typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type
 
using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(make_tuple(Number< MThreadSliceSize >{}, Number< KThreadSliceSize >{})))
 
using ThreadReduceDstDesc_M = decltype(make_naive_tensor_descriptor_packed(make_tuple(Number< MThreadSliceSize >{})))
 
using PassThroughOp = tensor_operation::element_wise::PassThrough
 

Static Public Member Functions

static __device__ void Run (const GridDesc_M_K &in_grid_desc_m_k, const GridDesc_M_K &out_grid_desc_m_k, index_t block_group_size, index_t num_k_block_tile_iteration, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global)
 

Static Public Attributes

static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0)
 
static constexpr auto thread_cluster_desc
 
static constexpr auto I0 = Number<0>{}
 
static constexpr auto I1 = Number<1>{}
 
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize
 
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize
 

Member Typedef Documentation

◆ PassThroughOp

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::PassThroughOp = tensor_operation::element_wise::PassThrough

◆ ThreadBufferDimAccessOrder

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadBufferDimAccessOrder = typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1> >::type

◆ ThreadClusterArrangeOrder

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadClusterArrangeOrder = typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1> >::type

◆ ThreadClusterLengths_M_K

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>

◆ ThreadReduceDstDesc_M

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadReduceDstDesc_M = decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})))

◆ ThreadReduceSrcDesc_M_K

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})))

Member Function Documentation

◆ Run()

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
static __device__ void ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::Run ( const GridDesc_M_K &  in_grid_desc_m_k,
const GridDesc_M_K &  out_grid_desc_m_k,
index_t  block_group_size,
index_t  num_k_block_tile_iteration,
AccDataType  alpha,
const InDataType *const __restrict__  p_in_value_global,
AccDataType  beta,
OutDataType *const __restrict__  p_out_value_global 
)
inline static

The implementation is annotated with the following steps, in order:

1. max(x)

2. sum(exp(x - max(x)))

3. softmax

Member Data Documentation

◆ I0

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
constexpr auto ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::I0 = Number<0>{}
static constexpr

◆ I1

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
constexpr auto ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::I1 = Number<1>{}
static constexpr

◆ K_BlockTileSize

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
constexpr index_t ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::K_BlockTileSize = KThreadClusterSize * KThreadSliceSize
static constexpr

◆ M_BlockTileSize

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
constexpr index_t ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::M_BlockTileSize = MThreadClusterSize * MThreadSliceSize
static constexpr

◆ reorder_thread_cluster

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
constexpr bool ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::reorder_thread_cluster = (InSrcVectorDim == 0)
static constexpr

◆ thread_cluster_desc

template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
constexpr auto ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::thread_cluster_desc
static constexpr
Initial value:
= make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{})
__host__ constexpr __device__ auto make_cluster_descriptor(const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::Size(), 1 >::type{})
Definition: cluster_descriptor.hpp:13
typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type ThreadClusterArrangeOrder
Definition: gridwise_softmax.hpp:69
Sequence< MThreadClusterSize, KThreadClusterSize > ThreadClusterLengths_M_K
Definition: gridwise_softmax.hpp:63

The documentation for this struct was generated from the following file:
  • /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp