GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce > Struct Template Reference
ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce > Struct Template Reference
#include <gridwise_softmax.hpp>
Public Types | |
| using | ThreadClusterLengths_M_K = Sequence< MThreadClusterSize, KThreadClusterSize > |
| using | ThreadBufferDimAccessOrder = typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type |
| using | ThreadClusterArrangeOrder = typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type |
| using | ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(make_tuple(Number< MThreadSliceSize >{}, Number< KThreadSliceSize >{}))) |
| using | ThreadReduceDstDesc_M = decltype(make_naive_tensor_descriptor_packed(make_tuple(Number< MThreadSliceSize >{}))) |
| using | PassThroughOp = tensor_operation::element_wise::PassThrough |
Static Public Member Functions | |
| static __device__ void | Run (const GridDesc_M_K &in_grid_desc_m_k, const GridDesc_M_K &out_grid_desc_m_k, index_t block_group_size, index_t num_k_block_tile_iteration, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global) |
Static Public Attributes | |
| static constexpr bool | reorder_thread_cluster = (InSrcVectorDim == 0) |
| static constexpr auto | thread_cluster_desc |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr index_t | M_BlockTileSize = MThreadClusterSize * MThreadSliceSize |
| static constexpr index_t | K_BlockTileSize = KThreadClusterSize * KThreadSliceSize |
Member Typedef Documentation
◆ PassThroughOp
template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
| using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::PassThroughOp = tensor_operation::element_wise::PassThrough |
◆ ThreadBufferDimAccessOrder
template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
| using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadBufferDimAccessOrder = typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1> >::type |
◆ ThreadClusterArrangeOrder
template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
| using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadClusterArrangeOrder = typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1> >::type |
◆ ThreadClusterLengths_M_K
template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
| using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize> |
◆ ThreadReduceDstDesc_M
template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
| using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadReduceDstDesc_M = decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}))) |
◆ ThreadReduceSrcDesc_M_K
template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
| using ck::GridwiseSoftmax_mk_to_mk< InDataType, OutDataType, AccDataType, GridDesc_M_K, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, SweepOnce >::ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}))) |
Member Function Documentation
◆ Run()
template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
|
inline static |
max(x)
sum(exp(x - max(x)))
softmax
Member Data Documentation
◆ I0
template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
|
static constexpr |
◆ I1
template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
|
static constexpr |
◆ K_BlockTileSize
template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
|
static constexpr |
◆ M_BlockTileSize
template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
|
static constexpr |
◆ reorder_thread_cluster
template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
|
static constexpr |
◆ thread_cluster_desc
template<typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize, bool SweepOnce>
|
static constexpr |
Initial value:
= make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{})
__host__ constexpr __device__ auto make_cluster_descriptor(const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::Size(), 1 >::type{})
Definition: cluster_descriptor.hpp:13
typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type ThreadClusterArrangeOrder
Definition: gridwise_softmax.hpp:69
Sequence< MThreadClusterSize, KThreadClusterSize > ThreadClusterLengths_M_K
Definition: gridwise_softmax.hpp:63
The documentation for this struct was generated from the following file:
- /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp