BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN > Struct Template Reference

BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN > Struct Template Reference

Composable Kernel: ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN > Struct Template Reference
ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN > Struct Template Reference

Blockwise softmax. More...

#include <blockwise_softmax.hpp>

Public Types

using ThreadSliceDesc_M = decltype(make_naive_tensor_descriptor_packed(make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0))))
 
using ThreadwiseMaxReduce = typename conditional< IgnoreNaN, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false, detail::AccumulateWithNanIgnore< reduce::Max, AccDataType > >, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false > >::type
 
using ThreadwiseSumReduce = typename conditional< IgnoreNaN, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false, detail::AccumulateWithNanIgnore< reduce::Add, AccDataType > >, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false > >::type
 
using ThreadClusterLengths_M_K = decltype(ThreadClusterDesc_M_K{}.GetLengths())
 
using BlockwiseMaxReduce = PartitionedBlockwiseReduction_v2< AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Max, false >
 
using BlockwiseSumReduce = PartitionedBlockwiseReduction_v2< AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Add, false >
 
using BufferType = StaticBuffer< AddressSpaceEnum::Vgpr, AccDataType, MRepeat, true >
 

Public Member Functions

template<typename CThreadBuffer , typename WorkspaceBuffer >
__host__ __device__ void Run (CThreadBuffer &in_thread_buf, WorkspaceBuffer &reduce_work_buf)
 

Public Attributes

BufferType max_value_buf
 
BufferType sum_value_buf
 

Static Public Attributes

static constexpr auto I0 = Number<0>{}
 
static constexpr auto I1 = Number<1>{}
 
static constexpr index_t MRepeat = ThreadSliceDesc_M_K{}.GetLength(I0)
 
static constexpr index_t KRepeat = ThreadSliceDesc_M_K{}.GetLength(I1)
 

Detailed Description

template<index_t BlockSize, typename AccDataType, typename ThreadMap_M_K, typename ThreadClusterDesc_M_K, typename ThreadSliceDesc_M_K, bool IgnoreNaN = false>
struct ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >

Blockwise softmax.

Template Parameters
BlockSize: Block size
AccDataType: Accumulator data type
ThreadMap_M_K: Thread id to m_k
ThreadClusterDesc_M_K: Threadwise cluster descriptor
ThreadSliceDesc_M_K: Threadwise slices descriptor
IgnoreNaN: Flag to ignore NaN, false by default

Member Typedef Documentation

◆ BlockwiseMaxReduce

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::BlockwiseMaxReduce = PartitionedBlockwiseReduction_v2<AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Max, false>

◆ BlockwiseSumReduce

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::BlockwiseSumReduce = PartitionedBlockwiseReduction_v2<AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Add, false>

◆ BufferType

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::BufferType = StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MRepeat, true>

◆ ThreadClusterLengths_M_K

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadClusterLengths_M_K = decltype(ThreadClusterDesc_M_K{}.GetLengths())

◆ ThreadSliceDesc_M

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadSliceDesc_M = decltype(make_naive_tensor_descriptor_packed( make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0))))

◆ ThreadwiseMaxReduce

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadwiseMaxReduce = typename conditional< IgnoreNaN, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false, detail::AccumulateWithNanIgnore<reduce::Max, AccDataType> >, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false> >::type

◆ ThreadwiseSumReduce

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadwiseSumReduce = typename conditional< IgnoreNaN, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false, detail::AccumulateWithNanIgnore<reduce::Add, AccDataType> >, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false> >::type

Member Function Documentation

◆ Run()

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
template<typename CThreadBuffer , typename WorkspaceBuffer >
__host__ __device__ void ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::Run (CThreadBuffer &in_thread_buf, WorkspaceBuffer &reduce_work_buf)
inline

Member Data Documentation

◆ I0

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
constexpr auto ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::I0 = Number<0>{}
static constexpr

◆ I1

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
constexpr auto ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::I1 = Number<1>{}
static constexpr

◆ KRepeat

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
constexpr index_t ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::KRepeat = ThreadSliceDesc_M_K{}.GetLength(I1)
static constexpr

◆ max_value_buf

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
BufferType ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::max_value_buf

◆ MRepeat

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
constexpr index_t ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::MRepeat = ThreadSliceDesc_M_K{}.GetLength(I0)
static constexpr

◆ sum_value_buf

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
BufferType ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::sum_value_buf

The documentation for this struct was generated from the following file:
  • /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp