BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN > Struct Template Reference

BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN > Struct Template Reference

Composable Kernel: ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN > Struct Template Reference

Blockwise softmax. More...

#include <blockwise_softmax.hpp>

Public Types
using	ThreadSliceDesc_M = decltype(make_naive_tensor_descriptor_packed(make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0))))

using	ThreadwiseMaxReduce = typename conditional< IgnoreNaN, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false, detail::AccumulateWithNanIgnore< reduce::Max, AccDataType > >, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false > >::type

using	ThreadwiseSumReduce = typename conditional< IgnoreNaN, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false, detail::AccumulateWithNanIgnore< reduce::Add, AccDataType > >, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false > >::type

using	ThreadClusterLengths_M_K = decltype(ThreadClusterDesc_M_K{}.GetLengths())

using	BlockwiseMaxReduce = PartitionedBlockwiseReduction_v2< AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Max, false >

using	BlockwiseSumReduce = PartitionedBlockwiseReduction_v2< AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Add, false >

using	BufferType = StaticBuffer< AddressSpaceEnum::Vgpr, AccDataType, MRepeat, true >

Public Member Functions
template<typename CThreadBuffer , typename WorkspaceBuffer >
__host__ __device__ void	Run (CThreadBuffer &in_thread_buf, WorkspaceBuffer &reduce_work_buf)

Public Attributes
BufferType	max_value_buf

BufferType	sum_value_buf

Static Public Attributes
static constexpr auto	I0 = Number<0>{}

static constexpr auto	I1 = Number<1>{}

static constexpr index_t	MRepeat = ThreadSliceDesc_M_K{}.GetLength(I0)

static constexpr index_t	KRepeat = ThreadSliceDesc_M_K{}.GetLength(I1)

Detailed Description

template<index_t BlockSize, typename AccDataType, typename ThreadMap_M_K, typename ThreadClusterDesc_M_K, typename ThreadSliceDesc_M_K, bool IgnoreNaN = false>
struct ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >

Blockwise softmax.

Template Parameters

BlockSize	Block size
AccDataType	Accumulator data type
ThreadMap_M_K	Mapping from thread id to (m, k) index
ThreadClusterDesc_M_K	Descriptor of the thread cluster over the M and K dimensions
ThreadSliceDesc_M_K	Descriptor of the per-thread slice over the M and K dimensions
IgnoreNaN	Flag to ignore NaN, false by default

Member Typedef Documentation

◆ BlockwiseMaxReduce

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::BlockwiseMaxReduce = PartitionedBlockwiseReduction_v2<AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Max, false>

◆ BlockwiseSumReduce

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::BlockwiseSumReduce = PartitionedBlockwiseReduction_v2<AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Add, false>

◆ BufferType

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::BufferType = StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MRepeat, true>

◆ ThreadClusterLengths_M_K

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadClusterLengths_M_K = decltype(ThreadClusterDesc_M_K{}.GetLengths())

◆ ThreadSliceDesc_M

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadSliceDesc_M = decltype(make_naive_tensor_descriptor_packed( make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0))))

◆ ThreadwiseMaxReduce

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadwiseMaxReduce = typename conditional< IgnoreNaN, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false, detail::AccumulateWithNanIgnore<reduce::Max, AccDataType> >, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false> >::type

◆ ThreadwiseSumReduce

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadwiseSumReduce = typename conditional< IgnoreNaN, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false, detail::AccumulateWithNanIgnore<reduce::Add, AccDataType> >, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false> >::type

Member Function Documentation

◆ Run()

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

template<typename CThreadBuffer , typename WorkspaceBuffer >

__host__ __device__ void ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::Run	(	CThreadBuffer &	in_thread_buf,
		WorkspaceBuffer &	reduce_work_buf
	)

inline

Member Data Documentation

◆ I0

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

constexpr auto ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::I0 = Number<0>{}

static constexpr

◆ I1

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

constexpr auto ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::I1 = Number<1>{}

static constexpr

◆ KRepeat

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

constexpr index_t ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::KRepeat = ThreadSliceDesc_M_K{}.GetLength(I1)

static constexpr

◆ max_value_buf

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

BufferType ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::max_value_buf

◆ MRepeat

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

constexpr index_t ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::MRepeat = ThreadSliceDesc_M_K{}.GetLength(I0)

static constexpr

◆ sum_value_buf

template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>

BufferType ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::sum_value_buf

The documentation for this struct was generated from the following file:

include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp