Blockwise softmax.
More...
#include <blockwise_softmax.hpp>
|
using | ThreadSliceDesc_M = decltype(make_naive_tensor_descriptor_packed(make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0)))) |
|
using | ThreadwiseMaxReduce = typename conditional< IgnoreNaN, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false, detail::AccumulateWithNanIgnore< reduce::Max, AccDataType > >, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false > >::type |
|
using | ThreadwiseSumReduce = typename conditional< IgnoreNaN, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false, detail::AccumulateWithNanIgnore< reduce::Add, AccDataType > >, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false > >::type |
|
using | ThreadClusterLengths_M_K = decltype(ThreadClusterDesc_M_K{}.GetLengths()) |
|
using | BlockwiseMaxReduce = PartitionedBlockwiseReduction_v2< AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Max, false > |
|
using | BlockwiseSumReduce = PartitionedBlockwiseReduction_v2< AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Add, false > |
|
using | BufferType = StaticBuffer< AddressSpaceEnum::Vgpr, AccDataType, MRepeat, true > |
|
|
template<typename CThreadBuffer , typename WorkspaceBuffer > |
__host__ __device__ void | Run (CThreadBuffer &in_thread_buf, WorkspaceBuffer &reduce_work_buf) |
|
template<index_t BlockSize, typename AccDataType, typename ThreadMap_M_K, typename ThreadClusterDesc_M_K, typename ThreadSliceDesc_M_K, bool IgnoreNaN = false>
struct ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >
Blockwise softmax.
Template Parameters:
BlockSize | Block size |
AccDataType | Accumulator data type |
ThreadMap_M_K | Mapping from thread id to m_k |
ThreadClusterDesc_M_K | Threadwise cluster descriptor |
ThreadSliceDesc_M_K | Threadwise slices descriptor |
IgnoreNaN | Flag to ignore NaN, false by default |
◆ BlockwiseMaxReduce
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::BlockwiseMaxReduce = PartitionedBlockwiseReduction_v2<AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Max, false> |
◆ BlockwiseSumReduce
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::BlockwiseSumReduce = PartitionedBlockwiseReduction_v2<AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Add, false> |
◆ BufferType
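template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::BufferType = StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MRepeat, true> |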
◆ ThreadClusterLengths_M_K
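template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadClusterLengths_M_K = decltype(ThreadClusterDesc_M_K{}.GetLengths()) |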
◆ ThreadSliceDesc_M
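template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadSliceDesc_M = decltype(make_naive_tensor_descriptor_packed(make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0)))) |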
◆ ThreadwiseMaxReduce
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadwiseMaxReduce = typename conditional< IgnoreNaN, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false, detail::AccumulateWithNanIgnore<reduce::Max, AccDataType> >, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false> >::type |
◆ ThreadwiseSumReduce
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadwiseSumReduce = typename conditional< IgnoreNaN, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false, detail::AccumulateWithNanIgnore<reduce::Add, AccDataType> >, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false> >::type |
◆ Run()
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
template<typename CThreadBuffer , typename WorkspaceBuffer >
__host__ __device__ void ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::Run (CThreadBuffer & in_thread_buf, WorkspaceBuffer & reduce_work_buf) |
inline |
◆ I0
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
constexpr auto ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::I0 = Number<0>{} |
|
static constexpr |
◆ I1
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
constexpr auto ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::I1 = Number<1>{} |
|
static constexpr |
◆ KRepeat
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
constexpr index_t ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::KRepeat = ThreadSliceDesc_M_K{}.GetLength(I1) |
|
static constexpr |
◆ max_value_buf
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
◆ MRepeat
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
constexpr index_t ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::MRepeat = ThreadSliceDesc_M_K{}.GetLength(I0) |
|
static constexpr |
◆ sum_value_buf
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false>
The documentation for this struct was generated from the following file:
- /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp