Blockwise softmax.  
 More...
#include <blockwise_softmax.hpp>
 | 
| using  | ThreadSliceDesc_M = decltype(make_naive_tensor_descriptor_packed(make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0)))) | 
|   | 
| using  | ThreadwiseMaxReduce = typename conditional< IgnoreNaN, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false, detail::AccumulateWithNanIgnore< reduce::Max, AccDataType > >, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false > >::type | 
|   | 
| using  | ThreadwiseSumReduce = typename conditional< IgnoreNaN, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false, detail::AccumulateWithNanIgnore< reduce::Add, AccDataType > >, ThreadwiseReduction< AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false > >::type | 
|   | 
| using  | ThreadClusterLengths_M_K = decltype(ThreadClusterDesc_M_K{}.GetLengths()) | 
|   | 
| using  | BlockwiseMaxReduce = PartitionedBlockwiseReduction_v2< AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Max, false > | 
|   | 
| using  | BlockwiseSumReduce = PartitionedBlockwiseReduction_v2< AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Add, false > | 
|   | 
| using  | BufferType = StaticBuffer< AddressSpaceEnum::Vgpr, AccDataType, MRepeat, true > | 
|   | 
 | 
| template<typename CThreadBuffer , typename WorkspaceBuffer >  | 
| __host__ __device__ void  | Run (CThreadBuffer &in_thread_buf, WorkspaceBuffer &reduce_work_buf) | 
|   | 
template<index_t BlockSize, typename AccDataType, typename ThreadMap_M_K, typename ThreadClusterDesc_M_K, typename ThreadSliceDesc_M_K, bool IgnoreNaN = false>
struct ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >
Blockwise softmax. 
- Template Parameters
 - 
  
    | BlockSize | Block size  | 
    | AccDataType | Accumulator data type  | 
    | ThreadMap_M_K | Mapping from thread id to (m, k) index  | 
    | ThreadClusterDesc_M_K | Threadwise cluster descriptor  | 
    | ThreadSliceDesc_M_K | Threadwise slices descriptor  | 
    | IgnoreNaN | Flag to ignore NaN, false by default  | 
  
   
 
◆ BlockwiseMaxReduce
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
      
        
          | using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::BlockwiseMaxReduce =  PartitionedBlockwiseReduction_v2<AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Max, false> | 
        
      
 
 
◆ BlockwiseSumReduce
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
      
        
          | using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::BlockwiseSumReduce =  PartitionedBlockwiseReduction_v2<AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadMap_M_K, reduce::Add, false> | 
        
      
 
 
◆ BufferType
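template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
          | using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::BufferType =  StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MRepeat, true> | 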
 
 
◆ ThreadClusterLengths_M_K
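template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
          | using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadClusterLengths_M_K =  decltype(ThreadClusterDesc_M_K{}.GetLengths()) | 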
 
 
◆ ThreadSliceDesc_M
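template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
          | using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadSliceDesc_M =  decltype(make_naive_tensor_descriptor_packed(make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0)))) | 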
 
 
◆ ThreadwiseMaxReduce
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
      
        
          | using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadwiseMaxReduce =  typename conditional< IgnoreNaN, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false, detail::AccumulateWithNanIgnore<reduce::Max, AccDataType> >, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Max, false> >::type | 
        
      
 
 
◆ ThreadwiseSumReduce
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
      
        
          | using ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::ThreadwiseSumReduce =  typename conditional< IgnoreNaN, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false, detail::AccumulateWithNanIgnore<reduce::Add, AccDataType> >, ThreadwiseReduction<AccDataType, ThreadSliceDesc_M_K, ThreadSliceDesc_M, reduce::Add, false> >::type | 
        
      
 
 
◆ Run()
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
template<typename CThreadBuffer , typename WorkspaceBuffer > 
  
  
      
        
          | __host__ __device__ void ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::Run  | 
          ( | 
          CThreadBuffer &  | 
          in_thread_buf,  | 
         
        
           | 
           | 
          WorkspaceBuffer &  | 
          reduce_work_buf  | 
         
        
           | 
          ) | 
           |  | 
         
       
   | 
  
inline   | 
  
 
 
◆ I0
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
  
  
      
        
          | constexpr auto ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::I0 = Number<0>{} | 
         
       
   | 
  
static constexpr   | 
  
 
 
◆ I1
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
  
  
      
        
          | constexpr auto ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::I1 = Number<1>{} | 
         
       
   | 
  
static constexpr   | 
  
 
 
◆ KRepeat
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
  
  
      
        
          | constexpr index_t ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::KRepeat = ThreadSliceDesc_M_K{}.GetLength(I1) | 
         
       
   | 
  
static constexpr   | 
  
 
 
◆ max_value_buf
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
      
 
 
◆ MRepeat
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
  
  
      
        
          | constexpr index_t ck::BlockwiseSoftmax< BlockSize, AccDataType, ThreadMap_M_K, ThreadClusterDesc_M_K, ThreadSliceDesc_M_K, IgnoreNaN >::MRepeat = ThreadSliceDesc_M_K{}.GetLength(I0) | 
         
       
   | 
  
static constexpr   | 
  
 
 
◆ sum_value_buf
template<index_t BlockSize, typename AccDataType , typename ThreadMap_M_K , typename ThreadClusterDesc_M_K , typename ThreadSliceDesc_M_K , bool IgnoreNaN = false> 
      
 
 
The documentation for this struct was generated from the following file: