/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp Source File
gridwise_elementwise_layernorm_welford_variance.hpp
Go to the documentation of this file.
__host__ constexpr __device__ T clamp(const T &x, const T &lowerbound, const T &upperbound)
Definition: math.hpp:148
Definition: ck.hpp:264
__host__ constexpr __device__ auto make_multi_index(Xs &&... xs)
Definition: array_multi_index.hpp:15
__host__ constexpr __device__ auto unpack2(F &&f, X &&x, Y &&y)
Definition: functional4.hpp:55
__host__ constexpr __device__ auto generate_tie(F &&f, Number< N >)
Definition: tuple_helper.hpp:22
__host__ constexpr __device__ auto generate_tuple(F &&f, Number< N >)
Definition: tuple_helper.hpp:15
__host__ constexpr __device__ auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition: tensor_descriptor_helper.hpp:101
__host__ constexpr __device__ auto make_cluster_descriptor(const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::Size(), 1 >::type{})
Definition: cluster_descriptor.hpp:13
static __device__ void Run(T &mean_value, T &var_value, CountDataType &count)
Definition: blockwise_welford.hpp:51
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:42
static constexpr index_t K_BlockTileSize
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:84
static constexpr bool reorder_thread_cluster
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:53
typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type ThreadBufferDimAccessOrder
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:58
static constexpr auto GammaThreadBufferNumber
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:88
static constexpr auto I0
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:79
decltype(make_naive_tensor_descriptor_packed(make_tuple(Number< MThreadSliceSize >{}))) ThreadReduceDstDesc_M
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:69
static __device__ void Run(const InGrid2dDescTuple in_grid_2d_desc_tuple, const GridDesc_M_K &x_grid_desc_m_k, const GridDesc_M_K &gamma_grid_desc_m_k, const GridDesc_M_K &beta_grid_desc_m_k, const GridDesc_M_K &y_grid_desc_m_k, index_t num_k_block_tile_iteration, AccDataType epsilon, const InDataTypePointerTuple p_in_global_tuple, XDataType *const __restrict__ p_x_lds_, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, const XElementwiseOperation x_elementwise_op, const YElementwiseOperation y_elementwise_op)
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:114
static constexpr auto thread_cluster_desc
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:63
decltype(make_naive_tensor_descriptor_packed(make_tuple(Number< MThreadSliceSize >{}, Number< XSrcVectorSize >{}))) ThreadReduceSrcDesc_M_K
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:67
static constexpr auto YThreadBufferNumber
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:90
ThreadwiseWelford< AccDataType, ThreadReduceSrcDesc_M_K, ThreadReduceDstDesc_M > ThreadwiseWelford
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:72
typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type ThreadClusterArrangeOrder
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:61
Sequence< MThreadClusterSize, KThreadClusterSize > ThreadClusterLengths_M_K
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:55
static constexpr auto BetaThreadBufferNumber
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:89
static constexpr auto I1
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:80
static __device__ int GetKPerThread(const GridDesc_M_K &x_grid_desc_m_k, int thread_k_cluster_id)
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:92
static constexpr index_t K_BlockTileStepSize
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:85
static constexpr auto I2
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:81
static constexpr auto XThreadBufferNumber
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:87
BlockwiseWelford< AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadClusterArrangeOrder > BlockwiseWelford
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:77
static constexpr index_t NumInput
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:51
static constexpr index_t M_BlockTileSize
Definition: gridwise_elementwise_layernorm_welford_variance.hpp:83
Definition: multi_index_transform.hpp:13
Definition: sequence.hpp:43
Definition: static_buffer.hpp:16
Definition: threadwise_tensor_slice_transfer.hpp:39
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:173
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &dst_desc, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:66
Definition: threadwise_tensor_slice_transfer.hpp:214
__device__ void Run(const SrcDesc &src_desc, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:243
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:355
Definition: functional.hpp:100
Definition: integral_constant.hpp:10
Definition: functional2.hpp:31
Definition: unary_element_wise_operation.hpp:241