|  | 
| template<typename Lengths , typename ArrangeOrder  = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_cluster_descriptor (const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::size(), 1 >::type{}) | 
|  | 
| template<typename LowLength > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_pass_through_transform (const LowLength &low_length) | 
|  | 
| template<typename LowLength , typename LeftPad , typename RightPad , bool SkipIsValidCheck = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_pad_transform (const LowLength &low_length, const LeftPad &left_pad, const RightPad &right_pad, bool_constant< SkipIsValidCheck >=bool_constant< false >{}) | 
|  | 
| template<typename LowLength , typename LeftPadLength , bool SkipIsValidCheck = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_left_pad_transform (const LowLength &low_length, const LeftPadLength &left_pad_, bool_constant< SkipIsValidCheck >=bool_constant< false >{}) | 
|  | 
| template<typename LowLength , typename RightPadLength , bool SkipIsValidCheck = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_right_pad_transform (const LowLength &low_length, const RightPadLength &right_pad_, bool_constant< SkipIsValidCheck >=bool_constant< false >{}) | 
|  | 
| template<typename UpLengths , typename Coefficients , typename std::enable_if< UpLengths::size()==Coefficients::size(), bool >::type  = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_embed_transform (const UpLengths &up_lengths, const Coefficients &coefficients) | 
|  | 
| template<typename LowLengths > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_merge_transform_v2_magic_division (const LowLengths &low_lengths) | 
|  | 
| template<typename LowLengths > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_merge_transform_v3_division_mod (const LowLengths &low_lengths) | 
|  | 
| template<typename LowLengths > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_merge_transform (const LowLengths &low_lengths) | 
|  | 
| template<typename UpLengths , bool Use24BitIntegerCalculation = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_unmerge_transform (const UpLengths &up_lengths, bool_constant< Use24BitIntegerCalculation >=bool_constant< false >{}) | 
|  | 
| template<typename LowerIndex > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_freeze_transform (const LowerIndex &low_idx) | 
|  | 
| template<typename UpperIndex > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_insert_transform (const UpperIndex &up_idx) | 
|  | 
| template<typename UpLengths > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_replicate_transform (const UpLengths &up_lengths) | 
|  | 
| template<typename LowLength , typename SliceBegin , typename SliceEnd > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_slice_transform (const LowLength &low_length, const SliceBegin &slice_begin, const SliceEnd &slice_end) | 
|  | 
| template<typename Modulus , typename UpLength > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_modulo_transform (const Modulus &modulus, const UpLength &up_length) | 
|  | 
| template<typename LowLengths > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_xor_transform (const LowLengths &low_lengths) | 
|  | 
| template<typename LowLength , typename OffsetLength > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_offset_transform (const LowLength &low_length, const OffsetLength &offset_length) | 
|  | 
| template<typename UpLength , typename Indices > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_indexing_transform (const UpLength &up_lengths, const Indices &indices) | 
|  | 
| template<typename UpLength , typename IndexingAdaptor > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_indexing_transform_with_adaptor (const UpLength &up_lengths, const IndexingAdaptor &iadaptor) | 
|  | 
| CK_TILE_DEVICE int32x4_t | make_wave_buffer_resource (const void *ptr, uint32_t size=0xffffffff) | 
|  | 
| CK_TILE_DEVICE void | buffer_load_fence (index_t cnt=0) | 
|  | 
| CK_TILE_DEVICE void | lds_load_fence (index_t cnt=0) | 
|  | 
| template<typename... T> | 
| CK_TILE_DEVICE void | buffer_load_fence (index_t cnt=0, T &... o) | 
|  | 
| CK_TILE_DEVICE void | buffer_store_fence (index_t cnt=0) | 
|  | 
| CK_TILE_DEVICE auto | async_load_fence_raw (index_t cnt=0) | 
|  | 
| CK_TILE_DEVICE_EXTERN int8_t | llvm_amdgcn_raw_buffer_load_i8 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8") | 
|  | 
| CK_TILE_DEVICE_EXTERN int8x2_t | llvm_amdgcn_raw_buffer_load_i8x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8") | 
|  | 
| CK_TILE_DEVICE_EXTERN int8x4_t | llvm_amdgcn_raw_buffer_load_i8x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8") | 
|  | 
| CK_TILE_DEVICE_EXTERN int16_t | llvm_amdgcn_raw_buffer_load_i16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16") | 
|  | 
| CK_TILE_DEVICE_EXTERN int16x2_t | llvm_amdgcn_raw_buffer_load_i16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16") | 
|  | 
| CK_TILE_DEVICE_EXTERN int16x4_t | llvm_amdgcn_raw_buffer_load_i16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16") | 
|  | 
| CK_TILE_DEVICE_EXTERN int32_t | llvm_amdgcn_raw_buffer_load_i32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32") | 
|  | 
| CK_TILE_DEVICE_EXTERN int32x2_t | llvm_amdgcn_raw_buffer_load_i32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32") | 
|  | 
| CK_TILE_DEVICE_EXTERN int32x4_t | llvm_amdgcn_raw_buffer_load_i32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32") | 
|  | 
| CK_TILE_DEVICE_EXTERN _Float16 | llvm_amdgcn_raw_buffer_load_fp16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16") | 
|  | 
| CK_TILE_DEVICE_EXTERN fp16x2_t | llvm_amdgcn_raw_buffer_load_fp16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16") | 
|  | 
| CK_TILE_DEVICE_EXTERN fp16x4_t | llvm_amdgcn_raw_buffer_load_fp16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16") | 
|  | 
| CK_TILE_DEVICE_EXTERN float | llvm_amdgcn_raw_buffer_load_fp32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32") | 
|  | 
| CK_TILE_DEVICE_EXTERN fp32x2_t | llvm_amdgcn_raw_buffer_load_fp32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32") | 
|  | 
| CK_TILE_DEVICE_EXTERN fp32x4_t | llvm_amdgcn_raw_buffer_load_fp32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i8 (int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i8x2 (int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i8x4 (int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i16 (int16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i16x2 (int16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i16x4 (int16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_ui16 (uint16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_ui16x2 (uint16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_ui16x4 (uint16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i32x2 (int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i32x4 (int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_fp16 (_Float16 vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_fp16x2 (fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_fp16x4 (fp16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_fp32x2 (fp32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_fp32x4 (fp32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32") | 
|  | 
| CK_TILE_DEVICE_EXTERN fp16x2_t | llvm_amdgcn_raw_buffer_atomic_add_fp16x2 (fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16") | 
|  | 
| CK_TILE_DEVICE_EXTERN int32_t | llvm_amdgcn_raw_buffer_atomic_add_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32") | 
|  | 
| CK_TILE_DEVICE_EXTERN float | llvm_amdgcn_raw_buffer_atomic_add_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32") | 
|  | 
| CK_TILE_DEVICE_EXTERN double | llvm_amdgcn_raw_buffer_atomic_max_fp64 (double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64") | 
|  | 
| CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_load_lds (int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds") | 
|  | 
| template<bool pre_nop = false> | 
| CK_TILE_DEVICE void | async_buffer_load_dword_v (void *smem, int32x4_t rsrc, index_t voffset, index_t, index_t ioffset, index_t=0, bool_constant< pre_nop >={}) | 
|  | 
| CK_TILE_DEVICE void | async_buffer_load_fence (index_t cnt=0) | 
|  | 
| template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> | 
| CK_TILE_DEVICE thread_buffer< int8_t, N > | amd_buffer_load_impl_with_bytes (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> | 
| CK_TILE_DEVICE thread_buffer< T, N > | amd_buffer_load_impl (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false> | 
| CK_TILE_DEVICE void | amd_buffer_load_raw_impl (thread_buffer< T, N > &dst, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_linear_addr_offset, index_t flag=0, bool_constant< pre_nop >={}) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false> | 
| CK_TILE_DEVICE void | amd_async_buffer_load_impl (T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, bool_constant< pre_nop >={}) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | 
| CK_TILE_DEVICE void | amd_async_buffer_load (CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, index_t flag=0, bool_constant< oob_conditional_check >={}) | 
|  | 
| template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> | 
| CK_TILE_DEVICE void | amd_buffer_store_impl_with_bytes (const thread_buffer< int8_t, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> | 
| CK_TILE_DEVICE void | amd_buffer_store_impl (const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | 
| CK_TILE_DEVICE void | amd_buffer_store_raw_impl (const thread_buffer< T, N > &dst_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset, index_t dst_linear_addr_offset, index_t is_valid_element=1) | 
|  | 
| template<typename T , index_t N> | 
| CK_TILE_DEVICE void | amd_buffer_atomic_add_impl (const thread_buffer< T, N > &src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) | 
|  | 
| template<typename T , index_t N> | 
| CK_TILE_DEVICE void | amd_buffer_atomic_max_impl (const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | 
| CK_TILE_DEVICE thread_buffer< T, N > | amd_buffer_load_invalid_element_return_zero (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | 
| CK_TILE_DEVICE thread_buffer< T, N > | amd_buffer_load_invalid_element_return_customized_value (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false> | 
| CK_TILE_DEVICE void | amd_buffer_load_raw (thread_buffer< T, N > &dst, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, index_t is_valid_element=0, bool_constant< pre_nop >={}) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false> | 
| CK_TILE_DEVICE void | amd_buffer_load_raw (thread_buffer< T, N > &dst, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t is_valid_element=0, bool_constant< pre_nop >={}) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false> | 
| CK_TILE_DEVICE void | amd_async_buffer_load_with_oob_raw (T *smem, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, bool_constant< pre_nop >={}) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false> | 
| CK_TILE_DEVICE void | amd_async_buffer_load_with_oob_raw (T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool_constant< pre_nop >={}) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = false> | 
| CK_TILE_DEVICE void | amd_async_buffer_load_with_oob (CK_TILE_LDS_ADDR T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | 
| CK_TILE_DEVICE void | amd_buffer_store (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | 
| CK_TILE_DEVICE void | amd_buffer_store_raw (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) | 
|  | 
| template<typename T , index_t N> | 
| CK_TILE_DEVICE void | amd_buffer_atomic_add (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) | 
|  | 
| template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false> | 
| CK_TILE_DEVICE void | amd_buffer_atomic_add_raw (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size, bool_constant< pre_nop >={}) | 
|  | 
| template<typename T , index_t N> | 
| CK_TILE_DEVICE void | amd_buffer_atomic_max (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) | 
|  | 
| template<typename T , index_t NumElemsPerThread> | 
| CK_TILE_DEVICE void | amd_direct_load_global_to_lds (const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE index_t | get_warp_size () | 
|  | 
| CK_TILE_DEVICE index_t | get_grid_size () | 
|  | 
| CK_TILE_DEVICE index_t | get_block_size () | 
|  | 
| CK_TILE_DEVICE index_t | get_thread_local_1d_id () | 
|  | 
| CK_TILE_DEVICE index_t | get_thread_global_1d_id () | 
|  | 
| CK_TILE_DEVICE index_t | get_block_1d_id () | 
|  | 
| CK_TILE_DEVICE index_t | get_lane_id () | 
|  | 
| CK_TILE_DEVICE index_t | get_warp_id () | 
|  | 
| CK_TILE_DEVICE index_t | get_thread_id () | 
|  | 
| CK_TILE_DEVICE index_t | get_block_id () | 
|  | 
| CK_TILE_DEVICE void | block_sync_lds () | 
|  | 
| CK_TILE_DEVICE void | block_sync_load_raw (index_t cnt=0) | 
|  | 
| CK_TILE_DEVICE void | block_sync_lds_direct_load () | 
|  | 
| CK_TILE_DEVICE void | s_nop (index_t cnt=0) | 
|  | 
| template<typename T > | 
| __device__ T * | cast_pointer_to_generic_address_space (T CK_CONSTANT_ADDRESS_SPACE *p) | 
|  | 
| template<typename T > | 
| __host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * | cast_pointer_to_constant_address_space (T *p) | 
|  | 
| template<typename T , typename ComputeType > | 
| CK_TILE_HOST_DEVICE T | add (const T &a, const T &b) | 
|  | 
| CK_TILE_HOST_DEVICE bf16x2_t | add_bf16x2_t (const bf16x2_t &a, const bf16x2_t &b) | 
|  | 
| CK_TILE_HOST_DEVICE bf16x4_t | add_bf16x4_t (const bf16x4_t &a, const bf16x4_t &b) | 
|  | 
| CK_TILE_HOST_DEVICE fp8x4_t | add_fp8x4_t (const fp8x4_t &a, const fp8x4_t &b) | 
|  | 
| CK_TILE_HOST_DEVICE fp8x8_t | add_fp8x8_t (const fp8x8_t &a, const fp8x8_t &b) | 
|  | 
| CK_TILE_HOST_DEVICE bf8x4_t | add_bf8x4_t (const bf8x4_t &a, const bf8x4_t &b) | 
|  | 
| CK_TILE_HOST_DEVICE bf8x8_t | add_bf8x8_t (const bf8x8_t &a, const bf8x8_t &b) | 
|  | 
| template<typename X > | 
| CK_TILE_DEVICE void | atomic_add (X *p_dst, const X &x) | 
|  | 
| template<> | 
| CK_TILE_DEVICE void | atomic_add< bf16x2_t > (bf16x2_t *p_dst, const bf16x2_t &x) | 
|  | 
| template<> | 
| CK_TILE_DEVICE void | atomic_add< bf16x4_t > (bf16x4_t *p_dst, bf16x4_t const &x) | 
|  | 
| template<> | 
| CK_TILE_DEVICE void | atomic_add< fp8x4_t > (fp8x4_t *p_dst, const fp8x4_t &x) | 
|  | 
| template<> | 
| CK_TILE_DEVICE void | atomic_add< bf8x4_t > (bf8x4_t *p_dst, const bf8x4_t &x) | 
|  | 
| template<> | 
| CK_TILE_DEVICE void | atomic_add< fp8x8_t > (fp8x8_t *p_dst, fp8x8_t const &x) | 
|  | 
| template<> | 
| CK_TILE_DEVICE void | atomic_add< bf8x8_t > (bf8x8_t *p_dst, bf8x8_t const &x) | 
|  | 
| template<typename T , index_t N> | 
| CK_TILE_DEVICE void | atomic_add_g (T *p_dst, const thread_buffer< T, N > &x) | 
|  | 
| template<typename T , index_t N> | 
| CK_TILE_DEVICE void | atomic_max_g (T *p_dst, const thread_buffer< T, N > &x) | 
|  | 
| CK_TILE_DEVICE void | m0_set_with_memory (index_t v) | 
|  | 
| CK_TILE_DEVICE void | m0_inc_with_memory (index_t v) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | warp_shuffle_up (const T &v_local, uint32_t lane_delta) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | warp_shuffle_down (const T &v_local, uint32_t lane_delta) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | warp_shuffle (const T &v_local, uint32_t src_lane) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE auto | flag_to_exec (const T &v_flag) | 
|  | 
| template<typename X , typename Y > | 
| CK_TILE_DEVICE auto | cmp_lt_to_exec (const X &x, const Y &y) | 
|  | 
| template<typename D  = void, typename... Ts> | 
| constexpr CK_TILE_HOST_DEVICE details::return_type< D, Ts... > | make_array (Ts &&... ts) | 
|  | 
| template<typename T , index_t Size> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_array_with (std::initializer_list< T > ilist) | 
|  | 
| template<typename T , index_t Size> | 
| constexpr CK_TILE_HOST_DEVICE bool | operator== (const array< T, Size > &a, const array< T, Size > &b) | 
|  | 
| template<typename T , index_t Size> | 
| constexpr CK_TILE_HOST_DEVICE bool | operator!= (const array< T, Size > &a, const array< T, Size > &b) | 
|  | 
| template<typename T , index_t N, typename X > | 
| constexpr CK_TILE_HOST_DEVICE auto | to_array (const std::vector< X > &x) | 
|  | 
| template<typename T , index_t N, typename X > | 
| constexpr CK_TILE_HOST_DEVICE auto | to_array (const X &x) | 
|  | 
| template<typename TData , index_t NSize> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_push_back (const array< TData, NSize > &a, const TData &x) | 
|  | 
| template<typename... Ts, typename T > | 
| constexpr CK_TILE_HOST_DEVICE auto | container_push_front (const tuple< Ts... > &a, const T &x) | 
|  | 
| template<typename... Ts, typename T > | 
| constexpr CK_TILE_HOST_DEVICE auto | container_push_back (const tuple< Ts... > &a, const T &x) | 
|  | 
| template<typename TData , index_t NSize, index_t... IRs> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_new2old (const array< TData, NSize > &old_array, sequence< IRs... >) | 
|  | 
| template<typename TData , index_t NSize, index_t... IRs> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_old2new (const array< TData, NSize > &old_array, sequence< IRs... > old2new) | 
|  | 
| template<typename TData , index_t NSize> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_new2old (const array< TData, NSize > &old_array, const map< index_t, index_t > &new2old) | 
|  | 
| template<typename TData , index_t NSize> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_old2new (const array< TData, NSize > &old_array, const map< index_t, index_t > &old2new) | 
|  | 
| template<typename... Ts, index_t... IRs> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_new2old (const tuple< Ts... > &old_tuple, sequence< IRs... >) | 
|  | 
| template<typename... Ts, index_t... IRs> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_old2new (const tuple< Ts... > &old_tuple, sequence< IRs... > old2new) | 
|  | 
| template<index_t... Is, index_t... IRs> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_new2old (sequence< Is... >, sequence< IRs... >) | 
|  | 
| template<index_t... Is, index_t... IRs> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_old2new (sequence< Is... > old_seq, sequence< IRs... >) | 
|  | 
| template<typename Container , typename Reduce , typename ROld , index_t I, index_t IEnd, index_t IStep> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reduce_impl (const Container &x, Reduce reduce, ROld r_old, number< I > i, number< IEnd >, number< IStep >) | 
|  | 
| template<typename Container , typename Reduce , typename Init , index_t IBegin = 0, index_t IEnd = Container::size(), index_t IStep = 1> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reduce (const Container &x, Reduce reduce, Init init, number< IBegin >=number< 0 >{}, number< IEnd >=number< Container::size()>{}, number< IStep >=number< 1 >{}) | 
|  | 
| template<typename TData , index_t NSize, typename Reduce > | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reverse_inclusive_scan (const array< TData, NSize > &x, Reduce f, TData init) | 
|  | 
| template<typename TData , index_t NSize, typename Reduce , typename Init > | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reverse_exclusive_scan (const array< TData, NSize > &x, Reduce f, Init init) | 
|  | 
| template<index_t... Is, typename Reduce , index_t Init> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reverse_exclusive_scan (const sequence< Is... > &seq, Reduce f, number< Init >) | 
|  | 
| template<typename... Xs, typename Reduce , index_t I, typename YOld , typename ROld > | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reverse_exclusive_scan_impl (const tuple< Xs... > &x, Reduce reduce, number< I > i, YOld y_old, ROld r_old) | 
|  | 
| template<typename... Xs, typename Reduce , typename Init > | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reverse_exclusive_scan (const tuple< Xs... > &x, Reduce reduce, Init init) | 
|  | 
| template<typename... Xs, typename Reduce , typename TData > | 
| constexpr CK_TILE_HOST_DEVICE auto | container_reverse_inclusive_scan (const tuple< Xs... > &x, Reduce f, TData init) | 
|  | 
| template<typename X , typename... Ys> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_concat (const X &x, const Ys &... ys) | 
|  | 
| template<typename T , index_t NX, index_t NY> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_concat (const array< T, NX > &ax, const array< T, NY > &ay) | 
|  | 
| template<typename... X, typename... Y> | 
| constexpr CK_TILE_HOST_DEVICE auto | container_concat (const tuple< X... > &tx, const tuple< Y... > &ty) | 
|  | 
| template<typename Container > | 
| constexpr CK_TILE_HOST_DEVICE auto | container_concat (const Container &x) | 
|  | 
| template<typename T , index_t N, index_t... Is> | 
| constexpr CK_TILE_HOST_DEVICE auto | get_container_subset (const array< T, N > &arr, sequence< Is... >) | 
|  | 
| template<typename... Ts, index_t... Is> | 
| constexpr CK_TILE_HOST_DEVICE auto | get_container_subset (const tuple< Ts... > &tup, sequence< Is... >) | 
|  | 
| template<typename T , index_t N, index_t... Is> | 
| constexpr CK_TILE_HOST_DEVICE void | set_container_subset (array< T, N > &y, sequence< Is... > picks, const array< T, sizeof...(Is)> &x) | 
|  | 
| template<typename Y , typename X , index_t... Is> | 
| constexpr CK_TILE_HOST_DEVICE void | set_container_subset (Y &y, sequence< Is... > picks, const X &x) | 
|  | 
| template<index_t... Is> | 
| constexpr index_t | container_find (sequence< Is... > seq, index_t value) | 
|  | 
| template<index_t... Is> | 
| constexpr CK_TILE_HOST_DEVICE auto | sequence_to_tuple_of_number (sequence< Is... >) | 
|  | 
| template<typename... Xs> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_multi_index (Xs &&... xs) | 
|  | 
| template<index_t NSize> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_zero_multi_index () | 
|  | 
| template<typename T > | 
| constexpr CK_TILE_HOST_DEVICE auto | to_multi_index (const T &x) | 
|  | 
| template<index_t NSize, typename X > | 
| constexpr CK_TILE_HOST_DEVICE auto | operator+= (multi_index< NSize > &y, const X &x) | 
|  | 
| template<index_t NSize, typename X > | 
| constexpr CK_TILE_HOST_DEVICE auto | operator-= (multi_index< NSize > &y, const X &x) | 
|  | 
| template<index_t NSize, typename T > | 
| constexpr CK_TILE_HOST_DEVICE auto | operator+ (const multi_index< NSize > &a, const T &b) | 
|  | 
| template<index_t NSize, typename T > | 
| constexpr CK_TILE_HOST_DEVICE auto | operator- (const multi_index< NSize > &a, const T &b) | 
|  | 
| template<index_t NSize, typename T > | 
| constexpr CK_TILE_HOST_DEVICE auto | operator* (const multi_index< NSize > &a, const T &b) | 
|  | 
| template<index_t NSize> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator* (index_t a, const multi_index< NSize > &x) | 
|  | 
| template<index_t NSize> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator* (const multi_index< NSize > &x, index_t a) | 
|  | 
| template<index_t I, index_t... Is> | 
| constexpr CK_TILE_HOST_DEVICE auto | sequence_pop_front (sequence< I, Is... >) | 
|  | 
| template<typename Seq > | 
| constexpr CK_TILE_HOST_DEVICE auto | sequence_pop_back (Seq) | 
|  | 
| template<index_t... Xs, index_t... Ys> | 
| constexpr CK_TILE_HOST_DEVICE bool | operator== (sequence< Xs... >, sequence< Ys... >) | 
|  | 
| template<index_t... Xs, index_t... Ys> | 
| constexpr CK_TILE_HOST_DEVICE bool | operator!= (sequence< Xs... > x, sequence< Ys... > y) | 
|  | 
| template<index_t... Xs, index_t... Ys> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator+ (sequence< Xs... >, sequence< Ys... >) | 
|  | 
| template<index_t... Xs, index_t... Ys> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator- (sequence< Xs... >, sequence< Ys... >) | 
|  | 
| template<index_t... Xs, index_t... Ys> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator* (sequence< Xs... >, sequence< Ys... >) | 
|  | 
| template<index_t... Xs, index_t... Ys> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator/ (sequence< Xs... >, sequence< Ys... >) | 
|  | 
| template<index_t... Xs, index_t... Ys> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator% (sequence< Xs... >, sequence< Ys... >) | 
|  | 
| template<index_t... Xs, index_t Y> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator+ (sequence< Xs... >, number< Y >) | 
|  | 
| template<index_t... Xs, index_t Y> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator- (sequence< Xs... >, number< Y >) | 
|  | 
| template<index_t... Xs, index_t Y> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator* (sequence< Xs... >, number< Y >) | 
|  | 
| template<index_t... Xs, index_t Y> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator/ (sequence< Xs... >, number< Y >) | 
|  | 
| template<index_t... Xs, index_t Y> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator% (sequence< Xs... >, number< Y >) | 
|  | 
| template<index_t Y, index_t... Xs> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator+ (number< Y >, sequence< Xs... >) | 
|  | 
| template<index_t Y, index_t... Xs> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator- (number< Y >, sequence< Xs... >) | 
|  | 
| template<index_t Y, index_t... Xs> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator* (number< Y >, sequence< Xs... >) | 
|  | 
| template<index_t Y, index_t... Xs> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator/ (number< Y >, sequence< Xs... >) | 
|  | 
| template<index_t Y, index_t... Xs> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator% (number< Y >, sequence< Xs... >) | 
|  | 
| template<typename... Seqs> | 
| constexpr CK_TILE_HOST_DEVICE auto | merge_sequences (Seqs...) | 
|  | 
| template<typename F , index_t... Xs> | 
| constexpr CK_TILE_HOST_DEVICE auto | transform_sequences (F f, sequence< Xs... >) | 
|  | 
| template<typename F , index_t... Xs, index_t... Ys> | 
| constexpr CK_TILE_HOST_DEVICE auto | transform_sequences (F f, sequence< Xs... >, sequence< Ys... >) | 
|  | 
| template<typename F , index_t... Xs, index_t... Ys, index_t... Zs> | 
| constexpr CK_TILE_HOST_DEVICE auto | transform_sequences (F f, sequence< Xs... >, sequence< Ys... >, sequence< Zs... >) | 
|  | 
| template<typename Seq , typename Reduce , index_t Init> | 
| constexpr CK_TILE_HOST_DEVICE auto | reverse_inclusive_scan_sequence (Seq, Reduce, number< Init >) | 
|  | 
| template<typename Seq , typename Reduce , index_t Init> | 
| constexpr CK_TILE_HOST_DEVICE auto | reverse_exclusive_scan_sequence (Seq, Reduce, number< Init >) | 
|  | 
| template<typename Seq , typename Reduce , index_t Init> | 
| constexpr CK_TILE_HOST_DEVICE auto | inclusive_scan_sequence (Seq, Reduce, number< Init >) | 
|  | 
| template<typename Seq , typename Reduce , index_t Init> | 
| constexpr auto | exclusive_scan_sequence (Seq, Reduce, number< Init >) | 
|  | 
| template<typename Seq > | 
| constexpr auto | prefix_sum_sequence (Seq) | 
|  | 
| template<typename Seq , index_t... Is> | 
| constexpr CK_TILE_HOST_DEVICE auto | pick_sequence_elements_by_ids (Seq, sequence< Is... >) | 
|  | 
| template<typename Seq , typename Mask > | 
| constexpr CK_TILE_HOST_DEVICE auto | pick_sequence_elements_by_mask (Seq, Mask) | 
|  | 
| template<typename Seq , typename Values , typename Ids > | 
| constexpr CK_TILE_HOST_DEVICE auto | modify_sequence_elements_by_ids (Seq, Values, Ids) | 
|  | 
| template<typename Seq , typename Reduce , index_t Init> | 
| constexpr CK_TILE_HOST_DEVICE index_t | reduce_on_sequence (Seq, Reduce f, number< Init >) | 
|  | 
| template<typename Seq , typename F > | 
| constexpr CK_TILE_HOST_DEVICE bool | sequence_any_of (Seq, F f) | 
|  | 
| template<typename Seq , typename F > | 
| constexpr CK_TILE_HOST_DEVICE bool | sequence_all_of (Seq, F f) | 
|  | 
| template<index_t... Is> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_sequence (number< Is >...) | 
|  | 
| template<typename F , index_t N> | 
| constexpr CK_TILE_HOST_DEVICE auto | generate_sequence (F, number< N >) | 
|  | 
| template<typename F , index_t N> | 
| constexpr CK_TILE_HOST_DEVICE auto | generate_sequence_v2 (F &&f, number< N >) | 
|  | 
| template<index_t... Is> | 
| constexpr CK_TILE_HOST_DEVICE auto | to_sequence (tuple< number< Is >... >) | 
|  | 
| template<typename SeqSortedSamples , index_t r, index_t... rs> | 
| constexpr CK_TILE_HOST_DEVICE auto | histogram_sorted_sequence (SeqSortedSamples, sequence< r, rs... >) | 
|  | 
| template<typename F , index_t N> | 
| constexpr CK_TILE_HOST_DEVICE auto | generate_array (F &&f, number< N >) | 
|  | 
| template<typename Seq , index_t SliceSize, typename Mask  = typename uniform_sequence_gen<Seq::size(), 1>::type> | 
| constexpr auto | reverse_slice_sequence (Seq, number< SliceSize >, Mask=typename uniform_sequence_gen< Seq::size(), 1 >::type{}) | 
|  | 
| template<typename Seq , index_t SliceSize, typename Mask  = typename uniform_sequence_gen<Seq::size(), 1>::type> | 
| constexpr auto | slice_sequence (Seq, number< SliceSize >, Mask=typename uniform_sequence_gen< Seq::size(), 1 >::type{}) | 
|  | 
| template<typename... Ts> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_thread_buffer (Ts &&... ts) | 
|  | 
| template<typename... Xs> | 
| constexpr CK_TILE_HOST_DEVICE bool | operator== (const tuple< Xs... > &a, const tuple< Xs... > &b) | 
|  | 
| template<typename... Xs> | 
| constexpr CK_TILE_HOST_DEVICE bool | operator!= (const tuple< Xs... > &a, const tuple< Xs... > &b) | 
|  | 
| template<typename... Xs> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_tuple (Xs &&... xs) | 
|  | 
| template<typename... Args> | 
| constexpr tuple< Args &... > | tie (Args &... args) noexcept | 
|  | 
| template<typename F , index_t N> | 
| constexpr CK_TILE_HOST_DEVICE auto | generate_tuple (F &&f, number< N >) | 
|  | 
| template<typename F , index_t N> | 
| constexpr CK_TILE_HOST_DEVICE auto | generate_tie (F &&f, number< N >) | 
|  | 
| template<typename... X, typename... Y> | 
| constexpr CK_TILE_HOST_DEVICE auto | concat_tuple_of_reference (const tuple< X &... > &tx, const tuple< Y &... > &ty) | 
|  | 
| template<typename... X, typename... Y> | 
| constexpr CK_TILE_HOST_DEVICE auto | concat_tuple (const tuple< X... > &tx, const tuple< Y... > &ty) | 
|  | 
| template<typename... X> | 
| constexpr CK_TILE_HOST_DEVICE auto | concat_tuple (const tuple< X... > &tx) | 
|  | 
| template<typename... X, typename... Tuples> | 
| constexpr CK_TILE_HOST_DEVICE auto | concat_tuple (const tuple< X... > &tx, const Tuples &... tuples) | 
|  | 
| template<typename F , typename X > | 
| constexpr CK_TILE_HOST_DEVICE auto | transform_tuples (F f, const X &x) | 
|  | 
| template<typename F , typename X , typename Y > | 
| constexpr CK_TILE_HOST_DEVICE auto | transform_tuples (F f, const X &x, const Y &y) | 
|  | 
| template<typename F , typename X , typename Y , typename Z > | 
| constexpr CK_TILE_HOST_DEVICE auto | transform_tuples (F f, const X &x, const Y &y, const Z &z) | 
|  | 
| template<typename F , typename X > | 
| constexpr CK_TILE_HOST_DEVICE auto | embed_tuples (F f, const X &x) | 
|  | 
| template<index_t Depth = 0, index_t MaxDepth = -1> | 
| constexpr CK_TILE_HOST_DEVICE auto | unroll_nested_tuple (const tuple<> &t) | 
|  | 
| template<index_t Depth = 0, index_t MaxDepth = -1, typename T > | 
| constexpr CK_TILE_HOST_DEVICE auto | unroll_nested_tuple (const T &t) | 
|  | 
| template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts> | 
| constexpr CK_TILE_HOST_DEVICE auto | unroll_nested_tuple (const tuple< Ts... > &t) | 
|  | 
| template<typename... Ts> | 
| constexpr CK_TILE_HOST_DEVICE auto | tuple_reverse (const tuple< Ts... > &t) | 
|  | 
| template<index_t Idx, index_t End, typename F , typename... Ts> | 
| constexpr CK_TILE_HOST_DEVICE auto | tuple_reduce (F &&f, const tuple< Ts... > &t) | 
|  | 
| template<typename... Ts> | 
| constexpr CK_TILE_HOST_DEVICE auto | is_nested_tuple (const tuple< Ts... > &) | 
|  | 
| template<index_t depth = 0, typename T > | 
| constexpr CK_TILE_HOST_DEVICE auto | tuple_depth (const T &) | 
|  | 
| template<index_t depth = 0, typename... Ts> | 
| constexpr CK_TILE_HOST_DEVICE auto | tuple_depth (const tuple< Ts... > &) | 
|  | 
| template<typename... Seqs> | 
| constexpr CK_TILE_HOST_DEVICE auto | to_array_of_array (tuple< Seqs... > t_of_s) | 
|  | 
| template<typename... Ys, typename X , std::enable_if_t<!std::is_integral< X >::value &&!std::is_floating_point< X >::value, bool >  = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator+= (tuple< Ys... > &y, const X &x) | 
|  | 
| template<typename... Ys, typename X , std::enable_if_t<!std::is_integral< X >::value &&!std::is_floating_point< X >::value, bool >  = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator-= (tuple< Ys... > &y, const X &x) | 
|  | 
| template<typename... Xs, typename Y , std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool >  = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator+ (const tuple< Xs... > &x, const Y &y) | 
|  | 
| template<typename... Xs, typename... Ys> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator+ (const tuple< Xs... > &x, const tuple< Ys... > &y) | 
|  | 
| template<typename... Xs, typename Y , std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool >  = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator- (const tuple< Xs... > &x, const Y &y) | 
|  | 
| template<typename... Xs, typename... Ys> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator- (const tuple< Xs... > &x, const tuple< Ys... > &y) | 
|  | 
| template<typename... Xs, typename Y , std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool >  = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator* (const tuple< Xs... > &x, const Y &y) | 
|  | 
| template<typename... Xs, typename Y , std::enable_if_t< std::is_integral< Y >::value||std::is_floating_point< Y >::value, bool >  = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator* (Y a, const tuple< Xs... > &x) | 
|  | 
| template<typename... Xs, typename Y , std::enable_if_t< std::is_integral< Y >::value||std::is_floating_point< Y >::value, bool >  = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator* (const tuple< Xs... > &x, Y a) | 
|  | 
| template<typename... Xs, typename... Ys> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator* (const tuple< Xs... > &x, const tuple< Ys... > &y) | 
|  | 
| template<typename... Xs, typename... Ys> | 
| constexpr CK_TILE_HOST_DEVICE auto | operator/ (const tuple< Xs... > &x, const tuple< Ys... > &y) | 
|  | 
| template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)> | 
| constexpr CK_TILE_HOST_DEVICE uint16_t | float_to_bf16_raw (float f, constant< rounding >={}) | 
|  | 
| template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)> | 
| constexpr CK_TILE_HOST_DEVICE uint16_t | double_to_bf16_raw (double f, constant< rounding >={}) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE float | bf16_to_float_raw (uint16_t x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE double | bf16_to_double_raw (uint16_t x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE uint16_t | float_to_bf16_rtn_raw (float f) | 
|  | 
| constexpr CK_TILE_HOST uint16_t | float_to_bf16_rtn_asm (float f) | 
|  | 
| CK_TILE_HOST uint16_t | float_to_bf16_rta_asm (float f) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE uint16_t | float_to_bf16_truc_nan_raw (float f) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE uint16_t | float_to_bf16_truc_raw (float f) | 
|  | 
| template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)> | 
| constexpr CK_TILE_HOST_DEVICE bfloat16_t | float_to_bf16 (float f, constant< rounding >={}) | 
|  | 
| template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)> | 
| constexpr CK_TILE_HOST_DEVICE bfloat16_t | double_to_bf16 (double f, constant< rounding >={}) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE float | bf16_to_float (bfloat16_t x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE double | bf16_to_double (bfloat16_t x) | 
|  | 
| template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)> | 
| CK_TILE_HOST_DEVICE constexpr bfloat16_t | fp16_to_bf16 (half_t f, constant< rounding >={}) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE half_t | bf16_to_fp16 (bfloat16_t x) | 
|  | 
| CK_TILE_HOST_DEVICE bfloat16_t | abs (const bfloat16_t &x) | 
|  | 
| CK_TILE_HOST_DEVICE bool | isnan (const bfloat16_t &x) | 
|  | 
| CK_TILE_DEVICE bfloat16_t | sqrt (bfloat16_t x) | 
|  | 
| CK_TILE_DEVICE bfloat16_t | exp (bfloat16_t x) | 
|  | 
| CK_TILE_DEVICE bfloat16_t | exp2 (bfloat16_t x) | 
|  | 
| CK_TILE_DEVICE bfloat16_t | log (bfloat16_t x) | 
|  | 
| template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)> | 
| CK_TILE_HOST_DEVICE uint8_t | float_to_fp8_raw (float, constant< rounding >={}) | 
|  | 
| template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)> | 
| CK_TILE_HOST_DEVICE uint8_t | float_to_bf8_raw (float, constant< rounding >={}) | 
|  | 
| CK_TILE_HOST_DEVICE float | fp8_to_float_raw (uint8_t) | 
|  | 
| CK_TILE_HOST_DEVICE float | bf8_to_float_raw (uint8_t) | 
|  | 
| template<typename SrcT , typename DstT > | 
| CK_TILE_HOST_DEVICE numeric_traits< DstT >::bitwise_type | float_to_fp8_sr_raw (SrcT x) | 
|  | Converts a floating-point value to an 8-bit floating-point representation with stochastic rounding.  More... 
 | 
|  | 
| template<typename SrcT , typename DstT > | 
| CK_TILE_HOST_DEVICE numeric_traits< DstT >::bitwise_type | float_to_fp8_rtn_raw (SrcT x) | 
|  | Converts a floating-point value to an 8-bit floating-point representation with rounding to nearest even.  More... 
 | 
|  | 
| template<fp8_rounding_mode rounding> | 
| CK_TILE_HOST_DEVICE fp8_raw_t | float_to_fp8_raw (float x, constant< rounding >) | 
|  | 
| template<fp8_rounding_mode rounding> | 
| CK_TILE_HOST_DEVICE bf8_raw_t | float_to_bf8_raw (float x, constant< rounding >) | 
|  | 
| template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)> | 
| CK_TILE_HOST_DEVICE fp8_t | float_to_fp8 (float x, constant< rounding >={}) | 
|  | 
| template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)> | 
| CK_TILE_HOST_DEVICE bf8_t | float_to_bf8 (float x, constant< rounding >={}) | 
|  | 
| CK_TILE_HOST_DEVICE float | fp8_to_float (fp8_t x) | 
|  | 
| CK_TILE_HOST_DEVICE float | bf8_to_float (bf8_t x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST_DEVICE T | abs (const T &x) | 
|  | 
| CK_TILE_HOST_DEVICE bool | isnan (const fp8_t &x) | 
|  | 
| CK_TILE_HOST_DEVICE bool | isnan (const bf8_t &x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE float | fp16_to_float_hip (const fp16_hip_t &x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE double | fp16_to_double_hip (const fp16_hip_t &x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE fp16_hip_t | float_to_fp16_hip (const float &x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE fp16_hip_t | double_to_fp16_hip (const double &x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE float | fp16_to_float (const half_t &x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE float | fp16_to_double (const half_t &x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE half_t | float_to_fp16 (const float &x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE half_t | double_to_fp16 (const double &x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE float | int8_to_float (const int8_t &x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE int8_t | float_to_int8 (const float &x) | 
|  | 
| template<typename Scale > | 
| __host__ __device__ | scales (Scale) -> scales< Scale > | 
|  | FIXME: create macro to replace 'host device' and nothing more.  More... 
 | 
|  | 
| __host__ __device__ | plus () -> plus< void, void > | 
|  | FIXME: create macro to replace 'host device' and nothing more.  More... 
 | 
|  | 
| __host__ __device__ | minus () -> minus< void, void > | 
|  | FIXME: create macro to replace 'host device' and nothing more.  More... 
 | 
|  | 
| __host__ __device__ | multiplies () -> multiplies< void, void > | 
|  | FIXME: create macro to replace 'host device' and nothing more.  More... 
 | 
|  | 
| template<typename X , typename Y > | 
| constexpr CK_TILE_HOST_DEVICE auto | integer_divide_floor (X x, Y y) | 
|  | 
| template<typename X , typename Y > | 
| constexpr CK_TILE_HOST_DEVICE auto | integer_divide_ceil (X x, Y y) | 
|  | 
| template<typename X , typename Y > | 
| constexpr CK_TILE_HOST_DEVICE auto | integer_least_multiple (X x, Y y) | 
|  | 
| template<typename T > | 
| constexpr CK_TILE_HOST_DEVICE T | max (T x) | 
|  | 
| template<typename T > | 
| constexpr CK_TILE_HOST T | max (T x, T y) | 
|  | 
| template<typename T > | 
| constexpr CK_TILE_DEVICE T | max (T x, T y) | 
|  | 
| template<> | 
| constexpr CK_TILE_DEVICE float | max (float x, float y) | 
|  | 
| template<> | 
| constexpr CK_TILE_DEVICE double | max (double x, double y) | 
|  | 
| template<index_t X> | 
| constexpr CK_TILE_HOST_DEVICE index_t | max (number< X >, index_t y) | 
|  | 
| template<index_t Y> | 
| constexpr CK_TILE_HOST_DEVICE index_t | max (index_t x, number< Y >) | 
|  | 
| template<typename X , typename... Ys> | 
| constexpr CK_TILE_HOST_DEVICE auto | max (X x, Ys... ys) | 
|  | 
| template<typename T > | 
| constexpr CK_TILE_HOST_DEVICE T | min (T x) | 
|  | 
| template<typename T > | 
| constexpr CK_TILE_HOST T | min (T x, T y) | 
|  | 
| template<typename T > | 
| constexpr CK_TILE_DEVICE T | min (T x, T y) | 
|  | 
| template<> | 
| constexpr CK_TILE_DEVICE float | min (float x, float y) | 
|  | 
| template<> | 
| constexpr CK_TILE_DEVICE double | min (double x, double y) | 
|  | 
| template<index_t X> | 
| constexpr CK_TILE_HOST_DEVICE index_t | min (number< X >, index_t y) | 
|  | 
| template<index_t Y> | 
| constexpr CK_TILE_HOST_DEVICE index_t | min (index_t x, number< Y >) | 
|  | 
| template<typename X , typename... Ys> | 
| constexpr CK_TILE_HOST_DEVICE auto | min (X x, Ys... ys) | 
|  | 
| template<typename T > | 
| constexpr CK_TILE_HOST_DEVICE T | clamp (const T &x, const T &lowerbound, const T &upperbound) | 
|  | 
| CK_TILE_HOST int | clz (uint32_t x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE index_t | gcd (index_t x, index_t y) | 
|  | 
| template<index_t X, index_t Y> | 
| constexpr CK_TILE_HOST_DEVICE auto | gcd (number< X >, number< Y >) | 
|  | 
| Y constexpr CK_TILE_HOST_DEVICE auto | lcm (X x, Y y) | 
|  | 
| __host__ __device__ | equal () -> equal< void, void > | 
|  | FIXME: create macro to replace 'host device' and nothing more.  More... 
 | 
|  | 
| __host__ __device__ | less () -> less< void, void > | 
|  | FIXME: create macro to replace 'host device' and nothing more.  More... 
 | 
|  | 
| __host__ __device__ | less_equal () -> less_equal< void, void > | 
|  | FIXME: create macro to replace 'host device' and nothing more.  More... 
 | 
|  | 
| constexpr CK_TILE_HOST_DEVICE int32_t | next_power_of_two (int32_t x) | 
|  | 
| template<index_t X> | 
| constexpr CK_TILE_HOST_DEVICE auto | next_power_of_two () | 
|  | 
| template<index_t X> | 
| constexpr CK_TILE_HOST_DEVICE auto | next_power_of_two (number< X >) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE int32_t | integer_log2_floor (int32_t x) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE bool | is_power_of_two_integer (int32_t x) | 
|  | 
| CK_TILE_DEVICE float | exp2 (float x) | 
|  | 
| CK_TILE_DEVICE uint16_t | sad_u16 (uint16_t x, uint16_t y, uint16_t acc) | 
|  | 
| CK_TILE_DEVICE uint32_t | sad_u32 (uint32_t x, uint32_t y, uint32_t acc) | 
|  | 
| CK_TILE_HOST float | abs (float x) | 
|  | 
| CK_TILE_HOST double | abs (double x) | 
|  | 
| CK_TILE_HOST int8_t | abs (int8_t x) | 
|  | 
| CK_TILE_HOST int32_t | abs (int32_t x) | 
|  | 
| CK_TILE_HOST fp16_t | abs (fp16_t x) | 
|  | 
| CK_TILE_HOST bool | isnan (float x) | 
|  | 
| CK_TILE_HOST bool | isnan (double x) | 
|  | 
| CK_TILE_HOST bool | isnan (int8_t x) | 
|  | 
| CK_TILE_HOST bool | isnan (int32_t x) | 
|  | 
| CK_TILE_HOST bool | isnan (fp16_t x) | 
|  | 
| CK_TILE_HOST fp16_t | sqrt (fp16_t x) | 
|  | 
| CK_TILE_HOST float | sqrt (float x) | 
|  | 
| CK_TILE_HOST double | sqrt (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | tanh (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | tanh< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | tanh< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | acos (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | acos< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | acos< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | neg (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | neg< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | neg< double > (double x) | 
|  | 
| template<> | 
| CK_TILE_HOST int32_t | neg< int32_t > (int32_t x) | 
|  | 
| template<> | 
| CK_TILE_HOST int8_t | neg< int8_t > (int8_t x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | atan (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | atan< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | atan< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | sin (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | sin< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | sin< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | asin (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | asin< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | asin< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | asinh (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | asinh< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | asinh< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | cos (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | cos< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | cos< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | acosh (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | acosh< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | acosh< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | tan (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | tan< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | tan< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | atanh (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | atanh< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | atanh< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | sinh (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | sinh< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | sinh< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | ceil (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | ceil< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | ceil< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | cosh (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | cosh< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | cosh< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | floor (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | floor< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | floor< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | rcp (T x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | exp (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | exp< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | exp< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | log (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | log< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | log< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | pow (T x, T gamma) | 
|  | 
| template<> | 
| CK_TILE_HOST float | pow< float > (float x, float gamma) | 
|  | 
| template<> | 
| CK_TILE_HOST double | pow< double > (double x, double gamma) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST T | expm1 (T x) | 
|  | 
| template<> | 
| CK_TILE_HOST float | expm1< float > (float x) | 
|  | 
| template<> | 
| CK_TILE_HOST double | expm1< double > (double x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | tanh (T x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | acos (T x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | neg (T x) | 
|  | 
| template<> | 
| CK_TILE_DEVICE fp16_t | neg< fp16_t > (fp16_t x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | atan (T x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | sin (T x) | 
|  | 
| template<> | 
| CK_TILE_DEVICE fp16_t | sin< fp16_t > (fp16_t x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | asin (T x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | asinh (T x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | acosh (T x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | tan (T x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | atanh (T x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | sinh (T x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | ceil (T x) | 
|  | 
| template<> | 
| CK_TILE_DEVICE fp16_t | ceil< fp16_t > (fp16_t x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | cosh (T x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | floor (T x) | 
|  | 
| template<> | 
| CK_TILE_DEVICE fp16_t | floor< fp16_t > (fp16_t x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | rcp (T x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | exp (T x) | 
|  | 
| template<> | 
| CK_TILE_DEVICE fp16_t | exp< fp16_t > (fp16_t x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | log (T x) | 
|  | 
| template<> | 
| CK_TILE_DEVICE fp16_t | log< fp16_t > (fp16_t x) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | pow (T x, T gamma) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE T | expm1 (T x) | 
|  | 
| CK_TILE_HOST_DEVICE fp32x2_t | pk_int4_t_to_fp32x2_t (const pk_int4_t &x) | 
|  | 
| CK_TILE_HOST_DEVICE fp16x2_t | pk_int4_t_to_halfx2_t (const pk_int4_t &x) | 
|  | 
| CK_TILE_HOST_DEVICE bf16x2_t | pk_int4_t_to_bfloat16x2_t (const pk_int4_t &x) | 
|  | 
| template<typename Y , typename X , std::enable_if_t<!(std::is_const_v< Y >||std::is_const_v< X >), bool >  = false> | 
| constexpr CK_TILE_HOST_DEVICE Y | type_convert (X x) | 
|  | 
| CK_TILE_HOST fp16x2_t | pk_add_f16 (const fp16x2_t &x, const fp16x2_t &y) | 
|  | 
| template<address_space_enum BufferAddressSpace, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename T , typename BufferSizeType > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_buffer_view (T *p, BufferSizeType buffer_size) | 
|  | 
| template<address_space_enum BufferAddressSpace, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename T , typename BufferSizeType , typename X , typename std::enable_if< std::is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type  = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_buffer_view (T *p, BufferSizeType buffer_size, X invalid_element_value) | 
|  | 
| template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, index_t i_access = -1, bool oob_conditional_check = true> | 
| CK_TILE_DEVICE auto | load_tile (const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}) | 
|  | 
| template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , index_t i_access = -1, bool oob_conditional_check = true> | 
| CK_TILE_DEVICE auto | load_tile (const tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}) | 
|  | 
| template<typename DistributedTensor_ , typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, index_t i_access = -1, bool oob_conditional_check = true> | 
| CK_TILE_DEVICE auto | load_tile (DistributedTensor_ &dst_tile, const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}) | 
|  | 
| template<typename DistributedTensor_ , typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , index_t i_access = -1, bool oob_conditional_check = true> | 
| CK_TILE_DEVICE auto | load_tile (DistributedTensor_ &dst_tile, const tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}) | 
|  | 
| template<typename T , typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false> | 
| CK_TILE_DEVICE auto | load_tile_raw (T &tile, const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) | 
|  | Loads a tile of data using inline assembly.  More... 
 | 
|  | 
| template<typename T , typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false> | 
| CK_TILE_DEVICE auto | load_tile_raw (T &tile, const tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) | 
|  | 
| template<typename LdsTileWindow_ , typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false> | 
| CK_TILE_DEVICE auto | async_load_tile_raw (LdsTileWindow_ &&lds_tile, const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) | 
|  | 
| template<typename LdsTileWindow_ , typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false> | 
| CK_TILE_DEVICE auto | async_load_tile_raw (LdsTileWindow_ &&lds_tile, const tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) | 
|  | 
| CK_TILE_DEVICE auto | async_load_fence (index_t cnt=0) | 
|  | 
| template<typename WindowLengths > | 
| CK_TILE_DEVICE auto | load_tile (const null_tile_window< WindowLengths > &) | 
|  | 
| template<typename T , typename WindowLengths > | 
| CK_TILE_DEVICE auto | load_tile_raw (T &, const null_tile_window< WindowLengths > &) | 
|  | 
| template<typename T > | 
| constexpr CK_TILE_DEVICE auto | is_null_tile_window (const T &) | 
|  | 
| template<typename WindowLengths > | 
| constexpr CK_TILE_DEVICE auto | make_null_tile_window (const WindowLengths &window_lengths) | 
|  | 
| template<typename WindowLengths , typename... Ts> | 
| constexpr CK_TILE_DEVICE auto | make_tile_window (null_tensor_view, const WindowLengths &window_lengths, const multi_index< WindowLengths::size()> &, Ts &&...) | 
|  | 
| template<typename WindowLengths , typename StaticTileDistribution > | 
| constexpr CK_TILE_DEVICE auto | make_tile_window (const null_tile_window< WindowLengths > &t, const StaticTileDistribution &) | 
|  | 
| template<typename WindowLengths > | 
| CK_TILE_DEVICE void | move_tile_window (null_tile_window< WindowLengths > &, const typename null_tile_window< WindowLengths >::BottomTensorIndex &) | 
|  | 
| template<typename OutTensor , typename InTensor > | 
| CK_TILE_DEVICE void | shuffle_tile (OutTensor &out, const InTensor &in) | 
|  | 
| template<typename BottomTensorView_ , typename WindowLengths_ , index_t... SliceBegins, index_t... SliceEnds> | 
| constexpr CK_TILE_DEVICE auto | get_slice_tile (const tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile, sequence< SliceBegins... > slice_begins, sequence< SliceEnds... > slice_ends) | 
|  | 
| template<typename DataType_ , typename StaticTileDistribution_ , index_t... SliceBegins, index_t... SliceEnds> | 
| constexpr CK_TILE_DEVICE auto | get_slice_tile (const static_distributed_tensor< DataType_, StaticTileDistribution_ > &tile, sequence< SliceBegins... > slice_begins, sequence< SliceEnds... > slice_ends) | 
|  | 
| template<typename DstDataType_ , typename DstStaticTileDistribution_ , typename SrcDataType_ , typename SrcStaticTileDistribution_ , index_t... SliceBegins, index_t... SliceEnds> | 
| constexpr CK_TILE_DEVICE auto | set_slice_tile (static_distributed_tensor< DstDataType_, DstStaticTileDistribution_ > &dst_tile, const static_distributed_tensor< SrcDataType_, SrcStaticTileDistribution_ > &src_tile, sequence< SliceBegins... > slice_begins, sequence< SliceEnds... > slice_ends) | 
|  | 
| template<typename DataType , typename StaticTileDistribution > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_static_distributed_tensor (const StaticTileDistribution &) | 
|  | 
| template<typename DataType , typename StaticTileDistribution , typename ThreadBuffer > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_static_distributed_tensor (const StaticTileDistribution &, ThreadBuffer &&thread_buffer_) | 
|  | 
| template<typename StaticTileDistribution , typename DistributedIndices > | 
| constexpr CK_TILE_HOST_DEVICE auto | get_x_indices_from_distributed_indices (StaticTileDistribution tile_distribution, DistributedIndices distributed_indices) | 
|  | 
| template<typename DataType , typename StaticTileDistribution , typename XIndicesPredicate > | 
| CK_TILE_HOST_DEVICE void | set_tile_if (static_distributed_tensor< DataType, StaticTileDistribution > &out_tensor, DataType value, XIndicesPredicate predicate) | 
|  | 
| template<typename YLengths , index_t XUnpacks> | 
| constexpr CK_TILE_HOST_DEVICE auto | get_y_unpacks_from_x_unpacks (YLengths, number< XUnpacks >) | 
|  | 
| template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename DataType_ > | 
| CK_TILE_DEVICE void | store_tile (tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) | 
|  | 
| template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename DataType_ > | 
| CK_TILE_DEVICE void | store_tile_raw (tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) | 
|  | 
| template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ > | 
| CK_TILE_DEVICE void | store_tile (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) | 
|  | 
| template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ > | 
| CK_TILE_DEVICE void | store_tile_raw (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) | 
|  | 
| template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , typename DataType_ > | 
| CK_TILE_DEVICE void | store_tile (tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) | 
|  | 
| template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , typename DataType_ > | 
| CK_TILE_DEVICE void | store_tile_raw (tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) | 
|  | 
| template<typename TileDistributedSpan_ , typename F > | 
| CK_TILE_DEVICE void | sweep_tile_span (TileDistributedSpan_, const F &f) | 
|  | 
| template<typename TileDistributedSpan_ , typename F , typename Unpacks  = typename uniform_sequence_gen<TileDistributedSpan_::Impl::size(), 1>::type> | 
| CK_TILE_DEVICE void | sweep_tile_uspan (TileDistributedSpan_, const F &f, Unpacks={}) | 
|  | 
| template<typename DistributedTensor , typename F , typename UnpacksPerXDim  = typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type> | 
| constexpr CK_TILE_HOST_DEVICE void | sweep_tile (const F &f, UnpacksPerXDim={}) | 
|  | 
| template<typename DistributedTensor , typename F , typename UnpacksPerXDim  = typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type> | 
| constexpr CK_TILE_HOST_DEVICE void | sweep_tile (const DistributedTensor &, const F &f, UnpacksPerXDim={}) | 
|  | 
| template<typename T , typename F , typename U  = typename uniform_sequence_gen<T::get_num_of_dimension(), 1>::type> | 
| CK_TILE_HOST_DEVICE_EXTERN | tile_sweeper (const T &, const F &, U={}) -> tile_sweeper< T, F, U > | 
|  | 
| template<typename Transforms , typename LowerDimensionOldTopIdss , typename UpperDimensionNewTopIdss > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_single_stage_tensor_adaptor (const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss) | 
|  | 
| template<typename OldTensorAdaptor , typename NewTransforms , typename NewLowerDimensionOldTopIdss , typename NewUpperDimensionNewTopIdss > | 
| constexpr CK_TILE_HOST_DEVICE auto | transform_tensor_adaptor (const OldTensorAdaptor &old_tensor_adaptor, const NewTransforms &new_transforms, NewLowerDimensionOldTopIdss, NewUpperDimensionNewTopIdss) | 
|  | 
| template<typename TensorAdaptor0 , typename TensorAdaptor1 > | 
| constexpr CK_TILE_HOST_DEVICE auto | chain_tensor_adaptors (const TensorAdaptor0 &adaptor0, const TensorAdaptor1 &adaptor1) | 
|  | 
| template<typename Adaptor , typename TopIndex > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_tensor_adaptor_coordinate (const Adaptor &adaptor, const TopIndex &idx_top) | 
|  | 
| template<bool JudgeDoTransforms = true, typename Adaptor , typename AdaptorCoord , typename TopIndex , typename BottomIndex > | 
| constexpr CK_TILE_HOST_DEVICE void | move_tensor_adaptor_coordinate (const Adaptor &adaptor, AdaptorCoord &coord, const TopIndex &idx_diff_top, BottomIndex &idx_diff_bottom) | 
|  | 
| template<bool JudgeDoTransforms = true, typename Adaptor , typename AdaptorCoord , typename TopIndex > | 
| constexpr CK_TILE_HOST_DEVICE void | move_tensor_adaptor_coordinate (const Adaptor &adaptor, AdaptorCoord &coord, const TopIndex &idx_diff_top) | 
|  | 
| template<typename Adaptor , typename AdaptorCoord > | 
| constexpr CK_TILE_HOST_DEVICE bool | adaptor_coordinate_is_valid_assuming_top_index_is_valid (const Adaptor &adaptor, const AdaptorCoord &coord) | 
|  | 
| template<typename Adaptor , typename AdpatorCoord > | 
| constexpr CK_TILE_HOST_DEVICE bool | adaptor_coordinate_is_valid (const Adaptor &adaptor, const AdpatorCoord &coord) | 
|  | 
| template<typename TensorDesc , typename TopIndex > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_tensor_coordinate (const TensorDesc &tensor_desc, const TopIndex &idx_top) | 
|  | 
| template<bool JudgeDoTransforms = true, typename TensorDesc , typename TensorCoord , typename Index > | 
| constexpr CK_TILE_HOST_DEVICE void | move_tensor_coordinate (const TensorDesc &tensor_desc, TensorCoord &coord, const Index &coord_step) | 
|  | 
| template<typename TensorDesc , typename TensorCoord > | 
| constexpr CK_TILE_HOST_DEVICE bool | coordinate_has_valid_offset_assuming_top_index_is_valid (const TensorDesc &tensor_desc, const TensorCoord &coord) | 
|  | 
| template<typename TensorDesc , typename TensorCoord > | 
| constexpr CK_TILE_HOST_DEVICE bool | coordinate_has_valid_offset (const TensorDesc &tensor_desc, const TensorCoord &coord) | 
|  | 
| template<typename Adaptor , typename ElementSpaceSize > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_tensor_descriptor_from_adaptor (const Adaptor &adaptor, const ElementSpaceSize &element_space_size) | 
|  | 
| template<typename OldTensorDescriptor , typename NewTransforms , typename NewLowerDimensionOldTopIdss , typename NewUpperDimensionNewTopIdss > | 
| constexpr CK_TILE_HOST_DEVICE auto | transform_tensor_descriptor (const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldTopIdss, NewUpperDimensionNewTopIdss) | 
|  | 
| template<typename... Lengths, typename... Strides, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type  = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_descriptor (const tuple< Lengths... > &lengths, const tuple< Strides... > &strides, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}, number< GuaranteedLastDimensionVectorStride >=number<-1 >{}) | 
|  | 
| template<typename... Lengths, typename... Strides, typename offset , index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type  = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_descriptor_with_offset (const tuple< Lengths... > &lengths, const tuple< Strides... > &strides, const offset &os, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}, number< GuaranteedLastDimensionVectorStride >=number<-1 >{}) | 
|  | 
| template<typename... Lengths, index_t GuaranteedLastDimensionVectorLength = -1> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_descriptor_packed (const tuple< Lengths... > &lengths, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}) | 
|  | 
| template<typename... Lengths, typename... Strides, typename Offset , index_t GuaranteedLastDimensionVectorLength = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type  = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_descriptor_packed_with_offset (const tuple< Lengths... > &lengths, const Offset &offset, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}) | 
|  | 
| template<typename... Lengths, typename Align > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_descriptor_aligned (const tuple< Lengths... > &lengths, Align align) | 
|  | 
| template<address_space_enum BufferAddressSpace = address_space_enum::generic, typename DataType , typename... Ts> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_tensor_view (DataType *p, const tensor_descriptor< Ts... > &desc) | 
|  | 
| template<address_space_enum BufferAddressSpace = address_space_enum::generic, memory_operation_enum DstInMemOp = memory_operation_enum::set, typename DataType , typename... Lengths, typename... Strides, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type  = false> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_view (DataType *p, const tuple< Lengths... > &lengths, const tuple< Strides... > &strides, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}, number< GuaranteedLastDimensionVectorStride >=number<-1 >{}) | 
|  | 
| template<address_space_enum BufferAddressSpace = address_space_enum::generic, typename DataType , typename... Lengths, index_t GuaranteedLastDimensionVectorLength = -1> | 
| constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_view_packed (DataType *p, const tuple< Lengths... > &lengths, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}) | 
|  | 
| template<typename OldTensorView , typename NewTransforms , typename NewLowerDimensionOldVisibleIdss , typename NewUpperDimensionNewVisibleIdss > | 
| constexpr CK_TILE_HOST_DEVICE auto | transform_tensor_view (const OldTensorView &old_tensor_view, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss) | 
|  | 
| template<typename TensorView , typename TileLengths , typename DoPads > | 
| constexpr CK_TILE_HOST_DEVICE auto | pad_tensor_view (const TensorView &tensor_view, const TileLengths &tile_lengths, DoPads) | 
|  | 
| template<typename StaticTileDistributionEncoding_ > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_static_tile_distribution (StaticTileDistributionEncoding_) | 
|  | 
| template<typename InOutElementFunc , typename... InOutDstrTensors, typename  = std::enable_if_t<std::conjunction_v<              std::negation<std::is_same<std::remove_const_t<InOutDstrTensors>, null_tensor>>...>>> | 
| CK_TILE_DEVICE void | tile_elementwise_inout (const InOutElementFunc &inout_element_func, InOutDstrTensors &... inout_dstr_tensors) | 
|  | 
| template<typename InElementFunc , typename... InTensor, typename  = std::enable_if_t<              std::conjunction_v<std::negation<std::is_same<InTensor, null_tensor>>...>>> | 
| CK_TILE_DEVICE auto | tile_elementwise_in (const InElementFunc &in_element_func, const InTensor &... in_dstr_tensors) | 
|  | 
| template<typename DstrTensors , typename T > | 
| CK_TILE_DEVICE void | set_tile (DstrTensors &dstr_tensor, const T &value) | 
|  | 
| template<typename T > | 
| CK_TILE_DEVICE void | set_tile (null_tensor &, const T &) | 
|  | 
| template<typename DstrTensors , index_t v, bool skip_subdword_opt = false> | 
| CK_TILE_DEVICE void | set_tile (DstrTensors &dstr_tensor, number< v >, bool_constant< skip_subdword_opt >={}) | 
|  | 
| template<index_t v> | 
| CK_TILE_DEVICE void | set_tile (null_tensor &, number< v >) | 
|  | 
| template<typename DstrTensors > | 
| CK_TILE_DEVICE void | clear_tile (DstrTensors &dstr_tensor) | 
|  | 
| template<typename DstType , typename SrcTensor > | 
| CK_TILE_DEVICE auto | cast_tile (const SrcTensor &src_tensor) | 
|  | 
| template<typename InOutElementFunc , typename... MaybeNullTensor, typename  = std::enable_if_t<              std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>> | 
| CK_TILE_DEVICE void | tile_elementwise_inout (const InOutElementFunc &, MaybeNullTensor &&...) | 
|  | 
| template<typename InElementFunc , typename... MaybeNullTensor, typename  = std::enable_if_t<              std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>> | 
| CK_TILE_DEVICE auto | tile_elementwise_in (const InElementFunc &, MaybeNullTensor &&...) | 
|  | 
| template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , index_t NumCoord = 1> | 
| constexpr CK_TILE_DEVICE auto | make_tile_window (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, number< NumCoord >={}) | 
|  | 
| template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , index_t NumCoord = 1> | 
| CK_TILE_DEVICE auto | make_tile_window_raw (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, number< NumCoord >={}) | 
|  | 
| template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , index_t NumCoord> | 
| CK_TILE_DEVICE void | move_tile_window (tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord > &window, const typename tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord >::BottomTensorIndex &step) | 
|  | 
| template<typename TensorView_ , typename WindowLengths_ > | 
| constexpr CK_TILE_DEVICE auto | make_tile_window (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin) | 
|  | 
| template<typename TensorView , typename WindowLengths > | 
| constexpr CK_TILE_DEVICE auto | make_tile_window (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin) | 
|  | 
| template<typename TensorView , typename WindowLengths , typename StaticTileDistribution > | 
| constexpr CK_TILE_DEVICE auto | make_tile_window (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin, const StaticTileDistribution &tile_distribution) | 
|  | 
| template<typename TensorView , typename WindowLengths , typename StaticTileDistribution > | 
| constexpr CK_TILE_DEVICE auto | make_tile_window (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution) | 
|  | 
| template<typename TensorView , typename WindowLengths , typename StaticTileDistribution > | 
| constexpr CK_TILE_DEVICE auto | make_tile_window_raw (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution) | 
|  | 
| template<typename TensorView_ , typename WindowLengths_ > | 
| CK_TILE_DEVICE void | move_tile_window (tile_window_with_static_lengths< TensorView_, WindowLengths_ > &window, const typename tile_window_with_static_lengths< TensorView_, WindowLengths_ >::BottomTensorIndex &step) | 
|  | 
| template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename LinearBottomDims_  = default_linear_bottom_dims<TensorView_>> | 
| constexpr CK_TILE_DEVICE auto | make_tile_window_linear (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={}) | 
|  | 
| template<typename TileWindow_ , typename StaticTileDistribution_ , typename LinearBottomDims_  = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>> | 
| constexpr CK_TILE_DEVICE auto | make_tile_window_linear (const TileWindow_ &tile_window, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={}) | 
|  | 
| template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename LinearBottomDims_  = default_linear_bottom_dims<TensorView_>> | 
| CK_TILE_DEVICE auto | make_tile_window_linear_raw (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={}) | 
|  | 
| template<typename TileWindow_ , typename StaticTileDistribution_ , typename LinearBottomDims_  = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>> | 
| constexpr CK_TILE_DEVICE auto | make_tile_window_linear_raw (const TileWindow_ &tile_window, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={}) | 
|  | 
| template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename LinearBottomDims_ > | 
| CK_TILE_DEVICE void | move_tile_window (tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ > &window, const typename tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ >::BottomTensorIndex &step) | 
|  | 
| template<typename LdsTileWindow_ > | 
| CK_TILE_DEVICE auto | get_async_store_smem_info (LdsTileWindow_ &&lds_tile) | 
|  | 
| template<typename OutTensor , typename InTensor > | 
| CK_TILE_DEVICE void | transpose_tile2d (OutTensor &out, const InTensor &in) | 
|  | 
| template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename DataType_ > | 
| CK_TILE_DEVICE void | update_tile (tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) | 
|  | 
| template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ , index_t i_access = -1, bool oob_conditional_check = true> | 
| CK_TILE_DEVICE void | update_tile (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}) | 
|  | 
| template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false> | 
| CK_TILE_DEVICE void | update_tile_raw (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) | 
|  | 
| template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , typename DataType_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false> | 
| CK_TILE_DEVICE auto | update_tile_raw (tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) | 
|  | 
| template<typename Y , typename X > | 
| constexpr CK_TILE_HOST_DEVICE Y | bit_cast (const X &x) | 
|  | 
| template<typename F , typename X > | 
| constexpr CK_TILE_HOST_DEVICE auto | unpack (F &&f, X &&x) | 
|  | 
| template<typename F , typename X , typename Y > | 
| constexpr CK_TILE_HOST_DEVICE auto | unpack2 (F &&f, X &&x, Y &&y) | 
|  | 
| template<bool predicate, typename X , typename Y > | 
| constexpr auto | conditional_expr (X &&x, Y &&y) | 
|  | 
| template<typename PY , typename PX , typename std::enable_if< std::is_pointer_v< PY > &&std::is_pointer_v< PX >, bool >::type  = false> | 
| CK_TILE_HOST_DEVICE PY | c_style_pointer_cast (PX p_x) | 
|  | 
| template<typename... Ts> | 
| __host__ __device__ | composes (Ts &&...) -> composes< remove_cvref_t< Ts >... > | 
|  | FIXME: create macro to replace 'host device' and nothing more.  More... 
 | 
|  | 
| template<typename ComputeDataType , typename OutDataType , typename AccDataType  = ComputeDataType> | 
| double | get_relative_threshold (const int number_of_accumulations=1) | 
|  | 
| template<typename ComputeDataType , typename OutDataType , typename AccDataType  = ComputeDataType> | 
| double | get_absolute_threshold (const double max_possible_num, const int number_of_accumulations=1) | 
|  | 
| template<typename T > | 
| std::ostream & | operator<< (std::ostream &os, const std::vector< T > &v) | 
|  | 
| template<typename Range , typename RefRange > | 
| std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_floating_point_v< ranges::range_value_t< Range > > &&!std::is_same_v< ranges::range_value_t< Range >, half_t >, bool >::type CK_TILE_HOST | check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-5, double atol=3e-6, bool allow_infinity_ref=false) | 
|  | 
| template<typename Range , typename RefRange > | 
| std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, bf16_t >, bool >::type CK_TILE_HOST | check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-3, double atol=1e-3, bool allow_infinity_ref=false) | 
|  | 
| template<typename Range , typename RefRange > | 
| std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, half_t >, bool >::type CK_TILE_HOST | check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-3, double atol=1e-3, bool allow_infinity_ref=false) | 
|  | 
| template<typename Range , typename RefRange > | 
| std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange >> &&std::is_integral_v< ranges::range_value_t< Range >> &&!std::is_same_v< ranges::range_value_t< Range >, bf16_t >), bool > CK_TILE_HOST | check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double=0, double atol=0) | 
|  | 
| template<typename Range , typename RefRange > | 
| std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange >> &&std::is_same_v< ranges::range_value_t< Range >, fp8_t >), bool > CK_TILE_HOST | check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", unsigned max_rounding_point_distance=1, double atol=1e-1, bool allow_infinity_ref=false) | 
|  | 
| template<typename Range , typename RefRange > | 
| std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange >> &&std::is_same_v< ranges::range_value_t< Range >, bf8_t >), bool > CK_TILE_HOST | check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-3, double atol=1e-3, bool allow_infinity_ref=false) | 
|  | 
| template<typename T > | 
| __global__ void | set_buffer_value (T *p, T x, uint64_t buffer_element_size) | 
|  | 
| CK_TILE_HOST void | hip_check_error (hipError_t x) | 
|  | 
| template<typename Range > | 
| CK_TILE_HOST std::ostream & | LogRange (std::ostream &os, Range &&range, std::string delim, int precision=std::cout.precision(), int width=0) | 
|  | 
| template<typename T , typename Range > | 
| CK_TILE_HOST std::ostream & | LogRangeAsType (std::ostream &os, Range &&range, std::string delim, int precision=std::cout.precision(), int width=0) | 
|  | 
| template<typename F , typename T , std::size_t... Is> | 
| CK_TILE_HOST auto | call_f_unpack_args_impl (F f, T args, std::index_sequence< Is... >) | 
|  | 
| template<typename F , typename T > | 
| CK_TILE_HOST auto | call_f_unpack_args (F f, T args) | 
|  | 
| template<typename F , typename T , std::size_t... Is> | 
| CK_TILE_HOST auto | construct_f_unpack_args_impl (T args, std::index_sequence< Is... >) | 
|  | 
| template<typename F , typename T > | 
| CK_TILE_HOST auto | construct_f_unpack_args (F, T args) | 
|  | 
| template<typename New2Old > | 
| CK_TILE_HOST HostTensorDescriptor | transpose_host_tensor_descriptor_given_new2old (const HostTensorDescriptor &a, const New2Old &new2old) | 
|  | 
| template<typename F , typename... Xs> | 
| CK_TILE_HOST auto | make_ParallelTensorFunctor (F f, Xs... xs) | 
|  | 
| template<bool is_row_major> | 
| auto | host_tensor_descriptor (std::size_t row, std::size_t col, std::size_t stride, bool_constant< is_row_major >) | 
|  | 
| template<bool is_row_major> | 
| auto | get_default_stride (std::size_t row, std::size_t col, std::size_t stride, bool_constant< is_row_major >) | 
|  | 
| template<int MaxThreadPerBlock, int MinBlockPerCu, typename Kernel , typename... Args> | 
| __global__ void | kentry (Args... args) | 
|  | 
| template<int MaxThreadPerBlock = CK_TILE_MAX_THREAD_PER_BLOCK, int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU, typename KernelImpl , typename... Args> | 
| CK_TILE_HOST auto | make_kernel (KernelImpl, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args) | 
|  | 
| template<typename... Callables> | 
| CK_TILE_HOST float | launch_kernel (const stream_config &s, Callables... callables) | 
|  | 
| template<typename DataType , typename RandValOutputDataType > | 
| CK_TILE_HOST void | reference_batched_dropout (HostTensor< DataType > &in_out_b_m_n, const HostTensor< RandValOutputDataType > &randval_b_m_n, const uint8_t &p_undrop_in_uint8_t, const float scale) | 
|  | 
| template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename AElementOp  = ck_tile::identity, typename BElementOp  = ck_tile::identity, typename BinaryElementOp  = ck_tile::plus<AccDataType>> | 
| CK_TILE_HOST void | reference_batched_elementwise (const HostTensor< ADataType > &a_b_m_n, const HostTensor< BDataType > &b_b_m_n, HostTensor< CDataType > &c_b_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const BinaryElementOp &binary_element_op={}) | 
|  | 
| template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename AElementOp  = ck_tile::identity, typename BElementOp  = ck_tile::identity, typename ACCElementOp  = ck_tile::identity> | 
| CK_TILE_HOST void | reference_batched_gemm (const HostTensor< ADataType > &a_b_m_k, const HostTensor< BDataType > &b_b_n_k, HostTensor< CDataType > &c_b_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const ACCElementOp &acc_element_op={}) | 
|  | 
| template<typename CDataType , typename MaskingType > | 
| CK_TILE_HOST void | reference_batched_masking (HostTensor< CDataType > &c_b_m_n, const MaskingType &mask) | 
|  | 
| template<typename DataType , typename ComputeDataType  = float> | 
| CK_TILE_HOST void | reference_batched_rotary_position_embedding (const HostTensor< DataType > &input_bsd, const HostTensor< DataType > &cos_sd, const HostTensor< DataType > &sin_sd, bool interleaved, HostTensor< DataType > &output_bsd, bool use_1_row_sin_cos=false) | 
|  | 
| template<typename ADataType , typename CompDataType , typename BDataType , typename CompElementOp  = ck_tile::identity> | 
| CK_TILE_HOST void | reference_batched_softmax (const HostTensor< ADataType > &a_b_m_n, HostTensor< BDataType > &b_b_m_n, const CompElementOp &comp_element_op={}, std::optional< std::reference_wrapper< HostTensor< CompDataType >>> lse_b_m=std::nullopt) | 
|  | 
| template<typename Type > | 
| CK_TILE_HOST void | reference_batched_transpose (const HostTensor< Type > &x, HostTensor< Type > &y, std::string layout_in="NCHW", std::string layout_out="NHWC") | 
|  | 
| template<typename ADataType , typename BDataType , typename ComputeDataType , typename ElementOp > | 
| CK_TILE_HOST void | reference_unary_elementwise (const HostTensor< ADataType > &a, HostTensor< BDataType > &b, ElementOp element_op) | 
|  | 
| template<typename ADataType , typename BDataType , typename CDataType , typename ComputeDataType , typename ElementOp > | 
| CK_TILE_HOST void | reference_binary_elementwise (const HostTensor< ADataType > &a, const HostTensor< BDataType > &b, HostTensor< CDataType > &c, ElementOp element_op) | 
|  | 
| template<typename AccDataType , typename Activation , typename ADataType , typename GDataType , typename DDataType , typename ODataType , typename AScaleDataType , typename GScaleDataType , typename DScaleDataType , typename YSmoothScaleDataType , typename TopkWeightDataType , typename IndexDataType > | 
| void | reference_fused_moe (const ck_tile::HostTensor< ADataType > &a_host, const ck_tile::HostTensor< GDataType > &g_host, const ck_tile::HostTensor< DDataType > &d_host, const ck_tile::HostTensor< AScaleDataType > &sa_host, const ck_tile::HostTensor< GScaleDataType > &sg_host, const ck_tile::HostTensor< DScaleDataType > &sd_host, const ck_tile::HostTensor< YSmoothScaleDataType > &sy_host, ck_tile::HostTensor< ODataType > &o_host, const ck_tile::HostTensor< IndexDataType > &sorted_token_ids_host, const ck_tile::HostTensor< TopkWeightDataType > &sorted_weight_host, const ck_tile::HostTensor< IndexDataType > &sorted_expert_ids_host, const ck_tile::HostTensor< IndexDataType > &num_sorted_tiles_host, const ck_tile::HostTensor< IndexDataType > &token_ids_host, ck_tile::index_t block_m, ck_tile::index_t tokens, ck_tile::index_t experts, ck_tile::index_t hidden_size, ck_tile::index_t intermediate_size, ck_tile::index_t topk, ck_tile::index_t gate_only) | 
|  | 
| template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename AElementOp  = ck_tile::identity, typename BElementOp  = ck_tile::identity, typename ACCElementOp  = ck_tile::identity> | 
| CK_TILE_HOST void | reference_gemm (const HostTensor< ADataType > &a_m_k, const HostTensor< BDataType > &b_k_n, HostTensor< CDataType > &c_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const ACCElementOp &acc_element_op={}) | 
|  | 
| template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename LayoutA , typename LayoutB , typename LayoutC > | 
| __global__ void | naive_gemm_kernel (ADataType *A, BDataType *B, CDataType *C, ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t K, ck_tile::index_t strideA, ck_tile::index_t strideB, ck_tile::index_t strideC) | 
|  | 
| template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename LayoutA , typename LayoutB , typename LayoutC > | 
| void | reference_gemm_gpu (ADataType *a_ptr, BDataType *b_ptr, CDataType *c_ptr, index_t M, index_t N, index_t K, index_t stride_a, index_t stride_b, index_t stride_c) | 
|  | 
| template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename LayoutA , typename LayoutB , typename LayoutC > | 
| void | reference_batched_gemm_gpu (ADataType *a_ptr, BDataType *b_ptr, CDataType *c_ptr, index_t M, index_t N, index_t K, index_t stride_a, index_t stride_b, index_t stride_c, index_t batch_stride_A, index_t batch_stride_B, index_t batch_stride_C, index_t batch_count) | 
|  | 
| template<typename InDataType , typename OutDataType , index_t NDimSpatial> | 
| CK_TILE_HOST void | reference_im2col (const HostTensor< InDataType > &in_host, HostTensor< OutDataType > &out_host, const ck_tile::conv::ConvParam &conv_params) | 
|  | 
| template<typename XDataType , typename GammaDataType , typename BetaDataType , typename ComputeDataType , typename YDataType , typename MeanDataType , typename InvStdDataType , typename Epilogue  = reference_layernorm2d_default_epilogue> | 
| void | reference_layernorm2d_fwd (const HostTensor< XDataType > &x_m_n, const HostTensor< GammaDataType > &gamma_n, const HostTensor< BetaDataType > &beta_n, HostTensor< YDataType > &y_m_n, HostTensor< MeanDataType > &mean_m, HostTensor< InvStdDataType > &invStd_m, ComputeDataType epsilon, Epilogue epilogue_functor={}) | 
|  | 
| template<typename WeightType , typename IndexType  = index_t> | 
| CK_TILE_HOST void | reference_moe_sorting (const HostTensor< IndexType > &topk_ids, const HostTensor< WeightType > &weights, HostTensor< IndexType > &p_sorted_token_ids, HostTensor< WeightType > &sorted_weight, HostTensor< IndexType > &sorted_expert_ids, index_t &unit_cnt, const index_t experts, const index_t unit_size) | 
|  | 
| template<typename DataType > | 
| CK_TILE_HOST void | reference_permute (const HostTensor< DataType > &x, HostTensor< DataType > &y, std::vector< index_t > perm) | 
|  | 
| template<typename DataType > | 
| CK_TILE_HOST auto | reference_permute (const HostTensor< DataType > &x, std::vector< index_t > perm) | 
|  | 
| template<typename XDataType , typename ComputeDataType , typename YDataType , typename ReduceOp > | 
| CK_TILE_HOST void | reference_reduce (const HostTensor< XDataType > &x_m_n, HostTensor< YDataType > &y_m, ReduceOp reduce_op) | 
|  | 
| template<typename XDataType , typename GammaDataType , typename ComputeDataType , typename YDataType , typename InvRmsDataType , typename Epilogue  = reference_rmsnorm2d_default_epilogue> | 
| void | reference_rmsnorm2d_fwd (const HostTensor< XDataType > &x_m_n, const HostTensor< GammaDataType > &gamma_n, HostTensor< YDataType > &y_m_n, HostTensor< InvRmsDataType > &invRms_m, ComputeDataType epsilon, Epilogue epilogue_functor={}) | 
|  | 
| template<typename XDataType , typename ScaleDataType , typename QXDataType > | 
| CK_TILE_HOST void | reference_rowwise_quantization2d (const HostTensor< XDataType > &x_m_n, const HostTensor< ScaleDataType > &scale_m, HostTensor< QXDataType > &qx_m_n) | 
|  | 
| template<typename InputType , typename ComputeType , typename OutputType  = ComputeType> | 
| CK_TILE_HOST void | reference_softmax (const HostTensor< InputType > &x, HostTensor< OutputType > &y, index_t dim=-1) | 
|  | 
| template<typename InputType , typename ComputeType , typename OutputType  = ComputeType> | 
| CK_TILE_HOST auto | reference_softmax (const HostTensor< InputType > &x, index_t dim=-1) | 
|  | 
| template<typename DataType , typename IndexType  = index_t> | 
| CK_TILE_HOST void | reference_topk (const HostTensor< DataType > &x, HostTensor< DataType > &y_values, HostTensor< IndexType > &y_indices, index_t k, index_t dim=-1, bool largest=true, bool sorted=true) | 
|  | 
| template<typename DataType , typename IndexType  = index_t> | 
| CK_TILE_HOST auto | reference_topk (const HostTensor< DataType > &x, index_t k, index_t dim=-1, bool largest=true, bool sorted=true) | 
|  | 
| constexpr CK_TILE_HOST_DEVICE auto | make_generic_attention_mask_coordinates_from_lr_window (index_t left_size, index_t right_size, index_t y_total, index_t x_total, bool is_top_left=true) | 
|  | 
| template<typename MaskType > | 
| constexpr CK_TILE_HOST_DEVICE auto | make_generic_attention_mask_from_lr_window (index_t left_size, index_t right_size, index_t y_total, index_t x_total, bool is_top_left=true) | 
|  | 
| template<typename DataType , bool RowMajor = true, unsigned LogMaxSadOprndSize = 16> | 
| CK_TILE_HOST_DEVICE auto | make_alibi_from_lr_mask (DataType slope, index_t window_left_size, index_t window_right_size, index_t y_total, index_t x_total, GenericAttentionMaskEnum mask_enum) | 
|  | 
| template<typename DataType > | 
| CK_TILE_HOST std::vector< DataType > | get_alibi_slopes (ck_tile::index_t nheads) | 
|  | 
| template<typename TensorView > | 
| CK_TILE_HOST_DEVICE auto | make_page_block_navigator (const TensorView &tensor_view) | 
|  | 
| template<typename DataType , index_t VirtualDim, typename TensorView > | 
| CK_TILE_HOST_DEVICE auto | make_page_block_navigator (copy_const_t< DataType, void > *physical_blocks, long_index_t block_stride, long_index_t fixed_offset, const int32_t *physical_block_indices, index_t num_blocks, index_t page_block_size, const TensorView &complete_view, const TensorView &last_view) | 
|  | 
| template<typename BlockShape > | 
| constexpr CK_TILE_DEVICE index_t | block_tile_welford_calculate_max_count (int row_size) | 
|  | 
| template<typename VarDistributedTensor_ , bool FastFdiv_ = false> | 
| constexpr CK_TILE_DEVICE void | block_tile_welford_post_scale_var (VarDistributedTensor_ &var_tensor, int count, bool_constant< FastFdiv_ >={}) | 
|  | 
| template<typename T , bool kFastFDiv = false> | 
| CK_TILE_DEVICE void | welford_update (T &mean, T &var, T x, int count, bool_constant< kFastFDiv >={}) | 
|  | 
| template<typename AccDistributedTensor_ , typename ReduceFunc , bool WithBroadcast = true> | 
| CK_TILE_DEVICE void | block_tile_reduce_sync (AccDistributedTensor_ &acc_tensor, const ReduceFunc &reduce_func, bool_constant< WithBroadcast >={}) | 
|  | 
| template<typename AccDistributedTensor_ , typename ReduceFunc > | 
| CK_TILE_DEVICE void | block_tile_reduce_xor_sync (AccDistributedTensor_ &acc_tensor, const ReduceFunc &reduce_func) | 
|  | 
| template<typename AccDistributedTensor_ , typename InDistributedTensor_ , index_t... InReduceDims, typename ReduceFunc > | 
| CK_TILE_DEVICE void | block_tile_reduce (AccDistributedTensor_ &acc_tensor, const InDistributedTensor_ &in_tensor, sequence< InReduceDims... >, const ReduceFunc &reduce_func) | 
|  | 
| template<typename AccDataType_ , typename InDistributedTensor_ , index_t... InReduceDims, typename ReduceFunc , typename InDataType_ > | 
| CK_TILE_DEVICE auto | block_tile_reduce (const InDistributedTensor_ &in_tensor, sequence< InReduceDims... > in_reduce_dims, const ReduceFunc &reduce_func, const InDataType_ &reduce_init) | 
|  | 
| template<typename T > | 
| CK_TILE_HOST_DEVICE_EXTERN | BlockReduce2D (const T &, const typename T::DataType &) -> BlockReduce2D< T > | 
|  | 
| CK_TILE_HOST float | naive_attention_fwd (naive_attention_fwd_traits t, naive_attention_fwd_args a, ck_tile::stream_config s) | 
|  |