/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp File Reference
device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp File Reference
#include <iostream>
#include <sstream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp"
#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"

Go to the source code of this file.
Namespaces | |
| ck | |
| ck::tensor_operation | |
| ck::tensor_operation::device | |
Functions | |
| template<typename DeviceOp , typename GridwiseOp , typename ADataType , typename B0DataType , typename B1DataType , typename CDataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , bool HasMainKBlockLoop> | |
| __global__ void | ck::tensor_operation::device::kernel_batched_gemm_softmax_gemm_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const B0DataType *__restrict__ p_b0_grid, const B1DataType *__restrict__ p_b1_grid, CDataType *__restrict__ p_c_grid, index_t M, index_t N, index_t K, index_t O, index_t G0, index_t G1, float alpha, bool input_permute, bool output_permute) |
| template<typename DeviceOp , typename GridwiseOp , typename QKVDataType , typename ODataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , bool HasMainKBlockLoop> | |
| __global__ void | ck::tensor_operation::device::kernel_wmma_self_attention_forward (const QKVDataType *__restrict__ p_qkv_grid, ODataType *__restrict__ p_out_grid, index_t batch_size, index_t sequence_length, index_t head_count, index_t head_size, float alpha) |
| template<typename DeviceOp , typename GridwiseOp , typename QDataType , typename KVDataType , typename ODataType , typename AElementwiseOperation , typename B0ElementwiseOperation , typename AccElementwiseOperation , typename B1ElementwiseOperation , typename CElementwiseOperation , bool HasMainKBlockLoop> | |
| __global__ void | ck::tensor_operation::device::kernel_wmma_cross_attention_forward (const QDataType *__restrict__ p_q_grid, const KVDataType *__restrict__ p_kv_grid, ODataType *__restrict__ p_out_grid, index_t batch_size, index_t q_sequence_length, index_t kv_sequence_length, index_t head_count, index_t head_size, float alpha) |