/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp Source File
Go to the documentation of this file.
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
  
 #pragma once
  
 #include <sstream>
  
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_dpp.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
  
 namespace ck {
 namespace tensor_operation {
 namespace device {
  
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
           typename AccDataType,
           typename ALayout,
           typename BLayout,
           typename CLayout,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
           GemmSpecialization GemmSpec,
           ck::index_t BlockSize,
           ck::index_t MPerBlock,
           ck::index_t NPerBlock,
           ck::index_t KPerBlock,
           ck::index_t AK1,
           ck::index_t BK1,
           ck::index_t MPerDpp,
           ck::index_t NPerDpp,
           ck::index_t MDppPerWave,
           ck::index_t NDppPerWave,
           typename ABlockTransferThreadClusterLengths_K0_M_K1,
           typename ABlockTransferThreadClusterArrangeOrder,
           typename ABlockTransferSrcAccessOrder,
           ck::index_t ABlockTransferSrcVectorDim,
           ck::index_t ABlockTransferSrcScalarPerVector,
           ck::index_t ABlockTransferDstScalarPerVector_K1,
           bool ABlockLdsAddExtraM,
           typename BBlockTransferThreadClusterLengths_K0_N_K1,
           typename BBlockTransferThreadClusterArrangeOrder,
           typename BBlockTransferSrcAccessOrder,
           ck::index_t BBlockTransferSrcVectorDim,
           ck::index_t BBlockTransferSrcScalarPerVector,
           ck::index_t BBlockTransferDstScalarPerVector_K1,
           bool BBlockLdsAddExtraN,
           ck::index_t CThreadTransferSrcDstVectorDim,
           ck::index_t CThreadTransferDstScalarPerVector,
           ck::index_t NumPrefetch         = 1,
           ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
 struct DeviceGemmDpp : public DeviceGemm<ALayout,
                                          BLayout,
                                          CLayout,
                                          ADataType,
                                          BDataType,
                                          CDataType,
                                          AElementwiseOperation,
                                          BElementwiseOperation,
                                          CElementwiseOperation>
 {
     using GridwiseGemm = GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp<
         BlockSize,
         ADataType,
         AccDataType,
         CDataType,
         InMemoryDataOperationEnum::Set,
         ALayout,
         BLayout,
         CLayout,
         AElementwiseOperation,
         BElementwiseOperation,
         CElementwiseOperation,
         GemmSpec,
         MPerBlock,
         NPerBlock,
         KPerBlock,
         MPerDpp,
         NPerDpp,
         AK1,
         BK1,
         MDppPerWave,
         NDppPerWave,
         ABlockTransferThreadClusterLengths_K0_M_K1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
         ABlockTransferSrcVectorDim,
         ABlockTransferSrcScalarPerVector,
         ABlockTransferDstScalarPerVector_K1,
         false, // AThreadTransferSrcResetCoordinateAfterRun,
         ABlockLdsAddExtraM,
         BBlockTransferThreadClusterLengths_K0_N_K1,
         BBlockTransferThreadClusterArrangeOrder,
         BBlockTransferSrcAccessOrder,
         BBlockTransferSrcVectorDim,
         BBlockTransferSrcScalarPerVector,
         BBlockTransferDstScalarPerVector_K1,
         false, // BThreadTransferSrcResetCoordinateAfterRun,
         BBlockLdsAddExtraN,
         Sequence<0, 2, 4, 1, 3, 5>, // CThreadTransferSrcDstAccessOrder,
         CThreadTransferSrcDstVectorDim,
         CThreadTransferDstScalarPerVector,
         NumPrefetch,
         PipelineVer>;
  
     using Argument = typename GridwiseGemm::Argument;
  
     // Invoker
     struct Invoker : public BaseInvoker
     {
         float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
                 karg.Print();
             }
  
             if(!GridwiseGemm::CheckValidity(karg))
             {
                 throw std::runtime_error(
                     "wrong! GridwiseGemm_k0mk1_k0nk1_mn_dpp has invalid setting");
             }
  
             const auto [gdx, gdy, gdz] = GridwiseGemm::CalculateGridSize(karg.M, karg.N);
  
             float ave_time = 0;
  
             if(GridwiseGemm::CalculateHasMainKBlockLoop(karg.K))
             {
                 const auto kernel = kernel_gemm_dpp<GridwiseGemm, true>;
  
                 ave_time = launch_and_time_kernel(
                     stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
             }
             else
             {
                 const auto kernel = kernel_gemm_dpp<GridwiseGemm, false>;
  
                 ave_time = launch_and_time_kernel(
                     stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
             }
  
             return ave_time;
         }
  
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
  
     static constexpr bool IsValidCompilationParameter()
     {
         // TODO: properly implement this check
         return true;
     }
  
     static bool IsSupportedArgument(const Argument& karg)
     {
         if(ck::is_gfx103_supported() || ck::is_gfx11_supported())
         {
             return GridwiseGemm::CheckValidity(karg);
         }
         return false;
     }
  
     // polymorphic
     bool IsSupportedArgument(const BaseArgument* p_arg) override
     {
         return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
     }
  
     static auto MakeArgument(const ADataType* p_a,
                              const BDataType* p_b,
                              CDataType* p_c,
                              index_t M,
                              index_t N,
                              index_t K,
                              index_t StrideA,
                              index_t StrideB,
                              index_t StrideC,
                              AElementwiseOperation,
                              BElementwiseOperation,
                              CElementwiseOperation)
     {
         return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC};
     }
  
     static auto MakeInvoker() { return Invoker{}; }
  
     // polymorphic
     std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
                                                       const void* p_b,
                                                       void* p_c,
                                                       index_t M,
                                                       index_t N,
                                                       index_t K,
                                                       index_t StrideA,
                                                       index_t StrideB,
                                                       index_t StrideC,
                                                       AElementwiseOperation,
                                                       BElementwiseOperation,
                                                       CElementwiseOperation) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
                                           static_cast<CDataType*>(p_c),
                                           M,
                                           N,
                                           K,
                                           StrideA,
                                           StrideB,
                                           StrideC);
     }
  
     // polymorphic
     std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
     {
         return std::make_unique<Invoker>(Invoker{});
     }
  
     // polymorphic
     std::string GetTypeString() const override
     {
         auto str = std::stringstream();
  
         std::map<PipelineVersion, std::string> PipelineVersionToString{{PipelineVersion::v1, "v1"},
                                                                        {PipelineVersion::v2, "v2"}};
  
         // clang-format off
         str << "DeviceGemmDpp"
             << "<"
             << BlockSize << ", "
             << MPerBlock << ", "
             << NPerBlock << ", "
             << KPerBlock << ", "
             << AK1 << ", "
             << BK1 << ", "
             << MPerDpp << ", "
             << NPerDpp << ", "
             << MDppPerWave << ", "
             << MDppPerWave << ", "
             << ABlockTransferSrcScalarPerVector << ", "
             << ABlockTransferDstScalarPerVector_K1 << ", "
             << BBlockTransferSrcScalarPerVector << ", "
             << BBlockTransferDstScalarPerVector_K1
             << ">"
             << " NumPrefetch: "
             << NumPrefetch << ", "
             << "PipelineVersion: "
             << PipelineVersionToString[PipelineVer];
         // clang-format on
  
         return str.str();
     }
 };
  
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck