/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/arch/mma/mma.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/arch/mma/mma.hpp Source File

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/arch/mma/mma.hpp Source File
Go to the documentation of this file.
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
 #pragma once
 #include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/core/numeric/vector_type.hpp"
  
 #include "amdgcn_mma.hpp"
 #include "mma_selector.hpp"
 #include "mma_traits.hpp"
 #include "mma_transforms.hpp"
  
 #include "mfma/mfma.hpp"
 #include "wmma/wmma.hpp"
  
 namespace ck_tile::core::arch::mma {
  
 enum struct MmaAccumPolicy
 {
     // Decomposition and accumulation in row-major block order
     ROW_MAJOR,
     // Decomposition and accumulation in col-major block order
     COL_MAJOR
 };
  
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
           uint32_t FragM,
           uint32_t FragN,
           uint32_t FragK,
           MmaAccumPolicy AccumPolicy = MmaAccumPolicy::ROW_MAJOR,
           typename CompilerTarget =
               decltype(get_compiler_target()), // TODO: c++20 amdgcn_target_arch_id GfxTargetId =
                                                // get_compiler_target(),
           typename MmaOp =
               typename MmaDefaultSelector<ADataType, // TODO: c++20 MmaOpI MmaOp = typename
                                                      // MmaDefaultSelector<ADataType,
                                           BDataType,
                                           CDataType,
                                           FragM,
                                           FragN,
                                           FragK,
                                           CompilerTarget>::SelectedOp,
           typename MmaTransforms = // TODO: c++20 MmaTransformsI MmaTransforms =
           typename MmaTransformsDefaultSelector<MmaOp, CompilerTarget>::SelectedTransforms>
 struct WaveWiseMma
 {
  
     using BlockWiseMmaOp       = MmaOp;
     using BlockWiseMmaOpTraits = MmaOpTraits<BlockWiseMmaOp>;
  
     // Block dimensions
     constexpr static uint32_t BlockM = BlockWiseMmaOpTraits::BlockM;
     constexpr static uint32_t BlockN = BlockWiseMmaOpTraits::BlockN;
     constexpr static uint32_t BlockK = BlockWiseMmaOpTraits::BlockK;
  
     // Block counts for decomposition
     constexpr static uint32_t BlocksM = FragM / BlockM;
     constexpr static uint32_t BlocksN = FragN / BlockN;
     constexpr static uint32_t BlocksK = FragK / BlockK;
     constexpr static uint32_t BlocksC = BlocksM * BlocksN;
  
     // Vector types for packed registers in each block
     using AVecType = typename BlockWiseMmaOpTraits::AVecType;
     using BVecType = typename BlockWiseMmaOpTraits::BVecType;
     using CVecType = typename BlockWiseMmaOpTraits::CVecType;
  
     // Buffer types for fragments
     using ABufferType = AVecType[BlocksM][BlocksK];
     using BBufferType = BVecType[BlocksN][BlocksK];
     using CBufferType = CVecType[BlocksM][BlocksN];
  
     // Transforms
     using ATransform = typename MmaTransforms::ATransform;
     using BTransform = typename MmaTransforms::BTransform;
     using CTransform = typename MmaTransforms::CTransform;
     using DTransform = typename MmaTransforms::DTransform;
  
     // Sanity checks
     static_assert(FragM >= BlockM, "FragM must be larger than BlockM");
     static_assert(FragN >= BlockN, "FragN must be larger than BlockN");
     static_assert(FragK >= BlockK, "FragK must be larger than BlockK");
     static_assert(FragM % BlockM == 0u, "FragM must be a multiple of BlockM");
     static_assert(FragN % BlockN == 0u, "FragN must be a multiple of BlockN");
     static_assert(FragK % BlockK == 0u, "FragK must be a multiple of BlockK");
  
     private:
     template <typename DstT, typename SrcT>
     CK_TILE_DEVICE static auto formatBuffer(SrcT const& inputBuffer)
     {
         // TODO: Implement formatting logic as needed.
         // This is intended to convert input fragments to the native vector types
         // required by the BlockWiseMma operation for iteration
         static_assert(sizeof(DstT) == sizeof(SrcT), "Size mismatch in formatBuffer");
         return reinterpret_cast<DstT const&>(inputBuffer);
     }
  
     template <typename DstT, typename SrcT>
     CK_TILE_DEVICE static auto formatBuffer(SrcT& inputBuffer)
     {
         // TODO: Implement formatting logic as needed.
         // This is intended to convert input fragments to the native vector types
         // required by the BlockWiseMma operation for iteration
         static_assert(sizeof(DstT) == sizeof(SrcT), "Size mismatch in formatBuffer");
         return reinterpret_cast<DstT&>(inputBuffer);
     }
  
     template <typename VecTA, typename VecTB, typename VecTC>
     CK_TILE_DEVICE static decltype(auto) exec_col_major(VecTA&& a, VecTB&& b, VecTC&& accum)
     {
         // We implement an example wave-tile pipeline here.
         // First, we apply the necessary transforms to the input fragments,
         // then we convert the result into buffers of native vector formats
         // that we can easily index. Native vector formats are necessary inputs
         // to the given MmaOp exec function.
         auto a_frag = formatBuffer<ABufferType>(ATransform::exec(a));
         auto b_frag = formatBuffer<BBufferType>(BTransform::exec(b));
         auto c_frag = formatBuffer<CBufferType>(CTransform::exec(accum));
  
         // "Col-major" accumulation over the M-dimension blocks first.
         // Pseudo code here, but we would basically iterate over the blocks in col-major order
         for(uint32_t bn = 0u; bn < BlocksN; ++bn)
         {
             for(uint32_t bm = 0u; bm < BlocksM; ++bm)
             {
                 for(uint32_t bk = 0u; bk < BlocksK; ++bk)
                 {
                     c_frag[bm][bn] =
                         BlockWiseMmaOp::exec(a_frag[bm][bk], b_frag[bn][bk], c_frag[bm][bn]);
                 }
             }
         }
  
         // Convert native vector results back to the output fragment format
         // and then return after we apply the final output transform.
         return DTransform::exec(formatBuffer<std::decay_t<VecTC>>(c_frag));
     }
  
     template <typename VecTA, typename VecTB, typename VecTC>
     CK_TILE_DEVICE static decltype(auto) exec_row_major(VecTA&& a, VecTB&& b, VecTC&& accum)
     {
         // We implement an example wave-tile pipeline here.
         // First, we apply the necessary transforms to the input fragments,
         // then we convert the result into buffers of native vector formats
         // that we can easily index. Native vector formats are necessary inputs
         // to the given MmaOp exec function.
         auto a_frag = formatBuffer<ABufferType>(ATransform::exec(a));
         auto b_frag = formatBuffer<BBufferType>(BTransform::exec(b));
         auto c_frag = formatBuffer<CBufferType>(CTransform::exec(accum));
  
         // "Row-major" accumulation over the N-dimension blocks first.
         // Pseudo code here, but we would basically iterate over the blocks in row-major order.
         // We also have to ensure that the incoming vector fragments are converted to native vector
         // types before passing to the BlockWiseMma exec function.
         for(uint32_t bm = 0u; bm < BlocksM; ++bm)
         {
             for(uint32_t bn = 0u; bn < BlocksN; ++bn)
             {
                 for(uint32_t bk = 0u; bk < BlocksK; ++bk)
                 {
                     c_frag[bm][bn] =
                         BlockWiseMmaOp::exec(a_frag[bm][bk], b_frag[bn][bk], c_frag[bm][bn]);
                 }
             }
         }
  
         // Convert native vector results back to the output fragment format
         // and then return after we apply the final output transform.
         return DTransform::exec(formatBuffer<std::decay_t<VecTC>>(c_frag));
     }
  
     public:
     template <typename VecTA, typename VecTB, typename VecTC>
     CK_TILE_DEVICE static decltype(auto) exec(VecTA&& a, VecTB&& b, VecTC&& accum)
     {
         if constexpr(AccumPolicy == MmaAccumPolicy::ROW_MAJOR)
         {
             return exec_row_major(
                 std::forward<VecTA>(a), std::forward<VecTB>(b), std::forward<VecTC>(accum));
         }
         else // if constexpr(AccumPolicy == MmaAccumPolicy::COL_MAJOR)
         {
             return exec_col_major(
                 std::forward<VecTA>(a), std::forward<VecTB>(b), std::forward<VecTC>(accum));
         }
     }
 };
  
 } // namespace ck_tile::core::arch::mma