/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/reduce/kernel/multi_reduce2d_tile_partitioner.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/reduce/kernel/multi_reduce2d_tile_partitioner.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/reduce/kernel/multi_reduce2d_tile_partitioner.hpp Source File

Go to the documentation of this file.

 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
  
 #pragma once
  
 #include "ck_tile/core.hpp"
  
 namespace ck_tile {
  
 template <typename BlockShape_, bool ForceMultiBlock_ = false>
 struct Reduce2dTilePartitioner
 {
     using BlockShape = remove_cvref_t<BlockShape_>;
  
     static constexpr bool ForceMultiBlock = ForceMultiBlock_;
  
     static constexpr index_t MPerBlock = BlockShape::Block_M;
     static constexpr index_t NPerBlock = BlockShape::Block_N;
  
     CK_TILE_HOST_DEVICE Reduce2dTilePartitioner() noexcept = delete;
  
     CK_TILE_HOST_DEVICE Reduce2dTilePartitioner(index_t total_reduce_len) noexcept
         : total_reduction_length(total_reduce_len)
     {
     }
  
     CK_TILE_DEVICE auto GetOutputTileIndex(index_t block_idx) const noexcept -> index_t
     {
         return amd_wave_read_first_lane(block_idx);
     }
  
     CK_TILE_DEVICE auto
     GetOutputTileIndexMultiBlock(index_t block_global_idx,
                                  index_t block_group_size) const noexcept -> tuple<index_t, index_t>
     {
         const index_t tile_idx  = amd_wave_read_first_lane(block_global_idx / block_group_size);
         const index_t local_idx = amd_wave_read_first_lane(block_global_idx % block_group_size);
         return make_tuple(tile_idx, local_idx);
     }
  
     CK_TILE_HOST_DEVICE auto GetBlockGroupParams() const noexcept -> tuple<index_t, index_t>
     {
         index_t block_group_size = 1;
         index_t num_iters        = 0;
  
         if(!ForceMultiBlock)
         {
             // Single-block strategy: one block handles entire reduction
             block_group_size = 1;
             num_iters        = (total_reduction_length + NPerBlock - 1) / NPerBlock;
             return make_tuple(num_iters, block_group_size);
         }
         else
         {
             constexpr int max_block_group_size =
                 128; // Maximum 128, as in CK. It balances between latency (i.e. limiting stalls
                      // when performing the atomic operation) and block parallelism.
  
             num_iters = (total_reduction_length + (NPerBlock * max_block_group_size) - 1) /
                         (NPerBlock * max_block_group_size);
  
             // This should only happen if reduce_total_length is 0 (empty tensor)
             if(num_iters == 0)
             {
 #ifndef __HIP_DEVICE_COMPILE__
                 // Warning only on host side
                 if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
                 {
                     printf("Warning: reduce_total_length is 0, there is no data to process\n");
                 }
 #endif
                 block_group_size = 1;
                 return make_tuple(num_iters, block_group_size);
             }
  
             block_group_size =
                 (total_reduction_length + (NPerBlock * num_iters) - 1) / (NPerBlock * num_iters);
  
             return make_tuple(num_iters, block_group_size);
         }
     }
  
     CK_TILE_DEVICE auto
     GetInputTileOffsets(const index_t block_global_idx,
                         const index_t block_group_size,
                         const index_t num_iterations) const -> tuple<index_t, index_t>
     {
         const auto [tile_idx, local_idx] =
             GetOutputTileIndexMultiBlock(block_global_idx, block_group_size);
  
         const index_t m_offset = MPerBlock * tile_idx;
         const index_t n_offset = NPerBlock * num_iterations * local_idx;
  
         return make_tuple(m_offset, n_offset);
     }
  
     CK_TILE_DEVICE index_t GetOutputTileOffset(const index_t block_group_id) const
     {
         return MPerBlock * block_group_id;
     }
  
     private:
     index_t total_reduction_length;
 };
 } // namespace ck_tile