/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/reduce/kernel/multi_reduce2d_tile_partitioner.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/reduce/kernel/multi_reduce2d_tile_partitioner.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/reduce/kernel/multi_reduce2d_tile_partitioner.hpp Source File
multi_reduce2d_tile_partitioner.hpp
Go to the documentation of this file.
1 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2 // SPDX-License-Identifier: MIT
3 
4 #pragma once
5 
6 #include "ck_tile/core.hpp"
7 
8 namespace ck_tile {
9 
11 template <typename BlockShape_, bool ForceMultiBlock_ = false>
13 {
15 
16  static constexpr bool ForceMultiBlock = ForceMultiBlock_;
17 
18  static constexpr index_t MPerBlock = BlockShape::Block_M;
19  static constexpr index_t NPerBlock = BlockShape::Block_N;
20 
22 
26  : total_reduction_length(total_reduce_len)
27  {
28  }
29 
32  CK_TILE_DEVICE auto GetOutputTileIndex(index_t block_idx) const noexcept -> index_t
33  {
34  return amd_wave_read_first_lane(block_idx);
35  }
36 
41  CK_TILE_DEVICE auto
43  index_t block_group_size) const noexcept -> tuple<index_t, index_t>
44  {
45  const index_t tile_idx = amd_wave_read_first_lane(block_global_idx / block_group_size);
46  const index_t local_idx = amd_wave_read_first_lane(block_global_idx % block_group_size);
47  return make_tuple(tile_idx, local_idx);
48  }
49 
54  {
55  index_t block_group_size = 1;
56  index_t num_iters = 0;
57 
58  if(!ForceMultiBlock)
59  {
60  // Single-block strategy: one block handles entire reduction
61  block_group_size = 1;
62  num_iters = (total_reduction_length + NPerBlock - 1) / NPerBlock;
63  return make_tuple(num_iters, block_group_size);
64  }
65  else
66  {
67  constexpr int max_block_group_size =
68  128; // Maximum 128, as in CK. It balances between latency (i.e. limiting stalls
69  // when performing the atomic operation) and block parallelism.
70 
71  num_iters = (total_reduction_length + (NPerBlock * max_block_group_size) - 1) /
72  (NPerBlock * max_block_group_size);
73 
74  // This should only happen if reduce_total_length is 0 (empty tensor)
75  if(num_iters == 0)
76  {
77 #ifndef __HIP_DEVICE_COMPILE__
78  // Warning only on host side
79  if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
80  {
81  printf("Warning: reduce_total_length is 0, there is no data to process\n");
82  }
83 #endif
84  block_group_size = 1;
85  return make_tuple(num_iters, block_group_size);
86  }
87 
88  block_group_size =
89  (total_reduction_length + (NPerBlock * num_iters) - 1) / (NPerBlock * num_iters);
90 
91  return make_tuple(num_iters, block_group_size);
92  }
93  }
94 
100  CK_TILE_DEVICE auto
101  GetInputTileOffsets(const index_t block_global_idx,
102  const index_t block_group_size,
103  const index_t num_iterations) const -> tuple<index_t, index_t>
104  {
105  const auto [tile_idx, local_idx] =
106  GetOutputTileIndexMultiBlock(block_global_idx, block_group_size);
107 
108  const index_t m_offset = MPerBlock * tile_idx;
109  const index_t n_offset = NPerBlock * num_iterations * local_idx;
110 
111  return make_tuple(m_offset, n_offset);
112  }
113 
117  CK_TILE_DEVICE index_t GetOutputTileOffset(const index_t block_group_id) const
118  {
119  return MPerBlock * block_group_id;
120  }
121 
122  private:
123  index_t total_reduction_length;
124 };
125 } // namespace ck_tile
#define CK_TILE_DEVICE
Definition: config.hpp:45
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:46
Definition: cluster_descriptor.hpp:13
bool EnvIsEnabled(EnvVar)
Definition: env.hpp:156
__device__ uint32_t amd_wave_read_first_lane(uint16_t v)
Definition: amd_buffer_addressing.hpp:36
int32_t index_t
Definition: integer.hpp:9
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:21
constexpr CK_TILE_HOST_DEVICE auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:360
TilePartitioner for 2D reduction operations.
Definition: multi_reduce2d_tile_partitioner.hpp:13
CK_TILE_DEVICE auto GetInputTileOffsets(const index_t block_global_idx, const index_t block_group_size, const index_t num_iterations) const -> tuple< index_t, index_t >
Compute the input tile offset for the given thread, block index.
Definition: multi_reduce2d_tile_partitioner.hpp:101
CK_TILE_HOST_DEVICE auto GetBlockGroupParams() const noexcept -> tuple< index_t, index_t >
Calculate the number of iterations and the number of blocks required to perform the reduction.
Definition: multi_reduce2d_tile_partitioner.hpp:53
CK_TILE_DEVICE auto GetOutputTileIndexMultiBlock(index_t block_global_idx, index_t block_group_size) const noexcept -> tuple< index_t, index_t >
Get output tile index and block local ID for multi-block reduction.
Definition: multi_reduce2d_tile_partitioner.hpp:42
CK_TILE_DEVICE auto GetOutputTileIndex(index_t block_idx) const noexcept -> index_t
Get output tile index for threadwise reduction.
Definition: multi_reduce2d_tile_partitioner.hpp:32
static constexpr index_t NPerBlock
Definition: multi_reduce2d_tile_partitioner.hpp:19
static constexpr index_t MPerBlock
Definition: multi_reduce2d_tile_partitioner.hpp:18
remove_cvref_t< BlockShape_ > BlockShape
Definition: multi_reduce2d_tile_partitioner.hpp:14
static constexpr bool ForceMultiBlock
Definition: multi_reduce2d_tile_partitioner.hpp:16
CK_TILE_DEVICE index_t GetOutputTileOffset(const index_t block_group_id) const
Compute the output tile offset for the given operation and block group.
Definition: multi_reduce2d_tile_partitioner.hpp:117
CK_TILE_HOST_DEVICE Reduce2dTilePartitioner() noexcept=delete
Definition: tuple.hpp:192
#define CK_TILE_ENV(name)
Definition: env.hpp:145