/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp Source File

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp Source File
elementwise_kernel.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
6 #include "ck_tile/core.hpp"
7 #include "ck_tile/ops/common.hpp"
10 namespace ck_tile {
11 
12 template <typename Problem_, typename Policy_>
14 {
17 
22 
23  static constexpr index_t kBlockSize = Problem::BlockShape::kBlockSize;
24 
25  template <typename... XDataType, typename Dims>
26  CK_TILE_DEVICE void operator()(const Dims lens,
27  const Dims input_strides,
28  const Dims output_strides,
29  const tuple<XDataType...>& input_tensors,
30  YDataType* p_y) const
31  {
32  using S = typename Problem::BlockShape;
33 
34  // Setup block-level coordinates and transforms
35  const index_t iM = get_block_id() * S::kBlockM;
36  const auto merge_transform = make_merge_transform(lens);
37 
38  // Load all input tiles into registers.
39  // The lambda structure here is intended to minimize the lifetime
40  // of intermediate objects (views, windows) used for loading.
41  const auto x_tiles = ck_tile::generate_tuple(
42  [&](auto i) {
43  const auto tensor_view = make_naive_tensor_view<address_space_enum::global>(
44  input_tensors.get(i), lens, input_strides, number<S::kVectorM>{}, number<1>{});
45 
46  const auto transformed_tensor = pad_tensor_view(
48  ck_tile::make_tuple(merge_transform),
53 
54  const auto x_window =
55  make_tile_window(transformed_tensor,
57  {iM},
58  Policy::template MakeXBlockTileDistribution<Problem>());
59 
60  return load_tile(x_window);
61  },
62  number<sizeof...(XDataType)>{});
63 
64  // Setup output tile in registers.
65  const auto& x_tile0 = x_tiles.get(number<0>{});
66  auto y_tile = make_static_distributed_tensor<YDataType>(x_tile0.get_tile_distribution());
67 
68  // Perform element-wise computation.
69  const auto spans = x_tile0.get_distributed_spans();
70  sweep_tile_span(spans[number<0>{}], [&](auto idx) {
71  const auto tile_idx = make_tuple(idx);
72  apply(
73  [&](auto&&... tiles) {
74  ElementWiseOperation{}(y_tile(tile_idx),
75  type_convert<ComputeDataType>(tiles[tile_idx])...);
76  },
77  x_tiles);
78  });
79 
80  // Setup output window and store the result tile.
81  const auto y_m_n = make_naive_tensor_view<address_space_enum::global>(
82  p_y, lens, output_strides, number<S::kVectorM>{});
83 
84  const auto transformed_y_m_n = pad_tensor_view(
86  ck_tile::make_tuple(merge_transform),
91 
92  auto y_window = make_tile_window(transformed_y_m_n,
94  {iM},
95  y_tile.get_tile_distribution());
96 
97  store_tile(y_window, cast_tile<YDataType>(y_tile));
98  }
99 
100  template <typename... Ints>
102  {
103  int total_elements = 1;
104  const auto kVectorM = Problem_::BlockShape::kVectorM;
105 
106  apply([&](auto&&... args) { ((total_elements *= args), ...); }, input_sizes);
107 
108  if((total_elements % kVectorM) != 0)
109  {
110  if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
111  {
112  CK_TILE_ERROR("Conditions not met: total number of input elements (",
113  total_elements,
114  ") should be multiple of the vectorization size (",
115  kVectorM,
116  ")");
117  }
118  return false;
119  }
120 
121  return true;
122  }
123 };
124 
125 } // namespace ck_tile
#define CK_TILE_DEVICE
Definition: config.hpp:41
#define CK_TILE_HOST
Definition: config.hpp:40
Definition: cluster_descriptor.hpp:13
constexpr decltype(auto) apply(F &&f, Tuple &&t)
Definition: tuple.hpp:526
bool EnvIsEnabled(EnvVar)
Definition: env.hpp:156
void CK_TILE_ERROR(Args &&... args) noexcept
Definition: env.hpp:12
constexpr CK_TILE_HOST_DEVICE auto transform_tensor_view(const OldTensorView &old_tensor_view, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition: tensor_view.hpp:511
constexpr CK_TILE_HOST_DEVICE auto make_merge_transform(const LowLengths &low_lengths)
Definition: coordinate_transform.hpp:1615
int32_t index_t
Definition: integer.hpp:9
constexpr CK_TILE_HOST_DEVICE auto pad_tensor_view(const TensorView &tensor_view, const TileLengths &tile_lengths, DoPads)
Definition: tensor_view.hpp:530
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:21
CK_TILE_DEVICE void sweep_tile_span(TileDistributedSpan_, const F &f)
Definition: sweep_tile.hpp:20
constexpr CK_TILE_DEVICE auto make_tile_window(null_tensor_view, const WindowLengths &window_lengths, const multi_index< WindowLengths::size()> &, Ts &&...)
Definition: null_tile_window.hpp:75
constexpr CK_TILE_HOST_DEVICE auto generate_tuple(F &&f, number< N >)
Definition: tuple.hpp:429
constexpr CK_TILE_HOST_DEVICE auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:360
CK_TILE_DEVICE void store_tile(tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
Definition: store_tile.hpp:23
CK_TILE_DEVICE auto load_tile(const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={})
Definition: load_tile.hpp:22
typename __make_integer_seq< impl::__integer_sequence, index_t, N >::seq_type make_index_sequence
Definition: sequence.hpp:228
Definition: elementwise_kernel.hpp:14
ck_tile::remove_cvref_t< Problem_ > Problem
Definition: elementwise_kernel.hpp:15
ck_tile::remove_cvref_t< typename Problem::XDataType > XDataType
Definition: elementwise_kernel.hpp:18
ck_tile::remove_cvref_t< Policy_ > Policy
Definition: elementwise_kernel.hpp:16
ck_tile::remove_cvref_t< typename Problem::ElementWiseOperation > ElementWiseOperation
Definition: elementwise_kernel.hpp:21
CK_TILE_DEVICE void operator()(const Dims lens, const Dims input_strides, const Dims output_strides, const tuple< XDataType... > &input_tensors, YDataType *p_y) const
Definition: elementwise_kernel.hpp:26
static constexpr index_t kBlockSize
Definition: elementwise_kernel.hpp:23
static CK_TILE_HOST bool IsSupportedArgument(const ck_tile::tuple< Ints... > &input_sizes)
Definition: elementwise_kernel.hpp:101
ck_tile::remove_cvref_t< typename Problem::YDataType > YDataType
Definition: elementwise_kernel.hpp:20
ck_tile::remove_cvref_t< typename Problem::ComputeDataType > ComputeDataType
Definition: elementwise_kernel.hpp:19
Definition: integral_constant.hpp:13
Definition: sequence.hpp:49
Definition: tensor_view.hpp:41
Definition: tuple.hpp:192
#define CK_TILE_ENV(name)
Definition: env.hpp:145