Composable Kernel: include/ck/tensor_operation/gpu/element/quantization_operation.hpp Source File

quantization_operation.hpp
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#pragma once

// #include "ck/utility/get_id.hpp"

namespace ck {
namespace tensor_operation {
namespace element_wise {

// Y = Sy * Qy
// W = Sw * Qw
// X = Sx * Qx
// B = Sb * Qb = Sw * Sx * Qb
// where X, W, Y are float32 and Qx, Qw, Qy are int8.
// Sx, Sw, Sy are the scales of x, w, y (float32), computed from the quantization ranges.
// Qb is int32; the scale of B is chosen as Sw * Sx for convenience.

// Y = W @ X, where @ is convolution or matrix multiplication
// Sy * Qy = (Sw * Qw) @ (Sx * Qx)
// Qy = [(Sw * Sx) / Sy] * (Qw @ Qx)
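
// Worked example with illustrative scale values: with Sw = 0.02, Sx = 0.05 and Sy = 0.1, the
// requantization factor is (Sw * Sx) / Sy = 0.001 / 0.1 = 0.01, so an int32 accumulator value
// of Qw @ Qx = 2500 maps to Qy = 0.01 * 2500 = 25 before clamping to the int8 range [-128, 127].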

// For an activation that is a piecewise linear function (relu, leaky relu, ...):
// Activation(Sy * Qy) = Sy * Activation(Qy)
template <typename Activation>
struct Activation_Mul_Clamp
{
    static constexpr const char* name = "Activation_Mul_Clamp";

    // Convolution + activation (piecewise linear function)
    // If the activation is a piecewise linear function, then Activation(Sy * Qy) = Sy * Activation(Qy)
    // Z = Activation(Y) = Activation(W @ X)
    // Sz * Qz = Activation(Sy * Qy)
    // Qz = (Sy / Sz) * Activation(Qy) = [(Sw * Sx) / Sz] * Activation(Qw @ Qx)

    // requantScale_ = Sw * Sx / Sz
    Activation_Mul_Clamp(float requantScale, Activation activationOp)
        : requantScale_(requantScale), activationOp_(activationOp)
    {
    }

    __host__ __device__ constexpr void operator()(int8_t& y, const int32_t& x) const
    {
        float y_fp32 = ck::type_convert<float>(x);
        activationOp_(y_fp32, y_fp32);
        y_fp32 = math::clamp(requantScale_ * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int8_t>(y_fp32);
    }

    __device__ constexpr void operator()(int32_t& y, const int32_t& x) const
    {
        // CAUTION - the result may be type_convert-ed to int8 in the threadwise copy,
        // e.g. GridwiseGemmDlMultipleD_km_kn_mn
        float y_fp32 = ck::type_convert<float>(x);
        activationOp_(y_fp32, y_fp32);
        y_fp32 = math::clamp(requantScale_ * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int32_t>(y_fp32);
    }

    __host__ constexpr void operator()(float& y, const float& x) const
    {
        // CAUTION - the host reference code may use float in and float out
        activationOp_(y, x);
        y = math::clamp(requantScale_ * y, -128.f, 127.f);
    }

    float requantScale_;
    Activation activationOp_;
};
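
// Example usage (illustrative sketch): constructing this op as a conv or GEMM epilogue.
// ReluExample is a hypothetical piecewise linear activation functor, following the same
// (y, x) calling convention as the operators above; the scale values are made up.
//
//   struct ReluExample
//   {
//       __host__ __device__ void operator()(float& y, const float& x) const { y = x > 0.f ? x : 0.f; }
//   };
//
//   // illustrative scales: Sw = 0.02, Sx = 0.05, Sz = 0.1  ->  requantScale = Sw * Sx / Sz
//   Activation_Mul_Clamp<ReluExample> out_op{0.02f * 0.05f / 0.1f, ReluExample{}};
//   int8_t qz;
//   int32_t acc = 2500;  // Qw @ Qx accumulator
//   out_op(qz, acc);     // qz = clamp(0.01f * relu(2500)) = 25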

// For an activation that is not a piecewise linear function (TanH, Sigmoid, ...):
// if the activation is not piecewise linear,
// then Activation(Sy * Qy) != Sy * Activation(Qy)
template <typename Activation>
struct Mul_Activation_Mul_Clamp
{
    static constexpr const char* name = "Mul_Activation_Mul_Clamp";

    // Convolution + activation (non piecewise linear function)
    // Z = Activation(Y) = Activation(W @ X)
    // Sz * Qz = Activation(Sy * Qy)
    // Qz = S1 * Activation[Sacc * (Qw @ Qx)]
    // where S1 = 1 / Sz and Sacc = Sw * Sx
    Mul_Activation_Mul_Clamp(float scale_z_inv, float scaleAcc, Activation activationOp)
        : scale_z_inv_(scale_z_inv), scaleAcc_(scaleAcc), activationOp_(activationOp)
    {
    }

    __host__ __device__ constexpr void operator()(int8_t& y, const int32_t& x) const
    {
        float y_fp32 = ck::type_convert<float>(x);
        y_fp32       = scaleAcc_ * y_fp32;
        activationOp_(y_fp32, y_fp32);
        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int8_t>(y_fp32);
    }

    float scale_z_inv_;
    float scaleAcc_;
    Activation activationOp_;
};
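
// Example usage (illustrative sketch): for a non piecewise linear activation the accumulator
// must be rescaled to real values before the activation is applied, so the two scales are
// passed separately. SigmoidExample is a hypothetical functor with the same (y, x) calling
// convention as the operators above; the scale values are made up.
//
//   struct SigmoidExample
//   {
//       __host__ __device__ void operator()(float& y, const float& x) const { y = 1.f / (1.f + expf(-x)); }
//   };
//
//   // illustrative scales: Sacc = Sw * Sx = 0.001, Sz = 1 / 127 (sigmoid output lies in [0, 1])
//   Mul_Activation_Mul_Clamp<SigmoidExample> out_op{127.f, 0.001f, SigmoidExample{}};
//   int8_t qz;
//   out_op(qz, 2500);  // qz = clamp(127 * sigmoid(0.001 * 2500)) = clamp(127 * sigmoid(2.5))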

// Per-channel conv quantization + an activation that is a piecewise linear function
// (relu, leaky relu, ...):
// Activation(Sy * Qy) = Sy * Activation(Qy)
template <typename Activation>
struct Activation_Mul2_Clamp
{
    static constexpr const char* name = "Activation_Mul2_Clamp";

    Activation_Mul2_Clamp(Activation activationOp) : activationOp_(activationOp) {}

    __host__ __device__ constexpr void
    operator()(int8_t& y, const int32_t& x, const float& requantScale) const
    {
        float y_fp32 = ck::type_convert<float>(x);
        activationOp_(y_fp32, y_fp32);
        y_fp32 = math::clamp(requantScale * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int8_t>(y_fp32);
    }

    __device__ constexpr void
    operator()(int32_t& y, const int32_t& x, const float& requantScale) const
    {
        // CAUTION - the result may be type_convert-ed to int8 in the threadwise copy,
        // e.g. GridwiseGemmDlMultipleD_km_kn_mn
        float y_fp32 = ck::type_convert<float>(x);
        activationOp_(y_fp32, y_fp32);
        y_fp32 = math::clamp(requantScale * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int32_t>(y_fp32);
    }

    Activation activationOp_;
};
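
// Example usage (illustrative sketch): with per-channel quantization the requantization scale
// is not fixed at construction time but is passed per element, typically read from a scale
// tensor indexed by output channel. ReluExample is the hypothetical functor sketched above.
//
//   Activation_Mul2_Clamp<ReluExample> out_op{ReluExample{}};
//   int8_t qz;
//   float per_channel_scale = 0.01f;      // illustrative Sw[k] * Sx / Sz for this channel
//   out_op(qz, 2500, per_channel_scale);  // qz = clamp(0.01f * relu(2500)) = 25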

// For an activation that is a piecewise linear function (relu, leaky relu, ...):
// Activation(Sy * Qy) = Sy * Activation(Qy)
template <typename Activation>
struct Add_Activation_Mul_Clamp
{
    static constexpr const char* name = "Add_Activation_Mul_Clamp";

    // Convolution + bias
    // Let Bias = B = Sw * Sx * Qb
    // where Qb is int32
    // Y = W @ X + B
    // Sy * Qy = (Sw * Qw) @ (Sx * Qx) + Sw * Sx * Qb
    // Qy = [(Sw * Sx) / Sy] * (Qw @ Qx + Qb)

    // For the activation, Z = Activation(Y)
    // Sz * Qz = Activation(Sy * Qy)
    // Qz = (Sy / Sz) * Activation(Qy) = [(Sw * Sx) / Sz] * Activation(Qw @ Qx + Qb)
    Add_Activation_Mul_Clamp(float requantScale, Activation activationOp)
        : requantScale_(requantScale), activationOp_(activationOp)
    {
    }

    __host__ __device__ constexpr void
    operator()(int8_t& y, const int32_t& x, const int32_t& bias) const
    {
        float y_fp32 = ck::type_convert<float>(x + bias);
        activationOp_(y_fp32, y_fp32);
        y_fp32 = math::clamp(requantScale_ * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int8_t>(y_fp32);
    }

    __host__ __device__ constexpr void
    operator()(int32_t& y, const int32_t& x, const int32_t& bias) const
    {
        // CAUTION - the result may be type_convert-ed to int8 in the threadwise copy,
        // e.g. GridwiseGemmDlMultipleD_km_kn_mn
        float y_fp32 = ck::type_convert<float>(x + bias);
        activationOp_(y_fp32, y_fp32);
        y_fp32 = math::clamp(requantScale_ * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int32_t>(y_fp32);
    }

    float requantScale_;
    Activation activationOp_;
};
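
// Example usage (illustrative sketch): because the bias is stored with scale Sw * Sx, it can
// be added directly to the int32 accumulator before requantization. ReluExample is the
// hypothetical functor sketched above and the values are made up.
//
//   Add_Activation_Mul_Clamp<ReluExample> out_op{0.01f, ReluExample{}};  // requantScale = Sw * Sx / Sz
//   int8_t qz;
//   int32_t acc = 2000, qb = 500;  // Qw @ Qx and Qb
//   out_op(qz, acc, qb);           // qz = clamp(0.01f * relu(2000 + 500)) = 25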

// Per-channel conv quantization + an activation that is a piecewise linear function
// (relu, leaky relu, ...)
template <typename Activation>
struct Add_Activation_Mul2_Clamp
{
    static constexpr const char* name = "Add_Activation_Mul2_Clamp";

    Add_Activation_Mul2_Clamp(Activation activationOp) : activationOp_(activationOp) {}

    __host__ __device__ constexpr void
    operator()(int8_t& y, const int32_t& x, const int32_t& bias, const float& requantScale) const
    {
        float y_fp32 = ck::type_convert<float>(x + bias);
        activationOp_(y_fp32, y_fp32);
        y_fp32 = math::clamp(requantScale * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int8_t>(y_fp32);
    }

    __host__ __device__ constexpr void
    operator()(int32_t& y, const int32_t& x, const int32_t& bias, const float& requantScale) const
    {
        // CAUTION - the result may be type_convert-ed to int8 in the threadwise copy,
        // e.g. GridwiseGemmDlMultipleD_km_kn_mn
        float y_fp32 = ck::type_convert<float>(x + bias);
        activationOp_(y_fp32, y_fp32);
        y_fp32 = math::clamp(requantScale * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int32_t>(y_fp32);
    }

    Activation activationOp_;
};

// For an activation that is not a piecewise linear function (TanH, Sigmoid, ...):
// if the activation is not piecewise linear,
// then Activation(Sy * Qy) != Sy * Activation(Qy)
template <typename Activation>
struct Add_Mul_Activation_Mul_Clamp
{
    static constexpr const char* name = "Add_Mul_Activation_Mul_Clamp";

    // Convolution + activation (non piecewise linear function)
    // Z = Activation(Y) = Activation(W @ X + B)
    // Sz * Qz = Activation(Sy * Qy)
    // Qz = S1 * Activation[Sacc * (Qw @ Qx + Qb)]
    // where S1 = 1 / Sz and Sacc = Sw * Sx
    Add_Mul_Activation_Mul_Clamp(float scale_z_inv, float scaleAcc, Activation activationOp)
        : scale_z_inv_(scale_z_inv), scaleAcc_(scaleAcc), activationOp_(activationOp)
    {
    }

    __host__ __device__ constexpr void
    operator()(int8_t& y, const int32_t& x, const int32_t& bias) const
    {
        float y_fp32 = ck::type_convert<float>(x + bias);
        y_fp32       = scaleAcc_ * y_fp32;
        activationOp_(y_fp32, y_fp32);
        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int8_t>(y_fp32);
    }

    __host__ __device__ constexpr void
    operator()(int32_t& y, const int32_t& x, const int32_t& bias) const
    {
        // CAUTION - the result may be type_convert-ed to int8 in the threadwise copy,
        // e.g. GridwiseGemmDlMultipleD_km_kn_mn
        float y_fp32 = ck::type_convert<float>(x + bias);
        y_fp32       = scaleAcc_ * y_fp32;
        activationOp_(y_fp32, y_fp32);
        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int32_t>(y_fp32);
    }

    float scale_z_inv_;
    float scaleAcc_;
    Activation activationOp_;
};
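
// Example usage (illustrative sketch): as with Mul_Activation_Mul_Clamp, the accumulator plus
// bias is first rescaled to real values with Sacc = Sw * Sx, the non piecewise linear
// activation is applied, and the result is rescaled with 1 / Sz. SigmoidExample is the
// hypothetical functor sketched above and the values are made up.
//
//   Add_Mul_Activation_Mul_Clamp<SigmoidExample> out_op{127.f, 0.001f, SigmoidExample{}};
//   int8_t qz;
//   out_op(qz, 2000, 500);  // qz = clamp(127 * sigmoid(0.001 * (2000 + 500)))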

// Per-channel conv quantization + an activation that is not a piecewise linear function
// (TanH, Sigmoid, ...):
// if the activation is not piecewise linear,
// then Activation(Sy * Qy) != Sy * Activation(Qy)
template <typename Activation>
struct Add_Mul2_Activation_Mul_Clamp
{
    static constexpr const char* name = "Add_Mul2_Activation_Mul_Clamp";

    Add_Mul2_Activation_Mul_Clamp(float scale_z_inv, Activation activationOp)
        : scale_z_inv_(scale_z_inv), activationOp_(activationOp)
    {
    }

    __host__ __device__ constexpr void
    operator()(int8_t& y, const int32_t& x, const int32_t& bias, const float& scaleAcc) const
    {
        float y_fp32 = ck::type_convert<float>(x + bias);
        y_fp32       = scaleAcc * y_fp32;
        activationOp_(y_fp32, y_fp32);
        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int8_t>(y_fp32);
    }

    __host__ __device__ constexpr void
    operator()(int32_t& y, const int32_t& x, const int32_t& bias, const float& scaleAcc) const
    {
        // CAUTION - the result may be type_convert-ed to int8 in the threadwise copy,
        // e.g. GridwiseGemmDlMultipleD_km_kn_mn
        float y_fp32 = ck::type_convert<float>(x + bias);
        y_fp32       = scaleAcc * y_fp32;
        activationOp_(y_fp32, y_fp32);
        y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
        y      = ck::type_convert<int32_t>(y_fp32);
    }

    float scale_z_inv_;
    Activation activationOp_;
};

} // namespace element_wise
} // namespace tensor_operation
} // namespace ck