/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/utility/data_type.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/utility/data_type.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/docs-6.4.3/include/ck/utility/data_type.hpp Source File
data_type.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
7 #include "ck/utility/e8m0.hpp"
9 #ifdef CK_CODE_GEN_RTC
10 using int8_t = signed char;
11 using uint8_t = unsigned char;
12 using int16_t = signed short;
13 using uint16_t = unsigned short;
14 using float_t = float;
15 #endif
16 namespace ck {
17 
18 #ifdef CK_CODE_GEN_RTC
19 using byte = unsigned char;
20 #else
21 using std::byte;
22 #endif
23 
24 using bhalf_t = ushort;
25 using half_t = _Float16;
26 using int4_t = _BitInt(4);
27 using f4_t = unsigned _BitInt(4);
28 using f6_t = _BitInt(6); // e2m3 format
29 using bf6_t = unsigned _BitInt(6); // e3m2 format
30 
31 struct f4x2_pk_t
32 {
33  using type = uint8_t;
35  f4x2_pk_t() : data{type{}} {}
36  f4x2_pk_t(type init) : data{init} {}
37 
38  template <index_t I>
39  __host__ __device__ inline type unpack(Number<I>) const
40  {
41  static_assert(I < 2, "Index is out of range.");
42  if constexpr(I == 0)
43  return data & 0b00001111;
44  else
45  return (data >> 4);
46  }
47 
48  __host__ __device__ inline type pack(const type x0, const type x1)
49  {
50  return (x1 << 4) | (x0 & 0b00001111);
51  }
52 };
53 
54 struct f6x16_pk_t
55 {
56  // store 16 elements of f6_t in an array of 3 uint32_t
57  using element_type = uint32_t;
60  typedef int8_t test_vec_t __attribute__((ext_vector_type(16)));
61  f6x16_pk_t() : data{type{}} {}
62  f6x16_pk_t(type init) : data{init} {}
63 
64  template <index_t I>
65  __host__ __device__ inline f6_t unpack(Number<I>)
66  {
67  static_assert(I < 16, "Index out of range for 16 f6_t elements.");
68 
69  constexpr int num_bits_elem = 6;
70  constexpr int num_bits_vec_elem = 32;
71  constexpr int vector_size = 3;
72  constexpr int bit_pos = I * num_bits_elem;
73  constexpr int arr_idx = bit_pos / num_bits_vec_elem;
74  constexpr int bit_offset = bit_pos % num_bits_vec_elem;
75  uint32_t bits = data.At(Number<arr_idx>{}) >> bit_offset;
76  constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem;
77 
78  if constexpr(overhang > 0 && (arr_idx + 1) < vector_size)
79  {
80  bits |= (data.At(Number<arr_idx + 1>{}) & ((1u << overhang) - 1))
81  << (num_bits_elem - overhang);
82  }
83 
84  return static_cast<f6_t>(bits & 0x3F);
85  }
86 
87  __host__ __device__ inline type pack(const test_vec_t& x)
88  {
89  type packed{};
90 
91  // for each of the 16 f6_t values, place its 6 bits in the correct position
92  ck::static_for<0, 16, 1>{}([&](auto i) {
93  uint32_t bits = static_cast<uint32_t>(x[static_cast<int>(i)]) & 0x3F;
94  constexpr int num_bits_elem = 6;
95  constexpr int num_bits_vec_elem = 32;
96  constexpr int vector_size = 3;
97  constexpr int bit_pos = i * num_bits_elem;
98  constexpr int arr_index = bit_pos / num_bits_vec_elem;
99  constexpr int bit_offset = bit_pos % num_bits_vec_elem;
100  constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem;
101  uint32_t old_value = packed.At(Number<arr_index>{});
102 
103  // insert bits into the current 32-bit block
104  old_value |= (bits << bit_offset);
105  packed.At(Number<arr_index>{}) = old_value;
106 
107  // if it crosses into the next block, shift the remainder
108  if constexpr(overhang > 0 && (arr_index + 1) < vector_size)
109  {
110  uint32_t next_value = packed.At(Number<arr_index + 1>{});
111  next_value |= (bits >> (num_bits_elem - overhang));
112  packed.At(Number<arr_index + 1>{}) = next_value;
113  }
114  });
115 
116  return packed;
117  }
118 };
119 
121 {
122  // store 32 elements of f6_t in an array of 6 uint32_t
123  using element_type = uint32_t;
126  typedef int8_t test_vec_t __attribute__((ext_vector_type(32)));
127  f6x32_pk_t() : data{type{}} {}
128  f6x32_pk_t(type init) : data{init} {}
129 
130  template <index_t I>
131  __host__ __device__ inline f6_t unpack(Number<I>)
132  {
133  static_assert(I < 32, "Index out of range for 32 f6_t elements.");
134 
135  constexpr int num_bits_elem = 6;
136  constexpr int num_bits_vec_elem = 32;
137  constexpr int vector_size = 6;
138  constexpr int bit_pos = I * num_bits_elem;
139  constexpr int arr_idx = bit_pos / num_bits_vec_elem;
140  constexpr int bit_offset = bit_pos % num_bits_vec_elem;
141  uint32_t bits = data.At(Number<arr_idx>{}) >> bit_offset;
142  constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem;
143 
144  if constexpr(overhang > 0 && (arr_idx + 1) < vector_size)
145  {
146  bits |= (data.At(Number<arr_idx + 1>{}) & ((1u << overhang) - 1))
147  << (num_bits_elem - overhang);
148  }
149 
150  return static_cast<f6_t>(bits & 0x3F);
151  }
152 
153  __host__ __device__ inline type pack(const test_vec_t& x)
154  {
155  type packed{};
156 
157  // for each of the 32 f6_t values, place its 6 bits in the correct position
158  ck::static_for<0, 32, 1>{}([&](auto i) {
159  uint32_t bits = static_cast<uint32_t>(x[static_cast<int>(i)]) & 0x3F;
160  constexpr int num_bits_elem = 6;
161  constexpr int num_bits_vec_elem = 32;
162  constexpr int vector_size = 6;
163  constexpr int bit_pos = i * num_bits_elem;
164  constexpr int arr_index = bit_pos / num_bits_vec_elem;
165  constexpr int bit_offset = bit_pos % num_bits_vec_elem;
166  constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem;
167  uint32_t old_value = packed.At(Number<arr_index>{});
168 
169  // insert bits into the current 32-bit block
170  old_value |= (bits << bit_offset);
171  packed.At(Number<arr_index>{}) = old_value;
172 
173  // if it crosses into the next block, shift the remainder
174  if constexpr(overhang > 0 && (arr_index + 1) < vector_size)
175  {
176  uint32_t next_value = packed.At(Number<arr_index + 1>{});
177  next_value |= (bits >> (num_bits_elem - overhang));
178  packed.At(Number<arr_index + 1>{}) = next_value;
179  }
180  });
181 
182  return packed;
183  }
184 };
185 
187 {
188  // store 16 elements of bf6_t in an array of 3 uint32_t
189  using element_type = uint32_t;
192  typedef int8_t test_vec_t __attribute__((ext_vector_type(16)));
194  bf6x16_pk_t(type init) : data{init} {}
195 
196  template <index_t I>
197  __host__ __device__ inline bf6_t unpack(Number<I>)
198  {
199  static_assert(I < 16, "Index out of range for 16 f6_t elements.");
200 
201  constexpr int num_bits_elem = 6;
202  constexpr int num_bits_vec_elem = 32;
203  constexpr int vector_size = 3;
204  constexpr int bit_pos = I * num_bits_elem;
205  constexpr int arr_idx = bit_pos / num_bits_vec_elem;
206  constexpr int bit_offset = bit_pos % num_bits_vec_elem;
207  uint32_t bits = data.At(Number<arr_idx>{}) >> bit_offset;
208  constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem;
209 
210  if constexpr(overhang > 0 && (arr_idx + 1) < vector_size)
211  {
212  bits |= (data.At(Number<arr_idx + 1>{}) & ((1u << overhang) - 1))
213  << (num_bits_elem - overhang);
214  }
215 
216  return static_cast<bf6_t>(bits & 0x3F);
217  }
218 
219  __host__ __device__ inline type pack(const test_vec_t& x)
220  {
221  type packed{};
222 
223  // for each of the 16 bf6_t values, place its 6 bits in the correct position
224  ck::static_for<0, 16, 1>{}([&](auto i) {
225  uint32_t bits = static_cast<uint32_t>(x[static_cast<int>(i)]) & 0x3F;
226  constexpr int num_bits_elem = 6;
227  constexpr int num_bits_vec_elem = 32;
228  constexpr int vector_size = 3;
229  constexpr int bit_pos = i * num_bits_elem;
230  constexpr int arr_index = bit_pos / num_bits_vec_elem;
231  constexpr int bit_offset = bit_pos % num_bits_vec_elem;
232  constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem;
233  uint32_t old_value = packed.At(Number<arr_index>{});
234 
235  // insert bits into the current 32-bit block
236  old_value |= (bits << bit_offset);
237  packed.At(Number<arr_index>{}) = old_value;
238 
239  // if it crosses into the next block, shift the remainder
240  if constexpr(overhang > 0 && (arr_index + 1) < vector_size)
241  {
242  uint32_t next_value = packed.At(Number<arr_index + 1>{});
243  next_value |= (bits >> (num_bits_elem - overhang));
244  packed.At(Number<arr_index + 1>{}) = next_value;
245  }
246  });
247 
248  return packed;
249  }
250 };
251 
253 {
254  // store 32 elements of bf6_t in an array of 6 uint32_t
255  using element_type = uint32_t;
258  typedef int8_t test_vec_t __attribute__((ext_vector_type(32)));
260  bf6x32_pk_t(type init) : data{init} {}
261 
262  template <index_t I>
263  __host__ __device__ inline bf6_t unpack(Number<I>)
264  {
265  static_assert(I < 32, "Index out of range for 32 f6_t elements.");
266 
267  constexpr int num_bits_elem = 6;
268  constexpr int num_bits_vec_elem = 32;
269  constexpr int vector_size = 6;
270  constexpr int bit_pos = I * num_bits_elem;
271  constexpr int arr_idx = bit_pos / num_bits_vec_elem;
272  constexpr int bit_offset = bit_pos % num_bits_vec_elem;
273  uint32_t bits = data.At(Number<arr_idx>{}) >> bit_offset;
274  constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem;
275 
276  if constexpr(overhang > 0 && (arr_idx + 1) < vector_size)
277  {
278  bits |= (data.At(Number<arr_idx + 1>{}) & ((1u << overhang) - 1))
279  << (num_bits_elem - overhang);
280  }
281 
282  return static_cast<bf6_t>(bits & 0x3F);
283  }
284 
285  __host__ __device__ inline type pack(const test_vec_t& x)
286  {
287  type packed{};
288 
289  // for each of the 32 bf6_t values, place its 6 bits in the correct position
290  ck::static_for<0, 32, 1>{}([&](auto i) {
291  uint32_t bits = static_cast<uint32_t>(x[static_cast<int>(i)]) & 0x3F;
292  constexpr int num_bits_elem = 6;
293  constexpr int num_bits_vec_elem = 32;
294  constexpr int vector_size = 6;
295  constexpr int bit_pos = i * num_bits_elem;
296  constexpr int arr_index = bit_pos / num_bits_vec_elem;
297  constexpr int bit_offset = bit_pos % num_bits_vec_elem;
298  constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem;
299  uint32_t old_value = packed.At(Number<arr_index>{});
300 
301  // insert bits into the current 32-bit block
302  old_value |= (bits << bit_offset);
303  packed.At(Number<arr_index>{}) = old_value;
304 
305  // if it crosses into the next block, shift the remainder
306  if constexpr(overhang > 0 && (arr_index + 1) < vector_size)
307  {
308  uint32_t next_value = packed.At(Number<arr_index + 1>{});
309  next_value |= (bits >> (num_bits_elem - overhang));
310  packed.At(Number<arr_index + 1>{}) = next_value;
311  }
312  });
313 
314  return packed;
315  }
316 };
317 
318 // custom data type - pack int4 data
319 struct pk_i4_t
320 {
321  using type = int8_t;
323  __host__ __device__ constexpr pk_i4_t() : data{type{}} {}
324  __host__ __device__ constexpr pk_i4_t(type init) : data{init} {}
325 };
326 
327 inline constexpr auto next_pow2(uint32_t x)
328 {
329  // Precondition: x > 1.
330  return x > 1u ? (1u << (32u - __builtin_clz(x - 1u))) : x;
331 }
332 
333 // native types: double, float, _Float16, ushort, int32_t, int8_t, uint8_t, f8_fnuz_t, bf8_fnuz_t,
334 // native types: bool, f4_t, f6_t, bf6_t
335 template <typename T>
336 inline constexpr bool is_native_type()
337 {
343 }
344 
345 // vector_type
346 template <typename T, index_t N, typename Enable = void>
347 struct vector_type;
348 
349 // Caution: DO NOT REMOVE
350 // intentionally have only declaration but no definition to cause compilation failure when trying to
351 // instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
352 // vectors"
353 template <typename T, index_t V, index_t N>
354 struct vector_type<T __attribute__((ext_vector_type(V))), N>;
355 
356 // Caution: DO NOT REMOVE
357 // intentionally have only declaration but no definition to cause compilation failure when trying to
358 // instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
359 // vectors"
360 template <typename T, index_t V, index_t N>
361 struct vector_type<vector_type<T, V>, N>;
362 
363 // vector_type_maker
364 // This is the right way to handle "vector of vectors": making a bigger vector instead
365 template <typename T, index_t N>
367 {
369 };
370 
371 template <typename T, index_t N0, index_t N1>
372 struct vector_type_maker<T __attribute__((ext_vector_type(N1))), N0>
373 {
375 };
376 
377 template <typename T, index_t N0, index_t N1>
378 struct vector_type_maker<vector_type<T, N1>, N0>
379 {
381 };
382 
383 template <typename T, index_t N>
385 
386 template <typename T, index_t N>
387 __host__ __device__ constexpr auto make_vector_type(Number<N>)
388 {
389  return typename vector_type_maker<T, N>::type{};
390 }
391 
392 // scalar_type
393 template <typename TV>
394 struct scalar_type;
395 
396 // is_scalar_type
397 template <typename TV>
399 {
400  static constexpr bool value = (scalar_type<remove_cvref_t<TV>>::vector_size == 1);
401 };
402 
403 // has_same_scalar_type
404 template <typename X, typename Y>
406  typename scalar_type<remove_cvref_t<Y>>::type>;
407 
408 template <typename T, index_t N>
409 struct scalar_type<T __attribute__((ext_vector_type(N)))>
410 {
411  using type = T;
412  static constexpr index_t vector_size = N;
413 };
414 
415 template <typename T, index_t N>
417 {
418  using type = T;
419  static constexpr index_t vector_size = N;
420 };
421 
422 //
423 template <>
424 struct scalar_type<double>
425 {
426  using type = double;
427  static constexpr index_t vector_size = 1;
428 };
429 
430 template <>
431 struct scalar_type<float>
432 {
433  using type = float;
434  static constexpr index_t vector_size = 1;
435 };
436 
437 template <>
439 {
440  using type = half_t;
441  static constexpr index_t vector_size = 1;
442 };
443 
444 template <>
446 {
447  using type = bhalf_t;
448  static constexpr index_t vector_size = 1;
449 };
450 
451 template <>
452 struct scalar_type<int32_t>
453 {
454  using type = int32_t;
455  static constexpr index_t vector_size = 1;
456 };
457 
458 template <>
460 {
461  using type = int8_t;
462  static constexpr index_t vector_size = 1;
463 };
464 
465 template <>
466 struct scalar_type<uint8_t>
467 {
468  using type = uint8_t;
469  static constexpr index_t vector_size = 1;
470 };
471 
472 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
473 template <>
474 struct scalar_type<int4_t>
475 {
476  using type = int4_t;
477  static constexpr index_t vector_size = 1;
478 };
479 #endif
480 
481 template <>
483 {
484  using type = pk_i4_t;
485  static constexpr index_t vector_size = 1;
486 };
487 
488 template <>
490 {
491  using type = f8_fnuz_t;
492  static constexpr index_t vector_size = 1;
493 };
494 
495 template <>
497 {
498  using type = bf8_fnuz_t;
499  static constexpr index_t vector_size = 1;
500 };
501 
502 template <>
504 {
506  static constexpr index_t vector_size = 1;
507 };
508 
509 template <>
511 {
513  static constexpr index_t vector_size = 1;
514 };
515 
516 template <>
517 struct scalar_type<bool>
518 {
519  using type = bool;
520  static constexpr index_t vector_size = 1;
521 };
522 
523 template <typename T>
524 struct vector_type<T, 1, typename ck::enable_if_t<is_native_type<T>()>>
525 {
526  using d1_t = T;
527  using type = d1_t;
528 
529  union
530  {
531  T d1_;
533  } data_;
534 
535  __host__ __device__ constexpr vector_type() : data_{type{0}} {}
536 
537  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
538 
539  template <typename X>
540  __host__ __device__ constexpr const auto& AsType() const
541  {
542  static_assert(is_same<X, d1_t>::value,
543  "Something went wrong, please check src and dst types.");
544 
545  return data_.d1x1_;
546  }
547 
548  template <typename X>
549  __host__ __device__ constexpr auto& AsType()
550  {
551  static_assert(is_same<X, d1_t>::value,
552  "Something went wrong, please check src and dst types.");
553 
554  return data_.d1x1_;
555  }
556 };
557 
558 __device__ int static err = 0;
559 template <typename T>
560 struct vector_type<T, 2, typename ck::enable_if_t<is_native_type<T>()>>
561 {
562  using d1_t = T;
563  typedef T d2_t __attribute__((ext_vector_type(2)));
564 
565  using type = d2_t;
566 
567  union
568  {
572  } data_;
573 
574  __host__ __device__ constexpr vector_type() : data_{type{0}} {}
575 
576  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
577 
578  template <typename X>
579  __host__ __device__ constexpr const auto& AsType() const
580  {
582  "Something went wrong, please check src and dst types.");
583 
584  if constexpr(is_same<X, d1_t>::value)
585  {
586  return data_.d1x2_;
587  }
588  else if constexpr(is_same<X, d2_t>::value)
589  {
590  return data_.d2x1_;
591  }
592  else
593  {
594  return err;
595  }
596  }
597 
598  template <typename X>
599  __host__ __device__ constexpr auto& AsType()
600  {
602  "Something went wrong, please check src and dst types.");
603 
604  if constexpr(is_same<X, d1_t>::value)
605  {
606  return data_.d1x2_;
607  }
608  else if constexpr(is_same<X, d2_t>::value)
609  {
610  return data_.d2x1_;
611  }
612  else
613  {
614  return err;
615  }
616  }
617 };
618 
619 template <typename T>
620 struct vector_type<T, 3, typename ck::enable_if_t<is_native_type<T>()>>
621 {
622  using d1_t = T;
623  typedef T d2_t __attribute__((ext_vector_type(2)));
624  typedef T d3_t __attribute__((ext_vector_type(3)));
625 
626  using type = d3_t;
627 
628  union
629  {
634  } data_;
635 
636  __host__ __device__ constexpr vector_type() : data_{type{0}} {}
637 
638  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
639 
640  template <typename X>
641  __host__ __device__ constexpr const auto& AsType() const
642  {
644  "Something went wrong, please check src and dst types.");
645 
646  if constexpr(is_same<X, d1_t>::value)
647  {
648  return data_.d1x3_;
649  }
650  else if constexpr(is_same<X, d2_t>::value)
651  {
652  return data_.d2x1_;
653  }
654  else if constexpr(is_same<X, d3_t>::value)
655  {
656  return data_.d3x1_;
657  }
658  else
659  {
660  return err;
661  }
662  }
663 
664  template <typename X>
665  __host__ __device__ constexpr auto& AsType()
666  {
668  "Something went wrong, please check src and dst types.");
669 
670  if constexpr(is_same<X, d1_t>::value)
671  {
672  return data_.d1x3_;
673  }
674  else if constexpr(is_same<X, d2_t>::value)
675  {
676  return data_.d2x1_;
677  }
678  else if constexpr(is_same<X, d3_t>::value)
679  {
680  return data_.d3x1_;
681  }
682  else
683  {
684  return err;
685  }
686  }
687 };
688 
689 template <typename T>
690 struct vector_type<T, 4, typename ck::enable_if_t<is_native_type<T>()>>
691 {
692  using d1_t = T;
693  typedef T d2_t __attribute__((ext_vector_type(2)));
694  typedef T d4_t __attribute__((ext_vector_type(4)));
695 
696  using type = d4_t;
697 
698  union
699  {
704  } data_;
705 
706  __host__ __device__ constexpr vector_type() : data_{type{0}} {}
707 
708  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
709 
710  template <typename X>
711  __host__ __device__ constexpr const auto& AsType() const
712  {
714  "Something went wrong, please check src and dst types.");
715 
716  if constexpr(is_same<X, d1_t>::value)
717  {
718  return data_.d1x4_;
719  }
720  else if constexpr(is_same<X, d2_t>::value)
721  {
722  return data_.d2x2_;
723  }
724  else if constexpr(is_same<X, d4_t>::value)
725  {
726  return data_.d4x1_;
727  }
728  else
729  {
730  return err;
731  }
732  }
733 
734  template <typename X>
735  __host__ __device__ constexpr auto& AsType()
736  {
738  "Something went wrong, please check src and dst types.");
739 
740  if constexpr(is_same<X, d1_t>::value)
741  {
742  return data_.d1x4_;
743  }
744  else if constexpr(is_same<X, d2_t>::value)
745  {
746  return data_.d2x2_;
747  }
748  else if constexpr(is_same<X, d4_t>::value)
749  {
750  return data_.d4x1_;
751  }
752  else
753  {
754  return err;
755  }
756  }
757 };
758 
759 template <typename T>
760 struct vector_type<T, 5, typename ck::enable_if_t<is_native_type<T>()>>
761 {
762  using d1_t = T;
763  typedef T d4_t __attribute__((ext_vector_type(4)));
764  typedef T d5_t __attribute__((ext_vector_type(5)));
765 
766  using type = d5_t;
767 
768  union
769  {
774  } data_;
775 
776  __host__ __device__ constexpr vector_type() : data_{type{0}} {}
777 
778  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
779 
780  template <typename X>
781  __host__ __device__ constexpr const auto& AsType() const
782  {
784  "Something went wrong, please check src and dst types.");
785 
786  if constexpr(is_same<X, d1_t>::value)
787  {
788  return data_.d1x5_;
789  }
790  else if constexpr(is_same<X, d4_t>::value)
791  {
792  return data_.d4x1_;
793  }
794  else if constexpr(is_same<X, d5_t>::value)
795  {
796  return data_.d5x1_;
797  }
798  else
799  {
800  return err;
801  }
802  }
803 
804  template <typename X>
805  __host__ __device__ constexpr auto& AsType()
806  {
808  "Something went wrong, please check src and dst types.");
809 
810  if constexpr(is_same<X, d1_t>::value)
811  {
812  return data_.d1x5_;
813  }
814  else if constexpr(is_same<X, d4_t>::value)
815  {
816  return data_.d4x1_;
817  }
818  else if constexpr(is_same<X, d5_t>::value)
819  {
820  return data_.d5x1_;
821  }
822  else
823  {
824  return err;
825  }
826  }
827 };
828 
829 template <typename T>
830 struct vector_type<T, 7, typename ck::enable_if_t<is_native_type<T>()>>
831 {
832  using d1_t = T;
833  typedef T d2_t __attribute__((ext_vector_type(2)));
834  typedef T d4_t __attribute__((ext_vector_type(4)));
835  typedef T d7_t __attribute__((ext_vector_type(7)));
836 
837  using type = d7_t;
838 
839  union
840  {
846  } data_;
847 
848  __host__ __device__ constexpr vector_type() : data_{type{0}} {}
849 
850  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
851 
852  template <typename X>
853  __host__ __device__ constexpr const auto& AsType() const
854  {
857  "Something went wrong, please check src and dst types.");
858 
859  if constexpr(is_same<X, d1_t>::value)
860  {
861  return data_.d1x7_;
862  }
863  else if constexpr(is_same<X, d2_t>::value)
864  {
865  return data_.d2x3_;
866  }
867  else if constexpr(is_same<X, d4_t>::value)
868  {
869  return data_.d4x1_;
870  }
871  else if constexpr(is_same<X, d7_t>::value)
872  {
873  return data_.d7x1_;
874  }
875  else
876  {
877  return err;
878  }
879  }
880 
881  template <typename X>
882  __host__ __device__ constexpr auto& AsType()
883  {
886  "Something went wrong, please check src and dst types.");
887 
888  if constexpr(is_same<X, d1_t>::value)
889  {
890  return data_.d1x7_;
891  }
892  else if constexpr(is_same<X, d2_t>::value)
893  {
894  return data_.d2x3_;
895  }
896  else if constexpr(is_same<X, d4_t>::value)
897  {
898  return data_.d4x1_;
899  }
900  else if constexpr(is_same<X, d7_t>::value)
901  {
902  return data_.d7x1_;
903  }
904  else
905  {
906  return err;
907  }
908  }
909 };
910 
911 template <typename T>
912 struct vector_type<T, 8, typename ck::enable_if_t<is_native_type<T>()>>
913 {
914  using d1_t = T;
915  typedef T d2_t __attribute__((ext_vector_type(2)));
916  typedef T d4_t __attribute__((ext_vector_type(4)));
917  typedef T d8_t __attribute__((ext_vector_type(8)));
918 
919  using type = d8_t;
920 
921  union
922  {
928  } data_;
929 
930  __host__ __device__ constexpr vector_type() : data_{type{0}} {}
931 
932  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
933 
934  template <typename X>
935  __host__ __device__ constexpr const auto& AsType() const
936  {
939  "Something went wrong, please check src and dst types.");
940 
941  if constexpr(is_same<X, d1_t>::value)
942  {
943  return data_.d1x8_;
944  }
945  else if constexpr(is_same<X, d2_t>::value)
946  {
947  return data_.d2x4_;
948  }
949  else if constexpr(is_same<X, d4_t>::value)
950  {
951  return data_.d4x2_;
952  }
953  else if constexpr(is_same<X, d8_t>::value)
954  {
955  return data_.d8x1_;
956  }
957  else
958  {
959  return err;
960  }
961  }
962 
963  template <typename X>
964  __host__ __device__ constexpr auto& AsType()
965  {
968  "Something went wrong, please check src and dst types.");
969 
970  if constexpr(is_same<X, d1_t>::value)
971  {
972  return data_.d1x8_;
973  }
974  else if constexpr(is_same<X, d2_t>::value)
975  {
976  return data_.d2x4_;
977  }
978  else if constexpr(is_same<X, d4_t>::value)
979  {
980  return data_.d4x2_;
981  }
982  else if constexpr(is_same<X, d8_t>::value)
983  {
984  return data_.d8x1_;
985  }
986  else
987  {
988  return err;
989  }
990  }
991 };
992 
993 template <typename T>
994 struct vector_type<T, 13, typename ck::enable_if_t<is_native_type<T>()>>
995 {
996  using d1_t = T;
997  typedef T d4_t __attribute__((ext_vector_type(4)));
998  typedef T d8_t __attribute__((ext_vector_type(8)));
999  typedef T d13_t __attribute__((ext_vector_type(13)));
1000 
1001  using type = d13_t;
1002 
1003  union
1004  {
1010  } data_;
1011 
1012  __host__ __device__ constexpr vector_type() : data_{type{0}} {}
1013 
1014  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
1015 
1016  template <typename X>
1017  __host__ __device__ constexpr const auto& AsType() const
1018  {
1021  "Something went wrong, please check src and dst types.");
1022 
1023  if constexpr(is_same<X, d1_t>::value)
1024  {
1025  return data_.d1x13_;
1026  }
1027  else if constexpr(is_same<X, d4_t>::value)
1028  {
1029  return data_.d4x3_;
1030  }
1031  else if constexpr(is_same<X, d8_t>::value)
1032  {
1033  return data_.d8x1_;
1034  }
1035  else if constexpr(is_same<X, d13_t>::value)
1036  {
1037  return data_.d13x1_;
1038  }
1039  else
1040  {
1041  return err;
1042  }
1043  }
1044 
1045  template <typename X>
1046  __host__ __device__ constexpr auto& AsType()
1047  {
1050  "Something went wrong, please check src and dst types.");
1051 
1052  if constexpr(is_same<X, d1_t>::value)
1053  {
1054  return data_.d1x13_;
1055  }
1056  else if constexpr(is_same<X, d4_t>::value)
1057  {
1058  return data_.d4x3_;
1059  }
1060  else if constexpr(is_same<X, d8_t>::value)
1061  {
1062  return data_.d8x1_;
1063  }
1064  else if constexpr(is_same<X, d13_t>::value)
1065  {
1066  return data_.d13x1_;
1067  }
1068  else
1069  {
1070  return err;
1071  }
1072  }
1073 };
1074 
1075 template <typename T>
1076 struct vector_type<T, 16, typename ck::enable_if_t<is_native_type<T>()>>
1077 {
1078  using d1_t = T;
1079  typedef T d2_t __attribute__((ext_vector_type(2)));
1080  typedef T d4_t __attribute__((ext_vector_type(4)));
1081  typedef T d8_t __attribute__((ext_vector_type(8)));
1082  typedef T d16_t __attribute__((ext_vector_type(16)));
1083 
1084  using type = d16_t;
1085 
1086  union
1087  {
1094  } data_;
1095 
1096  __host__ __device__ constexpr vector_type() : data_{type{0}} {}
1097 
1098  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
1099 
1100  template <typename X>
1101  __host__ __device__ constexpr const auto& AsType() const
1102  {
1106  "Something went wrong, please check src and dst types.");
1107 
1108  if constexpr(is_same<X, d1_t>::value)
1109  {
1110  return data_.d1x16_;
1111  }
1112  else if constexpr(is_same<X, d2_t>::value)
1113  {
1114  return data_.d2x8_;
1115  }
1116  else if constexpr(is_same<X, d4_t>::value)
1117  {
1118  return data_.d4x4_;
1119  }
1120  else if constexpr(is_same<X, d8_t>::value)
1121  {
1122  return data_.d8x2_;
1123  }
1124  else if constexpr(is_same<X, d16_t>::value)
1125  {
1126  return data_.d16x1_;
1127  }
1128  else
1129  {
1130  return err;
1131  }
1132  }
1133 
1134  template <typename X>
1135  __host__ __device__ constexpr auto& AsType()
1136  {
1140  "Something went wrong, please check src and dst types.");
1141 
1142  if constexpr(is_same<X, d1_t>::value)
1143  {
1144  return data_.d1x16_;
1145  }
1146  else if constexpr(is_same<X, d2_t>::value)
1147  {
1148  return data_.d2x8_;
1149  }
1150  else if constexpr(is_same<X, d4_t>::value)
1151  {
1152  return data_.d4x4_;
1153  }
1154  else if constexpr(is_same<X, d8_t>::value)
1155  {
1156  return data_.d8x2_;
1157  }
1158  else if constexpr(is_same<X, d16_t>::value)
1159  {
1160  return data_.d16x1_;
1161  }
1162  else
1163  {
1164  return err;
1165  }
1166  }
1167 };
1168 
1169 template <typename T>
1170 struct vector_type<T, 32, typename ck::enable_if_t<is_native_type<T>()>>
1171 {
1172  using d1_t = T;
1173  typedef T d2_t __attribute__((ext_vector_type(2)));
1174  typedef T d4_t __attribute__((ext_vector_type(4)));
1175  typedef T d8_t __attribute__((ext_vector_type(8)));
1176  typedef T d16_t __attribute__((ext_vector_type(16)));
1177  typedef T d32_t __attribute__((ext_vector_type(32)));
1178 
1179  using type = d32_t;
1180 
1181  union
1182  {
1190  } data_;
1191 
1192  __host__ __device__ constexpr vector_type() : data_{type{0}} {}
1193 
1194  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
1195 
1196  template <typename X>
1197  __host__ __device__ constexpr const auto& AsType() const
1198  {
1202  "Something went wrong, please check src and dst types.");
1203 
1204  if constexpr(is_same<X, d1_t>::value)
1205  {
1206  return data_.d1x32_;
1207  }
1208  else if constexpr(is_same<X, d2_t>::value)
1209  {
1210  return data_.d2x16_;
1211  }
1212  else if constexpr(is_same<X, d4_t>::value)
1213  {
1214  return data_.d4x8_;
1215  }
1216  else if constexpr(is_same<X, d8_t>::value)
1217  {
1218  return data_.d8x4_;
1219  }
1220  else if constexpr(is_same<X, d16_t>::value)
1221  {
1222  return data_.d16x2_;
1223  }
1224  else if constexpr(is_same<X, d32_t>::value)
1225  {
1226  return data_.d32x1_;
1227  }
1228  else
1229  {
1230  return err;
1231  }
1232  }
1233 
1234  template <typename X>
1235  __host__ __device__ constexpr auto& AsType()
1236  {
1240  "Something went wrong, please check src and dst types.");
1241 
1242  if constexpr(is_same<X, d1_t>::value)
1243  {
1244  return data_.d1x32_;
1245  }
1246  else if constexpr(is_same<X, d2_t>::value)
1247  {
1248  return data_.d2x16_;
1249  }
1250  else if constexpr(is_same<X, d4_t>::value)
1251  {
1252  return data_.d4x8_;
1253  }
1254  else if constexpr(is_same<X, d8_t>::value)
1255  {
1256  return data_.d8x4_;
1257  }
1258  else if constexpr(is_same<X, d16_t>::value)
1259  {
1260  return data_.d16x2_;
1261  }
1262  else if constexpr(is_same<X, d32_t>::value)
1263  {
1264  return data_.d32x1_;
1265  }
1266  else
1267  {
1268  return err;
1269  }
1270  }
1271 };
1272 
1273 template <typename T>
1274 struct vector_type<T, 64, typename ck::enable_if_t<is_native_type<T>()>>
1275 {
1276  using d1_t = T;
1277  typedef T d2_t __attribute__((ext_vector_type(2)));
1278  typedef T d4_t __attribute__((ext_vector_type(4)));
1279  typedef T d8_t __attribute__((ext_vector_type(8)));
1280  typedef T d16_t __attribute__((ext_vector_type(16)));
1281  typedef T d32_t __attribute__((ext_vector_type(32)));
1282  typedef T d64_t __attribute__((ext_vector_type(64)));
1283 
1284  using type = d64_t;
1285 
1286  union
1287  {
1296  } data_;
1297 
1298  __host__ __device__ constexpr vector_type() : data_{type{0}} {}
1299 
1300  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
1301 
1302  template <typename X>
1303  __host__ __device__ constexpr const auto& AsType() const
1304  {
1309  "Something went wrong, please check src and dst types.");
1310 
1311  if constexpr(is_same<X, d1_t>::value)
1312  {
1313  return data_.d1x64_;
1314  }
1315  else if constexpr(is_same<X, d2_t>::value)
1316  {
1317  return data_.d2x32_;
1318  }
1319  else if constexpr(is_same<X, d4_t>::value)
1320  {
1321  return data_.d4x16_;
1322  }
1323  else if constexpr(is_same<X, d8_t>::value)
1324  {
1325  return data_.d8x8_;
1326  }
1327  else if constexpr(is_same<X, d16_t>::value)
1328  {
1329  return data_.d16x4_;
1330  }
1331  else if constexpr(is_same<X, d32_t>::value)
1332  {
1333  return data_.d32x2_;
1334  }
1335  else if constexpr(is_same<X, d64_t>::value)
1336  {
1337  return data_.d64x1_;
1338  }
1339  else
1340  {
1341  return err;
1342  }
1343  }
1344 
1345  template <typename X>
1346  __host__ __device__ constexpr auto& AsType()
1347  {
1352  "Something went wrong, please check src and dst types.");
1353 
1354  if constexpr(is_same<X, d1_t>::value)
1355  {
1356  return data_.d1x64_;
1357  }
1358  else if constexpr(is_same<X, d2_t>::value)
1359  {
1360  return data_.d2x32_;
1361  }
1362  else if constexpr(is_same<X, d4_t>::value)
1363  {
1364  return data_.d4x16_;
1365  }
1366  else if constexpr(is_same<X, d8_t>::value)
1367  {
1368  return data_.d8x8_;
1369  }
1370  else if constexpr(is_same<X, d16_t>::value)
1371  {
1372  return data_.d16x4_;
1373  }
1374  else if constexpr(is_same<X, d32_t>::value)
1375  {
1376  return data_.d32x2_;
1377  }
1378  else if constexpr(is_same<X, d64_t>::value)
1379  {
1380  return data_.d64x1_;
1381  }
1382  else
1383  {
1384  return err;
1385  }
1386  }
1387 };
1388 
1389 template <typename T>
1390 struct vector_type<T, 128, typename ck::enable_if_t<is_native_type<T>()>>
1391 {
1392  using d1_t = T;
1393  typedef T d2_t __attribute__((ext_vector_type(2)));
1394  typedef T d4_t __attribute__((ext_vector_type(4)));
1395  typedef T d8_t __attribute__((ext_vector_type(8)));
1396  typedef T d16_t __attribute__((ext_vector_type(16)));
1397  typedef T d32_t __attribute__((ext_vector_type(32)));
1398  typedef T d64_t __attribute__((ext_vector_type(64)));
1399  typedef T d128_t __attribute__((ext_vector_type(128)));
1400 
1401  using type = d128_t;
1402 
1403  union
1404  {
1414  } data_;
1415 
1416  __host__ __device__ constexpr vector_type() : data_{type{0}} {}
1417 
1418  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
1419 
1420  template <typename X>
1421  __host__ __device__ constexpr const auto& AsType() const
1422  {
1427  "Something went wrong, please check src and dst types.");
1428 
1429  if constexpr(is_same<X, d1_t>::value)
1430  {
1431  return data_.d1x128_;
1432  }
1433  else if constexpr(is_same<X, d2_t>::value)
1434  {
1435  return data_.d2x64_;
1436  }
1437  else if constexpr(is_same<X, d4_t>::value)
1438  {
1439  return data_.d4x32_;
1440  }
1441  else if constexpr(is_same<X, d8_t>::value)
1442  {
1443  return data_.d8x16_;
1444  }
1445  else if constexpr(is_same<X, d16_t>::value)
1446  {
1447  return data_.d16x8_;
1448  }
1449  else if constexpr(is_same<X, d32_t>::value)
1450  {
1451  return data_.d32x4_;
1452  }
1453  else if constexpr(is_same<X, d64_t>::value)
1454  {
1455  return data_.d64x2_;
1456  }
1457  else if constexpr(is_same<X, d128_t>::value)
1458  {
1459  return data_.d128x1_;
1460  }
1461  else
1462  {
1463  return err;
1464  }
1465  }
1466 
1467  template <typename X>
1468  __host__ __device__ constexpr auto& AsType()
1469  {
1474  "Something went wrong, please check src and dst types.");
1475 
1476  if constexpr(is_same<X, d1_t>::value)
1477  {
1478  return data_.d1x128_;
1479  }
1480  else if constexpr(is_same<X, d2_t>::value)
1481  {
1482  return data_.d2x64_;
1483  }
1484  else if constexpr(is_same<X, d4_t>::value)
1485  {
1486  return data_.d4x32_;
1487  }
1488  else if constexpr(is_same<X, d8_t>::value)
1489  {
1490  return data_.d8x16_;
1491  }
1492  else if constexpr(is_same<X, d16_t>::value)
1493  {
1494  return data_.d16x8_;
1495  }
1496  else if constexpr(is_same<X, d32_t>::value)
1497  {
1498  return data_.d32x4_;
1499  }
1500  else if constexpr(is_same<X, d64_t>::value)
1501  {
1502  return data_.d64x2_;
1503  }
1504  else if constexpr(is_same<X, d128_t>::value)
1505  {
1506  return data_.d128x1_;
1507  }
1508  else
1509  {
1510  return err;
1511  }
1512  }
1513 };
1514 
1515 template <typename T>
1516 struct vector_type<T, 256, typename ck::enable_if_t<is_native_type<T>()>>
1517 {
1518  using d1_t = T;
1519  typedef T d2_t __attribute__((ext_vector_type(2)));
1520  typedef T d4_t __attribute__((ext_vector_type(4)));
1521  typedef T d8_t __attribute__((ext_vector_type(8)));
1522  typedef T d16_t __attribute__((ext_vector_type(16)));
1523  typedef T d32_t __attribute__((ext_vector_type(32)));
1524  typedef T d64_t __attribute__((ext_vector_type(64)));
1525  typedef T d128_t __attribute__((ext_vector_type(128)));
1526  typedef T d256_t __attribute__((ext_vector_type(256)));
1527 
1528  using type = d256_t;
1529 
1530  union
1531  {
1542  } data_;
1543 
1544  __host__ __device__ constexpr vector_type() : data_{type{0}} {}
1545 
1546  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
1547 
1548  template <typename X>
1549  __host__ __device__ constexpr const auto& AsType() const
1550  {
1551  static_assert(
1555  "Something went wrong, please check src and dst types.");
1556 
1557  if constexpr(is_same<X, d1_t>::value)
1558  {
1559  return data_.d1x256_;
1560  }
1561  else if constexpr(is_same<X, d2_t>::value)
1562  {
1563  return data_.d2x128_;
1564  }
1565  else if constexpr(is_same<X, d4_t>::value)
1566  {
1567  return data_.d4x64_;
1568  }
1569  else if constexpr(is_same<X, d8_t>::value)
1570  {
1571  return data_.d8x32_;
1572  }
1573  else if constexpr(is_same<X, d16_t>::value)
1574  {
1575  return data_.d16x16_;
1576  }
1577  else if constexpr(is_same<X, d32_t>::value)
1578  {
1579  return data_.d32x8_;
1580  }
1581  else if constexpr(is_same<X, d64_t>::value)
1582  {
1583  return data_.d64x4_;
1584  }
1585  else if constexpr(is_same<X, d128_t>::value)
1586  {
1587  return data_.d128x2_;
1588  }
1589  else if constexpr(is_same<X, d256_t>::value)
1590  {
1591  return data_.d256x1_;
1592  }
1593  else
1594  {
1595  return err;
1596  }
1597  }
1598 
1599  template <typename X>
1600  __host__ __device__ constexpr auto& AsType()
1601  {
1602  static_assert(
1606  "Something went wrong, please check src and dst types.");
1607 
1608  if constexpr(is_same<X, d1_t>::value)
1609  {
1610  return data_.d1x256_;
1611  }
1612  else if constexpr(is_same<X, d2_t>::value)
1613  {
1614  return data_.d2x128_;
1615  }
1616  else if constexpr(is_same<X, d4_t>::value)
1617  {
1618  return data_.d4x64_;
1619  }
1620  else if constexpr(is_same<X, d8_t>::value)
1621  {
1622  return data_.d8x32_;
1623  }
1624  else if constexpr(is_same<X, d16_t>::value)
1625  {
1626  return data_.d16x16_;
1627  }
1628  else if constexpr(is_same<X, d32_t>::value)
1629  {
1630  return data_.d32x8_;
1631  }
1632  else if constexpr(is_same<X, d64_t>::value)
1633  {
1634  return data_.d64x4_;
1635  }
1636  else if constexpr(is_same<X, d128_t>::value)
1637  {
1638  return data_.d128x2_;
1639  }
1640  else if constexpr(is_same<X, d256_t>::value)
1641  {
1642  return data_.d256x1_;
1643  }
1644  else
1645  {
1646  return err;
1647  }
1648  }
1649 };
1650 
1651 template <typename T, index_t N, typename Enable = void>
1653 
1654 template <typename T>
1656 {
1657  using type = unsigned _BitInt(8 * sizeof(T));
1658 };
1659 
1660 template <>
1662 {
1664 };
1665 
1666 template <>
1668 {
1670 };
1671 
1672 template <>
1674 {
1676 };
1677 
1678 template <>
1680 {
1682 };
1683 
1684 template <>
1686 {
1688 };
1689 
1690 template <>
1692 {
1694 };
1695 
1696 template <>
1698 {
1700 };
1701 
1702 template <typename T, index_t N>
1704  T,
1705  N,
1706  ck::enable_if_t<sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8>>
1707 {
1708  using data_t = typename nnvb_data_t_selector<T>::type; // select data_t based on the size of T
1709  static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch");
1710  using data_v = data_t __attribute__((ext_vector_type(N)));
1712 
1713  union alignas(next_pow2(N * sizeof(T)))
1714  {
1715  data_v dN; // storage vector;
1719  } data_;
1720 
1721  __host__ __device__ constexpr non_native_vector_base(data_t a) : data_{data_v(a)} {}
1722  __host__ __device__ constexpr non_native_vector_base(T f)
1724  {
1725  }
1726  __host__ __device__ constexpr non_native_vector_base() : non_native_vector_base(T{}){};
1727  __host__ __device__ constexpr non_native_vector_base(data_v v) : data_{v} {}
1728 
1729  __host__ __device__ constexpr operator data_v() const { return data_.dN; }
1730  __host__ __device__ constexpr operator data_t() const
1731  {
1732  if constexpr(N == 1)
1733  {
1734  return data_.dxN[Number<0>{}];
1735  }
1736  else
1737  {
1738  return data_.dxN; // XXX this should cause an error
1739  }
1740  }
1741  __host__ __device__ constexpr operator T() const
1742  {
1743  if constexpr(N == 1)
1744  {
1745  return data_.dTxN[Number<0>{}];
1746  }
1747  else
1748  {
1749  return data_.dTxN; // XXX this should cause an error
1750  }
1751  }
1752 
1753  template <typename X>
1754  __host__ __device__ constexpr const auto& AsType() const
1755  {
1756  static_assert(is_same_v<X, data_t> || is_same_v<X, T> || is_same_v<X, data_v>,
1757  "Something went wrong, please check src and dst types.");
1758 
1759  if constexpr(is_same_v<X, data_t>)
1760  {
1761  return data_.dxN;
1762  }
1763  else if constexpr(is_same_v<X, T>)
1764  {
1765  return data_.dTxN;
1766  }
1767  else if constexpr(is_same_v<X, data_v>)
1768  {
1769  return data_.dNx1;
1770  }
1771  else
1772  {
1773  return err;
1774  }
1775  }
1776 
1777  template <typename X>
1778  __host__ __device__ constexpr auto& AsType()
1779  {
1780  static_assert(is_same_v<X, data_t> || is_same_v<X, T> || is_same_v<X, data_v>,
1781  "Something went wrong, please check src and dst types.");
1782 
1783  if constexpr(is_same_v<X, data_t>)
1784  {
1785  return data_.dxN;
1786  }
1787  else if constexpr(is_same_v<X, T>)
1788  {
1789  return data_.dTxN;
1790  }
1791  else if constexpr(is_same_v<X, data_v>)
1792  {
1793  return data_.dNx1;
1794  }
1795  else
1796  {
1797  return err;
1798  }
1799  }
1800 };
1801 
1802 // implementation for f6x16 and f6x32
1803 template <typename T, index_t N>
1804 struct non_native_vector_base<T, N, std::enable_if_t<sizeof(T) == 12 || sizeof(T) == 24>>
1805 {
1806  using data_t =
1807  typename nnvb_data_t_selector<T>::type; // select data_t based on declared base type
1808  using element_t = typename T::element_type; // select element_t based on declared element type
1809  static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch");
1810  static constexpr size_t size_factor =
1811  sizeof(data_t) / sizeof(element_t); // f6x16: 12/4 = 3, f6x32: 24/4 = 6
1812  using data_v = element_t __attribute__((ext_vector_type(N * size_factor)));
1814 
1815  union alignas(next_pow2(N * sizeof(T)))
1816  {
1817  data_v dN; // storage vector;
1821  } data_;
1822 
1823  __host__ __device__ constexpr non_native_vector_base(data_t a)
1824  : data_{data_v(a.At(Number<0>{}))}
1825  {
1826  }
1827  __host__ __device__ constexpr non_native_vector_base(T f)
1829  {
1830  }
1831  __host__ __device__ constexpr non_native_vector_base() : non_native_vector_base(T{}){};
1832  __host__ __device__ constexpr non_native_vector_base(data_v v) : data_{v} {}
1833 
1834  __host__ __device__ constexpr operator data_v() const { return data_.dN; }
1835  __host__ __device__ constexpr operator data_t() const
1836  {
1837  if constexpr(N == 1)
1838  {
1839  return data_.dxN[Number<0>{}];
1840  }
1841  else
1842  {
1843  return data_.dxN; // XXX this should cause an error
1844  }
1845  }
1846  __host__ __device__ constexpr operator T() const
1847  {
1848  if constexpr(N == 1)
1849  {
1850  return data_.dTxN[Number<0>{}];
1851  }
1852  else
1853  {
1854  return data_.dTxN; // XXX this should cause an error
1855  }
1856  }
1857 };
1858 
1859 template <typename T, index_t N>
1860 struct scalar_type<non_native_vector_base<T, N>>;
1861 
1862 template <index_t N>
1864 {
1866 
1867  static constexpr index_t vector_size = N;
1868 };
1869 
1870 template <index_t N>
1872 {
1874 
1875  static constexpr index_t vector_size = N;
1876 };
1877 
1878 template <index_t N>
1880 {
1882 
1883  static constexpr index_t vector_size = N;
1884 };
1885 
1886 // non-native vector_type implementation
1887 template <typename T>
1888 struct vector_type<T, 1, typename ck::enable_if_t<!is_native_type<T>()>>
1889 {
1890  using d1_t = T;
1892  using type = d1_nnv_t;
1893 
1894  union alignas(next_pow2(1 * sizeof(T)))
1895  {
1899  } data_;
1900 
1901  __host__ __device__ constexpr vector_type() : data_{d1_t{}} {}
1902 
1903  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
1904 
1905  template <typename X>
1906  __host__ __device__ constexpr const auto& AsType() const
1907  {
1909  "Something went wrong, please check src and dst types.");
1910 
1912  {
1913  return data_.d1x1_;
1914  }
1915  else
1916  {
1917  return err;
1918  }
1919  }
1920 
1921  template <typename X>
1922  __host__ __device__ constexpr auto& AsType()
1923  {
1925  "Something went wrong, please check src and dst types.");
1926 
1928  {
1929  return data_.d1x1_;
1930  }
1931  else
1932  {
1933  return err;
1934  }
1935  }
1936 };
1937 
1938 template <typename T>
1939 struct vector_type<T, 2, typename ck::enable_if_t<!is_native_type<T>()>>
1940 {
1941  using d1_t = T;
1944 
1945  using type = d2_t;
1946 
1947  union alignas(next_pow2(2 * sizeof(T)))
1948  {
1952  } data_;
1953 
1954  __host__ __device__ constexpr vector_type() : data_{type{}} {}
1955 
1956  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
1957 
1958  template <typename X>
1959  __host__ __device__ constexpr const auto& AsType() const
1960  {
1963  "Something went wrong, please check src and dst types.");
1964 
1966  {
1967  return data_.d1x2_;
1968  }
1969  else if constexpr(is_same<X, d2_t>::value)
1970  {
1971  return data_.d2x1_;
1972  }
1973  else
1974  {
1975  return err;
1976  }
1977  }
1978 
1979  template <typename X>
1980  __host__ __device__ constexpr auto& AsType()
1981  {
1984  "Something went wrong, please check src and dst types.");
1985 
1987  {
1988  return data_.d1x2_;
1989  }
1990  else if constexpr(is_same<X, d2_t>::value)
1991  {
1992  return data_.d2x1_;
1993  }
1994  else
1995  {
1996  return err;
1997  }
1998  }
1999 };
2000 
2001 template <typename T>
2002 struct vector_type<T, 4, typename ck::enable_if_t<!is_native_type<T>()>>
2003 {
2004  using d1_t = T;
2008 
2009  using type = d4_t;
2010 
2011  union alignas(next_pow2(4 * sizeof(T)))
2012  {
2017  } data_;
2018 
2019  __host__ __device__ constexpr vector_type() : data_{type{}} {}
2020 
2021  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
2022 
2023  template <typename X>
2024  __host__ __device__ constexpr const auto& AsType() const
2025  {
2028  "Something went wrong, please check src and dst types.");
2029 
2031  {
2032  return data_.d1x4_;
2033  }
2034  else if constexpr(is_same<X, d2_t>::value)
2035  {
2036  return data_.d2x2_;
2037  }
2038  else if constexpr(is_same<X, d4_t>::value)
2039  {
2040  return data_.d4x1_;
2041  }
2042  else
2043  {
2044  return err;
2045  }
2046  }
2047 
2048  template <typename X>
2049  __host__ __device__ constexpr auto& AsType()
2050  {
2053  "Something went wrong, please check src and dst types.");
2054 
2056  {
2057  return data_.d1x4_;
2058  }
2059  else if constexpr(is_same<X, d2_t>::value)
2060  {
2061  return data_.d2x2_;
2062  }
2063  else if constexpr(is_same<X, d4_t>::value)
2064  {
2065  return data_.d4x1_;
2066  }
2067  else
2068  {
2069  return err;
2070  }
2071  }
2072 };
2073 
2074 template <typename T>
2075 struct vector_type<T, 8, typename ck::enable_if_t<!is_native_type<T>()>>
2076 {
2077  using d1_t = T;
2082 
2083  using type = d8_t;
2084 
2085  union alignas(next_pow2(8 * sizeof(T)))
2086  {
2092  } data_;
2093 
2094  __host__ __device__ constexpr vector_type() : data_{type{}} {}
2095 
2096  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
2097 
2098  template <typename X>
2099  __host__ __device__ constexpr const auto& AsType() const
2100  {
2104  "Something went wrong, please check src and dst types.");
2105 
2107  {
2108  return data_.d1x8_;
2109  }
2110  else if constexpr(is_same<X, d2_t>::value)
2111  {
2112  return data_.d2x4_;
2113  }
2114  else if constexpr(is_same<X, d4_t>::value)
2115  {
2116  return data_.d4x2_;
2117  }
2118  else if constexpr(is_same<X, d8_t>::value)
2119  {
2120  return data_.d8x1_;
2121  }
2122  else
2123  {
2124  return err;
2125  }
2126  }
2127 
2128  template <typename X>
2129  __host__ __device__ constexpr auto& AsType()
2130  {
2134  "Something went wrong, please check src and dst types.");
2135 
2137  {
2138  return data_.d1x8_;
2139  }
2140  else if constexpr(is_same<X, d2_t>::value)
2141  {
2142  return data_.d2x4_;
2143  }
2144  else if constexpr(is_same<X, d4_t>::value)
2145  {
2146  return data_.d4x2_;
2147  }
2148  else if constexpr(is_same<X, d8_t>::value)
2149  {
2150  return data_.d8x1_;
2151  }
2152  else
2153  {
2154  return err;
2155  }
2156  }
2157 };
2158 
2159 template <typename T>
2160 struct vector_type<T, 16, typename ck::enable_if_t<!is_native_type<T>()>>
2161 {
2162  using d1_t = T;
2168 
2169  using type = d16_t;
2170 
2171  union alignas(next_pow2(16 * sizeof(T)))
2172  {
2179  } data_;
2180 
2181  __host__ __device__ constexpr vector_type() : data_{type{}} {}
2182 
2183  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
2184 
2185  template <typename X>
2186  __host__ __device__ constexpr const auto& AsType() const
2187  {
2191  "Something went wrong, please check src and dst types.");
2192 
2194  {
2195  return data_.d1x16_;
2196  }
2197  else if constexpr(is_same<X, d2_t>::value)
2198  {
2199  return data_.d2x8_;
2200  }
2201  else if constexpr(is_same<X, d4_t>::value)
2202  {
2203  return data_.d4x4_;
2204  }
2205  else if constexpr(is_same<X, d8_t>::value)
2206  {
2207  return data_.d8x2_;
2208  }
2209  else if constexpr(is_same<X, d16_t>::value)
2210  {
2211  return data_.d16x1_;
2212  }
2213  else
2214  {
2215  return err;
2216  }
2217  }
2218 
2219  template <typename X>
2220  __host__ __device__ constexpr auto& AsType()
2221  {
2225  "Something went wrong, please check src and dst types.");
2226 
2228  {
2229  return data_.d1x16_;
2230  }
2231  else if constexpr(is_same<X, d2_t>::value)
2232  {
2233  return data_.d2x8_;
2234  }
2235  else if constexpr(is_same<X, d4_t>::value)
2236  {
2237  return data_.d4x4_;
2238  }
2239  else if constexpr(is_same<X, d8_t>::value)
2240  {
2241  return data_.d8x2_;
2242  }
2243  else if constexpr(is_same<X, d16_t>::value)
2244  {
2245  return data_.d16x1_;
2246  }
2247  else
2248  {
2249  return err;
2250  }
2251  }
2252 };
2253 
2254 template <typename T>
2255 struct vector_type<T, 32, typename ck::enable_if_t<!is_native_type<T>()>>
2256 {
2257  using d1_t = T;
2263 
2264  using type = d32_t;
2265 
2266  union alignas(next_pow2(32 * sizeof(T)))
2267  {
2275  } data_;
2276 
2277  __host__ __device__ constexpr vector_type() : data_{type{}} {}
2278 
2279  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
2280 
2281  template <typename X>
2282  __host__ __device__ constexpr const auto& AsType() const
2283  {
2287  "Something went wrong, please check src and dst types.");
2288 
2289  if constexpr(is_same<X, d1_t>::value)
2290  {
2291  return data_.d1x32_;
2292  }
2293  else if constexpr(is_same<X, d2_t>::value)
2294  {
2295  return data_.d2x16_;
2296  }
2297  else if constexpr(is_same<X, d4_t>::value)
2298  {
2299  return data_.d4x8_;
2300  }
2301  else if constexpr(is_same<X, d8_t>::value)
2302  {
2303  return data_.d8x4_;
2304  }
2305  else if constexpr(is_same<X, d16_t>::value)
2306  {
2307  return data_.d16x2_;
2308  }
2309  else if constexpr(is_same<X, d32_t>::value)
2310  {
2311  return data_.d32x1_;
2312  }
2313  else
2314  {
2315  return err;
2316  }
2317  }
2318 
2319  template <typename X>
2320  __host__ __device__ constexpr auto& AsType()
2321  {
2325  "Something went wrong, please check src and dst types.");
2326 
2327  if constexpr(is_same<X, d1_t>::value)
2328  {
2329  return data_.d1x32_;
2330  }
2331  else if constexpr(is_same<X, d2_t>::value)
2332  {
2333  return data_.d2x16_;
2334  }
2335  else if constexpr(is_same<X, d4_t>::value)
2336  {
2337  return data_.d4x8_;
2338  }
2339  else if constexpr(is_same<X, d8_t>::value)
2340  {
2341  return data_.d8x4_;
2342  }
2343  else if constexpr(is_same<X, d16_t>::value)
2344  {
2345  return data_.d16x2_;
2346  }
2347  else if constexpr(is_same<X, d32_t>::value)
2348  {
2349  return data_.d32x1_;
2350  }
2351  else
2352  {
2353  return err;
2354  }
2355  }
2356 };
2357 
2358 template <typename T>
2359 struct vector_type<T, 64, typename ck::enable_if_t<!is_native_type<T>()>>
2360 {
2361  using d1_t = T;
2368 
2369  using type = d64_t;
2370 
2371  union alignas(next_pow2(64 * sizeof(T)))
2372  {
2381  } data_;
2382 
2383  __host__ __device__ constexpr vector_type() : data_{type{}} {}
2384 
2385  __host__ __device__ constexpr vector_type(type v) : data_{v} {}
2386 
2387  template <typename X>
2388  __host__ __device__ constexpr const auto& AsType() const
2389  {
2394  "Something went wrong, please check src and dst types.");
2395 
2396  if constexpr(is_same<X, d1_t>::value)
2397  {
2398  return data_.d1x64_;
2399  }
2400  else if constexpr(is_same<X, d2_t>::value)
2401  {
2402  return data_.d2x32_;
2403  }
2404  else if constexpr(is_same<X, d4_t>::value)
2405  {
2406  return data_.d4x16_;
2407  }
2408  else if constexpr(is_same<X, d8_t>::value)
2409  {
2410  return data_.d8x8_;
2411  }
2412  else if constexpr(is_same<X, d16_t>::value)
2413  {
2414  return data_.d16x4_;
2415  }
2416  else if constexpr(is_same<X, d32_t>::value)
2417  {
2418  return data_.d32x2_;
2419  }
2420  else if constexpr(is_same<X, d64_t>::value)
2421  {
2422  return data_.d64x1_;
2423  }
2424  else
2425  {
2426  return err;
2427  }
2428  }
2429 
2430  template <typename X>
2431  __host__ __device__ constexpr auto& AsType()
2432  {
2437  "Something went wrong, please check src and dst types.");
2438 
2439  if constexpr(is_same<X, d1_t>::value)
2440  {
2441  return data_.d1x64_;
2442  }
2443  else if constexpr(is_same<X, d2_t>::value)
2444  {
2445  return data_.d2x32_;
2446  }
2447  else if constexpr(is_same<X, d4_t>::value)
2448  {
2449  return data_.d4x16_;
2450  }
2451  else if constexpr(is_same<X, d8_t>::value)
2452  {
2453  return data_.d8x8_;
2454  }
2455  else if constexpr(is_same<X, d16_t>::value)
2456  {
2457  return data_.d16x4_;
2458  }
2459  else if constexpr(is_same<X, d32_t>::value)
2460  {
2461  return data_.d32x2_;
2462  }
2463  else if constexpr(is_same<X, d64_t>::value)
2464  {
2465  return data_.d64x1_;
2466  }
2467  else
2468  {
2469  return err;
2470  }
2471  }
2472 };
2473 
2474 using int64_t = long;
2475 
2476 // fp64
2479 
2480 // fp32
2487 
2488 // fp16
2495 
2496 // bfp16
2503 
2504 // i32
2511 
2512 // i8
2519 
2520 // f8
2527 
2528 // bf8
2535 
2536 // f8
2543 
2544 // bf8
2551 
2552 #if CK_FP8_TYPE_OCP
2553 // f8
2554 using f8x2_t = f8x2_ocp_t;
2555 using f8x4_t = f8x4_ocp_t;
2556 using f8x8_t = f8x8_ocp_t;
2557 using f8x16_t = f8x16_ocp_t;
2558 using f8x32_t = f8x32_ocp_t;
2559 using f8x64_t = f8x64_ocp_t;
2560 
2561 // bf8
2562 using bf8x2_t = bf8x2_ocp_t;
2563 using bf8x4_t = bf8x4_ocp_t;
2564 using bf8x8_t = bf8x8_ocp_t;
2565 using bf8x16_t = bf8x16_ocp_t;
2566 using bf8x32_t = bf8x32_ocp_t;
2567 using bf8x64_t = bf8x64_ocp_t;
2568 #elif CK_FP8_TYPE_FNUZ
2569 // f8
2570 using f8x2_t = f8x2_fnuz_t;
2571 using f8x4_t = f8x4_fnuz_t;
2572 using f8x8_t = f8x8_fnuz_t;
2573 using f8x16_t = f8x16_fnuz_t;
2574 using f8x32_t = f8x32_fnuz_t;
2575 using f8x64_t = f8x64_fnuz_t;
2576 
2577 // bf8
2578 using bf8x2_t = bf8x2_fnuz_t;
2579 using bf8x4_t = bf8x4_fnuz_t;
2580 using bf8x8_t = bf8x8_fnuz_t;
2581 using bf8x16_t = bf8x16_fnuz_t;
2582 using bf8x32_t = bf8x32_fnuz_t;
2583 using bf8x64_t = bf8x64_fnuz_t;
2584 #endif
2585 
2586 // u8
2593 
2594 // f4
2601 
2602 // f6
2605 
2606 // bf6
2609 
2610 // pack int4
2614 
2615 #ifdef CK_CODE_GEN_RTC
2616 template <typename T>
2617 struct NumericLimits;
2618 
2619 template <>
2620 struct NumericLimits<int32_t>
2621 {
2622  __host__ __device__ static constexpr int32_t Lowest() noexcept { return -2147483647 - 1; }
2623 
2624  __host__ __device__ static constexpr int32_t Min() noexcept { return -2147483647 - 1; }
2625 
2626  __host__ __device__ static constexpr int32_t Max() noexcept { return 2147483647; }
2627 
2628  __host__ __device__ static constexpr int32_t Infinity() noexcept { return 0; }
2629 
2630  __host__ __device__ static constexpr int32_t QuietNaN() { return 0; }
2631 };
2632 template <>
2633 struct NumericLimits<int16_t>
2634 {
2635  __host__ __device__ static constexpr int16_t Lowest() noexcept { return -32768; }
2636 
2637  __host__ __device__ static constexpr int16_t Min() noexcept { return -32768; }
2638 
2639  __host__ __device__ static constexpr int16_t Max() noexcept { return 32767; }
2640 
2641  __host__ __device__ static constexpr int16_t Infinity() noexcept { return 0; }
2642 
2643  __host__ __device__ static constexpr int16_t QuietNaN() { return 0; }
2644 };
2645 
2646 template <>
2647 struct NumericLimits<int8_t>
2648 {
2649  __host__ __device__ static constexpr int8_t Lowest() noexcept { return -128; }
2650 
2651  __host__ __device__ static constexpr int8_t Min() noexcept { return -128; }
2652 
2653  __host__ __device__ static constexpr int8_t Max() noexcept { return 127; }
2654 
2655  __host__ __device__ static constexpr int8_t Infinity() noexcept { return 0; }
2656 
2657  __host__ __device__ static constexpr int8_t QuietNaN() { return 0; }
2658 };
2659 
2660 template <>
2661 struct NumericLimits<uint32_t>
2662 {
2663  __host__ __device__ static constexpr uint32_t Lowest() noexcept { return 0; }
2664 
2665  __host__ __device__ static constexpr uint32_t Min() noexcept { return 0; }
2666 
2667  __host__ __device__ static constexpr uint32_t Max() noexcept { return 4294967295U; }
2668 
2669  __host__ __device__ static constexpr uint32_t Infinity() noexcept { return 0; }
2670 
2671  __host__ __device__ static constexpr uint32_t QuietNaN() { return 0; }
2672 };
2673 
2674 template <>
2675 struct NumericLimits<uint16_t>
2676 {
2677  __host__ __device__ static constexpr uint16_t Lowest() noexcept { return 0; }
2678 
2679  __host__ __device__ static constexpr uint16_t Min() noexcept { return 0; }
2680 
2681  __host__ __device__ static constexpr uint16_t Max() noexcept { return 65535U; }
2682 
2683  __host__ __device__ static constexpr uint16_t Infinity() noexcept { return 0; }
2684 
2685  __host__ __device__ static constexpr uint16_t QuietNaN() { return 0; }
2686 };
2687 
2688 template <>
2689 struct NumericLimits<float>
2690 {
2691  static constexpr unsigned int binary_min = 0x00800000;
2692  static constexpr unsigned int binary_max = 0x7F7FFFFF;
2693  static constexpr unsigned int binary_lowest = 0xFF7FFFFF;
2694  static constexpr unsigned int binary_qnan = 0xFFC00001;
2695  static constexpr unsigned int binary_inf = 0x7F8000000;
2696 
2697  __host__ __device__ static constexpr float Min() { return bit_cast<float>(binary_min); }
2698 
2699  __host__ __device__ static constexpr float Max() { return bit_cast<float>(binary_max); }
2700 
2701  __host__ __device__ static constexpr float Lowest() { return bit_cast<float>(binary_lowest); }
2702 
2703  __host__ __device__ static constexpr float QuietNaN() { return bit_cast<float>(binary_qnan); }
2704 
2705  __host__ __device__ static constexpr float Infinity() { return bit_cast<float>(binary_inf); }
2706 };
2707 
2708 template <>
2709 struct NumericLimits<half_t>
2710 {
2711  static constexpr unsigned short binary_min = 0x0400;
2712  static constexpr unsigned short binary_max = 0x7BFF;
2713  static constexpr unsigned short binary_lowest = 0xFBFF;
2714  static constexpr unsigned short binary_qnan = 0x7FFF;
2715 
2716  __host__ __device__ static constexpr half_t Min() { return bit_cast<half_t>(binary_min); }
2717 
2718  __host__ __device__ static constexpr half_t Max() { return bit_cast<half_t>(binary_max); }
2719 
2720  __host__ __device__ static constexpr half_t Lowest() { return bit_cast<half_t>(binary_lowest); }
2721 
2722  __host__ __device__ static constexpr half_t QuietNaN() { return bit_cast<half_t>(binary_qnan); }
2723 };
2724 
2725 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
2726 template <>
2727 struct NumericLimits<int4_t>
2728 {
2729  __host__ __device__ static constexpr int4_t Min() { return int4_t(-8); }
2730 
2731  __host__ __device__ static constexpr int4_t Max() { return int4_t(7); }
2732 
2733  __host__ __device__ static constexpr int4_t Lowest() { return int4_t(-8); }
2734 };
2735 #endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
2736 
2737 template <>
2738 struct NumericLimits<f8_fnuz_t>
2739 {
2740  // negative zero nan mode with exp bias = 8
2741  static constexpr uint8_t binary_min = 0x08; // 0b00001000
2742  static constexpr uint8_t binary_max = 0x7F; // 0b01111111
2743  static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
2744  static constexpr uint8_t binary_qnan = 0x80; // 0b10000000
2745  // ieee mode with exp bias = 7
2746  // static constexpr uint8_t binary_min = 0x08; // 0b00001000
2747  // static constexpr uint8_t binary_max = 0x77; // 0b01110111
2748  // static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111
2749  // static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!=0
2750 
2751  __host__ __device__ static constexpr f8_fnuz_t Min() { return f8_fnuz_t(binary_min); }
2752 
2753  __host__ __device__ static constexpr f8_fnuz_t Max() { return f8_fnuz_t(binary_max); }
2754 
2755  __host__ __device__ static constexpr f8_fnuz_t Lowest() { return f8_fnuz_t(binary_lowest); }
2756 
2757  __host__ __device__ static constexpr f8_fnuz_t QuietNaN() { return f8_fnuz_t(binary_qnan); }
2758 };
2759 
2760 template <>
2761 struct NumericLimits<bf8_fnuz_t>
2762 {
2763  // negative zero nan mode with exp bias = 16
2764  static constexpr uint8_t binary_min = 0x04; // 0b00000100
2765  static constexpr uint8_t binary_max = 0x7F; // 0b01111111
2766  static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
2767  static constexpr uint8_t binary_qnan = 0x80; // 0b10000000
2768  // ieee mode with exp bias = 15
2769  // static constexpr uint8_t binary_min = 0x04; // 0b00000100
2770  // static constexpr uint8_t binary_max = 0x7B; // 0b01111011
2771  // static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011
2772  // static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!=
2773 
2774  __host__ __device__ static constexpr bf8_fnuz_t Min() { return bf8_fnuz_t(binary_min); }
2775 
2776  __host__ __device__ static constexpr bf8_fnuz_t Max() { return bf8_fnuz_t(binary_max); }
2777 
2778  __host__ __device__ static constexpr bf8_fnuz_t Lowest() { return bf8_fnuz_t(binary_lowest); }
2779 
2780  __host__ __device__ static constexpr bf8_fnuz_t QuietNaN() { return bf8_fnuz_t(binary_qnan); }
2781 };
2782 
2783 template <>
2784 struct NumericLimits<f8_ocp_t>
2785 {
2786  static constexpr uint8_t binary_min = 0x08; // 0b00001000 = 2^-6
2787  static constexpr uint8_t binary_max = 0x7E; // 0b01111110 = 448
2788  static constexpr uint8_t binary_lowest = 0xFE; // 0b11111110 = -448
2789  static constexpr uint8_t binary_qnan = 0x7F; // 0b01111111
2790 
2791  __host__ __device__ static constexpr f8_ocp_t Min() { return bit_cast<f8_ocp_t>(binary_min); }
2792 
2793  __host__ __device__ static constexpr f8_ocp_t Max() { return bit_cast<f8_ocp_t>(binary_max); }
2794 
2795  __host__ __device__ static constexpr f8_ocp_t Lowest()
2796  {
2797  return bit_cast<f8_ocp_t>(binary_lowest);
2798  }
2799 
2800  __host__ __device__ static constexpr f8_ocp_t QuietNaN()
2801  {
2802  return bit_cast<f8_ocp_t>(binary_qnan);
2803  }
2804 };
2805 
2806 template <>
2807 struct NumericLimits<bf8_ocp_t>
2808 {
2809  static constexpr uint8_t binary_min = 0x04; // 0b00000100 = 2^-14
2810  static constexpr uint8_t binary_max = 0x7B; // 0b01111011 = 57344
2811  static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 = -57344
2812  static constexpr uint8_t binary_qnan = 0x7D; // 0b01111101
2813 
2814  __host__ __device__ static constexpr bf8_ocp_t Min() { return bit_cast<bf8_ocp_t>(binary_min); }
2815 
2816  __host__ __device__ static constexpr bf8_ocp_t Max() { return bit_cast<bf8_ocp_t>(binary_max); }
2817 
2818  __host__ __device__ static constexpr bf8_ocp_t Lowest()
2819  {
2820  return bit_cast<bf8_ocp_t>(binary_lowest);
2821  }
2822 
2823  __host__ __device__ static constexpr bf8_ocp_t QuietNaN()
2824  {
2825  return bit_cast<bf8_ocp_t>(binary_qnan);
2826  }
2827 };
2828 #else
2829 template <typename T>
2831 {
2832  __host__ __device__ static constexpr T Min() { return std::numeric_limits<T>::min(); }
2833  __host__ __device__ static constexpr T Max() { return std::numeric_limits<T>::max(); }
2834  __host__ __device__ static constexpr T Lowest() { return std::numeric_limits<T>::lowest(); }
2835  __host__ __device__ static constexpr T QuietNaN()
2836  {
2837  return std::numeric_limits<T>::quiet_NaN();
2838  }
2839  __host__ __device__ static constexpr T Infinity() { return std::numeric_limits<T>::infinity(); }
2840 };
2841 
2842 template <>
2844 {
2845  static constexpr unsigned short binary_min = 0x0400;
2846  static constexpr unsigned short binary_max = 0x7BFF;
2847  static constexpr unsigned short binary_lowest = 0xFBFF;
2848  static constexpr unsigned short binary_qnan = 0x7FFF;
2849 
2850  __host__ __device__ static constexpr half_t Min() { return bit_cast<half_t>(binary_min); }
2851 
2852  __host__ __device__ static constexpr half_t Max() { return bit_cast<half_t>(binary_max); }
2853 
2854  __host__ __device__ static constexpr half_t Lowest() { return bit_cast<half_t>(binary_lowest); }
2855 
2856  __host__ __device__ static constexpr half_t QuietNaN() { return bit_cast<half_t>(binary_qnan); }
2857 };
2858 
2859 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
2860 template <>
2861 struct NumericLimits<int4_t>
2862 {
2863  __host__ __device__ static constexpr int4_t Min() { return int4_t(-8); }
2864 
2865  __host__ __device__ static constexpr int4_t Max() { return int4_t(7); }
2866 
2867  __host__ __device__ static constexpr int4_t Lowest() { return int4_t(-8); }
2868 };
2869 #endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
2870 
2871 template <>
2873 {
2874  // negative zero nan mode with exp bias = 8
2875  static constexpr uint8_t binary_min = 0x08; // 0b00001000
2876  static constexpr uint8_t binary_max = 0x7F; // 0b01111111
2877  static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
2878  static constexpr uint8_t binary_qnan = 0x80; // 0b10000000
2879  // ieee mode with exp bias = 7
2880  // static constexpr uint8_t binary_min = 0x08; // 0b00001000
2881  // static constexpr uint8_t binary_max = 0x77; // 0b01110111
2882  // static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111
2883  // static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!=0
2884 
2885  __host__ __device__ static constexpr f8_fnuz_t Min() { return f8_fnuz_t(binary_min); }
2886 
2887  __host__ __device__ static constexpr f8_fnuz_t Max() { return f8_fnuz_t(binary_max); }
2888 
2889  __host__ __device__ static constexpr f8_fnuz_t Lowest() { return f8_fnuz_t(binary_lowest); }
2890 
2891  __host__ __device__ static constexpr f8_fnuz_t QuietNaN() { return f8_fnuz_t(binary_qnan); }
2892 };
2893 
2894 template <>
2896 {
2897  // negative zero nan mode with exp bias = 16
2898  static constexpr uint8_t binary_min = 0x04; // 0b00000100
2899  static constexpr uint8_t binary_max = 0x7F; // 0b01111111
2900  static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
2901  static constexpr uint8_t binary_qnan = 0x80; // 0b10000000
2902  // ieee mode with exp bias = 15
2903  // static constexpr uint8_t binary_min = 0x04; // 0b00000100
2904  // static constexpr uint8_t binary_max = 0x7B; // 0b01111011
2905  // static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011
2906  // static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!=
2907 
2908  __host__ __device__ static constexpr bf8_fnuz_t Min() { return bf8_fnuz_t(binary_min); }
2909 
2910  __host__ __device__ static constexpr bf8_fnuz_t Max() { return bf8_fnuz_t(binary_max); }
2911 
2912  __host__ __device__ static constexpr bf8_fnuz_t Lowest() { return bf8_fnuz_t(binary_lowest); }
2913 
2914  __host__ __device__ static constexpr bf8_fnuz_t QuietNaN() { return bf8_fnuz_t(binary_qnan); }
2915 };
2916 
2917 template <>
2919 {
2920  static constexpr uint8_t binary_min = 0x08; // 0b00001000 = 2^-6
2921  static constexpr uint8_t binary_max = 0x7E; // 0b01111110 = 448
2922  static constexpr uint8_t binary_lowest = 0xFE; // 0b11111110 = -448
2923  static constexpr uint8_t binary_qnan = 0x7F; // 0b01111111
2924 
2925  __host__ __device__ static constexpr f8_ocp_t Min() { return bit_cast<f8_ocp_t>(binary_min); }
2926 
2927  __host__ __device__ static constexpr f8_ocp_t Max() { return bit_cast<f8_ocp_t>(binary_max); }
2928 
2929  __host__ __device__ static constexpr f8_ocp_t Lowest()
2930  {
2931  return bit_cast<f8_ocp_t>(binary_lowest);
2932  }
2933 
2934  __host__ __device__ static constexpr f8_ocp_t QuietNaN()
2935  {
2936  return bit_cast<f8_ocp_t>(binary_qnan);
2937  }
2938 };
2939 
2940 template <>
2942 {
2943  static constexpr uint8_t binary_min = 0x04; // 0b00000100 = 2^-14
2944  static constexpr uint8_t binary_max = 0x7B; // 0b01111011 = 57344
2945  static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 = -57344
2946  static constexpr uint8_t binary_qnan = 0x7D; // 0b01111101
2947 
2948  __host__ __device__ static constexpr bf8_ocp_t Min() { return bit_cast<bf8_ocp_t>(binary_min); }
2949 
2950  __host__ __device__ static constexpr bf8_ocp_t Max() { return bit_cast<bf8_ocp_t>(binary_max); }
2951 
2952  __host__ __device__ static constexpr bf8_ocp_t Lowest()
2953  {
2954  return bit_cast<bf8_ocp_t>(binary_lowest);
2955  }
2956 
2957  __host__ __device__ static constexpr bf8_ocp_t QuietNaN()
2958  {
2959  return bit_cast<bf8_ocp_t>(binary_qnan);
2960  }
2961 };
2962 #endif
2963 
2964 template <>
2966 {
2967  static constexpr uint8_t binary_min_normal = 0x2; // 0b0010
2968  static constexpr uint8_t binary_max_normal = 0x7; // 0b0111
2969  static constexpr uint8_t binary_lowest_normal = 0xF; // 0b1111
2970  static constexpr uint8_t binary_min_subnorm = 0x1; // 0b0001
2971  static constexpr uint8_t binary_max_subnorm = 0x1; // 0b0001
2972 
2973  static constexpr float data_max_normal_number = 6;
2974  static constexpr float data_min_subnormal_number = 0.5;
2975 
2976  __host__ __device__ static constexpr f4_t Min() { return f4_t(binary_min_normal); }
2977  __host__ __device__ static constexpr f4_t Max() { return f4_t(binary_max_normal); }
2978  __host__ __device__ static constexpr f4_t Lowest() { return f4_t(binary_lowest_normal); }
2979  __host__ __device__ static constexpr f4_t MinSubnorm() { return f4_t(binary_min_subnorm); }
2980  __host__ __device__ static constexpr f4_t MaxSubnorm() { return f4_t(binary_max_subnorm); }
2981 
2982  __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
2983  __host__ __device__ static constexpr float DataMinSubnorm()
2984  {
2985  return data_min_subnormal_number;
2986  }
2987 };
2988 
2989 template <>
2991 {
2992  static constexpr uint8_t binary_min_normal = 0x08; // 0b001000
2993  static constexpr uint8_t binary_max_normal = 0x1F; // 0b011111
2994  static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
2995  static constexpr uint8_t binary_min_subnorm = 0x01; // 0b000001
2996  static constexpr uint8_t binary_max_subnorm = 0x07; // 0b000111
2997 
2998  static constexpr float data_max_normal_number = 7.5;
2999  static constexpr float data_min_subnormal_number = 0.125;
3000 
3001  __host__ __device__ static constexpr f6_t Min() { return f6_t(binary_min_normal & 0b111111); }
3002  __host__ __device__ static constexpr f6_t Max() { return f6_t(binary_max_normal & 0b111111); }
3003  __host__ __device__ static constexpr f6_t Lowest()
3004  {
3005  return f6_t(binary_lowest_normal & 0b111111);
3006  }
3007  __host__ __device__ static constexpr f6_t MinSubnorm()
3008  {
3009  return f6_t(binary_min_subnorm & 0b111111);
3010  }
3011  __host__ __device__ static constexpr f6_t MaxSubnorm()
3012  {
3013  return f6_t(binary_max_subnorm & 0b111111);
3014  }
3015 
3016  __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
3017  __host__ __device__ static constexpr float DataMinSubnorm()
3018  {
3019  return data_min_subnormal_number;
3020  }
3021 };
3022 
3023 template <>
3025 {
3026  static constexpr uint8_t binary_min_normal = 0x08; // 0b001000
3027  static constexpr uint8_t binary_max_normal = 0x1F; // 0b011111
3028  static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
3029  static constexpr uint8_t binary_min_subnorm = 0x01; // 0b000001
3030  static constexpr uint8_t binary_max_subnorm = 0x03; // 0b000011
3031 
3032  static constexpr float data_max_normal_number = 28;
3033  static constexpr float data_min_subnormal_number = 0.0625;
3034 
3035  __host__ __device__ static constexpr bf6_t Min() { return bf6_t(binary_min_normal); }
3036  __host__ __device__ static constexpr bf6_t Max() { return bf6_t(binary_max_normal); }
3037  __host__ __device__ static constexpr bf6_t Lowest() { return bf6_t(binary_lowest_normal); }
3038  __host__ __device__ static constexpr bf6_t MinSubnorm() { return bf6_t(binary_min_subnorm); }
3039  __host__ __device__ static constexpr bf6_t MaxSubnorm() { return bf6_t(binary_max_subnorm); }
3040 
3041  __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
3042  __host__ __device__ static constexpr float DataMinSubnorm()
3043  {
3044  return data_min_subnormal_number;
3045  }
3046 };
3047 
3048 template <>
3050 {
3051  static constexpr e8m0_bexp_t binary_min = 0x00; // 0b00000000
3052  static constexpr e8m0_bexp_t binary_max = 0xFE; // 0b11111110
3053  static constexpr e8m0_bexp_t binary_qnan = 0xFF; // 0b11111111
3054  static constexpr e8m0_bexp_t binary_1 = 0x7F; // 0b01111111
3055  static constexpr e8m0_bexp_t binary_2 = 0x80; // 0b10000000
3056  static constexpr e8m0_bexp_t binary_3 = 0x82; // 0b10000010
3057  static constexpr e8m0_bexp_t binary_135 = 0x87; // 0b10000111
3058  static constexpr e8m0_bexp_t binary_142 = 0x8E; // 0b10001110
3059 
3060  __host__ __device__ static constexpr e8m0_bexp_t Min() { return e8m0_bexp_t(binary_min); }
3061  __host__ __device__ static constexpr e8m0_bexp_t Max() { return e8m0_bexp_t(binary_max); }
3062  __host__ __device__ static constexpr e8m0_bexp_t QuietNaN() { return e8m0_bexp_t(binary_qnan); }
3063  __host__ __device__ static constexpr e8m0_bexp_t Binary_1() { return e8m0_bexp_t(binary_1); }
3064  __host__ __device__ static constexpr e8m0_bexp_t Binary_2() { return e8m0_bexp_t(binary_2); }
3065  __host__ __device__ static constexpr e8m0_bexp_t Binary_3() { return e8m0_bexp_t(binary_3); }
3066  __host__ __device__ static constexpr e8m0_bexp_t Binary_135()
3067  {
3068  return e8m0_bexp_t(binary_135);
3069  }
3070  __host__ __device__ static constexpr e8m0_bexp_t Binary_142()
3071  {
3072  return e8m0_bexp_t(binary_142);
3073  }
3074 };
3075 
3076 template <typename T>
3078 {
3079 };
3080 
3081 template <>
3082 struct NumericUtils<float>
3083 {
3084  static constexpr int exp = 8;
3085  static constexpr int mant = 23;
3086  static constexpr int bias = 127;
3087  static constexpr uint32_t nan_mask = 0x7F800000;
3088  static constexpr uint32_t head_mask = 0xFF800000;
3089  static constexpr uint32_t mant_mask = 0x7FFFFF;
3090  static constexpr uint32_t exp_mask = 0xFF;
3091  static constexpr uint32_t Inf = 0x7F800000;
3092  static constexpr uint32_t NegInf = 0xFF800000;
3093  static constexpr uint32_t NaN = 0x7F800001;
3094  static constexpr uint32_t Neg0 = 0x80000000;
3095  static constexpr bool has_inf = true;
3096  using bitwise_type = uint32_t;
3097 };
3098 
3099 template <>
3101 {
3102  static constexpr int exp = 5;
3103  static constexpr int mant = 10;
3104  static constexpr int bias = 15;
3105  static constexpr uint16_t nan_mask = 0x7C00;
3106  static constexpr uint16_t head_mask = 0xFC00;
3107  static constexpr uint16_t mant_mask = 0x3FF;
3108  static constexpr uint16_t exp_mask = 0x1F;
3109  static constexpr uint32_t Inf = 0x7C00;
3110  static constexpr uint32_t NegInf = 0xFC00;
3111  static constexpr uint32_t NaN = 0x7C01;
3112  static constexpr uint32_t Neg0 = 0x8000;
3113  static constexpr bool has_inf = true;
3114  using bitwise_type = uint16_t;
3115 };
3116 
3117 template <>
3119 {
3120  static constexpr int exp = 8;
3121  static constexpr int mant = 7;
3122  static constexpr int bias = 128; // negative zero nan mode
3123  // static constexpr int bias = 127; // ieee mode
3124 };
3125 
3126 template <>
3128 {
3129  static constexpr int exp = 4;
3130  static constexpr int mant = 3;
3131  static constexpr int bias = 8; // negative zero nan mode
3132  // static constexpr int bias = 7; // ieee mode
3133  static constexpr bool has_inf = false;
3134 };
3135 
3136 template <>
3138 {
3139  static constexpr int exp = 5;
3140  static constexpr int mant = 2;
3141  static constexpr int bias = 16; // negative zero nan mode
3142  // static constexpr int bias = 15; // ieee mode
3143  static constexpr bool has_inf = false;
3144 };
3145 template <>
3147 {
3148  static constexpr int exp = 4;
3149  static constexpr int mant = 3;
3150  static constexpr int bias = 7;
3151 };
3152 
3153 template <>
3155 {
3156  static constexpr int exp = 5;
3157  static constexpr int mant = 2;
3158  static constexpr int bias = 15;
3159 };
3160 
3161 template <>
3163 {
3164  static constexpr int exp = 2;
3165  static constexpr int mant = 1;
3166  static constexpr int bias = 1;
3167  static constexpr uint32_t sr_shift = 10;
3168 
3169  static constexpr int unbiased_exp_min = 0;
3170  static constexpr int unbiased_exp_max = 2;
3171  static constexpr int biased_exp_min = 1;
3172  static constexpr int biased_exp_max = 3;
3173 
3174  static constexpr uint8_t positive_zero_mask = 0b0000;
3175  static constexpr uint8_t negative_zero_mask = 0b1000;
3176 
3177  static constexpr uint8_t one_mask = 0b0010;
3178  static constexpr uint8_t set_sign_mask = 0b0111;
3179 
3180  static constexpr uint8_t data_max_positive_normal_mask = 0b0111;
3181  static constexpr uint8_t data_max_negative_normal_mask = 0b1111;
3182 
3183  static constexpr uint8_t data_max_positive_subnormal_mask = 0b0001;
3184  static constexpr uint8_t data_max_negative_subnormal_mask = 0b1001;
3185 
3186  static constexpr bool has_inf = false;
3187 
3188  using bitwise_type = uint8_t;
3189 };
3190 
3191 template <>
3193 {
3194  static constexpr int exp = 2;
3195  static constexpr int mant = 3;
3196  static constexpr int bias = 1;
3197  static constexpr uint32_t sr_shift = 12;
3198 
3199  static constexpr int unbiased_exp_min = 0;
3200  static constexpr int unbiased_exp_max = 2;
3201  static constexpr int biased_exp_min = 1;
3202  static constexpr int biased_exp_max = 3;
3203 
3204  static constexpr uint8_t positive_zero_mask = 0b000000;
3205  static constexpr uint8_t negative_zero_mask = 0b100000;
3206 
3207  static constexpr uint8_t set_sign_mask = 0b011111;
3208 
3209  static constexpr uint8_t data_max_positive_normal_mask = 0b011111;
3210  static constexpr uint8_t data_max_negative_normal_mask = 0b111111;
3211 
3212  static constexpr uint8_t data_max_positive_subnormal_mask = 0b000111;
3213  static constexpr uint8_t data_max_negative_subnormal_mask = 0b100111;
3214 
3215  static constexpr bool has_inf = false;
3216  static constexpr bool has_nan = false;
3217  static constexpr bool has_zero = true;
3218 
3219  using bitwise_type = uint8_t;
3220 };
3221 
3222 template <>
3224 {
3225  static constexpr int exp = 3;
3226  static constexpr int mant = 2;
3227  static constexpr int bias = 3;
3228  static constexpr uint32_t sr_shift = 11;
3229 
3230  static constexpr int unbiased_exp_min = -2;
3231  static constexpr int unbiased_exp_max = 4;
3232  static constexpr int biased_exp_min = 1;
3233  static constexpr int biased_exp_max = 7;
3234 
3235  static constexpr uint8_t positive_zero_mask = 0b000000;
3236  static constexpr uint8_t negative_zero_mask = 0b100000;
3237 
3238  static constexpr uint8_t set_sign_mask = 0b011111;
3239 
3240  static constexpr uint8_t data_max_positive_normal_mask = 0b011111;
3241  static constexpr uint8_t data_max_negative_normal_mask = 0b111111;
3242 
3243  static constexpr uint8_t data_max_positive_subnormal_mask = 0b000011;
3244  static constexpr uint8_t data_max_negative_subnormal_mask = 0b100011;
3245 
3246  static constexpr bool has_inf = false;
3247  static constexpr bool has_nan = false;
3248  static constexpr bool has_zero = true;
3249 
3250  using bitwise_type = uint8_t;
3251 };
3252 
3253 template <>
3255 {
3256  static constexpr int exp = 8;
3257  static constexpr int mant = 0;
3258  static constexpr int bias = 127;
3259 
3260  static constexpr int unbiased_exp_min = -127;
3261  static constexpr int unbiased_exp_max = 127;
3262  static constexpr int biased_exp_min = 0;
3263  static constexpr int biased_exp_max = 254;
3264 
3265  using bitwise_type = uint8_t;
3266 };
3267 } // namespace ck
__host__ T exp(T x)
Definition: math_v2.hpp:391
__host__ constexpr __device__ T max(T x)
Definition: math.hpp:84
__host__ constexpr __device__ T min(T x)
Definition: math.hpp:116
bf8_t __attribute((ext_vector_type(2))) bf8x2_t
Definition: vector_type.hpp:195
bf8_t __attribute((ext_vector_type(4))) bf8x4_t
Definition: vector_type.hpp:196
bf8_t __attribute((ext_vector_type(16))) bf8x16_t
Definition: vector_type.hpp:198
int8_t int8_t
Definition: int8.hpp:20
bf8_t __attribute((ext_vector_type(64))) bf8x64_t
Definition: vector_type.hpp:200
bf8_t __attribute((ext_vector_type(8))) bf8x8_t
Definition: vector_type.hpp:197
bf8_t __attribute((ext_vector_type(32))) bf8x32_t
Definition: vector_type.hpp:199
Definition: ck.hpp:264
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition: statically_indexed_array.hpp:45
typename vector_type< f8_fnuz_t, 8 >::type f8x8_fnuz_t
Definition: data_type.hpp:2523
typename vector_type< bf6x16_pk_t, 1 >::type bf6x16_t
Definition: data_type.hpp:2607
typename vector_type< bf8_ocp_t, 4 >::type bf8x4_ocp_t
Definition: data_type.hpp:2546
__host__ constexpr __device__ Y bit_cast(const X &x)
Definition: type.hpp:309
typename vector_type< bhalf_t, 32 >::type bhalf32_t
Definition: data_type.hpp:2501
typename vector_type< f8_fnuz_t, 64 >::type f8x64_fnuz_t
Definition: data_type.hpp:2526
typename vector_type< float, 16 >::type float16_t
Definition: data_type.hpp:2484
typename vector_type< half_t, 32 >::type half32_t
Definition: data_type.hpp:2493
typename vector_type< bf8_ocp_t, 32 >::type bf8x32_ocp_t
Definition: data_type.hpp:2549
typename vector_type< pk_i4_t, 8 >::type pk_i4x8_t
Definition: data_type.hpp:2613
typename vector_type< f6x32_pk_t, 1 >::type f6x32_t
Definition: data_type.hpp:2604
typename vector_type< bhalf_t, 4 >::type bhalf4_t
Definition: data_type.hpp:2498
typename vector_type< int32_t, 2 >::type int32x2_t
Definition: data_type.hpp:2505
typename vector_type< pk_i4_t, 4 >::type pk_i4x4_t
Definition: data_type.hpp:2612
constexpr bool is_native_type()
Definition: data_type.hpp:336
typename vector_type< bhalf_t, 64 >::type bhalf64_t
Definition: data_type.hpp:2502
typename vector_type< int8_t, 2 >::type int8x2_t
Definition: data_type.hpp:2513
typename vector_type< uint8_t, 8 >::type uint8x8_t
Definition: data_type.hpp:2589
unsigned _BitInt(4) f4_t
Definition: data_type.hpp:27
typename vector_type< uint8_t, 32 >::type uint8x32_t
Definition: data_type.hpp:2591
_BitInt(6) f6_t
Definition: data_type.hpp:28
typename vector_type< int8_t, 32 >::type int8x32_t
Definition: data_type.hpp:2517
typename vector_type< f4x2_pk_t, 1 >::type f4x2_t
Definition: data_type.hpp:2595
typename vector_type< bhalf_t, 8 >::type bhalf8_t
Definition: data_type.hpp:2499
typename vector_type< double, 2 >::type double2_t
Definition: data_type.hpp:2477
typename vector_type< f6x16_pk_t, 1 >::type f6x16_t
Definition: data_type.hpp:2603
typename vector_type< float, 2 >::type float2_t
Definition: data_type.hpp:2481
typename vector_type< int8_t, 8 >::type int8x8_t
Definition: data_type.hpp:2515
typename vector_type< half_t, 4 >::type half4_t
Definition: data_type.hpp:2490
typename vector_type< uint8_t, 2 >::type uint8x2_t
Definition: data_type.hpp:2587
_Float16 half_t
Definition: data_type.hpp:25
typename vector_type< bf6x32_pk_t, 1 >::type bf6x32_t
Definition: data_type.hpp:2608
ushort bhalf_t
Definition: data_type.hpp:24
typename vector_type< f8_ocp_t, 2 >::type f8x2_ocp_t
Definition: data_type.hpp:2537
typename vector_type< int32_t, 8 >::type int32x8_t
Definition: data_type.hpp:2507
typename vector_type< f4x2_pk_t, 2 >::type f4x4_t
Definition: data_type.hpp:2596
typename vector_type< bf8_fnuz_t, 4 >::type bf8x4_fnuz_t
Definition: data_type.hpp:2530
typename vector_type< uint8_t, 16 >::type uint8x16_t
Definition: data_type.hpp:2590
typename vector_type< float, 8 >::type float8_t
Definition: data_type.hpp:2483
typename vector_type< bf8_ocp_t, 2 >::type bf8x2_ocp_t
Definition: data_type.hpp:2545
typename vector_type< f4x2_pk_t, 16 >::type f4x32_t
Definition: data_type.hpp:2599
typename vector_type< bhalf_t, 2 >::type bhalf2_t
Definition: data_type.hpp:2497
typename vector_type< f8_fnuz_t, 16 >::type f8x16_fnuz_t
Definition: data_type.hpp:2524
typename vector_type< f4x2_pk_t, 8 >::type f4x16_t
Definition: data_type.hpp:2598
typename vector_type< float, 4 >::type float4_t
Definition: data_type.hpp:2482
typename vector_type< int8_t, 64 >::type int8x64_t
Definition: data_type.hpp:2518
typename vector_type< int32_t, 64 >::type int32x64_t
Definition: data_type.hpp:2510
typename vector_type< int32_t, 16 >::type int32x16_t
Definition: data_type.hpp:2508
_BitInt(4) int4_t
Definition: data_type.hpp:26
unsigned _BitInt(6) bf6_t
Definition: data_type.hpp:29
typename vector_type< f8_ocp_t, 8 >::type f8x8_ocp_t
Definition: data_type.hpp:2539
unsigned _BitInt(8) bf8_fnuz_t
Definition: amd_ck_fp8.hpp:40
typename vector_type< pk_i4_t, 2 >::type pk_i4x2_t
Definition: data_type.hpp:2611
typename vector_type< f8_ocp_t, 32 >::type f8x32_ocp_t
Definition: data_type.hpp:2541
typename vector_type< bf8_fnuz_t, 16 >::type bf8x16_fnuz_t
Definition: data_type.hpp:2532
typename vector_type< uint8_t, 64 >::type uint8x64_t
Definition: data_type.hpp:2592
typename vector_type< f8_ocp_t, 64 >::type f8x64_ocp_t
Definition: data_type.hpp:2542
__host__ constexpr __device__ auto make_vector_type(Number< N >)
Definition: data_type.hpp:387
typename vector_type< int8_t, 16 >::type int8x16_t
Definition: data_type.hpp:2516
constexpr auto next_pow2(uint32_t x)
Definition: data_type.hpp:327
typename vector_type< double, 4 >::type double4_t
Definition: data_type.hpp:2478
typename vector_type< f4x2_pk_t, 32 >::type f4x64_t
Definition: data_type.hpp:2600
typename vector_type< bhalf_t, 16 >::type bhalf16_t
Definition: data_type.hpp:2500
typename vector_type< f8_ocp_t, 4 >::type f8x4_ocp_t
Definition: data_type.hpp:2538
typename vector_type< half_t, 2 >::type half2_t
Definition: data_type.hpp:2489
long int64_t
Definition: data_type.hpp:2474
typename vector_type< int32_t, 32 >::type int32x32_t
Definition: data_type.hpp:2509
typename vector_type< f8_ocp_t, 16 >::type f8x16_ocp_t
Definition: data_type.hpp:2540
typename vector_type< f8_fnuz_t, 32 >::type f8x32_fnuz_t
Definition: data_type.hpp:2525
typename vector_type< int32_t, 4 >::type int32x4_t
Definition: data_type.hpp:2506
int32_t index_t
Definition: ck.hpp:289
typename vector_type< bf8_fnuz_t, 2 >::type bf8x2_fnuz_t
Definition: data_type.hpp:2529
typename std::enable_if< B, T >::type enable_if_t
Definition: enable_if.hpp:13
typename vector_type< f8_fnuz_t, 4 >::type f8x4_fnuz_t
Definition: data_type.hpp:2522
typename vector_type< f8_fnuz_t, 2 >::type f8x2_fnuz_t
Definition: data_type.hpp:2521
typename vector_type< int8_t, 4 >::type int8x4_t
Definition: data_type.hpp:2514
typename vector_type< f4x2_pk_t, 4 >::type f4x8_t
Definition: data_type.hpp:2597
typename vector_type< bf8_ocp_t, 64 >::type bf8x64_ocp_t
Definition: data_type.hpp:2550
typename vector_type< half_t, 16 >::type half16_t
Definition: data_type.hpp:2492
typename vector_type< half_t, 64 >::type half64_t
Definition: data_type.hpp:2494
_BitInt(8) f8_fnuz_t
Definition: amd_ck_fp8.hpp:39
typename vector_type< uint8_t, 4 >::type uint8x4_t
Definition: data_type.hpp:2588
typename vector_type< float, 64 >::type float64_t
Definition: data_type.hpp:2486
typename vector_type< bf8_fnuz_t, 64 >::type bf8x64_fnuz_t
Definition: data_type.hpp:2534
typename vector_type< bf8_fnuz_t, 32 >::type bf8x32_fnuz_t
Definition: data_type.hpp:2533
typename vector_type< bf8_ocp_t, 16 >::type bf8x16_ocp_t
Definition: data_type.hpp:2548
typename vector_type< bf8_ocp_t, 8 >::type bf8x8_ocp_t
Definition: data_type.hpp:2547
typename vector_type< float, 32 >::type float32_t
Definition: data_type.hpp:2485
typename vector_type< half_t, 8 >::type half8_t
Definition: data_type.hpp:2491
typename vector_type< bf8_fnuz_t, 8 >::type bf8x8_fnuz_t
Definition: data_type.hpp:2531
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition: data_type.hpp:384
__host__ static constexpr __device__ float DataMaxNorm()
Definition: data_type.hpp:3041
__host__ static constexpr __device__ bf6_t MinSubnorm()
Definition: data_type.hpp:3038
__host__ static constexpr __device__ float DataMinSubnorm()
Definition: data_type.hpp:3042
__host__ static constexpr __device__ bf6_t MaxSubnorm()
Definition: data_type.hpp:3039
__host__ static constexpr __device__ bf6_t Max()
Definition: data_type.hpp:3036
__host__ static constexpr __device__ bf6_t Lowest()
Definition: data_type.hpp:3037
__host__ static constexpr __device__ bf6_t Min()
Definition: data_type.hpp:3035
__host__ static constexpr __device__ bf8_fnuz_t QuietNaN()
Definition: data_type.hpp:2914
__host__ static constexpr __device__ bf8_fnuz_t Min()
Definition: data_type.hpp:2908
__host__ static constexpr __device__ bf8_fnuz_t Lowest()
Definition: data_type.hpp:2912
__host__ static constexpr __device__ bf8_fnuz_t Max()
Definition: data_type.hpp:2910
__host__ static constexpr __device__ bf8_ocp_t Min()
Definition: data_type.hpp:2948
__host__ static constexpr __device__ bf8_ocp_t Lowest()
Definition: data_type.hpp:2952
__host__ static constexpr __device__ bf8_ocp_t Max()
Definition: data_type.hpp:2950
__host__ static constexpr __device__ bf8_ocp_t QuietNaN()
Definition: data_type.hpp:2957
__host__ static constexpr __device__ e8m0_bexp_t Binary_2()
Definition: data_type.hpp:3064
__host__ static constexpr __device__ e8m0_bexp_t Binary_142()
Definition: data_type.hpp:3070
__host__ static constexpr __device__ e8m0_bexp_t Max()
Definition: data_type.hpp:3061
__host__ static constexpr __device__ e8m0_bexp_t QuietNaN()
Definition: data_type.hpp:3062
__host__ static constexpr __device__ e8m0_bexp_t Binary_135()
Definition: data_type.hpp:3066
__host__ static constexpr __device__ e8m0_bexp_t Min()
Definition: data_type.hpp:3060
__host__ static constexpr __device__ e8m0_bexp_t Binary_1()
Definition: data_type.hpp:3063
__host__ static constexpr __device__ e8m0_bexp_t Binary_3()
Definition: data_type.hpp:3065
__host__ static constexpr __device__ float DataMinSubnorm()
Definition: data_type.hpp:2983
__host__ static constexpr __device__ f4_t Min()
Definition: data_type.hpp:2976
__host__ static constexpr __device__ f4_t Lowest()
Definition: data_type.hpp:2978
__host__ static constexpr __device__ float DataMaxNorm()
Definition: data_type.hpp:2982
__host__ static constexpr __device__ f4_t Max()
Definition: data_type.hpp:2977
__host__ static constexpr __device__ f4_t MaxSubnorm()
Definition: data_type.hpp:2980
__host__ static constexpr __device__ f4_t MinSubnorm()
Definition: data_type.hpp:2979
__host__ static constexpr __device__ float DataMaxNorm()
Definition: data_type.hpp:3016
__host__ static constexpr __device__ f6_t MinSubnorm()
Definition: data_type.hpp:3007
__host__ static constexpr __device__ f6_t MaxSubnorm()
Definition: data_type.hpp:3011
__host__ static constexpr __device__ f6_t Min()
Definition: data_type.hpp:3001
__host__ static constexpr __device__ f6_t Max()
Definition: data_type.hpp:3002
__host__ static constexpr __device__ f6_t Lowest()
Definition: data_type.hpp:3003
__host__ static constexpr __device__ float DataMinSubnorm()
Definition: data_type.hpp:3017
__host__ static constexpr __device__ f8_fnuz_t QuietNaN()
Definition: data_type.hpp:2891
__host__ static constexpr __device__ f8_fnuz_t Min()
Definition: data_type.hpp:2885
__host__ static constexpr __device__ f8_fnuz_t Max()
Definition: data_type.hpp:2887
__host__ static constexpr __device__ f8_fnuz_t Lowest()
Definition: data_type.hpp:2889
__host__ static constexpr __device__ f8_ocp_t Min()
Definition: data_type.hpp:2925
__host__ static constexpr __device__ f8_ocp_t Max()
Definition: data_type.hpp:2927
__host__ static constexpr __device__ f8_ocp_t QuietNaN()
Definition: data_type.hpp:2934
__host__ static constexpr __device__ f8_ocp_t Lowest()
Definition: data_type.hpp:2929
__host__ static constexpr __device__ half_t Max()
Definition: data_type.hpp:2852
__host__ static constexpr __device__ half_t Lowest()
Definition: data_type.hpp:2854
__host__ static constexpr __device__ half_t Min()
Definition: data_type.hpp:2850
__host__ static constexpr __device__ half_t QuietNaN()
Definition: data_type.hpp:2856
Definition: data_type.hpp:2831
__host__ static constexpr __device__ T Lowest()
Definition: data_type.hpp:2834
__host__ static constexpr __device__ T Infinity()
Definition: data_type.hpp:2839
__host__ static constexpr __device__ T QuietNaN()
Definition: data_type.hpp:2835
__host__ static constexpr __device__ T Min()
Definition: data_type.hpp:2832
__host__ static constexpr __device__ T Max()
Definition: data_type.hpp:2833
uint8_t bitwise_type
Definition: data_type.hpp:3250
uint8_t bitwise_type
Definition: data_type.hpp:3265
uint8_t bitwise_type
Definition: data_type.hpp:3188
uint8_t bitwise_type
Definition: data_type.hpp:3219
uint32_t bitwise_type
Definition: data_type.hpp:3096
uint16_t bitwise_type
Definition: data_type.hpp:3114
Definition: data_type.hpp:3078
__host__ constexpr __device__ const auto & At(Number< I >) const
Definition: statically_indexed_array.hpp:69
Definition: data_type.hpp:187
uint32_t element_type
Definition: data_type.hpp:189
__host__ __device__ type pack(const test_vec_t &x)
Definition: data_type.hpp:219
type data
Definition: data_type.hpp:191
int8_t test_vec_t
Definition: data_type.hpp:192
__host__ __device__ bf6_t unpack(Number< I >)
Definition: data_type.hpp:197
StaticallyIndexedArray_v2< element_type, 3 > type
Definition: data_type.hpp:190
bf6x16_pk_t()
Definition: data_type.hpp:193
bf6x16_pk_t(type init)
Definition: data_type.hpp:194
Definition: data_type.hpp:253
bf6x32_pk_t()
Definition: data_type.hpp:259
uint32_t element_type
Definition: data_type.hpp:255
type data
Definition: data_type.hpp:257
StaticallyIndexedArray_v2< element_type, 6 > type
Definition: data_type.hpp:256
__host__ __device__ bf6_t unpack(Number< I >)
Definition: data_type.hpp:263
__host__ __device__ type pack(const test_vec_t &x)
Definition: data_type.hpp:285
bf6x32_pk_t(type init)
Definition: data_type.hpp:260
int8_t test_vec_t
Definition: data_type.hpp:258
Definition: amd_ck_fp8.hpp:344
fp8_storage_t data_type
Definition: amd_ck_fp8.hpp:345
Unsigned representation of a conventional biased Float32 exponent.
Definition: e8m0.hpp:25
Definition: data_type.hpp:32
f4x2_pk_t()
Definition: data_type.hpp:35
__host__ __device__ type unpack(Number< I >) const
Definition: data_type.hpp:39
type data
Definition: data_type.hpp:34
__host__ __device__ type pack(const type x0, const type x1)
Definition: data_type.hpp:48
uint8_t type
Definition: data_type.hpp:33
f4x2_pk_t(type init)
Definition: data_type.hpp:36
Definition: data_type.hpp:55
f6x16_pk_t(type init)
Definition: data_type.hpp:62
type data
Definition: data_type.hpp:59
StaticallyIndexedArray_v2< element_type, 3 > type
Definition: data_type.hpp:58
int8_t test_vec_t
Definition: data_type.hpp:60
uint32_t element_type
Definition: data_type.hpp:57
__host__ __device__ type pack(const test_vec_t &x)
Definition: data_type.hpp:87
__host__ __device__ f6_t unpack(Number< I >)
Definition: data_type.hpp:65
f6x16_pk_t()
Definition: data_type.hpp:61
Definition: data_type.hpp:121
int8_t test_vec_t
Definition: data_type.hpp:126
f6x32_pk_t(type init)
Definition: data_type.hpp:128
__host__ __device__ f6_t unpack(Number< I >)
Definition: data_type.hpp:131
f6x32_pk_t()
Definition: data_type.hpp:127
type data
Definition: data_type.hpp:125
__host__ __device__ type pack(const test_vec_t &x)
Definition: data_type.hpp:153
StaticallyIndexedArray_v2< element_type, 6 > type
Definition: data_type.hpp:124
uint32_t element_type
Definition: data_type.hpp:123
Definition: amd_ck_fp8.hpp:298
fp8_storage_t data_type
Definition: amd_ck_fp8.hpp:299
Definition: integral_constant.hpp:10
Definition: type.hpp:177
Definition: data_type.hpp:399
static constexpr bool value
Definition: data_type.hpp:400
bf8_ocp_t::data_type type
Definition: data_type.hpp:1669
f8_ocp_t::data_type type
Definition: data_type.hpp:1663
pk_i4_t::type type
Definition: data_type.hpp:1699
Definition: data_type.hpp:1656
unsigned _BitInt(8 *sizeof(T)) type
Definition: data_type.hpp:1657
__host__ constexpr __device__ non_native_vector_base(data_v v)
Definition: data_type.hpp:1727
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:1754
__host__ constexpr __device__ non_native_vector_base(data_t a)
Definition: data_type.hpp:1721
__host__ constexpr __device__ non_native_vector_base(data_t a)
Definition: data_type.hpp:1823
typename nnvb_data_t_selector< T >::type data_t
Definition: data_type.hpp:1807
typename T::element_type element_t
Definition: data_type.hpp:1808
__host__ constexpr __device__ non_native_vector_base()
Definition: data_type.hpp:1831
__host__ constexpr __device__ non_native_vector_base(data_v v)
Definition: data_type.hpp:1832
__host__ constexpr __device__ non_native_vector_base(T f)
Definition: data_type.hpp:1827
Definition: data_type.hpp:1652
Definition: data_type.hpp:320
type data
Definition: data_type.hpp:322
__host__ constexpr __device__ pk_i4_t(type init)
Definition: data_type.hpp:324
int8_t type
Definition: data_type.hpp:321
__host__ constexpr __device__ pk_i4_t()
Definition: data_type.hpp:323
T type
Definition: data_type.hpp:411
bf8_fnuz_t type
Definition: data_type.hpp:498
bf8_ocp_t::data_type type
Definition: data_type.hpp:512
bhalf_t type
Definition: data_type.hpp:447
bool type
Definition: data_type.hpp:519
double type
Definition: data_type.hpp:426
f8_fnuz_t type
Definition: data_type.hpp:491
f8_ocp_t::data_type type
Definition: data_type.hpp:505
float type
Definition: data_type.hpp:433
half_t type
Definition: data_type.hpp:440
int32_t type
Definition: data_type.hpp:454
int8_t type
Definition: data_type.hpp:461
typename non_native_vector_base< bf8_ocp_t, N >::data_t type
Definition: data_type.hpp:1873
typename non_native_vector_base< f8_ocp_t, N >::data_t type
Definition: data_type.hpp:1865
typename non_native_vector_base< pk_i4_t, N >::data_t type
Definition: data_type.hpp:1881
uint8_t type
Definition: data_type.hpp:468
T type
Definition: data_type.hpp:418
Definition: data_type.hpp:394
Definition: functional2.hpp:31
StaticallyIndexedArray< d16_t, 8 > d16x8_
Definition: data_type.hpp:1410
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:1418
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:1421
StaticallyIndexedArray< d64_t, 2 > d64x2_
Definition: data_type.hpp:1412
StaticallyIndexedArray< d128_t, 1 > d128x1_
Definition: data_type.hpp:1413
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:1416
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:1468
StaticallyIndexedArray< d4_t, 32 > d4x32_
Definition: data_type.hpp:1408
StaticallyIndexedArray< d8_t, 16 > d8x16_
Definition: data_type.hpp:1409
StaticallyIndexedArray< d1_t, 128 > d1x128_
Definition: data_type.hpp:1406
StaticallyIndexedArray< d32_t, 4 > d32x4_
Definition: data_type.hpp:1411
StaticallyIndexedArray< d2_t, 64 > d2x64_
Definition: data_type.hpp:1407
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:1017
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:1012
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:1014
StaticallyIndexedArray< d4_t, 3 > d4x3_
Definition: data_type.hpp:1007
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:1046
StaticallyIndexedArray< d1_t, 13 > d1x13_
Definition: data_type.hpp:1006
StaticallyIndexedArray< d13_t, 1 > d13x1_
Definition: data_type.hpp:1009
StaticallyIndexedArray< d8_t, 1 > d8x1_
Definition: data_type.hpp:1008
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:1101
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:1096
StaticallyIndexedArray< d4_t, 4 > d4x4_
Definition: data_type.hpp:1091
StaticallyIndexedArray< d8_t, 2 > d8x2_
Definition: data_type.hpp:1092
StaticallyIndexedArray< d2_t, 8 > d2x8_
Definition: data_type.hpp:1090
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:1098
StaticallyIndexedArray< d1_t, 16 > d1x16_
Definition: data_type.hpp:1089
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:1135
StaticallyIndexedArray< d16_t, 1 > d16x1_
Definition: data_type.hpp:1093
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:2186
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:2220
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:2183
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:2181
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:535
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:549
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:537
StaticallyIndexedArray< T, 1 > d1x1_
Definition: data_type.hpp:532
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:540
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:1903
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:1922
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:1901
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:1906
StaticallyIndexedArray< d32_t, 8 > d32x8_
Definition: data_type.hpp:1538
StaticallyIndexedArray< d256_t, 1 > d256x1_
Definition: data_type.hpp:1541
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:1546
StaticallyIndexedArray< d128_t, 2 > d128x2_
Definition: data_type.hpp:1540
StaticallyIndexedArray< d4_t, 64 > d4x64_
Definition: data_type.hpp:1535
StaticallyIndexedArray< d16_t, 16 > d16x16_
Definition: data_type.hpp:1537
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:1549
StaticallyIndexedArray< d64_t, 4 > d64x4_
Definition: data_type.hpp:1539
StaticallyIndexedArray< d8_t, 32 > d8x32_
Definition: data_type.hpp:1536
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:1544
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:1600
StaticallyIndexedArray< d2_t, 128 > d2x128_
Definition: data_type.hpp:1534
StaticallyIndexedArray< d1_t, 256 > d1x256_
Definition: data_type.hpp:1533
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:574
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:599
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:576
StaticallyIndexedArray< d2_t, 1 > d2x1_
Definition: data_type.hpp:571
StaticallyIndexedArray< d1_t, 2 > d1x2_
Definition: data_type.hpp:570
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:579
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:1980
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:1954
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:1956
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:1959
StaticallyIndexedArray< d8_t, 4 > d8x4_
Definition: data_type.hpp:1187
StaticallyIndexedArray< d1_t, 32 > d1x32_
Definition: data_type.hpp:1184
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:1192
StaticallyIndexedArray< d32_t, 1 > d32x1_
Definition: data_type.hpp:1189
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:1197
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:1235
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:1194
StaticallyIndexedArray< d16_t, 2 > d16x2_
Definition: data_type.hpp:1188
StaticallyIndexedArray< d2_t, 16 > d2x16_
Definition: data_type.hpp:1185
StaticallyIndexedArray< d4_t, 8 > d4x8_
Definition: data_type.hpp:1186
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:2320
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:2282
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:2277
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:2279
StaticallyIndexedArray< d3_t, 1 > d3x1_
Definition: data_type.hpp:633
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:636
StaticallyIndexedArray< d1_t, 3 > d1x3_
Definition: data_type.hpp:631
StaticallyIndexedArray< d2_t, 1 > d2x1_
Definition: data_type.hpp:632
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:665
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:638
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:641
StaticallyIndexedArray< d2_t, 2 > d2x2_
Definition: data_type.hpp:702
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:711
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:706
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:708
StaticallyIndexedArray< d1_t, 4 > d1x4_
Definition: data_type.hpp:701
StaticallyIndexedArray< d4_t, 1 > d4x1_
Definition: data_type.hpp:703
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:735
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:2021
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:2049
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:2019
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:2024
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:776
StaticallyIndexedArray< d4_t, 1 > d4x1_
Definition: data_type.hpp:772
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:778
StaticallyIndexedArray< d1_t, 5 > d1x5_
Definition: data_type.hpp:771
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:805
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:781
StaticallyIndexedArray< d5_t, 1 > d5x1_
Definition: data_type.hpp:773
StaticallyIndexedArray< d2_t, 32 > d2x32_
Definition: data_type.hpp:1290
StaticallyIndexedArray< d1_t, 64 > d1x64_
Definition: data_type.hpp:1289
StaticallyIndexedArray< d4_t, 16 > d4x16_
Definition: data_type.hpp:1291
StaticallyIndexedArray< d64_t, 1 > d64x1_
Definition: data_type.hpp:1295
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:1346
StaticallyIndexedArray< d8_t, 8 > d8x8_
Definition: data_type.hpp:1292
StaticallyIndexedArray< d32_t, 2 > d32x2_
Definition: data_type.hpp:1294
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:1298
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:1303
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:1300
StaticallyIndexedArray< d16_t, 4 > d16x4_
Definition: data_type.hpp:1293
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:2385
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:2431
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:2383
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:2388
StaticallyIndexedArray< d1_t, 7 > d1x7_
Definition: data_type.hpp:842
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:850
StaticallyIndexedArray< d2_t, 3 > d2x3_
Definition: data_type.hpp:843
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:882
StaticallyIndexedArray< d4_t, 1 > d4x1_
Definition: data_type.hpp:844
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:853
StaticallyIndexedArray< d7_t, 1 > d7x1_
Definition: data_type.hpp:845
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:848
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:930
StaticallyIndexedArray< d8_t, 1 > d8x1_
Definition: data_type.hpp:927
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:932
StaticallyIndexedArray< d2_t, 4 > d2x4_
Definition: data_type.hpp:925
StaticallyIndexedArray< d1_t, 8 > d1x8_
Definition: data_type.hpp:924
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:964
StaticallyIndexedArray< d4_t, 2 > d4x2_
Definition: data_type.hpp:926
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:935
__host__ constexpr __device__ const auto & AsType() const
Definition: data_type.hpp:2099
__host__ constexpr __device__ vector_type(type v)
Definition: data_type.hpp:2096
__host__ constexpr __device__ auto & AsType()
Definition: data_type.hpp:2129
__host__ constexpr __device__ vector_type()
Definition: data_type.hpp:2094
Definition: data_type.hpp:367
Definition: data_type.hpp:347
StaticallyIndexedArray< data_t, N > dxN
Definition: data_type.hpp:1818
StaticallyIndexedArray< data_v, 1 > dNx1
Definition: data_type.hpp:1820
StaticallyIndexedArray< d16_t, 1 > d16x1_
Definition: data_type.hpp:2178
StaticallyIndexedArray< d4_t, 4 > d4x4_
Definition: data_type.hpp:2176
StaticallyIndexedArray< d1_t, 16 > d1x16_
Definition: data_type.hpp:2174
StaticallyIndexedArray< d2_t, 8 > d2x8_
Definition: data_type.hpp:2175
StaticallyIndexedArray< d8_t, 2 > d8x2_
Definition: data_type.hpp:2177
StaticallyIndexedArray< d1_t, 1 > d1x1_
Definition: data_type.hpp:1897
StaticallyIndexedArray< d1_t, 2 > d1x2_
Definition: data_type.hpp:1950
StaticallyIndexedArray< d2_t, 1 > d2x1_
Definition: data_type.hpp:1951
StaticallyIndexedArray< d1_t, 32 > d1x32_
Definition: data_type.hpp:2269
StaticallyIndexedArray< d8_t, 4 > d8x4_
Definition: data_type.hpp:2272
StaticallyIndexedArray< d16_t, 2 > d16x2_
Definition: data_type.hpp:2273
StaticallyIndexedArray< d32_t, 1 > d32x1_
Definition: data_type.hpp:2274
StaticallyIndexedArray< d4_t, 8 > d4x8_
Definition: data_type.hpp:2271
StaticallyIndexedArray< d2_t, 16 > d2x16_
Definition: data_type.hpp:2270
StaticallyIndexedArray< d2_t, 2 > d2x2_
Definition: data_type.hpp:2015
StaticallyIndexedArray< d1_t, 4 > d1x4_
Definition: data_type.hpp:2014
StaticallyIndexedArray< d4_t, 1 > d4x1_
Definition: data_type.hpp:2016
StaticallyIndexedArray< d8_t, 8 > d8x8_
Definition: data_type.hpp:2377
StaticallyIndexedArray< d1_t, 64 > d1x64_
Definition: data_type.hpp:2374
StaticallyIndexedArray< d64_t, 1 > d64x1_
Definition: data_type.hpp:2380
StaticallyIndexedArray< d4_t, 16 > d4x16_
Definition: data_type.hpp:2376
StaticallyIndexedArray< d16_t, 4 > d16x4_
Definition: data_type.hpp:2378
StaticallyIndexedArray< d32_t, 2 > d32x2_
Definition: data_type.hpp:2379
StaticallyIndexedArray< d2_t, 32 > d2x32_
Definition: data_type.hpp:2375
StaticallyIndexedArray< d2_t, 4 > d2x4_
Definition: data_type.hpp:2089
StaticallyIndexedArray< d8_t, 1 > d8x1_
Definition: data_type.hpp:2091
StaticallyIndexedArray< d4_t, 2 > d4x2_
Definition: data_type.hpp:2090
StaticallyIndexedArray< d1_t, 8 > d1x8_
Definition: data_type.hpp:2088