/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/library/utility/host_tensor.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/library/utility/host_tensor.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/library/utility/host_tensor.hpp Source File
host_tensor.hpp
Go to the documentation of this file.
1 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2 // SPDX-License-Identifier: MIT
3 
4 #pragma once
5 
#include <algorithm>
#include <cassert>
#include <fstream>
#include <iostream>
#include <numeric>
#include <random>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "ck/utility/data_type.hpp"
#include "ck/utility/span.hpp"
19 
23 
25 
26 namespace ck {
27 
// Stream the elements of `range` to `os`, separated by `delim`.
// Returns `os` so callers can keep chaining (e.g. `LogRange(...) << "]"`).
// `delim` is now taken by const reference: the function only reads it, so the
// per-call std::string copy in the original was pure overhead
// (clang-tidy performance-unnecessary-value-param); callers are unaffected.
template <typename Range>
std::ostream& LogRange(std::ostream& os, Range&& range, const std::string& delim)
{
    bool first = true;
    for(auto&& v : range)
    {
        if(first)
            first = false;
        else
            os << delim; // delimiter goes *between* elements, never before the first
        os << v;
    }
    return os;
}
42 
43 template <typename T, typename Range>
44 std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
45 {
46  bool first = true;
47  for(auto&& v : range)
48  {
49  if(first)
50  first = false;
51  else
52  os << delim;
53 
54  using RangeType = ck::remove_cvref_t<decltype(v)>;
55  if constexpr(std::is_same_v<RangeType, ck::f8_t> || std::is_same_v<RangeType, ck::bf8_t> ||
56  std::is_same_v<RangeType, ck::bhalf_t>)
57  {
58  os << ck::type_convert<float>(v);
59  }
60  else if constexpr(std::is_same_v<RangeType, ck::pk_i4_t> ||
61  std::is_same_v<RangeType, ck::f4x2_pk_t>)
62  {
63  const auto packed_floats = ck::type_convert<ck::float2_t>(v);
64  const ck::vector_type<float, 2> vector_of_floats{packed_floats};
65  os << vector_of_floats.template AsType<float>()[ck::Number<0>{}] << delim
66  << vector_of_floats.template AsType<float>()[ck::Number<1>{}];
67  }
68  else
69  {
70  os << static_cast<T>(v);
71  }
72  }
73  return os;
74 }
75 
// Worker for call_f_unpack_args: expands the tuple `packed` into individual
// arguments of `f`, using the caller-supplied index pack.
template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T packed, std::index_sequence<Is...>)
{
    return f(std::get<Is>(packed)...);
}
81 
// Call `f` with the elements of tuple-like `args` as its arguments and return
// the result. The hand-rolled tuple_size + index_sequence expansion is
// replaced with C++17 std::apply (the file already relies on C++17 features
// such as `if constexpr`); `f` and the tuple elements are still passed as
// lvalues, exactly as in the original expansion.
template <typename F, typename T>
auto call_f_unpack_args(F f, T args)
{
    return std::apply(f, args);
}
89 
// Worker for construct_f_unpack_args: constructs an F from the elements of
// the tuple `packed`, expanded via the caller-supplied index pack.
template <typename F, typename T, std::size_t... Is>
auto construct_f_unpack_args_impl(T packed, std::index_sequence<Is...>)
{
    return F(std::get<Is>(packed)...);
}
95 
// Construct an `F` from the elements of tuple-like `args`. The first
// (unnamed) parameter exists only so F can be deduced from a prototype
// object; it is kept for interface compatibility. The manual tuple_size +
// index_sequence expansion is replaced with C++17 std::make_from_tuple,
// which performs the identical element-wise construction.
template <typename F, typename T>
auto construct_f_unpack_args(F, T args)
{
    return std::make_from_tuple<F>(args);
}
103 
{

    // Runtime tag describing which layout is picked when layout is not specified explicitly at
    // construction time.
    enum class ChosenLayout
    {
        Original,
        RowMajor,
        // NOTE(review): the doc extraction this view was taken from dropped at
        // least one enumerator here (HandleDefaultLayout has a rank-2
        // column-major branch) -- confirm against the upstream header.
    };
186  // Master constructor
187  template <typename Layout>
188  HostTensorDescriptor(std::vector<std::size_t> lens,
189  std::vector<std::size_t> strides,
190  const Layout& layout = DefaultLayout())
191  : mLens(std::move(lens)), mStrides(std::move(strides))
192  {
193  // To support legacy use cases, when layout is not passed in
194  const auto new_layout = HandleDefaultLayout(layout);
195  if(dbg)
196  {
197  std::cout << "Original Lens: [";
198  LogRange(std::cout, mLens, ", ") << "] and Strides: [";
199  LogRange(std::cout, mStrides, ", ") << "]" << std::endl;
200  std::cout << "Layout: " << layout << " --> " << new_layout << std::endl;
201  }
202 
203  // Handling the strides and validation based on the chosen layout
204  DispatchChosenLayout(new_layout, layout, [&](auto selected_layout) {
205  this->CalculateStrides(selected_layout);
206  this->ValidateStrides(selected_layout);
207  });
208  }
209 
211 
    // Helper that invokes a callable with a concrete layout object whose type
    // matches the chosen tag (so template code depending on the layout type
    // can still leverage if constexpr branches).
    template <typename F, typename OrigLayout>
    void DispatchChosenLayout(ChosenLayout tag, const OrigLayout& orig, F&& f) const
    {
        switch(tag)
        {
        // NOTE(review): the doc extraction dropped the non-default case labels
        // here (the tags that substitute a concrete layout object for `orig`);
        // confirm against the upstream header.
        default: f(orig); break;
        }
    }
226 
    template <typename Layout>
    // NOTE(review): the function header line was dropped by the doc
    // extraction; the master ctor calls this as `HandleDefaultLayout(layout)`
    // and uses the result as a ChosenLayout tag.
    {
        // An explicitly passed (non-default) layout is always respected as-is.
        if constexpr(!std::is_same_v<Layout, DefaultLayout>)
        {
            return ChosenLayout::Original;
        }
        else
        {
            if(mStrides.empty())
            {
                // No strides provided -> assume RowMajor
                return ChosenLayout::RowMajor;
            }

            const auto rank = mLens.size();

            if(rank > 2)
            {
                // Keep as-is - validation will warn/throw later
                return ChosenLayout::Original;
            }

            if(rank == 0)
            {
                // Keep as-is - validation will warn/throw later
                return ChosenLayout::Original;
            }

            if(rank == 1)
            {
                // Treat 1D tensor as RowMajor
                return ChosenLayout::RowMajor;
            }

            // rank == 2
            if(mStrides.size() == 2)
            {
                // RowMajor pattern (?, 1)
                if(mStrides[1] == 1)
                {
                    return ChosenLayout::RowMajor;
                }

                // ColumnMajor pattern (1, ?)
                if(mStrides[0] == 1)
                {
                    // NOTE(review): the return statement of this branch was
                    // dropped by the doc extraction (presumably returning a
                    // column-major tag); confirm against the upstream header.
                }
            }

            // Fallback: leave as-is
            return ChosenLayout::Original;
        }
    }
282 
    template <typename Layout>
    // NOTE(review): the function declaration line was dropped by the doc
    // extraction (the master ctor calls this as `CalculateStrides(layout)`,
    // and the body below references a `layout` parameter).
    {
        // Layouts explicitly marked "bypass verification" also skip stride derivation.
        if constexpr(std::is_same_v<Layout, ck::tensor_layout::BypassLayoutVerification>)
            return;
        // This is a workaround if the original stride value is -1 (which means "unknown") has been
        // passed in and casted to size_t (unsigned).
        auto strides_int = AsInt(mStrides);

        // case of empty strides or all-zero: auto-calculate based on layout and tensor dimensions
        if(mStrides.empty() || std::all_of(strides_int.begin(), strides_int.end(), [](int stride) {
               return stride <= 0;
           }))
        {

            if constexpr(!(std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ||
                           std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>))
            {
                std::cerr << "Only RowMajor and ColumnMajor layouts are supported for empty "
                             "strides, got "
                          << layout << ". Will calculate strides as RowMajor." << std::endl;
            }

            // Derive packed row-major strides: innermost stride is 1, every
            // outer stride is the running product of the inner lengths.
            mStrides.clear();
            mStrides.resize(mLens.size(), 0);
            if(mStrides.empty())
                return;

            mStrides.back() = 1;
            std::partial_sum(mLens.rbegin(),
                             mLens.rend() - 1,
                             mStrides.rbegin() + 1,
                             std::multiplies<std::size_t>());

            if constexpr(std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>)
            {
                // swap the last two strides
                if(mStrides.size() >= 2)
                    std::swap(mStrides[mStrides.size() - 1], mStrides[mStrides.size() - 2]);
            }
        }
        // The other case is if one of the strides is unknown
        // Currently, only GEMM RowMajor and ColumnMajor layouts are supported and only in the lower
        // two dimensions, e.g. {..., 0, N} or {..., M, 0}. The higher dimensions are left
        // untouched.
        else if constexpr(std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ||
                          std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>)
        {
            auto rank = mStrides.size();
            if(mLens.size() >= 2 && rank >= 2)
            {
                // inner_idx is the dimension that should be contiguous for this layout.
                const auto inner_idx =
                    std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ? rank - 1 : rank - 2;
                const auto outer_idx = inner_idx == rank - 1 ? rank - 2 : rank - 1;
                if(mStrides[inner_idx] <= 0)
                {
                    mStrides[inner_idx] = 1;
                }
                if(mStrides[outer_idx] <= 0)
                {
                    mStrides[outer_idx] = mLens[inner_idx] * mStrides[inner_idx];
                }
            }
        }
    }
348 
    // Verify that the stored lengths/strides are mutually consistent for
    // `layout`; throws std::runtime_error on violations. 1-D tensors and
    // BypassLayoutVerification are accepted unconditionally.
    template <typename Layout>
    void ValidateStrides(const Layout& layout) const
    {
        if constexpr(std::is_same_v<ck::tensor_layout::BypassLayoutVerification, Layout>)
        {
            return;
        }

        if(mLens.empty())
        {
            throw std::runtime_error(
                "HostTensorDescriptor::ValidateStrides: empty tensor dimensions is not allowed.");
        }

        const int rank = mLens.size();
        if(rank == 1) // skip any 1D tensors
        {
            return;
        }

        if constexpr(std::is_same_v<ck::tensor_layout::BaseTensorLayout, Layout>)
        {
            // Any legacy code that doesn't pass layout to HostTensorDescriptor ctor will
            // hit this case (unless it is a special case - see `HandleDefaultLayout`).
            throw std::runtime_error("HostTensorDescriptor::ValidateStrides: Abstract tensor "
                                     "layout BaseTensorLayout can't be verified. Pls "
                                     "pass specific tensor layout to HostTensorDescriptor (or "
                                     "ck::tensor_layout::BypassLayoutVerification)");
        }

        // GEMM cases
        if constexpr(std::is_base_of_v<ck::tensor_layout::gemm::BaseGemmLayout, Layout>)
        {
            if(mLens.size() != mStrides.size())
            {
                std::ostringstream oss;
                oss << "HostTensorDescriptor::ValidateStrides: mismatch between tensor rank and "
                       "size of strides: "
                    << *this;
                throw std::runtime_error(oss.str());
            }

            // in GEMM, strides must be all positive or all zeros (auto-derived from tensor
            // dimensions)
            auto strides_int = AsInt(mStrides);
            if(std::any_of(
                   strides_int.begin(), strides_int.end(), [](int stride) { return stride <= 0; }))
            {
                std::ostringstream oss;
                oss << "Stride values must be positive or all-zeros (auto-derived from tensor "
                       "dimensions). Instead got ";
                std::copy(
                    strides_int.begin(), strides_int.end(), std::ostream_iterator<int>(oss, " "));
                throw std::runtime_error(oss.str());
            }

            if constexpr(std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ||
                         std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>)
            {
                // The logic here assumes the GEMM with tensor of more than 2 dims, will always have
                // HW dimesnsions as the inner ones e.g. batched GEMM is either BHW or BWH
                const auto inner_idx =
                    std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ? rank - 1 : rank - 2;
                const auto outer_idx = inner_idx == rank - 1 ? rank - 2 : rank - 1;

                // The outer stride must cover at least one full inner row/column.
                if(mStrides[outer_idx] < mLens[inner_idx] * mStrides[inner_idx])
                {
                    std::ostringstream oss;
                    oss << "Invalid strides for " << layout << ": " << *this;
                    throw std::runtime_error(oss.str());
                }

                // For higher dimensions, validate strides assuming RowMajor
                for(int i = 1; i < rank - 2; ++i)
                {
                    if(mStrides[i - 1] < mStrides[i] * mLens[i])
                    {
                        std::ostringstream oss;
                        oss << "Invalid strides for higher dimensions in " << layout << ": "
                            << *this;
                        throw std::runtime_error(oss.str());
                    }
                }
            }
            else
            {
                std::ostringstream oss;
                oss << "Error: Unsupported GEMM layout: " << layout;
                throw std::runtime_error(oss.str());
            }
        }
        // Convolution cases
        // NOTE(review): the start of the next line (an `else if constexpr(...)`
        // condition) was dropped by the doc extraction; per the warning text
        // below it tests for convolution layouts.
        Layout>)
        {
            // TBD: implement verification for Conv layouts
            // For now, just print warning and return
            std::cerr << "Warning: Tensor layout verification for ck::tensor_layout::convolution "
                         "layouts is not supported yet. Skipping..."
                      << std::endl;
            return;
        }
        else
        {
            std::ostringstream oss;
            oss << "Error: Tensor layout verification for " << layout << " is not supported yet.";
            throw std::runtime_error(oss.str());
        }
    }
458 
    // Lengths-only ctor from an initializer_list of any size-convertible
    // type; strides are left empty and derived from the layout.
    template <typename X,
              typename Layout = DefaultLayout,
              typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
                                          std::is_convertible_v<Layout, BaseTensorLayout>>>
    HostTensorDescriptor(const std::initializer_list<X>& lens, const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()), {}, layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }

    // Lengths-only ctor for initializer_list<long_index_t>.
    template <typename Layout = DefaultLayout,
              typename = std::enable_if_t<std::is_convertible_v<Layout, BaseTensorLayout>>>
    HostTensorDescriptor(const std::initializer_list<ck::long_index_t>& lens,
                         const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()), {}, layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }

    // Lengths-only ctor from a generic range of size-convertible values.
    template <typename Lengths,
              typename Layout = DefaultLayout,
              typename = std::enable_if_t<
                  (std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> ||
                   std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t>) &&
                  std::is_convertible_v<Layout, BaseTensorLayout>>>
    HostTensorDescriptor(const Lengths& lens, const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()), {}, layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }

    // Lengths + strides from two initializer_lists of size-convertible types.
    template <typename X,
              typename Y,
              typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
                                          std::is_convertible_v<Y, std::size_t>>,
              typename Layout = DefaultLayout>
    HostTensorDescriptor(const std::initializer_list<X>& lens,
                         const std::initializer_list<Y>& strides,
                         const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
                               std::vector<std::size_t>(strides.begin(), strides.end()),
                               layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }

    // HostTensorDescriptor({row, col}, {row_stride, col_stride})
    template <typename Layout = DefaultLayout>
    HostTensorDescriptor(const std::initializer_list<ck::long_index_t>& lens,
                         const std::initializer_list<ck::long_index_t>& strides,
                         const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
                               std::vector<std::size_t>(strides.begin(), strides.end()),
                               layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }

    // HostTensorDescriptor({row, col}, strides)
    template <typename Strides, typename Layout = DefaultLayout>
    HostTensorDescriptor(const std::initializer_list<std::size_t>& lens,
                         const Strides& strides,
                         const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
                               std::vector<std::size_t>(strides.begin(), strides.end()),
                               layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }

    // Lengths + strides from generic ranges (both size_t- or both
    // long_index_t-convertible).
    template <typename Lengths,
              typename Strides,
              typename Layout = DefaultLayout,
              typename = std::enable_if_t<
                  ((std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> &&
                    std::is_convertible_v<ck::ranges::range_value_t<Strides>, std::size_t>) ||
                   (std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t> &&
                    std::is_convertible_v<ck::ranges::range_value_t<Strides>, ck::long_index_t>)) &&
                  std::is_convertible_v<Layout, BaseTensorLayout>>>
    HostTensorDescriptor(const Lengths& lens,
                         const Strides& strides,
                         const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
                               std::vector<std::size_t>(strides.begin(), strides.end()),
                               layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }
554 
    // Declared here, defined out of line (in the corresponding .cpp).
    std::size_t GetNumOfDimension() const;
    std::size_t GetElementSize() const;
    std::size_t GetElementSpaceSize() const;

    const std::vector<std::size_t>& GetLengths() const;
    const std::vector<std::size_t>& GetStrides() const;

    // Linear offset for the given per-dimension indices: the dot product of
    // the index tuple with mStrides.
    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        assert(sizeof...(Is) == this->GetNumOfDimension());
        std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

    // Same computation, with the indices supplied as a vector (no rank assert).
    std::size_t GetOffsetFromMultiIndex(const std::vector<std::size_t>& iss) const
    {
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }
574 
    // Stream-printing helpers (defined elsewhere) need access to the members.
    friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc);
    friend std::ostream& operator<<(std::ostream& os, ChosenLayout tag);

    private:
    std::vector<std::size_t> mLens;    // per-dimension lengths
    std::vector<std::size_t> mStrides; // per-dimension strides (in elements)
    static constexpr bool dbg = false; // compile-time switch for ctor debug prints
582 
    // Copy of a size_t vector narrowed to int, so sentinel values such as -1
    // ("unknown"), which callers may have cast to size_t, become visible again
    // as negative numbers (see the workaround comment in CalculateStrides).
    std::vector<int> AsInt(const std::vector<size_t>& vec) const
    {
        std::vector<int> strides_int(vec.size());
        std::transform(vec.begin(), vec.end(), strides_int.begin(), [](std::size_t stride) {
            return static_cast<int>(stride);
        });
        return strides_int;
    }
597 };
598 
// Build a descriptor whose dimension i takes its length and stride from
// dimension new2old[i] of the input descriptor, i.e. a permuted view.
template <typename New2Old, typename NewLayout = HostTensorDescriptor::BaseTensorLayout>
HostTensorDescriptor
// NOTE(review): the doc extraction dropped the line carrying the function
// name and first parameter; per the doxygen index the signature is
// `transpose_host_tensor_descriptor_given_new2old(const HostTensorDescriptor& a, ...)`.
    const New2Old& new2old,
    const NewLayout& new_layout = NewLayout())
{
    std::vector<std::size_t> new_lengths(a.GetNumOfDimension());
    std::vector<std::size_t> new_strides(a.GetNumOfDimension());

    for(std::size_t i = 0; i < a.GetNumOfDimension(); i++)
    {
        new_lengths[i] = a.GetLengths()[new2old[i]];
        new_strides[i] = a.GetStrides()[new2old[i]];
    }

    return HostTensorDescriptor(new_lengths, new_strides, new_layout);
}
616 
// std::thread wrapper that joins in its destructor (RAII join), so a scope
// exit can never leave a running thread detached or terminate the program.
struct joinable_thread : std::thread
{
    // Forward any std::thread constructor arguments.
    template <typename... Xs>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }

    // NOTE(review): the doc extraction dropped the lines between the ctor and
    // the body below (presumably move operations and the `~joinable_thread()`
    // header whose body follows); confirm against the upstream header.

    {
        if(this->joinable())
            this->join();
    }
};
633 
// Applies a callable across an N-dimensional index space, optionally split
// over several joinable threads.
template <typename F, typename... Xs>
// NOTE(review): the struct header line was dropped by the doc extraction;
// per make_ParallelTensorFunctor below, the type is
// `ParallelTensorFunctor<F, Xs...>`.
{
    F mF; // callable invoked once per N-D index
    static constexpr std::size_t NDIM = sizeof...(Xs);
    std::array<std::size_t, NDIM> mLens;    // per-dimension lengths
    std::array<std::size_t, NDIM> mStrides; // packed row-major strides over mLens
    std::size_t mN1d;                       // total flattened index count

    ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
    {
        // Row-major strides: innermost is 1, each outer stride is the
        // running product of the inner lengths.
        mStrides.back() = 1;
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
        mN1d = mStrides[0] * mLens[0];
    }

    // Convert a flat index back into its N-dimensional index tuple.
    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
    {
        std::array<std::size_t, NDIM> indices;

        for(std::size_t idim = 0; idim < NDIM; ++idim)
        {
            indices[idim] = i / mStrides[idim];
            i -= indices[idim] * mStrides[idim];
        }

        return indices;
    }

    // Run mF over the whole index space using up to `num_thread` threads,
    // each taking a contiguous slice of the flattened range.
    void operator()(std::size_t num_thread = 1) const
    {
        std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

        std::vector<joinable_thread> threads(num_thread);

        for(std::size_t it = 0; it < num_thread; ++it)
        {
            std::size_t iw_begin = it * work_per_thread;
            std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);

            auto f = [=, *this] {
                for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                {
                    // NOTE(review): the loop body line was dropped by the doc
                    // extraction; presumably it invokes mF on GetNdIndices(iw)
                    // via call_f_unpack_args. Confirm against upstream.
                }
            };
            threads[it] = joinable_thread(f);
        }
        // joinable_thread joins in its destructor when `threads` is destroyed.
    }
};
687 
688 template <typename F, typename... Xs>
689 auto make_ParallelTensorFunctor(F f, Xs... xs)
690 {
691  return ParallelTensorFunctor<F, Xs...>(f, xs...);
692 }
693 
// Host-side dense tensor: a HostTensorDescriptor (lengths/strides) plus a
// flat std::vector<T> backing store.
template <typename T>
struct Tensor
{
    // NOTE(review): a line was dropped here by the doc extraction;
    // `Tensor(const Descriptor&)` below implies it declared
    // `using Descriptor = HostTensorDescriptor;`.
    using Data = std::vector<T>;

    // Lengths from an initializer_list; strides derived by the descriptor.
    template <typename X>
    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(GetElementSpaceSize())
    {
    }

    // Lengths and explicit strides from initializer_lists.
    template <typename X, typename Y>
    Tensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
        : mDesc(lens, strides), mData(GetElementSpaceSize())
    {
    }

    // Lengths from a generic range.
    template <typename Lengths>
    Tensor(const Lengths& lens) : mDesc(lens), mData(GetElementSpaceSize())
    {
    }

    // Lengths and strides from generic ranges.
    template <typename Lengths, typename Strides>
    Tensor(const Lengths& lens, const Strides& strides)
        : mDesc(lens, strides), mData(GetElementSpaceSize())
    {
    }
721 
    // Lengths plus extra descriptor arguments (e.g. a layout tag).
    template <typename X, typename... Rest, std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
    Tensor(std::initializer_list<X> lens, Rest&&... rest)
        : mDesc(lens, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
    {
    }

    // Lengths + strides plus extra descriptor arguments.
    template <typename X,
              typename Y,
              typename... Rest,
              std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
    Tensor(std::initializer_list<X> lens, std::initializer_list<Y> strides, Rest&&... rest)
        : mDesc(lens, strides, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
    {
    }

    // Generic-range lengths plus extra descriptor arguments.
    template <typename Lengths, typename... Rest, std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
    Tensor(const Lengths& lens, Rest&&... rest)
        : mDesc(lens, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
    {
    }

    // Generic-range lengths/strides plus extra descriptor arguments.
    template <typename Lengths,
              typename Strides,
              typename... Rest,
              std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
    Tensor(const Lengths& lens, const Strides& strides, Rest&&... rest)
        : mDesc(lens, strides, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
    {
    }

    // Construct directly from an existing descriptor.
    Tensor(const Descriptor& desc) : mDesc(desc), mData(GetElementSpaceSize()) {}
753 
    // Element-wise copy of this tensor with elements converted to OutT.
    template <typename OutT>
    // NOTE(review): the declaration line was dropped by the doc extraction
    // (presumably `Tensor<OutT> CopyAsType() const`, per the converting ctor
    // below which calls `other.template CopyAsType<T>()`).
    {
        Tensor<OutT> ret(mDesc);

        // NOTE(review): the start of this statement was dropped by the doc
        // extraction; per the doxygen index it is a ck::ranges::transform
        // over mData into ret.mData.
        mData, ret.mData.begin(), [](auto value) { return ck::type_convert<OutT>(value); });

        return ret;
    }

    // No default construction: a Tensor always needs a descriptor/lengths.
    Tensor() = delete;
    Tensor(const Tensor&) = default;
    Tensor(Tensor&&) = default;

    ~Tensor() = default;

    Tensor& operator=(const Tensor&) = default;
    Tensor& operator=(Tensor&&) = default;

    // Converting copy from a tensor of a different element type; delegates
    // the per-element conversion to CopyAsType.
    template <typename FromT>
    explicit Tensor(const Tensor<FromT>& other) : Tensor(other.template CopyAsType<T>())
    {
    }
778  void savetxt(std::string file_name, std::string dtype = "float")
779  {
780  std::ofstream file(file_name);
781 
782  if(file.is_open())
783  {
784  for(auto& itm : mData)
785  {
786  if(dtype == "float")
787  file << ck::type_convert<float>(itm) << std::endl;
788  else if(dtype == "int")
789  file << ck::type_convert<int>(itm) << std::endl;
790  else
791  // TODO: we didn't implement operator<< for all custom
792  // data types, here fall back to float in case compile error
793  file << ck::type_convert<float>(itm) << std::endl;
794  }
795  file.close();
796  }
797  else
798  {
799  // Print an error message to the standard error
800  // stream if the file cannot be opened.
801  throw std::runtime_error(std::string("unable to open file:") + file_name);
802  }
803  }
    // Thin forwards to the descriptor.
    decltype(auto) GetLengths() const { return mDesc.GetLengths(); }

    decltype(auto) GetStrides() const { return mDesc.GetStrides(); }

    std::size_t GetNumOfDimension() const { return mDesc.GetNumOfDimension(); }

    std::size_t GetElementSize() const { return mDesc.GetElementSize(); }

    // Storage size in units of T; for packed element types several logical
    // values share one T, hence the division by packed_size_v.
    std::size_t GetElementSpaceSize() const
    {
        // NOTE(review): the `if constexpr(...)` condition line was dropped by
        // the doc extraction; given the two branches it presumably tests
        // whether T is a packed type (cf. ck::is_packed_type_v).
        {
            return (mDesc.GetElementSpaceSize() + 1) / ck::packed_size_v<ck::remove_cvref_t<T>>;
        }
        else
        {
            return mDesc.GetElementSpaceSize();
        }
    }

    std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); }

    void SetZero() { ck::ranges::fill<T>(mData, T{0}); }
827 
    // Recursive worker for ForEach: fixes the index of dimension `rank`,
    // recurses into the next dimension, and invokes f(*this, idx) once a
    // complete multi-index has been built.
    template <typename F>
    void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
    {
        if(rank == mDesc.GetNumOfDimension())
        {
            f(*this, idx);
            return;
        }
        // else
        for(size_t i = 0; i < mDesc.GetLengths()[rank]; i++)
        {
            idx[rank] = i;
            ForEach_impl(std::forward<F>(f), idx, rank + 1);
        }
    }

    // Invoke f(*this, idx) for every multi-index of this tensor.
    template <typename F>
    void ForEach(F&& f)
    {
        std::vector<size_t> idx(mDesc.GetNumOfDimension(), 0);
        ForEach_impl(std::forward<F>(f), idx, size_t(0));
    }

    // Const overload of the recursive worker: f receives the tensor read-only.
    template <typename F>
    void ForEach_impl(const F&& f, std::vector<size_t>& idx, size_t rank) const
    {
        if(rank == mDesc.GetNumOfDimension())
        {
            f(*this, idx);
            return;
        }
        // else
        for(size_t i = 0; i < mDesc.GetLengths()[rank]; i++)
        {
            idx[rank] = i;
            ForEach_impl(std::forward<const F>(f), idx, rank + 1);
        }
    }

    // Const overload of ForEach.
    template <typename F>
    void ForEach(const F&& f) const
    {
        std::vector<size_t> idx(mDesc.GetNumOfDimension(), 0);
        ForEach_impl(std::forward<const F>(f), idx, size_t(0));
    }
873 
    // Fill the tensor by evaluating g(i0, ..., iN) at every multi-index,
    // optionally in parallel. Only ranks 1-6 and 12 are wired up; any other
    // rank throws.
    template <typename G>
    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        switch(mDesc.GetNumOfDimension())
        {
        case 1: {
            auto f = [&](auto i) { (*this)(i) = g(i); };
            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
            break;
        }
        case 2: {
            auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
            break;
        }
        case 3: {
            auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
            // NOTE(review): the `make_ParallelTensorFunctor(` opening of this
            // call was dropped by the doc extraction.
            f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
            break;
        }
        case 4: {
            auto f = [&](auto i0, auto i1, auto i2, auto i3) {
                (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
            };
            // NOTE(review): the `make_ParallelTensorFunctor(f,` opening of this
            // call was dropped by the doc extraction.
            mDesc.GetLengths()[0],
            mDesc.GetLengths()[1],
            mDesc.GetLengths()[2],
            mDesc.GetLengths()[3])(num_thread);
            break;
        }
        case 5: {
            auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
                (*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
            };
            // NOTE(review): the `make_ParallelTensorFunctor(f,` opening of this
            // call was dropped by the doc extraction.
            mDesc.GetLengths()[0],
            mDesc.GetLengths()[1],
            mDesc.GetLengths()[2],
            mDesc.GetLengths()[3],
            mDesc.GetLengths()[4])(num_thread);
            break;
        }
        case 6: {
            auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) {
                (*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5);
            };
            // NOTE(review): the `make_ParallelTensorFunctor(f,` opening of this
            // call was dropped by the doc extraction.
            mDesc.GetLengths()[0],
            mDesc.GetLengths()[1],
            mDesc.GetLengths()[2],
            mDesc.GetLengths()[3],
            mDesc.GetLengths()[4],
            mDesc.GetLengths()[5])(num_thread);
            break;
        }
        case 12: {
            auto f = [&](auto i0,
                         auto i1,
                         auto i2,
                         auto i3,
                         auto i4,
                         auto i5,
                         auto i6,
                         auto i7,
                         auto i8,
                         auto i9,
                         auto i10,
                         auto i11) {
                (*this)(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) =
                    g(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11);
            };
            // NOTE(review): the `make_ParallelTensorFunctor(f,` opening of this
            // call was dropped by the doc extraction.
            mDesc.GetLengths()[0],
            mDesc.GetLengths()[1],
            mDesc.GetLengths()[2],
            mDesc.GetLengths()[3],
            mDesc.GetLengths()[4],
            mDesc.GetLengths()[5],
            mDesc.GetLengths()[6],
            mDesc.GetLengths()[7],
            mDesc.GetLengths()[8],
            mDesc.GetLengths()[9],
            mDesc.GetLengths()[10],
            mDesc.GetLengths()[11])(num_thread);
            break;
        }
        default: throw std::runtime_error("unspported dimension");
        }
    }
965 
    // Generate random values with multiple threads. Guaranteed to give the same sequence with any
    // number of threads provided.
    template <typename Distribution = std::uniform_real_distribution<float>,
              typename Mapping = ck::identity,
              typename Generator = std::minstd_rand>
    void GenerateTensorDistr(Distribution dis = {0.f, 1.f},
                             Mapping fn = {},
                             const Generator g = Generator(0), // default seed 0
                             std::size_t num_thread = -1)
    {
        // NOTE(review): one line was dropped here by the doc extraction.
        using ck::math::min;
        if(num_thread == -1ULL)
            num_thread = min(ck::get_available_cpu_cores(), 80U); // max 80 threads
        // At least 2MB per thread
        num_thread = min(num_thread, integer_divide_ceil(this->GetElementSpaceSize(), 0x200000));
        // Work is split into cache-line-sized blocks so each thread can skip
        // the PRNG ahead deterministically (see g_.discard below).
        constexpr std::size_t BLOCK_BYTES = 64;
        constexpr std::size_t BLOCK_SIZE = BLOCK_BYTES / sizeof(T);

        const std::size_t num_blocks = integer_divide_ceil(this->GetElementSpaceSize(), BLOCK_SIZE);
        const std::size_t blocks_per_thread = integer_divide_ceil(num_blocks, num_thread);

        std::vector<std::thread> threads;
        threads.reserve(num_thread - 1);
        const auto dst = const_cast<T*>(this->mData.data());
        const auto element_space_size = this->GetElementSpaceSize();
        for(int it = num_thread - 1; it >= 0; --it)
        {
            std::size_t ib_begin = it * blocks_per_thread;
            std::size_t ib_end = min(ib_begin + blocks_per_thread, num_blocks);

            auto job = [=]() {
                auto g_ = g; // copy
                auto dis_ = dis; // copy
                // Skip ahead so this thread's slice of the PRNG stream matches
                // the single-threaded generation order exactly.
                g_.discard(ib_begin * BLOCK_SIZE * ck::packed_size_v<T>);
                // Produces one T (possibly packing several generated values).
                auto t_fn = [&]() {
                    // As user can pass integer distribution in dis, we must ensure that the correct
                    // constructor/converter is called at all times. For f4/f6/f8 types, to ensure
                    // correct results, we convert from float to the target type. In these cases
                    // integer constructors are interpreted as direct initialization of the internal
                    // storage with binary values instead of treating integers as subset of floats.
                    if constexpr(ck::is_same_v<T, ck::f8_t> || ck::is_same_v<T, ck::bf8_t>)
                        return ck::type_convert<T>(static_cast<float>(fn(dis_(g_))));
                    else if constexpr(ck::packed_size_v<T> == 1)
                        return ck::type_convert<T>(fn(dis_(g_)));
                    else if constexpr(ck::is_same_v<T, ck::f4x2_pk_t>)
                        return ck::f4x2_pk_t{ck::type_convert<ck::f4x2_t>(
                            ck::float2_t{ck::type_convert<float>(fn(dis_(g_))),
                                         ck::type_convert<float>(fn(dis_(g_)))})};
                    else if constexpr(ck::is_same_v<T, ck::f6x32_pk_t> ||
                                      ck::is_same_v<T, ck::bf6x32_pk_t>)
                    {
                        // 32 packed values per T.
                        return ck::type_convert<T>(
                            ck::float32_t{ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_)))});
                    }
                    else if constexpr(ck::is_same_v<T, ck::f6x16_pk_t> ||
                                      ck::is_same_v<T, ck::bf6x16_pk_t>)
                    {
                        // 16 packed values per T.
                        return ck::type_convert<T>(
                            ck::float16_t{ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_)))});
                    }
                    else
                        static_assert(false, "Unsupported packed size for T");
                };

                std::size_t ib = ib_begin;
                // All blocks but the last: unconditional, unrolled fill.
                for(; ib < ib_end - 1; ++ib)
                    ck::static_for<0, BLOCK_SIZE, 1>{}([&](auto iw_) {
                        constexpr size_t iw = iw_.value;
                        dst[ib * BLOCK_SIZE + iw] = t_fn();
                    });
                // Final block: bounds-checked against the element space size.
                for(std::size_t iw = 0; iw < BLOCK_SIZE; ++iw)
                    if(ib * BLOCK_SIZE + iw < element_space_size)
                        dst[ib * BLOCK_SIZE + iw] = t_fn();
            };

            if(it > 0)
                threads.emplace_back(std::move(job));
            else
                job(); // last job run in the main thread
        }
        for(auto& t : threads)
            t.join();
    }
1096 
    // Flat offset into mData for the given logical indices; packed element
    // types divide by packed_size_v since several values share one T.
    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        return mDesc.GetOffsetFromMultiIndex(is...) / ck::packed_size_v<ck::remove_cvref_t<T>>;
    }

    // Mutable element access by multi-index.
    template <typename... Is>
    T& operator()(Is... is)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...) /
                     ck::packed_size_v<ck::remove_cvref_t<T>>];
    }

    // Read-only element access by multi-index.
    template <typename... Is>
    const T& operator()(Is... is) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...) /
                     ck::packed_size_v<ck::remove_cvref_t<T>>];
    }

    // Mutable element access with the indices supplied as a vector.
    T& operator()(const std::vector<std::size_t>& idx)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v<ck::remove_cvref_t<T>>];
    }

    // Read-only element access with the indices supplied as a vector.
    const T& operator()(const std::vector<std::size_t>& idx) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v<ck::remove_cvref_t<T>>];
    }
1126 
    // STL-style access to the flat backing store.
    typename Data::iterator begin() { return mData.begin(); }

    typename Data::iterator end() { return mData.end(); }

    typename Data::pointer data() { return mData.data(); }

    typename Data::const_iterator begin() const { return mData.begin(); }

    typename Data::const_iterator end() const { return mData.end(); }

    typename Data::const_pointer data() const { return mData.data(); }

    // Number of T elements in the backing store (not logical elements for
    // packed types).
    typename Data::size_type size() const { return mData.size(); }
1140 
    // Reinterpret the storage as a read-only span of U; the element count is
    // rescaled by sizeof(T)/sizeof(U) so the byte extent stays the same.
    template <typename U = T>
    auto AsSpan() const
    {
        constexpr std::size_t FromSize = sizeof(T);
        constexpr std::size_t ToSize = sizeof(U);

        using Element = std::add_const_t<std::remove_reference_t<U>>;
        return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
    }

    // Mutable-view counterpart of AsSpan() const.
    template <typename U = T>
    auto AsSpan()
    {
        constexpr std::size_t FromSize = sizeof(T);
        constexpr std::size_t ToSize = sizeof(U);

        using Element = std::remove_reference_t<U>;
        return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
    }
1160 
1163 };
1164 
1165 } // namespace ck
Definition: span.hpp:14
__host__ constexpr __device__ auto rank([[maybe_unused]] const Layout< Shape, UnrolledDescriptorType > &layout)
Get layout rank (num elements in shape).
Definition: layout_utils.hpp:310
__host__ constexpr __device__ auto integer_divide_ceil(X x, Y y)
Definition: math.hpp:72
__host__ constexpr __device__ T min(T x)
Definition: math.hpp:116
auto transform(InputRange &&range, OutputIterator iter, UnaryOperation unary_op) -> decltype(std::transform(std::begin(range), std::end(range), iter, unary_op))
Definition: algorithm.hpp:36
auto copy(InputRange &&range, OutputIterator iter) -> decltype(std::copy(std::begin(std::forward< InputRange >(range)), std::end(std::forward< InputRange >(range)), iter))
Definition: algorithm.hpp:14
iter_value_t< ranges::iterator_t< R > > range_value_t
Definition: ranges.hpp:28
Definition: ck.hpp:270
typename vector_type< float, 16 >::type float16_t
Definition: dtype_vector.hpp:2149
unsigned int get_available_cpu_cores()
Definition: thread.hpp:11
HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(const HostTensorDescriptor &a, const New2Old &new2old, const NewLayout &new_layout=NewLayout())
Definition: host_tensor.hpp:601
std::ostream & LogRange(std::ostream &os, Range &&range, std::string delim)
Definition: host_tensor.hpp:29
auto call_f_unpack_args_impl(F f, T args, std::index_sequence< Is... >)
Definition: host_tensor.hpp:77
int64_t long_index_t
Definition: ck.hpp:302
typename vector_type< float, 2 >::type float2_t
Definition: dtype_vector.hpp:2146
auto construct_f_unpack_args_impl(T args, std::index_sequence< Is... >)
Definition: host_tensor.hpp:91
__host__ constexpr __device__ Y type_convert(X x)
Definition: type_convert.hpp:98
constexpr bool is_base_of_v
Definition: type.hpp:286
auto construct_f_unpack_args(F, T args)
Definition: host_tensor.hpp:97
constexpr bool is_same_v
Definition: type.hpp:283
auto make_ParallelTensorFunctor(F f, Xs... xs)
Definition: host_tensor.hpp:689
constexpr bool is_packed_type_v
Definition: data_type.hpp:414
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition: type.hpp:297
typename std::enable_if< B, T >::type enable_if_t
Definition: enable_if.hpp:27
auto call_f_unpack_args(F f, T args)
Definition: host_tensor.hpp:83
__device__ void inner_product(const TA &a, const TB &b, TC &c)
typename vector_type< float, 32 >::type float32_t
Definition: dtype_vector.hpp:2150
std::ostream & LogRangeAsType(std::ostream &os, Range &&range, std::string delim)
Definition: host_tensor.hpp:44
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1697
const GenericPointer< typename T::ValueType > & pointer
Definition: pointer.h:1514
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition: pointer.h:1517
Layout wrapper that performs the tensor descriptor logic.
Definition: layout.hpp:24
A descriptor class for host tensors that manages tensor dimensions, strides, and layout.
Definition: host_tensor.hpp:173
const std::vector< std::size_t > & GetStrides() const
HostTensorDescriptor()
Definition: host_tensor.hpp:210
ChosenLayout HandleDefaultLayout(const Layout &)
Definition: host_tensor.hpp:228
void DispatchChosenLayout(ChosenLayout tag, const OrigLayout &orig, F &&f) const
Definition: host_tensor.hpp:216
HostTensorDescriptor(const std::initializer_list< X > &lens, const Layout &layout=Layout{})
Definition: host_tensor.hpp:463
std::size_t GetElementSize() const
HostTensorDescriptor(const Lengths &lens, const Layout &layout=Layout{})
Definition: host_tensor.hpp:486
HostTensorDescriptor(const Lengths &lens, const Strides &strides, const Layout &layout=Layout{})
Definition: host_tensor.hpp:544
std::size_t GetElementSpaceSize() const
HostTensorDescriptor(const std::initializer_list< ck::long_index_t > &lens, const Layout &layout=Layout{})
Definition: host_tensor.hpp:472
std::size_t GetOffsetFromMultiIndex(Is... is) const
Definition: host_tensor.hpp:563
void CalculateStrides(const Layout &layout)
Definition: host_tensor.hpp:284
ck::tensor_layout::BaseTensorLayout BaseTensorLayout
Definition: host_tensor.hpp:174
void ValidateStrides(const Layout &layout) const
Definition: host_tensor.hpp:350
HostTensorDescriptor(const std::initializer_list< X > &lens, const std::initializer_list< Y > &strides, const Layout &layout=Layout{})
Definition: host_tensor.hpp:498
std::size_t GetOffsetFromMultiIndex(const std::vector< std::size_t > &iss) const
Definition: host_tensor.hpp:570
HostTensorDescriptor(const std::initializer_list< std::size_t > &lens, const Strides &strides, const Layout &layout=Layout{})
Definition: host_tensor.hpp:524
HostTensorDescriptor(const std::initializer_list< ck::long_index_t > &lens, const std::initializer_list< ck::long_index_t > &strides, const Layout &layout=Layout{})
Definition: host_tensor.hpp:511
friend std::ostream & operator<<(std::ostream &os, ChosenLayout tag)
std::size_t GetNumOfDimension() const
HostTensorDescriptor(std::vector< std::size_t > lens, std::vector< std::size_t > strides, const Layout &layout=DefaultLayout())
Definition: host_tensor.hpp:188
const std::vector< std::size_t > & GetLengths() const
ChosenLayout
Definition: host_tensor.hpp:180
BaseTensorLayout DefaultLayout
Definition: host_tensor.hpp:175
friend std::ostream & operator<<(std::ostream &os, const HostTensorDescriptor &desc)
Definition: host_tensor.hpp:636
std::array< std::size_t, NDIM > GetNdIndices(std::size_t i) const
Definition: host_tensor.hpp:653
std::size_t mN1d
Definition: host_tensor.hpp:641
std::array< std::size_t, NDIM > mStrides
Definition: host_tensor.hpp:640
static constexpr std::size_t NDIM
Definition: host_tensor.hpp:638
F mF
Definition: host_tensor.hpp:637
std::array< std::size_t, NDIM > mLens
Definition: host_tensor.hpp:639
ParallelTensorFunctor(F f, Xs... xs)
Definition: host_tensor.hpp:643
void operator()(std::size_t num_thread=1) const
Definition: host_tensor.hpp:666
Definition: host_tensor.hpp:696
void GenerateTensorDistr(Distribution dis={0.f, 1.f}, Mapping fn={}, const Generator g=Generator(0), std::size_t num_thread=-1)
Definition: host_tensor.hpp:971
std::size_t GetOffsetFromMultiIndex(Is... is) const
Definition: host_tensor.hpp:1098
~Tensor()=default
void ForEach(const F &&f) const
Definition: host_tensor.hpp:868
Tensor(const Lengths &lens, const Strides &strides, Rest &&... rest)
Definition: host_tensor.hpp:747
decltype(auto) GetStrides() const
Definition: host_tensor.hpp:806
std::size_t GetElementSpaceSize() const
Definition: host_tensor.hpp:812
Tensor(std::initializer_list< X > lens)
Definition: host_tensor.hpp:701
Descriptor mDesc
Definition: host_tensor.hpp:1161
auto AsSpan() const
Definition: host_tensor.hpp:1142
void SetZero()
Definition: host_tensor.hpp:826
void ForEach_impl(F &&f, std::vector< size_t > &idx, size_t rank)
Definition: host_tensor.hpp:829
const T & operator()(const std::vector< std::size_t > &idx) const
Definition: host_tensor.hpp:1122
Data::const_pointer data() const
Definition: host_tensor.hpp:1137
Tensor(std::initializer_list< X > lens, std::initializer_list< Y > strides)
Definition: host_tensor.hpp:706
Data mData
Definition: host_tensor.hpp:1162
std::vector< T > Data
Definition: host_tensor.hpp:698
Tensor(const Descriptor &desc)
Definition: host_tensor.hpp:752
Tensor(std::initializer_list< X > lens, std::initializer_list< Y > strides, Rest &&... rest)
Definition: host_tensor.hpp:732
T & operator()(Is... is)
Definition: host_tensor.hpp:1104
Tensor & operator=(const Tensor &)=default
Data::pointer data()
Definition: host_tensor.hpp:1131
std::size_t GetElementSpaceSizeInBytes() const
Definition: host_tensor.hpp:824
Tensor(const Lengths &lens, const Strides &strides)
Definition: host_tensor.hpp:717
auto AsSpan()
Definition: host_tensor.hpp:1152
Tensor(std::initializer_list< X > lens, Rest &&... rest)
Definition: host_tensor.hpp:723
decltype(auto) GetLengths() const
Definition: host_tensor.hpp:804
Tensor & operator=(Tensor &&)=default
Tensor< OutT > CopyAsType() const
Definition: host_tensor.hpp:755
std::size_t GetNumOfDimension() const
Definition: host_tensor.hpp:808
void ForEach_impl(const F &&f, std::vector< size_t > &idx, size_t rank) const
Definition: host_tensor.hpp:852
Data::size_type size() const
Definition: host_tensor.hpp:1139
void ForEach(F &&f)
Definition: host_tensor.hpp:845
Tensor(const Lengths &lens)
Definition: host_tensor.hpp:712
T & operator()(const std::vector< std::size_t > &idx)
Definition: host_tensor.hpp:1117
std::size_t GetElementSize() const
Definition: host_tensor.hpp:810
const T & operator()(Is... is) const
Definition: host_tensor.hpp:1111
Tensor(const Tensor< FromT > &other)
Definition: host_tensor.hpp:775
Data::const_iterator end() const
Definition: host_tensor.hpp:1135
void savetxt(std::string file_name, std::string dtype="float")
Definition: host_tensor.hpp:778
Data::iterator end()
Definition: host_tensor.hpp:1129
Data::iterator begin()
Definition: host_tensor.hpp:1127
Tensor(const Lengths &lens, Rest &&... rest)
Definition: host_tensor.hpp:738
Tensor(const Tensor &)=default
void GenerateTensorValue(G g, std::size_t num_thread=1)
Definition: host_tensor.hpp:875
Tensor(Tensor &&)=default
Data::const_iterator begin() const
Definition: host_tensor.hpp:1133
Tensor()=delete
Definition: integral_constant.hpp:20
Definition: host_tensor.hpp:618
joinable_thread & operator=(joinable_thread &&)=default
~joinable_thread()
Definition: host_tensor.hpp:627
joinable_thread(Xs &&... xs)
Definition: host_tensor.hpp:620
joinable_thread(joinable_thread &&)=default
Definition: functional2.hpp:33
Definition: tensor_layout.hpp:10
Definition: tensor_layout.hpp:31
Definition: tensor_layout.hpp:26
Definition: dtype_vector.hpp:11
__host__ constexpr __device__ const auto & layout(const Tensor< BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType > &tensor)
Get Tensor Layout.
Definition: tensor_utils.hpp:162