Composable Kernel — source listing: include/ck_tile/host/host_tensor.hpp
(host_tensor.hpp — go to the documentation of this file)
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "ck_tile/core.hpp"
#include "ck_tile/host/ranges.hpp"
19 
20 namespace ck_tile {
21 
22 template <typename Range>
23 CK_TILE_HOST std::ostream& LogRange(std::ostream& os,
24  Range&& range,
25  std::string delim,
26  int precision = std::cout.precision(),
27  int width = 0)
28 {
29  bool first = true;
30  for(auto&& v : range)
31  {
32  if(first)
33  first = false;
34  else
35  os << delim;
36  os << std::setw(width) << std::setprecision(precision) << v;
37  }
38  return os;
39 }
40 
41 template <typename T, typename Range>
42 CK_TILE_HOST std::ostream& LogRangeAsType(std::ostream& os,
43  Range&& range,
44  std::string delim,
45  int precision = std::cout.precision(),
46  int width = 0)
47 {
48  bool first = true;
49  for(auto&& v : range)
50  {
51  if(first)
52  first = false;
53  else
54  os << delim;
55  os << std::setw(width) << std::setprecision(precision) << static_cast<T>(v);
56  }
57  return os;
58 }
59 
60 template <typename F, typename T, std::size_t... Is>
61 CK_TILE_HOST auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
62 {
63  return f(std::get<Is>(args)...);
64 }
65 
66 template <typename F, typename T>
68 {
69  constexpr std::size_t N = std::tuple_size<T>{};
70 
71  return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
72 }
73 
74 template <typename F, typename T, std::size_t... Is>
75 CK_TILE_HOST auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
76 {
77  return F(std::get<Is>(args)...);
78 }
79 
80 template <typename F, typename T>
82 {
83  constexpr std::size_t N = std::tuple_size<T>{};
84 
85  return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
86 }
87 
102 {
103  HostTensorDescriptor() = default;
104 
106  {
107  mStrides.clear();
108  mStrides.resize(mLens.size(), 0);
109  if(mStrides.empty())
110  return;
111 
112  mStrides.back() = 1;
113  std::partial_sum(mLens.rbegin(),
114  mLens.rend() - 1,
115  mStrides.rbegin() + 1,
116  std::multiplies<std::size_t>());
117  }
118 
119  template <typename X, typename = std::enable_if_t<std::is_convertible_v<X, std::size_t>>>
120  HostTensorDescriptor(const std::initializer_list<X>& lens) : mLens(lens.begin(), lens.end())
121  {
122  this->CalculateStrides();
123  }
124 
125  template <typename Lengths,
126  typename = std::enable_if_t<
127  std::is_convertible_v<ck_tile::ranges::range_value_t<Lengths>, std::size_t>>>
128  HostTensorDescriptor(const Lengths& lens) : mLens(lens.begin(), lens.end())
129  {
130  this->CalculateStrides();
131  }
132 
133  template <typename X,
134  typename Y,
135  typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
136  std::is_convertible_v<Y, std::size_t>>>
137  HostTensorDescriptor(const std::initializer_list<X>& lens,
138  const std::initializer_list<Y>& strides)
139  : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
140  {
141  }
142 
143  template <typename Lengths,
144  typename Strides,
145  typename = std::enable_if_t<
146  std::is_convertible_v<ck_tile::ranges::range_value_t<Lengths>, std::size_t> &&
147  std::is_convertible_v<ck_tile::ranges::range_value_t<Strides>, std::size_t>>>
148  HostTensorDescriptor(const Lengths& lens, const Strides& strides)
149  : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
150  {
151  }
152 
153  std::size_t get_num_of_dimension() const { return mLens.size(); }
165  std::size_t get_element_size() const
166  {
167  assert(mLens.size() == mStrides.size());
168  return std::accumulate(
169  mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
170  }
183  std::size_t get_element_space_size() const
184  {
185  std::size_t space = 1;
186  for(std::size_t i = 0; i < mLens.size(); ++i)
187  {
188  if(mLens[i] == 0)
189  continue;
190 
191  space += (mLens[i] - 1) * mStrides[i];
192  }
193  return space;
194  }
195 
196  std::size_t get_length(std::size_t dim) const { return mLens[dim]; }
197 
198  const std::vector<std::size_t>& get_lengths() const { return mLens; }
199 
200  std::size_t get_stride(std::size_t dim) const { return mStrides[dim]; }
201 
202  const std::vector<std::size_t>& get_strides() const { return mStrides; }
203 
216  template <typename... Is>
217  std::size_t GetOffsetFromMultiIndex(Is... is) const
218  {
219  assert(sizeof...(Is) == this->get_num_of_dimension());
220  std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
221  return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
222  }
223 
233  std::size_t GetOffsetFromMultiIndex(const std::vector<std::size_t>& iss) const
234  {
235  return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
236  }
237 
238  friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc)
239  {
240  os << "dim " << desc.get_num_of_dimension() << ", ";
241 
242  os << "lengths {";
243  LogRange(os, desc.get_lengths(), ", ");
244  os << "}, ";
245 
246  os << "strides {";
247  LogRange(os, desc.get_strides(), ", ");
248  os << "}";
249 
250  return os;
251  }
252 
253  private:
254  std::vector<std::size_t> mLens;
255  std::vector<std::size_t> mStrides;
256 };
257 
258 template <typename New2Old>
260  const HostTensorDescriptor& a, const New2Old& new2old)
261 {
262  std::vector<std::size_t> new_lengths(a.get_num_of_dimension());
263  std::vector<std::size_t> new_strides(a.get_num_of_dimension());
264 
265  for(std::size_t i = 0; i < a.get_num_of_dimension(); i++)
266  {
267  new_lengths[i] = a.get_lengths()[new2old[i]];
268  new_strides[i] = a.get_strides()[new2old[i]];
269  }
270 
271  return HostTensorDescriptor(new_lengths, new_strides);
272 }
273 
274 template <typename F, typename... Xs>
276 {
277  F mF;
278  static constexpr std::size_t NDIM = sizeof...(Xs);
279  std::array<std::size_t, NDIM> mLens;
280  std::array<std::size_t, NDIM> mStrides;
281  std::size_t mN1d;
282 
283  ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
284  {
285  mStrides.back() = 1;
286  std::partial_sum(mLens.rbegin(),
287  mLens.rend() - 1,
288  mStrides.rbegin() + 1,
289  std::multiplies<std::size_t>());
290  mN1d = mStrides[0] * mLens[0];
291  }
292 
293  std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
294  {
295  std::array<std::size_t, NDIM> indices;
296 
297  for(std::size_t idim = 0; idim < NDIM; ++idim)
298  {
299  indices[idim] = i / mStrides[idim];
300  i -= indices[idim] * mStrides[idim];
301  }
302 
303  return indices;
304  }
305 
306  void operator()(std::size_t num_thread = 1) const
307  {
308  std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
309 
310  std::vector<joinable_thread> threads(num_thread);
311 
312  for(std::size_t it = 0; it < num_thread; ++it)
313  {
314  std::size_t iw_begin = it * work_per_thread;
315  std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);
316 
317  auto f = [this, iw_begin, iw_end] {
318  for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
319  {
320  call_f_unpack_args(this->mF, this->GetNdIndices(iw));
321  }
322  };
323  threads[it] = joinable_thread(f);
324  }
325  }
326 };
327 
328 template <typename F, typename... Xs>
330 {
331  return ParallelTensorFunctor<F, Xs...>(f, xs...);
332 }
333 
334 template <typename T>
336 {
338  using Data = std::vector<T>;
339 
340  template <typename X>
341  HostTensor(std::initializer_list<X> lens) : mDesc(lens), mData(get_element_space_size())
342  {
343  }
344 
345  template <typename X, typename Y>
346  HostTensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
347  : mDesc(lens, strides), mData(get_element_space_size())
348  {
349  }
350 
351  template <typename Lengths>
352  HostTensor(const Lengths& lens) : mDesc(lens), mData(get_element_space_size())
353  {
354  }
355 
356  template <typename Lengths, typename Strides>
357  HostTensor(const Lengths& lens, const Strides& strides)
358  : mDesc(lens, strides), mData(get_element_space_size())
359  {
360  }
361 
363 
364  template <typename OutT>
366  {
367  HostTensor<OutT> ret(mDesc);
368  std::transform(mData.cbegin(), mData.cend(), ret.mData.begin(), [](auto value) {
369  return ck_tile::type_convert<OutT>(value);
370  });
371  return ret;
372  }
373 
374  HostTensor() = delete;
375  HostTensor(const HostTensor&) = default;
376  HostTensor(HostTensor&&) = default;
377 
378  ~HostTensor() = default;
379 
380  HostTensor& operator=(const HostTensor&) = default;
382 
383  template <typename FromT>
384  explicit HostTensor(const HostTensor<FromT>& other) : HostTensor(other.template CopyAsType<T>())
385  {
386  }
387 
388  std::size_t get_length(std::size_t dim) const { return mDesc.get_length(dim); }
389 
390  decltype(auto) get_lengths() const { return mDesc.get_lengths(); }
391 
392  std::size_t get_stride(std::size_t dim) const { return mDesc.get_stride(dim); }
393 
394  decltype(auto) get_strides() const { return mDesc.get_strides(); }
395 
396  std::size_t get_num_of_dimension() const { return mDesc.get_num_of_dimension(); }
397 
398  std::size_t get_element_size() const { return mDesc.get_element_size(); }
399 
400  std::size_t get_element_space_size() const
401  {
402  constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
403  return mDesc.get_element_space_size() / PackedSize;
404  }
405 
407  {
408  return sizeof(T) * get_element_space_size();
409  }
410 
411  // void SetZero() { ck_tile::ranges::fill<T>(mData, 0); }
412  void SetZero() { std::fill(mData.begin(), mData.end(), 0); }
413 
414  template <typename F>
415  void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
416  {
418  {
419  f(*this, idx);
420  return;
421  }
422  // else
423  for(size_t i = 0; i < mDesc.get_lengths()[rank]; i++)
424  {
425  idx[rank] = i;
426  ForEach_impl(std::forward<F>(f), idx, rank + 1);
427  }
428  }
429 
430  template <typename F>
431  void ForEach(F&& f)
432  {
433  std::vector<size_t> idx(mDesc.get_num_of_dimension(), 0);
434  ForEach_impl(std::forward<F>(f), idx, size_t(0));
435  }
436 
437  template <typename F>
438  void ForEach_impl(const F&& f, std::vector<size_t>& idx, size_t rank) const
439  {
441  {
442  f(*this, idx);
443  return;
444  }
445  // else
446  for(size_t i = 0; i < mDesc.get_lengths()[rank]; i++)
447  {
448  idx[rank] = i;
449  ForEach_impl(std::forward<const F>(f), idx, rank + 1);
450  }
451  }
452 
453  template <typename F>
454  void ForEach(const F&& f) const
455  {
456  std::vector<size_t> idx(mDesc.get_num_of_dimension(), 0);
457  ForEach_impl(std::forward<const F>(f), idx, size_t(0));
458  }
459 
460  template <typename G>
461  void GenerateTensorValue(G g, std::size_t num_thread = 1)
462  {
463  switch(mDesc.get_num_of_dimension())
464  {
465  case 1: {
466  auto f = [&](auto i) { (*this)(i) = g(i); };
467  make_ParallelTensorFunctor(f, mDesc.get_lengths()[0])(num_thread);
468  break;
469  }
470  case 2: {
471  auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
473  num_thread);
474  break;
475  }
476  case 3: {
477  auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
479  mDesc.get_lengths()[0],
480  mDesc.get_lengths()[1],
481  mDesc.get_lengths()[2])(num_thread);
482  break;
483  }
484  case 4: {
485  auto f = [&](auto i0, auto i1, auto i2, auto i3) {
486  (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
487  };
489  mDesc.get_lengths()[0],
490  mDesc.get_lengths()[1],
491  mDesc.get_lengths()[2],
492  mDesc.get_lengths()[3])(num_thread);
493  break;
494  }
495  case 5: {
496  auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
497  (*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
498  };
500  mDesc.get_lengths()[0],
501  mDesc.get_lengths()[1],
502  mDesc.get_lengths()[2],
503  mDesc.get_lengths()[3],
504  mDesc.get_lengths()[4])(num_thread);
505  break;
506  }
507  case 6: {
508  auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) {
509  (*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5);
510  };
512  mDesc.get_lengths()[0],
513  mDesc.get_lengths()[1],
514  mDesc.get_lengths()[2],
515  mDesc.get_lengths()[3],
516  mDesc.get_lengths()[4],
517  mDesc.get_lengths()[5])(num_thread);
518  break;
519  }
520  default: throw std::runtime_error("unspported dimension");
521  }
522  }
523 
524  template <typename... Is>
525  std::size_t GetOffsetFromMultiIndex(Is... is) const
526  {
527  constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
528  return mDesc.GetOffsetFromMultiIndex(is...) / PackedSize;
529  }
530 
531  template <typename... Is>
532  T& operator()(Is... is)
533  {
534  return mData[GetOffsetFromMultiIndex(is...)];
535  }
536 
537  template <typename... Is>
538  const T& operator()(Is... is) const
539  {
540  return mData[GetOffsetFromMultiIndex(is...)];
541  }
542 
543  T& operator()(const std::vector<std::size_t>& idx)
544  {
545  return mData[GetOffsetFromMultiIndex(idx)];
546  }
547 
548  const T& operator()(const std::vector<std::size_t>& idx) const
549  {
550  return mData[GetOffsetFromMultiIndex(idx)];
551  }
552 
553  HostTensor<T> transpose(std::vector<size_t> axes = {}) const
554  {
555  if(axes.empty())
556  {
557  axes.resize(this->get_num_of_dimension());
558  std::iota(axes.rbegin(), axes.rend(), 0);
559  }
560  if(axes.size() != mDesc.get_num_of_dimension())
561  {
562  throw std::runtime_error(
563  "HostTensor::transpose(): size of axes must match tensor dimension");
564  }
565  std::vector<size_t> tlengths, tstrides;
566  for(const auto& axis : axes)
567  {
568  tlengths.push_back(get_lengths()[axis]);
569  tstrides.push_back(get_strides()[axis]);
570  }
571  HostTensor<T> ret(*this);
572  ret.mDesc = HostTensorDescriptor(tlengths, tstrides);
573  return ret;
574  }
575 
576  HostTensor<T> transpose(std::vector<size_t> axes = {})
577  {
578  return const_cast<HostTensor<T> const*>(this)->transpose(axes);
579  }
580 
581  typename Data::iterator begin() { return mData.begin(); }
582 
583  typename Data::iterator end() { return mData.end(); }
584 
585  typename Data::pointer data() { return mData.data(); }
586 
587  typename Data::const_iterator begin() const { return mData.begin(); }
588 
589  typename Data::const_iterator end() const { return mData.end(); }
590 
591  typename Data::const_pointer data() const { return mData.data(); }
592 
593  typename Data::size_type size() const { return mData.size(); }
594 
595  // return a slice of this tensor
596  // for simplicity we just copy the data and return a new tensor
597  auto slice(std::vector<size_t> s_begin, std::vector<size_t> s_end) const
598  {
599  assert(s_begin.size() == s_end.size());
600  assert(s_begin.size() == get_num_of_dimension());
601 
602  std::vector<size_t> s_len(s_begin.size());
604  s_end.begin(), s_end.end(), s_begin.begin(), s_len.begin(), std::minus<size_t>{});
605  HostTensor<T> sliced_tensor(s_len);
606 
607  sliced_tensor.ForEach([&](auto& self, auto idx) {
608  std::vector<size_t> src_idx(idx.size());
610  idx.begin(), idx.end(), s_begin.begin(), src_idx.begin(), std::plus<size_t>{});
611  self(idx) = operator()(src_idx);
612  });
613 
614  return sliced_tensor;
615  }
616 
617  template <typename U = T>
618  auto AsSpan() const
619  {
620  constexpr std::size_t FromSize = sizeof(T);
621  constexpr std::size_t ToSize = sizeof(U);
622 
623  using Element = std::add_const_t<std::remove_reference_t<U>>;
624  return ck_tile::span<Element>{reinterpret_cast<Element*>(data()),
625  size() * FromSize / ToSize};
626  }
627 
628  template <typename U = T>
629  auto AsSpan()
630  {
631  constexpr std::size_t FromSize = sizeof(T);
632  constexpr std::size_t ToSize = sizeof(U);
633 
634  using Element = std::remove_reference_t<U>;
635  return ck_tile::span<Element>{reinterpret_cast<Element*>(data()),
636  size() * FromSize / ToSize};
637  }
638 
639  friend std::ostream& operator<<(std::ostream& os, const HostTensor<T>& t)
640  {
641  os << t.mDesc;
642  os << "[";
643  for(typename Data::size_type idx = 0; idx < t.mData.size(); ++idx)
644  {
645  if(0 < idx)
646  {
647  os << ", ";
648  }
649  if constexpr(std::is_same_v<T, bf16_t> || std::is_same_v<T, fp16_t>)
650  {
651  os << type_convert<float>(t.mData[idx]) << " #### ";
652  }
653  else
654  {
655  os << t.mData[idx];
656  }
657  }
658  os << "]";
659  return os;
660  }
661 
662  // read data from a file, as dtype
663  // the file could dumped from torch as (targeting tensor is t here)
664  // numpy.savetxt("f.txt", t.view(-1).numpy())
665  // numpy.savetxt("f.txt", t.cpu().view(-1).numpy()) # from cuda to cpu to save
666  // numpy.savetxt("f.txt", t.cpu().view(-1).numpy(), fmt="%d") # save as int
667  // will output f.txt, each line is a value
668  // dtype=float or int, internally will cast to real type
669  void loadtxt(std::string file_name, std::string dtype = "float")
670  {
671  std::ifstream file(file_name);
672 
673  if(file.is_open())
674  {
675  std::string line;
676 
677  index_t cnt = 0;
678  while(std::getline(file, line))
679  {
680  if(cnt >= static_cast<index_t>(mData.size()))
681  {
682  throw std::runtime_error(std::string("data read from file:") + file_name +
683  " is too big");
684  }
685 
686  if(dtype == "float")
687  {
688  mData[cnt] = type_convert<T>(std::stof(line));
689  }
690  else if(dtype == "int" || dtype == "int32")
691  {
692  mData[cnt] = type_convert<T>(std::stoi(line));
693  }
694  cnt++;
695  }
696  file.close();
697  if(cnt < static_cast<index_t>(mData.size()))
698  {
699  std::cerr << "Warning! reading from file:" << file_name
700  << ", does not match the size of this tensor" << std::endl;
701  }
702  }
703  else
704  {
705  // Print an error message to the standard error
706  // stream if the file cannot be opened.
707  throw std::runtime_error(std::string("unable to open file:") + file_name);
708  }
709  }
710 
711  // can save to a txt file and read from torch as:
712  // torch.from_numpy(np.loadtxt('f.txt', dtype=np.int32/np.float32...)).view([...]).contiguous()
713  void savetxt(std::string file_name, std::string dtype = "float")
714  {
715  std::ofstream file(file_name);
716 
717  if(file.is_open())
718  {
719  for(auto& itm : mData)
720  {
721  if(dtype == "float")
722  file << type_convert<float>(itm) << std::endl;
723  else if(dtype == "int")
724  file << type_convert<int>(itm) << std::endl;
725  else if(dtype == "int8_t")
726  file << static_cast<int>(type_convert<ck_tile::int8_t>(itm)) << std::endl;
727  else
728  // TODO: we didn't implement operator<< for all custom
729  // data types, here fall back to float in case compile error
730  file << type_convert<float>(itm) << std::endl;
731  }
732  file.close();
733  }
734  else
735  {
736  // Print an error message to the standard error
737  // stream if the file cannot be opened.
738  throw std::runtime_error(std::string("unable to open file:") + file_name);
739  }
740  }
741 
744 };
745 
764 template <bool is_row_major>
765 auto host_tensor_descriptor(std::size_t row,
766  std::size_t col,
767  std::size_t stride,
769 {
770  using namespace ck_tile::literals;
771 
772  if constexpr(is_row_major)
773  {
774  return HostTensorDescriptor({row, col}, {stride, 1_uz});
775  }
776  else
777  {
778  return HostTensorDescriptor({row, col}, {1_uz, stride});
779  }
780 }
781 
782 template <bool is_row_major>
783 auto get_default_stride(std::size_t row,
784  std::size_t col,
785  std::size_t stride,
787 {
788  if(stride == 0)
789  {
790  if constexpr(is_row_major)
791  {
792  return col;
793  }
794  else
795  {
796  return row;
797  }
798  }
799  else
800  return stride;
801 }
802 } // namespace ck_tile
Definition: span.hpp:18
#define CK_TILE_HOST
Definition: config.hpp:39
__host__ constexpr __device__ auto rank([[maybe_unused]] const Layout< Shape, UnrolledDescriptorType > &layout)
Get layout rank (num elements in shape).
Definition: layout_utils.hpp:310
__host__ constexpr __device__ T min(T x)
Definition: math.hpp:116
auto fill(OutputRange &&range, const T &init) -> std::void_t< decltype(std::fill(std::begin(std::forward< OutputRange >(range)), std::end(std::forward< OutputRange >(range)), init))>
Definition: algorithm.hpp:25
auto transform(InputRange &&range, OutputIterator iter, UnaryOperation unary_op) -> decltype(std::transform(std::begin(range), std::end(range), iter, unary_op))
Definition: algorithm.hpp:36
Definition: literals.hpp:9
Definition: cluster_descriptor.hpp:13
CK_TILE_HOST auto make_ParallelTensorFunctor(F f, Xs... xs)
Definition: host_tensor.hpp:329
CK_TILE_HOST auto call_f_unpack_args(F f, T args)
Definition: host_tensor.hpp:67
CK_TILE_HOST HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(const HostTensorDescriptor &a, const New2Old &new2old)
Definition: host_tensor.hpp:259
CK_TILE_HOST auto call_f_unpack_args_impl(F f, T args, std::index_sequence< Is... >)
Definition: host_tensor.hpp:61
auto host_tensor_descriptor(std::size_t row, std::size_t col, std::size_t stride, bool_constant< is_row_major >)
Creates a host tensor descriptor with specified dimensions and layout.
Definition: host_tensor.hpp:765
CK_TILE_HOST std::ostream & LogRangeAsType(std::ostream &os, Range &&range, std::string delim, int precision=std::cout.precision(), int width=0)
Definition: host_tensor.hpp:42
int32_t index_t
Definition: integer.hpp:9
CK_TILE_HOST std::ostream & LogRange(std::ostream &os, Range &&range, std::string delim, int precision=std::cout.precision(), int width=0)
Definition: host_tensor.hpp:23
CK_TILE_HOST auto construct_f_unpack_args(F, T args)
Definition: host_tensor.hpp:81
CK_TILE_HOST auto construct_f_unpack_args_impl(T args, std::index_sequence< Is... >)
Definition: host_tensor.hpp:75
auto get_default_stride(std::size_t row, std::size_t col, std::size_t stride, bool_constant< is_row_major >)
Definition: host_tensor.hpp:783
typename std::enable_if< B, T >::type enable_if_t
Definition: enable_if.hpp:27
__device__ void inner_product(const TA &a, const TB &b, TC &c)
Definition: host_tensor.hpp:101
Descriptor for tensors in host memory.
Definition: host_tensor.hpp:102
std::size_t get_stride(std::size_t dim) const
Definition: host_tensor.hpp:200
std::size_t GetOffsetFromMultiIndex(Is... is) const
Calculates the linear offset from multi-dimensional indices.
Definition: host_tensor.hpp:217
std::size_t get_element_size() const
Calculates the total number of elements in the tensor.
Definition: host_tensor.hpp:165
void CalculateStrides()
Definition: host_tensor.hpp:105
std::size_t get_num_of_dimension() const
Definition: host_tensor.hpp:153
HostTensorDescriptor(const std::initializer_list< X > &lens, const std::initializer_list< Y > &strides)
Definition: host_tensor.hpp:137
std::size_t get_element_space_size() const
Calculates the total element space required for the tensor in memory.
Definition: host_tensor.hpp:183
const std::vector< std::size_t > & get_strides() const
Definition: host_tensor.hpp:202
const std::vector< std::size_t > & get_lengths() const
Definition: host_tensor.hpp:198
std::size_t get_length(std::size_t dim) const
Definition: host_tensor.hpp:196
HostTensorDescriptor(const Lengths &lens, const Strides &strides)
Definition: host_tensor.hpp:148
HostTensorDescriptor(const std::initializer_list< X > &lens)
Definition: host_tensor.hpp:120
std::size_t GetOffsetFromMultiIndex(const std::vector< std::size_t > &iss) const
Calculates the linear memory offset from a multi-dimensional index.
Definition: host_tensor.hpp:233
HostTensorDescriptor(const Lengths &lens)
Definition: host_tensor.hpp:128
friend std::ostream & operator<<(std::ostream &os, const HostTensorDescriptor &desc)
Definition: host_tensor.hpp:238
Definition: host_tensor.hpp:336
void ForEach(F &&f)
Definition: host_tensor.hpp:431
std::size_t get_stride(std::size_t dim) const
Definition: host_tensor.hpp:392
void ForEach(const F &&f) const
Definition: host_tensor.hpp:454
HostTensor(HostTensor &&)=default
Data::size_type size() const
Definition: host_tensor.hpp:593
decltype(auto) get_lengths() const
Definition: host_tensor.hpp:390
HostTensor(std::initializer_list< X > lens, std::initializer_list< Y > strides)
Definition: host_tensor.hpp:346
HostTensor & operator=(HostTensor &&)=default
friend std::ostream & operator<<(std::ostream &os, const HostTensor< T > &t)
Definition: host_tensor.hpp:639
HostTensor(std::initializer_list< X > lens)
Definition: host_tensor.hpp:341
HostTensor & operator=(const HostTensor &)=default
std::size_t get_element_space_size_in_bytes() const
Definition: host_tensor.hpp:406
decltype(auto) get_strides() const
Definition: host_tensor.hpp:394
HostTensor(const HostTensor &)=default
Data::iterator end()
Definition: host_tensor.hpp:583
void GenerateTensorValue(G g, std::size_t num_thread=1)
Definition: host_tensor.hpp:461
void SetZero()
Definition: host_tensor.hpp:412
Descriptor mDesc
Definition: host_tensor.hpp:742
const T & operator()(Is... is) const
Definition: host_tensor.hpp:538
HostTensor(const Lengths &lens)
Definition: host_tensor.hpp:352
std::size_t GetOffsetFromMultiIndex(Is... is) const
Definition: host_tensor.hpp:525
Data::pointer data()
Definition: host_tensor.hpp:585
T & operator()(Is... is)
Definition: host_tensor.hpp:532
HostTensor< OutT > CopyAsType() const
Definition: host_tensor.hpp:365
auto AsSpan() const
Definition: host_tensor.hpp:618
auto slice(std::vector< size_t > s_begin, std::vector< size_t > s_end) const
Definition: host_tensor.hpp:597
std::vector< T > Data
Definition: host_tensor.hpp:338
auto AsSpan()
Definition: host_tensor.hpp:629
Data::const_iterator begin() const
Definition: host_tensor.hpp:587
std::size_t get_num_of_dimension() const
Definition: host_tensor.hpp:396
std::size_t get_element_space_size() const
Definition: host_tensor.hpp:400
HostTensor(const Lengths &lens, const Strides &strides)
Definition: host_tensor.hpp:357
void loadtxt(std::string file_name, std::string dtype="float")
Definition: host_tensor.hpp:669
Data::const_pointer data() const
Definition: host_tensor.hpp:591
void ForEach_impl(const F &&f, std::vector< size_t > &idx, size_t rank) const
Definition: host_tensor.hpp:438
HostTensor(const Descriptor &desc)
Definition: host_tensor.hpp:362
HostTensor< T > transpose(std::vector< size_t > axes={})
Definition: host_tensor.hpp:576
Data::iterator begin()
Definition: host_tensor.hpp:581
const T & operator()(const std::vector< std::size_t > &idx) const
Definition: host_tensor.hpp:548
void savetxt(std::string file_name, std::string dtype="float")
Definition: host_tensor.hpp:713
HostTensor(const HostTensor< FromT > &other)
Definition: host_tensor.hpp:384
HostTensor< T > transpose(std::vector< size_t > axes={}) const
Definition: host_tensor.hpp:553
std::size_t get_length(std::size_t dim) const
Definition: host_tensor.hpp:388
std::size_t get_element_size() const
Definition: host_tensor.hpp:398
T & operator()(const std::vector< std::size_t > &idx)
Definition: host_tensor.hpp:543
void ForEach_impl(F &&f, std::vector< size_t > &idx, size_t rank)
Definition: host_tensor.hpp:415
Data::const_iterator end() const
Definition: host_tensor.hpp:589
Data mData
Definition: host_tensor.hpp:743
Definition: host_tensor.hpp:276
void operator()(std::size_t num_thread=1) const
Definition: host_tensor.hpp:306
ParallelTensorFunctor(F f, Xs... xs)
Definition: host_tensor.hpp:283
std::size_t mN1d
Definition: host_tensor.hpp:281
std::array< std::size_t, NDIM > mLens
Definition: host_tensor.hpp:279
std::array< std::size_t, NDIM > mStrides
Definition: host_tensor.hpp:280
static constexpr std::size_t NDIM
Definition: host_tensor.hpp:278
F mF
Definition: host_tensor.hpp:277
std::array< std::size_t, NDIM > GetNdIndices(std::size_t i) const
Definition: host_tensor.hpp:293
Definition: integral_constant.hpp:13
Definition: joinable_thread.hpp:12
Definition: numeric.hpp:81