HIP Runtime API Reference: clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h

/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H

#if __cplusplus
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/hip_cooperative_groups_helper.h>
#endif

namespace cooperative_groups {

/** \brief The base type of all cooperative group types.
 */
class thread_group {
 protected:
  __hip_uint32_t _type;         // Type of the thread_group.
  __hip_uint32_t _num_threads;  // Total number of threads in the thread_group.
  __hip_uint64_t _mask;

  __CG_QUALIFIER__ thread_group(internal::group_type type,
                                __hip_uint32_t num_threads = static_cast<__hip_uint64_t>(0),
                                __hip_uint64_t mask = static_cast<__hip_uint64_t>(0)) {
    _type = type;
    _num_threads = num_threads;
    _mask = mask;
  }

  struct _tiled_info {
    bool is_tiled;
    unsigned int num_threads;
    unsigned int meta_group_rank;
    unsigned int meta_group_size;
  };

  struct _coalesced_info {
    lane_mask member_mask;
    unsigned int num_threads;
    struct _tiled_info tiled_info;
  } coalesced_info;

  friend __CG_QUALIFIER__ thread_group this_thread();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend class thread_block;

 public:
  /** Total number of threads in the group.
   */
  __CG_QUALIFIER__ __hip_uint32_t num_threads() const { return _num_threads; }
  /** Total number of threads in the group (alias of num_threads()).
   */
  __CG_QUALIFIER__ __hip_uint32_t size() const { return num_threads(); }
  /** Returns the type of the group.
   */
  __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
  /** Rank of the calling thread within [0, num_threads() ).
   */
  __CG_QUALIFIER__ __hip_uint32_t thread_rank() const;
  /** Returns true if the group has not violated any API constraints.
   */
  __CG_QUALIFIER__ bool is_valid() const;
  /** Synchronizes the threads in the group.
   */
  __CG_QUALIFIER__ void sync() const;
};

/** \brief The multi-grid cooperative group type.
 */
class multi_grid_group : public thread_group {
  // Only this friend can construct a multi_grid_group.
  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();

 protected:
  // Construct multi-grid thread group (through the API this_multi_grid())
  explicit __CG_QUALIFIER__ multi_grid_group(__hip_uint32_t size)
      : thread_group(internal::cg_multi_grid, size) {}

 public:
  /** Number of grids participating in the multi-grid group.
   */
  __CG_QUALIFIER__ __hip_uint32_t num_grids() { return internal::multi_grid::num_grids(); }

  /** Rank of this grid within [0, num_grids() ).
   */
  __CG_QUALIFIER__ __hip_uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
  /** Rank of the calling thread within [0, num_threads() ).
   */
  __CG_QUALIFIER__ __hip_uint32_t thread_rank() const {
    return internal::multi_grid::thread_rank();
  }
  /** Returns true if the group has not violated any API constraints.
   */
  __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
  /** Synchronizes the threads in the group.
   */
  __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
};

/** User-exposed API interface to construct multi-grid cooperative group type object -
 *  multi_grid_group.
 */
__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
  return multi_grid_group(internal::multi_grid::num_threads());
}
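
/* Illustrative usage sketch (not part of the original header): a multi-grid
 * group spans all grids launched together through
 * hipLaunchCooperativeKernelMultiDevice; mg.sync() is a barrier across every
 * participating grid. Kernel and parameter names here are hypothetical.
 *
 * \code
 * __global__ void mg_kernel(int* ready_flags) {
 *   cooperative_groups::multi_grid_group mg = cooperative_groups::this_multi_grid();
 *   if (mg.thread_rank() == 0) ready_flags[mg.grid_rank()] = 1;
 *   mg.sync();  // All grids observe every flag after this point.
 * }
 * \endcode
 */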
// Doxygen end group CooperativeGConstruct

/** \brief The grid cooperative group type.
 */
class grid_group : public thread_group {
  // Only this friend can construct a grid_group.
  friend __CG_QUALIFIER__ grid_group this_grid();

 protected:
  // Construct grid thread group (through the API this_grid())
  explicit __CG_QUALIFIER__ grid_group(__hip_uint32_t size)
      : thread_group(internal::cg_grid, size) {}

 public:
  /** Rank of the calling thread within [0, num_threads() ).
   */
  __CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::grid::thread_rank(); }
  /** Returns true if the group has not violated any API constraints.
   */
  __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
  /** Synchronizes the threads in the group.
   */
  __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
  /** Returns the group dimensions.
   */
  __CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); }
};

/** User-exposed API interface to construct grid cooperative group type object - grid_group.
 */
__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::num_threads()); }
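
/* Illustrative usage sketch (not part of the original header): grid-wide
 * synchronization is only valid for kernels launched through
 * hipLaunchCooperativeKernel. Names below are hypothetical.
 *
 * \code
 * __global__ void scale_then_sync(float* data, size_t n) {
 *   cooperative_groups::grid_group grid = cooperative_groups::this_grid();
 *   for (size_t i = grid.thread_rank(); i < n; i += grid.size()) data[i] *= 2.0f;
 *   grid.sync();  // Every thread of every block reaches this point first.
 * }
 * \endcode
 */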

/** \brief The workgroup (thread-block in CUDA terminology) cooperative group type.
 */
class thread_block : public thread_group {
  // Only these friends can construct a thread_block.
  friend __CG_QUALIFIER__ thread_block this_thread_block();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
                                                       unsigned int tile_size);

 protected:
  // Construct a workgroup thread group (through the API this_thread_block())
  explicit __CG_QUALIFIER__ thread_block(__hip_uint32_t size)
      : thread_group(internal::cg_workgroup, size) {}

  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
    // Invalid tile size, assert
    if (!tile_size || (tile_size > warpSize) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    auto block_size = num_threads();
    auto rank = thread_rank();
    auto partitions = (block_size + tile_size - 1) / tile_size;
    auto tail = (partitions * tile_size) - block_size;
    // Only the last partition may be smaller than tile_size.
    auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
    thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);

    tiledGroup.coalesced_info.tiled_info.num_threads = tile_size;
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
    tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
    return tiledGroup;
  }

 public:
  /** Returns 3-dimensional block index within the grid.
   */
  __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
  /** Returns 3-dimensional thread index within the block.
   */
  __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
  /** Rank of the calling thread within [0, num_threads() ).
   */
  __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
    return internal::workgroup::thread_rank();
  }
  /** Total number of threads in the group.
   */
  __CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
    return internal::workgroup::num_threads();
  }
  /** Total number of threads in the group (alias of num_threads()).
   */
  __CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return num_threads(); }
  /** Returns true if the group has not violated any API constraints.
   */
  __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
  /** Synchronizes the threads in the group.
   */
  __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
  /** Returns the group dimensions.
   */
  __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
};

/** User-exposed API interface to construct workgroup cooperative group type object -
 *  thread_block.
 */
__CG_QUALIFIER__ thread_block this_thread_block() {
  return thread_block(internal::workgroup::num_threads());
}
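
/* Illustrative usage sketch (not part of the original header): block-level
 * cooperation through shared memory, assuming blockDim.x == 256. The kernel
 * name is hypothetical.
 *
 * \code
 * __global__ void reverse_block(const int* in, int* out) {
 *   __shared__ int tmp[256];
 *   cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
 *   unsigned int t = block.thread_rank();
 *   tmp[t] = in[block.group_index().x * block.size() + t];
 *   block.sync();  // Equivalent to __syncthreads() for the whole block.
 *   out[block.group_index().x * block.size() + t] = tmp[block.size() - 1 - t];
 * }
 * \endcode
 */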

/** \brief The tiled_group cooperative group type.
 */
class tiled_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
                                                      unsigned int tile_size);

  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || (tile_size > warpSize) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    if (num_threads() <= tile_size) {
      return *this;
    }

    tiled_group tiledGroup = tiled_group(tile_size);
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    return tiledGroup;
  }

 protected:
  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
      : thread_group(internal::cg_tiled_group, tileSize) {
    coalesced_info.tiled_info.num_threads = tileSize;
    coalesced_info.tiled_info.is_tiled = true;
  }

 public:
  /** Number of threads in the tile.
   */
  __CG_QUALIFIER__ unsigned int num_threads() const {
    return coalesced_info.tiled_info.num_threads;
  }

  /** Total number of threads in the group (alias of num_threads()).
   */
  __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }

  /** Rank of the calling thread within [0, num_threads() ).
   */
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.num_threads - 1));
  }

  /** Synchronizes the threads in the group.
   */
  __CG_QUALIFIER__ void sync() const { internal::tiled_group::sync(); }
};

template <unsigned int size, class ParentCGTy> class thread_block_tile;

/** \brief The coalesced_group cooperative group type.
 */
class coalesced_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
                                                          unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
  template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
  binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);

  __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || !pow2) {
      return coalesced_group(0);
    }

    // If a tiled group is passed to be partitioned further into a coalesced_group,
    // prepare a mask for the further partitioning so that it stays coalesced.
    if (coalesced_info.tiled_info.is_tiled) {
      unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
      unsigned int masklength =
          min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
      // Note: 1ull keeps the 32-lane shift well-defined (shifting a 32-bit value by 32 is UB).
      lane_mask full_mask = (static_cast<int>(warpSize) == 32)
          ? static_cast<lane_mask>((1ull << 32) - 1)
          : static_cast<lane_mask>(-1ull);
      lane_mask member_mask = full_mask >> (warpSize - masklength);

      member_mask <<= (__lane_id() & ~(tile_size - 1));
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size = num_threads() / tile_size;
      return coalesced_tile;
    }
    // Here the parent coalesced_group is not partitioned.
    else {
      lane_mask member_mask = 0;
      unsigned int tile_rank = 0;
      int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;

      for (unsigned int i = 0; i < warpSize; i++) {
        lane_mask active = coalesced_info.member_mask & (static_cast<lane_mask>(1) << i);
        // Make sure the lane is active
        if (active) {
          if (lanes_to_skip <= 0 && tile_rank < tile_size) {
            // Prepare a member_mask that is appropriate for a tile
            member_mask |= active;
            tile_rank++;
          }
          lanes_to_skip--;
        }
      }
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size =
          (num_threads() + tile_size - 1) / tile_size;
      return coalesced_tile;
    }
    return coalesced_group(0);
  }

 protected:
  // Constructor
  explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
      : thread_group(internal::cg_coalesced_group) {
    coalesced_info.member_mask = member_mask;  // Which threads are active
    coalesced_info.num_threads =
        __popcll(coalesced_info.member_mask);    // How many threads are active
    coalesced_info.tiled_info.is_tiled = false;  // Not a partitioned group
    coalesced_info.tiled_info.meta_group_rank = 0;
    coalesced_info.tiled_info.meta_group_size = 1;
  }

 public:
  /** Number of threads in the group.
   */
  __CG_QUALIFIER__ unsigned int num_threads() const { return coalesced_info.num_threads; }

  /** Total number of threads in the group (alias of num_threads()).
   */
  __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }

  /** Rank of the calling thread within [0, num_threads() ).
   */
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
  }

  /** Synchronizes the threads in the group.
   */
  __CG_QUALIFIER__ void sync() const { internal::coalesced_group::sync(); }

  /** Rank of this group within the set of groups created when the parent was partitioned.
   */
  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
    return coalesced_info.tiled_info.meta_group_rank;
  }

  /** Returns the number of groups created when the parent group was partitioned.
   */
  __CG_QUALIFIER__ unsigned int meta_group_size() const {
    return coalesced_info.tiled_info.meta_group_size;
  }

  /** Shuffle operation on group level: returns var from the lane with rank srcRank.
   */
  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    srcRank = srcRank % static_cast<int>(num_threads());

    int lane = (num_threads() == warpSize) ? srcRank
        : (static_cast<int>(warpSize) == 64)
        ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
        : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));

    return __shfl(var, lane, warpSize);
  }

  /** Shuffle down operation on group level.
   */
  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    // Note: The CUDA implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

    if (num_threads() == warpSize) {
      return __shfl_down(var, lane_delta, warpSize);
    }

    int lane;
    if (static_cast<int>(warpSize) == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    } else {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }

    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, warpSize);
  }

  /** Shuffle up operation on group level.
   */
  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    // Note: The CUDA implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

    if (num_threads() == warpSize) {
      return __shfl_up(var, lane_delta, warpSize);
    }

    int lane = -1;  // Initialized so the fallback below also covers an unexpected warpSize.
    if (static_cast<int>(warpSize) == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    } else if (static_cast<int>(warpSize) == 32) {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    }

    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, warpSize);
  }
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)

  /** Ballot function on group level.
   */
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __ballot_sync<unsigned long long>(coalesced_info.member_mask, pred));
  }

  /** Any function on group level.
   */
  __CG_QUALIFIER__ int any(int pred) const {
    return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  /** All function on group level.
   */
  __CG_QUALIFIER__ int all(int pred) const {
    return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  /** Match any function on group level.
   */
  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
  }

  /** Match all function on group level.
   */
  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
                         &pred));
  }
#endif  // HIP_DISABLE_WARP_SYNC_BUILTINS
};

/** User-exposed API to create coalesced groups.
 */
__CG_QUALIFIER__ coalesced_group coalesced_threads() {
  return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
}
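
/* Illustrative usage sketch (not part of the original header): the classic
 * aggregated-atomics pattern. Inside a divergent branch, coalesced_threads()
 * returns just the lanes that took the branch, so rank 0 is the first active
 * lane. Names below are hypothetical.
 *
 * \code
 * __global__ void count_matches(const int* data, int* counter) {
 *   int v = data[blockIdx.x * blockDim.x + threadIdx.x];
 *   if (v % 3 == 0) {
 *     cooperative_groups::coalesced_group active = cooperative_groups::coalesced_threads();
 *     int base = 0;
 *     if (active.thread_rank() == 0)
 *       base = atomicAdd(counter, static_cast<int>(active.size()));
 *     base = active.shfl(base, 0);  // Broadcast the reserved offset from the leader.
 *   }
 * }
 * \endcode
 */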

#ifndef DOXYGEN_SHOULD_SKIP_THIS

__CG_QUALIFIER__ __hip_uint32_t thread_group::thread_rank() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->thread_rank());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->thread_rank());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->thread_rank());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->thread_rank());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->thread_rank());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return -1;
    }
  }
}

__CG_QUALIFIER__ bool thread_group::is_valid() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->is_valid());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->is_valid());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->is_valid());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->is_valid());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->is_valid());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return false;
    }
  }
}

__CG_QUALIFIER__ void thread_group::sync() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      static_cast<const multi_grid_group*>(this)->sync();
      break;
    }
    case internal::cg_grid: {
      static_cast<const grid_group*>(this)->sync();
      break;
    }
    case internal::cg_workgroup: {
      static_cast<const thread_block*>(this)->sync();
      break;
    }
    case internal::cg_tiled_group: {
      static_cast<const tiled_group*>(this)->sync();
      break;
    }
    case internal::cg_coalesced_group: {
      static_cast<const coalesced_group*>(this)->sync();
      break;
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
    }
  }
}

#endif

/** Returns the size of the group.
 */
template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) {
  return g.num_threads();
}

/** Returns the rank of the calling thread within the group.
 */
template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t thread_rank(CGTy const& g) {
  return g.thread_rank();
}

/** Returns true if the group has not violated any API constraints.
 */
template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }

/** Synchronizes the threads in the group.
 */
template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }

// Doxygen end group CooperativeGAPI
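
/* Illustrative usage sketch (not part of the original header): the free
 * functions above let generic device code accept any cooperative group type.
 * The helper name is hypothetical.
 *
 * \code
 * template <class CGTy> __device__ float average_share(const CGTy& g, float v) {
 *   cooperative_groups::sync(g);  // Works for blocks, tiles, grids, ...
 *   return v / cooperative_groups::group_size(g);
 * }
 * \endcode
 */
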
template <unsigned int tileSize> class tile_base {
 protected:
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

 public:
  /** Rank of the thread within this tile.
   */
  _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
    return (internal::workgroup::thread_rank() & (numThreads - 1));
  }

  /** Number of threads within this tile.
   */
  __CG_STATIC_QUALIFIER__ unsigned int num_threads() { return numThreads; }

  /** Total number of threads in the group (alias of num_threads()).
   */
  __CG_STATIC_QUALIFIER__ unsigned int size() { return num_threads(); }
};

// Implementation helper shared by all thread_block_tile instantiations.
template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
  static_assert(is_valid_tile_size<size>::value,
                "Tile size is either not a power of 2 or greater than the wavefront size");
  using tile_base<size>::numThreads;

  template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
  binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);

#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
  __CG_QUALIFIER__ unsigned long long build_mask() const {
    unsigned long long mask = ~0ull >> (64 - numThreads);
    // thread_rank() gives thread id from 0..thread launch size.
    return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) * numThreads);
  }
#endif  // HIP_DISABLE_WARP_SYNC_BUILTINS

 public:
  __CG_STATIC_QUALIFIER__ void sync() { internal::tiled_group::sync(); }

  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    return (__shfl(var, srcRank, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    return (__shfl_down(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    return (__shfl_up(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
    return (__shfl_xor(var, laneMask, numThreads));
  }

#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
  }

  __CG_QUALIFIER__ int any(int pred) const { return __any_sync(build_mask(), pred); }

  __CG_QUALIFIER__ int all(int pred) const { return __all_sync(build_mask(), pred); }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_any_sync(mask, value));
  }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
  }
#endif  // HIP_DISABLE_WARP_SYNC_BUILTINS
};

/** \brief User exposed API that captures the state of the parent group pre-partition.
 */
template <unsigned int tileSize, typename ParentCGTy> class parent_group_info {
 public:
  /** Rank of this group within the set of groups created when the parent was partitioned.
   */
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
    return ParentCGTy::thread_rank() / tileSize;
  }

  /** Returns the number of groups created when the parent group was partitioned.
   */
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
    return (ParentCGTy::num_threads() + tileSize - 1) / tileSize;
  }
};

/** \brief Group type - thread_block_tile.
 */
template <unsigned int tileSize, class ParentCGTy> class thread_block_tile_type
    : public thread_block_tile_base<tileSize>,
      public tiled_group,
      public parent_group_info<tileSize, ParentCGTy> {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
    coalesced_info.tiled_info.num_threads = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
  }

  __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank,
                                          unsigned int meta_group_size)
      : tiled_group(numThreads) {
    coalesced_info.tiled_info.num_threads = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
    coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
    coalesced_info.tiled_info.meta_group_size = meta_group_size;
  }

 public:
  using tbtBase::num_threads;
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;
};

// Partial template specialization
template <unsigned int tileSize> class thread_block_tile_type<tileSize, void>
    : public thread_block_tile_base<tileSize>, public tiled_group {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
  __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank,
                                          unsigned int meta_group_size)
      : tiled_group(numThreads) {
    coalesced_info.tiled_info.num_threads = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
    coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
    coalesced_info.tiled_info.meta_group_size = meta_group_size;
  }

 public:
  using tbtBase::num_threads;
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;

  /** Rank of this group within the set of groups created when the parent was partitioned.
   */
  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
    return coalesced_info.tiled_info.meta_group_rank;
  }

  /** Returns the number of groups created when the parent group was partitioned.
   */
  __CG_QUALIFIER__ unsigned int meta_group_size() const {
    return coalesced_info.tiled_info.meta_group_size;
  }
  // Doxygen end group CooperativeG
};

__CG_QUALIFIER__ thread_group this_thread() {
  thread_group g(internal::group_type::cg_coalesced_group, 1, __ockl_activelane_u32());
  return g;
}

/** User-exposed API to partition groups.
 */
__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
  if (parent.cg_type() == internal::cg_tiled_group) {
    const tiled_group* cg = static_cast<const tiled_group*>(&parent);
    return cg->new_tiled_group(tile_size);
  } else if (parent.cg_type() == internal::cg_coalesced_group) {
    const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
    return cg->new_tiled_group(tile_size);
  } else {
    const thread_block* tb = static_cast<const thread_block*>(&parent);
    return tb->new_tiled_group(tile_size);
  }
}

// Thread block type overload
__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

// If a coalesced group is passed to be partitioned, it should remain coalesced
__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
                                                 unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}
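
/* Illustrative usage sketch (not part of the original header): run-time
 * partitioning of the current block into tiles of 8 threads. The kernel name
 * is hypothetical.
 *
 * \code
 * __global__ void tiles_of_eight() {
 *   cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
 *   cooperative_groups::thread_group tile8 = cooperative_groups::tiled_partition(block, 8);
 *   unsigned int lane = tile8.thread_rank();  // 0..7 within the tile.
 *   tile8.sync();
 * }
 * \endcode
 */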

namespace impl {
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;

template <unsigned int size, class ParentCGTy> class thread_block_tile_internal
    : public thread_block_tile_type<size, ParentCGTy> {
 protected:
  template <unsigned int tbtSize, class tbtParentT> __CG_QUALIFIER__ thread_block_tile_internal(
      const thread_block_tile_internal<tbtSize, tbtParentT>& g)
      : thread_block_tile_type<size, ParentCGTy>(g.meta_group_rank(), g.meta_group_size()) {}

  __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
      : thread_block_tile_type<size, ParentCGTy>() {}
};
}  // namespace impl

/** \brief Group type - thread_block_tile.
 */
template <unsigned int size, class ParentCGTy> class thread_block_tile
    : public impl::thread_block_tile_internal<size, ParentCGTy> {
 protected:
  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}

 public:
  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
    return thread_block_tile<size, void>(*this);
  }

#ifdef DOXYGEN_SHOULD_INCLUDE_THIS

  /** Rank of the calling thread within [0, num_threads() ).
   */
  __CG_QUALIFIER__ unsigned int thread_rank() const;

  /** Synchronizes the threads in the group.
   */
  __CG_QUALIFIER__ void sync();

  /** Rank of this group within the set of groups created when the parent was partitioned.
   */
  __CG_QUALIFIER__ unsigned int meta_group_rank() const;

  /** Returns the number of groups created when the parent group was partitioned.
   */
  __CG_QUALIFIER__ unsigned int meta_group_size() const;

  /** Shuffle operation on group level.
   */
  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const;

  /** Shuffle down operation on group level.
   */
  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const;

  /** Shuffle up operation on group level.
   */
  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const;

  /** Shuffle xor operation on group level.
   */
  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const;

  /** Ballot function on group level.
   */
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const;

  /** Any function on group level.
   */
  __CG_QUALIFIER__ int any(int pred) const;

  /** All function on group level.
   */
  __CG_QUALIFIER__ int all(int pred) const;

  /** Match any function on group level.
   */
  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const;

  /** Match all function on group level.
   */
  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const;

#endif
};

template <unsigned int size> class thread_block_tile<size, void>
    : public impl::thread_block_tile_internal<size, void> {
  template <unsigned int, class ParentCGTy> friend class thread_block_tile;

 protected:
 public:
  template <class ParentCGTy>
  __CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
      : impl::thread_block_tile_internal<size, void>(g) {}
};

template <unsigned int size, class ParentCGTy = void> class thread_block_tile;

namespace impl {
template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;

template <unsigned int size> struct tiled_partition_internal<size, thread_block>
    : public thread_block_tile<size, thread_block> {
  __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
      : thread_block_tile<size, thread_block>(g) {}
};

// ParentCGTy = thread_block_tile<ParentSize, GrandParentCGTy> specialization
template <unsigned int size, unsigned int ParentSize, class GrandParentCGTy>
struct tiled_partition_internal<size, thread_block_tile<ParentSize, GrandParentCGTy> >
    : public thread_block_tile<size, thread_block_tile<ParentSize, GrandParentCGTy> > {
  static_assert(size <= ParentSize, "Sub tile size must be <= parent tile size in tiled_partition");

  __CG_QUALIFIER__ tiled_partition_internal(
      const thread_block_tile<ParentSize, GrandParentCGTy>& g)
      : thread_block_tile<size, thread_block_tile<ParentSize, GrandParentCGTy> >(g) {}
};

}  // namespace impl

/** Compile-time tiled_partition: creates tiles whose size is a template parameter.
 */
template <unsigned int size, class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
  static_assert(is_valid_tile_size<size>::value,
                "Tiled partition with size > wavefront size. Currently not supported ");
  return impl::tiled_partition_internal<size, ParentCGTy>(g);
}

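/* Illustrative usage sketch (not part of the original header): the
 * compile-time form returns a thread_block_tile whose shuffle and vote
 * members are declared above. Assumes blockDim.x is a multiple of 16; the
 * kernel name is hypothetical.
 *
 * \code
 * __global__ void tile_reduce(const float* in, float* out) {
 *   namespace cg = cooperative_groups;
 *   cg::thread_block block = cg::this_thread_block();
 *   cg::thread_block_tile<16> tile = cg::tiled_partition<16>(block);
 *   float v = in[blockIdx.x * blockDim.x + block.thread_rank()];
 *   for (unsigned int offset = tile.size() / 2; offset > 0; offset /= 2)
 *     v += tile.shfl_down(v, offset);  // Tile-local tree reduction.
 *   if (tile.thread_rank() == 0)
 *     out[blockIdx.x * (blockDim.x / 16) + tile.meta_group_rank()] = v;
 * }
 * \endcode
 */
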
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)

/** \brief Binary partition: splits the group in two by the given predicate.
 */
__CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred) {
  auto mask = __ballot_sync<unsigned long long>(cgrp.coalesced_info.member_mask, pred);

  if (pred) {
    return coalesced_group(mask);
  } else {
    return coalesced_group(cgrp.coalesced_info.member_mask ^ mask);
  }
}

/** Binary partition overload for compile-time tiles.
 */
template <unsigned int size, class parent>
__CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile<size, parent>& tgrp,
                                                  bool pred) {
  auto mask = __ballot_sync<unsigned long long>(tgrp.build_mask(), pred);

  if (pred) {
    return coalesced_group(mask);
  } else {
    return coalesced_group(tgrp.build_mask() ^ mask);
  }
}
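
/* Illustrative usage sketch (not part of the original header): splitting the
 * active lanes by a predicate; each side becomes its own coalesced group.
 * Names below are hypothetical.
 *
 * \code
 * __global__ void split_odd_even(const int* data, int* odd_total) {
 *   cooperative_groups::coalesced_group active = cooperative_groups::coalesced_threads();
 *   bool is_odd = (data[blockIdx.x * blockDim.x + threadIdx.x] & 1) != 0;
 *   cooperative_groups::coalesced_group side =
 *       cooperative_groups::binary_partition(active, is_odd);
 *   // Lanes with is_odd == true now share one group; the others share another.
 *   if (is_odd && side.thread_rank() == 0)
 *     atomicAdd(odd_total, static_cast<int>(side.size()));
 * }
 * \endcode
 */
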
#endif
}  // namespace cooperative_groups

#endif  // __cplusplus
#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H