clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File

clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File#

HIP Runtime API Reference: clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File
amd_hip_cooperative_groups.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
16#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
17#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
18
19#if __cplusplus
20#if !defined(__HIPCC_RTC__)
21#include <hip/amd_detail/hip_cooperative_groups_helper.h>
22#endif
23
25
35 protected:
36 __hip_uint32_t _type;
37 __hip_uint32_t _num_threads;
38 __hip_uint64_t _mask;
40
46 __CG_QUALIFIER__ thread_group(internal::group_type type,
47 __hip_uint32_t num_threads = static_cast<__hip_uint64_t>(0),
48 __hip_uint64_t mask = static_cast<__hip_uint64_t>(0)) {
49 _type = type;
51 _mask = mask;
52 }
53
54 struct _tiled_info {
56 unsigned int num_threads;
57 unsigned int meta_group_rank;
58 unsigned int meta_group_size;
59 };
60
66
67 friend __CG_QUALIFIER__ thread_group this_thread();
68 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
69 unsigned int tile_size);
70 friend class thread_block;
71
72 public:
76 __CG_QUALIFIER__ __hip_uint32_t num_threads() const { return _num_threads; }
78 __CG_QUALIFIER__ __hip_uint32_t size() const { return num_threads(); }
80 __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
82 __CG_QUALIFIER__ __hip_uint32_t thread_rank() const;
84 __CG_QUALIFIER__ __hip_uint32_t block_rank() const;
86 __CG_QUALIFIER__ bool is_valid() const;
87
100 __CG_QUALIFIER__ void sync() const;
101};
127 friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
128
129 protected:
131 explicit __CG_QUALIFIER__ multi_grid_group(__hip_uint32_t size)
132 : thread_group(internal::cg_multi_grid, size) {}
133
134 public:
137 __CG_QUALIFIER__ __hip_uint32_t num_grids() { return internal::multi_grid::num_grids(); }
138
141 __CG_QUALIFIER__ __hip_uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
143 __CG_QUALIFIER__ __hip_uint32_t thread_rank() const {
144 return internal::multi_grid::thread_rank();
145 }
147 __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
149 __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
150};
151
166 return multi_grid_group(internal::multi_grid::num_threads());
167}
168// Doxygen end group CooperativeGConstruct
179class grid_group : public thread_group {
182 friend __CG_QUALIFIER__ grid_group this_grid();
183
184 protected:
186 explicit __CG_QUALIFIER__ grid_group(__hip_uint32_t size)
187 : thread_group(internal::cg_grid, size) {}
188
189 public:
191 __CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::grid::thread_rank(); }
193 __CG_QUALIFIER__ __hip_uint32_t block_rank() const { return internal::grid::block_rank(); }
195 __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
197 __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
198 __CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); }
200 unsigned int signal;
201 };
203 __CG_QUALIFIER__ arrival_token barrier_arrive() const {
205 t.signal = internal::grid::barrier_signal();
206 return t;
207 }
209 __CG_QUALIFIER__ void barrier_wait(arrival_token&& t) const {
210 internal::grid::barrier_wait(t.signal);
211 }
212};
213
224__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::num_threads()); }
225
238 friend __CG_QUALIFIER__ thread_block this_thread_block();
239 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
240 unsigned int tile_size);
241 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
242 unsigned int tile_size);
243
244 protected:
245 // Construct a workgroup thread group (through the API this_thread_block())
246 explicit __CG_QUALIFIER__ thread_block(__hip_uint32_t size)
247 : thread_group(internal::cg_workgroup, size) {}
248
249 __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
250 const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
251 // Invalid tile size, assert
252 if (!tile_size || (tile_size > warpSize) || !pow2) {
253 __hip_assert(false && "invalid tile size");
254 }
255
256 auto block_size = num_threads();
257 auto rank = thread_rank();
258 auto partitions = (block_size + tile_size - 1) / tile_size;
259 auto tail = (partitions * tile_size) - block_size;
260 auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
261 thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);
262
263 tiledGroup.coalesced_info.tiled_info.num_threads = tile_size;
264 tiledGroup.coalesced_info.tiled_info.is_tiled = true;
265 tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
266 tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
267 return tiledGroup;
268 }
269
270 public:
272 __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
274 __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
276 __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
277 return internal::workgroup::thread_rank();
278 }
280 __CG_STATIC_QUALIFIER__ __hip_uint32_t block_rank() {
281 return internal::workgroup::block_rank();
282 }
284 __CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
285 return internal::workgroup::num_threads();
286 }
288 __CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return num_threads(); }
290 __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
292 __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
294 __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
295 struct arrival_token {};
297 __CG_QUALIFIER__ arrival_token barrier_arrive() const {
298 internal::workgroup::barrier_arrive();
299 return arrival_token{};
300 }
302 __CG_QUALIFIER__ void barrier_wait(arrival_token&&) const { internal::workgroup::barrier_wait(); }
303};
304
315__CG_QUALIFIER__ thread_block this_thread_block() {
316 return thread_block(internal::workgroup::num_threads());
317}
318
326class tiled_group : public thread_group {
327 private:
328 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
329 unsigned int tile_size);
330 friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
331 unsigned int tile_size);
332
333 __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
334 const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
335
336 if (!tile_size || (tile_size > warpSize) || !pow2) {
337 __hip_assert(false && "invalid tile size");
338 }
339
340 if (num_threads() <= tile_size) {
341 return *this;
342 }
343
344 tiled_group tiledGroup = tiled_group(tile_size);
345 tiledGroup.coalesced_info.tiled_info.is_tiled = true;
346 return tiledGroup;
347 }
348
349 protected:
350 explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
351 : thread_group(internal::cg_tiled_group, tileSize) {
354 }
355
356 public:
358 __CG_QUALIFIER__ unsigned int num_threads() const {
360 }
361
363 __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }
364
366 __CG_QUALIFIER__ unsigned int thread_rank() const {
367 return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.num_threads - 1));
368 }
370 __CG_QUALIFIER__ void sync() const { internal::tiled_group::sync(); }
371};
372
373template <unsigned int size, class ParentCGTy> class thread_block_tile;
374
383 private:
384 friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
385 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
386 unsigned int tile_size);
387 friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
388 unsigned int tile_size);
389 friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
390 template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
392
393 __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
394 const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
395
396 if (!tile_size || !pow2) {
397 return coalesced_group(0);
398 }
399
400 // If a tiled group is passed to be partitioned further into a coalesced_group.
401 // prepare a mask for further partitioning it so that it stays coalesced.
403 unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
404 unsigned int masklength =
405 min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
406 lane_mask full_mask = (static_cast<int>(warpSize) == 32)
407 ? static_cast<lane_mask>((1u << 32) - 1)
408 : static_cast<lane_mask>(-1ull);
409 lane_mask member_mask = full_mask >> (warpSize - masklength);
410
411 member_mask <<= (__lane_id() & ~(tile_size - 1));
412 coalesced_group coalesced_tile = coalesced_group(member_mask);
413 coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
414 coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
415 coalesced_tile.coalesced_info.tiled_info.meta_group_size = num_threads() / tile_size;
416 return coalesced_tile;
417 }
418 // Here the parent coalesced_group is not partitioned.
419 else {
420 lane_mask member_mask = 0;
421 unsigned int tile_rank = 0;
422 int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;
423
424 for (unsigned int i = 0; i < warpSize; i++) {
425 lane_mask active = coalesced_info.member_mask & (static_cast<lane_mask>(1) << i);
426 // Make sure the lane is active
427 if (active) {
428 if (lanes_to_skip <= 0 && tile_rank < tile_size) {
429 // Prepare a member_mask that is appropriate for a tile
430 member_mask |= active;
431 tile_rank++;
432 }
433 lanes_to_skip--;
434 }
435 }
436 coalesced_group coalesced_tile = coalesced_group(member_mask);
437 coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
438 coalesced_tile.coalesced_info.tiled_info.meta_group_size =
439 (num_threads() + tile_size - 1) / tile_size;
440 return coalesced_tile;
441 }
442 return coalesced_group(0);
443 }
444
445 protected:
446 // Constructor
447 explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
448 : thread_group(internal::cg_coalesced_group) {
449 coalesced_info.member_mask = member_mask; // Which threads are active
451 __popcll(coalesced_info.member_mask); // How many threads are active
452 coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
455 }
456
457 public:
459 __CG_QUALIFIER__ unsigned int num_threads() const { return coalesced_info.num_threads; }
460
462 __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }
463
465 __CG_QUALIFIER__ unsigned int thread_rank() const {
466 return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
467 }
468
470 __CG_QUALIFIER__ void sync() const { internal::coalesced_group::sync(); }
471
474 __CG_QUALIFIER__ unsigned int meta_group_rank() const {
476 }
477
479 __CG_QUALIFIER__ unsigned int meta_group_size() const {
481 }
482
495 template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
496 srcRank = srcRank % static_cast<int>(num_threads());
497
498 int lane = (num_threads() == warpSize) ? srcRank
499 : (static_cast<int>(warpSize) == 64)
500 ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
501 : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
502
503 return __shfl(var, lane, warpSize);
504 }
505
520 template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
521 // Note: The cuda implementation appears to use the remainder of lane_delta
522 // and WARP_SIZE as the shift value rather than lane_delta itself.
523 // This is not described in the documentation and is not done here.
524
525 if (num_threads() == warpSize) {
526 return __shfl_down(var, lane_delta, warpSize);
527 }
528
529 int lane;
530 if (static_cast<int>(warpSize) == 64) {
531 lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
532 } else {
533 lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
534 }
535
536 if (lane == -1) {
537 lane = __lane_id();
538 }
539
540 return __shfl(var, lane, warpSize);
541 }
542
557 template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
558 // Note: The cuda implementation appears to use the remainder of lane_delta
559 // and WARP_SIZE as the shift value rather than lane_delta itself.
560 // This is not described in the documentation and is not done here.
561
562 if (num_threads() == warpSize) {
563 return __shfl_up(var, lane_delta, warpSize);
564 }
565
566 int lane;
567 if (static_cast<int>(warpSize) == 64) {
568 lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
569 } else if (static_cast<int>(warpSize) == 32) {
570 lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
571 }
572
573 if (lane == -1) {
574 lane = __lane_id();
575 }
576
577 return __shfl(var, lane, warpSize);
578 }
579#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
580
588 __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
589 return internal::helper::adjust_mask(
591 __ballot_sync<unsigned long long>(coalesced_info.member_mask, pred));
592 }
593
600 __CG_QUALIFIER__ int any(int pred) const {
601 return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
602 }
603
610 __CG_QUALIFIER__ int all(int pred) const {
611 return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
612 }
613
622 template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
623 return internal::helper::adjust_mask(
625 __match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
626 }
627
639 template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
640 return internal::helper::adjust_mask(
642 __match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
643 &pred));
644 }
645#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
646};
647
657 return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
658}
659
660#ifndef DOXYGEN_SHOULD_SKIP_THIS
661
667__CG_QUALIFIER__ __hip_uint32_t thread_group::thread_rank() const {
668 switch (this->_type) {
669 case internal::cg_multi_grid: {
670 return (static_cast<const multi_grid_group*>(this)->thread_rank());
671 }
672 case internal::cg_grid: {
673 return (static_cast<const grid_group*>(this)->thread_rank());
674 }
675 case internal::cg_workgroup: {
676 return (static_cast<const thread_block*>(this)->thread_rank());
677 }
678 case internal::cg_tiled_group: {
679 return (static_cast<const tiled_group*>(this)->thread_rank());
680 }
681 case internal::cg_coalesced_group: {
682 return (static_cast<const coalesced_group*>(this)->thread_rank());
683 }
684 default: {
685 __hip_assert(false && "invalid cooperative group type");
686 return -1;
687 }
688 }
689}
690
696__CG_QUALIFIER__ bool thread_group::is_valid() const {
697 switch (this->_type) {
698 case internal::cg_multi_grid: {
699 return (static_cast<const multi_grid_group*>(this)->is_valid());
700 }
701 case internal::cg_grid: {
702 return (static_cast<const grid_group*>(this)->is_valid());
703 }
704 case internal::cg_workgroup: {
705 return (static_cast<const thread_block*>(this)->is_valid());
706 }
707 case internal::cg_tiled_group: {
708 return (static_cast<const tiled_group*>(this)->is_valid());
709 }
710 case internal::cg_coalesced_group: {
711 return (static_cast<const coalesced_group*>(this)->is_valid());
712 }
713 default: {
714 __hip_assert(false && "invalid cooperative group type");
715 return false;
716 }
717 }
718}
719
725__CG_QUALIFIER__ void thread_group::sync() const {
726 switch (this->_type) {
727 case internal::cg_multi_grid: {
728 static_cast<const multi_grid_group*>(this)->sync();
729 break;
730 }
731 case internal::cg_grid: {
732 static_cast<const grid_group*>(this)->sync();
733 break;
734 }
735 case internal::cg_workgroup: {
736 static_cast<const thread_block*>(this)->sync();
737 break;
738 }
739 case internal::cg_tiled_group: {
740 static_cast<const tiled_group*>(this)->sync();
741 break;
742 }
743 case internal::cg_coalesced_group: {
744 static_cast<const coalesced_group*>(this)->sync();
745 break;
746 }
747 default: {
748 __hip_assert(false && "invalid cooperative group type");
749 }
750 }
751}
752
753#endif
754
772template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) {
773 return g.num_threads();
774}
775
787template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t thread_rank(CGTy const& g) {
788 return g.thread_rank();
789}
790
800template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }
801
811template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
812
813// Doxygen end group CooperativeGAPI
821template <unsigned int tileSize> class tile_base {
822 protected:
823 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
824
825 public:
827 _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
828 return (internal::workgroup::thread_rank() & (numThreads - 1));
829 }
830
832 __CG_STATIC_QUALIFIER__ unsigned int num_threads() { return numThreads; }
833
836 __CG_STATIC_QUALIFIER__ unsigned int size() { return num_threads(); }
837};
838
844template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
845 static_assert(is_valid_tile_size<size>::value,
846 "Tile size is either not a power of 2 or greater than the wavefront size");
848
849 template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
851
852#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
853 __CG_QUALIFIER__ unsigned long long build_mask() const {
854 unsigned long long mask = ~0ull >> (64 - numThreads);
855 // thread_rank() gives thread id from 0..thread launch size.
856 return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) * numThreads);
857 }
858#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
859
860 public:
861 __CG_STATIC_QUALIFIER__ void sync() { internal::tiled_group::sync(); }
862
863 template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
864 return (__shfl(var, srcRank, numThreads));
865 }
866
867 template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
868 return (__shfl_down(var, lane_delta, numThreads));
869 }
870
871 template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
872 return (__shfl_up(var, lane_delta, numThreads));
873 }
874
875 template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
876 return (__shfl_xor(var, laneMask, numThreads));
877 }
878
879#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
880 __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
881 const auto mask = build_mask();
882 return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
883 }
884
885 __CG_QUALIFIER__ int any(int pred) const { return __any_sync(build_mask(), pred); }
886
887 __CG_QUALIFIER__ int all(int pred) const { return __all_sync(build_mask(), pred); }
888
889 template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
890 const auto mask = build_mask();
891 return internal::helper::adjust_mask(mask, __match_any_sync(mask, value));
892 }
893
894 template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
895 const auto mask = build_mask();
896 return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
897 }
898#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
899};
900
903template <unsigned int tileSize, typename ParentCGTy> class parent_group_info {
904 public:
907 __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
908 return ParentCGTy::thread_rank() / tileSize;
909 }
910
912 __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
913 return (ParentCGTy::num_threads() + tileSize - 1) / tileSize;
914 }
915};
916
923template <unsigned int tileSize, class ParentCGTy> class thread_block_tile_type
924 : public thread_block_tile_base<tileSize>,
925 public tiled_group,
926 public parent_group_info<tileSize, ParentCGTy> {
927 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
929
930 protected:
931 __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
934 }
935
944
945 public:
947 using tbtBase::size;
950};
951
952// Partial template specialization
953template <unsigned int tileSize> class thread_block_tile_type<tileSize, void>
954 : public thread_block_tile_base<tileSize>, public tiled_group {
955 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
956
958
959 protected:
968
969 public:
971 using tbtBase::size;
972 using tbtBase::sync;
974
977 __CG_QUALIFIER__ unsigned int meta_group_rank() const {
979 }
980
982 __CG_QUALIFIER__ unsigned int meta_group_size() const {
984 }
985 // Doxygen end group CooperativeG
989};
990
991__CG_QUALIFIER__ thread_group this_thread() {
992 thread_group g(internal::group_type::cg_coalesced_group, 1, __ockl_activelane_u32());
993 return g;
994}
995
1003__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
1004 if (parent.cg_type() == internal::cg_tiled_group) {
1005 const tiled_group* cg = static_cast<const tiled_group*>(&parent);
1006 return cg->new_tiled_group(tile_size);
1007 } else if (parent.cg_type() == internal::cg_coalesced_group) {
1008 const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
1009 return cg->new_tiled_group(tile_size);
1010 } else {
1011 const thread_block* tb = static_cast<const thread_block*>(&parent);
1012 return tb->new_tiled_group(tile_size);
1013 }
1014}
1015
1016// Thread block type overload
1017__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
1018 return (parent.new_tiled_group(tile_size));
1019}
1020
1021__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
1022 return (parent.new_tiled_group(tile_size));
1023}
1024
1025// If a coalesced group is passed to be partitioned, it should remain coalesced
1026__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
1027 unsigned int tile_size) {
1028 return (parent.new_tiled_group(tile_size));
1029}
1030
1031namespace impl {
1032template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
1033
1034template <unsigned int size, class ParentCGTy> class thread_block_tile_internal
1035 : public thread_block_tile_type<size, ParentCGTy> {
1036 protected:
1037 template <unsigned int tbtSize, class tbtParentT> __CG_QUALIFIER__ thread_block_tile_internal(
1040
1041 __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
1042 : thread_block_tile_type<size, ParentCGTy>() {}
1043};
1044} // namespace impl
1045
1054template <unsigned int size, class ParentCGTy> class thread_block_tile
1055 : public impl::thread_block_tile_internal<size, ParentCGTy> {
1056 protected:
1057 __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
1058 : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}
1059
1060 public:
1061 __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
1062 return thread_block_tile<size, void>(*this);
1063 }
1064
1065#ifdef DOXYGEN_SHOULD_INCLUDE_THIS
1066
1068 __CG_QUALIFIER__ unsigned int thread_rank() const;
1069
1071 __CG_QUALIFIER__ void sync();
1072
1075 __CG_QUALIFIER__ unsigned int meta_group_rank() const;
1076
1078 __CG_QUALIFIER__ unsigned int meta_group_size() const;
1079
1092 template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const;
1093
1108 template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const;
1109
1124 template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const;
1125
1138 template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const;
1139
1147 __CG_QUALIFIER__ unsigned long long ballot(int pred) const;
1148
1155 __CG_QUALIFIER__ int any(int pred) const;
1156
1163 __CG_QUALIFIER__ int all(int pred) const;
1164
1173 template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const;
1174
1186 template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const;
1187
1188#endif
1189};
1190
1191template <unsigned int size> class thread_block_tile<size, void>
1192 : public impl::thread_block_tile_internal<size, void> {
1193 template <unsigned int, class ParentCGTy> friend class thread_block_tile;
1194
1195 protected:
1196 public:
1197 template <class ParentCGTy>
1199 : impl::thread_block_tile_internal<size, void>(g) {}
1200};
1201
1202template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
1203
1204namespace impl {
1205template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;
1206
1207template <unsigned int size> struct tiled_partition_internal<size, thread_block>
1208 : public thread_block_tile<size, thread_block> {
1211};
1212
1213// ParentCGTy = thread_block_tile<ParentSize, GrandParentCGTy> specialization
1214template <unsigned int size, unsigned int ParentSize, class GrandParentCGTy>
1215struct tiled_partition_internal<size, thread_block_tile<ParentSize, GrandParentCGTy> >
1216 : public thread_block_tile<size, thread_block_tile<ParentSize, GrandParentCGTy> > {
1217 static_assert(size <= ParentSize, "Sub tile size must be <= parent tile size in tiled_partition");
1218
1220 : thread_block_tile<size, thread_block_tile<ParentSize, GrandParentCGTy> >(g) {}
1221};
1222
1223} // namespace impl
1224
1237template <unsigned int size, class ParentCGTy>
1238__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
1239 static_assert(is_valid_tile_size<size>::value,
1240 "Tiled partition with size > wavefront size. Currently not supported ");
1242}
1243
1244#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
1245
1254__CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred) {
1255 auto mask = __ballot_sync<unsigned long long>(cgrp.coalesced_info.member_mask, pred);
1256
1257 if (pred) {
1258 return coalesced_group(mask);
1259 } else {
1260 return coalesced_group(cgrp.coalesced_info.member_mask ^ mask);
1261 }
1262}
1263
1275template <unsigned int size, class parent>
1277 bool pred) {
1278 auto mask = __ballot_sync<unsigned long long>(tgrp.build_mask(), pred);
1279
1280 if (pred) {
1281 return coalesced_group(mask);
1282 } else {
1283 return coalesced_group(tgrp.build_mask() ^ mask);
1284 }
1285}
1286#endif
1287} // namespace cooperative_groups
1288
1289#endif // __cplusplus
1290#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
The coalesced_group cooperative group type.
Definition amd_hip_cooperative_groups.h:382
The grid cooperative group type.
Definition amd_hip_cooperative_groups.h:179
Definition amd_hip_cooperative_groups.h:1035
thread_block_tile_internal(const thread_block &g)
Definition amd_hip_cooperative_groups.h:1041
thread_block_tile_internal(const thread_block_tile_internal< tbtSize, tbtParentT > &g)
Definition amd_hip_cooperative_groups.h:1037
The multi-grid cooperative group type.
Definition amd_hip_cooperative_groups.h:124
User exposed API that captures the state of the parent group pre-partition.
Definition amd_hip_cooperative_groups.h:903
thread_block_tile(const thread_block_tile< size, ParentCGTy > &g)
Definition amd_hip_cooperative_groups.h:1198
Definition amd_hip_cooperative_groups.h:844
Group type - thread_block_tile.
Definition amd_hip_cooperative_groups.h:926
Group type - thread_block_tile.
Definition amd_hip_cooperative_groups.h:1055
T shfl_down(T var, unsigned int lane_delta) const
Shuffle down operation on group level.
int any(int pred) const
Any function on group level.
unsigned long long ballot(int pred) const
Ballot function on group level.
thread_block_tile(const ParentCGTy &g)
Definition amd_hip_cooperative_groups.h:1057
unsigned int meta_group_rank() const
unsigned int thread_rank() const
Rank of the calling thread within [0, num_threads() ).
T shfl_xor(T var, unsigned int laneMask) const
Shuffle xor operation on group level.
int all(int pred) const
All function on group level.
unsigned long long match_any(T value) const
Match any function on group level.
unsigned long long match_all(T value, int &pred) const
Match all function on group level.
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
T shfl_up(T var, unsigned int lane_delta) const
Shuffle up operation on group level.
void sync()
Synchronizes the threads in the group.
T shfl(T var, int srcRank) const
Shuffle operation on group level.
The workgroup (thread-block in CUDA terminology) cooperative group type.
Definition amd_hip_cooperative_groups.h:235
The base type of all cooperative group types.
Definition amd_hip_cooperative_groups.h:34
Definition amd_hip_cooperative_groups.h:821
The tiled_group cooperative group type.
Definition amd_hip_cooperative_groups.h:326
const struct texture< T, dim, readMode > const void size_t size
Definition hip_runtime_api.h:10137
bool is_valid(CGTy const &g)
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:800
void sync(CGTy const &g)
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:811
__hip_uint32_t group_size(CGTy const &g)
Returns the size of the group.
Definition amd_hip_cooperative_groups.h:772
__hip_uint32_t thread_rank(CGTy const &g)
Returns the rank of thread of the group.
Definition amd_hip_cooperative_groups.h:787
thread_block this_thread_block()
User-exposed API interface to construct workgroup cooperative group type object - thread_block.
Definition amd_hip_cooperative_groups.h:315
coalesced_group binary_partition(const coalesced_group &cgrp, bool pred)
Binary partition.
Definition amd_hip_cooperative_groups.h:1254
thread_group tiled_partition(const thread_group &parent, unsigned int tile_size)
User-exposed API to partition groups.
Definition amd_hip_cooperative_groups.h:1003
multi_grid_group this_multi_grid()
User-exposed API interface to construct grid cooperative group type object - multi_grid_group.
Definition amd_hip_cooperative_groups.h:165
coalesced_group coalesced_threads()
User-exposed API to create coalesced groups.
Definition amd_hip_cooperative_groups.h:656
grid_group this_grid()
User-exposed API interface to construct grid cooperative group type object - grid_group.
Definition amd_hip_cooperative_groups.h:224
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:370
T shfl_xor(T var, unsigned int laneMask) const
Definition amd_hip_cooperative_groups.h:875
static constexpr unsigned int numThreads
Definition amd_hip_cooperative_groups.h:823
friend multi_grid_group this_multi_grid()
User-exposed API interface to construct grid cooperative group type object - multi_grid_group.
Definition amd_hip_cooperative_groups.h:165
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:470
static void sync()
Definition amd_hip_cooperative_groups.h:861
void barrier_wait(arrival_token &&t) const
Arrive at a barrier.
Definition amd_hip_cooperative_groups.h:209
unsigned int num_threads
Definition amd_hip_cooperative_groups.h:56
__hip_uint32_t thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:191
arrival_token barrier_arrive() const
Arrive at a barrier.
Definition amd_hip_cooperative_groups.h:203
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:197
__hip_uint32_t size() const
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:78
unsigned int size() const
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:462
__hip_uint32_t num_grids()
Definition amd_hip_cooperative_groups.h:137
unsigned long long match_all(T value, int &pred) const
Definition amd_hip_cooperative_groups.h:894
unsigned long long match_any(T value) const
Match any function on group level.
Definition amd_hip_cooperative_groups.h:622
friend thread_block this_thread_block()
User-exposed API interface to construct workgroup cooperative group type object - thread_block.
Definition amd_hip_cooperative_groups.h:315
static unsigned int meta_group_size()
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:912
unsigned int signal
Definition amd_hip_cooperative_groups.h:200
unsigned int meta_group_rank() const
Definition amd_hip_cooperative_groups.h:474
dim3 group_dim() const
Definition amd_hip_cooperative_groups.h:198
unsigned int meta_group_rank() const
Definition amd_hip_cooperative_groups.h:977
__hip_uint32_t _num_threads
Type of the thread_group.
Definition amd_hip_cooperative_groups.h:37
__hip_uint32_t block_rank() const
Rank of the block in calling thread within [0, num_threads() ).
static unsigned int num_threads()
Number of threads within this tile.
Definition amd_hip_cooperative_groups.h:832
thread_group(internal::group_type type, __hip_uint32_t num_threads=static_cast< __hip_uint64_t >(0), __hip_uint64_t mask=static_cast< __hip_uint64_t >(0))
Definition amd_hip_cooperative_groups.h:46
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:479
unsigned int thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:366
unsigned int thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:465
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:982
int all(int pred) const
All function on group level.
Definition amd_hip_cooperative_groups.h:610
unsigned int cg_type() const
Returns the type of the group.
Definition amd_hip_cooperative_groups.h:80
T shfl(T var, int srcRank) const
Definition amd_hip_cooperative_groups.h:863
thread_group new_tiled_group(unsigned int tile_size) const
Definition amd_hip_cooperative_groups.h:249
static __hip_uint32_t size()
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:288
unsigned int num_threads() const
Definition amd_hip_cooperative_groups.h:358
unsigned int num_threads() const
Definition amd_hip_cooperative_groups.h:459
void barrier_wait(arrival_token &&) const
Arrive at a barrier.
Definition amd_hip_cooperative_groups.h:302
tiled_group(unsigned int tileSize)
Definition amd_hip_cooperative_groups.h:350
unsigned int meta_group_rank
Definition amd_hip_cooperative_groups.h:57
void sync() const
Synchronizes the threads in the group.
unsigned long long match_all(T value, int &pred) const
Match all function on group level.
Definition amd_hip_cooperative_groups.h:639
thread_block_tile_type()
Definition amd_hip_cooperative_groups.h:931
__hip_uint32_t block_rank() const
Rank of the block in calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:193
grid_group(__hip_uint32_t size)
Construct grid thread group (through the API this_grid())
Definition amd_hip_cooperative_groups.h:186
__hip_uint32_t grid_rank()
Definition amd_hip_cooperative_groups.h:141
static constexpr unsigned int thread_rank()
Rank of the thread within this tile.
Definition amd_hip_cooperative_groups.h:827
bool is_tiled
Definition amd_hip_cooperative_groups.h:55
unsigned long long ballot(int pred) const
Ballot function on group level.
Definition amd_hip_cooperative_groups.h:588
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:149
T shfl_up(T var, unsigned int lane_delta) const
Definition amd_hip_cooperative_groups.h:871
static __hip_uint32_t block_rank()
Rank of the block in calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:280
static dim3 group_index()
Returns 3-dimensional block index within the grid.
Definition amd_hip_cooperative_groups.h:272
unsigned int num_threads
Definition amd_hip_cooperative_groups.h:63
static void sync()
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:292
friend coalesced_group binary_partition(const thread_block_tile< fsize, fparent > &tgrp, bool pred)
__hip_uint32_t thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:143
bool is_valid() const
Returns true if the group has not violated any API constraints.
static unsigned int size()
Definition amd_hip_cooperative_groups.h:836
friend thread_group tiled_partition(const thread_group &parent, unsigned int tile_size)
User-exposed API to partition groups.
Definition amd_hip_cooperative_groups.h:1003
int any(int pred) const
Definition amd_hip_cooperative_groups.h:885
int all(int pred) const
Definition amd_hip_cooperative_groups.h:887
__hip_uint64_t _mask
Total number of threads in the thread_group.
Definition amd_hip_cooperative_groups.h:38
friend thread_group this_thread()
Definition amd_hip_cooperative_groups.h:991
bool is_valid() const
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:195
__hip_uint32_t _type
Definition amd_hip_cooperative_groups.h:36
unsigned long long match_any(T value) const
Definition amd_hip_cooperative_groups.h:889
unsigned int size() const
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:363
struct cooperative_groups::thread_group::_coalesced_info coalesced_info
lane_mask member_mask
Definition amd_hip_cooperative_groups.h:62
thread_block(__hip_uint32_t size)
Definition amd_hip_cooperative_groups.h:246
unsigned int meta_group_size
Definition amd_hip_cooperative_groups.h:58
multi_grid_group(__hip_uint32_t size)
Construct multi-grid thread group (through the API this_multi_grid())
Definition amd_hip_cooperative_groups.h:131
int any(int pred) const
Any function on group level.
Definition amd_hip_cooperative_groups.h:600
thread_group this_thread()
Definition amd_hip_cooperative_groups.h:991
static unsigned int meta_group_rank()
Definition amd_hip_cooperative_groups.h:907
__hip_uint32_t num_threads() const
Definition amd_hip_cooperative_groups.h:76
struct _tiled_info tiled_info
Definition amd_hip_cooperative_groups.h:64
friend coalesced_group coalesced_threads()
User-exposed API to create coalesced groups.
Definition amd_hip_cooperative_groups.h:656
friend coalesced_group binary_partition(const coalesced_group &cgrp, bool pred)
Binary partition.
Definition amd_hip_cooperative_groups.h:1254
static dim3 thread_index()
Returns 3-dimensional thread index within the block.
Definition amd_hip_cooperative_groups.h:274
friend class thread_block
Definition amd_hip_cooperative_groups.h:70
coalesced_group(lane_mask member_mask)
Definition amd_hip_cooperative_groups.h:447
T shfl_down(T var, unsigned int lane_delta) const
Shuffle down operation on group level.
Definition amd_hip_cooperative_groups.h:520
static bool is_valid()
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:290
dim3 group_dim()
Returns the group dimensions.
Definition amd_hip_cooperative_groups.h:294
thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
Definition amd_hip_cooperative_groups.h:960
static __hip_uint32_t num_threads()
Definition amd_hip_cooperative_groups.h:284
unsigned long long ballot(int pred) const
Definition amd_hip_cooperative_groups.h:880
T shfl(T var, int srcRank) const
Shuffle operation on group level.
Definition amd_hip_cooperative_groups.h:495
arrival_token barrier_arrive() const
Arrive at a barrier.
Definition amd_hip_cooperative_groups.h:297
bool is_valid() const
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:147
static __hip_uint32_t thread_rank()
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:276
friend grid_group this_grid()
User-exposed API interface to construct grid cooperative group type object - grid_group.
Definition amd_hip_cooperative_groups.h:224
T shfl_down(T var, unsigned int lane_delta) const
Definition amd_hip_cooperative_groups.h:867
T shfl_up(T var, unsigned int lane_delta) const
Shuffle up operation on group level.
Definition amd_hip_cooperative_groups.h:557
thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
Definition amd_hip_cooperative_groups.h:936
__hip_uint32_t thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:24
Definition amd_hip_cooperative_groups.h:199
tiled_partition_internal(const thread_block &g)
Definition amd_hip_cooperative_groups.h:1209
tiled_partition_internal(const thread_block_tile< ParentSize, GrandParentCGTy > &g)
Definition amd_hip_cooperative_groups.h:1219
Definition amd_hip_cooperative_groups.h:1205
Definition amd_hip_cooperative_groups.h:295
Definition amd_hip_cooperative_groups.h:61
Definition amd_hip_cooperative_groups.h:54
Definition hip_runtime_api.h:1284