/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hip/checkouts/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hip/checkouts/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File#

HIP Runtime API Reference: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hip/checkouts/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File
amd_hip_cooperative_groups.h
Go to the documentation of this file.
1/*
2Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
32#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
33#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
34
35#if __cplusplus
36#if !defined(__HIPCC_RTC__)
37#include <hip/amd_detail/hip_cooperative_groups_helper.h>
38#endif
39
41
 protected:
  uint32_t _type;  // internal::group_type tag identifying the concrete group kind
  uint32_t _size;  // number of threads in the group
  uint64_t _mask;  // lane mask (used by coalesced groups)

  // Construct a group of the given kind/size/mask. Protected: only derived
  // group types and the friend factory functions below may create one.
  // NOTE(review): `size` is uint32_t but its default is cast to uint64_t —
  // harmless for the value 0, but looks like a copy/paste of the mask default.
  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint64_t>(0),
                                uint64_t mask = static_cast<uint64_t>(0)) {
    _type = type;
    _size = size;
    _mask = mask;
  }

  // Book-keeping for tiled partitions of a parent group.
  struct _tiled_info {
    unsigned int size;             // threads per tile
    unsigned int meta_group_rank;  // this tile's index among the tiles
    unsigned int meta_group_size;  // total number of tiles
  };
  // NOTE(review): members referenced later as `coalesced_info` (with fields
  // member_mask / size / tiled_info.is_tiled) are not visible here — lines
  // appear to have been lost in extraction.

  friend __CG_QUALIFIER__ thread_group this_thread();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend class thread_block;

 public:
  // Number of threads in the group.
  __CG_QUALIFIER__ uint32_t size() const { return _size; }
  // The internal::group_type tag of this group.
  __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
  // Rank of the calling thread within [0, size()); dispatches on _type (defined below).
  __CG_QUALIFIER__ uint32_t thread_rank() const;
  // True if the group has not violated API constraints; dispatches on _type.
  __CG_QUALIFIER__ bool is_valid() const;
  // Synchronize the threads in the group; dispatches on _type.
  __CG_QUALIFIER__ void sync() const;
};
  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();

 protected:
  // Construct via the this_multi_grid() factory only.
  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
      : thread_group(internal::cg_multi_grid, size) {}

 public:
  // Number of grids launched as part of the multi-grid cooperative launch.
  __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
  // Rank of the current grid within [0, num_grids()).
  __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
  // Rank of the calling thread across all threads of the multi-grid.
  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
  // True if the multi-grid launch constraints are satisfied.
  __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
  // Synchronize every thread of every grid in the multi-grid.
  __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
};

// Body of this_multi_grid(). NOTE(review): its signature line was lost in
// extraction; the page's index lists "multi_grid_group this_multi_grid()".
  return multi_grid_group(internal::multi_grid::size());
}
// Doxygen end group CooperativeGConstruct
// The grid cooperative group type: all threads of the current kernel launch.
class grid_group : public thread_group {
  // Only the factory below may construct it.
  friend __CG_QUALIFIER__ grid_group this_grid();

 protected:
  // Construct a grid group holding `size` threads (via this_grid()).
  explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}

 public:
  // Rank of the calling thread within [0, size()).
  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
  // True if the cooperative-launch constraints are satisfied.
  __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
  // Synchronize all threads of the grid.
  __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
  // Dimensions of the launched block. NOTE(review): this delegates to the
  // *workgroup* block dim — confirm that is the intended grid-level semantic.
  __CG_QUALIFIER__ dim3 group_dim() const { return internal::workgroup::block_dim(); }
};
207
218__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
219
  // Factory and partitioning functions need access to the protected ctor.
  friend __CG_QUALIFIER__ thread_block this_thread_block();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
                                                       unsigned int tile_size);
 protected:
  // Construct a workgroup thread group (through the API this_thread_block())
  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
      : thread_group(internal::cg_workgroup, size) {}

  // Split the block into tiles of `tile_size` threads (power of 2, <= wavefront).
  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
    // Invalid tile size, assert
    if (!tile_size || (tile_size > warpSize) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    auto block_size = size();
    auto rank = thread_rank();
    // Number of tiles, rounding up so a partial tail tile is counted.
    auto partitions = (block_size + tile_size - 1) / tile_size;
    // How many threads are missing from the last (partial) tile.
    auto tail = (partitions * tile_size) - block_size;
    // Caller's tile size: tile_size, except for ranks falling in the last
    // tile (the boolean multiplier is 1 there), which is short by `tail`.
    auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
    thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);

    tiledGroup.coalesced_info.tiled_info.size = tile_size;
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
    tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
    return tiledGroup;
  }

 public:
  // 3D index of the block within the grid.
  __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
  // 3D index of the calling thread within the block.
  __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
  // Linear rank of the calling thread within [0, size()).
  __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
  // Number of threads in the block.
  __CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); }
  // True if the group has not violated any API constraints.
  __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
  // Synchronize all threads in the block.
  __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
  // Dimensions of the block.
  __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
};
279
290__CG_QUALIFIER__ thread_block this_thread_block() {
291 return thread_block(internal::workgroup::size());
292}
293
// The tiled_group cooperative group type: a power-of-2 sized slice of a block.
class tiled_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
                                                      unsigned int tile_size);

  // Re-partition this tile into smaller tiles of `tile_size` threads.
  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || (tile_size > warpSize) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    // Requested size is not smaller than this tile: nothing to split.
    if (size() <= tile_size) {
      return *this;
    }

    tiled_group tiledGroup = tiled_group(tile_size);
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    return tiledGroup;
  }

 protected:
  // Construct a tile of `tileSize` threads.
  // NOTE(review): one line of the original constructor body appears to have
  // been lost in extraction (the embedded line numbering skips one here).
  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
      : thread_group(internal::cg_tiled_group, tileSize) {
    coalesced_info.tiled_info.size = tileSize;
  }

 public:
  // Number of threads in the tile.
  __CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); }

  // Rank within the tile; relies on the size being a power of 2 (mask, not modulo).
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1));
  }

  // Synchronize the threads in the tile.
  __CG_QUALIFIER__ void sync() const {
    internal::tiled_group::sync();
  }
};
345
346template <unsigned int size, class ParentCGTy> class thread_block_tile;
347
 private:
  // Factories / partitioners that may construct coalesced groups.
  friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
  // NOTE(review): the declarator of this friend (a binary_partition overload
  // for thread_block_tile, per the overload near the end of the file) was
  // lost in extraction.
  template <unsigned int fsize, class fparent>
  friend __CG_QUALIFIER__ coalesced_group

  // Partition this coalesced group into tiles of `tile_size` active lanes.
  __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    // Invalid size: return an empty group rather than asserting.
    if (!tile_size || !pow2) {
      return coalesced_group(0);
    }

    // If a tiled group is passed to be partitioned further into a coalesced_group.
    // prepare a mask for further partitioning it so that it stays coalesced.
    // NOTE(review): the `if (...) {` line that opens this branch was lost in
    // extraction — the `else` below pairs with that missing condition.
      unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
      unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
      lane_mask member_mask = static_cast<lane_mask>(-1) >> (warpSize - masklength);

      member_mask <<= (__lane_id() & ~(tile_size - 1));
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size;
      return coalesced_tile;
    }
    // Here the parent coalesced_group is not partitioned.
    else {
      lane_mask member_mask = 0;
      unsigned int tile_rank = 0;
      int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;

      // Walk every lane of the wavefront, keeping only the active lanes that
      // fall inside the caller's tile of `tile_size` members.
      for (unsigned int i = 0; i < warpSize; i++) {
        lane_mask active = coalesced_info.member_mask & (1 << i);
        // Make sure the lane is active
        if (active) {
          if (lanes_to_skip <= 0 && tile_rank < tile_size) {
            // Prepare a member_mask that is appropriate for a tile
            member_mask |= active;
            tile_rank++;
          }
          lanes_to_skip--;
        }
      }
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size =
          (size() + tile_size - 1) / tile_size;
      return coalesced_tile;
    }
    return coalesced_group(0);  // unreachable: both branches above return
  }
412
 protected:
  // Constructor
  // Builds a coalesced group whose members are the set bits of `member_mask`.
  explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
      : thread_group(internal::cg_coalesced_group) {
    coalesced_info.member_mask = member_mask; // Which threads are active
    coalesced_info.size = __popcll(coalesced_info.member_mask); // How many threads are active
    coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
  }

 public:
  // Number of active lanes in the group.
  __CG_QUALIFIER__ unsigned int size() const {
    return coalesced_info.size;
  }

  // Rank of the calling lane among the group's active lanes.
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
  }

  // Synchronize the active lanes of the group.
  __CG_QUALIFIER__ void sync() const {
    internal::coalesced_group::sync();
  }

  // Rank of this tile within the parent partition.
  // NOTE(review): the return statement was lost in extraction.
  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
  }

  // Number of tiles the parent group was partitioned into.
  // NOTE(review): the return statement was lost in extraction.
  __CG_QUALIFIER__ unsigned int meta_group_size() const {
  }
450
  // Shuffle: every lane receives `var` from the lane whose group rank is
  // `srcRank` (wrapped modulo size()).
  template <class T>
  __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    srcRank = srcRank % static_cast<int>(size());

    // Map group rank -> hardware lane: direct for a full warp, otherwise the
    // (srcRank+1)-th set bit of the member mask.
    int lane = (size() == warpSize) ? srcRank
               : (warpSize == 64)   ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
                                    : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));

    return __shfl(var, lane, warpSize);
  }

  // Shuffle-down: read `var` from the active lane `lane_delta` group
  // positions above the caller.
  template <class T>
  __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    // Note: The cuda implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

    // Fully-populated warp: defer to the plain warp shuffle.
    if (size() == warpSize) {
      return __shfl_down(var, lane_delta, warpSize);
    }

    // Sparse group: find the (lane_delta+1)-th set bit at/above this lane.
    int lane;
    if (warpSize == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }
    else {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }

    // No source lane that far down: keep own value (identity shuffle).
    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, warpSize);
  }
516
531 template <class T>
532 __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
533 static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
534
535 // Note: The cuda implementation appears to use the remainder of lane_delta
536 // and WARP_SIZE as the shift value rather than lane_delta itself.
537 // This is not described in the documentation and is not done here.
538
539 if (size() == warpSize) {
540 return __shfl_up(var, lane_delta, warpSize);
541 }
542
543 int lane;
544 if (warpSize == 64) {
545 lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
546 }
547 else if (warpSize == 32) {
548 lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
549 }
550
551 if (lane == -1) {
552 lane = __lane_id();
553 }
554
555 return __shfl(var, lane, warpSize);
556 }
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  // NOTE(review): adjust_mask is called with two arguments in
  // thread_block_tile_base below; a first-argument line appears to have been
  // lost in extraction in the three adjust_mask calls of this section.

  // Ballot across the group: the result reflects `pred` per member lane,
  // adjusted to group-relative numbering.
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    return internal::helper::adjust_mask(
        __ballot_sync<unsigned long long>(coalesced_info.member_mask, pred));
  }

  // Non-zero if `pred` is non-zero on any lane of the group.
  __CG_QUALIFIER__ int any(int pred) const {
    return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  // Non-zero if `pred` is non-zero on all lanes of the group.
  __CG_QUALIFIER__ int all(int pred) const {
    return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  // Mask of lanes whose `value` equals the calling lane's `value`.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    return internal::helper::adjust_mask(
        __match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
  }

  // Mask of all lanes when every lane's `value` matches; `pred` reports
  // whether they all matched.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    return internal::helper::adjust_mask(
        __match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
                         &pred));
  }
#endif
};
625
635 return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
636}
637
#ifndef DOXYGEN_SHOULD_SKIP_THIS

// Out-of-line thread_group::thread_rank(): dispatch on the stored group tag
// to the concrete type's implementation (_type records the dynamic kind, so
// the downcasts are safe).
__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->thread_rank());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->thread_rank());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->thread_rank());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->thread_rank());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->thread_rank());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return -1;  // wraps to UINT32_MAX when asserts are disabled
    }
  }
}
668
// Out-of-line thread_group::is_valid(): dispatch on the stored group tag to
// the concrete type's implementation.
__CG_QUALIFIER__ bool thread_group::is_valid() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->is_valid());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->is_valid());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->is_valid());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->is_valid());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->is_valid());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return false;  // unknown tag: report invalid when asserts are disabled
    }
  }
}
697
// Out-of-line thread_group::sync(): dispatch on the stored group tag to the
// concrete type's barrier.
__CG_QUALIFIER__ void thread_group::sync() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      static_cast<const multi_grid_group*>(this)->sync();
      break;
    }
    case internal::cg_grid: {
      static_cast<const grid_group*>(this)->sync();
      break;
    }
    case internal::cg_workgroup: {
      static_cast<const thread_block*>(this)->sync();
      break;
    }
    case internal::cg_tiled_group: {
      static_cast<const tiled_group*>(this)->sync();
      break;
    }
    case internal::cg_coalesced_group: {
      static_cast<const coalesced_group*>(this)->sync();
      break;
    }
    default: {
      // Unknown tag: no barrier is performed when asserts are disabled.
      __hip_assert(false && "invalid cooperative group type");
    }
  }
}

#endif
732
// Returns the size of group `g`; works with any cooperative-group type.
template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }

// Returns the rank of the calling thread within group `g`.
template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
  return g.thread_rank();
}

// Returns true if group `g` has not violated any API constraints.
template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }

// Synchronizes the threads of group `g`.
template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }

// Doxygen end group CooperativeGAPI
// Compile-time tile-size constants shared by the templated tile types.
template <unsigned int tileSize> class tile_base {
 protected:
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;  // threads per tile

 public:
  // Rank of the calling thread within this tile; numThreads is a power of 2,
  // so a mask is equivalent to a modulo.
  _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
    return (internal::workgroup::thread_rank() & (numThreads - 1));
  }

  // Number of threads in the tile.
  __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
};
810
// Common implementation shared by all thread_block_tile instantiations.
template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
  static_assert(is_valid_tile_size<size>::value,
                "Tile size is either not a power of 2 or greater than the wavefront size");
  // NOTE(review): a line was lost in extraction here — `numThreads` is used
  // unqualified below, so a using-declaration for tile_base<size>::numThreads
  // is presumably missing.

  // NOTE(review): the declarator of this friend (a binary_partition overload)
  // was lost in extraction.
  template <unsigned int fsize, class fparent>
  friend __CG_QUALIFIER__ coalesced_group

#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  // Bitmask of the wavefront lanes that belong to this tile: numThreads ones
  // shifted to the tile's position within the warp.
  __CG_QUALIFIER__ unsigned long long build_mask() const {
    unsigned long long mask = ~0ull >> (64 - numThreads);
    // thread_rank() gives thread id from 0..thread launch size.
    return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) *
                    numThreads);
  }
#endif

 public:
  // Synchronize the threads in the tile.
  __CG_STATIC_QUALIFIER__ void sync() {
    internal::tiled_group::sync();
  }

  // Tile-wide shuffles: width = numThreads, so ranks are tile-relative.
  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl(var, srcRank, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_down(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_up(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_xor(var, laneMask, numThreads));
  }

#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  // Ballot restricted to this tile's lanes.
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
  }

  // Non-zero if `pred` is non-zero on any lane of the tile.
  __CG_QUALIFIER__ int any(int pred) const { return __any_sync(build_mask(), pred); }

  // Non-zero if `pred` is non-zero on all lanes of the tile.
  __CG_QUALIFIER__ int all(int pred) const { return __all_sync(build_mask(), pred); }

  // Mask of tile lanes whose `value` matches the calling lane's.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_any_sync(mask, value));
  }

  // Full tile mask when all lanes agree on `value`; `pred` reports agreement.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
  }
#endif
};
881
884template <unsigned int tileSize, typename ParentCGTy>
886public:
889 __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
890 return ParentCGTy::thread_rank() / tileSize;
891 }
892
894 __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
895 return (ParentCGTy::size() + tileSize - 1) / tileSize;
896 }
897};
898
// Group type - thread_block_tile (primary template: parent type still known).
// NOTE(review): several lines of this class were lost in extraction — the
// class-head line opening the base list, a `tbtBase` typedef (implied by the
// using-declaration below), and part of the constructor body.
template <unsigned int tileSize, class ParentCGTy>
                               public tiled_group,
                               public parent_group_info<tileSize, ParentCGTy> {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
 protected:
  // Construct a tile of numThreads threads.
  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
    coalesced_info.tiled_info.size = numThreads;
  }
 public:
  using tbtBase::size;
};

// Partial template specialization
// Used once the parent type has been erased (ParentCGTy = void); meta-group
// info must then come from per-object state rather than the parent type.
template <unsigned int tileSize>
class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
                                               public tiled_group
{
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

  // NOTE(review): a typedef and the constructor of this specialization were
  // lost in extraction.

 protected:

 public:
  using tbtBase::size;
  using tbtBase::sync;

  // Rank of this tile within the parent partition.
  // NOTE(review): the return statement was lost in extraction.
  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
  }

  // Number of tiles in the parent partition.
  // NOTE(review): the return statement was lost in extraction.
  __CG_QUALIFIER__ unsigned int meta_group_size() const {
  }
// Doxygen end group CooperativeG
};
961
962__CG_QUALIFIER__ thread_group this_thread() {
963 thread_group g(internal::group_type::cg_coalesced_group, 1, __ockl_activelane_u32());
964 return g;
965}
966
974__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
975 if (parent.cg_type() == internal::cg_tiled_group) {
976 const tiled_group* cg = static_cast<const tiled_group*>(&parent);
977 return cg->new_tiled_group(tile_size);
978 }
979 else if(parent.cg_type() == internal::cg_coalesced_group) {
980 const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
981 return cg->new_tiled_group(tile_size);
982 }
983 else {
984 const thread_block* tb = static_cast<const thread_block*>(&parent);
985 return tb->new_tiled_group(tile_size);
986 }
987}
988
// Thread block type overload
// Partition a thread_block into tiles of `tile_size` threads.
__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

// Re-partition a tiled_group into smaller tiles.
__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

// If a coalesced group is passed to be partitioned, it should remain coalesced
__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}
1002
namespace impl {
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;

// Internal bridge between the user-facing thread_block_tile and the
// thread_block_tile_type machinery.
template <unsigned int size, class ParentCGTy>
class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
 protected:
  // NOTE(review): the converting-constructor declarator that followed this
  // template header was lost in extraction; the page's index lists
  // "thread_block_tile_internal(const thread_block_tile_internal<tbtSize, tbtParentT>& g)".
  template <unsigned int tbtSize, class tbtParentT>

  // Construct from a thread_block parent (state comes from the default base ctor).
  __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
      : thread_block_tile_type<size, ParentCGTy>() {}
};
} // namespace impl
1018
// Group type - thread_block_tile: the user-facing statically-sized tile.
template <unsigned int size, class ParentCGTy>
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
 protected:
  // Constructed by tiled_partition<size>(parent).
  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}

 public:
  // Implicit conversion to the parent-erased form (ParentCGTy = void).
  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
    return thread_block_tile<size, void>(*this);
  }

#ifdef DOXYGEN_SHOULD_INCLUDE_THIS
  // Documentation-only declarations; the real implementations are inherited
  // from thread_block_tile_base / parent_group_info.

  // Rank of the calling thread within [0, size()).
  __CG_QUALIFIER__ unsigned int thread_rank() const;

  // Synchronizes the threads in the tile.
  __CG_QUALIFIER__ void sync();

  // Rank of this tile within the parent group's partition.
  __CG_QUALIFIER__ unsigned int meta_group_rank() const;

  // Number of tiles created when the parent group was partitioned.
  __CG_QUALIFIER__ unsigned int meta_group_size() const;

  // Shuffle operation on group level.
  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const;

  // Shuffle down operation on group level.
  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const;

  // Shuffle up operation on group level.
  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const;

  // Shuffle xor operation on group level.
  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const;

  // Ballot function on group level.
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const;

  // Any function on group level.
  __CG_QUALIFIER__ int any(int pred) const;

  // All function on group level.
  __CG_QUALIFIER__ int all(int pred) const;

  // Match any function on group level.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const;

  // Match all function on group level.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const;

#endif
};
1163
1164template <unsigned int size>
1165class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
1166 template <unsigned int, class ParentCGTy> friend class thread_block_tile;
1167
1168 protected:
1169 public:
1170 template <class ParentCGTy>
1172 : impl::thread_block_tile_internal<size, void>(g) {}
1173};
1174
1175template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
1176
namespace impl {
template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;

// Helper constructed by tiled_partition<size>(g) for a thread_block parent;
// forwards straight to the thread_block_tile constructor.
template <unsigned int size>
struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
  __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
      : thread_block_tile<size, thread_block>(g) {}
};

} // namespace impl
1187
1200template <unsigned int size, class ParentCGTy>
1201__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
1202 static_assert(is_valid_tile_size<size>::value,
1203 "Tiled partition with size > wavefront size. Currently not supported ");
1205}
1206
1207#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
1208
// Binary partition: splits a coalesced group into the lanes where `pred` is
// true and the lanes where it is false; the caller receives the half it
// belongs to as a new coalesced_group.
__CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred) {
  // Ballot yields the mask of member lanes voting true.
  auto mask = __ballot_sync<unsigned long long>(cgrp.coalesced_info.member_mask, pred);

  if (pred) {
    return coalesced_group(mask);
  } else {
    // The false half is the members not in the true mask.
    return coalesced_group(cgrp.coalesced_info.member_mask ^ mask);
  }
}
1226
1238template <unsigned int size, class parent>
1240 bool pred) {
1241 auto mask = __ballot_sync<unsigned long long>(tgrp.build_mask(), pred);
1242
1243 if (pred) {
1244 return coalesced_group(mask);
1245 } else {
1246 return coalesced_group(tgrp.build_mask() ^ mask);
1247 }
1248}
1249#endif
1250} // namespace cooperative_groups
1251
1252#endif // __cplusplus
1253#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
The coalesced_group cooperative group type.
Definition amd_hip_cooperative_groups.h:355
The grid cooperative group type.
Definition amd_hip_cooperative_groups.h:189
Definition amd_hip_cooperative_groups.h:1007
thread_block_tile_internal(const thread_block &g)
Definition amd_hip_cooperative_groups.h:1014
thread_block_tile_internal(const thread_block_tile_internal< tbtSize, tbtParentT > &g)
Definition amd_hip_cooperative_groups.h:1010
The multi-grid cooperative group type.
Definition amd_hip_cooperative_groups.h:135
User exposed API that captures the state of the parent group pre-partition.
Definition amd_hip_cooperative_groups.h:885
thread_block_tile(const thread_block_tile< size, ParentCGTy > &g)
Definition amd_hip_cooperative_groups.h:1171
Definition amd_hip_cooperative_groups.h:816
Group type - thread_block_tile.
Definition amd_hip_cooperative_groups.h:908
Group type - thread_block_tile.
Definition amd_hip_cooperative_groups.h:1028
T shfl_down(T var, unsigned int lane_delta) const
Shuffle down operation on group level.
int any(int pred) const
Any function on group level.
unsigned long long ballot(int pred) const
Ballot function on group level.
thread_block_tile(const ParentCGTy &g)
Definition amd_hip_cooperative_groups.h:1030
unsigned int meta_group_rank() const
unsigned int thread_rank() const
Rank of the calling thread within [0, size() ).
T shfl_xor(T var, unsigned int laneMask) const
Shuffle xor operation on group level.
int all(int pred) const
All function on group level.
unsigned long long match_any(T value) const
Match any function on group level.
unsigned long long match_all(T value, int &pred) const
Match all function on group level.
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
T shfl_up(T var, unsigned int lane_delta) const
Shuffle up operation on group level.
void sync()
Synchronizes the threads in the group.
T shfl(T var, int srcRank) const
Shuffle operation on group level.
The workgroup (thread-block in CUDA terminology) cooperative group type.
Definition amd_hip_cooperative_groups.h:229
The base type of all cooperative group types.
Definition amd_hip_cooperative_groups.h:50
Definition amd_hip_cooperative_groups.h:797
The tiled_group cooperative group type.
Definition amd_hip_cooperative_groups.h:301
bool is_valid(CGTy const &g)
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:776
void sync(CGTy const &g)
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:787
uint32_t thread_rank(CGTy const &g)
Returns the rank of thread of the group.
Definition amd_hip_cooperative_groups.h:763
uint32_t group_size(CGTy const &g)
Returns the size of the group.
Definition amd_hip_cooperative_groups.h:750
thread_block this_thread_block()
User-exposed API interface to construct workgroup cooperative group type object - thread_block.
Definition amd_hip_cooperative_groups.h:290
coalesced_group binary_partition(const coalesced_group &cgrp, bool pred)
Binary partition.
Definition amd_hip_cooperative_groups.h:1217
thread_group tiled_partition(const thread_group &parent, unsigned int tile_size)
User-exposed API to partition groups.
Definition amd_hip_cooperative_groups.h:974
multi_grid_group this_multi_grid()
User-exposed API interface to construct grid cooperative group type object - multi_grid_group.
Definition amd_hip_cooperative_groups.h:175
coalesced_group coalesced_threads()
User-exposed API to create coalesced groups.
Definition amd_hip_cooperative_groups.h:634
grid_group this_grid()
User-exposed API interface to construct grid cooperative group type object - grid_group.
Definition amd_hip_cooperative_groups.h:218
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:341
T shfl_xor(T var, unsigned int laneMask) const
Definition amd_hip_cooperative_groups.h:855
static constexpr unsigned int numThreads
Definition amd_hip_cooperative_groups.h:799
friend multi_grid_group this_multi_grid()
User-exposed API interface to construct grid cooperative group type object - multi_grid_group.
Definition amd_hip_cooperative_groups.h:175
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:436
static void sync()
Definition amd_hip_cooperative_groups.h:836
uint32_t size() const
Definition amd_hip_cooperative_groups.h:91
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:204
unsigned int size() const
Definition amd_hip_cooperative_groups.h:426
unsigned long long match_all(T value, int &pred) const
Definition amd_hip_cooperative_groups.h:875
unsigned long long match_any(T value) const
Match any function on group level.
Definition amd_hip_cooperative_groups.h:600
friend thread_block this_thread_block()
User-exposed API interface to construct workgroup cooperative group type object - thread_block.
Definition amd_hip_cooperative_groups.h:290
uint32_t _size
Total number of threads in the thread_group.
Definition amd_hip_cooperative_groups.h:53
static unsigned int meta_group_size()
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:894
unsigned int meta_group_rank() const
Definition amd_hip_cooperative_groups.h:442
dim3 group_dim() const
Definition amd_hip_cooperative_groups.h:205
unsigned int meta_group_rank() const
Definition amd_hip_cooperative_groups.h:948
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:447
unsigned int thread_rank() const
Rank of the calling thread within [0, size() ).
Definition amd_hip_cooperative_groups.h:336
unsigned int thread_rank() const
Rank of the calling thread within [0, size() ).
Definition amd_hip_cooperative_groups.h:431
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:953
int all(int pred) const
All function on group level.
Definition amd_hip_cooperative_groups.h:588
unsigned int cg_type() const
Returns the type of the group.
Definition amd_hip_cooperative_groups.h:93
T shfl(T var, int srcRank) const
Definition amd_hip_cooperative_groups.h:840
thread_block(uint32_t size)
Definition amd_hip_cooperative_groups.h:239
thread_group new_tiled_group(unsigned int tile_size) const
Definition amd_hip_cooperative_groups.h:242
thread_group(internal::group_type type, uint32_t size=static_cast< uint64_t >(0), uint64_t mask=static_cast< uint64_t >(0))
Definition amd_hip_cooperative_groups.h:62
uint32_t thread_rank() const
Rank of the calling thread within [0, size() ).
Definition amd_hip_cooperative_groups.h:200
tiled_group(unsigned int tileSize)
Definition amd_hip_cooperative_groups.h:325
unsigned int meta_group_rank
Definition amd_hip_cooperative_groups.h:72
void sync() const
Synchronizes the threads in the group.
unsigned long long match_all(T value, int &pred) const
Match all function on group level.
Definition amd_hip_cooperative_groups.h:617
thread_block_tile_type()
Definition amd_hip_cooperative_groups.h:912
static uint32_t size()
Definition amd_hip_cooperative_groups.h:271
static constexpr unsigned int thread_rank()
Rank of the thread within this tile.
Definition amd_hip_cooperative_groups.h:803
bool is_tiled
Definition amd_hip_cooperative_groups.h:70
unsigned long long ballot(int pred) const
Ballot function on group level.
Definition amd_hip_cooperative_groups.h:566
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:159
T shfl_up(T var, unsigned int lane_delta) const
Definition amd_hip_cooperative_groups.h:850
uint32_t num_grids()
Definition amd_hip_cooperative_groups.h:149
static dim3 group_index()
Returns 3-dimensional block index within the grid.
Definition amd_hip_cooperative_groups.h:265
static void sync()
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:275
friend coalesced_group binary_partition(const thread_block_tile< fsize, fparent > &tgrp, bool pred)
bool is_valid() const
Returns true if the group has not violated any API constraints.
static unsigned int size()
Number of threads within this tile.
Definition amd_hip_cooperative_groups.h:808
friend thread_group tiled_partition(const thread_group &parent, unsigned int tile_size)
User-exposed API to partition groups.
Definition amd_hip_cooperative_groups.h:974
int any(int pred) const
Definition amd_hip_cooperative_groups.h:866
int all(int pred) const
Definition amd_hip_cooperative_groups.h:868
uint32_t thread_rank() const
Rank of the calling thread within [0, size() ).
Definition amd_hip_cooperative_groups.h:155
friend thread_group this_thread()
Definition amd_hip_cooperative_groups.h:962
bool is_valid() const
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:202
unsigned long long match_any(T value) const
Definition amd_hip_cooperative_groups.h:870
unsigned int size() const
Definition amd_hip_cooperative_groups.h:333
uint64_t _mask
Total number of threads in the thread_group.
Definition amd_hip_cooperative_groups.h:54
unsigned int size
Definition amd_hip_cooperative_groups.h:71
struct cooperative_groups::thread_group::_coalesced_info coalesced_info
lane_mask member_mask
Definition amd_hip_cooperative_groups.h:77
static uint32_t thread_rank()
Rank of the calling thread within [0, size() ).
Definition amd_hip_cooperative_groups.h:269
unsigned int meta_group_size
Definition amd_hip_cooperative_groups.h:73
uint32_t _type
Definition amd_hip_cooperative_groups.h:52
int any(int pred) const
Any function on group level.
Definition amd_hip_cooperative_groups.h:578
thread_group this_thread()
Definition amd_hip_cooperative_groups.h:962
static unsigned int meta_group_rank()
Definition amd_hip_cooperative_groups.h:889
unsigned int size
Definition amd_hip_cooperative_groups.h:78
struct _tiled_info tiled_info
Definition amd_hip_cooperative_groups.h:79
friend coalesced_group coalesced_threads()
User-exposed API to create coalesced groups.
Definition amd_hip_cooperative_groups.h:634
friend coalesced_group binary_partition(const coalesced_group &cgrp, bool pred)
Binary partition.
Definition amd_hip_cooperative_groups.h:1217
static dim3 thread_index()
Returns 3-dimensional thread index within the block.
Definition amd_hip_cooperative_groups.h:267
friend class thread_block
Definition amd_hip_cooperative_groups.h:85
coalesced_group(lane_mask member_mask)
Definition amd_hip_cooperative_groups.h:415
T shfl_down(T var, unsigned int lane_delta) const
Shuffle down operation on group level.
Definition amd_hip_cooperative_groups.h:491
grid_group(uint32_t size)
Construct grid thread group (through the API this_grid())
Definition amd_hip_cooperative_groups.h:196
static bool is_valid()
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:273
dim3 group_dim()
Returns the group dimensions.
Definition amd_hip_cooperative_groups.h:277
thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
Definition amd_hip_cooperative_groups.h:933
unsigned long long ballot(int pred) const
Definition amd_hip_cooperative_groups.h:861
uint32_t thread_rank() const
Rank of the calling thread within [0, size() ).
T shfl(T var, int srcRank) const
Shuffle operation on group level.
Definition amd_hip_cooperative_groups.h:464
multi_grid_group(uint32_t size)
Construct multi-grid thread group (through the API this_multi_grid())
Definition amd_hip_cooperative_groups.h:142
bool is_valid() const
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:157
uint32_t grid_rank()
Definition amd_hip_cooperative_groups.h:153
friend grid_group this_grid()
User-exposed API interface to construct grid cooperative group type object - grid_group.
Definition amd_hip_cooperative_groups.h:218
T shfl_down(T var, unsigned int lane_delta) const
Definition amd_hip_cooperative_groups.h:845
T shfl_up(T var, unsigned int lane_delta) const
Shuffle up operation on group level.
Definition amd_hip_cooperative_groups.h:532
Definition amd_hip_cooperative_groups.h:40
tiled_partition_internal(const thread_block &g)
Definition amd_hip_cooperative_groups.h:1182
Definition amd_hip_cooperative_groups.h:1178
Definition amd_hip_cooperative_groups.h:76
Definition amd_hip_cooperative_groups.h:69
Definition hip_runtime_api.h:1061