clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File

clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File#

HIP Runtime API Reference: clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File
amd_hip_cooperative_groups.h
Go to the documentation of this file.
1/*
2Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
32#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
33#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
34
35#if __cplusplus
36#if !defined(__HIPCC_RTC__)
37#include <hip/amd_detail/hip_cooperative_groups_helper.h>
38#endif
39
41
51 protected:
52 __hip_uint32_t _type;
53 __hip_uint32_t _num_threads;
54 __hip_uint64_t _mask;
56
62 __CG_QUALIFIER__ thread_group(internal::group_type type,
63 __hip_uint32_t num_threads = static_cast<__hip_uint64_t>(0),
64 __hip_uint64_t mask = static_cast<__hip_uint64_t>(0)) {
65 _type = type;
67 _mask = mask;
68 }
69
70 struct _tiled_info {
72 unsigned int num_threads;
73 unsigned int meta_group_rank;
74 unsigned int meta_group_size;
75 };
76
82
83 friend __CG_QUALIFIER__ thread_group this_thread();
84 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
85 unsigned int tile_size);
86 friend class thread_block;
87
88 public:
92 __CG_QUALIFIER__ __hip_uint32_t num_threads() const { return _num_threads; }
94 __CG_QUALIFIER__ __hip_uint32_t size() const { return num_threads(); }
96 __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
98 __CG_QUALIFIER__ __hip_uint32_t thread_rank() const;
100 __CG_QUALIFIER__ bool is_valid() const;
101
114 __CG_QUALIFIER__ void sync() const;
115};
141 friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
142
143 protected:
145 explicit __CG_QUALIFIER__ multi_grid_group(__hip_uint32_t size)
146 : thread_group(internal::cg_multi_grid, size) {}
147
148 public:
151 __CG_QUALIFIER__ __hip_uint32_t num_grids() { return internal::multi_grid::num_grids(); }
152
155 __CG_QUALIFIER__ __hip_uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
157 __CG_QUALIFIER__ __hip_uint32_t thread_rank() const {
158 return internal::multi_grid::thread_rank();
159 }
161 __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
163 __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
164};
165
180 return multi_grid_group(internal::multi_grid::num_threads());
181}
182// Doxygen end group CooperativeGConstruct
193class grid_group : public thread_group {
196 friend __CG_QUALIFIER__ grid_group this_grid();
197
198 protected:
200 explicit __CG_QUALIFIER__ grid_group(__hip_uint32_t size)
201 : thread_group(internal::cg_grid, size) {}
202
203 public:
205 __CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::grid::thread_rank(); }
207 __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
209 __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
210 __CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); }
211};
212
223__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::num_threads()); }
224
237 friend __CG_QUALIFIER__ thread_block this_thread_block();
238 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
239 unsigned int tile_size);
240 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
241 unsigned int tile_size);
242
243 protected:
244 // Construct a workgroup thread group (through the API this_thread_block())
245 explicit __CG_QUALIFIER__ thread_block(__hip_uint32_t size)
246 : thread_group(internal::cg_workgroup, size) {}
247
248 __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
249 const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
250 // Invalid tile size, assert
251 if (!tile_size || (tile_size > warpSize) || !pow2) {
252 __hip_assert(false && "invalid tile size");
253 }
254
255 auto block_size = num_threads();
256 auto rank = thread_rank();
257 auto partitions = (block_size + tile_size - 1) / tile_size;
258 auto tail = (partitions * tile_size) - block_size;
259 auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
260 thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);
261
262 tiledGroup.coalesced_info.tiled_info.num_threads = tile_size;
263 tiledGroup.coalesced_info.tiled_info.is_tiled = true;
264 tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
265 tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
266 return tiledGroup;
267 }
268
269 public:
271 __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
273 __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
275 __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
276 return internal::workgroup::thread_rank();
277 }
279 __CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
280 return internal::workgroup::num_threads();
281 }
283 __CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return num_threads(); }
285 __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
287 __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
289 __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
290};
291
302__CG_QUALIFIER__ thread_block this_thread_block() {
303 return thread_block(internal::workgroup::num_threads());
304}
305
313class tiled_group : public thread_group {
314 private:
315 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
316 unsigned int tile_size);
317 friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
318 unsigned int tile_size);
319
320 __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
321 const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
322
323 if (!tile_size || (tile_size > warpSize) || !pow2) {
324 __hip_assert(false && "invalid tile size");
325 }
326
327 if (num_threads() <= tile_size) {
328 return *this;
329 }
330
331 tiled_group tiledGroup = tiled_group(tile_size);
332 tiledGroup.coalesced_info.tiled_info.is_tiled = true;
333 return tiledGroup;
334 }
335
336 protected:
337 explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
338 : thread_group(internal::cg_tiled_group, tileSize) {
341 }
342
343 public:
345 __CG_QUALIFIER__ unsigned int num_threads() const {
347 }
348
350 __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }
351
353 __CG_QUALIFIER__ unsigned int thread_rank() const {
354 return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.num_threads - 1));
355 }
356
358 __CG_QUALIFIER__ void sync() const { internal::tiled_group::sync(); }
359};
360
361template <unsigned int size, class ParentCGTy> class thread_block_tile;
362
371 private:
372 friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
373 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
374 unsigned int tile_size);
375 friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
376 unsigned int tile_size);
377 friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
378 template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
380
381 __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
382 const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
383
384 if (!tile_size || !pow2) {
385 return coalesced_group(0);
386 }
387
388 // If a tiled group is passed to be partitioned further into a coalesced_group.
389 // prepare a mask for further partitioning it so that it stays coalesced.
391 unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
392 unsigned int masklength =
393 min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
394 lane_mask full_mask = (static_cast<int>(warpSize) == 32)
395 ? static_cast<lane_mask>((1u << 32) - 1)
396 : static_cast<lane_mask>(-1ull);
397 lane_mask member_mask = full_mask >> (warpSize - masklength);
398
399 member_mask <<= (__lane_id() & ~(tile_size - 1));
400 coalesced_group coalesced_tile = coalesced_group(member_mask);
401 coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
402 coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
403 coalesced_tile.coalesced_info.tiled_info.meta_group_size = num_threads() / tile_size;
404 return coalesced_tile;
405 }
406 // Here the parent coalesced_group is not partitioned.
407 else {
408 lane_mask member_mask = 0;
409 unsigned int tile_rank = 0;
410 int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;
411
412 for (unsigned int i = 0; i < warpSize; i++) {
413 lane_mask active = coalesced_info.member_mask & (static_cast<lane_mask>(1) << i);
414 // Make sure the lane is active
415 if (active) {
416 if (lanes_to_skip <= 0 && tile_rank < tile_size) {
417 // Prepare a member_mask that is appropriate for a tile
418 member_mask |= active;
419 tile_rank++;
420 }
421 lanes_to_skip--;
422 }
423 }
424 coalesced_group coalesced_tile = coalesced_group(member_mask);
425 coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
426 coalesced_tile.coalesced_info.tiled_info.meta_group_size =
427 (num_threads() + tile_size - 1) / tile_size;
428 return coalesced_tile;
429 }
430 return coalesced_group(0);
431 }
432
433 protected:
434 // Constructor
435 explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
436 : thread_group(internal::cg_coalesced_group) {
437 coalesced_info.member_mask = member_mask; // Which threads are active
439 __popcll(coalesced_info.member_mask); // How many threads are active
440 coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
443 }
444
445 public:
447 __CG_QUALIFIER__ unsigned int num_threads() const { return coalesced_info.num_threads; }
448
450 __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }
451
453 __CG_QUALIFIER__ unsigned int thread_rank() const {
454 return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
455 }
456
458 __CG_QUALIFIER__ void sync() const { internal::coalesced_group::sync(); }
459
462 __CG_QUALIFIER__ unsigned int meta_group_rank() const {
464 }
465
467 __CG_QUALIFIER__ unsigned int meta_group_size() const {
469 }
470
483 template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
484 srcRank = srcRank % static_cast<int>(num_threads());
485
486 int lane = (num_threads() == warpSize) ? srcRank
487 : (static_cast<int>(warpSize) == 64)
488 ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
489 : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
490
491 return __shfl(var, lane, warpSize);
492 }
493
508 template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
509 // Note: The cuda implementation appears to use the remainder of lane_delta
510 // and WARP_SIZE as the shift value rather than lane_delta itself.
511 // This is not described in the documentation and is not done here.
512
513 if (num_threads() == warpSize) {
514 return __shfl_down(var, lane_delta, warpSize);
515 }
516
517 int lane;
518 if (static_cast<int>(warpSize) == 64) {
519 lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
520 } else {
521 lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
522 }
523
524 if (lane == -1) {
525 lane = __lane_id();
526 }
527
528 return __shfl(var, lane, warpSize);
529 }
530
545 template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
546 // Note: The cuda implementation appears to use the remainder of lane_delta
547 // and WARP_SIZE as the shift value rather than lane_delta itself.
548 // This is not described in the documentation and is not done here.
549
550 if (num_threads() == warpSize) {
551 return __shfl_up(var, lane_delta, warpSize);
552 }
553
554 int lane;
555 if (static_cast<int>(warpSize) == 64) {
556 lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
557 } else if (static_cast<int>(warpSize) == 32) {
558 lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
559 }
560
561 if (lane == -1) {
562 lane = __lane_id();
563 }
564
565 return __shfl(var, lane, warpSize);
566 }
567#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
568
576 __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
577 return internal::helper::adjust_mask(
579 __ballot_sync<unsigned long long>(coalesced_info.member_mask, pred));
580 }
581
588 __CG_QUALIFIER__ int any(int pred) const {
589 return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
590 }
591
598 __CG_QUALIFIER__ int all(int pred) const {
599 return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
600 }
601
610 template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
611 return internal::helper::adjust_mask(
613 __match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
614 }
615
627 template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
628 return internal::helper::adjust_mask(
630 __match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
631 &pred));
632 }
633#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
634};
635
645 return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
646}
647
648#ifndef DOXYGEN_SHOULD_SKIP_THIS
649
655__CG_QUALIFIER__ __hip_uint32_t thread_group::thread_rank() const {
656 switch (this->_type) {
657 case internal::cg_multi_grid: {
658 return (static_cast<const multi_grid_group*>(this)->thread_rank());
659 }
660 case internal::cg_grid: {
661 return (static_cast<const grid_group*>(this)->thread_rank());
662 }
663 case internal::cg_workgroup: {
664 return (static_cast<const thread_block*>(this)->thread_rank());
665 }
666 case internal::cg_tiled_group: {
667 return (static_cast<const tiled_group*>(this)->thread_rank());
668 }
669 case internal::cg_coalesced_group: {
670 return (static_cast<const coalesced_group*>(this)->thread_rank());
671 }
672 default: {
673 __hip_assert(false && "invalid cooperative group type");
674 return -1;
675 }
676 }
677}
678
684__CG_QUALIFIER__ bool thread_group::is_valid() const {
685 switch (this->_type) {
686 case internal::cg_multi_grid: {
687 return (static_cast<const multi_grid_group*>(this)->is_valid());
688 }
689 case internal::cg_grid: {
690 return (static_cast<const grid_group*>(this)->is_valid());
691 }
692 case internal::cg_workgroup: {
693 return (static_cast<const thread_block*>(this)->is_valid());
694 }
695 case internal::cg_tiled_group: {
696 return (static_cast<const tiled_group*>(this)->is_valid());
697 }
698 case internal::cg_coalesced_group: {
699 return (static_cast<const coalesced_group*>(this)->is_valid());
700 }
701 default: {
702 __hip_assert(false && "invalid cooperative group type");
703 return false;
704 }
705 }
706}
707
713__CG_QUALIFIER__ void thread_group::sync() const {
714 switch (this->_type) {
715 case internal::cg_multi_grid: {
716 static_cast<const multi_grid_group*>(this)->sync();
717 break;
718 }
719 case internal::cg_grid: {
720 static_cast<const grid_group*>(this)->sync();
721 break;
722 }
723 case internal::cg_workgroup: {
724 static_cast<const thread_block*>(this)->sync();
725 break;
726 }
727 case internal::cg_tiled_group: {
728 static_cast<const tiled_group*>(this)->sync();
729 break;
730 }
731 case internal::cg_coalesced_group: {
732 static_cast<const coalesced_group*>(this)->sync();
733 break;
734 }
735 default: {
736 __hip_assert(false && "invalid cooperative group type");
737 }
738 }
739}
740
741#endif
742
760template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) {
761 return g.num_threads();
762}
763
775template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t thread_rank(CGTy const& g) {
776 return g.thread_rank();
777}
778
788template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }
789
799template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
800
801// Doxygen end group CooperativeGAPI
809template <unsigned int tileSize> class tile_base {
810 protected:
811 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
812
813 public:
815 _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
816 return (internal::workgroup::thread_rank() & (numThreads - 1));
817 }
818
820 __CG_STATIC_QUALIFIER__ unsigned int num_threads() { return numThreads; }
821
824 __CG_STATIC_QUALIFIER__ unsigned int size() { return num_threads(); }
825};
826
832template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
833 static_assert(is_valid_tile_size<size>::value,
834 "Tile size is either not a power of 2 or greater than the wavefront size");
836
837 template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
839
840#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
841 __CG_QUALIFIER__ unsigned long long build_mask() const {
842 unsigned long long mask = ~0ull >> (64 - numThreads);
843 // thread_rank() gives thread id from 0..thread launch size.
844 return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) * numThreads);
845 }
846#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
847
848 public:
849 __CG_STATIC_QUALIFIER__ void sync() { internal::tiled_group::sync(); }
850
851 template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
852 return (__shfl(var, srcRank, numThreads));
853 }
854
855 template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
856 return (__shfl_down(var, lane_delta, numThreads));
857 }
858
859 template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
860 return (__shfl_up(var, lane_delta, numThreads));
861 }
862
863 template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
864 return (__shfl_xor(var, laneMask, numThreads));
865 }
866
867#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
868 __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
869 const auto mask = build_mask();
870 return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
871 }
872
873 __CG_QUALIFIER__ int any(int pred) const { return __any_sync(build_mask(), pred); }
874
875 __CG_QUALIFIER__ int all(int pred) const { return __all_sync(build_mask(), pred); }
876
877 template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
878 const auto mask = build_mask();
879 return internal::helper::adjust_mask(mask, __match_any_sync(mask, value));
880 }
881
882 template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
883 const auto mask = build_mask();
884 return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
885 }
886#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
887};
888
891template <unsigned int tileSize, typename ParentCGTy> class parent_group_info {
892 public:
895 __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
896 return ParentCGTy::thread_rank() / tileSize;
897 }
898
900 __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
901 return (ParentCGTy::num_threads() + tileSize - 1) / tileSize;
902 }
903};
904
911template <unsigned int tileSize, class ParentCGTy> class thread_block_tile_type
912 : public thread_block_tile_base<tileSize>,
913 public tiled_group,
914 public parent_group_info<tileSize, ParentCGTy> {
915 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
917
918 protected:
919 __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
922 }
923
932
933 public:
935 using tbtBase::size;
938};
939
940// Partial template specialization
941template <unsigned int tileSize> class thread_block_tile_type<tileSize, void>
942 : public thread_block_tile_base<tileSize>, public tiled_group {
943 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
944
946
947 protected:
956
957 public:
959 using tbtBase::size;
960 using tbtBase::sync;
962
965 __CG_QUALIFIER__ unsigned int meta_group_rank() const {
967 }
968
970 __CG_QUALIFIER__ unsigned int meta_group_size() const {
972 }
973 // Doxygen end group CooperativeG
977};
978
979__CG_QUALIFIER__ thread_group this_thread() {
980 thread_group g(internal::group_type::cg_coalesced_group, 1, __ockl_activelane_u32());
981 return g;
982}
983
991__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
992 if (parent.cg_type() == internal::cg_tiled_group) {
993 const tiled_group* cg = static_cast<const tiled_group*>(&parent);
994 return cg->new_tiled_group(tile_size);
995 } else if (parent.cg_type() == internal::cg_coalesced_group) {
996 const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
997 return cg->new_tiled_group(tile_size);
998 } else {
999 const thread_block* tb = static_cast<const thread_block*>(&parent);
1000 return tb->new_tiled_group(tile_size);
1001 }
1002}
1003
1004// Thread block type overload
1005__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
1006 return (parent.new_tiled_group(tile_size));
1007}
1008
1009__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
1010 return (parent.new_tiled_group(tile_size));
1011}
1012
1013// If a coalesced group is passed to be partitioned, it should remain coalesced
1014__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
1015 unsigned int tile_size) {
1016 return (parent.new_tiled_group(tile_size));
1017}
1018
1019namespace impl {
1020template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
1021
1022template <unsigned int size, class ParentCGTy> class thread_block_tile_internal
1023 : public thread_block_tile_type<size, ParentCGTy> {
1024 protected:
1025 template <unsigned int tbtSize, class tbtParentT> __CG_QUALIFIER__ thread_block_tile_internal(
1028
1029 __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
1030 : thread_block_tile_type<size, ParentCGTy>() {}
1031};
1032} // namespace impl
1033
/**
 * @brief Group type - thread_block_tile.
 *
 * Statically-sized tile of a parent group. The ParentCGTy parameter keeps
 * the parent's compile-time information; converting to
 * thread_block_tile<size, void> erases it.
 */
template <unsigned int size, class ParentCGTy> class thread_block_tile
    : public impl::thread_block_tile_internal<size, ParentCGTy> {
 protected:
  // Constructed only from the parent group (via tiled_partition).
  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}

 public:
  // Type-erasing conversion to the ParentCGTy = void form.
  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
    return thread_block_tile<size, void>(*this);
  }

#ifdef DOXYGEN_SHOULD_INCLUDE_THIS

  /// Rank of the calling thread within [0, num_threads()).
  __CG_QUALIFIER__ unsigned int thread_rank() const;

  /// Synchronizes the threads in the tile.
  __CG_QUALIFIER__ void sync();

  /// Rank of this tile among the tiles of the parent group.
  __CG_QUALIFIER__ unsigned int meta_group_rank() const;

  /// Number of tiles created when the parent group was partitioned.
  __CG_QUALIFIER__ unsigned int meta_group_size() const;

  /// Shuffle: read @p var from lane @p srcRank of the tile.
  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const;

  /// Shuffle down: read @p var from the lane @p lane_delta below the caller.
  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const;

  /// Shuffle up: read @p var from the lane @p lane_delta above the caller.
  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const;

  /// Shuffle xor: read @p var from the lane at (own lane XOR @p laneMask).
  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const;

  /// Ballot of @p pred across the tile's threads.
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const;

  /// Non-zero when @p pred is non-zero for any thread of the tile.
  __CG_QUALIFIER__ int any(int pred) const;

  /// Non-zero when @p pred is non-zero for all threads of the tile.
  __CG_QUALIFIER__ int all(int pred) const;

  /// Mask of tile threads whose @p value matches the caller's.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const;

  /// Match-all across the tile; @p pred reports whether all values matched.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const;

#endif
};
1178
1179template <unsigned int size> class thread_block_tile<size, void>
1180 : public impl::thread_block_tile_internal<size, void> {
1181 template <unsigned int, class ParentCGTy> friend class thread_block_tile;
1182
1183 protected:
1184 public:
1185 template <class ParentCGTy>
1187 : impl::thread_block_tile_internal<size, void>(g) {}
1188};
1189
1190template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
1191
1192namespace impl {
1193template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;
1194
1195template <unsigned int size> struct tiled_partition_internal<size, thread_block>
1196 : public thread_block_tile<size, thread_block> {
1199};
1200
1201// ParentCGTy = thread_block_tile<ParentSize, GrandParentCGTy> specialization
1202template <unsigned int size, unsigned int ParentSize, class GrandParentCGTy>
1203struct tiled_partition_internal<size, thread_block_tile<ParentSize, GrandParentCGTy> >
1204 : public thread_block_tile<size, thread_block_tile<ParentSize, GrandParentCGTy> > {
1205 static_assert(size <= ParentSize, "Sub tile size must be <= parent tile size in tiled_partition");
1206
1208 : thread_block_tile<size, thread_block_tile<ParentSize, GrandParentCGTy> >(g) {}
1209};
1210
1211} // namespace impl
1212
1225template <unsigned int size, class ParentCGTy>
1226__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
1227 static_assert(is_valid_tile_size<size>::value,
1228 "Tiled partition with size > wavefront size. Currently not supported ");
1230}
1231
1232#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
1233
1242__CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred) {
1243 auto mask = __ballot_sync<unsigned long long>(cgrp.coalesced_info.member_mask, pred);
1244
1245 if (pred) {
1246 return coalesced_group(mask);
1247 } else {
1248 return coalesced_group(cgrp.coalesced_info.member_mask ^ mask);
1249 }
1250}
1251
1263template <unsigned int size, class parent>
1265 bool pred) {
1266 auto mask = __ballot_sync<unsigned long long>(tgrp.build_mask(), pred);
1267
1268 if (pred) {
1269 return coalesced_group(mask);
1270 } else {
1271 return coalesced_group(tgrp.build_mask() ^ mask);
1272 }
1273}
1274#endif
1275} // namespace cooperative_groups
1276
1277#endif // __cplusplus
1278#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
The coalesced_group cooperative group type.
Definition amd_hip_cooperative_groups.h:370
The grid cooperative group type.
Definition amd_hip_cooperative_groups.h:193
Definition amd_hip_cooperative_groups.h:1023
thread_block_tile_internal(const thread_block &g)
Definition amd_hip_cooperative_groups.h:1029
thread_block_tile_internal(const thread_block_tile_internal< tbtSize, tbtParentT > &g)
Definition amd_hip_cooperative_groups.h:1025
The multi-grid cooperative group type.
Definition amd_hip_cooperative_groups.h:138
User exposed API that captures the state of the parent group pre-partition.
Definition amd_hip_cooperative_groups.h:891
thread_block_tile(const thread_block_tile< size, ParentCGTy > &g)
Definition amd_hip_cooperative_groups.h:1186
Definition amd_hip_cooperative_groups.h:832
Group type - thread_block_tile.
Definition amd_hip_cooperative_groups.h:914
Group type - thread_block_tile.
Definition amd_hip_cooperative_groups.h:1043
T shfl_down(T var, unsigned int lane_delta) const
Shuffle down operation on group level.
int any(int pred) const
Any function on group level.
unsigned long long ballot(int pred) const
Ballot function on group level.
thread_block_tile(const ParentCGTy &g)
Definition amd_hip_cooperative_groups.h:1045
unsigned int meta_group_rank() const
unsigned int thread_rank() const
Rank of the calling thread within [0, num_threads() ).
T shfl_xor(T var, unsigned int laneMask) const
Shuffle xor operation on group level.
int all(int pred) const
All function on group level.
unsigned long long match_any(T value) const
Match any function on group level.
unsigned long long match_all(T value, int &pred) const
Match all function on group level.
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
T shfl_up(T var, unsigned int lane_delta) const
Shuffle up operation on group level.
void sync()
Synchronizes the threads in the group.
T shfl(T var, int srcRank) const
Shuffle operation on group level.
The workgroup (thread-block in CUDA terminology) cooperative group type.
Definition amd_hip_cooperative_groups.h:234
The base type of all cooperative group types.
Definition amd_hip_cooperative_groups.h:50
Definition amd_hip_cooperative_groups.h:809
The tiled_group cooperative group type.
Definition amd_hip_cooperative_groups.h:313
const struct texture< T, dim, readMode > const void size_t size
Definition hip_runtime_api.h:10010
bool is_valid(CGTy const &g)
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:788
void sync(CGTy const &g)
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:799
__hip_uint32_t group_size(CGTy const &g)
Returns the size of the group.
Definition amd_hip_cooperative_groups.h:760
__hip_uint32_t thread_rank(CGTy const &g)
Returns the rank of thread of the group.
Definition amd_hip_cooperative_groups.h:775
thread_block this_thread_block()
User-exposed API interface to construct workgroup cooperative group type object - thread_block.
Definition amd_hip_cooperative_groups.h:302
coalesced_group binary_partition(const coalesced_group &cgrp, bool pred)
Binary partition.
Definition amd_hip_cooperative_groups.h:1242
thread_group tiled_partition(const thread_group &parent, unsigned int tile_size)
User-exposed API to partition groups.
Definition amd_hip_cooperative_groups.h:991
multi_grid_group this_multi_grid()
User-exposed API interface to construct grid cooperative group type object - multi_grid_group.
Definition amd_hip_cooperative_groups.h:179
coalesced_group coalesced_threads()
User-exposed API to create coalesced groups.
Definition amd_hip_cooperative_groups.h:644
grid_group this_grid()
User-exposed API interface to construct grid cooperative group type object - grid_group.
Definition amd_hip_cooperative_groups.h:223
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:358
T shfl_xor(T var, unsigned int laneMask) const
Definition amd_hip_cooperative_groups.h:863
static constexpr unsigned int numThreads
Definition amd_hip_cooperative_groups.h:811
friend multi_grid_group this_multi_grid()
User-exposed API interface to construct grid cooperative group type object - multi_grid_group.
Definition amd_hip_cooperative_groups.h:179
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:458
static void sync()
Definition amd_hip_cooperative_groups.h:849
unsigned int num_threads
Definition amd_hip_cooperative_groups.h:72
__hip_uint32_t thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:205
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:209
__hip_uint32_t size() const
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:94
unsigned int size() const
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:450
__hip_uint32_t num_grids()
Definition amd_hip_cooperative_groups.h:151
unsigned long long match_all(T value, int &pred) const
Definition amd_hip_cooperative_groups.h:882
unsigned long long match_any(T value) const
Match any function on group level.
Definition amd_hip_cooperative_groups.h:610
friend thread_block this_thread_block()
User-exposed API interface to construct workgroup cooperative group type object - thread_block.
Definition amd_hip_cooperative_groups.h:302
static unsigned int meta_group_size()
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:900
unsigned int meta_group_rank() const
Definition amd_hip_cooperative_groups.h:462
dim3 group_dim() const
Definition amd_hip_cooperative_groups.h:210
unsigned int meta_group_rank() const
Definition amd_hip_cooperative_groups.h:965
__hip_uint32_t _num_threads
Type of the thread_group.
Definition amd_hip_cooperative_groups.h:53
static unsigned int num_threads()
Number of threads within this tile.
Definition amd_hip_cooperative_groups.h:820
thread_group(internal::group_type type, __hip_uint32_t num_threads=static_cast< __hip_uint64_t >(0), __hip_uint64_t mask=static_cast< __hip_uint64_t >(0))
Definition amd_hip_cooperative_groups.h:62
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:467
unsigned int thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:353
unsigned int thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:453
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:970
int all(int pred) const
All function on group level.
Definition amd_hip_cooperative_groups.h:598
unsigned int cg_type() const
Returns the type of the group.
Definition amd_hip_cooperative_groups.h:96
T shfl(T var, int srcRank) const
Definition amd_hip_cooperative_groups.h:851
thread_group new_tiled_group(unsigned int tile_size) const
Definition amd_hip_cooperative_groups.h:248
static __hip_uint32_t size()
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:283
unsigned int num_threads() const
Definition amd_hip_cooperative_groups.h:345
unsigned int num_threads() const
Definition amd_hip_cooperative_groups.h:447
tiled_group(unsigned int tileSize)
Definition amd_hip_cooperative_groups.h:337
unsigned int meta_group_rank
Definition amd_hip_cooperative_groups.h:73
void sync() const
Synchronizes the threads in the group.
unsigned long long match_all(T value, int &pred) const
Match all function on group level.
Definition amd_hip_cooperative_groups.h:627
thread_block_tile_type()
Definition amd_hip_cooperative_groups.h:919
grid_group(__hip_uint32_t size)
Construct grid thread group (through the API this_grid())
Definition amd_hip_cooperative_groups.h:200
__hip_uint32_t grid_rank()
Definition amd_hip_cooperative_groups.h:155
static constexpr unsigned int thread_rank()
Rank of the thread within this tile.
Definition amd_hip_cooperative_groups.h:815
bool is_tiled
Definition amd_hip_cooperative_groups.h:71
unsigned long long ballot(int pred) const
Ballot function on group level.
Definition amd_hip_cooperative_groups.h:576
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:163
T shfl_up(T var, unsigned int lane_delta) const
Definition amd_hip_cooperative_groups.h:859
static dim3 group_index()
Returns 3-dimensional block index within the grid.
Definition amd_hip_cooperative_groups.h:271
unsigned int num_threads
Definition amd_hip_cooperative_groups.h:79
static void sync()
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:287
friend coalesced_group binary_partition(const thread_block_tile< fsize, fparent > &tgrp, bool pred)
__hip_uint32_t thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:157
bool is_valid() const
Returns true if the group has not violated any API constraints.
static unsigned int size()
Definition amd_hip_cooperative_groups.h:824
friend thread_group tiled_partition(const thread_group &parent, unsigned int tile_size)
User-exposed API to partition groups.
Definition amd_hip_cooperative_groups.h:991
int any(int pred) const
Definition amd_hip_cooperative_groups.h:873
int all(int pred) const
Definition amd_hip_cooperative_groups.h:875
__hip_uint64_t _mask
Total number of threads in the thread_group.
Definition amd_hip_cooperative_groups.h:54
friend thread_group this_thread()
Definition amd_hip_cooperative_groups.h:979
bool is_valid() const
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:207
__hip_uint32_t _type
Definition amd_hip_cooperative_groups.h:52
unsigned long long match_any(T value) const
Definition amd_hip_cooperative_groups.h:877
unsigned int size() const
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:350
struct cooperative_groups::thread_group::_coalesced_info coalesced_info
lane_mask member_mask
Definition amd_hip_cooperative_groups.h:78
thread_block(__hip_uint32_t size)
Definition amd_hip_cooperative_groups.h:245
unsigned int meta_group_size
Definition amd_hip_cooperative_groups.h:74
multi_grid_group(__hip_uint32_t size)
Construct multi-grid thread group (through the API this_multi_grid())
Definition amd_hip_cooperative_groups.h:145
int any(int pred) const
Any function on group level.
Definition amd_hip_cooperative_groups.h:588
thread_group this_thread()
Definition amd_hip_cooperative_groups.h:979
static unsigned int meta_group_rank()
Definition amd_hip_cooperative_groups.h:895
__hip_uint32_t num_threads() const
Definition amd_hip_cooperative_groups.h:92
struct _tiled_info tiled_info
Definition amd_hip_cooperative_groups.h:80
friend coalesced_group coalesced_threads()
User-exposed API to create coalesced groups.
Definition amd_hip_cooperative_groups.h:644
friend coalesced_group binary_partition(const coalesced_group &cgrp, bool pred)
Binary partition.
Definition amd_hip_cooperative_groups.h:1242
static dim3 thread_index()
Returns 3-dimensional thread index within the block.
Definition amd_hip_cooperative_groups.h:273
friend class thread_block
Definition amd_hip_cooperative_groups.h:86
coalesced_group(lane_mask member_mask)
Definition amd_hip_cooperative_groups.h:435
T shfl_down(T var, unsigned int lane_delta) const
Shuffle down operation on group level.
Definition amd_hip_cooperative_groups.h:508
static bool is_valid()
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:285
dim3 group_dim()
Returns the group dimensions.
Definition amd_hip_cooperative_groups.h:289
thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
Definition amd_hip_cooperative_groups.h:948
static __hip_uint32_t num_threads()
Definition amd_hip_cooperative_groups.h:279
unsigned long long ballot(int pred) const
Definition amd_hip_cooperative_groups.h:868
T shfl(T var, int srcRank) const
Shuffle operation on group level.
Definition amd_hip_cooperative_groups.h:483
bool is_valid() const
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:161
static __hip_uint32_t thread_rank()
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:275
friend grid_group this_grid()
User-exposed API interface to construct grid cooperative group type object - grid_group.
Definition amd_hip_cooperative_groups.h:223
T shfl_down(T var, unsigned int lane_delta) const
Definition amd_hip_cooperative_groups.h:855
T shfl_up(T var, unsigned int lane_delta) const
Shuffle up operation on group level.
Definition amd_hip_cooperative_groups.h:545
thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
Definition amd_hip_cooperative_groups.h:924
__hip_uint32_t thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:40
tiled_partition_internal(const thread_block &g)
Definition amd_hip_cooperative_groups.h:1197
tiled_partition_internal(const thread_block_tile< ParentSize, GrandParentCGTy > &g)
Definition amd_hip_cooperative_groups.h:1207
Definition amd_hip_cooperative_groups.h:1193
Definition amd_hip_cooperative_groups.h:77
Definition amd_hip_cooperative_groups.h:70
Definition hip_runtime_api.h:1296