/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hip/checkouts/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hip/checkouts/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File#

HIP Runtime API Reference: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hip/checkouts/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File
amd_hip_cooperative_groups.h
Go to the documentation of this file.
1/*
2Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
32#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
33#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
34
35#if __cplusplus
36#if !defined(__HIPCC_RTC__)
37#include <hip/amd_detail/hip_cooperative_groups_helper.h>
38#endif
39
41
51 protected:
52 uint32_t _type;
53 uint32_t _size;
54 uint64_t _mask;
56
62 __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint64_t>(0),
63 uint64_t mask = static_cast<uint64_t>(0)) {
64 _type = type;
65 _size = size;
66 _mask = mask;
67 }
68
69 struct _tiled_info {
71 unsigned int size;
72 unsigned int meta_group_rank;
73 unsigned int meta_group_size;
74 };
75
81
82 friend __CG_QUALIFIER__ thread_group this_thread();
83 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
84 unsigned int tile_size);
85 friend class thread_block;
86
87 public:
91 __CG_QUALIFIER__ uint32_t size() const { return _size; }
93 __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
95 __CG_QUALIFIER__ uint32_t thread_rank() const;
97 __CG_QUALIFIER__ bool is_valid() const;
98
111 __CG_QUALIFIER__ void sync() const;
112};
138 friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
139
140 protected:
142 explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
143 : thread_group(internal::cg_multi_grid, size) {}
144
145 public:
146
149 __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
150
153 __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
155 __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
157 __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
159 __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
160};
161
176 return multi_grid_group(internal::multi_grid::size());
177}
178// Doxygen end group CooperativeGConstruct
/** @brief The grid cooperative group type.
 *
 *  Thin wrapper whose members all forward to the internal::grid helpers.
 */
class grid_group : public thread_group {
  // Only the factory this_grid() may construct a grid_group.
  friend __CG_QUALIFIER__ grid_group this_grid();

 protected:
  /// Construct a grid group of the given size (used by this_grid()).
  explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}

 public:
  /// Rank of the calling thread within [0, size()).
  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
  /// True if the group has not violated any API constraints.
  __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
  /// Synchronizes the threads in the group.
  __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
  /// Dimensions of a workgroup — note this forwards to workgroup::block_dim(),
  /// not a grid-level query.
  __CG_QUALIFIER__ dim3 group_dim() const { return internal::workgroup::block_dim(); }
};
207
218__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
219
232 friend __CG_QUALIFIER__ thread_block this_thread_block();
233 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
234 unsigned int tile_size);
235 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
236 unsigned int tile_size);
237 protected:
238 // Construct a workgroup thread group (through the API this_thread_block())
239 explicit __CG_QUALIFIER__ thread_block(uint32_t size)
240 : thread_group(internal::cg_workgroup, size) {}
241
  /** Partition the thread block into tiles of tile_size threads.
   *
   *  tile_size must be a nonzero power of two no larger than the wavefront
   *  size, otherwise the device assert fires.
   *
   *  @return a cg_tiled_group thread_group describing the calling thread's tile.
   */
  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
    // Power-of-two check: a power of two shares no bits with (itself - 1).
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
    // Invalid tile size, assert
    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    auto block_size = size();
    auto rank = thread_rank();
    // Number of tiles needed to cover the block (ceiling division);
    // the last tile may be partial.
    auto partitions = (block_size + tile_size - 1) / tile_size;
    // Unfilled slots in the last tile.
    auto tail = (partitions * tile_size) - block_size;
    // Threads in the last tile get the reduced size; everyone else gets
    // tile_size (the comparison evaluates to 0 or 1).
    auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
    thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);

    tiledGroup.coalesced_info.tiled_info.size = tile_size;
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
    tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
    return tiledGroup;
  }
262
263 public:
265 __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
267 __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
269 __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
271 __CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); }
273 __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
275 __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
277 __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
278};
279
290__CG_QUALIFIER__ thread_block this_thread_block() {
291 return thread_block(internal::workgroup::size());
292}
293
301class tiled_group : public thread_group {
302 private:
303 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
304 unsigned int tile_size);
305 friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
306 unsigned int tile_size);
307
  /** Further partition this tile into smaller tiles of tile_size threads.
   *
   *  tile_size must be a nonzero power of two no greater than the wavefront
   *  size. If the current tile is already no larger than tile_size, the tile
   *  itself is returned unchanged.
   */
  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    // Already at or below the requested size: nothing to split.
    if (size() <= tile_size) {
      return *this;
    }

    tiled_group tiledGroup = tiled_group(tile_size);
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    return tiledGroup;
  }
323
324 protected:
325 explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
326 : thread_group(internal::cg_tiled_group, tileSize) {
327 coalesced_info.tiled_info.size = tileSize;
329 }
330
331 public:
333 __CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); }
334
  /// Rank of the calling thread within [0, size()). The bit-mask is
  /// equivalent to a modulo because the tile size is a power of two.
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1));
  }
339
341 __CG_QUALIFIER__ void sync() const {
342 internal::tiled_group::sync();
343 }
344};
345
346template <unsigned int size, class ParentCGTy> class thread_block_tile;
347
356 private:
357 friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
358 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size);
359 friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size);
360 friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
361 template <unsigned int fsize, class fparent>
362 friend __CG_QUALIFIER__ coalesced_group
364
365 __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
366 const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
367
368 if (!tile_size || !pow2) {
369 return coalesced_group(0);
370 }
371
372 // If a tiled group is passed to be partitioned further into a coalesced_group.
373 // prepare a mask for further partitioning it so that it stays coalesced.
375 unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
376 unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
377 lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
378
379 member_mask <<= (__lane_id() & ~(tile_size - 1));
380 coalesced_group coalesced_tile = coalesced_group(member_mask);
381 coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
382 coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
383 coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size;
384 return coalesced_tile;
385 }
386 // Here the parent coalesced_group is not partitioned.
387 else {
388 lane_mask member_mask = 0;
389 unsigned int tile_rank = 0;
390 int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;
391
392 for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
393 lane_mask active = coalesced_info.member_mask & (1 << i);
394 // Make sure the lane is active
395 if (active) {
396 if (lanes_to_skip <= 0 && tile_rank < tile_size) {
397 // Prepare a member_mask that is appropriate for a tile
398 member_mask |= active;
399 tile_rank++;
400 }
401 lanes_to_skip--;
402 }
403 }
404 coalesced_group coalesced_tile = coalesced_group(member_mask);
405 coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
406 coalesced_tile.coalesced_info.tiled_info.meta_group_size =
407 (size() + tile_size - 1) / tile_size;
408 return coalesced_tile;
409 }
410 return coalesced_group(0);
411 }
412
413 protected:
414 // Constructor
415 explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
416 : thread_group(internal::cg_coalesced_group) {
417 coalesced_info.member_mask = member_mask; // Which threads are active
418 coalesced_info.size = __popcll(coalesced_info.member_mask); // How many threads are active
419 coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
422 }
423
424 public:
  /// Number of active threads in the coalesced group (set by the constructor
  /// as the popcount of member_mask).
  __CG_QUALIFIER__ unsigned int size() const {
    return coalesced_info.size;
  }
429
  /// Rank of the calling thread within [0, size()); presumably the count of
  /// set bits in member_mask below the caller's lane — confirm against
  /// internal::coalesced_group::masked_bit_count.
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
  }
434
436 __CG_QUALIFIER__ void sync() const {
437 internal::coalesced_group::sync();
438 }
439
442 __CG_QUALIFIER__ unsigned int meta_group_rank() const {
444 }
445
447 __CG_QUALIFIER__ unsigned int meta_group_size() const {
449 }
450
  /** Shuffle: read @p var from the member whose group rank is srcRank.
   *
   *  srcRank is wrapped modulo size(). For a partial group the rank is
   *  translated to a physical lane via __fns64/__fns32 over the member mask
   *  (presumably locating the (srcRank+1)-th set bit — confirm against the
   *  __fns builtin documentation).
   */
  template <class T>
  __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    srcRank = srcRank % static_cast<int>(size());

    // Full wavefront: rank and lane coincide; otherwise map rank -> lane.
    int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
        : (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
        : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }
475
  /** Shuffle-down within the coalesced group.
   *
   *  For a full wavefront this is a plain __shfl_down. Otherwise the target
   *  lane is found in the member mask relative to the caller's lane
   *  (presumably the (lane_delta+1)-th set bit at or after it — confirm
   *  against the __fns builtin documentation); if no such lane exists the
   *  caller reads back its own value.
   */
  template <class T>
  __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    // Note: The cuda implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
      return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
    }

    int lane;
    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }
    else {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }

    // No member that far down: fall back to our own lane.
    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }
516
  /** Shuffle-up within the coalesced group.
   *
   *  Mirror image of shfl_down: for a partial group the target lane is
   *  searched backwards through the member mask (negative count to
   *  __fns64/__fns32); if no such lane exists the caller reads back its own
   *  value.
   */
  template <class T>
  __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    // Note: The cuda implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
      return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
    }

    int lane;
    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    }
    else if (__AMDGCN_WAVEFRONT_SIZE == 32) {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    }

    // No member that far up: fall back to our own lane.
    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }
557#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
558
566 __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
567 return internal::helper::adjust_mask(
569 __ballot_sync<unsigned long long>(coalesced_info.member_mask, pred));
570 }
571
  /// Warp vote restricted to this group's member mask: nonzero if any
  /// member's predicate is nonzero.
  __CG_QUALIFIER__ int any(int pred) const {
    return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }
581
  /// Warp vote restricted to this group's member mask: nonzero if every
  /// member's predicate is nonzero.
  __CG_QUALIFIER__ int all(int pred) const {
    return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }
591
600 template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
601 return internal::helper::adjust_mask(
603 __match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
604 }
605
617 template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
618 return internal::helper::adjust_mask(
620 __match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
621 &pred));
622 }
623#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
624};
625
635 return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
636}
637
638#ifndef DOXYGEN_SHOULD_SKIP_THIS
639
/** @brief Rank of the calling thread, dispatched on the dynamic group kind.
 *
 *  thread_group acts as a tagged base class: _type records which derived
 *  group this object really is, so the call is forwarded through a static
 *  downcast. Returns (uint32_t)-1 after asserting on an unknown tag.
 */
__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->thread_rank());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->thread_rank());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->thread_rank());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->thread_rank());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->thread_rank());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return -1;
    }
  }
}
668
/** @brief Validity check, dispatched on the dynamic group kind via the
 *  _type tag (see thread_rank for the dispatch pattern). Returns false
 *  after asserting on an unknown tag.
 */
__CG_QUALIFIER__ bool thread_group::is_valid() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->is_valid());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->is_valid());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->is_valid());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->is_valid());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->is_valid());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return false;
    }
  }
}
697
/** @brief Synchronizes the threads in the group, dispatched on the dynamic
 *  group kind via the _type tag (see thread_rank for the dispatch pattern).
 *  Asserts on an unknown tag.
 */
__CG_QUALIFIER__ void thread_group::sync() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      static_cast<const multi_grid_group*>(this)->sync();
      break;
    }
    case internal::cg_grid: {
      static_cast<const grid_group*>(this)->sync();
      break;
    }
    case internal::cg_workgroup: {
      static_cast<const thread_block*>(this)->sync();
      break;
    }
    case internal::cg_tiled_group: {
      static_cast<const tiled_group*>(this)->sync();
      break;
    }
    case internal::cg_coalesced_group: {
      static_cast<const coalesced_group*>(this)->sync();
      break;
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
    }
  }
}
730
731#endif
732
750template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }
751
/// Returns the rank of the calling thread within cooperative group @p g.
template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
  return g.thread_rank();
}
766
776template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }
777
787template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
788
789// Doxygen end group CooperativeGAPI
/** Compile-time-sized tile helper. numThreads is fixed at compile time;
 *  thread_rank assumes tileSize is a power of two (mask instead of modulo).
 */
template <unsigned int tileSize> class tile_base {
 protected:
  // Number of threads in the tile, fixed at compile time.
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

 public:
  /// Rank of the calling thread within [0, numThreads).
  /// NOTE(review): declared with _CG_STATIC_CONST_DECL_ while size() uses
  /// __CG_STATIC_QUALIFIER__ — verify the macro choice is intentional.
  _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
    return (internal::workgroup::thread_rank() & (numThreads - 1));
  }

  /// Number of threads in the tile.
  __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
};
810
816template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
817 static_assert(is_valid_tile_size<size>::value,
818 "Tile size is either not a power of 2 or greater than the wavefront size");
820
821 template <unsigned int fsize, class fparent>
822 friend __CG_QUALIFIER__ coalesced_group
824
825#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
  /// Builds the lane mask covering this tile: numThreads consecutive set
  /// bits, shifted to the tile's position within the wavefront (the caller's
  /// tile index times the tile width).
  __CG_QUALIFIER__ unsigned long long build_mask() const {
    unsigned long long mask = ~0ull >> (64 - numThreads);
    return mask << ((internal::workgroup::thread_rank() / numThreads) * numThreads);
  }
830#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
831
832 public:
833
834 __CG_STATIC_QUALIFIER__ void sync() {
835 internal::tiled_group::sync();
836 }
837
838 template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
839 static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
840 return (__shfl(var, srcRank, numThreads));
841 }
842
843 template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
844 static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
845 return (__shfl_down(var, lane_delta, numThreads));
846 }
847
848 template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
849 static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
850 return (__shfl_up(var, lane_delta, numThreads));
851 }
852
853 template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
854 static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
855 return (__shfl_xor(var, laneMask, numThreads));
856 }
857
858#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
859 __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
860 const auto mask = build_mask();
861 return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
862 }
863
864 __CG_QUALIFIER__ int any(int pred) const { return __any_sync(build_mask(), pred); }
865
866 __CG_QUALIFIER__ int all(int pred) const { return __all_sync(build_mask(), pred); }
867
868 template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
869 const auto mask = build_mask();
870 return internal::helper::adjust_mask(mask, __match_any_sync(mask, value));
871 }
872
873 template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
874 const auto mask = build_mask();
875 return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
876 }
877#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
878};
879
882template <unsigned int tileSize, typename ParentCGTy>
884public:
  /// Rank of this tile among the tiles carved out of the parent group.
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
    return ParentCGTy::thread_rank() / tileSize;
  }
890
  /// Number of tiles created when the parent group was partitioned
  /// (ceiling division, so a partial last tile still counts).
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
    return (ParentCGTy::size() + tileSize - 1) / tileSize;
  }
895};
896
903template <unsigned int tileSize, class ParentCGTy>
905 public tiled_group,
906 public parent_group_info<tileSize, ParentCGTy> {
907 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
909 protected:
910 __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
911 coalesced_info.tiled_info.size = numThreads;
913 }
914 public:
915 using tbtBase::size;
918};
919
920// Partial template specialization
921template <unsigned int tileSize>
922class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
923 public tiled_group
924 {
925 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
926
928
929 protected:
930
938
939 public:
940 using tbtBase::size;
941 using tbtBase::sync;
943
946 __CG_QUALIFIER__ unsigned int meta_group_rank() const {
948 }
949
951 __CG_QUALIFIER__ unsigned int meta_group_size() const {
953 }
954// Doxygen end group CooperativeG
958};
959
960__CG_QUALIFIER__ thread_group this_thread() {
961 thread_group g(internal::group_type::cg_coalesced_group, 1, __ockl_activelane_u32());
962 return g;
963}
964
972__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
973 if (parent.cg_type() == internal::cg_tiled_group) {
974 const tiled_group* cg = static_cast<const tiled_group*>(&parent);
975 return cg->new_tiled_group(tile_size);
976 }
977 else if(parent.cg_type() == internal::cg_coalesced_group) {
978 const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
979 return cg->new_tiled_group(tile_size);
980 }
981 else {
982 const thread_block* tb = static_cast<const thread_block*>(&parent);
983 return tb->new_tiled_group(tile_size);
984 }
985}
986
987// Thread block type overload
/// Partitions a thread_block into tiles of tile_size threads.
__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}
991
/// Re-partitions an existing tiled_group into smaller tiles of tile_size threads.
__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}
995
996// If a coalesced group is passed to be partitioned, it should remain coalesced
/// Partitions a coalesced_group; the result stays a coalesced_group.
__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}
1000
1001namespace impl {
1002template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
1003
1004template <unsigned int size, class ParentCGTy>
1005class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
1006 protected:
1007 template <unsigned int tbtSize, class tbtParentT>
1011
1012 __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
1013 : thread_block_tile_type<size, ParentCGTy>() {}
1014};
1015} // namespace impl
1016
/** Group type - thread_block_tile.
 *
 *  Statically-sized tile of a parent cooperative group; ParentCGTy records
 *  the parent group type pre-partition.
 */
template <unsigned int size, class ParentCGTy>
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
 protected:
  /// Construct from the parent group.
  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}

 public:
  /// Conversion that erases the parent type (ParentCGTy -> void).
  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
    return thread_block_tile<size, void>(*this);
  }

#ifdef DOXYGEN_SHOULD_INCLUDE_THIS
  // Documentation-only declarations; the implementations are inherited from
  // the thread_block_tile_internal / thread_block_tile_base hierarchy.

  /// Rank of the calling thread within [0, size()).
  __CG_QUALIFIER__ unsigned int thread_rank() const;

  /// Synchronizes the threads in the tile.
  __CG_QUALIFIER__ void sync();

  /// Rank of this tile within the parent group's set of tiles.
  __CG_QUALIFIER__ unsigned int meta_group_rank() const;

  /// Number of tiles created when the parent group was partitioned.
  __CG_QUALIFIER__ unsigned int meta_group_size() const;

  /// Shuffle: read var from the thread with rank srcRank.
  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const;

  /// Shuffle down by lane_delta within the tile.
  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const;

  /// Shuffle up by lane_delta within the tile.
  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const;

  /// Shuffle xor with laneMask within the tile.
  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const;

  /// Ballot of pred across the tile's members.
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const;

  /// Nonzero if any member's predicate is nonzero.
  __CG_QUALIFIER__ int any(int pred) const;

  /// Nonzero if every member's predicate is nonzero.
  __CG_QUALIFIER__ int all(int pred) const;

  /// Mask of members holding the same value as the caller.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const;

  /// Match-all across the tile; pred is set accordingly.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const;

#endif
};
1161
1162template <unsigned int size>
1163class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
1164 template <unsigned int, class ParentCGTy> friend class thread_block_tile;
1165
1166 protected:
1167 public:
1168 template <class ParentCGTy>
1170 : impl::thread_block_tile_internal<size, void>(g) {}
1171};
1172
1173template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
1174
1175namespace impl {
1176template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;
1177
/// Specialization that builds a statically-sized tile directly from a
/// thread_block parent.
template <unsigned int size>
struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
  __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
      : thread_block_tile<size, thread_block>(g) {}
};
1183
1184} // namespace impl
1185
1198template <unsigned int size, class ParentCGTy>
1199__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
1200 static_assert(is_valid_tile_size<size>::value,
1201 "Tiled partition with size > wavefront size. Currently not supported ");
1203}
1204
1205#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
1206
1215__CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred) {
1216 auto mask = __ballot_sync<unsigned long long>(cgrp.coalesced_info.member_mask, pred);
1217
1218 if (pred) {
1219 return coalesced_group(mask);
1220 } else {
1221 return coalesced_group(cgrp.coalesced_info.member_mask ^ mask);
1222 }
1223}
1224
1236template <unsigned int size, class parent>
1238 bool pred) {
1239 auto mask = __ballot_sync<unsigned long long>(tgrp.build_mask(), pred);
1240
1241 if (pred) {
1242 return coalesced_group(mask);
1243 } else {
1244 return coalesced_group(tgrp.build_mask() ^ mask);
1245 }
1246}
1247#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
1248} // namespace cooperative_groups
1249
1250#endif // __cplusplus
1251#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
The coalesced_group cooperative group type.
Definition amd_hip_cooperative_groups.h:355
The grid cooperative group type.
Definition amd_hip_cooperative_groups.h:189
Definition amd_hip_cooperative_groups.h:1005
thread_block_tile_internal(const thread_block &g)
Definition amd_hip_cooperative_groups.h:1012
thread_block_tile_internal(const thread_block_tile_internal< tbtSize, tbtParentT > &g)
Definition amd_hip_cooperative_groups.h:1008
The multi-grid cooperative group type.
Definition amd_hip_cooperative_groups.h:135
User exposed API that captures the state of the parent group pre-partition.
Definition amd_hip_cooperative_groups.h:883
thread_block_tile(const thread_block_tile< size, ParentCGTy > &g)
Definition amd_hip_cooperative_groups.h:1169
Definition amd_hip_cooperative_groups.h:816
Group type - thread_block_tile.
Definition amd_hip_cooperative_groups.h:906
Group type - thread_block_tile.
Definition amd_hip_cooperative_groups.h:1026
T shfl_down(T var, unsigned int lane_delta) const
Shuffle down operation on group level.
int any(int pred) const
Any function on group level.
unsigned long long ballot(int pred) const
Ballot function on group level.
thread_block_tile(const ParentCGTy &g)
Definition amd_hip_cooperative_groups.h:1028
unsigned int meta_group_rank() const
unsigned int thread_rank() const
Rank of the calling thread within [0, size() ).
T shfl_xor(T var, unsigned int laneMask) const
Shuffle xor operation on group level.
int all(int pred) const
All function on group level.
unsigned long long match_any(T value) const
Match any function on group level.
unsigned long long match_all(T value, int &pred) const
Match all function on group level.
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
T shfl_up(T var, unsigned int lane_delta) const
Shuffle up operation on group level.
void sync()
Synchronizes the threads in the group.
T shfl(T var, int srcRank) const
Shuffle operation on group level.
The workgroup (thread-block in CUDA terminology) cooperative group type.
Definition amd_hip_cooperative_groups.h:229
The base type of all cooperative group types.
Definition amd_hip_cooperative_groups.h:50
Definition amd_hip_cooperative_groups.h:797
The tiled_group cooperative group type.
Definition amd_hip_cooperative_groups.h:301
bool is_valid(CGTy const &g)
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:776
void sync(CGTy const &g)
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:787
uint32_t thread_rank(CGTy const &g)
Returns the rank of thread of the group.
Definition amd_hip_cooperative_groups.h:763
uint32_t group_size(CGTy const &g)
Returns the size of the group.
Definition amd_hip_cooperative_groups.h:750
thread_block this_thread_block()
User-exposed API interface to construct workgroup cooperative group type object - thread_block.
Definition amd_hip_cooperative_groups.h:290
coalesced_group binary_partition(const coalesced_group &cgrp, bool pred)
Binary partition.
Definition amd_hip_cooperative_groups.h:1215
thread_group tiled_partition(const thread_group &parent, unsigned int tile_size)
User-exposed API to partition groups.
Definition amd_hip_cooperative_groups.h:972
multi_grid_group this_multi_grid()
User-exposed API interface to construct grid cooperative group type object - multi_grid_group.
Definition amd_hip_cooperative_groups.h:175
coalesced_group coalesced_threads()
User-exposed API to create coalesced groups.
Definition amd_hip_cooperative_groups.h:634
grid_group this_grid()
User-exposed API interface to construct grid cooperative group type object - grid_group.
Definition amd_hip_cooperative_groups.h:218
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:341
T shfl_xor(T var, unsigned int laneMask) const
Definition amd_hip_cooperative_groups.h:853
static constexpr unsigned int numThreads
Definition amd_hip_cooperative_groups.h:799
friend multi_grid_group this_multi_grid()
User-exposed API interface to construct grid cooperative group type object - multi_grid_group.
Definition amd_hip_cooperative_groups.h:175
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:436
static void sync()
Definition amd_hip_cooperative_groups.h:834
uint32_t size() const
Definition amd_hip_cooperative_groups.h:91
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:204
unsigned int size() const
Definition amd_hip_cooperative_groups.h:426
unsigned long long match_all(T value, int &pred) const
Definition amd_hip_cooperative_groups.h:873
unsigned long long match_any(T value) const
Match any function on group level.
Definition amd_hip_cooperative_groups.h:600
friend thread_block this_thread_block()
User-exposed API interface to construct workgroup cooperative group type object - thread_block.
Definition amd_hip_cooperative_groups.h:290
uint32_t _size
Type of the thread_group.
Definition amd_hip_cooperative_groups.h:53
static unsigned int meta_group_size()
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:892
unsigned int meta_group_rank() const
Definition amd_hip_cooperative_groups.h:442
dim3 group_dim() const
Definition amd_hip_cooperative_groups.h:205
unsigned int meta_group_rank() const
Definition amd_hip_cooperative_groups.h:946
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:447
unsigned int thread_rank() const
Rank of the calling thread within [0, size() ).
Definition amd_hip_cooperative_groups.h:336
unsigned int thread_rank() const
Rank of the calling thread within [0, size() ).
Definition amd_hip_cooperative_groups.h:431
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:951
int all(int pred) const
All function on group level.
Definition amd_hip_cooperative_groups.h:588
unsigned int cg_type() const
Returns the type of the group.
Definition amd_hip_cooperative_groups.h:93
T shfl(T var, int srcRank) const
Definition amd_hip_cooperative_groups.h:838
thread_block(uint32_t size)
Definition amd_hip_cooperative_groups.h:239
thread_group new_tiled_group(unsigned int tile_size) const
Definition amd_hip_cooperative_groups.h:242
thread_group(internal::group_type type, uint32_t size=static_cast< uint64_t >(0), uint64_t mask=static_cast< uint64_t >(0))
Definition amd_hip_cooperative_groups.h:62
uint32_t thread_rank() const
Rank of the calling thread within [0, size() ).
Definition amd_hip_cooperative_groups.h:200
tiled_group(unsigned int tileSize)
Definition amd_hip_cooperative_groups.h:325
unsigned int meta_group_rank
Definition amd_hip_cooperative_groups.h:72
void sync() const
Synchronizes the threads in the group.
unsigned long long match_all(T value, int &pred) const
Match all function on group level.
Definition amd_hip_cooperative_groups.h:617
thread_block_tile_type()
Definition amd_hip_cooperative_groups.h:910
static uint32_t size()
Definition amd_hip_cooperative_groups.h:271
static constexpr unsigned int thread_rank()
Rank of the thread within this tile.
Definition amd_hip_cooperative_groups.h:803
bool is_tiled
Definition amd_hip_cooperative_groups.h:70
unsigned long long ballot(int pred) const
Ballot function on group level.
Definition amd_hip_cooperative_groups.h:566
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:159
T shfl_up(T var, unsigned int lane_delta) const
Definition amd_hip_cooperative_groups.h:848
uint32_t num_grids()
Definition amd_hip_cooperative_groups.h:149
static dim3 group_index()
Returns 3-dimensional block index within the grid.
Definition amd_hip_cooperative_groups.h:265
static void sync()
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:275
friend coalesced_group binary_partition(const thread_block_tile< fsize, fparent > &tgrp, bool pred)
bool is_valid() const
Returns true if the group has not violated any API constraints.
static unsigned int size()
Number of threads within this tile.
Definition amd_hip_cooperative_groups.h:808
friend thread_group tiled_partition(const thread_group &parent, unsigned int tile_size)
User-exposed API to partition groups.
Definition amd_hip_cooperative_groups.h:972
int any(int pred) const
Definition amd_hip_cooperative_groups.h:864
int all(int pred) const
Definition amd_hip_cooperative_groups.h:866
uint32_t thread_rank() const
Rank of the calling thread within [0, size() ).
Definition amd_hip_cooperative_groups.h:155
friend thread_group this_thread()
Definition amd_hip_cooperative_groups.h:960
bool is_valid() const
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:202
unsigned long long match_any(T value) const
Definition amd_hip_cooperative_groups.h:868
unsigned int size() const
Definition amd_hip_cooperative_groups.h:333
uint64_t _mask
Total number of threads in the tread_group.
Definition amd_hip_cooperative_groups.h:54
unsigned int size
Definition amd_hip_cooperative_groups.h:71
struct cooperative_groups::thread_group::_coalesced_info coalesced_info
lane_mask member_mask
Definition amd_hip_cooperative_groups.h:77
static uint32_t thread_rank()
Rank of the calling thread within [0, size() ).
Definition amd_hip_cooperative_groups.h:269
unsigned int meta_group_size
Definition amd_hip_cooperative_groups.h:73
uint32_t _type
Definition amd_hip_cooperative_groups.h:52
int any(int pred) const
Any function on group level.
Definition amd_hip_cooperative_groups.h:578
thread_group this_thread()
Definition amd_hip_cooperative_groups.h:960
static unsigned int meta_group_rank()
Definition amd_hip_cooperative_groups.h:887
unsigned int size
Definition amd_hip_cooperative_groups.h:78
struct _tiled_info tiled_info
Definition amd_hip_cooperative_groups.h:79
friend coalesced_group coalesced_threads()
User-exposed API to create coalesced groups.
Definition amd_hip_cooperative_groups.h:634
friend coalesced_group binary_partition(const coalesced_group &cgrp, bool pred)
Binary partition.
Definition amd_hip_cooperative_groups.h:1215
static dim3 thread_index()
Returns 3-dimensional thread index within the block.
Definition amd_hip_cooperative_groups.h:267
friend class thread_block
Definition amd_hip_cooperative_groups.h:85
coalesced_group(lane_mask member_mask)
Definition amd_hip_cooperative_groups.h:415
T shfl_down(T var, unsigned int lane_delta) const
Shuffle down operation on group level.
Definition amd_hip_cooperative_groups.h:491
grid_group(uint32_t size)
Construct grid thread group (through the API this_grid())
Definition amd_hip_cooperative_groups.h:196
static bool is_valid()
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:273
dim3 group_dim()
Returns the group dimensions.
Definition amd_hip_cooperative_groups.h:277
thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
Definition amd_hip_cooperative_groups.h:931
unsigned long long ballot(int pred) const
Definition amd_hip_cooperative_groups.h:859
uint32_t thread_rank() const
Rank of the calling thread within [0, size() ).
T shfl(T var, int srcRank) const
Shuffle operation on group level.
Definition amd_hip_cooperative_groups.h:464
multi_grid_group(uint32_t size)
Construct mutli-grid thread group (through the API this_multi_grid())
Definition amd_hip_cooperative_groups.h:142
bool is_valid() const
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:157
uint32_t grid_rank()
Definition amd_hip_cooperative_groups.h:153
friend grid_group this_grid()
User-exposed API interface to construct grid cooperative group type object - grid_group.
Definition amd_hip_cooperative_groups.h:218
T shfl_down(T var, unsigned int lane_delta) const
Definition amd_hip_cooperative_groups.h:843
T shfl_up(T var, unsigned int lane_delta) const
Shuffle up operation on group level.
Definition amd_hip_cooperative_groups.h:532
Definition amd_hip_cooperative_groups.h:40
tiled_partition_internal(const thread_block &g)
Definition amd_hip_cooperative_groups.h:1180
Definition amd_hip_cooperative_groups.h:1176
Definition amd_hip_cooperative_groups.h:76
Definition amd_hip_cooperative_groups.h:69
Definition hip_runtime_api.h:1055