HIP Runtime API Reference: amd_hip_cooperative_groups.h Source File
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H

#if __cplusplus
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/hip_cooperative_groups_helper.h>
#endif
namespace cooperative_groups {

// The base type of all cooperative group types.
class thread_group {
 protected:
  uint32_t _type;  // Type of the thread_group.
  uint32_t _size;  // Total number of threads in the thread_group.
  uint64_t _mask;  // Lane mask for the threads in the thread_group.

  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint32_t>(0),
                                uint64_t mask = static_cast<uint64_t>(0)) {
    _type = type;
    _size = size;
    _mask = mask;
  }

  struct _tiled_info {
    bool is_tiled;
    unsigned int size;
    unsigned int meta_group_rank;
    unsigned int meta_group_size;
  };

  struct _coalesced_info {
    lane_mask member_mask;
    unsigned int size;
    struct _tiled_info tiled_info;
  } coalesced_info;

  friend __CG_QUALIFIER__ thread_group this_thread();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend class thread_block;

 public:
  // Total number of threads in the group.
  __CG_QUALIFIER__ uint32_t size() const { return _size; }
  // Returns the type of the group.
  __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
  // Rank of the calling thread within [0, size()).
  __CG_QUALIFIER__ uint32_t thread_rank() const;
  // Returns true if the group has not violated any API constraints.
  __CG_QUALIFIER__ bool is_valid() const;
  // Synchronizes the threads in the group.
  __CG_QUALIFIER__ void sync() const;
};
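// Usage sketch (illustrative, not part of this header): because thread_group
// is the common base, device code can accept any group kind through a single
// parameter; `is_group_leader` is a hypothetical name.
//
//   #include <hip/hip_cooperative_groups.h>
//   namespace cg = cooperative_groups;
//
//   __device__ bool is_group_leader(const cg::thread_group& g) {
//     return g.thread_rank() == 0;  // dispatches on the stored group type
//   }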
// The multi-grid cooperative group type.
class multi_grid_group : public thread_group {
  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();

 protected:
  // Construct a multi-grid thread group (through the API this_multi_grid()).
  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
      : thread_group(internal::cg_multi_grid, size) {}

 public:
  // Number of grids launched in the multi-grid.
  __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
  // Rank of the calling grid within [0, num_grids()).
  __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
  // Rank of the calling thread within [0, size()).
  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
  // Returns true if the group has not violated any API constraints.
  __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
  // Synchronizes the threads in the group.
  __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
};
// User-exposed API interface to construct a multi-grid cooperative group
// object - multi_grid_group.
__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
  return multi_grid_group(internal::multi_grid::size());
}
// The grid cooperative group type.
class grid_group : public thread_group {
  friend __CG_QUALIFIER__ grid_group this_grid();

 protected:
  // Construct a grid thread group (through the API this_grid()).
  explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}

 public:
  // Rank of the calling thread within [0, size()).
  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
  // Returns true if the group has not violated any API constraints.
  __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
  // Synchronizes the threads in the group.
  __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
  // Returns the group dimensions.
  __CG_QUALIFIER__ dim3 group_dim() const { return internal::workgroup::block_dim(); }
};

// User-exposed API interface to construct a grid cooperative group object -
// grid_group.
__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
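// Usage sketch (illustrative, not part of this header): grid.sync() is only
// valid for kernels launched cooperatively, e.g. with
// hipLaunchCooperativeKernel; `ping_pong` and `buf` are hypothetical names.
//
//   #include <hip/hip_cooperative_groups.h>
//   namespace cg = cooperative_groups;
//
//   __global__ void ping_pong(float* buf) {
//     cg::grid_group grid = cg::this_grid();
//     // Phase 1: every block writes its slice of buf ...
//     grid.sync();  // all blocks in the grid reach this point
//     // Phase 2: blocks may now read slices written by other blocks ...
//   }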
// The workgroup (thread-block in CUDA terminology) cooperative group type.
class thread_block : public thread_group {
  friend __CG_QUALIFIER__ thread_block this_thread_block();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
                                                       unsigned int tile_size);

 protected:
  // Construct a workgroup thread group (through the API this_thread_block()).
  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
      : thread_group(internal::cg_workgroup, size) {}

  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
    // Invalid tile size, assert
    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    auto block_size = size();
    auto rank = thread_rank();
    auto partitions = (block_size + tile_size - 1) / tile_size;
    auto tail = (partitions * tile_size) - block_size;
    // The last partition absorbs the tail, so it may be smaller than tile_size.
    auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
    thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);

    tiledGroup.coalesced_info.tiled_info.size = tile_size;
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
    tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
    return tiledGroup;
  }

 public:
  // Returns 3-dimensional block index within the grid.
  __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
  // Returns 3-dimensional thread index within the block.
  __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
  // Rank of the calling thread within [0, size()).
  __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
  // Total number of threads in the group.
  __CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); }
  // Returns true if the group has not violated any API constraints.
  __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
  // Synchronizes the threads in the group.
  __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
  // Returns the group dimensions.
  __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
};

// User-exposed API interface to construct a workgroup cooperative group
// object - thread_block.
__CG_QUALIFIER__ thread_block this_thread_block() {
  return thread_block(internal::workgroup::size());
}
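// Usage sketch (illustrative, not part of this header): block.sync() behaves
// like __syncthreads(); `reverse_in_block` is a hypothetical name and assumes
// a one-dimensional launch with exactly 256 threads per block.
//
//   #include <hip/hip_cooperative_groups.h>
//   namespace cg = cooperative_groups;
//
//   __global__ void reverse_in_block(const int* in, int* out) {
//     __shared__ int buf[256];
//     cg::thread_block block = cg::this_thread_block();
//     unsigned int r = block.thread_rank();
//     buf[r] = in[blockIdx.x * 256 + r];
//     block.sync();  // all writes to buf are now visible block-wide
//     out[blockIdx.x * 256 + r] = buf[255 - r];
//   }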
// The tiled_group cooperative group type.
class tiled_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
                                                      unsigned int tile_size);

  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    if (size() <= tile_size) {
      return *this;
    }

    tiled_group tiledGroup = tiled_group(tile_size);
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    return tiledGroup;
  }

 protected:
  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
      : thread_group(internal::cg_tiled_group, tileSize) {
    coalesced_info.tiled_info.size = tileSize;
    coalesced_info.tiled_info.is_tiled = true;
  }

 public:
  // Total number of threads in the tile.
  __CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); }

  // Rank of the calling thread within [0, size()).
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1));
  }

  // Synchronizes the threads in the group.
  __CG_QUALIFIER__ void sync() const {
    internal::tiled_group::sync();
  }
};

template <unsigned int size, class ParentCGTy> class thread_block_tile;
// The coalesced_group cooperative group type.
class coalesced_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
  template <unsigned int fsize, class fparent>
  friend __CG_QUALIFIER__ coalesced_group
  binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);

  __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || !pow2) {
      return coalesced_group(0);
    }

    // If a tiled group is passed to be partitioned further into a coalesced_group,
    // prepare a mask for further partitioning it so that it stays coalesced.
    if (coalesced_info.tiled_info.is_tiled) {
      unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
      unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
      lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);

      member_mask <<= (__lane_id() & ~(tile_size - 1));
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size;
      return coalesced_tile;
    }
    // Here the parent coalesced_group is not partitioned.
    else {
      lane_mask member_mask = 0;
      unsigned int tile_rank = 0;
      int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;

      for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
        lane_mask active = coalesced_info.member_mask & (static_cast<lane_mask>(1) << i);
        // Make sure the lane is active
        if (active) {
          if (lanes_to_skip <= 0 && tile_rank < tile_size) {
            // Prepare a member_mask that is appropriate for a tile
            member_mask |= active;
            tile_rank++;
          }
          lanes_to_skip--;
        }
      }
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size =
          (size() + tile_size - 1) / tile_size;
      return coalesced_tile;
    }
    return coalesced_group(0);
  }

 protected:
  // Constructor
  explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
      : thread_group(internal::cg_coalesced_group) {
    coalesced_info.member_mask = member_mask;  // Which threads are active
    coalesced_info.size = __popcll(coalesced_info.member_mask);  // How many threads are active
    coalesced_info.tiled_info.is_tiled = false;  // Not a partitioned group
    coalesced_info.tiled_info.meta_group_rank = 0;
    coalesced_info.tiled_info.meta_group_size = 1;
  }

 public:
  // Total number of threads in the group.
  __CG_QUALIFIER__ unsigned int size() const {
    return coalesced_info.size;
  }

  // Rank of the calling thread within [0, size()).
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
  }

  // Synchronizes the threads in the group.
  __CG_QUALIFIER__ void sync() const {
    internal::coalesced_group::sync();
  }

  // Rank of this group within the set of groups created when the parent was
  // partitioned.
  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
    return coalesced_info.tiled_info.meta_group_rank;
  }

  // Returns the number of groups created when the parent group was partitioned.
  __CG_QUALIFIER__ unsigned int meta_group_size() const {
    return coalesced_info.tiled_info.meta_group_size;
  }

  // Shuffle operation on group level.
  template <class T>
  __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Not an integer or floating-point type.");

    srcRank = srcRank % static_cast<int>(size());

    int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
        : (__AMDGCN_WAVEFRONT_SIZE == 64)
            ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
            : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }

  // Shuffle down operation on group level.
  template <class T>
  __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Not an integer or floating-point type.");

    // Note: The CUDA implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
      return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
    }

    int lane;
    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }
    else {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }

    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }

  // Shuffle up operation on group level.
  template <class T>
  __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Not an integer or floating-point type.");

    // Note: The CUDA implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
      return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
    }

    int lane;
    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    }
    else if (__AMDGCN_WAVEFRONT_SIZE == 32) {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    }

    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  // Ballot function on group level.
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __ballot_sync<unsigned long long>(coalesced_info.member_mask, pred));
  }

  // Any function on group level.
  __CG_QUALIFIER__ int any(int pred) const {
    return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  // All function on group level.
  __CG_QUALIFIER__ int all(int pred) const {
    return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  // Match-any function on group level.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
  }

  // Match-all function on group level.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
                         &pred));
  }
#endif
};

// User-exposed API to create coalesced groups.
__CG_QUALIFIER__ coalesced_group coalesced_threads() {
  return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
}
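// Usage sketch (illustrative, not part of this header): inside a divergent
// branch, coalesced_threads() yields just the active lanes, so lane 0 of that
// set can act as the leader; `count_positives` is a hypothetical name.
//
//   #include <hip/hip_cooperative_groups.h>
//   namespace cg = cooperative_groups;
//
//   __global__ void count_positives(const int* in, unsigned int* counter) {
//     if (in[threadIdx.x] > 0) {                  // divergent branch
//       cg::coalesced_group active = cg::coalesced_threads();
//       if (active.thread_rank() == 0) {
//         atomicAdd(counter, active.size());      // one atomic per active set
//       }
//     }
//   }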

#ifndef DOXYGEN_SHOULD_SKIP_THIS

// Implementation of thread_group::thread_rank(), dispatching on the group type.
__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->thread_rank());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->thread_rank());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->thread_rank());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->thread_rank());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->thread_rank());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return -1;
    }
  }
}

// Implementation of thread_group::is_valid(), dispatching on the group type.
__CG_QUALIFIER__ bool thread_group::is_valid() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->is_valid());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->is_valid());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->is_valid());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->is_valid());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->is_valid());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return false;
    }
  }
}

// Implementation of thread_group::sync(), dispatching on the group type.
__CG_QUALIFIER__ void thread_group::sync() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      static_cast<const multi_grid_group*>(this)->sync();
      break;
    }
    case internal::cg_grid: {
      static_cast<const grid_group*>(this)->sync();
      break;
    }
    case internal::cg_workgroup: {
      static_cast<const thread_block*>(this)->sync();
      break;
    }
    case internal::cg_tiled_group: {
      static_cast<const tiled_group*>(this)->sync();
      break;
    }
    case internal::cg_coalesced_group: {
      static_cast<const coalesced_group*>(this)->sync();
      break;
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
    }
  }
}

#endif

// Returns the size of the group.
template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }

// Returns the rank of the calling thread within the group.
template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
  return g.thread_rank();
}

// Returns true if the group has not violated any API constraints.
template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }

// Synchronizes the threads in the group.
template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
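// Usage sketch (illustrative, not part of this header): the free functions
// mirror the member functions, which keeps generic device code readable;
// `barrier_and_report` is a hypothetical name.
//
//   namespace cg = cooperative_groups;
//
//   template <class Group>
//   __device__ void barrier_and_report(const Group& g, unsigned int* out) {
//     cg::sync(g);                     // same as g.sync()
//     if (cg::thread_rank(g) == 0) {
//       *out = cg::group_size(g);      // same as g.size()
//     }
//   }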

template <unsigned int tileSize> class tile_base {
 protected:
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

 public:
  // Rank of the thread within this tile.
  _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
    return (internal::workgroup::thread_rank() & (numThreads - 1));
  }

  // Number of threads within this tile.
  __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
};

template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
  static_assert(is_valid_tile_size<size>::value,
                "Tile size is either not a power of 2 or greater than the wavefront size");
  using tile_base<size>::numThreads;

  template <unsigned int fsize, class fparent>
  friend __CG_QUALIFIER__ coalesced_group
  binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);

#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  __CG_QUALIFIER__ unsigned long long build_mask() const {
    unsigned long long mask = ~0ull >> (64 - numThreads);
    // Shift the tile's contiguous mask to the position of this thread's tile
    // within the wavefront.
    return mask << ((internal::workgroup::thread_rank() / numThreads) * numThreads);
  }
#endif

 public:
  __CG_STATIC_QUALIFIER__ void sync() {
    internal::tiled_group::sync();
  }

  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Not an integer or floating-point type.");
    return (__shfl(var, srcRank, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Not an integer or floating-point type.");
    return (__shfl_down(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Not an integer or floating-point type.");
    return (__shfl_up(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
    static_assert(is_valid_type<T>::value, "Not an integer or floating-point type.");
    return (__shfl_xor(var, laneMask, numThreads));
  }

#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
  }

  __CG_QUALIFIER__ int any(int pred) const { return __any_sync(build_mask(), pred); }

  __CG_QUALIFIER__ int all(int pred) const { return __all_sync(build_mask(), pred); }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_any_sync(mask, value));
  }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
  }
#endif
};

// User-exposed API that captures the state of the parent group pre-partition.
template <unsigned int tileSize, typename ParentCGTy>
class parent_group_info {
 public:
  // Rank of this tile within the set of tiles created by the partition.
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
    return ParentCGTy::thread_rank() / tileSize;
  }

  // Returns the number of groups created when the parent group was partitioned.
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
    return (ParentCGTy::size() + tileSize - 1) / tileSize;
  }
};

// Group type - thread_block_tile.
template <unsigned int tileSize, class ParentCGTy>
class thread_block_tile_type : public thread_block_tile_base<tileSize>,
                               public tiled_group,
                               public parent_group_info<tileSize, ParentCGTy> {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
    coalesced_info.tiled_info.size = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
  }

 public:
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;
};

// Partial template specialization for a tile whose parent group type is erased.
template <unsigned int tileSize>
class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
                                               public tiled_group {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
  __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank,
                                          unsigned int meta_group_size)
      : tiled_group(numThreads) {
    coalesced_info.tiled_info.size = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
    coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
    coalesced_info.tiled_info.meta_group_size = meta_group_size;
  }

 public:
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;

  // Rank of this tile within the set of tiles created by the partition.
  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
    return coalesced_info.tiled_info.meta_group_rank;
  }

  // Returns the number of groups created when the parent group was partitioned.
  __CG_QUALIFIER__ unsigned int meta_group_size() const {
    return coalesced_info.tiled_info.meta_group_size;
  }
};

// User-exposed API: returns a group of size 1 representing the calling thread.
__CG_QUALIFIER__ thread_group this_thread() {
  thread_group g(internal::group_type::cg_coalesced_group, 1, __ockl_activelane_u32());
  return g;
}

// User-exposed API to partition groups.
__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
  if (parent.cg_type() == internal::cg_tiled_group) {
    const tiled_group* cg = static_cast<const tiled_group*>(&parent);
    return cg->new_tiled_group(tile_size);
  }
  else if (parent.cg_type() == internal::cg_coalesced_group) {
    const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
    return cg->new_tiled_group(tile_size);
  }
  else {
    const thread_block* tb = static_cast<const thread_block*>(&parent);
    return tb->new_tiled_group(tile_size);
  }
}

// Thread block type overload
__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

// If a coalesced group is passed to be partitioned, it should remain coalesced
__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}
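// Usage sketch (illustrative, not part of this header): run-time partitioning
// of a thread block; the tile size must be a power of two no larger than the
// wavefront size. `tile_barrier_demo` is a hypothetical name.
//
//   #include <hip/hip_cooperative_groups.h>
//   namespace cg = cooperative_groups;
//
//   __global__ void tile_barrier_demo() {
//     cg::thread_block block = cg::this_thread_block();
//     cg::thread_group tile = cg::tiled_partition(block, 16);
//     // ... work private to the tile ...
//     tile.sync();  // synchronizes only the 16 threads of this tile
//   }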

namespace impl {
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;

template <unsigned int size, class ParentCGTy>
class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
 protected:
  template <unsigned int tbtSize, class tbtParentT>
  __CG_QUALIFIER__ thread_block_tile_internal(
      const thread_block_tile_internal<tbtSize, tbtParentT>& g)
      : thread_block_tile_type<size, ParentCGTy>(g.meta_group_rank(), g.meta_group_size()) {}

  __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
      : thread_block_tile_type<size, ParentCGTy>() {}
};
}  // namespace impl

// Group type - thread_block_tile.
template <unsigned int size, class ParentCGTy>
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
 protected:
  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}

 public:
  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
    return thread_block_tile<size, void>(*this);
  }

#ifdef DOXYGEN_SHOULD_INCLUDE_THIS
  // Rank of the calling thread within [0, size()).
  __CG_QUALIFIER__ unsigned int thread_rank() const;

  // Synchronizes the threads in the group.
  __CG_QUALIFIER__ void sync();

  // Rank of this tile within the set of tiles created by the partition.
  __CG_QUALIFIER__ unsigned int meta_group_rank() const;

  // Returns the number of groups created when the parent group was partitioned.
  __CG_QUALIFIER__ unsigned int meta_group_size() const;

  // Shuffle operation on group level.
  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const;

  // Shuffle down operation on group level.
  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const;

  // Shuffle up operation on group level.
  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const;

  // Shuffle xor operation on group level.
  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const;

  // Ballot function on group level.
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const;

  // Any function on group level.
  __CG_QUALIFIER__ int any(int pred) const;

  // All function on group level.
  __CG_QUALIFIER__ int all(int pred) const;

  // Match-any function on group level.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const;

  // Match-all function on group level.
  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const;
#endif
};

template <unsigned int size>
class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
  template <unsigned int, class ParentCGTy> friend class thread_block_tile;

 protected:
 public:
  template <class ParentCGTy>
  __CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
      : impl::thread_block_tile_internal<size, void>(g) {}
};

template <unsigned int size, class ParentCGTy = void> class thread_block_tile;

namespace impl {
template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;

template <unsigned int size>
struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
  __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
      : thread_block_tile<size, thread_block>(g) {}
};

}  // namespace impl

// User-exposed API to partition groups with a compile-time tile size.
template <unsigned int size, class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
  static_assert(is_valid_tile_size<size>::value,
                "Tiled partition with size > wavefront size is currently not supported");
  return impl::tiled_partition_internal<size, ParentCGTy>(g);
}
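// Usage sketch (illustrative, not part of this header): a compile-time tile
// exposes the warp-level shuffles, here used for a tile-wide sum;
// `tile_reduce` is a hypothetical name and assumes a one-dimensional block
// size that is a multiple of 16.
//
//   #include <hip/hip_cooperative_groups.h>
//   namespace cg = cooperative_groups;
//
//   __global__ void tile_reduce(const int* in, int* out) {
//     cg::thread_block block = cg::this_thread_block();
//     auto tile = cg::tiled_partition<16>(block);  // thread_block_tile<16, ...>
//     int v = in[blockIdx.x * blockDim.x + block.thread_rank()];
//     for (unsigned int d = tile.size() / 2; d > 0; d /= 2) {
//       v += tile.shfl_down(v, d);                 // tree reduction in registers
//     }
//     if (tile.thread_rank() == 0) {
//       out[blockIdx.x * (blockDim.x / 16) + tile.meta_group_rank()] = v;
//     }
//   }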

#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS

// Binary partition: splits a coalesced group into two coalesced groups by the
// value of pred; each caller receives the group for its side of the split.
__CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred) {
  auto mask = __ballot_sync<unsigned long long>(cgrp.coalesced_info.member_mask, pred);

  if (pred) {
    return coalesced_group(mask);
  } else {
    return coalesced_group(cgrp.coalesced_info.member_mask ^ mask);
  }
}
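// Usage sketch (illustrative, not part of this header; requires
// HIP_ENABLE_WARP_SYNC_BUILTINS): each thread lands in the sub-group matching
// its predicate; `split_even_odd` is a hypothetical name.
//
//   #include <hip/hip_cooperative_groups.h>
//   namespace cg = cooperative_groups;
//
//   __global__ void split_even_odd(const int* in, int* out) {
//     cg::coalesced_group active = cg::coalesced_threads();
//     bool is_even = (in[threadIdx.x] % 2) == 0;
//     cg::coalesced_group side = cg::binary_partition(active, is_even);
//     // Threads with is_even == true share one group; the rest share another.
//     out[threadIdx.x] = side.thread_rank();
//   }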

// Binary partition of a thread_block_tile, likewise split by pred.
template <unsigned int size, class parent>
__CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile<size, parent>& tgrp,
                                                  bool pred) {
  auto mask = __ballot_sync<unsigned long long>(tgrp.build_mask(), pred);

  if (pred) {
    return coalesced_group(mask);
  } else {
    return coalesced_group(tgrp.build_mask() ^ mask);
  }
}

#endif
}  // namespace cooperative_groups

#endif  // __cplusplus
#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H