/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.0.2/hipcub/include/hipcub/backend/rocprim/block/block_radix_rank.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.0.2/hipcub/include/hipcub/backend/rocprim/block/block_radix_rank.hpp Source File#

hipCUB: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.0.2/hipcub/include/hipcub/backend/rocprim/block/block_radix_rank.hpp Source File
block_radix_rank.hpp
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  * Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
35  #ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_RANK_HPP_
36  #define HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_RANK_HPP_
37 
38 #include <stdint.h>
39 
40 #include "../../../config.hpp"
41 #include "../../../util_type.hpp"
42 #include "../../../util_ptx.hpp"
43 
44 #include "../thread/thread_reduce.hpp"
45 #include "../block/block_scan.hpp"
46 #include "../block/radix_rank_sort_operations.hpp"
47 
48 BEGIN_HIPCUB_NAMESPACE
49 
50 
51 
86 template <
87  int BLOCK_DIM_X,
88  int RADIX_BITS,
89  bool IS_DESCENDING,
90  bool MEMOIZE_OUTER_SCAN = false,
91  BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
92  hipSharedMemConfig SMEM_CONFIG = hipSharedMemBankSizeFourByte,
93  int BLOCK_DIM_Y = 1,
94  int BLOCK_DIM_Z = 1,
95  int ARCH = HIPCUB_ARCH /* ignored */>
97 {
98 private:
99 
100  /******************************************************************************
101  * Type definitions and constants
102  ******************************************************************************/
103 
104  // Integer type for digit counters (to be packed into words of type PackedCounters)
105  typedef unsigned short DigitCounter;
106 
107  // Integer type for packing DigitCounters into columns of shared memory banks
108  typedef typename If<(SMEM_CONFIG == hipSharedMemBankSizeEightByte),
109  unsigned long long,
110  unsigned int>::Type PackedCounter;
111 
112  enum
113  {
114  // The thread block size in threads
115  BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
116 
117  RADIX_DIGITS = 1 << RADIX_BITS,
118 
119  LOG_WARP_THREADS = Log2<ARCH>::VALUE,
120  WARP_THREADS = 1 << LOG_WARP_THREADS,
121  WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
122 
123  BYTES_PER_COUNTER = sizeof(DigitCounter),
124  LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE,
125 
126  PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
127  LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
128 
129  LOG_COUNTER_LANES = rocprim::maximum<int>()((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0), // Always at least one lane
130  COUNTER_LANES = 1 << LOG_COUNTER_LANES,
131 
132  // The number of packed counters per thread (plus one for padding)
133  PADDED_COUNTER_LANES = COUNTER_LANES + 1,
134  RAKING_SEGMENT = PADDED_COUNTER_LANES,
135  };
136 
137 public:
138 
139  enum
140  {
142  BINS_TRACKED_PER_THREAD = rocprim::maximum<int>()(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
143  };
144 
145 private:
146 
147 
149  typedef BlockScan<
150  PackedCounter,
151  BLOCK_DIM_X,
152  INNER_SCAN_ALGORITHM,
153  BLOCK_DIM_Y,
154  BLOCK_DIM_Z,
155  ARCH>
156  BlockScan;
157 
158 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
159 
161  struct __align__(16) _TempStorage
162  {
163  union Aliasable
164  {
165  DigitCounter digit_counters[PADDED_COUNTER_LANES * BLOCK_THREADS * PACKING_RATIO];
166  PackedCounter raking_grid[BLOCK_THREADS * RAKING_SEGMENT];
167 
168  } aliasable;
169 
170  // Storage for scanning local ranks
171  typename BlockScan::TempStorage block_scan;
172  };
173 
174 #endif
175 
176  /******************************************************************************
177  * Thread fields
178  ******************************************************************************/
179 
181  _TempStorage &temp_storage;
182 
184  unsigned int linear_tid;
185 
187  PackedCounter cached_segment[RAKING_SEGMENT];
188 
189 
190  /******************************************************************************
191  * Utility methods
192  ******************************************************************************/
193 
197  HIPCUB_DEVICE inline _TempStorage& PrivateStorage()
198  {
199  __shared__ _TempStorage private_storage;
200  return private_storage;
201  }
202 
203 
207  HIPCUB_DEVICE inline PackedCounter Upsweep()
208  {
209  PackedCounter *smem_raking_ptr = &temp_storage.aliasable.raking_grid[linear_tid * RAKING_SEGMENT];
210  PackedCounter *raking_ptr;
211 
212  if (MEMOIZE_OUTER_SCAN)
213  {
214  // Copy data into registers
215  #pragma unroll
216  for (int i = 0; i < RAKING_SEGMENT; i++)
217  {
218  cached_segment[i] = smem_raking_ptr[i];
219  }
220  raking_ptr = cached_segment;
221  }
222  else
223  {
224  raking_ptr = smem_raking_ptr;
225  }
226 
227  return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
228  }
229 
230 
232  HIPCUB_DEVICE inline void ExclusiveDownsweep(
233  PackedCounter raking_partial)
234  {
235  PackedCounter *smem_raking_ptr = &temp_storage.aliasable.raking_grid[linear_tid * RAKING_SEGMENT];
236 
237  PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ?
238  cached_segment :
239  smem_raking_ptr;
240 
241  // Exclusive raking downsweep scan
242  internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
243 
244  if (MEMOIZE_OUTER_SCAN)
245  {
246  // Copy data back to smem
247  #pragma unroll
248  for (int i = 0; i < RAKING_SEGMENT; i++)
249  {
250  smem_raking_ptr[i] = cached_segment[i];
251  }
252  }
253  }
254 
255 
259  HIPCUB_DEVICE inline void ResetCounters()
260  {
261  // Reset shared memory digit counters
262  #pragma unroll
263  for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++)
264  {
265  #pragma unroll
266  for (int SUB_COUNTER = 0; SUB_COUNTER < PACKING_RATIO; SUB_COUNTER++)
267  {
268  temp_storage.aliasable.digit_counters[(LANE * BLOCK_THREADS + linear_tid) * PACKING_RATIO + SUB_COUNTER] = 0;
269  }
270  }
271  }
272 
273 
277  struct PrefixCallBack
278  {
279  HIPCUB_DEVICE inline PackedCounter operator()(PackedCounter block_aggregate)
280  {
281  PackedCounter block_prefix = 0;
282 
283  // Propagate totals in packed fields
284  #pragma unroll
285  for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
286  {
287  block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
288  }
289 
290  return block_prefix;
291  }
292  };
293 
294 
298  HIPCUB_DEVICE inline void ScanCounters()
299  {
300  // Upsweep scan
301  PackedCounter raking_partial = Upsweep();
302 
303  // Compute exclusive sum
304  PackedCounter exclusive_partial;
305  PrefixCallBack prefix_call_back;
306  BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back);
307 
308  // Downsweep scan with exclusive partial
309  ExclusiveDownsweep(exclusive_partial);
310  }
311 
312 public:
313 
315  struct TempStorage : Uninitialized<_TempStorage> {};
316 
317 
318  /******************************************************************/
322 
326  HIPCUB_DEVICE inline BlockRadixRank()
327  :
328  temp_storage(PrivateStorage()),
329  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
330  {}
331 
332 
336  HIPCUB_DEVICE inline BlockRadixRank(
337  TempStorage &temp_storage)
338  :
339  temp_storage(temp_storage.Alias()),
340  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
341  {}
342 
343 
345  /******************************************************************/
349 
353  template <
354  typename UnsignedBits,
355  int KEYS_PER_THREAD,
356  typename DigitExtractorT>
357  HIPCUB_DEVICE inline void RankKeys(
358  UnsignedBits (&keys)[KEYS_PER_THREAD],
359  int (&ranks)[KEYS_PER_THREAD],
360  DigitExtractorT digit_extractor)
361  {
362  DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit
363  DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem
364 
365  // Reset shared memory digit counters
366  ResetCounters();
367 
368  #pragma unroll
369  for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
370  {
371  // Get digit
372  unsigned int digit = digit_extractor.Digit(keys[ITEM]);
373 
374  // Get sub-counter
375  unsigned int sub_counter = digit >> LOG_COUNTER_LANES;
376 
377  // Get counter lane
378  unsigned int counter_lane = digit & (COUNTER_LANES - 1);
379 
380  if (IS_DESCENDING)
381  {
382  sub_counter = PACKING_RATIO - 1 - sub_counter;
383  counter_lane = COUNTER_LANES - 1 - counter_lane;
384  }
385 
386  // Pointer to smem digit counter
387  digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane * BLOCK_THREADS * PACKING_RATIO + linear_tid * PACKING_RATIO + sub_counter];
388 
389  // Load thread-exclusive prefix
390  thread_prefixes[ITEM] = *digit_counters[ITEM];
391 
392  // Store inclusive prefix
393  *digit_counters[ITEM] = thread_prefixes[ITEM] + 1;
394  }
395 
396  ::rocprim::syncthreads();
397 
398  // Scan shared memory counters
399  ScanCounters();
400 
401  ::rocprim::syncthreads();
402 
403  // Extract the local ranks of each key
404  #pragma unroll
405  for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
406  {
407  // Add in thread block exclusive prefix
408  ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
409  }
410  }
411 
412 
416  template <
417  typename UnsignedBits,
418  int KEYS_PER_THREAD,
419  typename DigitExtractorT>
420  HIPCUB_DEVICE inline void RankKeys(
421  UnsignedBits (&keys)[KEYS_PER_THREAD],
422  int (&ranks)[KEYS_PER_THREAD],
423  DigitExtractorT digit_extractor,
424  int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])
425  {
426  // Rank keys
427  RankKeys(keys, ranks, digit_extractor);
428 
429  // Get the inclusive and exclusive digit totals corresponding to the calling thread.
430  #pragma unroll
431  for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
432  {
433  int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
434 
435  if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
436  {
437  if (IS_DESCENDING)
438  bin_idx = RADIX_DIGITS - bin_idx - 1;
439 
440  // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the
441  // first counter column, resulting in unavoidable bank conflicts.)
442  unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1));
443  unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES);
444 
445  exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counter[counter_lane * BLOCK_THREADS * PACKING_RATIO + sub_counter];
446  }
447  }
448  }
449 };
450 
451 
452 
453 
454 
458 template <
459  int BLOCK_DIM_X,
460  int RADIX_BITS,
461  bool IS_DESCENDING,
462  BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
463  int BLOCK_DIM_Y = 1,
464  int BLOCK_DIM_Z = 1,
465  int ARCH = HIPCUB_ARCH>
467 {
468 private:
469 
470  /******************************************************************************
471  * Type definitions and constants
472  ******************************************************************************/
473 
474  typedef int32_t RankT;
475  typedef int32_t DigitCounterT;
476 
477  enum
478  {
479  // The thread block size in threads
480  BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
481 
482  RADIX_DIGITS = 1 << RADIX_BITS,
483 
484  LOG_WARP_THREADS = Log2<ARCH>::VALUE,
485  WARP_THREADS = 1 << LOG_WARP_THREADS,
486  WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
487 
488  PADDED_WARPS = ((WARPS & 0x1) == 0) ?
489  WARPS + 1 :
490  WARPS,
491 
492  COUNTERS = PADDED_WARPS * RADIX_DIGITS,
493  RAKING_SEGMENT = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS,
494  PADDED_RAKING_SEGMENT = ((RAKING_SEGMENT & 0x1) == 0) ?
495  RAKING_SEGMENT + 1 :
496  RAKING_SEGMENT,
497  };
498 
499 public:
500 
501  enum
502  {
504  BINS_TRACKED_PER_THREAD = rocprim::maximum<int>()(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
505  };
506 
507 private:
508 
510  typedef BlockScan<
511  DigitCounterT,
512  BLOCK_THREADS,
513  INNER_SCAN_ALGORITHM,
514  BLOCK_DIM_Y,
515  BLOCK_DIM_Z,
516  ARCH>
517  BlockScanT;
518 
519 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
521  struct __align__(16) _TempStorage
522  {
523  typename BlockScanT::TempStorage block_scan;
524 
525  union __align__(16) Aliasable
526  {
527  volatile DigitCounterT warp_digit_counters[RADIX_DIGITS * PADDED_WARPS];
528  DigitCounterT raking_grid[BLOCK_THREADS * PADDED_RAKING_SEGMENT];
529 
530  } aliasable;
531  };
532 #endif
533 
534  /******************************************************************************
535  * Thread fields
536  ******************************************************************************/
537 
539  _TempStorage &temp_storage;
540 
542  unsigned int linear_tid;
543 
544 
545 
546 public:
547 
549  struct TempStorage : Uninitialized<_TempStorage> {};
550 
551 
552  /******************************************************************/
556 
557 
561  HIPCUB_DEVICE inline BlockRadixRankMatch(
562  TempStorage &temp_storage)
563  :
564  temp_storage(temp_storage.Alias()),
565  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
566  {}
567 
568 
570  /******************************************************************/
574 
578  template <
579  typename UnsignedBits,
580  int KEYS_PER_THREAD,
581  typename DigitExtractorT>
582  __device__ __forceinline__ void RankKeys(
583  UnsignedBits (&keys)[KEYS_PER_THREAD],
584  int (&ranks)[KEYS_PER_THREAD],
585  DigitExtractorT digit_extractor)
586  {
587  // Initialize shared digit counters
588 
589  #pragma unroll
590  for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
591  temp_storage.aliasable.raking_grid[linear_tid * PADDED_RAKING_SEGMENT + ITEM] = 0;
592 
593  ::rocprim::syncthreads();
594 
595  // Each warp will strip-mine its section of input, one strip at a time
596 
597  volatile DigitCounterT *digit_counters[KEYS_PER_THREAD];
598  uint32_t warp_id = linear_tid >> LOG_WARP_THREADS;
599  uint32_t lane_mask_lt = LaneMaskLt();
600 
601  #pragma unroll
602  for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
603  {
604  // My digit
605  uint32_t digit = digit_extractor.Digit(keys[ITEM]);
606 
607  if (IS_DESCENDING)
608  digit = RADIX_DIGITS - digit - 1;
609 
610  // Mask of peers who have same digit as me
611  uint32_t peer_mask = rocprim::MatchAny<RADIX_BITS>(digit);
612 
613  // Pointer to smem digit counter for this key
614  digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit * PADDED_WARPS + warp_id];
615 
616  // Number of occurrences in previous strips
617  DigitCounterT warp_digit_prefix = *digit_counters[ITEM];
618 
619  // Warp-sync
620  WARP_SYNC(0xFFFFFFFF);
621 
622  // Number of peers having same digit as me
623  int32_t digit_count = __popc(peer_mask);
624 
625  // Number of lower-ranked peers having same digit seen so far
626  int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);
627 
628  if (peer_digit_prefix == 0)
629  {
630  // First thread for each digit updates the shared warp counter
631  *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count);
632  }
633 
634  // Warp-sync
635  WARP_SYNC(0xFFFFFFFF);
636 
637  // Number of prior keys having same digit
638  ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
639  }
640 
641  ::rocprim::syncthreads();
642 
643  // Scan warp counters
644 
645  DigitCounterT scan_counters[PADDED_RAKING_SEGMENT];
646 
647  #pragma unroll
648  for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
649  scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid * PADDED_RAKING_SEGMENT + ITEM];
650 
651  BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters);
652 
653  #pragma unroll
654  for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
655  temp_storage.aliasable.raking_grid[linear_tid * PADDED_RAKING_SEGMENT + ITEM] = scan_counters[ITEM];
656 
657  ::rocprim::syncthreads();
658 
659  // Seed ranks with counter values from previous warps
660  #pragma unroll
661  for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
662  ranks[ITEM] += *digit_counters[ITEM];
663  }
664 
665 
669  template <
670  typename UnsignedBits,
671  int KEYS_PER_THREAD,
672  typename DigitExtractorT>
673  __device__ __forceinline__ void RankKeys(
674  UnsignedBits (&keys)[KEYS_PER_THREAD],
675  int (&ranks)[KEYS_PER_THREAD],
676  DigitExtractorT digit_extractor,
677  int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])
678  {
679  RankKeys(keys, ranks, digit_extractor);
680 
681  // Get exclusive count for each digit
682  #pragma unroll
683  for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
684  {
685  int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
686 
687  if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
688  {
689  if (IS_DESCENDING)
690  bin_idx = RADIX_DIGITS - bin_idx - 1;
691 
692  exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx * PADDED_WARPS];
693  }
694  }
695  }
696 };
697 
698 
699 
700 END_HIPCUB_NAMESPACE
701 
702 #endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_RANK_HPP_
Definition: block_radix_rank.hpp:467
__device__ BlockRadixRankMatch(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
Definition: block_radix_rank.hpp:561
__device__ __forceinline__ void RankKeys(UnsignedBits(&keys)[KEYS_PER_THREAD], int(&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor, int(&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])
Rank keys. For the lower RADIX_DIGITS threads, digit counts for each digit are provided for the corre...
Definition: block_radix_rank.hpp:673
__device__ __forceinline__ void RankKeys(UnsignedBits(&keys)[KEYS_PER_THREAD], int(&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor)
Rank keys.
Definition: block_radix_rank.hpp:582
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
Definition: block_radix_rank.hpp:97
__device__ BlockRadixRank(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
Definition: block_radix_rank.hpp:336
__device__ BlockRadixRank()
Collective constructor using a private static allocation of shared memory as temporary storage.
Definition: block_radix_rank.hpp:326
__device__ void RankKeys(UnsignedBits(&keys)[KEYS_PER_THREAD], int(&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor, int(&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])
Rank keys. For the lower RADIX_DIGITS threads, digit counts for each digit are provided for the corre...
Definition: block_radix_rank.hpp:420
__device__ void RankKeys(UnsignedBits(&keys)[KEYS_PER_THREAD], int(&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor)
Rank keys.
Definition: block_radix_rank.hpp:357
Definition: block_scan.hpp:80
\smemstorage{BlockScan}
Definition: block_radix_rank.hpp:549
\smemstorage{BlockScan}
Definition: block_radix_rank.hpp:315
Definition: util_type.hpp:53
Definition: util_type.hpp:100
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition: util_type.hpp:359