/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.5.1/hipcub/include/hipcub/backend/rocprim/block/block_radix_rank.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.5.1/hipcub/include/hipcub/backend/rocprim/block/block_radix_rank.hpp Source File#

hipCUB: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.5.1/hipcub/include/hipcub/backend/rocprim/block/block_radix_rank.hpp Source File
block_radix_rank.hpp
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  * Modifications Copyright (c) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
35  #ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_RANK_HPP_
36  #define HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_RANK_HPP_
37 
38 #include <stdint.h>
39 
40 #include "../../../config.hpp"
41 #include "../../../util_type.hpp"
42 #include "../../../util_ptx.hpp"
43 
44 #include "../block/block_scan.hpp"
45 #include "../block/radix_rank_sort_operations.hpp"
46 #include "../thread/thread_reduce.hpp"
47 #include "../thread/thread_scan.hpp"
48 
49 BEGIN_HIPCUB_NAMESPACE
50 
51 
52 
87 template <
88  int BLOCK_DIM_X,
89  int RADIX_BITS,
90  bool IS_DESCENDING,
91  bool MEMOIZE_OUTER_SCAN = false,
92  BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
93  hipSharedMemConfig SMEM_CONFIG = hipSharedMemBankSizeFourByte,
94  int BLOCK_DIM_Y = 1,
95  int BLOCK_DIM_Z = 1,
96  int ARCH = HIPCUB_ARCH /* ignored */>
98 {
99 private:
100 
101  /******************************************************************************
102  * Type definitions and constants
103  ******************************************************************************/
104 
105  // Integer type for digit counters (to be packed into words of type PackedCounters)
106  typedef unsigned short DigitCounter;
107 
108  // Integer type for packing DigitCounters into columns of shared memory banks
109  typedef typename std::conditional<(SMEM_CONFIG == hipSharedMemBankSizeEightByte),
110  unsigned long long,
111  unsigned int>::type PackedCounter;
112 
113  enum
114  {
115  // The thread block size in threads
116  BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
117 
118  RADIX_DIGITS = 1 << RADIX_BITS,
119 
120  LOG_WARP_THREADS = Log2<ARCH>::VALUE,
121  WARP_THREADS = 1 << LOG_WARP_THREADS,
122  WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
123 
124  BYTES_PER_COUNTER = sizeof(DigitCounter),
125  LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE,
126 
127  PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
128  LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
129 
130  LOG_COUNTER_LANES = rocprim::maximum<int>()((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0), // Always at least one lane
131  COUNTER_LANES = 1 << LOG_COUNTER_LANES,
132 
133  // The number of packed counters per thread (plus one for padding)
134  PADDED_COUNTER_LANES = COUNTER_LANES + 1,
135  RAKING_SEGMENT = PADDED_COUNTER_LANES,
136  };
137 
138 public:
139 
140  enum
141  {
143  BINS_TRACKED_PER_THREAD = rocprim::maximum<int>()(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
144  };
145 
146 private:
147 
148 
150  typedef BlockScan<
151  PackedCounter,
152  BLOCK_DIM_X,
153  INNER_SCAN_ALGORITHM,
154  BLOCK_DIM_Y,
155  BLOCK_DIM_Z,
156  ARCH>
157  BlockScan;
158 
159 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
160 
162  struct __align__(16) _TempStorage
163  {
164  union Aliasable
165  {
166  DigitCounter digit_counters[PADDED_COUNTER_LANES * BLOCK_THREADS * PACKING_RATIO];
167  PackedCounter raking_grid[BLOCK_THREADS * RAKING_SEGMENT];
168 
169  } aliasable;
170 
171  // Storage for scanning local ranks
172  typename BlockScan::TempStorage block_scan;
173  };
174 
175 #endif
176 
177  /******************************************************************************
178  * Thread fields
179  ******************************************************************************/
180 
182  _TempStorage &temp_storage;
183 
185  unsigned int linear_tid;
186 
188  PackedCounter cached_segment[RAKING_SEGMENT];
189 
190 
191  /******************************************************************************
192  * Utility methods
193  ******************************************************************************/
194 
198  HIPCUB_DEVICE inline _TempStorage& PrivateStorage()
199  {
200  __shared__ _TempStorage private_storage;
201  return private_storage;
202  }
203 
204 
208  HIPCUB_DEVICE inline PackedCounter Upsweep()
209  {
210  PackedCounter *smem_raking_ptr = &temp_storage.aliasable.raking_grid[linear_tid * RAKING_SEGMENT];
211  PackedCounter *raking_ptr;
212 
213  if (MEMOIZE_OUTER_SCAN)
214  {
215  // Copy data into registers
216  #pragma unroll
217  for (int i = 0; i < RAKING_SEGMENT; i++)
218  {
219  cached_segment[i] = smem_raking_ptr[i];
220  }
221  raking_ptr = cached_segment;
222  }
223  else
224  {
225  raking_ptr = smem_raking_ptr;
226  }
227 
228  return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
229  }
230 
231 
233  HIPCUB_DEVICE inline void ExclusiveDownsweep(
234  PackedCounter raking_partial)
235  {
236  PackedCounter *smem_raking_ptr = &temp_storage.aliasable.raking_grid[linear_tid * RAKING_SEGMENT];
237 
238  PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ?
239  cached_segment :
240  smem_raking_ptr;
241 
242  // Exclusive raking downsweep scan
243  internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
244 
245  if (MEMOIZE_OUTER_SCAN)
246  {
247  // Copy data back to smem
248  #pragma unroll
249  for (int i = 0; i < RAKING_SEGMENT; i++)
250  {
251  smem_raking_ptr[i] = cached_segment[i];
252  }
253  }
254  }
255 
256 
260  HIPCUB_DEVICE inline void ResetCounters()
261  {
262  // Reset shared memory digit counters
263  #pragma unroll
264  for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++)
265  {
266  #pragma unroll
267  for (int SUB_COUNTER = 0; SUB_COUNTER < PACKING_RATIO; SUB_COUNTER++)
268  {
269  temp_storage.aliasable.digit_counters[(LANE * BLOCK_THREADS + linear_tid) * PACKING_RATIO + SUB_COUNTER] = 0;
270  }
271  }
272  }
273 
274 
278  struct PrefixCallBack
279  {
280  HIPCUB_DEVICE inline PackedCounter operator()(PackedCounter block_aggregate)
281  {
282  PackedCounter block_prefix = 0;
283 
284  // Propagate totals in packed fields
285  #pragma unroll
286  for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
287  {
288  block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
289  }
290 
291  return block_prefix;
292  }
293  };
294 
295 
299  HIPCUB_DEVICE inline void ScanCounters()
300  {
301  // Upsweep scan
302  PackedCounter raking_partial = Upsweep();
303 
304  // Compute exclusive sum
305  PackedCounter exclusive_partial;
306  PrefixCallBack prefix_call_back;
307  BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back);
308 
309  // Downsweep scan with exclusive partial
310  ExclusiveDownsweep(exclusive_partial);
311  }
312 
313 public:
314 
316  struct TempStorage : Uninitialized<_TempStorage> {};
317 
318 
319  /******************************************************************/
323 
327  HIPCUB_DEVICE inline BlockRadixRank()
328  :
329  temp_storage(PrivateStorage()),
330  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
331  {}
332 
333 
337  HIPCUB_DEVICE inline BlockRadixRank(
338  TempStorage &temp_storage)
339  :
340  temp_storage(temp_storage.Alias()),
341  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
342  {}
343 
344 
346  /******************************************************************/
350 
354  template <
355  typename UnsignedBits,
356  int KEYS_PER_THREAD,
357  typename DigitExtractorT>
358  HIPCUB_DEVICE inline void RankKeys(
359  UnsignedBits (&keys)[KEYS_PER_THREAD],
360  int (&ranks)[KEYS_PER_THREAD],
361  DigitExtractorT digit_extractor)
362  {
363  DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit
364  DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem
365 
366  // Reset shared memory digit counters
367  ResetCounters();
368 
369  #pragma unroll
370  for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
371  {
372  // Get digit
373  unsigned int digit = digit_extractor.Digit(keys[ITEM]);
374 
375  // Get sub-counter
376  unsigned int sub_counter = digit >> LOG_COUNTER_LANES;
377 
378  // Get counter lane
379  unsigned int counter_lane = digit & (COUNTER_LANES - 1);
380 
381  if (IS_DESCENDING)
382  {
383  sub_counter = PACKING_RATIO - 1 - sub_counter;
384  counter_lane = COUNTER_LANES - 1 - counter_lane;
385  }
386 
387  // Pointer to smem digit counter
388  digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane * BLOCK_THREADS * PACKING_RATIO + linear_tid * PACKING_RATIO + sub_counter];
389 
390  // Load thread-exclusive prefix
391  thread_prefixes[ITEM] = *digit_counters[ITEM];
392 
393  // Store inclusive prefix
394  *digit_counters[ITEM] = thread_prefixes[ITEM] + 1;
395  }
396 
397  ::rocprim::syncthreads();
398 
399  // Scan shared memory counters
400  ScanCounters();
401 
402  ::rocprim::syncthreads();
403 
404  // Extract the local ranks of each key
405  #pragma unroll
406  for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
407  {
408  // Add in thread block exclusive prefix
409  ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
410  }
411  }
412 
413 
417  template <
418  typename UnsignedBits,
419  int KEYS_PER_THREAD,
420  typename DigitExtractorT>
421  HIPCUB_DEVICE inline void RankKeys(
422  UnsignedBits (&keys)[KEYS_PER_THREAD],
423  int (&ranks)[KEYS_PER_THREAD],
424  DigitExtractorT digit_extractor,
425  int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])
426  {
427  // Rank keys
428  RankKeys(keys, ranks, digit_extractor);
429 
430  // Get the inclusive and exclusive digit totals corresponding to the calling thread.
431  #pragma unroll
432  for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
433  {
434  int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
435 
436  if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
437  {
438  if (IS_DESCENDING)
439  bin_idx = RADIX_DIGITS - bin_idx - 1;
440 
441  // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the
442  // first counter column, resulting in unavoidable bank conflicts.)
443  unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1));
444  unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES);
445 
446  exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counter[counter_lane * BLOCK_THREADS * PACKING_RATIO + sub_counter];
447  }
448  }
449  }
450 };
451 
452 
453 
454 
455 
459 template <
460  int BLOCK_DIM_X,
461  int RADIX_BITS,
462  bool IS_DESCENDING,
463  BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
464  int BLOCK_DIM_Y = 1,
465  int BLOCK_DIM_Z = 1,
466  int ARCH = HIPCUB_ARCH>
468 {
469 private:
470 
471  /******************************************************************************
472  * Type definitions and constants
473  ******************************************************************************/
474 
475  typedef int32_t RankT;
476  typedef int32_t DigitCounterT;
477 
478  enum
479  {
480  // The thread block size in threads
481  BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
482 
483  RADIX_DIGITS = 1 << RADIX_BITS,
484 
485  LOG_WARP_THREADS = Log2<ARCH>::VALUE,
486  WARP_THREADS = 1 << LOG_WARP_THREADS,
487  WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
488 
489  PADDED_WARPS = ((WARPS & 0x1) == 0) ?
490  WARPS + 1 :
491  WARPS,
492 
493  COUNTERS = PADDED_WARPS * RADIX_DIGITS,
494  RAKING_SEGMENT = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS,
495  PADDED_RAKING_SEGMENT = ((RAKING_SEGMENT & 0x1) == 0) ?
496  RAKING_SEGMENT + 1 :
497  RAKING_SEGMENT,
498  };
499 
500 public:
501 
502  enum
503  {
505  BINS_TRACKED_PER_THREAD = rocprim::maximum<int>()(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
506  };
507 
508 private:
509 
511  typedef BlockScan<
512  DigitCounterT,
513  BLOCK_THREADS,
514  INNER_SCAN_ALGORITHM,
515  BLOCK_DIM_Y,
516  BLOCK_DIM_Z,
517  ARCH>
518  BlockScanT;
519 
520 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
522  struct __align__(16) _TempStorage
523  {
524  typename BlockScanT::TempStorage block_scan;
525 
526  union __align__(16) Aliasable
527  {
528  volatile DigitCounterT warp_digit_counters[RADIX_DIGITS * PADDED_WARPS];
529  DigitCounterT raking_grid[BLOCK_THREADS * PADDED_RAKING_SEGMENT];
530 
531  } aliasable;
532  };
533 #endif
534 
535  /******************************************************************************
536  * Thread fields
537  ******************************************************************************/
538 
540  _TempStorage &temp_storage;
541 
543  unsigned int linear_tid;
544 
545 
546 
547 public:
548 
550  struct TempStorage : Uninitialized<_TempStorage> {};
551 
552 
553  /******************************************************************/
557 
558 
562  HIPCUB_DEVICE inline BlockRadixRankMatch(
563  TempStorage &temp_storage)
564  :
565  temp_storage(temp_storage.Alias()),
566  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
567  {}
568 
569 
571  /******************************************************************/
575 
579  template <
580  typename UnsignedBits,
581  int KEYS_PER_THREAD,
582  typename DigitExtractorT>
583  __device__ __forceinline__ void RankKeys(
584  UnsignedBits (&keys)[KEYS_PER_THREAD],
585  int (&ranks)[KEYS_PER_THREAD],
586  DigitExtractorT digit_extractor)
587  {
588  // Initialize shared digit counters
589 
590  #pragma unroll
591  for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
592  temp_storage.aliasable.raking_grid[linear_tid * PADDED_RAKING_SEGMENT + ITEM] = 0;
593 
594  ::rocprim::syncthreads();
595 
596  // Each warp will strip-mine its section of input, one strip at a time
597 
598  volatile DigitCounterT *digit_counters[KEYS_PER_THREAD];
599  uint32_t warp_id = linear_tid >> LOG_WARP_THREADS;
600  uint32_t lane_mask_lt = LaneMaskLt();
601 
602  #pragma unroll
603  for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
604  {
605  // My digit
606  uint32_t digit = digit_extractor.Digit(keys[ITEM]);
607 
608  if (IS_DESCENDING)
609  digit = RADIX_DIGITS - digit - 1;
610 
611  // Mask of peers who have same digit as me
612  uint32_t peer_mask = rocprim::MatchAny<RADIX_BITS>(digit);
613 
614  // Pointer to smem digit counter for this key
615  digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit * PADDED_WARPS + warp_id];
616 
617  // Number of occurrences in previous strips
618  DigitCounterT warp_digit_prefix = *digit_counters[ITEM];
619 
620  // Warp-sync
621  WARP_SYNC(0xFFFFFFFF);
622 
623  // Number of peers having same digit as me
624  int32_t digit_count = __popc(peer_mask);
625 
626  // Number of lower-ranked peers having same digit seen so far
627  int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);
628 
629  if (peer_digit_prefix == 0)
630  {
631  // First thread for each digit updates the shared warp counter
632  *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count);
633  }
634 
635  // Warp-sync
636  WARP_SYNC(0xFFFFFFFF);
637 
638  // Number of prior keys having same digit
639  ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
640  }
641 
642  ::rocprim::syncthreads();
643 
644  // Scan warp counters
645 
646  DigitCounterT scan_counters[PADDED_RAKING_SEGMENT];
647 
648  #pragma unroll
649  for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
650  scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid * PADDED_RAKING_SEGMENT + ITEM];
651 
652  BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters);
653 
654  #pragma unroll
655  for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
656  temp_storage.aliasable.raking_grid[linear_tid * PADDED_RAKING_SEGMENT + ITEM] = scan_counters[ITEM];
657 
658  ::rocprim::syncthreads();
659 
660  // Seed ranks with counter values from previous warps
661  #pragma unroll
662  for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
663  ranks[ITEM] += *digit_counters[ITEM];
664  }
665 
666 
670  template <
671  typename UnsignedBits,
672  int KEYS_PER_THREAD,
673  typename DigitExtractorT>
674  __device__ __forceinline__ void RankKeys(
675  UnsignedBits (&keys)[KEYS_PER_THREAD],
676  int (&ranks)[KEYS_PER_THREAD],
677  DigitExtractorT digit_extractor,
678  int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])
679  {
680  RankKeys(keys, ranks, digit_extractor);
681 
682  // Get exclusive count for each digit
683  #pragma unroll
684  for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
685  {
686  int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
687 
688  if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
689  {
690  if (IS_DESCENDING)
691  bin_idx = RADIX_DIGITS - bin_idx - 1;
692 
693  exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx * PADDED_WARPS];
694  }
695  }
696  }
697 };
698 
699 
700 
701 END_HIPCUB_NAMESPACE
702 
703 #endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_RANK_HPP_
Definition: block_radix_rank.hpp:468
__device__ BlockRadixRankMatch(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
Definition: block_radix_rank.hpp:562
__device__ __forceinline__ void RankKeys(UnsignedBits(&keys)[KEYS_PER_THREAD], int(&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor, int(&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])
Rank keys. For the lower RADIX_DIGITS threads, digit counts for each digit are provided for the corre...
Definition: block_radix_rank.hpp:674
__device__ __forceinline__ void RankKeys(UnsignedBits(&keys)[KEYS_PER_THREAD], int(&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor)
Rank keys.
Definition: block_radix_rank.hpp:583
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
Definition: block_radix_rank.hpp:98
__device__ BlockRadixRank(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
Definition: block_radix_rank.hpp:337
__device__ BlockRadixRank()
Collective constructor using a private static allocation of shared memory as temporary storage.
Definition: block_radix_rank.hpp:327
__device__ void RankKeys(UnsignedBits(&keys)[KEYS_PER_THREAD], int(&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor, int(&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])
Rank keys. For the lower RADIX_DIGITS threads, digit counts for each digit are provided for the corre...
Definition: block_radix_rank.hpp:421
__device__ void RankKeys(UnsignedBits(&keys)[KEYS_PER_THREAD], int(&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor)
Rank keys.
Definition: block_radix_rank.hpp:358
Definition: block_scan.hpp:80
\smemstorage{BlockScan}
Definition: block_radix_rank.hpp:550
\smemstorage{BlockScan}
Definition: block_radix_rank.hpp:316
Definition: util_type.hpp:101
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition: util_type.hpp:363