/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.7.0/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.7.0/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp Source File#

hipCUB: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.7.0/hipcub/include/hipcub/backend/rocprim/block/block_run_length_decode.hpp Source File
block_run_length_decode.hpp
1 /******************************************************************************
2  * Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  * Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
30 #ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
31 #define HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
32 
33 #include "../../../config.hpp"
34 #include "../thread/thread_search.hpp"
35 #include "../util_math.hpp"
36 #include "../util_ptx.hpp"
37 #include "../util_type.hpp"
38 #include "block_scan.hpp"
39 #include <limits>
40 #include <type_traits>
41 
42 BEGIN_HIPCUB_NAMESPACE
43 
123 template <typename ItemT,
124  int BLOCK_DIM_X,
125  int RUNS_PER_THREAD,
126  int DECODED_ITEMS_PER_THREAD,
127  typename DecodedOffsetT = uint32_t,
128  int BLOCK_DIM_Y = 1,
129  int BLOCK_DIM_Z = 1>
131 {
132  //---------------------------------------------------------------------
133  // CONFIGS & TYPE ALIASES
134  //---------------------------------------------------------------------
135 private:
137  static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
138 
140  static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD;
141 
144 
146  using RunOffsetT = uint32_t;
147 
149  union _TempStorage
150  {
151  typename RunOffsetScanT::TempStorage offset_scan;
152  struct
153  {
154  ItemT run_values[BLOCK_RUNS];
155  DecodedOffsetT run_offsets[BLOCK_RUNS];
156  } runs;
157  }; // union TempStorage
158 
160  HIPCUB_DEVICE __forceinline__ _TempStorage &PrivateStorage()
161  {
162  __shared__ _TempStorage private_storage;
163  return private_storage;
164  }
165 
167  _TempStorage &temp_storage;
168 
170  uint32_t linear_tid;
171 
172 public:
173  struct TempStorage : Uninitialized<_TempStorage>
174  {
175  };
176 
177  //---------------------------------------------------------------------
178  // CONSTRUCTOR
179  //---------------------------------------------------------------------
180 
186  template <typename RunLengthT, typename TotalDecodedSizeT>
187  HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
188  ItemT (&run_values)[RUNS_PER_THREAD],
189  RunLengthT (&run_lengths)[RUNS_PER_THREAD],
190  TotalDecodedSizeT &total_decoded_size)
191  : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
192  {
193  InitWithRunLengths(run_values, run_lengths, total_decoded_size);
194  }
195 
201  template <typename UserRunOffsetT>
202  HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
203  ItemT (&run_values)[RUNS_PER_THREAD],
204  UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
205  : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
206  {
207  InitWithRunOffsets(run_values, run_offsets);
208  }
209 
213  template <typename RunLengthT, typename TotalDecodedSizeT>
214  HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
215  RunLengthT (&run_lengths)[RUNS_PER_THREAD],
216  TotalDecodedSizeT &total_decoded_size)
217  : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
218  {
219  InitWithRunLengths(run_values, run_lengths, total_decoded_size);
220  }
221 
225  template <typename UserRunOffsetT>
226  HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
227  UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
228  : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
229  {
230  InitWithRunOffsets(run_values, run_offsets);
231  }
232 
233 private:
239  template <int MAX_NUM_ITEMS,
240  typename InputIteratorT,
241  typename OffsetT,
242  typename T>
243  HIPCUB_DEVICE __forceinline__ OffsetT StaticUpperBound(InputIteratorT input,
244  OffsetT num_items,
245  T val)
246  {
247  OffsetT lower_bound = 0;
248  OffsetT upper_bound = num_items;
249  #pragma unroll
250  for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++)
251  {
252  OffsetT mid = hipcub::MidPoint<OffsetT>(lower_bound, upper_bound);
253  mid = (rocprim::min)(mid, num_items - 1);
254 
255  if (val < input[mid])
256  {
257  upper_bound = mid;
258  }
259  else
260  {
261  lower_bound = mid + 1;
262  }
263  }
264 
265  return lower_bound;
266  }
267 
268  template <typename RunOffsetT>
269  HIPCUB_DEVICE __forceinline__ void InitWithRunOffsets(ItemT (&run_values)[RUNS_PER_THREAD],
270  RunOffsetT (&run_offsets)[RUNS_PER_THREAD])
271  {
272  // Keep the runs' items and the offsets of each run's beginning in the temporary storage
273  RunOffsetT thread_dst_offset = static_cast<RunOffsetT>(linear_tid) * static_cast<RunOffsetT>(RUNS_PER_THREAD);
274  #pragma unroll
275  for (int i = 0; i < RUNS_PER_THREAD; i++)
276  {
277  temp_storage.runs.run_values[thread_dst_offset] = run_values[i];
278  temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i];
279  thread_dst_offset++;
280  }
281 
282  // Ensure run offsets and run values have been writen to shared memory
283  CTA_SYNC();
284  }
285 
286  template <typename RunLengthT, typename TotalDecodedSizeT>
287  HIPCUB_DEVICE __forceinline__ void InitWithRunLengths(ItemT (&run_values)[RUNS_PER_THREAD],
288  RunLengthT (&run_lengths)[RUNS_PER_THREAD],
289  TotalDecodedSizeT &total_decoded_size)
290  {
291  // Compute the offset for the beginning of each run
292  DecodedOffsetT run_offsets[RUNS_PER_THREAD];
293  #pragma unroll
294  for (int i = 0; i < RUNS_PER_THREAD; i++)
295  {
296  run_offsets[i] = static_cast<DecodedOffsetT>(run_lengths[i]);
297  }
298  DecodedOffsetT decoded_size_aggregate;
299  RunOffsetScanT(this->temp_storage.offset_scan).ExclusiveSum(run_offsets, run_offsets, decoded_size_aggregate);
300  total_decoded_size = static_cast<TotalDecodedSizeT>(decoded_size_aggregate);
301 
302  // Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation)
303  CTA_SYNC();
304 
305  InitWithRunOffsets(run_values, run_offsets);
306  }
307 
308 public:
326  template <typename RelativeOffsetT>
327  HIPCUB_DEVICE __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
328  RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD],
329  DecodedOffsetT from_decoded_offset = 0)
330  {
331  // The (global) offset of the first item decoded by this thread
332  DecodedOffsetT thread_decoded_offset = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD;
333 
334  // The run that the first decoded item of this thread belongs to
335  // If this thread's <thread_decoded_offset> is already beyond the total decoded size, it will be assigned to the
336  // last run
337  RunOffsetT assigned_run =
338  StaticUpperBound<BLOCK_RUNS>(temp_storage.runs.run_offsets, BLOCK_RUNS, thread_decoded_offset) -
339  static_cast<RunOffsetT>(1U);
340 
341  DecodedOffsetT assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
342 
343  // If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
344  DecodedOffsetT assigned_run_end = (assigned_run == BLOCK_RUNS - 1)
345  ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
346  : temp_storage.runs.run_offsets[assigned_run + 1];
347 
348  ItemT val = temp_storage.runs.run_values[assigned_run];
349 
350  #pragma unroll
351  for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
352  {
353  decoded_items[i] = val;
354  item_offsets[i] = thread_decoded_offset - assigned_run_begin;
355  if (thread_decoded_offset == assigned_run_end - 1)
356  {
357  // We make sure that a thread is not re-entering this conditional when being assigned to the last run already by
358  // extending the last run's length to all the thread's item
359  assigned_run++;
360  assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
361 
362  // If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
363  assigned_run_end = (assigned_run == BLOCK_RUNS - 1) ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
364  : temp_storage.runs.run_offsets[assigned_run + 1];
365  val = temp_storage.runs.run_values[assigned_run];
366  }
367  thread_decoded_offset++;
368  }
369  }
370 
383  HIPCUB_DEVICE __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
384  DecodedOffsetT from_decoded_offset = 0)
385  {
386  DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD];
387  RunLengthDecode(decoded_items, item_offsets, from_decoded_offset);
388  }
389 };
390 
391 END_HIPCUB_NAMESPACE
392 
393 #endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
The BlockRunLengthDecode class supports decoding a run-length encoded array of items....
Definition: block_run_length_decode.hpp:131
__device__ __forceinline__ void RunLengthDecode(ItemT(&decoded_items)[DECODED_ITEMS_PER_THREAD], RelativeOffsetT(&item_offsets)[DECODED_ITEMS_PER_THREAD], DecodedOffsetT from_decoded_offset=0)
Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length deco...
Definition: block_run_length_decode.hpp:327
__device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage, ItemT(&run_values)[RUNS_PER_THREAD], RunLengthT(&run_lengths)[RUNS_PER_THREAD], TotalDecodedSizeT &total_decoded_size)
Constructor specialised for user-provided temporary storage, initializing using the runs' lengths....
Definition: block_run_length_decode.hpp:187
__device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage, ItemT(&run_values)[RUNS_PER_THREAD], UserRunOffsetT(&run_offsets)[RUNS_PER_THREAD])
Constructor specialised for user-provided temporary storage, initializing using the runs' offsets....
Definition: block_run_length_decode.hpp:202
__device__ __forceinline__ void RunLengthDecode(ItemT(&decoded_items)[DECODED_ITEMS_PER_THREAD], DecodedOffsetT from_decoded_offset=0)
Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length deco...
Definition: block_run_length_decode.hpp:383
__device__ __forceinline__ BlockRunLengthDecode(ItemT(&run_values)[RUNS_PER_THREAD], UserRunOffsetT(&run_offsets)[RUNS_PER_THREAD])
Constructor specialised for static temporary storage, initializing using the runs' offsets.
Definition: block_run_length_decode.hpp:226
__device__ __forceinline__ BlockRunLengthDecode(ItemT(&run_values)[RUNS_PER_THREAD], RunLengthT(&run_lengths)[RUNS_PER_THREAD], TotalDecodedSizeT &total_decoded_size)
Constructor specialised for static temporary storage, initializing using the runs' lengths.
Definition: block_run_length_decode.hpp:214
Definition: block_scan.hpp:80
Definition: block_run_length_decode.hpp:174
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition: util_type.hpp:363