/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.7.0/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.7.0/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp Source File#

hipCUB: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.7.0/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp Source File
device_segmented_reduce.hpp
1 /******************************************************************************
2  * Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  * Modifications Copyright (c) 2017-2023, Advanced Micro Devices, Inc. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
30 #ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
31 #define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
32 
33 #include <limits>
34 #include <iterator>
35 
36 #include "../../../config.hpp"
37 
38 #include "../iterator/arg_index_input_iterator.hpp"
39 #include "../thread/thread_operators.hpp"
40 #include "device_reduce.hpp"
41 
42 #include <rocprim/device/device_segmented_reduce.hpp>
43 
44 BEGIN_HIPCUB_NAMESPACE
45 
46 namespace detail
47 {
48 
49 template<class Config,
50  class InputIterator,
51  class OutputIterator,
52  class OffsetIterator,
53  class ResultType,
54  class BinaryFunction>
55 __global__ __launch_bounds__(
56  ::rocprim::detail::device_params<Config>()
57  .reduce_config.block_size) void segmented_arg_minmax_kernel(InputIterator input,
58  OutputIterator output,
59  OffsetIterator begin_offsets,
60  OffsetIterator end_offsets,
61  BinaryFunction reduce_op,
62  ResultType initial_value,
63  ResultType empty_value)
64 {
65  // each block processes one segment
66  ::rocprim::detail::segmented_reduce<Config>(input,
67  output,
68  begin_offsets,
69  end_offsets,
70  reduce_op,
71  initial_value);
72  // no synchronization is needed since thread 0 writes to output
73 
74  const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>();
75  const unsigned int segment_id = ::rocprim::detail::block_id<0>();
76 
77  const unsigned int begin_offset = begin_offsets[segment_id];
78  const unsigned int end_offset = end_offsets[segment_id];
79 
80  // transform the segment output
81  if(flat_id == 0)
82  {
83  if(begin_offset == end_offset)
84  {
85  output[segment_id] = empty_value;
86  }
87  else
88  {
89  output[segment_id].key -= begin_offset;
90  }
91  }
92 }
93 
94 #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
95  { \
96  auto _error = hipGetLastError(); \
97  if(_error != hipSuccess) \
98  return _error; \
99  if(debug_synchronous) \
100  { \
101  std::cout << name << "(" << size << ")"; \
102  auto __error = hipStreamSynchronize(stream); \
103  if(__error != hipSuccess) \
104  return __error; \
105  auto _end = std::chrono::high_resolution_clock::now(); \
106  auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
107  std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
108  } \
109  }
110 
113 template<class Config = rocprim::default_config,
114  class InputIterator,
115  class OutputIterator,
116  class OffsetIterator,
117  class InitValueType,
118  class BinaryFunction>
119 inline hipError_t segmented_arg_minmax(void* temporary_storage,
120  size_t& storage_size,
121  InputIterator input,
122  OutputIterator output,
123  unsigned int segments,
124  OffsetIterator begin_offsets,
125  OffsetIterator end_offsets,
126  BinaryFunction reduce_op,
127  InitValueType initial_value,
128  InitValueType empty_value,
129  hipStream_t stream,
130  bool debug_synchronous)
131 {
132  using input_type = typename std::iterator_traits<InputIterator>::value_type;
133  using result_type =
134  typename ::rocprim::detail::match_result_type<input_type, BinaryFunction>::type;
135 
136  using config = ::rocprim::detail::wrapped_reduce_config<Config, result_type>;
137 
138  ::rocprim::detail::target_arch target_arch;
139  hipError_t result = host_target_arch(stream, target_arch);
140  if(result != hipSuccess)
141  {
142  return result;
143  }
144  const ::rocprim::detail::reduce_config_params params
145  = ::rocprim::detail::dispatch_target_arch<config>(target_arch);
146 
147  const unsigned int block_size = params.reduce_config.block_size;
148 
149  if(temporary_storage == nullptr)
150  {
151  // Make sure user won't try to allocate 0 bytes memory, because
152  // hipMalloc will return nullptr when size is zero.
153  storage_size = 4;
154  return hipSuccess;
155  }
156 
157  if(segments == 0u)
158  return hipSuccess;
159 
160  std::chrono::high_resolution_clock::time_point start;
161 
162  if(debug_synchronous)
163  start = std::chrono::high_resolution_clock::now();
164  hipLaunchKernelGGL(HIP_KERNEL_NAME(segmented_arg_minmax_kernel<config>),
165  dim3(segments),
166  dim3(block_size),
167  0,
168  stream,
169  input,
170  output,
171  begin_offsets,
172  end_offsets,
173  reduce_op,
174  static_cast<result_type>(initial_value),
175  static_cast<result_type>(empty_value));
176  ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_arg_minmax", segments, start);
177 
178  return hipSuccess;
179 }
180 
181 } // namespace detail
182 
183 struct DeviceSegmentedReduce
184 {
185  template<
186  typename InputIteratorT,
187  typename OutputIteratorT,
188  typename OffsetIteratorT,
189  typename ReductionOp,
190  typename T
191  >
192  HIPCUB_RUNTIME_FUNCTION static
193  hipError_t Reduce(void * d_temp_storage,
194  size_t& temp_storage_bytes,
195  InputIteratorT d_in,
196  OutputIteratorT d_out,
197  int num_segments,
198  OffsetIteratorT d_begin_offsets,
199  OffsetIteratorT d_end_offsets,
200  ReductionOp reduction_op,
201  T initial_value,
202  hipStream_t stream = 0,
203  bool debug_synchronous = false)
204  {
205  return ::rocprim::segmented_reduce(
206  d_temp_storage, temp_storage_bytes,
207  d_in, d_out,
208  num_segments, d_begin_offsets, d_end_offsets,
209  ::hipcub::detail::convert_result_type<InputIteratorT, OutputIteratorT>(reduction_op),
210  initial_value,
211  stream, debug_synchronous
212  );
213  }
214 
215  template<
216  typename InputIteratorT,
217  typename OutputIteratorT,
218  typename OffsetIteratorT
219  >
220  HIPCUB_RUNTIME_FUNCTION static
221  hipError_t Sum(void * d_temp_storage,
222  size_t& temp_storage_bytes,
223  InputIteratorT d_in,
224  OutputIteratorT d_out,
225  int num_segments,
226  OffsetIteratorT d_begin_offsets,
227  OffsetIteratorT d_end_offsets,
228  hipStream_t stream = 0,
229  bool debug_synchronous = false)
230  {
231  using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
232 
233  return Reduce(
234  d_temp_storage, temp_storage_bytes,
235  d_in, d_out,
236  num_segments, d_begin_offsets, d_end_offsets,
237  ::hipcub::Sum(), input_type(),
238  stream, debug_synchronous
239  );
240  }
241 
242  template<
243  typename InputIteratorT,
244  typename OutputIteratorT,
245  typename OffsetIteratorT
246  >
247  HIPCUB_RUNTIME_FUNCTION static
248  hipError_t Min(void * d_temp_storage,
249  size_t& temp_storage_bytes,
250  InputIteratorT d_in,
251  OutputIteratorT d_out,
252  int num_segments,
253  OffsetIteratorT d_begin_offsets,
254  OffsetIteratorT d_end_offsets,
255  hipStream_t stream = 0,
256  bool debug_synchronous = false)
257  {
258  using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
259 
260  return Reduce(
261  d_temp_storage, temp_storage_bytes,
262  d_in, d_out,
263  num_segments, d_begin_offsets, d_end_offsets,
264  ::hipcub::Min(), std::numeric_limits<input_type>::max(),
265  stream, debug_synchronous
266  );
267  }
268 
269  template<
270  typename InputIteratorT,
271  typename OutputIteratorT,
272  typename OffsetIteratorT
273  >
274  HIPCUB_RUNTIME_FUNCTION static
275  hipError_t ArgMin(void * d_temp_storage,
276  size_t& temp_storage_bytes,
277  InputIteratorT d_in,
278  OutputIteratorT d_out,
279  int num_segments,
280  OffsetIteratorT d_begin_offsets,
281  OffsetIteratorT d_end_offsets,
282  hipStream_t stream = 0,
283  bool debug_synchronous = false)
284  {
285  using OffsetT = int;
286  using T = typename std::iterator_traits<InputIteratorT>::value_type;
287  using O = typename std::iterator_traits<OutputIteratorT>::value_type;
288  using OutputTupleT = typename std::conditional<
289  std::is_same<O, void>::value,
290  KeyValuePair<OffsetT, T>,
291  O
292  >::type;
293 
294  using OutputValueT = typename OutputTupleT::Value;
295  using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
296 
297  IteratorT d_indexed_in(d_in);
298  // true maximum value of the full range
299  // key is ::max because ArgMin finds the lowest value that has the lowest key
300  const OutputTupleT init(std::numeric_limits<OffsetT>::max(),
301  detail::get_max_special_value<T>());
302  // special value for empty segments
303  const OutputTupleT empty_value(1, detail::get_max_value<T>());
304 
305  return detail::segmented_arg_minmax(d_temp_storage,
306  temp_storage_bytes,
307  d_indexed_in,
308  d_out,
309  num_segments,
310  d_begin_offsets,
311  d_end_offsets,
312  ::hipcub::ArgMin(),
313  init,
314  empty_value,
315  stream,
316  debug_synchronous);
317  }
318 
319  template<
320  typename InputIteratorT,
321  typename OutputIteratorT,
322  typename OffsetIteratorT
323  >
324  HIPCUB_RUNTIME_FUNCTION static
325  hipError_t Max(void * d_temp_storage,
326  size_t& temp_storage_bytes,
327  InputIteratorT d_in,
328  OutputIteratorT d_out,
329  int num_segments,
330  OffsetIteratorT d_begin_offsets,
331  OffsetIteratorT d_end_offsets,
332  hipStream_t stream = 0,
333  bool debug_synchronous = false)
334  {
335  using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
336 
337  return Reduce(
338  d_temp_storage, temp_storage_bytes,
339  d_in, d_out,
340  num_segments, d_begin_offsets, d_end_offsets,
341  ::hipcub::Max(), std::numeric_limits<input_type>::lowest(),
342  stream, debug_synchronous
343  );
344  }
345 
346  template<
347  typename InputIteratorT,
348  typename OutputIteratorT,
349  typename OffsetIteratorT
350  >
351  HIPCUB_RUNTIME_FUNCTION static
352  hipError_t ArgMax(void * d_temp_storage,
353  size_t& temp_storage_bytes,
354  InputIteratorT d_in,
355  OutputIteratorT d_out,
356  int num_segments,
357  OffsetIteratorT d_begin_offsets,
358  OffsetIteratorT d_end_offsets,
359  hipStream_t stream = 0,
360  bool debug_synchronous = false)
361  {
362  using OffsetT = int;
363  using T = typename std::iterator_traits<InputIteratorT>::value_type;
364  using O = typename std::iterator_traits<OutputIteratorT>::value_type;
365  using OutputTupleT = typename std::conditional<
366  std::is_same<O, void>::value,
367  KeyValuePair<OffsetT, T>,
368  O
369  >::type;
370 
371  using OutputValueT = typename OutputTupleT::Value;
372  using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
373 
374  IteratorT d_indexed_in(d_in);
375  // true minimum value of the full range
376  // key is ::max because ArgMax finds the highest value that has the lowest key
377  const OutputTupleT init(std::numeric_limits<OffsetT>::max(),
378  detail::get_lowest_special_value<T>());
379  // special value for empty segments
380  const OutputTupleT empty_value(1, detail::get_lowest_value<T>());
381 
382  return detail::segmented_arg_minmax(d_temp_storage,
383  temp_storage_bytes,
384  d_indexed_in,
385  d_out,
386  num_segments,
387  d_begin_offsets,
388  d_end_offsets,
389  ::hipcub::ArgMax(),
390  init,
391  empty_value,
392  stream,
393  debug_synchronous);
394  }
395 };
396 
397 END_HIPCUB_NAMESPACE
398 
399 #endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
Definition: thread_operators.hpp:126
Definition: thread_operators.hpp:141
Definition: thread_operators.hpp:106
Definition: thread_operators.hpp:116
Definition: thread_operators.hpp:76