/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.7.0/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.7.0/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp Source File#

hipCUB: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.7.0/hipcub/include/hipcub/backend/rocprim/device/device_reduce.hpp Source File
device_reduce.hpp
1 /******************************************************************************
2  * Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  * Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
30 #ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
31 #define HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
32 
33 #include <limits>
34 #include <iterator>
35 
36 #include <hip/hip_fp16.h> // __half
37 #include <hip/hip_bfloat16.h> // hip_bfloat16
38 
39 #include "../../../config.hpp"
40 #include "../iterator/arg_index_input_iterator.hpp"
41 #include "../thread/thread_operators.hpp"
42 
43 #include <rocprim/device/device_reduce.hpp>
44 #include <rocprim/device/device_reduce_by_key.hpp>
45 
46 BEGIN_HIPCUB_NAMESPACE
47 namespace detail
48 {
49 
50 template<typename T>
51 HIPCUB_HOST_DEVICE T set_half_bits(uint16_t value)
52 {
53  T half_value{};
54  unsigned char* char_representation = reinterpret_cast<unsigned char*>(&half_value);
55  char_representation[0] = value;
56  char_representation[1] = value >> 8;
57  return half_value;
58 }
59 
60 template<class T>
61 HIPCUB_HOST_DEVICE inline T get_lowest_value()
62 {
63  return std::numeric_limits<T>::lowest();
64 }
65 
66 template<>
67 HIPCUB_HOST_DEVICE inline __half get_lowest_value<__half>()
68 {
69  // smallest normal value (not subnormal): 1 11110 1111111111
70  return set_half_bits<__half>(0xfbff);
71 }
72 
73 template<>
74 HIPCUB_HOST_DEVICE inline hip_bfloat16 get_lowest_value<hip_bfloat16>()
75 {
76  // smallest normal value (not subnormal): 1 11111110 1111111
77  return set_half_bits<hip_bfloat16>(0xff7f);
78 }
79 
80 template<class T>
81 HIPCUB_HOST_DEVICE inline T get_max_value()
82 {
83  return std::numeric_limits<T>::max();
84 }
85 
86 template<>
87 HIPCUB_HOST_DEVICE inline __half get_max_value<__half>()
88 {
89  // largest normal value (not subnormal): 0 11110 1111111111
90  return set_half_bits<__half>(0x7bff);
91 }
92 
93 template<>
94 HIPCUB_HOST_DEVICE inline hip_bfloat16 get_max_value<hip_bfloat16>()
95 {
96  // largest normal value (not subnormal): 0 11111110 1111111
97  return set_half_bits<hip_bfloat16>(0x7f7f);
98 }
99 
101 template<class T>
102 inline auto get_lowest_special_value() ->
103  typename std::enable_if_t<!rocprim::is_floating_point<T>::value, T>
104 {
105  return get_lowest_value<T>();
106 }
107 
109 template<class T>
110 inline auto get_lowest_special_value() ->
111  typename std::enable_if_t<rocprim::is_floating_point<T>::value, T>
112 {
113  return -std::numeric_limits<T>::infinity();
114 }
115 
116 template<>
117 inline __half get_lowest_special_value<__half>()
118 {
119  // negative infinity: 1 11111 0000000000
120  return set_half_bits<__half>(0xfc00);
121 }
122 
123 template<>
124 inline hip_bfloat16 get_lowest_special_value<hip_bfloat16>()
125 {
126  // negative infinity: 1 11111111 0000000
127  return set_half_bits<hip_bfloat16>(0xff80);
128 }
129 
131 template<typename T>
132 inline auto get_max_special_value() ->
133  typename std::enable_if_t<!rocprim::is_floating_point<T>::value, T>
134 {
135  return get_max_value<T>();
136 }
137 
139 template<typename T>
140 inline auto get_max_special_value() ->
141  typename std::enable_if_t<rocprim::is_floating_point<T>::value, T>
142 {
143  return std::numeric_limits<T>::infinity();
144 }
145 
146 template<>
147 inline __half get_max_special_value<__half>()
148 {
149  // positive infinity: 0 11111 0000000000
150  return set_half_bits<__half>(0x7c00);
151 }
152 
153 template<>
154 inline hip_bfloat16 get_max_special_value<hip_bfloat16>()
155 {
156  // positive infinity: 0 11111111 0000000
157  return set_half_bits<hip_bfloat16>(0x7f80);
158 }
159 
160 } // end detail namespace
161 
162 class DeviceReduce
163 {
164 public:
165  template <
166  typename InputIteratorT,
167  typename OutputIteratorT,
168  typename ReduceOpT,
169  typename T
170  >
171  HIPCUB_RUNTIME_FUNCTION static
172  hipError_t Reduce(void *d_temp_storage,
173  size_t &temp_storage_bytes,
174  InputIteratorT d_in,
175  OutputIteratorT d_out,
176  int num_items,
177  ReduceOpT reduction_op,
178  T init,
179  hipStream_t stream = 0,
180  bool debug_synchronous = false)
181  {
182  return ::rocprim::reduce(
183  d_temp_storage, temp_storage_bytes,
184  d_in, d_out, init, num_items,
185  ::hipcub::detail::convert_result_type<InputIteratorT, OutputIteratorT>(reduction_op),
186  stream, debug_synchronous
187  );
188  }
189 
190  template <
191  typename InputIteratorT,
192  typename OutputIteratorT
193  >
194  HIPCUB_RUNTIME_FUNCTION static
195  hipError_t Sum(void *d_temp_storage,
196  size_t &temp_storage_bytes,
197  InputIteratorT d_in,
198  OutputIteratorT d_out,
199  int num_items,
200  hipStream_t stream = 0,
201  bool debug_synchronous = false)
202  {
203  using T = typename std::iterator_traits<InputIteratorT>::value_type;
204  return Reduce(
205  d_temp_storage, temp_storage_bytes,
206  d_in, d_out, num_items, ::hipcub::Sum(), T(0),
207  stream, debug_synchronous
208  );
209  }
210 
211  template <
212  typename InputIteratorT,
213  typename OutputIteratorT
214  >
215  HIPCUB_RUNTIME_FUNCTION static
216  hipError_t Min(void *d_temp_storage,
217  size_t &temp_storage_bytes,
218  InputIteratorT d_in,
219  OutputIteratorT d_out,
220  int num_items,
221  hipStream_t stream = 0,
222  bool debug_synchronous = false)
223  {
224  using T = typename std::iterator_traits<InputIteratorT>::value_type;
225  return Reduce(
226  d_temp_storage, temp_storage_bytes,
227  d_in, d_out, num_items, ::hipcub::Min(), detail::get_max_value<T>(),
228  stream, debug_synchronous
229  );
230  }
231 
232  template <
233  typename InputIteratorT,
234  typename OutputIteratorT
235  >
236  HIPCUB_RUNTIME_FUNCTION static
237  hipError_t ArgMin(void *d_temp_storage,
238  size_t &temp_storage_bytes,
239  InputIteratorT d_in,
240  OutputIteratorT d_out,
241  int num_items,
242  hipStream_t stream = 0,
243  bool debug_synchronous = false)
244  {
245  using OffsetT = int;
246  using T = typename std::iterator_traits<InputIteratorT>::value_type;
247  using O = typename std::iterator_traits<OutputIteratorT>::value_type;
248  using OutputTupleT =
249  typename std::conditional<
250  std::is_same<O, void>::value,
251  KeyValuePair<OffsetT, T>,
252  O
253  >::type;
254 
255  using OutputValueT = typename OutputTupleT::Value;
256  using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
257 
258  IteratorT d_indexed_in(d_in);
259  // Empty inputs produce a specific value dictated by CUB's API: numeric_limits::max.
260  // When not empty, using this value as initial is invalid and +infinity is used instead.
261  OutputTupleT init(1,
262  num_items > 0 ? detail::get_max_special_value<T>()
263  : detail::get_max_value<T>());
264 
265  return Reduce(
266  d_temp_storage, temp_storage_bytes,
267  d_indexed_in, d_out, num_items, ::hipcub::ArgMin(), init,
268  stream, debug_synchronous
269  );
270  }
271 
272  template <
273  typename InputIteratorT,
274  typename OutputIteratorT
275  >
276  HIPCUB_RUNTIME_FUNCTION static
277  hipError_t Max(void *d_temp_storage,
278  size_t &temp_storage_bytes,
279  InputIteratorT d_in,
280  OutputIteratorT d_out,
281  int num_items,
282  hipStream_t stream = 0,
283  bool debug_synchronous = false)
284  {
285  using T = typename std::iterator_traits<InputIteratorT>::value_type;
286  return Reduce(
287  d_temp_storage, temp_storage_bytes,
288  d_in, d_out, num_items, ::hipcub::Max(), detail::get_lowest_value<T>(),
289  stream, debug_synchronous
290  );
291  }
292 
293  template <
294  typename InputIteratorT,
295  typename OutputIteratorT
296  >
297  HIPCUB_RUNTIME_FUNCTION static
298  hipError_t ArgMax(void *d_temp_storage,
299  size_t &temp_storage_bytes,
300  InputIteratorT d_in,
301  OutputIteratorT d_out,
302  int num_items,
303  hipStream_t stream = 0,
304  bool debug_synchronous = false)
305  {
306  using OffsetT = int;
307  using T = typename std::iterator_traits<InputIteratorT>::value_type;
308  using O = typename std::iterator_traits<OutputIteratorT>::value_type;
309  using OutputTupleT =
310  typename std::conditional<
311  std::is_same<O, void>::value,
312  KeyValuePair<OffsetT, T>,
313  O
314  >::type;
315 
316  using OutputValueT = typename OutputTupleT::Value;
317  using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
318 
319  IteratorT d_indexed_in(d_in);
320  // Empty inputs produce a specific value dictated by CUB's API: numeric_limits::lowest.
321  // When not empty, using this value as initial is invalid and -infinity is used instead.
322  const OutputTupleT init(1,
323  num_items > 0 ? detail::get_lowest_special_value<T>()
324  : detail::get_lowest_value<T>());
325 
326  return Reduce(
327  d_temp_storage, temp_storage_bytes,
328  d_indexed_in, d_out, num_items, ::hipcub::ArgMax(), init,
329  stream, debug_synchronous
330  );
331  }
332 
333  template<
334  typename KeysInputIteratorT,
335  typename UniqueOutputIteratorT,
336  typename ValuesInputIteratorT,
337  typename AggregatesOutputIteratorT,
338  typename NumRunsOutputIteratorT,
339  typename ReductionOpT
340  >
341  HIPCUB_RUNTIME_FUNCTION static
342  hipError_t ReduceByKey(void * d_temp_storage,
343  size_t& temp_storage_bytes,
344  KeysInputIteratorT d_keys_in,
345  UniqueOutputIteratorT d_unique_out,
346  ValuesInputIteratorT d_values_in,
347  AggregatesOutputIteratorT d_aggregates_out,
348  NumRunsOutputIteratorT d_num_runs_out,
349  ReductionOpT reduction_op,
350  int num_items,
351  hipStream_t stream = 0,
352  bool debug_synchronous = false)
353  {
354  using key_compare_op =
355  ::rocprim::equal_to<typename std::iterator_traits<KeysInputIteratorT>::value_type>;
356  return ::rocprim::reduce_by_key(
357  d_temp_storage, temp_storage_bytes,
358  d_keys_in, d_values_in, num_items,
359  d_unique_out, d_aggregates_out, d_num_runs_out,
360  ::hipcub::detail::convert_result_type<ValuesInputIteratorT, AggregatesOutputIteratorT>(reduction_op),
361  key_compare_op(),
362  stream, debug_synchronous
363  );
364  }
365 };
366 
367 END_HIPCUB_NAMESPACE
368 
369 #endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
Definition: thread_operators.hpp:126
Definition: thread_operators.hpp:141
Definition: thread_operators.hpp:106
Definition: thread_operators.hpp:116
Definition: thread_operators.hpp:76