30 #ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
31 #define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
36 #include "../../../config.hpp"
38 #include "../iterator/arg_index_input_iterator.hpp"
39 #include "../thread/thread_operators.hpp"
40 #include "device_reduce.hpp"
42 #include <rocprim/device/device_segmented_reduce.hpp>
44 BEGIN_HIPCUB_NAMESPACE
49 template<
class Config,
55 __global__ __launch_bounds__(
56 ::rocprim::detail::device_params<Config>()
57 .reduce_config.block_size) void segmented_arg_minmax_kernel(InputIterator input,
58 OutputIterator output,
59 OffsetIterator begin_offsets,
60 OffsetIterator end_offsets,
61 BinaryFunction reduce_op,
62 ResultType initial_value,
63 ResultType empty_value)
66 ::rocprim::detail::segmented_reduce<Config>(input,
74 const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>();
75 const unsigned int segment_id = ::rocprim::detail::block_id<0>();
77 const unsigned int begin_offset = begin_offsets[segment_id];
78 const unsigned int end_offset = end_offsets[segment_id];
83 if(begin_offset == end_offset)
85 output[segment_id] = empty_value;
89 output[segment_id].key -= begin_offset;
94 #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
96 auto _error = hipGetLastError(); \
97 if(_error != hipSuccess) \
99 if(debug_synchronous) \
101 std::cout << name << "(" << size << ")"; \
102 auto __error = hipStreamSynchronize(stream); \
103 if(__error != hipSuccess) \
105 auto _end = std::chrono::high_resolution_clock::now(); \
106 auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
107 std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
113 template<
class Config = rocprim::default_config,
115 class OutputIterator,
116 class OffsetIterator,
118 class BinaryFunction>
119 inline hipError_t segmented_arg_minmax(
void* temporary_storage,
120 size_t& storage_size,
122 OutputIterator output,
123 unsigned int segments,
124 OffsetIterator begin_offsets,
125 OffsetIterator end_offsets,
126 BinaryFunction reduce_op,
127 InitValueType initial_value,
128 InitValueType empty_value,
130 bool debug_synchronous)
132 using input_type =
typename std::iterator_traits<InputIterator>::value_type;
134 typename ::rocprim::detail::match_result_type<input_type, BinaryFunction>::type;
136 using config = ::rocprim::detail::wrapped_reduce_config<Config, result_type>;
138 ::rocprim::detail::target_arch target_arch;
139 hipError_t result = host_target_arch(stream, target_arch);
140 if(result != hipSuccess)
144 const ::rocprim::detail::reduce_config_params params
145 = ::rocprim::detail::dispatch_target_arch<config>(target_arch);
147 const unsigned int block_size = params.reduce_config.block_size;
149 if(temporary_storage ==
nullptr)
160 std::chrono::high_resolution_clock::time_point start;
162 if(debug_synchronous)
163 start = std::chrono::high_resolution_clock::now();
164 hipLaunchKernelGGL(HIP_KERNEL_NAME(segmented_arg_minmax_kernel<config>),
174 static_cast<result_type
>(initial_value),
175 static_cast<result_type
>(empty_value));
176 ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
"segmented_arg_minmax", segments, start);
183 struct DeviceSegmentedReduce
186 typename InputIteratorT,
187 typename OutputIteratorT,
188 typename OffsetIteratorT,
189 typename ReductionOp,
192 HIPCUB_RUNTIME_FUNCTION
static
193 hipError_t Reduce(
void * d_temp_storage,
194 size_t& temp_storage_bytes,
196 OutputIteratorT d_out,
198 OffsetIteratorT d_begin_offsets,
199 OffsetIteratorT d_end_offsets,
200 ReductionOp reduction_op,
202 hipStream_t stream = 0,
203 bool debug_synchronous =
false)
205 return ::rocprim::segmented_reduce(
206 d_temp_storage, temp_storage_bytes,
208 num_segments, d_begin_offsets, d_end_offsets,
209 ::hipcub::detail::convert_result_type<InputIteratorT, OutputIteratorT>(reduction_op),
211 stream, debug_synchronous
216 typename InputIteratorT,
217 typename OutputIteratorT,
218 typename OffsetIteratorT
220 HIPCUB_RUNTIME_FUNCTION
static
221 hipError_t Sum(
void * d_temp_storage,
222 size_t& temp_storage_bytes,
224 OutputIteratorT d_out,
226 OffsetIteratorT d_begin_offsets,
227 OffsetIteratorT d_end_offsets,
228 hipStream_t stream = 0,
229 bool debug_synchronous =
false)
231 using input_type =
typename std::iterator_traits<InputIteratorT>::value_type;
234 d_temp_storage, temp_storage_bytes,
236 num_segments, d_begin_offsets, d_end_offsets,
238 stream, debug_synchronous
243 typename InputIteratorT,
244 typename OutputIteratorT,
245 typename OffsetIteratorT
247 HIPCUB_RUNTIME_FUNCTION
static
248 hipError_t Min(
void * d_temp_storage,
249 size_t& temp_storage_bytes,
251 OutputIteratorT d_out,
253 OffsetIteratorT d_begin_offsets,
254 OffsetIteratorT d_end_offsets,
255 hipStream_t stream = 0,
256 bool debug_synchronous =
false)
258 using input_type =
typename std::iterator_traits<InputIteratorT>::value_type;
261 d_temp_storage, temp_storage_bytes,
263 num_segments, d_begin_offsets, d_end_offsets,
264 ::
hipcub::Min(), std::numeric_limits<input_type>::max(),
265 stream, debug_synchronous
270 typename InputIteratorT,
271 typename OutputIteratorT,
272 typename OffsetIteratorT
274 HIPCUB_RUNTIME_FUNCTION
static
275 hipError_t ArgMin(
void * d_temp_storage,
276 size_t& temp_storage_bytes,
278 OutputIteratorT d_out,
280 OffsetIteratorT d_begin_offsets,
281 OffsetIteratorT d_end_offsets,
282 hipStream_t stream = 0,
283 bool debug_synchronous =
false)
286 using T =
typename std::iterator_traits<InputIteratorT>::value_type;
287 using O =
typename std::iterator_traits<OutputIteratorT>::value_type;
288 using OutputTupleT =
typename std::conditional<
289 std::is_same<O, void>::value,
290 KeyValuePair<OffsetT, T>,
294 using OutputValueT =
typename OutputTupleT::Value;
295 using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
297 IteratorT d_indexed_in(d_in);
300 const OutputTupleT init(std::numeric_limits<OffsetT>::max(),
301 detail::get_max_special_value<T>());
303 const OutputTupleT empty_value(1, detail::get_max_value<T>());
305 return detail::segmented_arg_minmax(d_temp_storage,
320 typename InputIteratorT,
321 typename OutputIteratorT,
322 typename OffsetIteratorT
324 HIPCUB_RUNTIME_FUNCTION
static
325 hipError_t Max(
void * d_temp_storage,
326 size_t& temp_storage_bytes,
328 OutputIteratorT d_out,
330 OffsetIteratorT d_begin_offsets,
331 OffsetIteratorT d_end_offsets,
332 hipStream_t stream = 0,
333 bool debug_synchronous =
false)
335 using input_type =
typename std::iterator_traits<InputIteratorT>::value_type;
338 d_temp_storage, temp_storage_bytes,
340 num_segments, d_begin_offsets, d_end_offsets,
341 ::
hipcub::Max(), std::numeric_limits<input_type>::lowest(),
342 stream, debug_synchronous
347 typename InputIteratorT,
348 typename OutputIteratorT,
349 typename OffsetIteratorT
351 HIPCUB_RUNTIME_FUNCTION
static
352 hipError_t ArgMax(
void * d_temp_storage,
353 size_t& temp_storage_bytes,
355 OutputIteratorT d_out,
357 OffsetIteratorT d_begin_offsets,
358 OffsetIteratorT d_end_offsets,
359 hipStream_t stream = 0,
360 bool debug_synchronous =
false)
363 using T =
typename std::iterator_traits<InputIteratorT>::value_type;
364 using O =
typename std::iterator_traits<OutputIteratorT>::value_type;
365 using OutputTupleT =
typename std::conditional<
366 std::is_same<O, void>::value,
367 KeyValuePair<OffsetT, T>,
371 using OutputValueT =
typename OutputTupleT::Value;
372 using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
374 IteratorT d_indexed_in(d_in);
377 const OutputTupleT init(std::numeric_limits<OffsetT>::max(),
378 detail::get_lowest_special_value<T>());
380 const OutputTupleT empty_value(1, detail::get_lowest_value<T>());
382 return detail::segmented_arg_minmax(d_temp_storage,
Definition: thread_operators.hpp:126
Definition: thread_operators.hpp:141
Definition: thread_operators.hpp:106
Definition: thread_operators.hpp:116
Definition: thread_operators.hpp:76