/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-roccv/checkouts/latest/include/kernels/device/histogram_device.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-roccv/checkouts/latest/include/kernels/device/histogram_device.hpp Source File#

3 min read time

Applies to Linux

rocCV: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-roccv/checkouts/latest/include/kernels/device/histogram_device.hpp Source File
histogram_device.hpp
Go to the documentation of this file.
1 
23 #pragma once
24 
25 #include <hip/hip_runtime.h>
26 
27 #include "operator_types.h"
28 
29 namespace Kernels {
30 namespace Device {
31 
template <typename T, typename SrcWrapper>
/**
 * Computes a per-batch histogram of the first channel of `input`.
 *
 * Launch contract (derived from the indexing below — confirm against callers):
 *  - blockDim.x must equal the number of histogram bins: each thread owns
 *    exactly one shared-memory bin (indexed by threadIdx.x) and later flushes
 *    that same bin to the global histogram.
 *  - Dynamic shared memory of blockDim.x * sizeof(T) bytes must be supplied
 *    at launch time.
 *  - gridDim.z indexes the batch; gridDim.x * blockDim.x must cover
 *    width * height pixels of one image.
 *  - The output tensor `histogram` must be zero-initialized by the caller,
 *    since this kernel only atomically accumulates into it.
 */
__global__ void histogram_kernel(SrcWrapper input, roccv::GenericTensorWrapper<T> histogram) {
    // Dynamic shared memory, aligned for T; one partial-histogram bin per thread.
    extern __shared__ __align__(sizeof(T)) unsigned char smem[];
    T *block_hist = reinterpret_cast<T *>(smem);

    const auto batch = blockIdx.z;                                 // image index within the batch
    const auto linear = blockIdx.x * blockDim.x + threadIdx.x;     // flat pixel index
    const auto col = linear % input.width();
    const auto row = linear / input.width();
    const auto bin = threadIdx.x;                                  // the bin this thread owns

    block_hist[bin] = 0;
    __syncthreads();  // every bin must be zeroed before any counting starts

    // Guard the grid tail: the last block may extend past the final pixel.
    if (linear < input.height() * input.width()) {
        // Pixel value of channel 0 selects the bin; atomics resolve
        // intra-block collisions on the shared bins.
        atomicAdd(&block_hist[input.at(batch, row, col, 0).x], 1);
    }
    __syncthreads();  // block-local counts are final past this barrier

    // Merge this block's partial histogram into the global result,
    // skipping empty bins to avoid needless global atomics.
    const auto count = block_hist[bin];
    if (count > 0) {
        atomicAdd(&histogram.at(batch, bin, 0), count);
    }
}
61 
template <typename T, typename SrcWrapper, typename MaskWrapper>
/**
 * Masked variant: computes a per-batch histogram of channel 0 of `input`,
 * counting only pixels whose corresponding `mask` value is non-zero.
 *
 * Launch contract (derived from the indexing below — confirm against callers):
 *  - blockDim.x must equal the number of histogram bins (one shared-memory
 *    bin per thread, indexed by threadIdx.x).
 *  - Dynamic shared memory of blockDim.x * sizeof(T) bytes is required.
 *  - gridDim.z indexes the batch; gridDim.x * blockDim.x must cover
 *    width * height pixels of one image.
 *  - The output tensor `histogram` must be zero-initialized by the caller.
 */
__global__ void histogram_kernel(SrcWrapper input, MaskWrapper mask, roccv::GenericTensorWrapper<T> histogram) {
    // Dynamic shared memory, aligned for T; one partial-histogram bin per thread.
    extern __shared__ __align__(sizeof(T)) unsigned char smem[];
    T *block_hist = reinterpret_cast<T *>(smem);

    const auto batch = blockIdx.z;                                 // image index within the batch
    const auto linear = blockIdx.x * blockDim.x + threadIdx.x;     // flat pixel index
    const auto col = linear % input.width();
    const auto row = linear / input.width();
    const auto bin = threadIdx.x;                                  // the bin this thread owns

    block_hist[bin] = 0;
    __syncthreads();  // every bin must be zeroed before any counting starts

    // Guard the grid tail, then apply the mask: only pixels with a non-zero
    // mask entry contribute to the histogram.
    if (linear < input.height() * input.width()) {
        if (mask.at(batch, row, col, 0) != 0) {
            atomicAdd(&block_hist[input.at(batch, row, col, 0).x], 1);
        }
    }
    __syncthreads();  // block-local counts are final past this barrier
    // NOTE: both barriers above are reached by all threads in the block
    // (the divergent mask test sits strictly between them), so this is safe.

    // Merge this block's partial histogram into the global result,
    // skipping empty bins to avoid needless global atomics.
    const auto count = block_hist[bin];
    if (count > 0) {
        atomicAdd(&histogram.at(batch, bin, 0), count);
    }
}
95 } // namespace Device
96 } // namespace Kernels
Definition: generic_tensor_wrapper.hpp:28
__device__ __host__ T & at(ARGS... idx)
Definition: generic_tensor_wrapper.hpp:48
__global__ void histogram_kernel(SrcWrapper input, roccv::GenericTensorWrapper< T > histogram)
Definition: histogram_device.hpp:33
Definition: non_max_suppression_helpers.hpp:26