/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-roccv/checkouts/latest/include/kernels/device/histogram_device.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-roccv/checkouts/latest/include/kernels/device/histogram_device.hpp Source File#

3 min read time

Applies to Linux

rocCV: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-roccv/checkouts/latest/include/kernels/device/histogram_device.hpp Source File
histogram_device.hpp
Go to the documentation of this file.
1 
23 #pragma once
24 
25 #include <hip/hip_runtime.h>
26 
27 #include "operator_types.h"
28 
29 namespace Kernels {
30 namespace Device {
31 
template <typename T, typename SrcWrapper>
/**
 * Computes a per-batch histogram of the first channel of `input`.
 *
 * Launch contract (derived from the indexing below — confirm against callers):
 *  - blockDim.x must equal the number of histogram bins: each thread owns
 *    exactly one shared-memory bin (indexed by threadIdx.x) and later flushes
 *    that same bin to the global histogram.
 *  - Dynamic shared memory of blockDim.x * sizeof(T) bytes must be supplied
 *    at launch time.
 *  - gridDim.z indexes the batch; gridDim.x * blockDim.x must cover
 *    width * height pixels of one image.
 *  - The output tensor `histogram` must be zero-initialized by the caller,
 *    since this kernel only atomically accumulates into it.
 */
__global__ void histogram_kernel(SrcWrapper input, roccv::GenericTensorWrapper<T> histogram) {
    // Dynamic shared memory, aligned for T; one partial-histogram bin per thread.
    extern __shared__ __align__(sizeof(T)) unsigned char smem[];
    T *block_hist = reinterpret_cast<T *>(smem);

    const auto batch = blockIdx.z;                                 // image index within the batch
    const auto linear = blockIdx.x * blockDim.x + threadIdx.x;     // flat pixel index
    const auto col = linear % input.width();
    const auto row = linear / input.width();
    const auto bin = threadIdx.x;                                  // the bin this thread owns

    block_hist[bin] = 0;
    __syncthreads();  // every bin must be zeroed before any counting starts

    // Guard the grid tail: the last block may extend past the final pixel.
    if (linear < input.height() * input.width()) {
        // Pixel value of channel 0 selects the bin; atomics resolve
        // intra-block collisions on the shared bins.
        atomicAdd(&block_hist[input.at(batch, row, col, 0).x], 1);
    }
    __syncthreads();  // block-local counts are final past this barrier

    // Merge this block's partial histogram into the global result,
    // skipping empty bins to avoid needless global atomics.
    const auto count = block_hist[bin];
    if (count > 0) {
        atomicAdd(&histogram.at(batch, bin, 0), count);
    }
}
61 
template <typename T, typename SrcWrapper, typename MaskWrapper>
/**
 * Masked variant: computes a per-batch histogram of channel 0 of `input`,
 * counting only pixels whose corresponding `mask` value is non-zero.
 *
 * Launch contract (derived from the indexing below — confirm against callers):
 *  - blockDim.x must equal the number of histogram bins (one shared-memory
 *    bin per thread, indexed by threadIdx.x).
 *  - Dynamic shared memory of blockDim.x * sizeof(T) bytes is required.
 *  - gridDim.z indexes the batch; gridDim.x * blockDim.x must cover
 *    width * height pixels of one image.
 *  - The output tensor `histogram` must be zero-initialized by the caller.
 */
__global__ void histogram_kernel(SrcWrapper input, MaskWrapper mask, roccv::GenericTensorWrapper<T> histogram) {
    // Dynamic shared memory, aligned for T; one partial-histogram bin per thread.
    extern __shared__ __align__(sizeof(T)) unsigned char smem[];
    T *block_hist = reinterpret_cast<T *>(smem);

    const auto batch = blockIdx.z;                                 // image index within the batch
    const auto linear = blockIdx.x * blockDim.x + threadIdx.x;     // flat pixel index
    const auto col = linear % input.width();
    const auto row = linear / input.width();
    const auto bin = threadIdx.x;                                  // the bin this thread owns

    block_hist[bin] = 0;
    __syncthreads();  // every bin must be zeroed before any counting starts

    // Guard the grid tail, then apply the mask: only pixels with a non-zero
    // mask entry contribute to the histogram.
    if (linear < input.height() * input.width()) {
        if (mask.at(batch, row, col, 0) != 0) {
            atomicAdd(&block_hist[input.at(batch, row, col, 0).x], 1);
        }
    }
    __syncthreads();  // block-local counts are final past this barrier
    // NOTE: both barriers above are reached by all threads in the block
    // (the divergent mask test sits strictly between them), so this is safe.

    // Merge this block's partial histogram into the global result,
    // skipping empty bins to avoid needless global atomics.
    const auto count = block_hist[bin];
    if (count > 0) {
        atomicAdd(&histogram.at(batch, bin, 0), count);
    }
}
95 } // namespace Device
96 } // namespace Kernels
Definition: generic_tensor_wrapper.hpp:28
__device__ __host__ T & at(ARGS... idx)
Definition: generic_tensor_wrapper.hpp:48
__global__ void histogram_kernel(SrcWrapper input, roccv::GenericTensorWrapper< T > histogram)
Definition: histogram_device.hpp:33
Definition: non_max_suppression_helpers.hpp:26