API#

struct SubExecParam#
#include <Kernels.hpp>

Public Members

size_t N#
int numSrcs#
int numDsts#
float *src[MAX_SRCS]#
float *dst[MAX_DSTS]#
int32_t preferredXccId#
int teamSize#
int teamIdx#
long long startCycle#
long long stopCycle#
uint32_t hwId#
uint32_t xccId#
namespace std#

STL namespace.

file Kernels.hpp

Defines

PackedFloat_t#
MAX_BLOCKSIZE#
FLOATS_PER_PACK#
MEMSET_CHAR#
MEMSET_VAL#
MAX_WAVEGROUPS#
MAX_UNROLL#
NUM_WAVEORDERS#
MAX_SRCS#
MAX_DSTS#
GetHwId(hwId)#
GetXccId(val)#
GPU_KERNEL_UNROLL_DECL(BLOCKSIZE)#

Typedefs

typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int)#

Functions

void CpuReduceKernel(SubExecParam const &p)#
std::string PrepSrcValueString()#
__host__ __device__ float PrepSrcValue (int srcBufferIdx, size_t idx)
__global__ void CollectXccIdsKernel (int *xccIds)
__global__ void PrepSrcDataKernel (float *ptr, size_t N, int srcBufferIdx)
__device__ int64_t GetTimestamp ()
template<typename T> __device__ __forceinline__ T MemsetVal ()
template<> __device__ __forceinline__ float MemsetVal ()
template<int BLOCKSIZE, int UNROLL> __global__ void __launch_bounds__ (BLOCKSIZE) GpuReduceKernel(SubExecParam *params, int waveOrder, int numSubIterations)
if (threadIdx.x==0) startCycle
GetXccId(xccId)#
if (p.preferredXccId !=-1 &&xccId !=p.preferredXccId) return
for (int i=0;i< numSrcs;i++) srcFloat4[i]
switch (waveOrder)
while (1)
__syncthreads()#

Variables

int waveOrder
int numSubIterations
int64_t startCycle
SubExecParam &p = params[blockIdx.y]#
int32_t xccId#
int32_t const numSrcs = p.numSrcs#
int32_t const numDsts = p.numDsts#
float4 const  *__restrict__ srcFloat4 [MAX_SRCS]
float4 *__restrict__ dstFloat4 [MAX_DSTS]
int32_t const nTeams = p.teamSize#
int32_t const teamIdx = p.teamIdx#
int32_t const nWaves = BLOCKSIZE / warpSize#
int32_t const waveIdx = threadIdx.x / warpSize#
int32_t const tIdx = threadIdx.x % warpSize#
size_t const numFloat4 = p.N / 4#
int32_t teamStride#
int32_t waveStride#
int32_t unrlStride#
int32_t teamStride2#
int32_t waveStride2#
int subIterations = 0#
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] = {GPU_KERNEL_UNROLL_DECL(64), GPU_KERNEL_UNROLL_DECL(128), GPU_KERNEL_UNROLL_DECL(192), GPU_KERNEL_UNROLL_DECL(256), GPU_KERNEL_UNROLL_DECL(320), GPU_KERNEL_UNROLL_DECL(384), GPU_KERNEL_UNROLL_DECL(448),}#
dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.3.0/src/include
dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.3.0/src