TransferBench API library

struct SubExecParam
#include <Kernels.hpp>

Public Members

size_t N
int numSrcs
int numDsts
float *src[MAX_SRCS]
float *dst[MAX_DSTS]
int32_t preferredXccId
int teamSize
int teamIdx
long long startCycle
long long stopCycle
uint32_t hwId
uint32_t xccId
namespace std

STL namespace.

file Kernels.hpp

Defines

PackedFloat_t
MAX_BLOCKSIZE
FLOATS_PER_PACK
MEMSET_CHAR
MEMSET_VAL
MAX_WAVEGROUPS
MAX_UNROLL
NUM_WAVEORDERS
MAX_SRCS
MAX_DSTS
GetHwId(hwId)
GetXccId(val)
GPU_KERNEL_UNROLL_DECL(BLOCKSIZE)

Typedefs

typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int)

Functions

void CpuReduceKernel(SubExecParam const &p)
std::string PrepSrcValueString()
__host__ __device__ float PrepSrcValue (int srcBufferIdx, size_t idx)
__global__ void CollectXccIdsKernel (int *xccIds)
__global__ void PrepSrcDataKernel (float *ptr, size_t N, int srcBufferIdx)
__device__ int64_t GetTimestamp ()
template<typename T> __device__ __forceinline__ T MemsetVal ()
template<> __device__ __forceinline__ float MemsetVal ()
template<int BLOCKSIZE, int UNROLL> __global__ void __launch_bounds__ (BLOCKSIZE) GpuReduceKernel(SubExecParam *params, int waveOrder, int numSubIterations)
if (threadIdx.x==0) startCycle
GetXccId(xccId)
if (p.preferredXccId !=-1 &&xccId !=p.preferredXccId) return
for (int i=0;i< numSrcs;i++) srcFloat4[i]
switch (waveOrder)
while (1)
__syncthreads()

Variables

int waveOrder
int numSubIterations
int64_t startCycle
SubExecParam &p = params[blockIdx.y]
int32_t xccId
int32_t const numSrcs = p.numSrcs
int32_t const numDsts = p.numDsts
float4 const  *__restrict__ srcFloat4 [MAX_SRCS]
float4 *__restrict__ dstFloat4 [MAX_DSTS]
int32_t const nTeams = p.teamSize
int32_t const teamIdx = p.teamIdx
int32_t const nWaves = BLOCKSIZE / warpSize
int32_t const waveIdx = threadIdx.x / warpSize
int32_t const tIdx = threadIdx.x % warpSize
size_t const numFloat4 = p.N / 4
int32_t teamStride
int32_t waveStride
int32_t unrlStride
int32_t teamStride2
int32_t waveStride2
int subIterations = 0
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] = {GPU_KERNEL_UNROLL_DECL(64), GPU_KERNEL_UNROLL_DECL(128), GPU_KERNEL_UNROLL_DECL(192), GPU_KERNEL_UNROLL_DECL(256), GPU_KERNEL_UNROLL_DECL(320), GPU_KERNEL_UNROLL_DECL(384), GPU_KERNEL_UNROLL_DECL(448),}
dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.3.0/src/include
dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.3.0/src