TransferBench API library

TransferBench API library

struct SubExecParam

#include <Kernels.hpp>

Public Members

size_t N

int numSrcs

int numDsts

float *src[MAX_SRCS]

float *dst[MAX_DSTS]

int32_t preferredXccId

int teamSize

int teamIdx

long long startCycle

long long stopCycle

uint32_t hwId

uint32_t xccId

namespace std: STL namespace.

file Kernels.hpp

Defines

PackedFloat_t

MAX_BLOCKSIZE

FLOATS_PER_PACK

MEMSET_CHAR

MEMSET_VAL

MAX_WAVEGROUPS

MAX_UNROLL

NUM_WAVEORDERS

MAX_SRCS

MAX_DSTS

GetHwId(hwId)

GetXccId(val)

GPU_KERNEL_UNROLL_DECL(BLOCKSIZE)

Typedefs

typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int)

Functions

void CpuReduceKernel(SubExecParam const &p)

std::string PrepSrcValueString()

__host__ __device__ float PrepSrcValue (int srcBufferIdx, size_t idx)

__global__ void CollectXccIdsKernel (int *xccIds)

__global__ void PrepSrcDataKernel (float *ptr, size_t N, int srcBufferIdx)

__device__ int64_t GetTimestamp ()

template<typename T> __device__ __forceinline__ T MemsetVal ()

template<> __device__ __forceinline__ float MemsetVal ()

template<int BLOCKSIZE, int UNROLL> __global__ void __launch_bounds__(BLOCKSIZE) GpuReduceKernel(SubExecParam *params, int waveOrder, int numSubIterations)

if (threadIdx.x==0) startCycle

GetXccId(xccId)

if (p.preferredXccId != -1 && xccId != p.preferredXccId) return

for (int i=0;i< numSrcs;i++) srcFloat4[i]

switch (waveOrder)

while (1)

__syncthreads()

Variables

int waveOrder

int numSubIterations

int64_t startCycle

SubExecParam &p = params[blockIdx.y]

int32_t xccId

int32_t const numSrcs = p.numSrcs

int32_t const numDsts = p.numDsts

float4 const *__restrict__ srcFloat4 [MAX_SRCS]

float4 *__restrict__ dstFloat4 [MAX_DSTS]

int32_t const nTeams = p.teamSize

int32_t const teamIdx = p.teamIdx

int32_t const nWaves = BLOCKSIZE / warpSize

int32_t const waveIdx = threadIdx.x / warpSize

int32_t const tIdx = threadIdx.x % warpSize

size_t const numFloat4 = p.N / 4

int32_t teamStride

int32_t waveStride

int32_t unrlStride

int32_t teamStride2

int32_t waveStride2

int subIterations = 0

GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] = {GPU_KERNEL_UNROLL_DECL(64), GPU_KERNEL_UNROLL_DECL(128), GPU_KERNEL_UNROLL_DECL(192), GPU_KERNEL_UNROLL_DECL(256), GPU_KERNEL_UNROLL_DECL(320), GPU_KERNEL_UNROLL_DECL(384), GPU_KERNEL_UNROLL_DECL(448),}

dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.3.0/src/include

dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.3.0/src