TransferBench API library
-
struct SubExecParam
- #include <Kernels.hpp>
Public Members
-
size_t N
-
int numSrcs
-
int numDsts
-
float *src[MAX_SRCS]
-
float *dst[MAX_DSTS]
-
int32_t preferredXccId
-
int teamSize
-
int teamIdx
-
long long startCycle
-
long long stopCycle
-
uint32_t hwId
-
uint32_t xccId
-
size_t N
-
namespace std
STL namespace.
- file Kernels.hpp
Defines
-
PackedFloat_t
-
MAX_BLOCKSIZE
-
FLOATS_PER_PACK
-
MEMSET_CHAR
-
MEMSET_VAL
-
MAX_WAVEGROUPS
-
MAX_UNROLL
-
NUM_WAVEORDERS
-
MAX_SRCS
-
MAX_DSTS
-
GetHwId(hwId)
-
GetXccId(val)
-
GPU_KERNEL_UNROLL_DECL(BLOCKSIZE)
Typedefs
-
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int)
Functions
-
void CpuReduceKernel(SubExecParam const &p)
-
std::string PrepSrcValueString()
- __host__ __device__ float PrepSrcValue (int srcBufferIdx, size_t idx)
- __global__ void CollectXccIdsKernel (int *xccIds)
- __global__ void PrepSrcDataKernel (float *ptr, size_t N, int srcBufferIdx)
- __device__ int64_t GetTimestamp ()
- template<typename T> __device__ __forceinline__ T MemsetVal ()
- template<> __device__ __forceinline__ float MemsetVal ()
- template<int BLOCKSIZE, int UNROLL> __global__ void __launch_bounds__ (BLOCKSIZE) GpuReduceKernel(SubExecParam *params, int waveOrder, int numSubIterations)
- if (threadIdx.x==0) startCycle
-
GetXccId(xccId)
- if (p.preferredXccId != -1 && xccId != p.preferredXccId) return
- for (int i = 0; i < numSrcs; i++) srcFloat4[i]
- switch (waveOrder)
- while (1)
-
__syncthreads()
Variables
-
int waveOrder
-
int numSubIterations
-
int64_t startCycle
-
SubExecParam &p = params[blockIdx.y]
-
int32_t xccId
-
int32_t const numSrcs = p.numSrcs
-
int32_t const numDsts = p.numDsts
- float4 const *__restrict__ srcFloat4 [MAX_SRCS]
- float4 *__restrict__ dstFloat4 [MAX_DSTS]
-
int32_t const nTeams = p.teamSize
-
int32_t const teamIdx = p.teamIdx
-
int32_t const nWaves = BLOCKSIZE / warpSize
-
int32_t const waveIdx = threadIdx.x / warpSize
-
int32_t const tIdx = threadIdx.x % warpSize
-
size_t const numFloat4 = p.N / 4
-
int32_t teamStride
-
int32_t waveStride
-
int32_t unrlStride
-
int32_t teamStride2
-
int32_t waveStride2
-
int subIterations = 0
-
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] = {GPU_KERNEL_UNROLL_DECL(64), GPU_KERNEL_UNROLL_DECL(128), GPU_KERNEL_UNROLL_DECL(192), GPU_KERNEL_UNROLL_DECL(256), GPU_KERNEL_UNROLL_DECL(320), GPU_KERNEL_UNROLL_DECL(384), GPU_KERNEL_UNROLL_DECL(448),}
-
PackedFloat_t
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.3.0/src/include
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.3.0/src