TransferBench API library

Contents

TransferBench API library

struct AgentData

Public Members

bool isInitialized
std::vector<hsa_agent_t> cpuAgents
std::vector<hsa_agent_t> gpuAgents
std::vector<int> closestNumaNode
class EnvVars
#include <EnvVars.hpp>

Public Functions

inline EnvVars()
inline void DisplayEnvVars() const
inline void DisplayP2PBenchmarkEnvVars() const
inline void DisplaySweepEnvVars() const
inline void DisplayA2AEnvVars() const
inline void DisplaySchmooEnvVars() const
inline void DisplayRemoteWriteEnvVars() const
inline void DisplayParallelCopyEnvVars() const
inline std::string GetCuMaskDesc() const

Public Members

int const DEFAULT_NUM_WARMUPS = 3
int const DEFAULT_NUM_ITERATIONS = 10
int const DEFAULT_SAMPLING_FACTOR = 1
int const DEFAULT_P2P_NUM_CPU_SE = 4
std::string const DEFAULT_SWEEP_SRC = "CG"
std::string const DEFAULT_SWEEP_EXE = "CDG"
std::string const DEFAULT_SWEEP_DST = "CG"
int const DEFAULT_SWEEP_MIN = 1
int const DEFAULT_SWEEP_MAX = 24
int const DEFAULT_SWEEP_TEST_LIMIT = 0
int const DEFAULT_SWEEP_TIME_LIMIT = 0
int alwaysValidate
int blockBytes
int blockOrder
int byteOffset
int continueOnError
int gfxBlockSize
int gfxSingleTeam
int gfxUnroll
int gfxWaveOrder
int hideEnv
int minNumVarSubExec
int maxNumVarSubExec
int numCpuDevices
int numGpuDevices
int numIterations
int numSubIterations
int numWarmups
int outputToCsv
int samplingFactor
int sharedMemBytes
int showIterations
int useHsaDma
int useInteractive
int usePcieIndexing
int usePrepSrcKernel
int useSingleStream
int useXccFilter
int validateDirect
std::vector<float> fillPattern
std::vector<uint32_t> cuMask
std::vector<std::vector<int>> prefXccTable
int numCpuSubExecs
int numGpuSubExecs
int p2pMode
int useDmaCopy
int useRemoteRead
int useFineGrain
int sweepMin
int sweepMax
int sweepTestLimit
int sweepTimeLimit
int sweepXgmiMin
int sweepXgmiMax
int sweepSeed
int sweepRandBytes
std::string sweepSrc
std::string sweepExe
std::string sweepDst
int a2aDirect
int a2aMode
int enableDebug
int gpuMaxHwQueues
ConfigModeEnum configMode
std::default_random_engine *generator
std::vector<int> numCpusPerNuma
std::vector<int> wallClockPerDeviceMhz
std::vector<std::set<int>> xccIdsPerDevice

Public Static Functions

static inline void DisplayUsage()
static inline int GetEnvVar(std::string const &varname, int defaultValue)
static inline std::string GetEnvVar(std::string const &varname, std::string const &defaultValue)
struct ExecutorInfo
#include <TransferBench.hpp>

Public Members

std::vector<Transfer*> transfers
size_t totalBytes
int totalSubExecs
SubExecParam *subExecParamGpu
std::vector<hipStream_t> streams
std::vector<hipEvent_t> startEvents
std::vector<hipEvent_t> stopEvents
double totalTime
struct ExeResult
#include <TransferBench.hpp>

Public Members

double bandwidthGbs
double durationMsec
double sumBandwidthGbs
size_t totalBytes
std::vector<int> transferIdx
struct SubExecParam
#include <Kernels.hpp>

Public Members

size_t N
int numSrcs
int numDsts
float *src[MAX_SRCS]
float *dst[MAX_DSTS]
int32_t preferredXccId
int teamSize
int teamIdx
long long startCycle
long long stopCycle
uint32_t hwId
uint32_t xccId
struct TestResults
#include <TransferBench.hpp>

Public Members

size_t numTimedIterations
size_t totalBytesTransferred
double totalBandwidthCpu
double totalDurationMsec
double overheadMsec
std::map<std::pair<ExeType, int>, ExeResult> exeResults
struct Transfer
#include <TransferBench.hpp>

Public Functions

void PrepareSubExecParams(EnvVars const &ev)
bool PrepareSrc(EnvVars const &ev)
void ValidateDst(EnvVars const &ev)
void PrepareReference(EnvVars const &ev, std::vector<float> &buffer, int bufferIdx)
std::string SrcToStr() const
std::string DstToStr() const

Public Members

ExeType exeType
int exeIndex
int exeSubIndex
int numSubExecs
size_t numBytes
int numSrcs
std::vector<MemType> srcType
std::vector<int> srcIndex
int numDsts
std::vector<MemType> dstType
std::vector<int> dstIndex
size_t numBytesActual
double transferTime
double transferBandwidth
double executorBandwidth
std::vector<double> perIterationTime
std::vector<std::set<std::pair<int, int>>> perIterationCUs
int transferIndex
std::vector<float*> srcMem
std::vector<float*> dstMem
std::vector<SubExecParam> subExecParam
SubExecParam *subExecParamGpuPtr
std::vector<int> subExecIdx
hsa_agent_t dstAgent
hsa_agent_t srcAgent
hsa_signal_t signal
hsa_amd_sdma_engine_id_t sdmaEngineId
namespace std

STL namespace.

file Compatibility.hpp
#include <hip/hip_ext.h>
#include <hip/hip_runtime.h>
#include <hsa/hsa_ext_amd.h>

Defines

HIP_CALL(cmd)
file EnvVars.hpp
#include <algorithm>
#include <random>
#include <time.h>
#include "Compatibility.hpp"
#include "Kernels.hpp"

Defines

TB_VERSION
PRINT_EV(NAME, VALUE, DESCRIPTION)
PRINT_ES(NAME, VALUE, DESCRIPTION)

Enums

enum ConfigModeEnum

Values:

enumerator CFG_FILE
enumerator CFG_P2P
enumerator CFG_SWEEP
enumerator CFG_SCALE
enumerator CFG_A2A
enumerator CFG_SCHMOO
enumerator CFG_RWRITE
enum BlockOrderEnum

Values:

enumerator ORDER_SEQUENTIAL
enumerator ORDER_INTERLEAVED
enumerator ORDER_RANDOM

Variables

char const MemTypeStr[]
char const ExeTypeStr[]
file GetClosestNumaNode.hpp

Defines

HSA_CHECK(cmd)

Functions

hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data)
hsa_status_t AgentInfoCallback(hsa_agent_t agent, void *data)
AgentData &GetAgentData()
int GetClosestNumaNode(int gpuIdx)
file Kernels.hpp

Defines

PackedFloat_t
MAX_BLOCKSIZE
FLOATS_PER_PACK
MEMSET_CHAR
MEMSET_VAL
MAX_WAVEGROUPS
MAX_UNROLL
NUM_WAVEORDERS
MAX_SRCS
MAX_DSTS
GetHwId(hwId)
GetXccId(val)
GPU_KERNEL_UNROLL_DECL(BLOCKSIZE)

Typedefs

typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int)

Functions

void CpuReduceKernel(SubExecParam const &p)
std::string PrepSrcValueString()
__host__ __device__ float PrepSrcValue (int srcBufferIdx, size_t idx)
__global__ void CollectXccIdsKernel (int *xccIds)
__global__ void PrepSrcDataKernel (float *ptr, size_t N, int srcBufferIdx)
__device__ int64_t GetTimestamp ()
template<typename T> __device__ __forceinline__ T MemsetVal ()
template<> __device__ __forceinline__ float MemsetVal ()
template<int BLOCKSIZE, int UNROLL> __global__ void __launch_bounds__ (BLOCKSIZE) GpuReduceKernel(SubExecParam *params, int waveOrder, int numSubIterations)
if (threadIdx.x==0) startCycle
GetXccId(xccId)
if (p.preferredXccId !=-1 &&xccId !=p.preferredXccId) return
for (int i=0;i< numSrcs;i++) srcFloat4[i]
switch (waveOrder)
while (1)
__syncthreads()

Variables

int waveOrder
int numSubIterations
int64_t startCycle
SubExecParam &p = params[blockIdx.y]
int32_t xccId
int32_t const numSrcs = p.numSrcs
int32_t const numDsts = p.numDsts
float4 const  *__restrict__ srcFloat4 [MAX_SRCS]
float4 *__restrict__ dstFloat4 [MAX_DSTS]
int32_t const nTeams = p.teamSize
int32_t const teamIdx = p.teamIdx
int32_t const nWaves = BLOCKSIZE / warpSize
int32_t const waveIdx = threadIdx.x / warpSize
int32_t const tIdx = threadIdx.x % warpSize
size_t const numFloat4 = p.N / 4
int32_t teamStride
int32_t waveStride
int32_t unrlStride
int32_t teamStride2
int32_t waveStride2
int subIterations = 0
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] = {GPU_KERNEL_UNROLL_DECL(64), GPU_KERNEL_UNROLL_DECL(128), GPU_KERNEL_UNROLL_DECL(192), GPU_KERNEL_UNROLL_DECL(256), GPU_KERNEL_UNROLL_DECL(320), GPU_KERNEL_UNROLL_DECL(384), GPU_KERNEL_UNROLL_DECL(448),}
file TransferBench.hpp
#include <vector>
#include <sstream>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstdint>
#include <set>
#include <unistd.h>
#include <map>
#include <iostream>
#include "Compatibility.hpp"
#include "EnvVars.hpp"

Defines

MAX_LINE_LEN

Typedefs

typedef std::pair<ExeType, int> Executor
typedef std::map<Executor, ExecutorInfo> TransferMap

Enums

enum MemType

Values:

enumerator MEM_CPU
enumerator MEM_GPU
enumerator MEM_CPU_FINE
enumerator MEM_GPU_FINE
enumerator MEM_CPU_UNPINNED
enumerator MEM_NULL
enumerator MEM_MANAGED
enum ExeType

Values:

enumerator EXE_CPU
enumerator EXE_GPU_GFX
enumerator EXE_GPU_DMA

Functions

bool IsGpuType(MemType m)
bool IsCpuType(MemType m)
bool IsGpuType(ExeType e)
bool IsCpuType(ExeType e)
inline MemType CharToMemType(char const c)
inline ExeType CharToExeType(char const c)
void DisplayUsage(char const *cmdName)
void DisplayTopology(bool const outputToCsv)
void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor, std::vector<size_t> &valuesofN)
void ParseMemType(EnvVars const &ev, std::string const &token, std::vector<MemType> &memType, std::vector<int> &memIndex)
void ParseExeType(EnvVars const &ev, std::string const &token, ExeType &exeType, int &exeIndex, int &exeSubIndex)
void ParseTransfers(EnvVars const &ev, char *line, std::vector<Transfer> &transfers)
void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose = true, double *totalBandwidthCpu = nullptr)
TestResults ExecuteTransfersImpl(EnvVars const &ev, std::vector<Transfer> &transfers)
void ReportResults(EnvVars const &ev, std::vector<Transfer> const &transfers, TestResults const results)
void EnablePeerAccess(int const deviceId, int const peerDeviceId)
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void **memPtr)
void DeallocateMemory(MemType memType, void *memPtr, size_t const size = 0)
void CheckPages(char *byteArray, size_t numBytes, int targetId)
void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)
void RunPeerToPeerBenchmarks(EnvVars const &ev, size_t N)
void RunScalingBenchmark(EnvVars const &ev, size_t N, int const exeIndex, int const maxSubExecs)
void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom)
void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)
void RunSchmooBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)
void RunRemoteWriteBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
void RunParallelCopyBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
void RunHealthCheck(EnvVars ev)
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount)
int RemappedIndex(int const origIdx, bool const isCpuType)
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const &transfers)
std::string PtrVectorToStr(std::vector<float*> const &strVector, int const initOffset)

Variables

size_t const DEFAULT_BYTES_PER_TRANSFER = (1 << 26)
char const MemTypeStr[8] = "CGBFUNM"
char const ExeTypeStr[4] = "CGD"
char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"}
file TransferBench.cpp
#include <numa.h>
#include <cmath>
#include <numaif.h>
#include <random>
#include <stack>
#include <thread>
#include "TransferBench.hpp"
#include "GetClosestNumaNode.hpp"

Functions

int main(int argc, char **argv)
void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose, double *totalBandwidthCpu)
TestResults ExecuteTransfersImpl(EnvVars const &ev, std::vector<Transfer> &transfers)
void DisplayUsage(char const *cmdName)
int RemappedIndex(int const origIdx, bool const isCpuType)
void DisplayTopology(bool const outputToCsv)
void ParseMemType(EnvVars const &ev, std::string const &token, std::vector<MemType> &memTypes, std::vector<int> &memIndices)
void ParseExeType(EnvVars const &ev, std::string const &token, ExeType &exeType, int &exeIndex, int &exeSubIndex)
void ParseTransfers(EnvVars const &ev, char *line, std::vector<Transfer> &transfers)
void EnablePeerAccess(int const deviceId, int const peerDeviceId)
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void **memPtr)
void DeallocateMemory(MemType memType, void *memPtr, size_t const bytes)
void CheckPages(char *array, size_t numBytes, int targetId)
uint32_t GetId(uint32_t hwId)
void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)
void RunPeerToPeerBenchmarks(EnvVars const &ev, size_t N)
void RunScalingBenchmark(EnvVars const &ev, size_t N, int const exeIndex, int const maxSubExecs)
void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)
void RunSchmooBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)
void RunRemoteWriteBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
void RunParallelCopyBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const &transfers)
std::string PtrVectorToStr(std::vector<float*> const &strVector, int const initOffset)
void ReportResults(EnvVars const &ev, std::vector<Transfer> const &transfers, TestResults const results)
void RunHealthCheck(EnvVars ev)
dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/latest/src/include
dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/latest/src