API

Contents

API#

struct AgentData#

Public Members

bool isInitialized#
std::vector<hsa_agent_t> cpuAgents#
std::vector<hsa_agent_t> gpuAgents#
std::vector<int> closestNumaNode#
class EnvVars#
#include <EnvVars.hpp>

Public Functions

inline EnvVars()#
inline void DisplayEnvVars() const#
inline void DisplayP2PBenchmarkEnvVars() const#
inline void DisplaySweepEnvVars() const#
inline void DisplayA2AEnvVars() const#
inline void DisplaySchmooEnvVars() const#
inline void DisplayRemoteWriteEnvVars() const#
inline void DisplayParallelCopyEnvVars() const#
inline std::string GetCuMaskDesc() const#

Public Members

int const DEFAULT_NUM_WARMUPS = 3#
int const DEFAULT_NUM_ITERATIONS = 10#
int const DEFAULT_SAMPLING_FACTOR = 1#
int const DEFAULT_P2P_NUM_CPU_SE = 4#
std::string const DEFAULT_SWEEP_SRC = "CG"#
std::string const DEFAULT_SWEEP_EXE = "CDG"#
std::string const DEFAULT_SWEEP_DST = "CG"#
int const DEFAULT_SWEEP_MIN = 1#
int const DEFAULT_SWEEP_MAX = 24#
int const DEFAULT_SWEEP_TEST_LIMIT = 0#
int const DEFAULT_SWEEP_TIME_LIMIT = 0#
int alwaysValidate#
int blockBytes#
int blockOrder#
int byteOffset#
int continueOnError#
int gfxBlockSize#
int gfxSingleTeam#
int gfxUnroll#
int gfxWaveOrder#
int hideEnv#
int minNumVarSubExec#
int maxNumVarSubExec#
int numCpuDevices#
int numGpuDevices#
int numIterations#
int numSubIterations#
int numWarmups#
int outputToCsv#
int samplingFactor#
int sharedMemBytes#
int showIterations#
int useHsaDma#
int useInteractive#
int usePcieIndexing#
int usePrepSrcKernel#
int useSingleStream#
int useXccFilter#
int validateDirect#
std::vector<float> fillPattern#
std::vector<uint32_t> cuMask#
std::vector<std::vector<int>> prefXccTable#
int numCpuSubExecs#
int numGpuSubExecs#
int p2pMode#
int useDmaCopy#
int useRemoteRead#
int useFineGrain#
int sweepMin#
int sweepMax#
int sweepTestLimit#
int sweepTimeLimit#
int sweepXgmiMin#
int sweepXgmiMax#
int sweepSeed#
int sweepRandBytes#
std::string sweepSrc#
std::string sweepExe#
std::string sweepDst#
int a2aDirect#
int a2aMode#
int enableDebug#
int gpuMaxHwQueues#
ConfigModeEnum configMode#
std::default_random_engine *generator#
std::vector<int> numCpusPerNuma#
std::vector<int> wallClockPerDeviceMhz#
std::vector<std::set<int>> xccIdsPerDevice#

Public Static Functions

static inline void DisplayUsage()#
static inline int GetEnvVar(std::string const &varname, int defaultValue)#
static inline std::string GetEnvVar(std::string const &varname, std::string const &defaultValue)#
struct ExecutorInfo#
#include <TransferBench.hpp>

Public Members

std::vector<Transfer*> transfers#
size_t totalBytes#
int totalSubExecs#
SubExecParam *subExecParamGpu#
std::vector<hipStream_t> streams#
std::vector<hipEvent_t> startEvents#
std::vector<hipEvent_t> stopEvents#
double totalTime#
struct ExeResult#
#include <TransferBench.hpp>

Public Members

double bandwidthGbs#
double durationMsec#
double sumBandwidthGbs#
size_t totalBytes#
std::vector<int> transferIdx#
struct SubExecParam#
#include <Kernels.hpp>

Public Members

size_t N#
int numSrcs#
int numDsts#
float *src[MAX_SRCS]#
float *dst[MAX_DSTS]#
int32_t preferredXccId#
int teamSize#
int teamIdx#
long long startCycle#
long long stopCycle#
uint32_t hwId#
uint32_t xccId#
struct TestResults#
#include <TransferBench.hpp>

Public Members

size_t numTimedIterations#
size_t totalBytesTransferred#
double totalBandwidthCpu#
double totalDurationMsec#
double overheadMsec#
std::map<std::pair<ExeType, int>, ExeResult> exeResults#
struct Transfer#
#include <TransferBench.hpp>

Public Functions

void PrepareSubExecParams(EnvVars const &ev)#
bool PrepareSrc(EnvVars const &ev)#
void ValidateDst(EnvVars const &ev)#
void PrepareReference(EnvVars const &ev, std::vector<float> &buffer, int bufferIdx)#
std::string SrcToStr() const#
std::string DstToStr() const#

Public Members

ExeType exeType#
int exeIndex#
int exeSubIndex#
int numSubExecs#
size_t numBytes#
int numSrcs#
std::vector<MemType> srcType#
std::vector<int> srcIndex#
int numDsts#
std::vector<MemType> dstType#
std::vector<int> dstIndex#
size_t numBytesActual#
double transferTime#
double transferBandwidth#
double executorBandwidth#
std::vector<double> perIterationTime#
std::vector<std::set<std::pair<int, int>>> perIterationCUs#
int transferIndex#
std::vector<float*> srcMem#
std::vector<float*> dstMem#
std::vector<SubExecParam> subExecParam#
SubExecParam *subExecParamGpuPtr#
std::vector<int> subExecIdx#
hsa_agent_t dstAgent#
hsa_agent_t srcAgent#
hsa_signal_t signal#
hsa_amd_sdma_engine_id_t sdmaEngineId#
namespace std#

STL namespace.

file Compatibility.hpp
#include <hip/hip_ext.h>
#include <hip/hip_runtime.h>
#include <hsa/hsa_ext_amd.h>

Defines

HIP_CALL(cmd)#
file EnvVars.hpp
#include <algorithm>
#include <random>
#include <time.h>
#include "Compatibility.hpp"
#include "Kernels.hpp"

Defines

TB_VERSION#
PRINT_EV(NAME, VALUE, DESCRIPTION)#
PRINT_ES(NAME, VALUE, DESCRIPTION)#

Enums

enum ConfigModeEnum#

Values:

enumerator CFG_FILE#
enumerator CFG_P2P#
enumerator CFG_SWEEP#
enumerator CFG_SCALE#
enumerator CFG_A2A#
enumerator CFG_SCHMOO#
enumerator CFG_RWRITE#
enum BlockOrderEnum#

Values:

enumerator ORDER_SEQUENTIAL#
enumerator ORDER_INTERLEAVED#
enumerator ORDER_RANDOM#

Variables

char const MemTypeStr[]#
char const ExeTypeStr[]#
file GetClosestNumaNode.hpp

Defines

HSA_CHECK(cmd)#

Functions

hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data)#
hsa_status_t AgentInfoCallback(hsa_agent_t agent, void *data)#
AgentData &GetAgentData()#
int GetClosestNumaNode(int gpuIdx)#
file Kernels.hpp

Defines

PackedFloat_t#
MAX_BLOCKSIZE#
FLOATS_PER_PACK#
MEMSET_CHAR#
MEMSET_VAL#
MAX_WAVEGROUPS#
MAX_UNROLL#
NUM_WAVEORDERS#
MAX_SRCS#
MAX_DSTS#
GetHwId(hwId)#
GetXccId(val)#
GPU_KERNEL_UNROLL_DECL(BLOCKSIZE)#

Typedefs

typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int)#

Functions

void CpuReduceKernel(SubExecParam const &p)#
std::string PrepSrcValueString()#
__host__ __device__ float PrepSrcValue (int srcBufferIdx, size_t idx)
__global__ void CollectXccIdsKernel (int *xccIds)
__global__ void PrepSrcDataKernel (float *ptr, size_t N, int srcBufferIdx)
__device__ int64_t GetTimestamp ()
template<typename T> __device__ __forceinline__ T MemsetVal ()
template<> __device__ __forceinline__ float MemsetVal ()
template<int BLOCKSIZE, int UNROLL> __global__ void __launch_bounds__ (BLOCKSIZE) GpuReduceKernel(SubExecParam *params, int waveOrder, int numSubIterations)
if (threadIdx.x==0) startCycle
GetXccId(xccId)#
if (p.preferredXccId !=-1 &&xccId !=p.preferredXccId) return
for (int i=0;i< numSrcs;i++) srcFloat4[i]
switch (waveOrder)
while (1)
__syncthreads()#

Variables

int waveOrder
int numSubIterations
int64_t startCycle
SubExecParam &p = params[blockIdx.y]#
int32_t xccId#
int32_t const numSrcs = p.numSrcs#
int32_t const numDsts = p.numDsts#
float4 const  *__restrict__ srcFloat4 [MAX_SRCS]
float4 *__restrict__ dstFloat4 [MAX_DSTS]
int32_t const nTeams = p.teamSize#
int32_t const teamIdx = p.teamIdx#
int32_t const nWaves = BLOCKSIZE / warpSize#
int32_t const waveIdx = threadIdx.x / warpSize#
int32_t const tIdx = threadIdx.x % warpSize#
size_t const numFloat4 = p.N / 4#
int32_t teamStride#
int32_t waveStride#
int32_t unrlStride#
int32_t teamStride2#
int32_t waveStride2#
int subIterations = 0#
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] = {GPU_KERNEL_UNROLL_DECL(64), GPU_KERNEL_UNROLL_DECL(128), GPU_KERNEL_UNROLL_DECL(192), GPU_KERNEL_UNROLL_DECL(256), GPU_KERNEL_UNROLL_DECL(320), GPU_KERNEL_UNROLL_DECL(384), GPU_KERNEL_UNROLL_DECL(448),}#
file TransferBench.hpp
#include <vector>
#include <sstream>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstdint>
#include <set>
#include <unistd.h>
#include <map>
#include <iostream>
#include "Compatibility.hpp"
#include "EnvVars.hpp"

Defines

MAX_LINE_LEN#

Typedefs

typedef std::pair<ExeType, int> Executor#
typedef std::map<Executor, ExecutorInfo> TransferMap#

Enums

enum MemType#

Values:

enumerator MEM_CPU#
enumerator MEM_GPU#
enumerator MEM_CPU_FINE#
enumerator MEM_GPU_FINE#
enumerator MEM_CPU_UNPINNED#
enumerator MEM_NULL#
enumerator MEM_MANAGED#
enum ExeType#

Values:

enumerator EXE_CPU#
enumerator EXE_GPU_GFX#
enumerator EXE_GPU_DMA#

Functions

bool IsGpuType(MemType m)#
bool IsCpuType(MemType m)#
bool IsGpuType(ExeType e)#
bool IsCpuType(ExeType e)#
inline MemType CharToMemType(char const c)#
inline ExeType CharToExeType(char const c)#
void DisplayUsage(char const *cmdName)#
void DisplayTopology(bool const outputToCsv)#
void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor, std::vector<size_t> &valuesofN)#
void ParseMemType(EnvVars const &ev, std::string const &token, std::vector<MemType> &memType, std::vector<int> &memIndex)#
void ParseExeType(EnvVars const &ev, std::string const &token, ExeType &exeType, int &exeIndex, int &exeSubIndex)#
void ParseTransfers(EnvVars const &ev, char *line, std::vector<Transfer> &transfers)#
void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose = true, double *totalBandwidthCpu = nullptr)#
TestResults ExecuteTransfersImpl(EnvVars const &ev, std::vector<Transfer> &transfers)#
void ReportResults(EnvVars const &ev, std::vector<Transfer> const &transfers, TestResults const results)#
void EnablePeerAccess(int const deviceId, int const peerDeviceId)#
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void **memPtr)#
void DeallocateMemory(MemType memType, void *memPtr, size_t const size = 0)#
void CheckPages(char *byteArray, size_t numBytes, int targetId)#
void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)#
void RunPeerToPeerBenchmarks(EnvVars const &ev, size_t N)#
void RunScalingBenchmark(EnvVars const &ev, size_t N, int const exeIndex, int const maxSubExecs)#
void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom)#
void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)#
void RunSchmooBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)#
void RunRemoteWriteBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)#
void RunParallelCopyBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)#
void RunHealthCheck(EnvVars ev)#
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount)#
int RemappedIndex(int const origIdx, bool const isCpuType)#
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const &transfers)#
std::string PtrVectorToStr(std::vector<float*> const &strVector, int const initOffset)#

Variables

size_t const DEFAULT_BYTES_PER_TRANSFER = (1 << 26)#
char const MemTypeStr[8] = "CGBFUNM"
char const ExeTypeStr[4] = "CGD"
char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"}#
file TransferBench.cpp
#include <numa.h>
#include <cmath>
#include <numaif.h>
#include <random>
#include <stack>
#include <thread>
#include "TransferBench.hpp"
#include "GetClosestNumaNode.hpp"

Functions

int main(int argc, char **argv)#
void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose, double *totalBandwidthCpu)
TestResults ExecuteTransfersImpl(EnvVars const &ev, std::vector<Transfer> &transfers)
void DisplayUsage(char const *cmdName)
int RemappedIndex(int const origIdx, bool const isCpuType)
void DisplayTopology(bool const outputToCsv)
void ParseMemType(EnvVars const &ev, std::string const &token, std::vector<MemType> &memTypes, std::vector<int> &memIndices)
void ParseExeType(EnvVars const &ev, std::string const &token, ExeType &exeType, int &exeIndex, int &exeSubIndex)
void ParseTransfers(EnvVars const &ev, char *line, std::vector<Transfer> &transfers)
void EnablePeerAccess(int const deviceId, int const peerDeviceId)
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void **memPtr)
void DeallocateMemory(MemType memType, void *memPtr, size_t const bytes)
void CheckPages(char *array, size_t numBytes, int targetId)
uint32_t GetId(uint32_t hwId)#
void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)
void RunPeerToPeerBenchmarks(EnvVars const &ev, size_t N)
void RunScalingBenchmark(EnvVars const &ev, size_t N, int const exeIndex, int const maxSubExecs)
void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)
void RunSchmooBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)
void RunRemoteWriteBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
void RunParallelCopyBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const &transfers)
std::string PtrVectorToStr(std::vector<float*> const &strVector, int const initOffset)
void ReportResults(EnvVars const &ev, std::vector<Transfer> const &transfers, TestResults const results)
void RunHealthCheck(EnvVars ev)
dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.2.4/src/include
dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.2.4/src