TransferBench API library#
- 
struct ConfigOptions#
- #include <TransferBench.hpp>Configuration options for performing Transfers Public Members - 
GeneralOptions general#
- General options. 
 - 
DataOptions data#
- Data options. 
 - 
GfxOptions gfx#
- GFX executor options. 
 - 
DmaOptions dma#
- DMA executor options. 
 - 
NicOptions nic#
- NIC executor options. 
 
- 
GeneralOptions general#
- 
struct DataOptions#
- #include <TransferBench.hpp>Data options Public Members - 
int alwaysValidate = 0#
- Validate after each iteration instead of once at end. 
 - 
int blockBytes = 256#
- Each subexecutor works on a multiple of this many bytes. 
 - 
int byteOffset = 0#
- Byte-offset for memory allocations. 
 - 
vector<float> fillPattern = {}#
- Pattern of floats used to fill source data. 
 - 
int validateDirect = 0#
- Validate GPU results directly instead of copying to host. 
 - 
int validateSource = 0#
- Validate src GPU memory immediately after preparation. 
 
- 
int alwaysValidate = 0#
- 
struct DmaOptions#
- #include <TransferBench.hpp>DMA Executor options 
- 
struct ErrResult#
- #include <TransferBench.hpp>ErrResult consists of error type and error message 
- 
struct ExeDevice#
- #include <TransferBench.hpp>An ExeDevice defines a specific Executor 
- 
struct ExeResult#
- #include <TransferBench.hpp>Results for a single Executor 
- 
struct GeneralOptions#
- #include <TransferBench.hpp>General options Public Members - 
int numIterations = 10#
- Number of timed iterations to perform. If negative, run for -numIterations seconds instead. 
 - 
int numSubIterations = 1#
- Number of sub-iterations per iteration. 
 - 
int numWarmups = 3#
- Number of un-timed warmup iterations to perform. 
 - 
int recordPerIteration = 0#
- Record per-iteration timing information. 
 - 
int useInteractive = 0#
- Pause for user-input before starting transfer loop. 
 
- 
int numIterations = 10#
- 
struct GfxOptions#
- #include <TransferBench.hpp>GFX Executor options Public Members - 
int blockOrder = 0#
- Determines how threadblocks are ordered (0=sequential, 1=interleaved, 2=random) 
 - 
int blockSize = 256#
- Size of each threadblock (must be multiple of 64) 
 - 
vector<uint32_t> cuMask = {}#
- Bit-vector representing the CU mask. 
 - 
vector<vector<int>> prefXccTable = {}#
- 2D table with preferred XCD to use for a specific [src][dst] GPU device 
 - 
int temporalMode = 0#
- Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both. 
 - 
int unrollFactor = 4#
- GFX-kernel unroll factor. 
 - 
int useHipEvents = 1#
- Use HIP events for timing GFX Executor. 
 - 
int useMultiStream = 0#
- Use multiple streams for GFX. 
 - 
int useSingleTeam = 0#
- Team all subExecutors across the data array. 
 - 
int waveOrder = 0#
- GFX-kernel wavefront ordering. 
 - 
int wordSize = 4#
- GFX-kernel packed data size (4=dwordx4, 2=dwordx2, 1=dwordx1) 
 
- 
int blockOrder = 0#
- 
struct MemDevice#
- #include <TransferBench.hpp>A MemDevice indicates a memory type on a specific device 
- 
struct NicOptions#
- #include <TransferBench.hpp>NIC Executor options Public Members - 
vector<int> closestNics = {}#
- Overrides the auto-detected closest NIC per GPU. 
 - 
int ibGidIndex = -1#
- GID Index for RoCE NICs (-1 is auto) 
 - 
uint8_t ibPort = 1#
- NIC port number to be used. 
 - 
int ipAddressFamily = 4#
- 4=IPv4, 6=IPv6 (used for auto GID detection) 
 - 
int maxRecvWorkReq = 16#
- Maximum number of recv work requests per queue pair. 
 - 
int maxSendWorkReq = 16#
- Maximum number of send work requests per queue pair. 
 - 
int queueSize = 100#
- Completion queue size. 
 - 
int roceVersion = 2#
- RoCE version (used for auto GID detection) 
 - 
int useRelaxedOrder = 1#
- Use relaxed ordering. 
 - 
int useNuma = 0#
- Switch to closest numa thread for execution. 
 
- 
vector<int> closestNics = {}#
- 
struct TestResults#
- #include <TransferBench.hpp>TestResults contain timing results for a set of Transfers as a group as well as per Executor and per Transfer timing information Public Members - 
int numTimedIterations#
- Number of iterations executed. 
 - 
size_t totalBytesTransferred#
- Total bytes transferred per iteration. 
 - 
double avgTotalDurationMsec#
- Wall-time (msec) to finish all Transfers (averaged across all timed iterations) 
 - 
double avgTotalBandwidthGbPerSec#
- Bandwidth based on all Transfers and average wall time. 
 - 
double overheadMsec#
- Difference between total wall time and slowest executor. 
 - 
vector<TransferResult> tfrResults#
- Per Transfer results. 
 
- 
int numTimedIterations#
- 
struct Transfer#
- #include <TransferBench.hpp>A Transfer adds together data from zero or more sources then writes the sum to zero or more destinations Public Members - 
int32_t exeSubIndex = -1#
- Executor subindex. 
 
- 
int32_t exeSubIndex = -1#
- 
struct TransferResult#
- #include <TransferBench.hpp>Results for a single Transfer 
- 
namespace std#
- STL namespace. 
- 
namespace TransferBench#
- Enums - 
enum ExeType#
- Enumeration of supported Executor types - Note - The Executor is the device used to perform a Transfer - Values: - 
enumerator EXE_CPU#
- CPU executor (subExecutor = CPU thread) 
 - 
enumerator EXE_GPU_GFX#
- GPU kernel-based executor (subExecutor = threadblock/CU) 
 - 
enumerator EXE_GPU_DMA#
- GPU SDMA executor (subExecutor = not supported) 
 - 
enumerator EXE_NIC#
- NIC RDMA executor (subExecutor = queue pair) 
 - 
enumerator EXE_NIC_NEAREST#
- NIC RDMA nearest executor (subExecutor = queue pair) 
 
- 
enumerator EXE_CPU#
 - 
enum MemType#
- Enumeration of supported memory types - Note - These are possible types of memory to be used as sources/destinations - Values: - 
enumerator MEM_CPU#
- Coarse-grained pinned CPU memory. 
 - 
enumerator MEM_GPU#
- Coarse-grained global GPU memory. 
 - 
enumerator MEM_CPU_FINE#
- Fine-grained pinned CPU memory. 
 - 
enumerator MEM_GPU_FINE#
- Fine-grained global GPU memory. 
 - 
enumerator MEM_CPU_UNPINNED#
- Unpinned CPU memory. 
 - 
enumerator MEM_NULL#
- NULL memory - used for empty. 
 - 
enumerator MEM_MANAGED#
- Managed memory. 
 - 
enumerator MEM_CPU_CLOSEST#
- Coarse-grained pinned CPU memory indexed by closest GPU. 
 
- 
enumerator MEM_CPU#
 - 
enum ErrType#
- Enumeration of possible error types - Values: - 
enumerator ERR_NONE#
- No errors. 
 - 
enumerator ERR_WARN#
- Warning - results may not be accurate. 
 - 
enumerator ERR_FATAL#
- Fatal error - results are invalid. 
 
- 
enumerator ERR_NONE#
 - 
enum GidPriority#
- Enumeration of GID priority - Note - These are the GID types ordered in priority from lowest (0) to highest - Values: - 
enumerator UNKNOWN#
- Default. 
 - 
enumerator ROCEV1_LINK_LOCAL#
- RoCEv1 Link-local. 
 - 
enumerator ROCEV2_LINK_LOCAL#
- RoCEv2 Link-local fe80::/10. 
 - 
enumerator ROCEV1_IPV6#
- RoCEv1 IPv6. 
 - 
enumerator ROCEV2_IPV6#
- RoCEv2 IPv6. 
 - 
enumerator ROCEV1_IPV4#
- RoCEv1 IPv4-mapped IPv6. 
 - 
enumerator ROCEV2_IPV4#
- RoCEv2 IPv4-mapped IPv6 ::ffff:192.168.x.x. 
 
- 
enumerator UNKNOWN#
 - Functions - 
bool RunTransfers(ConfigOptions const &config, vector<Transfer> const &transfers, TestResults &results)#
- Run a set of Transfers - Parameters:
- config – [in] Configuration options 
- transfers – [in] Set of Transfers to execute 
- results – [out] Timing results 
 
- Returns:
- true if and only if Transfers were run successfully without any fatal errors 
 
 - 
int GetIntAttribute(IntAttribute attribute)#
- Query attributes (integer) - Note - This allows querying of implementation information such as limits - Parameters:
- attribute – [in] Attribute to query 
- Returns:
- Value of the attribute 
 
 - 
std::string GetStrAttribute(StrAttribute attribute)#
- Query attributes (string) - Note - This allows querying of implementation details such as limits - Parameters:
- attribute – [in] Attribute to query 
- Returns:
- Value of the attribute 
 
 - 
int GetNumExecutors(ExeType exeType)#
- Returns information about the number of available Executors - Parameters:
- exeType – [in] Executor type to query 
- Returns:
- Number of detected Executors of exeType 
 
 - 
int GetNumExecutorSubIndices(ExeDevice exeDevice)#
- Returns the number of possible Executor subindices - Note - For CPU, this is 0 - Note - For GFX, this refers to the number of XCDs - Note - For DMA, this refers to the number of DMA engines - Parameters:
- exeDevice – [in] The specific Executor to query 
- Returns:
- Number of detected executor subindices 
 
 - 
int GetNumSubExecutors(ExeDevice exeDevice)#
- Returns number of subExecutors for a given ExeDevice - Parameters:
- exeDevice – [in] The specific Executor to query 
- Returns:
- Number of detected subExecutors for the given ExeDevice 
 
 - 
int GetClosestCpuNumaToGpu(int gpuIndex)#
- Returns the index of the NUMA node closest to the given GPU - Parameters:
- gpuIndex – [in] Index of the GPU to query 
- Returns:
- NUMA node index closest to GPU gpuIndex, or -1 if unable to detect 
 
 - 
int GetClosestCpuNumaToNic(int nicIndex)#
- Returns the index of the NUMA node closest to the given NIC - Parameters:
- nicIndex – [in] Index of the NIC to query 
- Returns:
- NUMA node index closest to the NIC nicIndex, or -1 if unable to detect 
 
 - 
int GetClosestNicToGpu(int gpuIndex)#
- Returns the index of the NIC closest to the given GPU - Note - This function is applicable when the IBV/RDMA executor is available - Parameters:
- gpuIndex – [in] Index of the GPU to query 
- Returns:
- IB Verbs capable NIC index closest to GPU gpuIndex, or -1 if unable to detect 
 
 - 
ErrResult ParseTransfers(std::string str, std::vector<Transfer> &transfers)#
- Helper function to parse a line containing Transfers into a vector of Transfers - Parameters:
- str – [in] String containing description of Transfers 
- transfers – [out] List of Transfers described by ‘str’ 
 
- Returns:
- Information about any error that may have occurred 
 
 
- 
enum ExeType#
- file TransferBench.hpp
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-7.0.1/src/header
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-7.0.1/src