TransferBench API library

TransferBench API library#

struct ConfigOptions#

#include <TransferBench.hpp>

Configuration options for performing Transfers

Public Members

GeneralOptions general#: General options.

DataOptions data#: Data options.

GfxOptions gfx#: GFX executor options.

DmaOptions dma#: DMA executor options.

NicOptions nic#: NIC executor options.

struct DataOptions#

#include <TransferBench.hpp>

Data options

Public Members

int alwaysValidate = 0#: Validate after each iteration instead of once at end.

int blockBytes = 256#: Each subexecutor works on a multiple of this many bytes.

int byteOffset = 0#: Byte-offset for memory allocations.

vector<float> fillPattern = {}#: Pattern of floats used to fill source data.

int validateDirect = 0#: Validate GPU results directly instead of copying to host.

int validateSource = 0#: Validate src GPU memory immediately after preparation.

struct DmaOptions#

#include <TransferBench.hpp>

DMA Executor options

Public Members

int useHipEvents = 1#: Use HIP events for timing DMA Executor.

int useHsaCopy = 0#: Use HSA copy instead of HIP copy to perform DMA.

struct ErrResult#

#include <TransferBench.hpp>

ErrResult consists of error type and error message

Public Functions

ErrResult() = default#

ErrResult(hipError_t err)#

ErrResult(hsa_status_t err)#

ErrResult(ErrType err)#

ErrResult(ErrType errType, const char *format, ...)#

Public Members

ErrType errType#: Error type.

std::string errMsg#: Error details.

struct ExeDevice#

#include <TransferBench.hpp>

An ExeDevice defines a specific Executor

Public Functions

inline bool operator<(ExeDevice const &other) const#

Public Members

ExeType exeType#: Executor type.

int32_t exeIndex#: Executor index.

struct ExeResult#

#include <TransferBench.hpp>

Results for a single Executor

Public Members

size_t numBytes#: Total bytes transferred by this Executor.

double avgDurationMsec#: Averaged duration for all the Transfers for this Executor.

double avgBandwidthGbPerSec#: Average bandwidth for this Executor.

double sumBandwidthGbPerSec#: Naive sum of individual Transfer average bandwidths.

vector<int> transferIdx#: Indices of Transfers this Executor executed.

struct GeneralOptions#

#include <TransferBench.hpp>

General options

Public Members

int numIterations = 10#: Number of timed iterations to perform. If negative, run for -numIterations seconds instead.

int numSubIterations = 1#: Number of sub-iterations per iteration.

int numWarmups = 3#: Number of un-timed warmup iterations to perform.

int recordPerIteration = 0#: Record per-iteration timing information.

int useInteractive = 0#: Pause for user-input before starting transfer loop.

struct GfxOptions#

#include <TransferBench.hpp>

GFX Executor options

Public Members

int blockOrder = 0#: Determines how threadblocks are ordered (0=sequential, 1=interleaved, 2=random)

int blockSize = 256#: Size of each threadblock (must be multiple of 64)

vector<uint32_t> cuMask = {}#: Bit-vector representing the CU mask.

vector<vector<int>> prefXccTable = {}#: 2D table with preferred XCD to use for a specific [src][dst] GPU device

int temporalMode = 0#: Non-temporal load/store mode (0=none, 1=load, 2=store, 3=both)

int unrollFactor = 4#: GFX-kernel unroll factor.

int useHipEvents = 1#: Use HIP events for timing GFX Executor.

int useMultiStream = 0#: Use multiple streams for GFX.

int useSingleTeam = 0#: Team all subExecutors across the data array.

int waveOrder = 0#: GFX-kernel wavefront ordering.

int wordSize = 4#: GFX-kernel packed data size (4=dwordx4, 2=dwordx2, 1=dwordx1)

struct MemDevice#

#include <TransferBench.hpp>

A MemDevice indicates a memory type on a specific device

Public Functions

inline bool operator<(MemDevice const &other) const#

Public Members

MemType memType#: Memory type.

int32_t memIndex#: Device index.

struct NicOptions#

#include <TransferBench.hpp>

NIC Executor options

Public Members

vector<int> closestNics = {}#: Overrides the auto-detected closest NIC per GPU.

int ibGidIndex = -1#: GID Index for RoCE NICs (-1 is auto)

uint8_t ibPort = 1#: NIC port number to be used.

int ipAddressFamily = 4#: 4=IPv4, 6=IPv6 (used for auto GID detection)

int maxRecvWorkReq = 16#: Maximum number of recv work requests per queue pair.

int maxSendWorkReq = 16#: Maximum number of send work requests per queue pair.

int queueSize = 100#: Completion queue size.

int roceVersion = 2#: RoCE version (used for auto GID detection)

int useRelaxedOrder = 1#: Use relaxed ordering.

int useNuma = 0#: Switch to closest numa thread for execution.

struct TestResults#

#include <TransferBench.hpp>

TestResults contain timing results for a set of Transfers as a group as well as per Executor and per Transfer timing information

Public Members

int numTimedIterations#: Number of iterations executed.

size_t totalBytesTransferred#: Total bytes transferred per iteration.

double avgTotalDurationMsec#: Wall-time (msec) to finish all Transfers (averaged across all timed iterations)

double avgTotalBandwidthGbPerSec#: Bandwidth based on all Transfers and average wall time.

double overheadMsec#: Difference between total wall time and slowest executor.

map<ExeDevice, ExeResult> exeResults#: Per Executor results.

vector<TransferResult> tfrResults#: Per Transfer results.

vector<ErrResult> errResults#: List of any errors/warnings that occurred.

struct Transfer#

#include <TransferBench.hpp>

A Transfer adds together data from zero or more sources then writes the sum to zero or more destinations

Public Members

size_t numBytes = 0#: Number of bytes to Transfer.

vector<MemDevice> srcs = {}#: List of source memory devices.

vector<MemDevice> dsts = {}#: List of destination memory devices.

ExeDevice exeDevice = {}#: Executor to use.

int32_t exeSubIndex = -1#: Executor subindex.

int numSubExecs = 0#: Number of subExecutors to use for this Transfer.

struct TransferResult#

#include <TransferBench.hpp>

Results for a single Transfer

Public Members

size_t numBytes#: Number of bytes transferred by this Transfer.

double avgDurationMsec#: Duration for this Transfer, averaged over all timed iterations.

double avgBandwidthGbPerSec#: Bandwidth for this Transfer based on averaged duration.

vector<double> perIterMsec#: Duration for each individual iteration.

vector<set<pair<int, int>>> perIterCUs#: GFX-Executor only. XCC:CU used per iteration.

ExeDevice exeDevice#: Tracks which executor performed this Transfer (e.g. for EXE_NIC_NEAREST)

ExeDevice exeDstDevice#: Tracks actual destination executor (only valid for EXE_NIC/EXE_NIC_NEAREST)

namespace std#: STL namespace.

namespace TransferBench#

Enums

enum ExeType#

Enumeration of supported Executor types

Note

The Executor is the device used to perform a Transfer

Values:

enumerator EXE_CPU#: CPU executor (subExecutor = CPU thread)

enumerator EXE_GPU_GFX#: GPU kernel-based executor (subExecutor = threadblock/CU)

enumerator EXE_GPU_DMA#: GPU SDMA executor (subExecutor = not supported)

enumerator EXE_NIC#: NIC RDMA executor (subExecutor = queue pair)

enumerator EXE_NIC_NEAREST#: NIC RDMA nearest executor (subExecutor = queue pair)

enum MemType#

Enumeration of supported memory types

Note

These are possible types of memory to be used as sources/destinations

Values:

enumerator MEM_CPU#: Coarse-grained pinned CPU memory.

enumerator MEM_GPU#: Coarse-grained global GPU memory.

enumerator MEM_CPU_FINE#: Fine-grained pinned CPU memory.

enumerator MEM_GPU_FINE#: Fine-grained global GPU memory.

enumerator MEM_CPU_UNPINNED#: Unpinned CPU memory.

enumerator MEM_NULL#: NULL memory - used for empty.

enumerator MEM_MANAGED#: Managed memory.

enumerator MEM_CPU_CLOSEST#: Coarse-grained pinned CPU memory indexed by closest GPU.

enum ErrType#

Enumeration of possible error types

Values:

enumerator ERR_NONE#: No errors.

enumerator ERR_WARN#: Warning - results may not be accurate.

enumerator ERR_FATAL#: Fatal error - results are invalid.

enum GidPriority#

Enumeration of GID priority

Note

These are the GID types ordered in priority from lowest (0) to highest

Values:

enumerator UNKNOWN#: Default.

enumerator ROCEV1_LINK_LOCAL#: RoCEv1 Link-local.

enumerator ROCEV2_LINK_LOCAL#: RoCEv2 Link-local fe80::/10.

enumerator ROCEV1_IPV6#: RoCEv1 IPv6.

enumerator ROCEV2_IPV6#: RoCEv2 IPv6.

enumerator ROCEV1_IPV4#: RoCEv1 IPv4-mapped IPv6.

enumerator ROCEV2_IPV4#: RoCEv2 IPv4-mapped IPv6 ::ffff:192.168.x.x.

enum IntAttribute#

Enumeration of implementation attributes

Values:

enumerator ATR_GFX_MAX_BLOCKSIZE#: Maximum blocksize for GFX executor.

enumerator ATR_GFX_MAX_UNROLL#: Maximum unroll factor for GFX executor.

enum StrAttribute#

Values:

enumerator ATR_SRC_PREP_DESCRIPTION#: Description of how source memory is prepared.

Functions

inline bool IsCpuExeType(ExeType e)#

inline bool IsGpuExeType(ExeType e)#

inline bool IsNicExeType(ExeType e)#

inline bool IsCpuMemType(MemType m)#

inline bool IsGpuMemType(MemType m)#

bool RunTransfers(ConfigOptions const &config, vector<Transfer> const &transfers, TestResults &results)#

Run a set of Transfers

Parameters:

config – [in] Configuration options
transfers – [in] Set of Transfers to execute
results – [out] Timing results

Returns:

true if and only if Transfers were run successfully without any fatal errors

int GetIntAttribute(IntAttribute attribute)#

Query attributes (integer)

Note

This allows querying of implementation information such as limits

Parameters:: attribute – [in] Attribute to query
Returns:: Value of the attribute

std::string GetStrAttribute(StrAttribute attribute)#

Query attributes (string)

Note

This allows querying of implementation details such as limits

Parameters:: attribute – [in] Attribute to query
Returns:: Value of the attribute

int GetNumExecutors(ExeType exeType)#

Returns information about number of available Executors

Parameters:: exeType – [in] Executor type to query
Returns:: Number of detected Executors of exeType

int GetNumExecutorSubIndices(ExeDevice exeDevice)#

Returns the number of possible Executor subindices

Note

For CPU, this is 0

Note

For GFX, this refers to the number of XCDs

Note

For DMA, this refers to the number of DMA engines

Parameters:: exeDevice – [in] The specific Executor to query
Returns:: Number of detected executor subindices

int GetNumSubExecutors(ExeDevice exeDevice)#

Returns number of subExecutors for a given ExeDevice

Parameters:: exeDevice – [in] The specific Executor to query
Returns:: Number of detected subExecutors for the given ExeDevice

int GetClosestCpuNumaToGpu(int gpuIndex)#

Returns the index of the NUMA node closest to the given GPU

Parameters:: gpuIndex – [in] Index of the GPU to query
Returns:: NUMA node index closest to GPU gpuIndex, or -1 if unable to detect

int GetClosestCpuNumaToNic(int nicIndex)#

Returns the index of the NUMA node closest to the given NIC

Parameters:: nicIndex – [in] Index of the NIC to query
Returns:: NUMA node index closest to the NIC nicIndex, or -1 if unable to detect

int GetClosestNicToGpu(int gpuIndex)#

Returns the index of the NIC closest to the given GPU

Note

This function is applicable when the IBV/RDMA executor is available

Parameters:: gpuIndex – [in] Index of the GPU to query
Returns:: IB Verbs capable NIC index closest to GPU gpuIndex, or -1 if unable to detect

ErrResult ParseTransfers(std::string str, std::vector<Transfer> &transfers)#

Helper function to parse a line containing Transfers into a vector of Transfers

Parameters:

str – [in] String containing description of Transfers
transfers – [out] List of Transfers described by ‘str’

Returns:

Information about any error that may have occurred

Variables

constexpr char VERSION[] = "1.63"#

char const ExeTypeStr[6] = "CGDIN"#

char const MemTypeStr[9] = "CGBFUNMP"#

const char *GidPriorityStr[] = {"RoCEv1 Link-local", "RoCEv2 Link-local", "RoCEv1 IPv6", "RoCEv2 IPv6", "RoCEv1 IPv4-mapped IPv6", "RoCEv2 IPv4-mapped IPv6"}#

file TransferBench.hpp

Defines

GetHwId(hwId)#

GetXccId(val)#

ERR_CHECK(cmd)#

ERR_APPEND(cmd, list)#

IBV_CALL(__func__, ...)#

IBV_PTR_CALL(__ptr__, __func__, ...)#

dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-7.0.1/src/header

dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-7.0.1/src

TransferBench API library

Contents

TransferBench API library#