This page contains proposed changes for a future release of ROCm. Read the latest Linux release of ROCm documentation for your production environments.

TransferBench API library

Contents

TransferBench API library#

struct ConfigOptions#
#include <TransferBench.hpp>

Configuration options for performing Transfers

Public Members

GeneralOptions general#

General options.

DataOptions data#

Data options.

GfxOptions gfx#

GFX executor options.

DmaOptions dma#

DMA executor options.

struct DataOptions#
#include <TransferBench.hpp>

Data options

Public Members

int alwaysValidate = 0#

Validate after each iteration instead of once at end.

int blockBytes = 256#

Each subexecutor works on a multiple of this many bytes.

int byteOffset = 0#

Byte-offset for memory allocations.

vector<float> fillPattern = {}#

Pattern of floats used to fill source data.

int validateDirect = 0#

Validate GPU results directly instead of copying to host.

int validateSource = 0#

Validate src GPU memory immediately after preparation.

struct DmaOptions#
#include <TransferBench.hpp>

DMA Executor options

Public Members

int useHipEvents = 1#

Use HIP events for timing DMA Executor.

int useHsaCopy = 0#

Use HSA copy instead of HIP copy to perform DMA.

struct ErrResult#
#include <TransferBench.hpp>

ErrResult consists of error type and error message

Public Functions

ErrResult() = default#
ErrResult(hipError_t err)#
ErrResult(hsa_status_t err)#
ErrResult(ErrType err)#
ErrResult(ErrType errType, const char *format, ...)#

Public Members

ErrType errType#

Error type.

std::string errMsg#

Error details.

struct ExeDevice#
#include <TransferBench.hpp>

An ExeDevice defines a specific Executor

Public Functions

inline bool operator<(ExeDevice const &other) const#

Public Members

ExeType exeType#

Executor type.

int32_t exeIndex#

Executor index.

struct ExeResult#
#include <TransferBench.hpp>

Results for a single Executor

Public Members

size_t numBytes#

Total bytes transferred by this Executor.

double avgDurationMsec#

Averaged duration for all the Transfers for this Executor.

double avgBandwidthGbPerSec#

Average bandwidth for this Executor.

double sumBandwidthGbPerSec#

Naive sum of individual Transfer average bandwidths.

vector<int> transferIdx#

Indices of Transfers this Executor executed.

struct GeneralOptions#
#include <TransferBench.hpp>

General options

Public Members

int numIterations = 10#

Number of timed iterations to perform. If negative, run for -numIterations seconds instead.

int numSubIterations = 1#

Number of sub-iterations per iteration.

int numWarmups = 3#

Number of un-timed warmup iterations to perform.

int recordPerIteration = 0#

Record per-iteration timing information.

int useInteractive = 0#

Pause for user-input before starting transfer loop.

struct GfxOptions#
#include <TransferBench.hpp>

GFX Executor options

Public Members

int blockSize = 256#

Size of each threadblock (must be multiple of 64)

vector<uint32_t> cuMask = {}#

Bit-vector representing the CU mask.

vector<vector<int>> prefXccTable = {}#

2D table with preferred XCD to use for a specific [src][dst] GPU device

int unrollFactor = 4#

GFX-kernel unroll factor.

int useHipEvents = 1#

Use HIP events for timing GFX Executor.

int useMultiStream = 0#

Use multiple streams for GFX.

int useSingleTeam = 0#

Team all subExecutors across the data array.

int waveOrder = 0#

GFX-kernel wavefront ordering.

struct MemDevice#
#include <TransferBench.hpp>

A MemDevice indicates a memory type on a specific device

Public Functions

inline bool operator<(MemDevice const &other) const#

Public Members

MemType memType#

Memory type.

int32_t memIndex#

Device index.

struct TestResults#
#include <TransferBench.hpp>

TestResults contain timing results for a set of Transfers as a group as well as per Executor and per Transfer timing information

Public Members

int numTimedIterations#

Number of iterations executed.

size_t totalBytesTransferred#

Total bytes transferred per iteration.

double avgTotalDurationMsec#

Wall-time (msec) to finish all Transfers (averaged across all timed iterations)

double avgTotalBandwidthGbPerSec#

Bandwidth based on all Transfers and average wall time.

double overheadMsec#

Difference between total wall time and slowest executor.

map<ExeDevice, ExeResult> exeResults#

Per Executor results.

vector<TransferResult> tfrResults#

Per Transfer results.

vector<ErrResult> errResults#

List of any errors/warnings that occurred.

struct Transfer#
#include <TransferBench.hpp>

A Transfer adds together data from zero or more sources then writes the sum to zero or more destinations

Public Members

size_t numBytes = (1 << 26)#

Number of bytes to Transfer.

vector<MemDevice> srcs = {}#

List of source memory devices.

vector<MemDevice> dsts = {}#

List of destination memory devices.

ExeDevice exeDevice = {}#

Executor to use.

int32_t exeDstIndex = -1#

Destination executor index (for RDMA executor only)

int32_t exeSubIndex = -1#

Executor subindex.

int numSubExecs = 0#

Number of subExecutors to use for this Transfer.

struct TransferResult#
#include <TransferBench.hpp>

Results for a single Transfer

Public Members

size_t numBytes#

Number of bytes transferred by this Transfer.

double avgDurationMsec#

Duration for this Transfer, averaged over all timed iterations.

double avgBandwidthGbPerSec#

Bandwidth for this Transfer based on averaged duration.

vector<double> perIterMsec#

Duration for each individual iteration.

vector<set<pair<int, int>>> perIterCUs#

GFX-Executor only. XCC:CU used per iteration.

namespace std#

STL namespace.

namespace TransferBench#

Enums

enum ExeType#

Enumeration of supported Executor types

Note

The Executor is the device used to perform a Transfer

Note

IBVerbs executor is not yet implemented

Values:

enumerator EXE_CPU#

CPU executor (subExecutor = CPU thread)

enumerator EXE_GPU_GFX#

GPU kernel-based executor (subExecutor = threadblock/CU)

enumerator EXE_GPU_DMA#

GPU SDMA executor (subExecutor = not supported)

enumerator EXE_IBV#

IBVerbs executor (subExecutor = queue pair)

enum MemType#

Enumeration of supported memory types

Note

These are possible types of memory to be used as sources/destinations

Values:

enumerator MEM_CPU#

Coarse-grained pinned CPU memory.

enumerator MEM_GPU#

Coarse-grained global GPU memory.

enumerator MEM_CPU_FINE#

Fine-grained pinned CPU memory.

enumerator MEM_GPU_FINE#

Fine-grained global GPU memory.

enumerator MEM_CPU_UNPINNED#

Unpinned CPU memory.

enumerator MEM_NULL#

NULL memory - used for empty.

enumerator MEM_MANAGED#

Managed memory.

enum ErrType#

Enumeration of possible error types

Values:

enumerator ERR_NONE#

No errors.

enumerator ERR_WARN#

Warning - results may not be accurate.

enumerator ERR_FATAL#

Fatal error - results are invalid.

enum IntAttribute#

Enumeration of implementation attributes

Values:

enumerator ATR_GFX_MAX_BLOCKSIZE#

Maximum blocksize for GFX executor.

enumerator ATR_GFX_MAX_UNROLL#

Maximum unroll factor for GFX executor.

enum StrAttribute#

Values:

enumerator ATR_SRC_PREP_DESCRIPTION#

Description of how source memory is prepared.

Functions

inline bool IsCpuExeType(ExeType e)#
inline bool IsGpuExeType(ExeType e)#
inline bool IsCpuMemType(MemType m)#
inline bool IsGpuMemType(MemType m)#
bool RunTransfers(ConfigOptions const &config, vector<Transfer> const &transfers, TestResults &results)#

Run a set of Transfers

Parameters:
  • config[in] Configuration options

  • transfers[in] Set of Transfers to execute

  • results[out] Timing results

Returns:

true if and only if Transfers were run successfully without any fatal errors

int GetIntAttribute(IntAttribute attribute)#

Query attributes (integer)

Note

This allows querying of implementation information such as limits

Parameters:

attribute[in] Attribute to query

Returns:

Value of the attribute

std::string GetStrAttribute(StrAttribute attribute)#

Query attributes (string)

Note

This allows querying of implementation details such as limits

Parameters:

attribute[in] Attribute to query

Returns:

Value of the attribute

int GetNumExecutors(ExeType exeType)#

Returns information about number of available Executors

Parameters:

exeType[in] Executor type to query

Returns:

Number of detected Executors of exeType

int GetNumExecutorSubIndices(ExeDevice exeDevice)#

Returns the number of possible Executor subindices

Note

For CPU, this is 0

Note

For GFX, this refers to the number of XCDs

Note

For DMA, this refers to the number of DMA engines

Parameters:

exeDevice[in] The specific Executor to query

Returns:

Number of detected executor subindices

int GetNumSubExecutors(ExeDevice exeDevice)#

Returns number of subExecutors for a given ExeDevice

Parameters:

exeDevice[in] The specific Executor to query

Returns:

Number of detected subExecutors for the given ExeDevice

int GetClosestCpuNumaToGpu(int gpuIndex)#

Returns the index of the NUMA node closest to the given GPU

Parameters:

gpuIndex[in] Index of the GPU to query

Returns:

NUMA node index closest to GPU gpuIndex, or -1 if unable to detect

ErrResult ParseTransfers(std::string str, std::vector<Transfer> &transfers)#

Helper function to parse a line containing Transfers into a vector of Transfers

Parameters:
  • str[in] String containing description of Transfers

  • transfers[out] List of Transfers described by ‘str’

Returns:

Information about any error that may have occurred

Variables

constexpr char VERSION[] = "1.58"#
char const ExeTypeStr[5] = "CGDI"#
char const MemTypeStr[8] = "CGBFUNM"#
file TransferBench.hpp

Defines

GetHwId(hwId)#
GetXccId(val)#
ERR_CHECK(cmd)#
ERR_APPEND(cmd, list)#
dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/develop/src/header
dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/develop/src