API

API#

struct AgentData#

#include <GetClosestNumaNode.hpp>

Public Members

bool isInitialized#

std::vector<hsa_agent_t> cpuAgents#

std::vector<hsa_agent_t> gpuAgents#

std::vector<int> closestNumaNode#

class EnvVars#

#include <EnvVars.hpp>

Public Functions

inline EnvVars()#

inline void DisplayEnvVars() const#

inline void DisplayP2PBenchmarkEnvVars() const#

inline void DisplaySweepEnvVars() const#

inline void DisplayA2AEnvVars() const#

inline std::string GetCuMaskDesc() const#

Public Members

int const DEFAULT_NUM_WARMUPS = 3#

int const DEFAULT_NUM_ITERATIONS = 10#

int const DEFAULT_SAMPLING_FACTOR = 1#

int const DEFAULT_P2P_NUM_CPU_SE = 4#

std::string const DEFAULT_SWEEP_SRC = "CG"#

std::string const DEFAULT_SWEEP_EXE = "CDG"#

std::string const DEFAULT_SWEEP_DST = "CG"#

int const DEFAULT_SWEEP_MIN = 1#

int const DEFAULT_SWEEP_MAX = 24#

int const DEFAULT_SWEEP_TEST_LIMIT = 0#

int const DEFAULT_SWEEP_TIME_LIMIT = 0#

int alwaysValidate#

int blockSize#

int blockBytes#

int blockOrder#

int byteOffset#

int continueOnError#

int hideEnv#

int numCpuDevices#

int numGpuDevices#

int numIterations#

int numWarmups#

int outputToCsv#

int samplingFactor#

int sharedMemBytes#

int showIterations#

int useInteractive#

int usePcieIndexing#

int usePrepSrcKernel#

int useSingleStream#

int validateDirect#

std::vector<float> fillPattern#

std::vector<uint32_t> cuMask#

int numCpuSubExecs#

int numGpuSubExecs#

int p2pMode#

int useDmaCopy#

int useRemoteRead#

int useFineGrain#

int sweepMin#

int sweepMax#

int sweepTestLimit#

int sweepTimeLimit#

int sweepXgmiMin#

int sweepXgmiMax#

int sweepSeed#

int sweepRandBytes#

std::string sweepSrc#

std::string sweepExe#

std::string sweepDst#

int a2aDirect#

int enableDebug#

int gpuKernel#

ConfigModeEnum configMode#

std::default_random_engine *generator#

std::vector<int> numCpusPerNuma#

std::vector<int> wallClockPerDeviceMhz#

Public Static Functions

static inline void DisplayUsage()#

static inline int GetEnvVar(std::string const &varname, int defaultValue)#

static inline std::string GetEnvVar(std::string const &varname, std::string const &defaultValue)#

struct ExecutorInfo#

#include <TransferBench.hpp>

Public Members

std::vector<Transfer*> transfers#

size_t totalBytes#

int totalSubExecs#

SubExecParam *subExecParamGpu#

std::vector<hipStream_t> streams#

std::vector<hipEvent_t> startEvents#

std::vector<hipEvent_t> stopEvents#

double totalTime#

struct SubExecParam#

#include <Kernels.hpp>

Public Members

size_t N#

int numSrcs#

int numDsts#

float *src[MAX_SRCS]#

float *dst[MAX_DSTS]#

long long startCycle#

long long stopCycle#

uint32_t hwId#

uint32_t xccId#

struct Transfer#

#include <TransferBench.hpp>

Public Functions

void PrepareSubExecParams(EnvVars const &ev)#

bool PrepareSrc(EnvVars const &ev)#

void ValidateDst(EnvVars const &ev)#

void PrepareReference(EnvVars const &ev, std::vector<float> &buffer, int bufferIdx)#

std::string SrcToStr() const#

std::string DstToStr() const#

Public Members

int transferIndex#

ExeType exeType#

int exeIndex#

int numSubExecs#

size_t numBytes#

size_t numBytesActual#

double transferTime#

int numSrcs#

std::vector<MemType> srcType#

std::vector<int> srcIndex#

std::vector<float*> srcMem#

int numDsts#

std::vector<MemType> dstType#

std::vector<int> dstIndex#

std::vector<float*> dstMem#

std::vector<SubExecParam> subExecParam#

SubExecParam *subExecParamGpuPtr#

std::vector<int> subExecIdx#

std::vector<double> perIterationTime#

std::vector<std::set<std::pair<int, int>>> perIterationCUs#

namespace std#

STL namespace.

file Compatibility.hpp

#include <hip/hip_ext.h>

#include <hip/hip_runtime.h>

#include <hsa/hsa_ext_amd.h>

file EnvVars.hpp

#include <algorithm>

#include <random>

#include <time.h>

#include "Compatibility.hpp"

#include "Kernels.hpp"

Defines

TB_VERSION#

PRINT_EV(NAME, VALUE, DESCRIPTION)#

PRINT_ES(NAME, VALUE, DESCRIPTION)#

Enums

enum ConfigModeEnum#

Values:

enumerator CFG_FILE#

enumerator CFG_P2P#

enumerator CFG_SWEEP#

enumerator CFG_SCALE#

enumerator CFG_A2A#

enum BlockOrderEnum#

Values:

enumerator ORDER_SEQUENTIAL#

enumerator ORDER_INTERLEAVED#

enumerator ORDER_RANDOM#

Variables

char const MemTypeStr[]#

char const ExeTypeStr[]#

file GetClosestNumaNode.hpp

Defines

HSA_CHECK(cmd)#

Functions

hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data)#

hsa_status_t AgentInfoCallback(hsa_agent_t agent, void *data)#

AgentData &GetAgentData()#

int GetClosestNumaNode(int gpuIdx)#

file Kernels.hpp

Defines

PackedFloat_t#

WARP_SIZE#

MAX_BLOCKSIZE#

FLOATS_PER_PACK#

MEMSET_CHAR#

MEMSET_VAL#

MAX_SRCS#

MAX_DSTS#

__trace_hwreg()#

__trace_xccreg()#

NUM_GPU_KERNELS#

Typedefs

typedef void (*GpuKernelFuncPtr)(SubExecParam*)#

Functions

void CpuReduceKernel(SubExecParam const &p)#

std::string PrepSrcValueString()#

__host__ __device__ float PrepSrcValue (int srcBufferIdx, size_t idx)

__global__ void PrepSrcDataKernel (float *ptr, size_t N, int srcBufferIdx)

template<typename T> __device__ __forceinline__ T MemsetVal ()

template<> __device__ __forceinline__ float MemsetVal ()

template<int LOOP1_UNROLL> __global__ void __launch_bounds__ (MAX_BLOCKSIZE) GpuReduceKernel(SubExecParam *params)

template<typename FLOAT_TYPE, int UNROLL_FACTOR> __device__ size_t GpuReduceFuncImpl2 (SubExecParam const &p, size_t const offset, size_t const N)

template<typename FLOAT_TYPE, int UNROLL_FACTOR> __device__ size_t GpuReduceFuncImpl (SubExecParam const &p, size_t const offset, size_t const N)

template<typename FLOAT_TYPE> __device__ size_t GpuReduceFunc (SubExecParam const &p, size_t const offset, size_t const N, int const unroll)

Variables

GpuKernelFuncPtr GpuKernelTable[NUM_GPU_KERNELS] = {GpuReduceKernel<8>, GpuReduceKernel<1>, GpuReduceKernel<2>, GpuReduceKernel<3>, GpuReduceKernel<4>, GpuReduceKernel<5>, GpuReduceKernel<6>, GpuReduceKernel<7>, GpuReduceKernel<8>, GpuReduceKernel<9>, GpuReduceKernel<10>, GpuReduceKernel<11>, GpuReduceKernel<12>, GpuReduceKernel<13>, GpuReduceKernel<14>, GpuReduceKernel<15>, GpuReduceKernel<16>, GpuReduceKernel2}#

std::string GpuKernelNames[NUM_GPU_KERNELS] = {"Default - 8xUnroll", "Unroll x1", "Unroll x2", "Unroll x3", "Unroll x4", "Unroll x5", "Unroll x6", "Unroll x7", "Unroll x8", "Unroll x9", "Unroll x10", "Unroll x11", "Unroll x12", "Unroll x13", "Unroll x14", "Unroll x15", "Unroll x16", "8xUnrollB",}#

file TransferBench.hpp

#include <vector>

#include <sstream>

#include <chrono>

#include <cstdio>

#include <cstdlib>

#include <cstdint>

#include <set>

#include <unistd.h>

#include <map>

#include <iostream>

#include "Compatibility.hpp"

#include "EnvVars.hpp"

Defines

HIP_CALL(cmd)#

MAX_LINE_LEN#

Typedefs

typedef std::pair<ExeType, int> Executor#

typedef std::map<Executor, ExecutorInfo> TransferMap#

Enums

enum MemType#

Values:

enumerator MEM_CPU#

enumerator MEM_GPU#

enumerator MEM_CPU_FINE#

enumerator MEM_GPU_FINE#

enumerator MEM_CPU_UNPINNED#

enumerator MEM_NULL#

enum ExeType#

Values:

enumerator EXE_CPU#

enumerator EXE_GPU_GFX#

enumerator EXE_GPU_DMA#

Functions

bool IsGpuType(MemType m)#

bool IsCpuType(MemType m)#

bool IsGpuType(ExeType e)#

bool IsCpuType(ExeType e)#

inline MemType CharToMemType(char const c)#

inline ExeType CharToExeType(char const c)#

void DisplayUsage(char const *cmdName)#

void DisplayTopology(bool const outputToCsv)#

void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor, std::vector<size_t> &valuesofN)#

void ParseMemType(std::string const &token, int const numCpus, int const numGpus, std::vector<MemType> &memType, std::vector<int> &memIndex)#

void ParseExeType(std::string const &token, int const numCpus, int const numGpus, ExeType &exeType, int &exeIndex)#

void ParseTransfers(char *line, int numCpus, int numGpus, std::vector<Transfer> &transfers)#

void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose = true, double *totalBandwidthCpu = nullptr)#

void EnablePeerAccess(int const deviceId, int const peerDeviceId)#

void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void **memPtr)#

void DeallocateMemory(MemType memType, void *memPtr, size_t const size = 0)#

void CheckPages(char *byteArray, size_t numBytes, int targetId)#

void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)#

void RunPeerToPeerBenchmarks(EnvVars const &ev, size_t N)#

void RunScalingBenchmark(EnvVars const &ev, size_t N, int const exeIndex, int const maxSubExecs)#

void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom)#

void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)#

std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount)#

int RemappedIndex(int const origIdx, bool const isCpuType)#

void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const &transfers)#

std::string PtrVectorToStr(std::vector<float*> const &strVector, int const initOffset)#

Variables

size_t const DEFAULT_BYTES_PER_TRANSFER = (1 << 26)#

char const MemTypeStr[7] = "CGBFUN"

char const ExeTypeStr[4] = "CGD"

char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"}#

file TransferBench.cpp

#include <numa.h>

#include <cmath>

#include <numaif.h>

#include <random>

#include <stack>

#include <thread>

#include "TransferBench.hpp"

#include "GetClosestNumaNode.hpp"

Functions

int main(int argc, char **argv)#

void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose, double *totalBandwidthCpu)

void DisplayUsage(char const *cmdName)

int RemappedIndex(int const origIdx, bool const isCpuType)

void DisplayTopology(bool const outputToCsv)

void ParseMemType(std::string const &token, int const numCpus, int const numGpus, std::vector<MemType> &memTypes, std::vector<int> &memIndices)

void ParseExeType(std::string const &token, int const numCpus, int const numGpus, ExeType &exeType, int &exeIndex)

void ParseTransfers(char *line, int numCpus, int numGpus, std::vector<Transfer> &transfers)

void EnablePeerAccess(int const deviceId, int const peerDeviceId)

void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void **memPtr)

void DeallocateMemory(MemType memType, void *memPtr, size_t const bytes)

void CheckPages(char *array, size_t numBytes, int targetId)

uint32_t GetId(uint32_t hwId)#

void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)

void RunPeerToPeerBenchmarks(EnvVars const &ev, size_t N)

void RunScalingBenchmark(EnvVars const &ev, size_t N, int const exeIndex, int const maxSubExecs)

void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)

void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)

void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const &transfers)

std::string PtrVectorToStr(std::vector<float*> const &strVector, int const initOffset)

dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.0.0/src/include

dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.0.0/src

API

Contents

API#