API
-
struct AgentData
- #include <GetClosestNumaNode.hpp>
Public Members
-
bool isInitialized
-
std::vector<hsa_agent_t> cpuAgents
-
std::vector<hsa_agent_t> gpuAgents
-
std::vector<int> closestNumaNode
-
bool isInitialized
-
class EnvVars
- #include <EnvVars.hpp>
Public Functions
-
inline EnvVars()
-
inline void DisplayEnvVars() const
-
inline void DisplayP2PBenchmarkEnvVars() const
-
inline void DisplaySweepEnvVars() const
-
inline void DisplayA2AEnvVars() const
-
inline void DisplaySchmooEnvVars() const
-
inline void DisplayRemoteWriteEnvVars() const
-
inline void DisplayParallelCopyEnvVars() const
-
inline std::string GetCuMaskDesc() const
Public Members
-
int const DEFAULT_NUM_WARMUPS = 3
-
int const DEFAULT_NUM_ITERATIONS = 10
-
int const DEFAULT_SAMPLING_FACTOR = 1
-
int const DEFAULT_P2P_NUM_CPU_SE = 4
-
std::string const DEFAULT_SWEEP_SRC = "CG"
-
std::string const DEFAULT_SWEEP_EXE = "CDG"
-
std::string const DEFAULT_SWEEP_DST = "CG"
-
int const DEFAULT_SWEEP_MIN = 1
-
int const DEFAULT_SWEEP_MAX = 24
-
int const DEFAULT_SWEEP_TEST_LIMIT = 0
-
int const DEFAULT_SWEEP_TIME_LIMIT = 0
-
int alwaysValidate
-
int blockBytes
-
int blockOrder
-
int byteOffset
-
int continueOnError
-
int gfxBlockSize
-
int gfxSingleTeam
-
int gfxUnroll
-
int gfxWaveOrder
-
int hideEnv
-
int numCpuDevices
-
int numGpuDevices
-
int numIterations
-
int numWarmups
-
int outputToCsv
-
int samplingFactor
-
int showIterations
-
int useInteractive
-
int usePcieIndexing
-
int usePrepSrcKernel
-
int useSingleStream
-
int useXccFilter
-
int validateDirect
-
std::vector<float> fillPattern
-
std::vector<uint32_t> cuMask
-
int numCpuSubExecs
-
int numGpuSubExecs
-
int p2pMode
-
int useDmaCopy
-
int useRemoteRead
-
int useFineGrain
-
int sweepMin
-
int sweepMax
-
int sweepTestLimit
-
int sweepTimeLimit
-
int sweepXgmiMin
-
int sweepXgmiMax
-
int sweepSeed
-
int sweepRandBytes
-
std::string sweepSrc
-
std::string sweepExe
-
std::string sweepDst
-
int a2aDirect
-
int a2aMode
-
int enableDebug
-
int gpuMaxHwQueues
-
ConfigModeEnum configMode
-
std::default_random_engine *generator
-
std::vector<int> numCpusPerNuma
-
std::vector<int> wallClockPerDeviceMhz
-
inline EnvVars()
-
struct ExecutorInfo
- #include <TransferBench.hpp>
-
struct SubExecParam
- #include <Kernels.hpp>
Public Members
-
size_t N
-
int numSrcs
-
int numDsts
-
float *src[MAX_SRCS]
-
float *dst[MAX_DSTS]
-
int32_t preferredXccId
-
int teamSize
-
int teamIdx
-
long long startCycle
-
long long stopCycle
-
uint32_t hwId
-
uint32_t xccId
-
size_t N
-
struct Transfer
- #include <TransferBench.hpp>
Public Functions
-
void PrepareSubExecParams(EnvVars const &ev)
-
bool PrepareSrc(EnvVars const &ev)
-
void ValidateDst(EnvVars const &ev)
-
std::string SrcToStr() const
-
std::string DstToStr() const
Public Members
-
ExeType exeType
-
int exeIndex
-
int exeSubIndex
-
int numSubExecs
-
size_t numBytes
-
int numSrcs
-
std::vector<int> srcIndex
-
int numDsts
-
std::vector<int> dstIndex
-
size_t numBytesActual
-
double transferTime
-
double transferBandwidth
-
double executorBandwidth
-
std::vector<double> perIterationTime
-
int transferIndex
-
std::vector<float*> srcMem
-
std::vector<float*> dstMem
-
std::vector<SubExecParam> subExecParam
-
SubExecParam *subExecParamGpuPtr
-
std::vector<int> subExecIdx
-
hsa_agent_t dstAgent
-
hsa_agent_t srcAgent
-
hsa_signal_t signal
-
hsa_amd_sdma_engine_id_t sdmaEngineId
-
void PrepareSubExecParams(EnvVars const &ev)
-
namespace std
STL namespace.
- file Compatibility.hpp
- #include <hip/hip_ext.h> #include <hip/hip_runtime.h> #include <hsa/hsa_ext_amd.h>
Defines
-
HIP_CALL(cmd)
-
HIP_CALL(cmd)
- file EnvVars.hpp
- #include <algorithm> #include <random> #include <time.h> #include "Compatibility.hpp" #include "Kernels.hpp"
Defines
-
TB_VERSION
-
PRINT_EV(NAME, VALUE, DESCRIPTION)
-
PRINT_ES(NAME, VALUE, DESCRIPTION)
Enums
-
enum ConfigModeEnum
Values:
-
enumerator CFG_FILE
-
enumerator CFG_P2P
-
enumerator CFG_SWEEP
-
enumerator CFG_SCALE
-
enumerator CFG_A2A
-
enumerator CFG_SCHMOO
-
enumerator CFG_RWRITE
-
enumerator CFG_FILE
-
enum BlockOrderEnum
Values:
-
enumerator ORDER_SEQUENTIAL
-
enumerator ORDER_INTERLEAVED
-
enumerator ORDER_RANDOM
-
enumerator ORDER_SEQUENTIAL
Variables
-
char const MemTypeStr[]
-
char const ExeTypeStr[]
-
TB_VERSION
- file GetClosestNumaNode.hpp
Defines
-
HSA_CHECK(cmd)
Functions
-
hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data)
-
hsa_status_t AgentInfoCallback(hsa_agent_t agent, void *data)
-
AgentData &GetAgentData()
-
int GetClosestNumaNode(int gpuIdx)
-
HSA_CHECK(cmd)
- file Kernels.hpp
Defines
-
PackedFloat_t
-
MAX_BLOCKSIZE
-
FLOATS_PER_PACK
-
MEMSET_CHAR
-
MEMSET_VAL
-
MAX_WAVEGROUPS
-
MAX_UNROLL
-
NUM_WAVEORDERS
-
MAX_SRCS
-
MAX_DSTS
-
GetHwId(hwId)
-
GetXccId(val)
-
GPU_KERNEL_UNROLL_DECL(BLOCKSIZE)
Typedefs
-
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int)
Functions
-
void CpuReduceKernel(SubExecParam const &p)
-
std::string PrepSrcValueString()
- __host__ __device__ float PrepSrcValue (int srcBufferIdx, size_t idx)
- __global__ void CollectXccIdsKernel (int *xccIds)
- __global__ void PrepSrcDataKernel (float *ptr, size_t N, int srcBufferIdx)
- __device__ int64_t GetTimestamp ()
- template<typename T> __device__ __forceinline__ T MemsetVal ()
- template<> __device__ __forceinline__ float MemsetVal ()
- template<int BLOCKSIZE, int UNROLL> __global__ void __launch_bounds__ (BLOCKSIZE) GpuReduceKernel(SubExecParam *params, int waveOrder)
- if (threadIdx.x==0) startCycle
-
GetXccId(xccId)
- if (p.preferredXccId !=-1 &&xccId !=p.preferredXccId) return
- for (int i=0;i< numSrcs;i++) srcFloat4[i]
- switch (waveOrder)
- if (numSrcs==0)
Variables
- int64_t startCycle
-
SubExecParam &p = params[blockIdx.y]
-
int32_t xccId
-
int32_t const numSrcs = p.numSrcs
-
int32_t const numDsts = p.numDsts
- float4 const *__restrict__ srcFloat4 [MAX_SRCS]
- float4 *__restrict__ dstFloat4 [MAX_DSTS]
-
int32_t const nTeams = p.teamSize
-
int32_t const teamIdx = p.teamIdx
-
int32_t const nWaves = BLOCKSIZE / warpSize
-
int32_t const waveIdx = threadIdx.x / warpSize
-
int32_t const tIdx = threadIdx.x % warpSize
-
size_t const numFloat4 = p.N / 4
-
int32_t teamStride
-
int32_t waveStride
-
int32_t unrlStride
-
int32_t teamStride2
-
int32_t waveStride2
-
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride
-
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] = {GPU_KERNEL_UNROLL_DECL(64), GPU_KERNEL_UNROLL_DECL(128), GPU_KERNEL_UNROLL_DECL(192), GPU_KERNEL_UNROLL_DECL(256), GPU_KERNEL_UNROLL_DECL(320), GPU_KERNEL_UNROLL_DECL(384), GPU_KERNEL_UNROLL_DECL(448),}
-
PackedFloat_t
- file TransferBench.hpp
- #include <vector> #include <sstream> #include <chrono> #include <cstdio> #include <cstdlib> #include <cstdint> #include <set> #include <unistd.h> #include <map> #include <iostream> #include "Compatibility.hpp" #include "EnvVars.hpp"
Defines
-
MAX_LINE_LEN
Typedefs
-
typedef std::map<Executor, ExecutorInfo> TransferMap
Enums
-
enum MemType
Values:
-
enumerator MEM_CPU
-
enumerator MEM_GPU
-
enumerator MEM_CPU_FINE
-
enumerator MEM_GPU_FINE
-
enumerator MEM_CPU_UNPINNED
-
enumerator MEM_NULL
-
enumerator MEM_MANAGED
-
enumerator MEM_CPU
-
enum ExeType
Values:
-
enumerator EXE_CPU
-
enumerator EXE_GPU_GFX
-
enumerator EXE_GPU_DMA
-
enumerator EXE_CPU
Functions
-
bool IsGpuType(MemType m)
-
bool IsCpuType(MemType m)
-
bool IsGpuType(ExeType e)
-
bool IsCpuType(ExeType e)
-
inline MemType CharToMemType(char const c)
-
inline ExeType CharToExeType(char const c)
-
void DisplayUsage(char const *cmdName)
-
void DisplayTopology(bool const outputToCsv)
-
void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor, std::vector<size_t> &valuesofN)
-
void ParseMemType(EnvVars const &ev, std::string const &token, std::vector<MemType> &memType, std::vector<int> &memIndex)
-
void ParseExeType(EnvVars const &ev, std::string const &token, ExeType &exeType, int &exeIndex, int &exeSubIndex)
-
void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose = true, double *totalBandwidthCpu = nullptr)
-
void EnablePeerAccess(int const deviceId, int const peerDeviceId)
-
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void **memPtr)
-
void DeallocateMemory(MemType memType, void *memPtr, size_t const size = 0)
-
void CheckPages(char *byteArray, size_t numBytes, int targetId)
-
void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)
-
void RunPeerToPeerBenchmarks(EnvVars const &ev, size_t N)
-
void RunScalingBenchmark(EnvVars const &ev, size_t N, int const exeIndex, int const maxSubExecs)
-
void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom)
-
void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)
-
void RunSchmooBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)
-
void RunRemoteWriteBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
-
void RunParallelCopyBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
-
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount)
-
int RemappedIndex(int const origIdx, bool const isCpuType)
Variables
-
size_t const DEFAULT_BYTES_PER_TRANSFER = (1 << 26)
-
char const MemTypeStr[8] = "CGBFUNM"
-
char const ExeTypeStr[4] = "CGD"
-
char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"}
-
MAX_LINE_LEN
- file TransferBench.cpp
- #include <numa.h> #include <cmath> #include <numaif.h> #include <random> #include <stack> #include <thread> #include "TransferBench.hpp" #include "GetClosestNumaNode.hpp"
Functions
-
int main(int argc, char **argv)
-
void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose, double *totalBandwidthCpu)
-
void DisplayUsage(char const *cmdName)
-
int RemappedIndex(int const origIdx, bool const isCpuType)
-
void DisplayTopology(bool const outputToCsv)
-
void ParseMemType(EnvVars const &ev, std::string const &token, std::vector<MemType> &memTypes, std::vector<int> &memIndices)
-
void ParseExeType(EnvVars const &ev, std::string const &token, ExeType &exeType, int &exeIndex, int &exeSubIndex)
-
void EnablePeerAccess(int const deviceId, int const peerDeviceId)
-
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void **memPtr)
-
void DeallocateMemory(MemType memType, void *memPtr, size_t const bytes)
-
void CheckPages(char *array, size_t numBytes, int targetId)
-
uint32_t GetId(uint32_t hwId)
-
void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)
-
void RunPeerToPeerBenchmarks(EnvVars const &ev, size_t N)
-
void RunScalingBenchmark(EnvVars const &ev, size_t N, int const exeIndex, int const maxSubExecs)
-
void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)
-
void RunSchmooBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)
-
void RunRemoteWriteBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
-
void RunParallelCopyBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
-
void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
-
int main(int argc, char **argv)
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.1.2/src/include
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.1.2/src