API#
- 
struct AgentData#
 - #include <GetClosestNumaNode.hpp>
Public Members
- 
bool isInitialized#
 
- 
std::vector<hsa_agent_t> cpuAgents#
 
- 
std::vector<hsa_agent_t> gpuAgents#
 
 - 
bool isInitialized#
 
- 
class EnvVars#
 - #include <EnvVars.hpp>
Public Functions
- 
inline EnvVars()#
 
- 
inline void DisplayEnvVars() const#
 
- 
inline void DisplayP2PBenchmarkEnvVars() const#
 
- 
inline void DisplaySweepEnvVars() const#
 
- 
inline void DisplayA2AEnvVars() const#
 
- 
inline void DisplaySchmooEnvVars() const#
 
- 
inline void DisplayRemoteWriteEnvVars() const#
 
- 
inline void DisplayParallelCopyEnvVars() const#
 
Public Members
- 
int const DEFAULT_NUM_WARMUPS = 3#
 
- 
int const DEFAULT_NUM_ITERATIONS = 10#
 
- 
int const DEFAULT_SAMPLING_FACTOR = 1#
 
- 
int const DEFAULT_P2P_NUM_CPU_SE = 4#
 
- 
int const DEFAULT_SWEEP_MIN = 1#
 
- 
int const DEFAULT_SWEEP_MAX = 24#
 
- 
int const DEFAULT_SWEEP_TEST_LIMIT = 0#
 
- 
int const DEFAULT_SWEEP_TIME_LIMIT = 0#
 
- 
int alwaysValidate#
 
- 
int blockBytes#
 
- 
int blockOrder#
 
- 
int byteOffset#
 
- 
int continueOnError#
 
- 
int gfxBlockSize#
 
- 
int gfxSingleTeam#
 
- 
int gfxUnroll#
 
- 
int gfxWaveOrder#
 
- 
int hideEnv#
 
- 
int numCpuDevices#
 
- 
int numGpuDevices#
 
- 
int numIterations#
 
- 
int numWarmups#
 
- 
int outputToCsv#
 
- 
int samplingFactor#
 
- 
int showIterations#
 
- 
int useInteractive#
 
- 
int usePcieIndexing#
 
- 
int usePrepSrcKernel#
 
- 
int useSingleStream#
 
- 
int useXccFilter#
 
- 
int validateDirect#
 
- 
int numCpuSubExecs#
 
- 
int numGpuSubExecs#
 
- 
int p2pMode#
 
- 
int useDmaCopy#
 
- 
int useRemoteRead#
 
- 
int useFineGrain#
 
- 
int sweepMin#
 
- 
int sweepMax#
 
- 
int sweepTestLimit#
 
- 
int sweepTimeLimit#
 
- 
int sweepXgmiMin#
 
- 
int sweepXgmiMax#
 
- 
int sweepSeed#
 
- 
int sweepRandBytes#
 
- 
int a2aDirect#
 
- 
int a2aMode#
 
- 
int enableDebug#
 
- 
int gpuMaxHwQueues#
 
- 
ConfigModeEnum configMode#
 
 - 
inline EnvVars()#
 
- 
struct ExecutorInfo#
 - #include <TransferBench.hpp>
 
- 
struct SubExecParam#
 - #include <Kernels.hpp>
 
- 
struct Transfer#
 - #include <TransferBench.hpp>
Public Functions
Public Members
- 
int exeIndex#
 
- 
int exeSubIndex#
 
- 
int numSubExecs#
 
- 
size_t numBytes#
 
- 
int numSrcs#
 
- 
int numDsts#
 
- 
size_t numBytesActual#
 
- 
double transferTime#
 
- 
double transferBandwidth#
 
- 
double executorBandwidth#
 
- 
int transferIndex#
 
- 
std::vector<SubExecParam> subExecParam#
 
- 
SubExecParam *subExecParamGpuPtr#
 
- 
hsa_agent_t dstAgent#
 
- 
hsa_agent_t srcAgent#
 
- 
hsa_signal_t signal#
 
- 
hsa_amd_sdma_engine_id_t sdmaEngineId#
 
 - 
int exeIndex#
 
- 
namespace std#
 STL namespace.
- file Compatibility.hpp
 - #include <hip/hip_ext.h> #include <hip/hip_runtime.h> #include <hsa/hsa_ext_amd.h>
Defines
- 
HIP_CALL(cmd)#
 
 - 
HIP_CALL(cmd)#
 
- file EnvVars.hpp
 - #include <algorithm> #include <random> #include <time.h> #include "Compatibility.hpp" #include "Kernels.hpp"
 
- file GetClosestNumaNode.hpp
 Defines
- 
HSA_CHECK(cmd)#
 
Functions
- 
hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data)#
 
- 
hsa_status_t AgentInfoCallback(hsa_agent_t agent, void *data)#
 
- 
int GetClosestNumaNode(int gpuIdx)#
 
- 
HSA_CHECK(cmd)#
 
- file Kernels.hpp
 Defines
- 
PackedFloat_t#
 
- 
MAX_BLOCKSIZE#
 
- 
FLOATS_PER_PACK#
 
- 
MEMSET_CHAR#
 
- 
MEMSET_VAL#
 
- 
MAX_WAVEGROUPS#
 
- 
MAX_UNROLL#
 
- 
NUM_WAVEORDERS#
 
- 
MAX_SRCS#
 
- 
MAX_DSTS#
 
- 
GetHwId(hwId)#
 
- 
GetXccId(val)#
 
- 
GPU_KERNEL_UNROLL_DECL(BLOCKSIZE)#
 
Typedefs
- 
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int)#
 
Functions
- 
void CpuReduceKernel(SubExecParam const &p)#
 
- __host__ __device__ float PrepSrcValue (int srcBufferIdx, size_t idx)
 
- __global__ void CollectXccIdsKernel (int *xccIds)
 
- __global__ void PrepSrcDataKernel (float *ptr, size_t N, int srcBufferIdx)
 
- __device__ int64_t GetTimestamp ()
 
- template<typename T> __device__ __forceinline__ T MemsetVal ()
 
- template<> __device__ __forceinline__ float MemsetVal ()
 
- template<int BLOCKSIZE, int UNROLL> __global__ void __launch_bounds__ (BLOCKSIZE) GpuReduceKernel(SubExecParam *params, int waveOrder)
 
- if (threadIdx.x==0) startCycle
 
- if (p.preferredXccId !=-1 &&xccId !=p.preferredXccId) return
 
- for (int i=0;i< numSrcs;i++) srcFloat4[i]
 
- switch (waveOrder)
 
- if (numSrcs==0)
 
Variables
- int64_t startCycle
 
- 
SubExecParam &p = params[blockIdx.y]#
 
- 
int32_t xccId#
 
- float4 const *__restrict__ srcFloat4 [MAX_SRCS]
 
- float4 *__restrict__ dstFloat4 [MAX_DSTS]
 
- 
int32_t const nWaves = BLOCKSIZE / warpSize#
 
- 
int32_t const waveIdx = threadIdx.x / warpSize#
 
- 
int32_t const tIdx = threadIdx.x % warpSize#
 
- 
int32_t teamStride#
 
- 
int32_t waveStride#
 
- 
int32_t unrlStride#
 
- 
int32_t teamStride2#
 
- 
int32_t waveStride2#
 
- 
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride#
 
- 
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] = {GPU_KERNEL_UNROLL_DECL(64), GPU_KERNEL_UNROLL_DECL(128), GPU_KERNEL_UNROLL_DECL(192), GPU_KERNEL_UNROLL_DECL(256), GPU_KERNEL_UNROLL_DECL(320), GPU_KERNEL_UNROLL_DECL(384), GPU_KERNEL_UNROLL_DECL(448),}#
 
- 
PackedFloat_t#
 
- file TransferBench.hpp
 - #include <vector> #include <sstream> #include <chrono> #include <cstdio> #include <cstdlib> #include <cstdint> #include <set> #include <unistd.h> #include <map> #include <iostream> #include "Compatibility.hpp" #include "EnvVars.hpp"
Defines
- 
MAX_LINE_LEN#
 
Typedefs
- 
typedef std::map<Executor, ExecutorInfo> TransferMap#
 
Enums
Functions
- 
void DisplayUsage(char const *cmdName)#
 
- 
void DisplayTopology(bool const outputToCsv)#
 
- 
void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor, std::vector<size_t> &valuesofN)#
 
- 
void ParseMemType(EnvVars const &ev, std::string const &token, std::vector<MemType> &memType, std::vector<int> &memIndex)#
 
- 
void ParseExeType(EnvVars const &ev, std::string const &token, ExeType &exeType, int &exeIndex, int &exeSubIndex)#
 
- 
void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose = true, double *totalBandwidthCpu = nullptr)#
 
- 
void EnablePeerAccess(int const deviceId, int const peerDeviceId)#
 
- 
void CheckPages(char *byteArray, size_t numBytes, int targetId)#
 
- 
void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)#
 
- 
void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom)#
 
- 
void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)#
 
- 
void RunSchmooBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)#
 
- 
void RunRemoteWriteBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)#
 
- 
void RunParallelCopyBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)#
 
- 
int RemappedIndex(int const origIdx, bool const isCpuType)#
 
 - 
MAX_LINE_LEN#
 
- file TransferBench.cpp
 - #include <numa.h> #include <cmath> #include <numaif.h> #include <random> #include <stack> #include <thread> #include "TransferBench.hpp" #include "GetClosestNumaNode.hpp"
Functions
- 
int main(int argc, char **argv)#
 
- 
void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose, double *totalBandwidthCpu)
 
- 
void DisplayUsage(char const *cmdName)
 
- 
int RemappedIndex(int const origIdx, bool const isCpuType)
 
- 
void DisplayTopology(bool const outputToCsv)
 
- 
void ParseMemType(EnvVars const &ev, std::string const &token, std::vector<MemType> &memTypes, std::vector<int> &memIndices)
 
- 
void ParseExeType(EnvVars const &ev, std::string const &token, ExeType &exeType, int &exeIndex, int &exeSubIndex)
 
- 
void EnablePeerAccess(int const deviceId, int const peerDeviceId)
 
- 
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void **memPtr)
 
- 
void DeallocateMemory(MemType memType, void *memPtr, size_t const bytes)
 
- 
void CheckPages(char *array, size_t numBytes, int targetId)
 
- 
uint32_t GetId(uint32_t hwId)#
 
- 
void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)
 
- 
void RunPeerToPeerBenchmarks(EnvVars const &ev, size_t N)
 
- 
void RunScalingBenchmark(EnvVars const &ev, size_t N, int const exeIndex, int const maxSubExecs)
 
- 
void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)
 
- 
void RunSchmooBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)
 
- 
void RunRemoteWriteBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
 
- 
void RunParallelCopyBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
 
- 
void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
 
 - 
int main(int argc, char **argv)#
 
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.1.1/src/include
 
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.1.1/src