API#
-
struct AgentData#
- #include <GetClosestNumaNode.hpp>
Public Members
-
bool isInitialized#
-
std::vector<hsa_agent_t> cpuAgents#
-
std::vector<hsa_agent_t> gpuAgents#
-
bool isInitialized#
-
class EnvVars#
- #include <EnvVars.hpp>
Public Functions
-
inline EnvVars()#
-
inline void DisplayEnvVars() const#
-
inline void DisplayP2PBenchmarkEnvVars() const#
-
inline void DisplaySweepEnvVars() const#
-
inline void DisplayA2AEnvVars() const#
-
inline void DisplaySchmooEnvVars() const#
-
inline void DisplayRemoteWriteEnvVars() const#
-
inline void DisplayParallelCopyEnvVars() const#
Public Members
-
int const DEFAULT_NUM_WARMUPS = 3#
-
int const DEFAULT_NUM_ITERATIONS = 10#
-
int const DEFAULT_SAMPLING_FACTOR = 1#
-
int const DEFAULT_P2P_NUM_CPU_SE = 4#
-
int const DEFAULT_SWEEP_MIN = 1#
-
int const DEFAULT_SWEEP_MAX = 24#
-
int const DEFAULT_SWEEP_TEST_LIMIT = 0#
-
int const DEFAULT_SWEEP_TIME_LIMIT = 0#
-
int alwaysValidate#
-
int blockBytes#
-
int blockOrder#
-
int byteOffset#
-
int continueOnError#
-
int gfxBlockSize#
-
int gfxSingleTeam#
-
int gfxUnroll#
-
int gfxWaveOrder#
-
int hideEnv#
-
int minNumVarSubExec#
-
int maxNumVarSubExec#
-
int numCpuDevices#
-
int numGpuDevices#
-
int numIterations#
-
int numSubIterations#
-
int numWarmups#
-
int outputToCsv#
-
int samplingFactor#
-
int showIterations#
-
int useHsaDma#
-
int useInteractive#
-
int usePcieIndexing#
-
int usePrepSrcKernel#
-
int useSingleStream#
-
int useXccFilter#
-
int validateDirect#
-
int numCpuSubExecs#
-
int numGpuSubExecs#
-
int p2pMode#
-
int useDmaCopy#
-
int useRemoteRead#
-
int useFineGrain#
-
int sweepMin#
-
int sweepMax#
-
int sweepTestLimit#
-
int sweepTimeLimit#
-
int sweepXgmiMin#
-
int sweepXgmiMax#
-
int sweepSeed#
-
int sweepRandBytes#
-
int a2aDirect#
-
int a2aMode#
-
int enableDebug#
-
int gpuMaxHwQueues#
-
ConfigModeEnum configMode#
-
inline EnvVars()#
-
struct ExecutorInfo#
- #include <TransferBench.hpp>
Public Members
-
size_t totalBytes#
-
int totalSubExecs#
-
SubExecParam *subExecParamGpu#
-
std::vector<hipStream_t> streams#
-
std::vector<hipEvent_t> startEvents#
-
std::vector<hipEvent_t> stopEvents#
-
double totalTime#
-
size_t totalBytes#
-
struct ExeResult#
- #include <TransferBench.hpp>
-
struct SubExecParam#
- #include <Kernels.hpp>
-
struct TestResults#
- #include <TransferBench.hpp>
-
struct Transfer#
- #include <TransferBench.hpp>
Public Functions
Public Members
-
int exeIndex#
-
int exeSubIndex#
-
int numSubExecs#
-
size_t numBytes#
-
int numSrcs#
-
int numDsts#
-
size_t numBytesActual#
-
double transferTime#
-
double transferBandwidth#
-
double executorBandwidth#
-
int transferIndex#
-
std::vector<SubExecParam> subExecParam#
-
SubExecParam *subExecParamGpuPtr#
-
hsa_agent_t dstAgent#
-
hsa_agent_t srcAgent#
-
hsa_signal_t signal#
-
hsa_amd_sdma_engine_id_t sdmaEngineId#
-
int exeIndex#
-
namespace std#
STL namespace.
- file Compatibility.hpp
- #include <hip/hip_ext.h>#include <hip/hip_runtime.h>#include <hsa/hsa_ext_amd.h>
Defines
-
HIP_CALL(cmd)#
-
HIP_CALL(cmd)#
- file EnvVars.hpp
- #include <algorithm>#include <random>#include <time.h>#include "Compatibility.hpp"#include "Kernels.hpp"
- file GetClosestNumaNode.hpp
Defines
-
HSA_CHECK(cmd)#
Functions
-
hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data)#
-
hsa_status_t AgentInfoCallback(hsa_agent_t agent, void *data)#
-
int GetClosestNumaNode(int gpuIdx)#
-
HSA_CHECK(cmd)#
- file Kernels.hpp
Defines
-
PackedFloat_t#
-
MAX_BLOCKSIZE#
-
FLOATS_PER_PACK#
-
MEMSET_CHAR#
-
MEMSET_VAL#
-
MAX_WAVEGROUPS#
-
MAX_UNROLL#
-
NUM_WAVEORDERS#
-
MAX_SRCS#
-
MAX_DSTS#
-
GetHwId(hwId)#
-
GetXccId(val)#
-
GPU_KERNEL_UNROLL_DECL(BLOCKSIZE)#
Typedefs
-
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int)#
Functions
-
void CpuReduceKernel(SubExecParam const &p)#
- __host__ __device__ float PrepSrcValue (int srcBufferIdx, size_t idx)
- __global__ void CollectXccIdsKernel (int *xccIds)
- __global__ void PrepSrcDataKernel (float *ptr, size_t N, int srcBufferIdx)
- __device__ int64_t GetTimestamp ()
- template<typename T> __device__ __forceinline__ T MemsetVal ()
- template<> __device__ __forceinline__ float MemsetVal ()
- template<int BLOCKSIZE, int UNROLL> __global__ void __launch_bounds__ (BLOCKSIZE) GpuReduceKernel(SubExecParam *params, int waveOrder, int numSubIterations)
- if (threadIdx.x==0) startCycle
- if (p.preferredXccId !=-1 &&xccId !=p.preferredXccId) return
- for (int i=0;i< numSrcs;i++) srcFloat4[i]
- switch (waveOrder)
- while (1)
-
__syncthreads()#
Variables
- int waveOrder
- int numSubIterations
- int64_t startCycle
-
SubExecParam &p = params[blockIdx.y]#
-
int32_t xccId#
- float4 const *__restrict__ srcFloat4 [MAX_SRCS]
- float4 *__restrict__ dstFloat4 [MAX_DSTS]
-
int32_t const nWaves = BLOCKSIZE / warpSize#
-
int32_t const waveIdx = threadIdx.x / warpSize#
-
int32_t const tIdx = threadIdx.x % warpSize#
-
int32_t teamStride#
-
int32_t waveStride#
-
int32_t unrlStride#
-
int32_t teamStride2#
-
int32_t waveStride2#
-
int subIterations = 0#
-
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] = {GPU_KERNEL_UNROLL_DECL(64), GPU_KERNEL_UNROLL_DECL(128), GPU_KERNEL_UNROLL_DECL(192), GPU_KERNEL_UNROLL_DECL(256), GPU_KERNEL_UNROLL_DECL(320), GPU_KERNEL_UNROLL_DECL(384), GPU_KERNEL_UNROLL_DECL(448),}#
-
PackedFloat_t#
- file TransferBench.hpp
- #include <vector>#include <sstream>#include <chrono>#include <cstdio>#include <cstdlib>#include <cstdint>#include <set>#include <unistd.h>#include <map>#include <iostream>#include "Compatibility.hpp"#include "EnvVars.hpp"
Defines
-
MAX_LINE_LEN#
Typedefs
-
typedef std::map<Executor, ExecutorInfo> TransferMap#
Enums
Functions
-
void DisplayUsage(char const *cmdName)#
-
void DisplayTopology(bool const outputToCsv)#
-
void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor, std::vector<size_t> &valuesofN)#
-
void ParseMemType(EnvVars const &ev, std::string const &token, std::vector<MemType> &memType, std::vector<int> &memIndex)#
-
void ParseExeType(EnvVars const &ev, std::string const &token, ExeType &exeType, int &exeIndex, int &exeSubIndex)#
-
void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose = true, double *totalBandwidthCpu = nullptr)#
-
TestResults ExecuteTransfersImpl(EnvVars const &ev, std::vector<Transfer> &transfers)#
-
void ReportResults(EnvVars const &ev, std::vector<Transfer> const &transfers, TestResults const results)#
-
void EnablePeerAccess(int const deviceId, int const peerDeviceId)#
-
void CheckPages(char *byteArray, size_t numBytes, int targetId)#
-
void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)#
-
void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom)#
-
void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)#
-
void RunSchmooBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)#
-
void RunRemoteWriteBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)#
-
void RunParallelCopyBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)#
-
int RemappedIndex(int const origIdx, bool const isCpuType)#
-
MAX_LINE_LEN#
- file TransferBench.cpp
- #include <numa.h>#include <cmath>#include <numaif.h>#include <random>#include <stack>#include <thread>#include "TransferBench.hpp"#include "GetClosestNumaNode.hpp"
Functions
-
int main(int argc, char **argv)#
-
void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose, double *totalBandwidthCpu)
-
TestResults ExecuteTransfersImpl(EnvVars const &ev, std::vector<Transfer> &transfers)
-
void DisplayUsage(char const *cmdName)
-
int RemappedIndex(int const origIdx, bool const isCpuType)
-
void DisplayTopology(bool const outputToCsv)
-
void ParseMemType(EnvVars const &ev, std::string const &token, std::vector<MemType> &memTypes, std::vector<int> &memIndices)
-
void ParseExeType(EnvVars const &ev, std::string const &token, ExeType &exeType, int &exeIndex, int &exeSubIndex)
-
void EnablePeerAccess(int const deviceId, int const peerDeviceId)
-
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void **memPtr)
-
void DeallocateMemory(MemType memType, void *memPtr, size_t const bytes)
-
void CheckPages(char *array, size_t numBytes, int targetId)
-
uint32_t GetId(uint32_t hwId)#
-
void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)
-
void RunPeerToPeerBenchmarks(EnvVars const &ev, size_t N)
-
void RunScalingBenchmark(EnvVars const &ev, size_t N, int const exeIndex, int const maxSubExecs)
-
void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)
-
void RunSchmooBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)
-
void RunRemoteWriteBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
-
void RunParallelCopyBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
-
void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
-
void ReportResults(EnvVars const &ev, std::vector<Transfer> const &transfers, TestResults const results)
-
void RunHealthCheck(EnvVars ev)
-
int main(int argc, char **argv)#
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.2.4/src/include
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.2.4/src