API#
-
struct AgentData#
- #include <GetClosestNumaNode.hpp>
-
class EnvVars#
- #include <EnvVars.hpp>
Public Functions
-
inline EnvVars()#
-
inline void DisplayEnvVars() const#
-
inline void DisplayP2PBenchmarkEnvVars() const#
-
inline void DisplaySweepEnvVars() const#
-
inline void DisplayA2AEnvVars() const#
Public Members
-
int const DEFAULT_NUM_WARMUPS = 3#
-
int const DEFAULT_NUM_ITERATIONS = 10#
-
int const DEFAULT_SAMPLING_FACTOR = 1#
-
int const DEFAULT_P2P_NUM_CPU_SE = 4#
-
int const DEFAULT_SWEEP_MIN = 1#
-
int const DEFAULT_SWEEP_MAX = 24#
-
int const DEFAULT_SWEEP_TEST_LIMIT = 0#
-
int const DEFAULT_SWEEP_TIME_LIMIT = 0#
-
int alwaysValidate#
-
int blockSize#
-
int blockBytes#
-
int blockOrder#
-
int byteOffset#
-
int continueOnError#
-
int hideEnv#
-
int numCpuDevices#
-
int numGpuDevices#
-
int numIterations#
-
int numWarmups#
-
int outputToCsv#
-
int samplingFactor#
-
int showIterations#
-
int useInteractive#
-
int usePcieIndexing#
-
int usePrepSrcKernel#
-
int useSingleStream#
-
int validateDirect#
-
int numCpuSubExecs#
-
int numGpuSubExecs#
-
int p2pMode#
-
int useDmaCopy#
-
int useRemoteRead#
-
int useFineGrain#
-
int sweepMin#
-
int sweepMax#
-
int sweepTestLimit#
-
int sweepTimeLimit#
-
int sweepXgmiMin#
-
int sweepXgmiMax#
-
int sweepSeed#
-
int sweepRandBytes#
-
int a2aDirect#
-
int enableDebug#
-
int gpuKernel#
-
ConfigModeEnum configMode#
-
inline EnvVars()#
-
struct ExecutorInfo#
- #include <TransferBench.hpp>
-
struct SubExecParam#
- #include <Kernels.hpp>
-
struct Transfer#
- #include <TransferBench.hpp>
Public Functions
-
namespace std#
STL namespace.
- file Compatibility.hpp
- #include <hip/hip_ext.h> #include <hip/hip_runtime.h> #include <hsa/hsa_ext_amd.h>
- file EnvVars.hpp
- #include <algorithm> #include <random> #include <time.h> #include "Compatibility.hpp" #include "Kernels.hpp"
- file GetClosestNumaNode.hpp
Defines
-
HSA_CHECK(cmd)#
-
HSA_CHECK(cmd)#
- file Kernels.hpp
Defines
-
PackedFloat_t#
-
WARP_SIZE#
-
MAX_BLOCKSIZE#
-
FLOATS_PER_PACK#
-
MEMSET_CHAR#
-
MEMSET_VAL#
-
MAX_SRCS#
-
MAX_DSTS#
-
__trace_hwreg()#
-
__trace_xccreg()#
-
NUM_GPU_KERNELS#
Typedefs
-
typedef void (*GpuKernelFuncPtr)(SubExecParam*)#
Functions
-
void CpuReduceKernel(SubExecParam const &p)#
- __host__ __device__ float PrepSrcValue (int srcBufferIdx, size_t idx)
- __global__ void PrepSrcDataKernel (float *ptr, size_t N, int srcBufferIdx)
- template<typename T> __device__ __forceinline__ T MemsetVal ()
- template<> __device__ __forceinline__ float MemsetVal ()
- template<int LOOP1_UNROLL> __global__ void __launch_bounds__ (MAX_BLOCKSIZE) GpuReduceKernel(SubExecParam *params)
- template<typename FLOAT_TYPE, int UNROLL_FACTOR> __device__ size_t GpuReduceFuncImpl2 (SubExecParam const &p, size_t const offset, size_t const N)
- template<typename FLOAT_TYPE, int UNROLL_FACTOR> __device__ size_t GpuReduceFuncImpl (SubExecParam const &p, size_t const offset, size_t const N)
- template<typename FLOAT_TYPE> __device__ size_t GpuReduceFunc (SubExecParam const &p, size_t const offset, size_t const N, int const unroll)
Variables
-
GpuKernelFuncPtr GpuKernelTable[NUM_GPU_KERNELS] = {GpuReduceKernel<8>, GpuReduceKernel<1>, GpuReduceKernel<2>, GpuReduceKernel<3>, GpuReduceKernel<4>, GpuReduceKernel<5>, GpuReduceKernel<6>, GpuReduceKernel<7>, GpuReduceKernel<8>, GpuReduceKernel<9>, GpuReduceKernel<10>, GpuReduceKernel<11>, GpuReduceKernel<12>, GpuReduceKernel<13>, GpuReduceKernel<14>, GpuReduceKernel<15>, GpuReduceKernel<16>, GpuReduceKernel2}#
-
PackedFloat_t#
- file TransferBench.hpp
- #include <vector> #include <sstream> #include <chrono> #include <cstdio> #include <cstdlib> #include <cstdint> #include <set> #include <unistd.h> #include <map> #include <iostream> #include "Compatibility.hpp" #include "EnvVars.hpp"
Typedefs
-
typedef std::map<Executor, ExecutorInfo> TransferMap#
Enums
Functions
-
void DisplayUsage(char const *cmdName)#
-
void DisplayTopology(bool const outputToCsv)#
-
void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor, std::vector<size_t> &valuesofN)#
-
void ParseMemType(std::string const &token, int const numCpus, int const numGpus, std::vector<MemType> &memType, std::vector<int> &memIndex)#
-
void ParseExeType(std::string const &token, int const numCpus, int const numGpus, ExeType &exeType, int &exeIndex)#
-
void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose = true, double *totalBandwidthCpu = nullptr)#
-
void EnablePeerAccess(int const deviceId, int const peerDeviceId)#
-
void CheckPages(char *byteArray, size_t numBytes, int targetId)#
-
void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)#
-
void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom)#
-
void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)#
-
int RemappedIndex(int const origIdx, bool const isCpuType)#
-
typedef std::map<Executor, ExecutorInfo> TransferMap#
- file TransferBench.cpp
- #include <numa.h> #include <cmath> #include <numaif.h> #include <random> #include <stack> #include <thread> #include "TransferBench.hpp" #include "GetClosestNumaNode.hpp"
Functions
-
int main(int argc, char **argv)#
-
void ExecuteTransfers(EnvVars const &ev, int const testNum, size_t const N, std::vector<Transfer> &transfers, bool verbose, double *totalBandwidthCpu)
-
void DisplayUsage(char const *cmdName)
-
int RemappedIndex(int const origIdx, bool const isCpuType)
-
void DisplayTopology(bool const outputToCsv)
-
void ParseMemType(std::string const &token, int const numCpus, int const numGpus, std::vector<MemType> &memTypes, std::vector<int> &memIndices)
-
void ParseExeType(std::string const &token, int const numCpus, int const numGpus, ExeType &exeType, int &exeIndex)
-
void EnablePeerAccess(int const deviceId, int const peerDeviceId)
-
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void **memPtr)
-
void DeallocateMemory(MemType memType, void *memPtr, size_t const bytes)
-
void CheckPages(char *array, size_t numBytes, int targetId)
-
uint32_t GetId(uint32_t hwId)#
-
void RunTransfer(EnvVars const &ev, int const iteration, ExecutorInfo &exeInfo, int const transferIdx)
-
void RunPeerToPeerBenchmarks(EnvVars const &ev, size_t N)
-
void RunScalingBenchmark(EnvVars const &ev, size_t N, int const exeIndex, int const maxSubExecs)
-
void RunAllToAllBenchmark(EnvVars const &ev, size_t const numBytesPerTransfer, int const numSubExecs)
-
void RunSweepPreset(EnvVars const &ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
-
int main(int argc, char **argv)#
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.0.0/src/include
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-transferbench/checkouts/docs-6.0.0/src