API reference#
-
struct rdc_component_version_t#
- #include <rdc.h>
Store version information for each component.
Public Members
-
char version[RDC_MAX_VERSION_STR_LENGTH]#
-
char version[RDC_MAX_VERSION_STR_LENGTH]#
-
struct rdc_device_attributes_t#
- #include <rdc.h>
Represents attributes corresponding to a device.
Public Members
-
char device_name[RDC_MAX_STR_LENGTH]#
Name of the device.
-
char device_name[RDC_MAX_STR_LENGTH]#
-
struct rdc_diag_callback_t#
- #include <rdc.h>
Public Members
-
rdc_callback_t callback#
Callback sends logs for running diagnostics.
-
void *cookie#
Cookie is used to identify different callbacks and supply them with data.
-
rdc_callback_t callback#
-
struct rdc_diag_test_result_t#
- #include <rdc.h>
The diagnostic results for all GPUs.
Public Members
-
rdc_diag_result_t status#
The diagnostic result.
-
rdc_diag_detail_t details#
The summary details.
-
rdc_diag_test_cases_t test_case#
The test case to run.
-
uint32_t per_gpu_result_count#
Result details.
How many gpu_results
-
rdc_diag_per_gpu_result_t gpu_results[RDC_MAX_NUM_DEVICES]#
-
char info[MAX_DIAG_MSG_LENGTH]#
Detail information.
-
rdc_diag_result_t status#
-
struct rdc_field_group_info_t#
- #include <rdc.h>
The structure to store the field group info.
Public Members
-
uint32_t count#
count of fields in the group
-
char group_name[RDC_MAX_STR_LENGTH]#
field group name
-
rdc_field_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP]#
The list of fields in the group
-
uint32_t count#
-
struct rdc_field_value#
- #include <rdc.h>
The structure to store the field value.
Public Members
-
rdc_field_t field_id#
The field id of the value.
-
int status#
RDC_ST_OK or error status.
-
uint64_t ts#
Timestamp in usec since 1970.
-
rdc_field_type_t type#
The field type.
-
rdc_field_value_data value#
Value of the field. Value type depends on the field type.
-
rdc_field_t field_id#
-
struct rdc_gpu_usage_info_t#
- #include <rdc.h>
The structure to hold the GPU usage information.
Public Members
-
uint32_t gpu_id#
GPU_ID_INVALID for summary information.
-
uint64_t start_time#
The time to start the watching.
-
uint64_t end_time#
The time to stop the watching.
-
uint64_t energy_consumed#
GPU Energy consumed.
-
uint64_t ecc_correct#
Correctable errors.
-
uint64_t ecc_uncorrect#
Uncorrectable errors.
-
rdc_stats_summary_t pcie_tx#
Bytes sent over PCIe stats.
-
rdc_stats_summary_t pcie_rx#
Bytes received over PCIe stats.
-
rdc_stats_summary_t power_usage#
GPU Power usage stats.
-
rdc_stats_summary_t gpu_clock#
GPU Clock speed stats.
-
rdc_stats_summary_t memory_clock#
Mem. Clock speed stats.
-
rdc_stats_summary_t gpu_utilization#
GPU Utilization stats.
-
rdc_stats_summary_t gpu_temperature#
GPU temperature stats.
-
uint64_t max_gpu_memory_used#
Maximum GPU memory used.
-
rdc_stats_summary_t memory_utilization#
Memory Utilization statistics.
-
uint32_t gpu_id#
-
struct rdc_health_incidents_t#
- #include <rdc.h>
Details of each health incident.
Public Members
-
uint32_t gpu_index#
Which GPU in this group has the issue.
-
rdc_health_system_t component#
which components have the issue
-
rdc_health_result_t health#
health diagnosis of this incident
-
rdc_health_detail_t error#
The details of the error, rdc_health_error_code_t.
-
uint32_t gpu_index#
-
struct rdc_health_response_t#
- #include <rdc.h>
The health responses for test cases.
Public Members
-
rdc_health_result_t overall_health#
The overall health of this entire host.
-
unsigned int incidents_count#
The number of health incidents reported in this struct.
-
rdc_health_incidents_t incidents[HEALTH_MAX_ERROR_ITEMS]#
Report of the errors detected.
-
rdc_health_result_t overall_health#
-
struct rdc_job_group_info_t#
- #include <rdc.h>
The structure to store the job info.
Public Members
-
char job_id[RDC_MAX_STR_LENGTH]#
job id
-
rdc_gpu_group_t group_id#
group id
-
uint64_t start_time#
job start time
-
uint64_t stop_time#
job stop time
-
char job_id[RDC_MAX_STR_LENGTH]#
-
struct rdc_job_info_t#
- #include <rdc.h>
The structure to hold the job stats.
Public Members
-
uint32_t num_gpus#
Number of GPUs used by job.
-
rdc_gpu_usage_info_t summary#
Job usage summary statistics (overall)
-
rdc_gpu_usage_info_t gpus[16]#
Job usage summary statistics by GPU.
-
uint32_t num_gpus#
-
struct rdc_policy_callback_response_t#
- #include <rdc.h>
Defines the structure used in the RDC policy callback.
Public Members
-
unsigned int version#
-
rdc_policy_condition_t condition#
The condition that is met.
-
rdc_gpu_group_t group_id#
The group id that triggered this callback.
-
int64_t value#
The current value that meets the condition.
-
unsigned int version#
-
struct rdc_policy_t#
- #include <rdc.h>
The structure to define policy to enforce on GPU.
Public Members
-
rdc_policy_condition_t condition#
condition to meet
-
rdc_policy_action_t action#
Action to take.
-
rdc_policy_condition_t condition#
-
namespace std#
STL namespace.
- file rdc.h
- #include <stddef.h> #include <stdint.h>
The rocm_rdc library api is new, and therefore subject to change either at the ABI or API level. Instead of marking every function prototype as “unstable”, we are instead saying the API is unstable (i.e., changes are possible) while the major version remains 0. This means that if the API/ABI changes, we will not increment the major version to 1. Once the ABI stabilizes, we will increment the major version to 1, and thereafter increment it on all ABI breaks.
Main header file for the ROCm RDC library. All required function, structure, enum, etc. definitions should be defined in this file.
Defines
-
GPU_ID_INVALID#
ID used to represent an invalid GPU.
-
RDC_GROUP_ALL_GPUS#
Used to specify all GPUs.
-
RDC_JOB_STATS_FIELDS#
Used to specify all stats fields.
-
RDC_MAX_STR_LENGTH#
The max rdc field string length.
-
RDC_GROUP_MAX_ENTITIES#
The max entities in a group.
-
RDC_MAX_NUM_DEVICES#
Max number of GPUs supported by RDC.
-
RDC_MAX_FIELD_IDS_PER_FIELD_GROUP#
The max fields in a field group.
-
RDC_MAX_NUM_GROUPS#
The max number of groups.
-
RDC_MAX_NUM_FIELD_GROUPS#
The max number of the field groups.
-
RDC_MAX_VERSION_STR_LENGTH#
The max string length occupied by version information.
-
RDC_EVNT_IS_NOTIF_FIELD(FIELD)#
-
MAX_TEST_CASES#
The maximum test cases to run.
-
MAX_DIAG_MSG_LENGTH#
The maximum length of the diagnostic messages.
-
MAX_HEALTH_MSG_LENGTH#
The maximum length of the health messages.
-
PCIE_MAX_REPLAYS_PERMIN#
8 replays per minute is the maximum recommended
-
HEALTH_MAX_ERROR_ITEMS#
-
RDC_MAX_POLICY_SETTINGS#
Typedefs
-
typedef void *rdc_handle_t#
handlers used in various rdc calls
Handle used for an RDC session
-
typedef uint32_t rdc_gpu_group_t#
GPU Group ID type.
-
typedef uint32_t rdc_field_grp_t#
Field group ID type.
-
typedef void (*rdc_callback_t)(void*, void*)#
-
typedef int (*rdc_policy_register_callback)(rdc_policy_callback_response_t *userData)#
The user data is the rdc_policy_callback_response_t
Enums
-
enum rdc_status_t#
Error codes returned by rocm_rdc_lib functions.
Values:
-
enumerator RDC_ST_OK#
Success.
-
enumerator RDC_ST_NOT_SUPPORTED#
Not supported feature.
-
enumerator RDC_ST_MSI_ERROR#
The MSI library error.
-
enumerator RDC_ST_FAIL_LOAD_MODULE#
Fail to load the library.
-
enumerator RDC_ST_INVALID_HANDLER#
Invalid handler.
-
enumerator RDC_ST_BAD_PARAMETER#
A parameter is invalid.
-
enumerator RDC_ST_NOT_FOUND#
Cannot find the value.
-
enumerator RDC_ST_CONFLICT#
Conflict with current state.
-
enumerator RDC_ST_CLIENT_ERROR#
The RDC client error.
-
enumerator RDC_ST_ALREADY_EXIST#
The item already exists.
-
enumerator RDC_ST_MAX_LIMIT#
Max limit recording for the object.
-
enumerator RDC_ST_INSUFF_RESOURCES#
Not enough resources to complete operation
-
enumerator RDC_ST_FILE_ERROR#
Failed to access a file.
-
enumerator RDC_ST_NO_DATA#
Data was requested, but none was found
-
enumerator RDC_ST_PERM_ERROR#
Insufficient permission to complete operation
-
enumerator RDC_ST_DISABLED_MODULE#
Attempted loading disabled module.
-
enumerator RDC_ST_UNKNOWN_ERROR#
Unknown error.
-
enumerator RDC_ST_OK#
-
enum rdc_operation_mode_t#
RDC operation mode. RDC can run in auto mode, where background threads collect metrics. When run in manual mode, the user needs to periodically call rdc_field_update_all for data collection.
Values:
-
enumerator RDC_OPERATION_MODE_AUTO#
-
enumerator RDC_OPERATION_MODE_MANUAL#
-
enumerator RDC_OPERATION_MODE_AUTO#
-
enum rdc_group_type_t#
type of GPU group
Values:
-
enumerator RDC_GROUP_DEFAULT#
All GPUs on the Node.
-
enumerator RDC_GROUP_EMPTY#
Empty group.
-
enumerator RDC_GROUP_DEFAULT#
-
enum rdc_field_type_t#
The type stored in the field value.
Values:
-
enumerator INTEGER#
-
enumerator DOUBLE#
-
enumerator STRING#
-
enumerator BLOB#
-
enumerator INTEGER#
-
enum rdc_field_t#
These enums are used to specify a particular field to be retrieved.
Values:
-
enumerator RDC_FI_INVALID#
Identifier fields.
Invalid field value
-
enumerator RDC_FI_GPU_COUNT#
GPU count in the system.
-
enumerator RDC_FI_DEV_NAME#
Name of the device.
-
enumerator RDC_FI_OAM_ID#
OAM ID of the device.
-
enumerator RDC_FI_GPU_CLOCK#
Frequency related fields.
The current clock for the GPU
-
enumerator RDC_FI_MEM_CLOCK#
Clock for the memory.
-
enumerator RDC_FI_MEMORY_TEMP#
Memory temperature for the device.
-
enumerator RDC_FI_GPU_TEMP#
Current temperature for the device.
-
enumerator RDC_FI_POWER_USAGE#
Power usage for the device.
-
enumerator RDC_FI_PCIE_TX#
PCIe related fields.
PCIe Tx utilization information
-
enumerator RDC_FI_PCIE_RX#
PCIe Rx utilization information.
-
enumerator RDC_FI_PCIE_BANDWIDTH#
PCIe bandwidth in Mbps.
-
enumerator RDC_FI_GPU_UTIL#
GPU usage related fields.
GPU Utilization
-
enumerator RDC_FI_GPU_MEMORY_USAGE#
Memory usage of the GPU instance.
-
enumerator RDC_FI_GPU_MEMORY_TOTAL#
Total memory of the GPU instance.
-
enumerator RDC_FI_GPU_MM_ENC_UTIL#
Multimedia encoder busy percentage.
-
enumerator RDC_FI_GPU_MM_DEC_UTIL#
Multimedia decoder busy percentage.
-
enumerator RDC_FI_GPU_MEMORY_ACTIVITY#
Memory busy percentage.
-
enumerator RDC_FI_GPU_PAGE_RETRIED#
GPU page related fields.
Retried page of the GPU instance
-
enumerator RDC_FI_ECC_CORRECT_TOTAL#
ECC related fields.
Accumulated correctable ECC errors
-
enumerator RDC_FI_ECC_UNCORRECT_TOTAL#
Accumulated uncorrectable ECC errors.
-
enumerator RDC_FI_ECC_FIRST#
FIRST Error Correction and Detection field.
-
enumerator RDC_FI_ECC_SDMA_CE#
-
enumerator RDC_FI_ECC_SDMA_UE#
-
enumerator RDC_FI_ECC_GFX_CE#
-
enumerator RDC_FI_ECC_GFX_UE#
-
enumerator RDC_FI_ECC_MMHUB_CE#
-
enumerator RDC_FI_ECC_MMHUB_UE#
-
enumerator RDC_FI_ECC_ATHUB_CE#
-
enumerator RDC_FI_ECC_ATHUB_UE#
-
enumerator RDC_FI_ECC_PCIE_BIF_CE#
-
enumerator RDC_FI_ECC_PCIE_BIF_UE#
-
enumerator RDC_FI_ECC_HDP_CE#
-
enumerator RDC_FI_ECC_HDP_UE#
-
enumerator RDC_FI_ECC_XGMI_WAFL_CE#
-
enumerator RDC_FI_ECC_XGMI_WAFL_UE#
-
enumerator RDC_FI_ECC_DF_CE#
-
enumerator RDC_FI_ECC_DF_UE#
-
enumerator RDC_FI_ECC_SMN_CE#
-
enumerator RDC_FI_ECC_SMN_UE#
-
enumerator RDC_FI_ECC_SEM_CE#
-
enumerator RDC_FI_ECC_SEM_UE#
-
enumerator RDC_FI_ECC_MP0_CE#
-
enumerator RDC_FI_ECC_MP0_UE#
-
enumerator RDC_FI_ECC_MP1_CE#
-
enumerator RDC_FI_ECC_MP1_UE#
-
enumerator RDC_FI_ECC_FUSE_CE#
-
enumerator RDC_FI_ECC_FUSE_UE#
-
enumerator RDC_FI_ECC_UMC_CE#
-
enumerator RDC_FI_ECC_UMC_UE#
-
enumerator RDC_FI_ECC_MCA_CE#
-
enumerator RDC_FI_ECC_MCA_UE#
-
enumerator RDC_FI_ECC_VCN_CE#
-
enumerator RDC_FI_ECC_VCN_UE#
-
enumerator RDC_FI_ECC_JPEG_CE#
-
enumerator RDC_FI_ECC_JPEG_UE#
-
enumerator RDC_FI_ECC_IH_CE#
-
enumerator RDC_FI_ECC_IH_UE#
-
enumerator RDC_FI_ECC_MPIO_CE#
-
enumerator RDC_FI_ECC_MPIO_UE#
-
enumerator RDC_FI_ECC_LAST#
-
enumerator RDC_FI_XGMI_0_READ_KB#
XGMI_0 accumulated data read size (KB)
-
enumerator RDC_FI_XGMI_1_READ_KB#
XGMI_1 accumulated data read size (KB)
-
enumerator RDC_FI_XGMI_2_READ_KB#
XGMI_2 accumulated data read size (KB)
-
enumerator RDC_FI_XGMI_3_READ_KB#
XGMI_3 accumulated data read size (KB)
-
enumerator RDC_FI_XGMI_4_READ_KB#
XGMI_4 accumulated data read size (KB)
-
enumerator RDC_FI_XGMI_5_READ_KB#
XGMI_5 accumulated data read size (KB)
-
enumerator RDC_FI_XGMI_6_READ_KB#
XGMI_6 accumulated data read size (KB)
-
enumerator RDC_FI_XGMI_7_READ_KB#
XGMI_7 accumulated data read size (KB)
-
enumerator RDC_FI_XGMI_0_WRITE_KB#
XGMI_0 accumulated data write size (KB)
-
enumerator RDC_FI_XGMI_1_WRITE_KB#
XGMI_1 accumulated data write size (KB)
-
enumerator RDC_FI_XGMI_2_WRITE_KB#
XGMI_2 accumulated data write size (KB)
-
enumerator RDC_FI_XGMI_3_WRITE_KB#
XGMI_3 accumulated data write size (KB)
-
enumerator RDC_FI_XGMI_4_WRITE_KB#
XGMI_4 accumulated data write size (KB)
-
enumerator RDC_FI_XGMI_5_WRITE_KB#
XGMI_5 accumulated data write size (KB)
-
enumerator RDC_FI_XGMI_6_WRITE_KB#
XGMI_6 accumulated data write size (KB)
-
enumerator RDC_FI_XGMI_7_WRITE_KB#
XGMI_7 accumulated data write size (KB)
-
enumerator RDC_FI_XGMI_TOTAL_READ_KB#
XGMI_SUM accumulated data read size (KB)
-
enumerator RDC_FI_XGMI_TOTAL_WRITE_KB#
XGMI_SUM accumulated data write size (KB)
-
enumerator RDC_FI_PROF_OCCUPANCY_PERCENT#
ROC-profiler related fields.
-
enumerator RDC_FI_PROF_ACTIVE_CYCLES#
-
enumerator RDC_FI_PROF_ACTIVE_WAVES#
-
enumerator RDC_FI_PROF_ELAPSED_CYCLES#
-
enumerator RDC_FI_PROF_TENSOR_ACTIVE_PERCENT#
-
enumerator RDC_FI_PROF_GPU_UTIL_PERCENT#
-
enumerator RDC_FI_PROF_EVAL_MEM_R_BW#
-
enumerator RDC_FI_PROF_EVAL_MEM_W_BW#
-
enumerator RDC_FI_PROF_EVAL_FLOPS_16#
-
enumerator RDC_FI_PROF_EVAL_FLOPS_32#
-
enumerator RDC_FI_PROF_EVAL_FLOPS_64#
-
enumerator RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL#
-
enumerator RDC_EVNT_XGMI_0_NOP_TX#
Raw XGMI counter events.
NOPs sent to neighbor 0
-
enumerator RDC_EVNT_XGMI_0_REQ_TX#
Outgoing requests to neighbor 0
-
enumerator RDC_EVNT_XGMI_0_RESP_TX#
Outgoing responses to neighbor 0
-
enumerator RDC_EVNT_XGMI_0_BEATS_TX#
Data beats sent to neighbor 0; Each beat represents 32 bytes.
XGMI throughput can be calculated by multiplying a BEATs event such as RDC_EVNT_XGMI_0_BEATS_TX by 32 and dividing by the time for which event collection occurred, rdc_gpu_usage_info_t.start_time (which is in nanoseconds). To get bytes per second, multiply this value by 10^9.
Throughput = BEATS * 32 / time_running * 10^9 (bytes/second)
-
enumerator RDC_EVNT_XGMI_1_NOP_TX#
NOPs sent to neighbor 1.
-
enumerator RDC_EVNT_XGMI_1_REQ_TX#
Outgoing requests to neighbor 1
-
enumerator RDC_EVNT_XGMI_1_RESP_TX#
Outgoing responses to neighbor 1
-
enumerator RDC_EVNT_XGMI_1_BEATS_TX#
Data beats sent to neighbor 1; Each beat represents 32 bytes
-
enumerator RDC_EVNT_XGMI_0_THRPUT#
Transmit throughput to XGMI neighbor 0 in bytes/sec
-
enumerator RDC_EVNT_XGMI_1_THRPUT#
Transmit throughput to XGMI neighbor 1 in bytes/sec
-
enumerator RDC_EVNT_XGMI_2_THRPUT#
Transmit throughput to XGMI neighbor 2 in bytes/sec
-
enumerator RDC_EVNT_XGMI_3_THRPUT#
Transmit throughput to XGMI neighbor 3 in bytes/sec
-
enumerator RDC_EVNT_XGMI_4_THRPUT#
Transmit throughput to XGMI neighbor 4 in bytes/sec
-
enumerator RDC_EVNT_XGMI_5_THRPUT#
Transmit throughput to XGMI neighbor 5 in bytes/sec
-
enumerator RDC_EVNT_NOTIF_VMFAULT#
VM page fault.
-
enumerator RDC_EVNT_NOTIF_FIRST#
-
enumerator RDC_EVNT_NOTIF_THERMAL_THROTTLE#
Clock frequency has decreased due to temperature rise
-
enumerator RDC_EVNT_NOTIF_PRE_RESET#
GPU reset is about to occur.
-
enumerator RDC_EVNT_NOTIF_POST_RESET#
GPU reset just occurred.
-
enumerator RDC_EVNT_NOTIF_RING_HANG#
GPU ring hang just occurred.
-
enumerator RDC_EVNT_NOTIF_LAST#
-
enumerator RDC_HEALTH_XGMI_ERROR#
RDC health related fields.
XGMI one or more errors detected
-
enumerator RDC_HEALTH_PCIE_REPLAY_COUNT#
Total PCIE replay count.
-
enumerator RDC_HEALTH_RETIRED_PAGE_NUM#
Retired page number.
-
enumerator RDC_HEALTH_PENDING_PAGE_NUM#
Pending page number.
-
enumerator RDC_HEALTH_RETIRED_PAGE_LIMIT#
The threshold of retired page.
-
enumerator RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT#
The threshold of uncorrectable page.
-
enumerator RDC_HEALTH_POWER_THROTTLE_TIME#
Power throttle status counter.
-
enumerator RDC_HEALTH_THERMAL_THROTTLE_TIME#
Total time in thermal throttle status (microseconds)
-
enumerator RDC_FI_INVALID#
-
enum rdc_diag_level_t#
type of diagnostic level
Values:
-
enumerator RDC_DIAG_LVL_INVALID#
invalid level
-
enumerator RDC_DIAG_LVL_SHORT#
take a few seconds to run
-
enumerator RDC_DIAG_LVL_MED#
take less than 2 minutes to run
-
enumerator RDC_DIAG_LVL_LONG#
take up to 15 minutes to run
-
enumerator RDC_DIAG_LVL_INVALID#
-
enum rdc_diag_result_t#
type of diagnostic result
Values:
-
enumerator RDC_DIAG_RESULT_PASS#
The diagnostic test pass.
-
enumerator RDC_DIAG_RESULT_SKIP#
The diagnostic test skipped.
-
enumerator RDC_DIAG_RESULT_WARN#
The diagnostic test has warnings.
-
enumerator RDC_DIAG_RESULT_FAIL#
The diagnostic test fail.
-
enumerator RDC_DIAG_RESULT_PASS#
-
enum rdc_diag_test_cases_t#
The test cases to run.
Values:
-
enumerator RDC_DIAG_TEST_FIRST#
Marks the first diagnostic test case.
-
enumerator RDC_DIAG_COMPUTE_PROCESS#
-
enumerator RDC_DIAG_COMPUTE_QUEUE#
The Compute Queue is ready.
-
enumerator RDC_DIAG_SYS_MEM_CHECK#
Check System memory.
-
enumerator RDC_DIAG_NODE_TOPOLOGY#
Report node topology.
-
enumerator RDC_DIAG_RVS_TEST#
TODO: Replace with real RVS tests.
-
enumerator RDC_DIAG_GPU_PARAMETERS#
GPU parameters in range.
-
enumerator RDC_DIAG_TEST_LAST#
-
enumerator RDC_DIAG_TEST_FIRST#
-
enum rdc_policy_condition_type_t#
The policy type to support.
Values:
-
enumerator RDC_POLICY_COND_FIRST#
-
enumerator RDC_POLICY_COND_MAX_PAGE_RETRIED#
Max number of pages retired.
-
enumerator RDC_POLICY_COND_THERMAL#
Temperature threshold, millidegree Celsius.
-
enumerator RDC_POLICY_COND_POWER#
Power threshold, unit milliwatt.
-
enumerator RDC_POLICY_COND_LAST#
-
enumerator RDC_POLICY_COND_MAX#
-
enumerator RDC_POLICY_COND_FIRST#
-
enum rdc_policy_action_t#
Values:
-
enumerator RDC_POLICY_ACTION_NONE#
-
enumerator RDC_POLICY_ACTION_GPU_RESET#
-
enumerator RDC_POLICY_ACTION_NONE#
-
enum rdc_health_system_t#
type of health watches
Values:
-
enumerator RDC_HEALTH_WATCH_PCIE#
PCIe system watches.
-
enumerator RDC_HEALTH_WATCH_XGMI#
XGMI system watches.
-
enumerator RDC_HEALTH_WATCH_MEM#
Memory watches.
-
enumerator RDC_HEALTH_WATCH_INFOROM#
Inforom watches.
-
enumerator RDC_HEALTH_WATCH_THERMAL#
Temperature watches.
-
enumerator RDC_HEALTH_WATCH_POWER#
Power watches.
-
enumerator RDC_HEALTH_WATCH_PCIE#
-
enum rdc_health_result_t#
type of health result
Values:
-
enumerator RDC_HEALTH_RESULT_PASS#
The health test pass.
-
enumerator RDC_HEALTH_RESULT_WARN#
The health test has warnings.
-
enumerator RDC_HEALTH_RESULT_FAIL#
The health test fail.
-
enumerator RDC_HEALTH_RESULT_PASS#
-
enum rdc_health_error_code_t#
Values:
-
enumerator RDC_FR_PCI_REPLAY_RATE#
-
enumerator RDC_FR_ECC_UNCORRECTABLE_DETECTED#
-
enumerator RDC_FR_PENDING_PAGE_RETIREMENTS#
-
enumerator RDC_FR_RETIRED_PAGES_LIMIT#
-
enumerator RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT#
-
enumerator RDC_FR_CLOCKS_THROTTLE_THERMAL#
-
enumerator RDC_FR_CLOCKS_THROTTLE_POWER#
-
enumerator RDC_FR_XGMI_SINGLE_ERROR#
-
enumerator RDC_FR_XGMI_MULTIPLE_ERROR#
-
enumerator RDC_FR_CORRUPT_INFOROM#
-
enumerator RDC_FR_PCI_REPLAY_RATE#
Functions
-
rdc_status_t rdc_init(uint64_t init_flags)#
Initialize ROCm RDC.
When called, this initializes internal data structures, including those corresponding to sources of information that RDC provides. This must be called before rdc_start_embedded() or rdc_connect()
- Parameters:
init_flags – [in] init_flags Bit flags that tell RDC how to initialize.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_shutdown()#
Shutdown ROCm RDC.
Do any necessary clean up.
-
rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode, rdc_handle_t *p_rdc_handle)#
Start embedded RDC agent within this process.
The RDC is loaded as library so that it does not require rdcd daemon. In this mode, the user has to periodically call rdc_field_update_all() when op_mode is RDC_OPERATION_MODE_MANUAL, which tells RDC to collect the stats.
- Parameters:
op_mode – [in] Operation modes. When RDC_OPERATION_MODE_AUTO, RDC schedules background task to collect the stats. When RDC_OPERATION_MODE_MANUAL, the user needs to call rdc_field_update_all() periodically.
p_rdc_handle – [inout] Caller provided pointer to rdc_handle_t. Upon successful call, the value will contain the handler for following API calls.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_stop_embedded(rdc_handle_t p_rdc_handle)#
Stop embedded RDC agent.
Stop the embedded RDC agent, and p_rdc_handle becomes invalid after this call.
- Parameters:
p_rdc_handle – [in] The RDC handler that comes from rdc_start_embedded().
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_connect(const char *ipAndPort, rdc_handle_t *p_rdc_handle, const char *root_ca, const char *client_cert, const char *client_key)#
Connect to rdcd daemon.
This method is used to connect to a remote stand-alone rdcd daemon.
- Parameters:
ipAndPort – [in] The IP and port of the remote rdcd. The ipAndPort can be specified in this x.x.x.x:yyyy format, where x.x.x.x is the IP address and yyyy is the port.
p_rdc_handle – [inout] Caller provided pointer to rdc_handle_t. Upon successful call, the value will contain the handler for following API calls.
root_ca – [in] The root CA stored in the string in pem format. Set it as nullptr if the communication is not encrypted.
client_cert – [in] The client certificate stored in the string in pem format. Set it as nullptr if the communication is not encrypted.
client_key – [in] The client key stored in the string in pem format. Set it as nullptr if the communication is not encrypted.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_disconnect(rdc_handle_t p_rdc_handle)#
Disconnect from rdcd daemon.
Disconnect from rdcd daemon, and p_rdc_handle becomes invalid after this call.
- Parameters:
p_rdc_handle – [in] The RDC handler that comes from rdc_connect().
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, const char job_id[64], uint64_t update_freq)#
Request the RDC to watch the job stats.
This should be executed as part of job prologue. The summary job stats can be retrieved using rdc_job_get_stats(). In RDC_OPERATION_MODE_MANUAL, the user must call rdc_field_update_all(1) at least once before calling rdc_job_get_stats().
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The group of GPUs to be watched.
job_id – [in] The name of the job.
update_freq – [in] How often to update this field in usec.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, const char job_id[64], rdc_job_info_t *p_job_info)#
Get the stats of the job using the job id.
The stats can be retrieved at any point when the job is in process.
- Parameters:
p_rdc_handle – [in] The RDC handler.
job_id – [in] The name of the job.
p_job_info – [inout] Caller provided pointer to rdc_job_info_t. Upon successful call, the value will contain the stats of the job.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle, const char job_id[64])#
Request RDC to stop watching the stats of the job.
This should be executed as part of job epilogue. The job Id remains available to view the stats at any point. You must call rdc_job_start_stats() before this call.
- Parameters:
p_rdc_handle – [in] The RDC handler.
job_id – [in] The name of the job.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_job_remove(rdc_handle_t p_rdc_handle, const char job_id[64])#
Request RDC to stop tracking the job given by job_id.
After this call, you will no longer be able to call rdc_job_get_stats() on this job_id. But you will be able to reuse the job_id after this call.
- Parameters:
p_rdc_handle – [in] The RDC handler.
job_id – [in] The name of the job.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_job_remove_all(rdc_handle_t p_rdc_handle)#
Request RDC to stop tracking all the jobs.
After this call, you will no longer be able to call rdc_job_get_stats() on any job id. But you will be able to reuse any previously used job id after this call.
- Parameters:
p_rdc_handle – [in] The RDC handler.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_field_update_all(rdc_handle_t p_rdc_handle, uint32_t wait_for_update)#
Request RDC to update all fields to be watched.
In RDC_OPERATION_MODE_MANUAL, the user must call this method periodically.
- Parameters:
p_rdc_handle – [in] The RDC handler.
wait_for_update – [in] Whether or not to wait for the update loop to complete before returning to the caller 1=wait. 0=do not wait.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_device_get_all(rdc_handle_t p_rdc_handle, uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t *count)#
Get indexes corresponding to all the devices on the system.
Indexes represent the RDC GPU Id corresponding to each GPU on the system and are immutable during the lifespan of the engine. The list should be queried again if the engine is restarted.
- Parameters:
p_rdc_handle – [in] The RDC handler.
gpu_index_list – [out] Array reference to fill GPU indexes present on the system.
count – [out] Number of GPUs returned in gpu_index_list.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, uint32_t gpu_index, rdc_device_attributes_t *p_rdc_attr)#
Gets device attributes corresponding to the gpu_index.
Fetch the attributes, such as device name, of a GPU.
- Parameters:
p_rdc_handle – [in] The RDC handler.
gpu_index – [in] GPU index corresponding to which the attributes should be fetched
p_rdc_attr – [out] GPU attribute corresponding to the gpu_index.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_device_get_component_version(rdc_handle_t p_rdc_handle, rdc_component_t component, rdc_component_version_t *p_rdc_compv)#
Get version information of components used by rdc.
Given a component type, return its version information.
- Parameters:
p_rdc_handle – [in] The RDC handler.
component – [in] Type of Components. See rdc_component_t definition for details.
p_rdc_compv – [out] Version information of the corresponding component.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_group_gpu_create(rdc_handle_t p_rdc_handle, rdc_group_type_t type, const char *group_name, rdc_gpu_group_t *p_rdc_group_id)#
Create a group that contains multiple GPUs.
This method creates a group that contains multiple GPUs. Instead of executing an operation separately for each GPU, the RDC group enables the user to execute the same operation on all the GPUs present in the group as a single API call.
- Parameters:
p_rdc_handle – [in] The RDC handler.
type – [in] The type of the group. RDC_GROUP_DEFAULT includes all the GPUs on the node, and RDC_GROUP_EMPTY creates an empty group.
group_name – [in] The group name specified as NULL terminated C String
p_rdc_group_id – [inout] Caller provided pointer to rdc_gpu_group_t. Upon successful call, the value will contain the group id for following group API calls.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, uint32_t gpu_index)#
Add a GPU to the group.
This method can add a GPU to the group
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The group id to which the GPU will be added.
gpu_index – [in] The GPU index to be added to the group.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_group_gpu_get_info(rdc_handle_t p_rdc_handle, rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t *p_rdc_group_info)#
Get information about a GPU group.
Get detail information about a GPU group created by rdc_group_gpu_create
- Parameters:
p_rdc_handle – [in] The RDC handler.
p_rdc_group_id – [in] The GPU group handler created by rdc_group_gpu_create
p_rdc_group_info – [out] The information of the GPU group p_rdc_group_id.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_group_get_all_ids(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id_list[], uint32_t *count)#
Used to get information about all GPU groups in the system.
Get the list of GPU group ids in the system.
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id_list – [out] Array reference to fill GPU group ids in the system.
count – [out] Number of GPU groups returned in group_id_list.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_group_gpu_destroy(rdc_handle_t p_rdc_handle, rdc_gpu_group_t p_rdc_group_id)#
Destroy GPU group represented by p_rdc_group_id.
Delete the logic group represented by p_rdc_group_id
- Parameters:
p_rdc_handle – [in] The RDC handler.
p_rdc_group_id – [in] The group id
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_group_field_create(rdc_handle_t p_rdc_handle, uint32_t num_field_ids, rdc_field_t *field_ids, const char *field_group_name, rdc_field_grp_t *rdc_field_group_id)#
create a group of fields
The user can create a group of fields and perform an operation on a group of fields at once.
- Parameters:
p_rdc_handle – [in] The RDC handler.
num_field_ids – [in] Number of field IDs that are being provided in field_ids.
field_ids – [in] Field IDs to be added to the newly-created field group.
field_group_name – [in] Unique name for this group of fields.
rdc_field_group_id – [out] Handle to the newly-created field group
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_group_field_get_info(rdc_handle_t p_rdc_handle, rdc_field_grp_t rdc_field_group_id, rdc_field_group_info_t *field_group_info)#
Get information about a field group.
Get detail information about a field group created by rdc_group_field_create
- Parameters:
p_rdc_handle – [in] The RDC handler.
rdc_field_group_id – [in] The field group handler created by rdc_group_field_create
field_group_info – [out] The information of the field group rdc_field_group_id.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_group_field_get_all_ids(rdc_handle_t p_rdc_handle, rdc_field_grp_t field_group_id_list[], uint32_t *count)#
Used to get information about all field groups in the system.
Get the list of field group ids in the system.
- Parameters:
p_rdc_handle – [in] The RDC handler.
field_group_id_list – [out] Array reference to fill field group ids in the system.
count – [out] Number of field groups returned in field_group_id_list.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle, rdc_field_grp_t rdc_field_group_id)#
Destroy field group represented by rdc_field_group_id.
Delete the logic group represented by rdc_field_group_id
- Parameters:
p_rdc_handle – [in] The RDC handler.
rdc_field_group_id – [in] The field group id
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_field_watch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, uint64_t update_freq, double max_keep_age, uint32_t max_keep_samples)#
Request the RDC start recording updates for a given field collection.
Note that the first update of the field will not occur until the next field update cycle. To force a field update cycle, user must call rdc_field_update_all(1)
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The group of GPUs to be watched.
field_group_id – [in] The collection of fields to record
update_freq – [in] How often to update fields in usec.
max_keep_age – [in] How long to keep data for fields in seconds.
max_keep_samples – [in] Maximum number of samples to keep. 0=no limit.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_field_get_latest_value(rdc_handle_t p_rdc_handle, uint32_t gpu_index, rdc_field_t field, rdc_field_value *value)#
Request a latest cached field of a GPU.
Note that the field can be cached only after calling rdc_field_watch.
- Parameters:
p_rdc_handle – [in] The RDC handler.
gpu_index – [in] The GPU index.
field – [in] The field id
value – [out] The field value got from cache.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_field_get_value_since(rdc_handle_t p_rdc_handle, uint32_t gpu_index, rdc_field_t field, uint64_t since_time_stamp, uint64_t *next_since_time_stamp, rdc_field_value *value)#
Request a history cached field of a GPU.
Note that the field can be cached only after calling rdc_field_watch.
- Parameters:
p_rdc_handle – [in] The RDC handler.
gpu_index – [in] The GPU index.
field – [in] The field id
since_time_stamp – [in] Timestamp to request values since in usec since 1970.
next_since_time_stamp – [out] Timestamp to use for since_time_stamp on the next call to this function
value – [out] The field value got from cache.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id)#
Stop recording updates for a given field collection.
The cache of those fields will not be updated after this call
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The GPU group id.
field_group_id – [in] The field group id.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, rdc_diag_level_t level, const char *config, size_t config_size, rdc_diag_response_t *response, rdc_diag_callback_t *callback)#
Run the diagnostic test cases.
Run the diagnostic test cases at different levels.
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The GPU group id.
level – [in] The level decides how long the test will run. RDC_DIAG_LVL_SHORT takes only a few seconds, and RDC_DIAG_LVL_LONG may take up to 15 minutes.
config – [in] Implementation specific configuration.
config_size – [in] Length of the configuration.
response – [inout] The detail results of the tests run.
callback – [inout] Callback for realtime communication
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, const char *config, size_t config_size, rdc_diag_test_result_t *result, rdc_diag_callback_t *callback)#
Run one diagnostic test case.
Run a specific diagnostic test case.
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The GPU group id.
test_case – [in] The test case to run.
config – [in] Implementation specific configuration.
config_size – [in] Length of the configuration.
result – [inout] The results of the test.
callback – [inout] Callback for realtime communication
- Return values:
RDC_ST_OK – is returned upon successful call.
-
const char *rdc_status_string(rdc_status_t status)#
Get a description of a provided RDC error status.
return the string in human readable format.
- Parameters:
status – [in] The RDC status.
- Return values:
A string that describes the RDC status.
-
const char *field_id_string(rdc_field_t field_id)#
Get the name of a field.
return the string in human readable format.
- Parameters:
field_id – [in] The field id.
- Return values:
A string that describes the field.
-
rdc_field_t get_field_id_from_name(const char *name)#
Get the field id from name.
return the field id from field name.
- Parameters:
name – [in] The field name.
- Return values:
RDC_FI_INVALID – is returned if the field name is invalid.
-
const char *rdc_diagnostic_result_string(rdc_diag_result_t result)#
Get a description of a diagnostic result.
return the string in human readable format.
- Parameters:
result – [in] The RDC diagnostic result.
- Return values:
A string that describes the RDC diagnostic result.
-
rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, rdc_policy_t policy)#
Set the RDC policy. Each group has multiple policies; these policies can be set by this API one by one. Multiple calls of this API will override the existing policy.
Set the RDC policy
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The GPU group id.
policy – [in] The policy to set
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_policy_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, uint32_t *count, rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS])#
Get the RDC policy.
Get the RDC policy
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The GPU group id.
count – [out] The size of policies array
policies – [out] The policies to get
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_policy_delete(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, rdc_policy_condition_type_t condition_type)#
Delete the RDC policy for this group based on condition type.
Clear the RDC policy for this group based on condition type. In a GPU group, only one policy can be set for a specific rdc_policy_condition_type_t.
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The GPU group id
condition_type – [in] The condition type to delete
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, rdc_policy_register_callback callback)#
Register a function to be called when a policy condition is met.
Register the RDC policy callback
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The GPU group id.
callback – [in] The callback function to be called when the condition is met.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id)#
Un-register a policy callback function for a condition.
Un-register the policy callback for a condition.
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The GPU group id.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_health_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, unsigned int components)#
Enable the health check for a group.
For each group, only one parameter can be set. If you want to clear the setting for a group, set components to 0x0.
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The GPU group id.
components – [in] The list of components that should be enabled for health check, for example, RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, unsigned int *components)#
get the health check settings of a group
get the health check settings of a component
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The GPU group id.
components – [out] The list of components that are enabled for health check, for example, RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER. If it is 0x0, then the health check is not set for the group yet.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_health_check(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, rdc_health_response_t *response)#
Check health watch results.
If there are incidents, check the component and error message of each incident.
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The GPU group id.
response – [inout] The detail results of the health.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
rdc_status_t rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id)#
clear the health watch
For each group, clear the setting.
- Parameters:
p_rdc_handle – [in] The RDC handler.
group_id – [in] The GPU group id.
- Return values:
RDC_ST_OK – is returned upon successful call.
-
GPU_ID_INVALID#
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rdc/checkouts/amd-staging/include
- dir /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rdc/checkouts/amd-staging/include/rdc