rocm_smi.h Source File

rocm_smi.h Source File#

ROCmSMI: rocm_smi.h Source File
rocm_smi.h
Go to the documentation of this file.
1 /*
2  * =============================================================================
3  * The University of Illinois/NCSA
4  * Open Source License (NCSA)
5  *
6  * Copyright (c) 2017, Advanced Micro Devices, Inc.
7  * All rights reserved.
8  *
9  * Developed by:
10  *
11  * AMD Research and AMD ROC Software Development
12  *
13  * Advanced Micro Devices, Inc.
14  *
15  * www.amd.com
16  *
17  * Permission is hereby granted, free of charge, to any person obtaining a copy
18  * of this software and associated documentation files (the "Software"), to
19  * deal with the Software without restriction, including without limitation
20  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
21  * and/or sell copies of the Software, and to permit persons to whom the
22  * Software is furnished to do so, subject to the following conditions:
23  *
24  * - Redistributions of source code must retain the above copyright notice,
25  * this list of conditions and the following disclaimers.
26  * - Redistributions in binary form must reproduce the above copyright
27  * notice, this list of conditions and the following disclaimers in
28  * the documentation and/or other materials provided with the distribution.
29  * - Neither the names of <Name of Development Group, Name of Institution>,
30  * nor the names of its contributors may be used to endorse or promote
31  * products derived from this Software without specific prior written
32  * permission.
33  *
34  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
35  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
37  * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
38  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
39  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
40  * DEALINGS WITH THE SOFTWARE.
41  *
42  */
43 #ifndef INCLUDE_ROCM_SMI_ROCM_SMI_H_
44 #define INCLUDE_ROCM_SMI_ROCM_SMI_H_
45 
46 #ifdef __cplusplus
47 extern "C" {
48 #include <cstdint>
49 #else
50 #include <stdint.h>
51 #endif // __cplusplus
52 
53 #include <stddef.h>
54 #include <stdbool.h>
55 
56 #include "rocm_smi/kfd_ioctl.h"
57 
73 #define RSMI_MAX_NUM_FREQUENCIES 32
74 
77 #define RSMI_MAX_FAN_SPEED 255
78 
80 #define RSMI_NUM_VOLTAGE_CURVE_POINTS 3
81 
85 typedef enum {
107  RSMI_INITIALIZATION_ERROR = RSMI_STATUS_INIT_ERROR,
129 
131 } rsmi_status_t;
132 
139 typedef enum {
146  RSMI_INIT_FLAG_RESRV_TEST1 = 0x800000000000000,
148 
152 typedef enum {
154  RSMI_DEV_PERF_LEVEL_FIRST = RSMI_DEV_PERF_LEVEL_AUTO,
155 
170 
171  RSMI_DEV_PERF_LEVEL_LAST = RSMI_DEV_PERF_LEVEL_DETERMINISM,
172 
176 typedef rsmi_dev_perf_level_t rsmi_dev_perf_level;
178 
185 typedef enum {
186  RSMI_SW_COMP_FIRST = 0x0,
187 
188  RSMI_SW_COMP_DRIVER = RSMI_SW_COMP_FIRST,
189 
190  RSMI_SW_COMP_LAST = RSMI_SW_COMP_DRIVER
192 
200 typedef uintptr_t rsmi_event_handle_t;
201 
208 typedef enum {
211  RSMI_EVNT_GRP_INVALID = 0xFFFFFFFF
213 
220 typedef enum {
221  RSMI_EVNT_FIRST = RSMI_EVNT_GRP_XGMI,
222 
223  RSMI_EVNT_XGMI_FIRST = RSMI_EVNT_GRP_XGMI,
224  RSMI_EVNT_XGMI_0_NOP_TX = RSMI_EVNT_XGMI_FIRST,
229 
242  // i.e., Throughput = BEATS/time_running * 10^9 bytes/sec
252 
253  RSMI_EVNT_XGMI_LAST = RSMI_EVNT_XGMI_1_BEATS_TX, // 5
254 
255  RSMI_EVNT_XGMI_DATA_OUT_FIRST = RSMI_EVNT_GRP_XGMI_DATA_OUT, // 10
256 
257  /*
258  * @brief Events in the RSMI_EVNT_GRP_XGMI_DATA_OUT group measure
259  * the number of beats sent on an XGMI link. Each beat represents
260  * 32 bytes. RSMI_EVNT_XGMI_DATA_OUT_n represents the number of
261  * outbound beats (each representing 32 bytes) on link n.<br><br>
262  *
263  * XGMI throughput can be calculated by multiplying an event
264  * such as ::RSMI_EVNT_XGMI_DATA_OUT_n by 32 and dividing by
265  * the time for which event collection occurred,
266  * ::rsmi_counter_value_t.time_running (which is in nanoseconds). To get
267  * bytes per second, multiply this value by 10<sup>9</sup>.<br>
268  * <br>
269  * Throughput = BEATS * 32 / time_running * 10<sup>9</sup> (bytes/second)<br>
270  */
271  // i.e., Throughput = BEATS * 32 / time_running * 10^9 bytes/sec
272  RSMI_EVNT_XGMI_DATA_OUT_0 = RSMI_EVNT_XGMI_DATA_OUT_FIRST,
278  RSMI_EVNT_XGMI_DATA_OUT_LAST = RSMI_EVNT_XGMI_DATA_OUT_5,
279 
280  RSMI_EVNT_LAST = RSMI_EVNT_XGMI_DATA_OUT_LAST,
282 
286 typedef enum {
291 
295 typedef struct {
296  uint64_t value;
297  uint64_t time_enabled;
299  uint64_t time_running;
302 
306 typedef enum {
307  RSMI_EVT_NOTIF_VMFAULT = KFD_SMI_EVENT_VMFAULT,
308  RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT,
309  RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE,
310  RSMI_EVT_NOTIF_GPU_PRE_RESET = KFD_SMI_EVENT_GPU_PRE_RESET,
311  RSMI_EVT_NOTIF_GPU_POST_RESET = KFD_SMI_EVENT_GPU_POST_RESET,
312 
313  RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_GPU_POST_RESET
315 
319 #define RSMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
320 
322 #define MAX_EVENT_NOTIFICATION_MSG_SIZE 64
323 
327 typedef struct {
328  uint32_t dv_ind;
332 
336 typedef enum {
338  RSMI_CLK_TYPE_FIRST = RSMI_CLK_TYPE_SYS,
344 
345  // Add new clocks to the end (not in the middle) and update
346  // RSMI_CLK_TYPE_LAST
347  RSMI_CLK_TYPE_LAST = RSMI_CLK_TYPE_MEM,
348  RSMI_CLK_INVALID = 0xFFFFFFFF
351 typedef rsmi_clk_type_t rsmi_clk_type;
353 
359 typedef enum {
361  RSMI_TEMP_FIRST = RSMI_TEMP_CURRENT,
362 
393 
394  RSMI_TEMP_LAST = RSMI_TEMP_HIGHEST
397 typedef rsmi_temperature_metric_t rsmi_temperature_metric;
399 
404 typedef enum {
405  RSMI_TEMP_TYPE_FIRST = 0,
406 
407  RSMI_TEMP_TYPE_EDGE = RSMI_TEMP_TYPE_FIRST,
415  RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_HBM_3,
416  RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF
418 
424 typedef enum {
426 
427  RSMI_VOLT_FIRST = RSMI_VOLT_CURRENT,
435 
436  RSMI_VOLT_LAST = RSMI_VOLT_HIGHEST
438 
443 typedef enum {
444  RSMI_VOLT_TYPE_FIRST = 0,
445 
446  RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST,
448  RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDGFX,
449  RSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF
451 
458 typedef enum {
464 
466  RSMI_PWR_PROF_PRST_3D_FULL_SCR_MASK = 0x20,
469 
471  RSMI_PWR_PROF_PRST_INVALID = 0xFFFFFFFFFFFFFFFF
474 typedef rsmi_power_profile_preset_masks_t rsmi_power_profile_preset_masks;
476 
480 typedef enum {
481  RSMI_GPU_BLOCK_INVALID = 0x0000000000000000,
483  RSMI_GPU_BLOCK_FIRST = 0x0000000000000001,
484 
485  RSMI_GPU_BLOCK_UMC = RSMI_GPU_BLOCK_FIRST,
486  RSMI_GPU_BLOCK_SDMA = 0x0000000000000002,
487  RSMI_GPU_BLOCK_GFX = 0x0000000000000004,
488  RSMI_GPU_BLOCK_MMHUB = 0x0000000000000008,
489  RSMI_GPU_BLOCK_ATHUB = 0x0000000000000010,
490  RSMI_GPU_BLOCK_PCIE_BIF = 0x0000000000000020,
491  RSMI_GPU_BLOCK_HDP = 0x0000000000000040,
492  RSMI_GPU_BLOCK_XGMI_WAFL = 0x0000000000000080,
493  RSMI_GPU_BLOCK_DF = 0x0000000000000100,
494  RSMI_GPU_BLOCK_SMN = 0x0000000000000200,
495  RSMI_GPU_BLOCK_SEM = 0x0000000000000400,
496  RSMI_GPU_BLOCK_MP0 = 0x0000000000000800,
497  RSMI_GPU_BLOCK_MP1 = 0x0000000000001000,
498  RSMI_GPU_BLOCK_FUSE = 0x0000000000002000,
499 
502  RSMI_GPU_BLOCK_RESERVED = 0x8000000000000000
505 typedef rsmi_gpu_block_t rsmi_gpu_block;
507 
511 typedef enum {
520 
521  RSMI_RAS_ERR_STATE_LAST = RSMI_RAS_ERR_STATE_ENABLED,
522  RSMI_RAS_ERR_STATE_INVALID = 0xFFFFFFFF
524 
528 typedef enum {
529  RSMI_MEM_TYPE_FIRST = 0,
530 
531  RSMI_MEM_TYPE_VRAM = RSMI_MEM_TYPE_FIRST,
534 
535  RSMI_MEM_TYPE_LAST = RSMI_MEM_TYPE_GTT
537 
541 typedef enum {
544  RSMI_FREQ_IND_INVALID = 0xFFFFFFFF
547 typedef rsmi_freq_ind_t rsmi_freq_ind;
549 
550 
555 typedef enum {
556  RSMI_FW_BLOCK_FIRST = 0,
557 
558  RSMI_FW_BLOCK_ASD = RSMI_FW_BLOCK_FIRST,
559  RSMI_FW_BLOCK_CE,
560  RSMI_FW_BLOCK_DMCU,
561  RSMI_FW_BLOCK_MC,
562  RSMI_FW_BLOCK_ME,
563  RSMI_FW_BLOCK_MEC,
564  RSMI_FW_BLOCK_MEC2,
565  RSMI_FW_BLOCK_PFP,
566  RSMI_FW_BLOCK_RLC,
567  RSMI_FW_BLOCK_RLC_SRLC,
568  RSMI_FW_BLOCK_RLC_SRLG,
569  RSMI_FW_BLOCK_RLC_SRLS,
570  RSMI_FW_BLOCK_SDMA,
571  RSMI_FW_BLOCK_SDMA2,
572  RSMI_FW_BLOCK_SMC,
573  RSMI_FW_BLOCK_SOS,
574  RSMI_FW_BLOCK_TA_RAS,
575  RSMI_FW_BLOCK_TA_XGMI,
576  RSMI_FW_BLOCK_UVD,
577  RSMI_FW_BLOCK_VCE,
578  RSMI_FW_BLOCK_VCN,
579 
580  RSMI_FW_BLOCK_LAST = RSMI_FW_BLOCK_VCN
582 
586 typedef enum {
587  RSMI_XGMI_STATUS_NO_ERRORS = 0,
588  RSMI_XGMI_STATUS_ERROR,
589  RSMI_XGMI_STATUS_MULTIPLE_ERRORS,
591 
595 typedef uint64_t rsmi_bit_field_t;
597 typedef rsmi_bit_field_t rsmi_bit_field;
599 
603 typedef enum {
611 
615 typedef enum _RSMI_IO_LINK_TYPE {
620  RSMI_IOLINK_TYPE_SIZE = 0xFFFFFFFF
622 
626 typedef enum {
629  RSMI_COARSE_GRAIN_GFX_ACTIVITY = RSMI_UTILIZATION_COUNTER_FIRST,
631  RSMI_UTILIZATION_COUNTER_LAST = RSMI_COARSE_GRAIN_MEM_ACTIVITY
633 
637 typedef struct {
639  uint64_t value;
641 
645 typedef struct {
646  uint64_t page_address;
647  uint64_t page_size;
650 
654 #define RSMI_MAX_NUM_POWER_PROFILES (sizeof(rsmi_bit_field_t) * 8)
655 
661 typedef struct {
666 
671 
675  uint32_t num_profiles;
678 typedef rsmi_power_profile_status_t rsmi_power_profile_status;
680 
684 typedef struct {
688  uint32_t num_supported;
689 
693  uint32_t current;
694 
699  uint64_t frequency[RSMI_MAX_NUM_FREQUENCIES];
702 typedef rsmi_frequencies_t rsmi_frequencies;
704 
710 typedef struct {
715 
720  uint32_t lanes[RSMI_MAX_NUM_FREQUENCIES];
722 
724 typedef rsmi_pcie_bandwidth_t rsmi_pcie_bandwidth;
726 
730 typedef struct {
731  uint32_t major;
732  uint32_t minor;
733  uint32_t patch;
734  const char *build;
737 typedef rsmi_version_t rsmi_version;
739 
742 typedef struct {
743  uint64_t lower_bound;
744  uint64_t upper_bound;
745 } rsmi_range_t;
747 typedef rsmi_range_t rsmi_range;
749 
753 typedef struct {
754  uint64_t frequency;
755  uint64_t voltage;
758 typedef rsmi_od_vddc_point_t rsmi_od_vddc_point;
760 
766 typedef struct {
771 typedef rsmi_freq_volt_region_t rsmi_freq_volt_region;
773 
777 typedef struct {
785 typedef rsmi_od_volt_curve_t rsmi_od_volt_curve;
787 
791 typedef struct {
797 
802  uint32_t num_regions;
805 typedef rsmi_od_volt_freq_data_t rsmi_od_volt_freq_data;
807 
808 
817  // TODO(amd) Doxygen documents
819  uint16_t structure_size;
820  uint8_t format_revision;
821  uint8_t content_revision;
823 };
824 
828 // Below is the assumed version of gpu_metric data on the device. If the device
829 // is using this version, we can read data directly into rsmi_gpu_metrics_t.
830 // If the device is using an older format, a conversion of formats will be
831 // required.
832 // DGPU targets have a format version of 1. APU targets have a format version of
833 // 2. Currently, only version 1 (DGPU) gpu_metrics is supported.
834 #define RSMI_GPU_METRICS_API_FORMAT_VER 1
835 // The content version increments when gpu_metrics is extended with new fields
836 // and/or existing field sizes are changed.
837 #define RSMI_GPU_METRICS_API_CONTENT_VER_1 1
838 #define RSMI_GPU_METRICS_API_CONTENT_VER_2 2
839 #define RSMI_GPU_METRICS_API_CONTENT_VER_3 3
840 
841 // This should match NUM_HBM_INSTANCES
842 #define RSMI_NUM_HBM_INSTANCES 4
843 
844 // Unit conversion factor for HBM temperatures
845 #define CENTRIGRADE_TO_MILLI_CENTIGRADE 1000
846 
847 typedef struct {
848  // TODO(amd) Doxygen documents
850  struct metrics_table_header_t common_header;
851 
852 /* Temperature */
853  uint16_t temperature_edge;
854  uint16_t temperature_hotspot;
855  uint16_t temperature_mem;
856  uint16_t temperature_vrgfx;
857  uint16_t temperature_vrsoc;
858  uint16_t temperature_vrmem;
859 
860 /* Utilization */
861  uint16_t average_gfx_activity;
862  uint16_t average_umc_activity; // memory controller
863  uint16_t average_mm_activity; // UVD or VCN
864 
865 /* Power/Energy */
866  uint16_t average_socket_power;
867  uint64_t energy_accumulator; // v1 mod. (32->64)
868 
869 /* Driver attached timestamp (in ns) */
870  uint64_t system_clock_counter; // v1 mod. (moved from top of struct)
871 
872 /* Average clocks */
873  uint16_t average_gfxclk_frequency;
874  uint16_t average_socclk_frequency;
875  uint16_t average_uclk_frequency;
876  uint16_t average_vclk0_frequency;
877  uint16_t average_dclk0_frequency;
878  uint16_t average_vclk1_frequency;
879  uint16_t average_dclk1_frequency;
880 
881 /* Current clocks */
882  uint16_t current_gfxclk;
883  uint16_t current_socclk;
884  uint16_t current_uclk;
885  uint16_t current_vclk0;
886  uint16_t current_dclk0;
887  uint16_t current_vclk1;
888  uint16_t current_dclk1;
889 
890 /* Throttle status */
891  uint32_t throttle_status;
892 
893 /* Fans */
894  uint16_t current_fan_speed;
895 
896 /* Link width/speed */
897  uint16_t pcie_link_width; // v1 mod.(8->16)
898  uint16_t pcie_link_speed; // in 0.1 GT/s; v1 mod. (8->16)
899 
900  uint16_t padding; // new in v1
901 
902  uint32_t gfx_activity_acc; // new in v1
903  uint32_t mem_actvity_acc; // new in v1
904  uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1
907 
911 typedef struct {
912  uint64_t correctable_err;
913  uint64_t uncorrectable_err;
915 
919 typedef struct {
920  uint32_t process_id;
921  uint32_t pasid;
922  uint64_t vram_usage;
923  uint64_t sdma_usage;
924  uint32_t cu_occupancy;
926 
927 
931 typedef struct rsmi_func_id_iter_handle * rsmi_func_id_iter_handle_t;
932 
935 #define RSMI_DEFAULT_VARIANT 0xFFFFFFFFFFFFFFFF
936 
942 typedef union id {
943  uint64_t id;
944  const char *name;
945  union {
959  rsmi_gpu_block_t gpu_block_type;
960  };
962 
963 
964 /*****************************************************************************/
982 rsmi_status_t rsmi_init(uint64_t init_flags);
983 
990  // end of InitShut
992 
993 /*****************************************************************************/
1011 
1039 rsmi_status_t rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id);
1040 
1041 
1066 rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, char *sku);
1067 
1091 rsmi_status_t rsmi_dev_vendor_id_get(uint32_t dv_ind, uint16_t *id);
1092 
1127 rsmi_status_t rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len);
1128 
1161 rsmi_status_t rsmi_dev_brand_get(uint32_t dv_ind, char *brand, uint32_t len);
1162 
1197 rsmi_status_t rsmi_dev_vendor_name_get(uint32_t dv_ind, char *name,
1198  size_t len);
1199 
1222 rsmi_status_t rsmi_dev_vram_vendor_get(uint32_t dv_ind, char *brand,
1223  uint32_t len);
1224 
1254  char *serial_num, uint32_t len);
1278 rsmi_status_t rsmi_dev_subsystem_id_get(uint32_t dv_ind, uint16_t *id);
1279 
1315 rsmi_dev_subsystem_name_get(uint32_t dv_ind, char *name, size_t len);
1316 
1335 rsmi_dev_drm_render_minor_get(uint32_t dv_ind, uint32_t *minor);
1336 
1359 rsmi_status_t rsmi_dev_subsystem_vendor_id_get(uint32_t dv_ind, uint16_t *id);
1360 
1382 rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *id);
1383  // end of IDQuer
1385 
1386 /*****************************************************************************/
1410 
1446 rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid);
1447 
1470 rsmi_status_t rsmi_topo_numa_affinity_get(uint32_t dv_ind, uint32_t *numa_node);
1471 
1496 rsmi_status_t rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent,
1497  uint64_t *received, uint64_t *max_pkt_sz);
1498 
1522  uint64_t *counter);
1523  // end of PCIeQuer
1525 /*****************************************************************************/
1560 rsmi_status_t rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask);
1561  // end of PCIeCont
1563 
1564 /*****************************************************************************/
1595 rsmi_dev_power_ave_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *power);
1596 
1628 rsmi_dev_energy_count_get(uint32_t dv_ind, uint64_t *power,
1629  float *counter_resolution, uint64_t *timestamp);
1630 
1657 rsmi_dev_power_cap_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *cap);
1658 
1681 rsmi_dev_power_cap_default_get(uint32_t dv_ind, uint64_t *default_cap);
1682 
1715 rsmi_dev_power_cap_range_get(uint32_t dv_ind, uint32_t sensor_ind,
1716  uint64_t *max, uint64_t *min);
1717  // end of PowerQuer
1719 
1720 /*****************************************************************************/
1746 rsmi_dev_power_cap_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t cap);
1747 
1768 rsmi_dev_power_profile_set(uint32_t dv_ind, uint32_t reserved,
1769  rsmi_power_profile_preset_masks_t profile); // end of PowerCont
1771 /*****************************************************************************/
1772 
1773 
1774 
1775 /*****************************************************************************/
1808  uint64_t *total);
1809 
1838  uint64_t *used);
1839 
1863 rsmi_dev_memory_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent);
1864 
1901 rsmi_dev_memory_reserved_pages_get(uint32_t dv_ind, uint32_t *num_pages,
1902  rsmi_retired_page_record_t *records); // end of MemQuer
1904 
1936 rsmi_status_t rsmi_dev_fan_rpms_get(uint32_t dv_ind, uint32_t sensor_ind,
1937  int64_t *speed);
1938 
1967  uint32_t sensor_ind, int64_t *speed);
1968 
1995  uint32_t sensor_ind, uint64_t *max_speed);
1996 
2027 rsmi_status_t rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
2028  rsmi_temperature_metric_t metric, int64_t *temperature);
2029 
2061  rsmi_voltage_type_t sensor_type,
2062  rsmi_voltage_metric_t metric, int64_t *voltage); // end of PhysQuer
2064 
2065 /*****************************************************************************/
2085 rsmi_status_t rsmi_dev_fan_reset(uint32_t dv_ind, uint32_t sensor_ind);
2086 
2109 rsmi_status_t rsmi_dev_fan_speed_set(uint32_t dv_ind, uint32_t sensor_ind,
2110  uint64_t speed);
2111  // end of PhysCont
2113 /*****************************************************************************/
2144 rsmi_dev_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent);
2145 
2177  rsmi_utilization_counter_t utilization_counters[],
2178  uint32_t count,
2179  uint64_t *timestamp);
2180 
2205  rsmi_dev_perf_level_t *perf);
2206 
2229 rsmi_status_t rsmi_perf_determinism_mode_set(uint32_t dv_ind, uint64_t clkvalue);
2254 rsmi_status_t rsmi_dev_overdrive_level_get(uint32_t dv_ind, uint32_t *od);
2255 
2284  rsmi_clk_type_t clk_type, rsmi_frequencies_t *f);
2285 
2300 
2323 
2345  rsmi_gpu_metrics_t *pgpu_metrics);
2346 
2369 rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue,
2370  uint64_t maxclkvalue,
2371  rsmi_clk_type_t clkType);
2372 
2396  uint64_t clkvalue,
2397  rsmi_clk_type_t clkType);
2398 
2420 rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint,
2421  uint64_t clkvalue, uint64_t voltvalue);
2422 
2462  uint32_t *num_regions, rsmi_freq_volt_region_t *buffer);
2463 
2500 rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t sensor_ind,
2501  rsmi_power_profile_status_t *status);
2502  // end of PerfQuer
2504 /*****************************************************************************/
2505 
2534 
2555 
2599 rsmi_status_t rsmi_dev_overdrive_level_set(int32_t dv_ind, uint32_t od);
2600 
2640 rsmi_status_t rsmi_dev_overdrive_level_set_v1(uint32_t dv_ind, uint32_t od);
2641 
2677  rsmi_clk_type_t clk_type, uint64_t freq_bitmask);
2678  // end of PerfCont
2680 
2681 /*****************************************************************************/
2702 
2730  uint32_t len);
2731 
2759 rsmi_dev_vbios_version_get(uint32_t dv_ind, char *vbios, uint32_t len);
2760 
2787  uint64_t *fw_version);
2788  // end of VersQuer
2790 
2791 /*****************************************************************************/
2825 
2854  uint64_t *enabled_blocks);
2855 
2882  rsmi_ras_err_state_t *state);
2898 rsmi_status_string(rsmi_status_t status, const char **status_string);
2899  // end of ErrQuer
2901 
2902 /*****************************************************************************/
3025 
3056  rsmi_event_handle_t *evnt_handle);
3057 
3073 
3093  rsmi_counter_command_t cmd, void *cmd_args);
3094 
3114  rsmi_counter_value_t *value);
3115 
3137  rsmi_event_group_t grp, uint32_t *available); // end of PerfCntr
3139 
3140 /*****************************************************************************/
3180 
3204 
3240 rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices,
3241  uint32_t *num_devices);
3242  // end of SysInfo
3244 
3245 /*****************************************************************************/
3276 
3290 rsmi_dev_xgmi_error_reset(uint32_t dv_ind);
3291 
3311 rsmi_dev_xgmi_hive_id_get(uint32_t dv_ind, uint64_t *hive_id);
3312  // end of SysInfo
3314 
3315 /*****************************************************************************/
3339 rsmi_topo_get_numa_node_number(uint32_t dv_ind, uint32_t *numa_node);
3340 
3362 rsmi_topo_get_link_weight(uint32_t dv_ind_src, uint32_t dv_ind_dst,
3363  uint64_t *weight);
3364 
3388 rsmi_minmax_bandwidth_get(uint32_t dv_ind_src, uint32_t dv_ind_dst,
3389  uint64_t *min_bandwidth, uint64_t *max_bandwidth);
3390 
3416 rsmi_topo_get_link_type(uint32_t dv_ind_src, uint32_t dv_ind_dst,
3417  uint64_t *hops, RSMI_IO_LINK_TYPE *type);
3418 
3440 rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst,
3441  bool *accessible);
3442  // end of HWTopo
3444 
3445 /*****************************************************************************/
3580  rsmi_func_id_iter_handle_t *handle);
3581 
3608  rsmi_func_id_iter_handle_t *var_iter);
3609 
3630 
3644 
3665  rsmi_func_id_value_t *value);
3666  // end of APISupport
3668 
3669 /*****************************************************************************/
3691 
3719 rsmi_event_notification_mask_set(uint32_t dv_ind, uint64_t mask);
3720 
3763  uint32_t *num_elem, rsmi_evt_notification_data_t *data);
3764 
3784  // end of EvntNotif
3786 
3787 #ifdef __cplusplus
3788 }
3789 #endif // __cplusplus
3790 #endif // INCLUDE_ROCM_SMI_ROCM_SMI_H_
rsmi_status_t rsmi_func_iter_value_get(rsmi_func_id_iter_handle_t handle, rsmi_func_id_value_t *value)
Get the value associated with a function/variant iterator.
rsmi_status_t rsmi_dev_supported_variant_iterator_open(rsmi_func_id_iter_handle_t obj_h, rsmi_func_id_iter_handle_t *var_iter)
Get a variant iterator for a given handle.
rsmi_status_t rsmi_func_iter_next(rsmi_func_id_iter_handle_t handle)
Advance a function identifier iterator.
rsmi_status_t rsmi_dev_supported_func_iterator_close(rsmi_func_id_iter_handle_t *handle)
Close a variant iterator handle.
rsmi_status_t rsmi_dev_supported_func_iterator_open(uint32_t dv_ind, rsmi_func_id_iter_handle_t *handle)
Get a function name iterator of supported RSMI functions for a device.
rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_ras_err_state_t *state)
Retrieve the ECC status for a GPU block.
rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind, uint64_t *enabled_blocks)
Retrieve the enabled ECC bit-mask.
rsmi_status_t rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t *ec)
Retrieve the error counts for a GPU block.
rsmi_status_t rsmi_status_string(rsmi_status_t status, const char **status_string)
Get a description of a provided RSMI error status.
rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind)
Close any file handles and free any resources used by event notification for a GPU.
rsmi_status_t rsmi_event_notification_get(int timeout_ms, uint32_t *num_elem, rsmi_evt_notification_data_t *data)
Collect event notifications, waiting a specified amount of time.
rsmi_status_t rsmi_event_notification_init(uint32_t dv_ind)
Prepare to collect event notifications for a GPU.
rsmi_status_t rsmi_event_notification_mask_set(uint32_t dv_ind, uint64_t mask)
Specify which events to collect for a device.
rsmi_status_t rsmi_topo_get_link_type(uint32_t dv_ind_src, uint32_t dv_ind_dst, uint64_t *hops, RSMI_IO_LINK_TYPE *type)
Retrieve the hops and the connection type between 2 GPUs.
rsmi_status_t rsmi_topo_get_link_weight(uint32_t dv_ind_src, uint32_t dv_ind_dst, uint64_t *weight)
Retrieve the weight for a connection between 2 GPUs.
rsmi_status_t rsmi_minmax_bandwidth_get(uint32_t dv_ind_src, uint32_t dv_ind_dst, uint64_t *min_bandwidth, uint64_t *max_bandwidth)
Retrieve the minimal and maximal IO link bandwidth between 2 GPUs.
rsmi_status_t rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, bool *accessible)
Return P2P availability status between 2 GPUs.
rsmi_status_t rsmi_topo_get_numa_node_number(uint32_t dv_ind, uint32_t *numa_node)
Retrieve the NUMA CPU node number for a device.
rsmi_status_t rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len)
Get the name string of a gpu device.
rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *id)
Get Unique ID.
rsmi_status_t rsmi_num_monitor_devices(uint32_t *num_devices)
Get the number of devices that have monitor information.
rsmi_status_t rsmi_dev_vram_vendor_get(uint32_t dv_ind, char *brand, uint32_t len)
Get the vram vendor string of a gpu device.
rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, char *sku)
Get the SKU for the device with the provided device index.
rsmi_status_t rsmi_dev_drm_render_minor_get(uint32_t dv_ind, uint32_t *minor)
Get the drm minor number associated with this device.
rsmi_status_t rsmi_dev_subsystem_id_get(uint32_t dv_ind, uint16_t *id)
Get the subsystem device id associated with the device with provided device index.
rsmi_status_t rsmi_dev_serial_number_get(uint32_t dv_ind, char *serial_num, uint32_t len)
Get the serial number string for a device.
rsmi_status_t rsmi_dev_vendor_name_get(uint32_t dv_ind, char *name, size_t len)
Get the name string for a given vendor ID.
rsmi_status_t rsmi_dev_brand_get(uint32_t dv_ind, char *brand, uint32_t len)
Get the brand string of a gpu device.
rsmi_status_t rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id)
Get the device id associated with the device with provided device index.
rsmi_status_t rsmi_dev_subsystem_name_get(uint32_t dv_ind, char *name, size_t len)
Get the name string for the device subsystem.
rsmi_status_t rsmi_dev_subsystem_vendor_id_get(uint32_t dv_ind, uint16_t *id)
Get the device subsystem vendor id associated with the device with provided device index.
rsmi_status_t rsmi_dev_vendor_id_get(uint32_t dv_ind, uint16_t *id)
Get the device vendor id associated with the device with provided device index.
rsmi_status_t rsmi_init(uint64_t init_flags)
Initialize ROCm SMI.
rsmi_status_t rsmi_shut_down(void)
Shutdown ROCm SMI.
rsmi_status_t rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t *total)
Get the total amount of memory that exists.
rsmi_status_t rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t *used)
Get the current memory usage.
rsmi_status_t rsmi_dev_memory_reserved_pages_get(uint32_t dv_ind, uint32_t *num_pages, rsmi_retired_page_record_t *records)
Get information about reserved ("retired") memory pages.
rsmi_status_t rsmi_dev_memory_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent)
Get percentage of time any device memory is being used.
rsmi_status_t rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask)
Control the set of allowed PCIe bandwidths that can be used.
rsmi_status_t rsmi_topo_numa_affinity_get(uint32_t dv_ind, uint32_t *numa_node)
Get the NUMA node associated with a device.
rsmi_status_t rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent, uint64_t *received, uint64_t *max_pkt_sz)
Get PCIe traffic information.
rsmi_status_t rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *bandwidth)
Get the list of possible PCIe bandwidths that are available.
rsmi_status_t rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, uint64_t *counter)
Get PCIe replay counter.
rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid)
Get the unique PCI device identifier associated with a device.
rsmi_status_t rsmi_counter_control(rsmi_event_handle_t evt_handle, rsmi_counter_command_t cmd, void *cmd_args)
Issue performance counter control commands.
rsmi_status_t rsmi_counter_read(rsmi_event_handle_t evt_handle, rsmi_counter_value_t *value)
Read the current value of a performance counter.
rsmi_status_t rsmi_counter_available_counters_get(uint32_t dv_ind, rsmi_event_group_t grp, uint32_t *available)
Get the number of currently available counters.
rsmi_status_t rsmi_dev_counter_destroy(rsmi_event_handle_t evnt_handle)
Deallocate a performance counter object.
rsmi_status_t rsmi_dev_counter_create(uint32_t dv_ind, rsmi_event_type_t type, rsmi_event_handle_t *evnt_handle)
Create a performance counter object.
rsmi_status_t rsmi_dev_counter_group_supported(uint32_t dv_ind, rsmi_event_group_t group)
Tell if an event group is supported by a given device.
rsmi_status_t rsmi_dev_overdrive_level_set_v1(uint32_t dv_ind, uint32_t od)
Set the overdrive percent associated with the device with provided device index with the provided val...
rsmi_status_t rsmi_dev_perf_level_set(int32_t dv_ind, rsmi_dev_perf_level_t perf_lvl)
Set the PowerPlay performance level associated with the device with provided device index with the pr...
rsmi_status_t rsmi_dev_overdrive_level_set(int32_t dv_ind, uint32_t od)
Set the overdrive percent associated with the device with provided device index with the provided val...
rsmi_status_t rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, rsmi_clk_type_t clk_type, uint64_t freq_bitmask)
Control the set of allowed frequencies that can be used for the specified clock.
rsmi_status_t rsmi_dev_perf_level_set_v1(uint32_t dv_ind, rsmi_dev_perf_level_t perf_lvl)
Set the PowerPlay performance level associated with the device with provided device index with the pr...
rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue, uint64_t maxclkvalue, rsmi_clk_type_t clkType)
This function sets the clock range information.
rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind, uint32_t *num_regions, rsmi_freq_volt_region_t *buffer)
This function will retrieve the current valid regions in the frequency/voltage space.
rsmi_status_t rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t sensor_ind, rsmi_power_profile_status_t *status)
Get the list of available preset power profiles and an indication of which profile is currently activ...
rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, uint64_t clkvalue, rsmi_clk_type_t clkType)
This function sets the clock frequency information.
rsmi_status_t rsmi_dev_perf_level_get(uint32_t dv_ind, rsmi_dev_perf_level_t *perf)
Get the performance level of the device with provided device index.
rsmi_status_t rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv)
This function retrieves the voltage/frequency curve information.
rsmi_status_t rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *pgpu_metrics)
This function retrieves the gpu metrics information.
rsmi_status_t rsmi_perf_determinism_mode_set(uint32_t dv_ind, uint64_t clkvalue)
Enter performance determinism mode with provided device index.
rsmi_status_t rsmi_dev_gpu_reset(int32_t dv_ind)
Reset the gpu associated with the device with provided device index.
rsmi_status_t rsmi_utilization_count_get(uint32_t dv_ind, rsmi_utilization_counter_t utilization_counters[], uint32_t count, uint64_t *timestamp)
Get coarse grain utilization counter of the specified device.
rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, uint64_t clkvalue, uint64_t voltvalue)
This function sets 1 of the 3 voltage curve points.
rsmi_status_t rsmi_dev_overdrive_level_get(uint32_t dv_ind, uint32_t *od)
Get the overdrive percent associated with the device with provided device index.
rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, rsmi_clk_type_t clk_type, rsmi_frequencies_t *f)
Get the list of possible system clock speeds of device for a specified clock type.
rsmi_status_t rsmi_dev_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent)
Get percentage of time device is busy doing any processing.
rsmi_status_t rsmi_dev_fan_speed_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t speed)
Set the fan speed for the specified device with the provided speed, in RPMs.
rsmi_status_t rsmi_dev_fan_reset(uint32_t dv_ind, uint32_t sensor_ind)
Reset the fan to automatic driver control.
rsmi_status_t rsmi_dev_volt_metric_get(uint32_t dv_ind, rsmi_voltage_type_t sensor_type, rsmi_voltage_metric_t metric, int64_t *voltage)
Get the voltage metric value for the specified metric, from the specified voltage sensor on the speci...
rsmi_status_t rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t *temperature)
Get the temperature metric value for the specified metric, from the specified temperature sensor on t...
rsmi_status_t rsmi_dev_fan_speed_max_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *max_speed)
Get the max. fan speed of the device with provided device index.
rsmi_status_t rsmi_dev_fan_rpms_get(uint32_t dv_ind, uint32_t sensor_ind, int64_t *speed)
Get the fan speed in RPMs of the device with the specified device index and 0-based sensor index.
rsmi_status_t rsmi_dev_fan_speed_get(uint32_t dv_ind, uint32_t sensor_ind, int64_t *speed)
Get the fan speed for the specified device as a value relative to RSMI_MAX_FAN_SPEED.
rsmi_status_t rsmi_dev_power_cap_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t cap)
Set the power cap value.
rsmi_status_t rsmi_dev_power_profile_set(uint32_t dv_ind, uint32_t reserved, rsmi_power_profile_preset_masks_t profile)
Set the power profile.
rsmi_status_t rsmi_dev_energy_count_get(uint32_t dv_ind, uint64_t *power, float *counter_resolution, uint64_t *timestamp)
Get the energy accumulator counter of the device with provided device index.
rsmi_status_t rsmi_dev_power_cap_default_get(uint32_t dv_ind, uint64_t *default_cap)
Get the default power cap for the device specified by dv_ind.
rsmi_status_t rsmi_dev_power_ave_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *power)
Get the average power consumption of the device with provided device index.
rsmi_status_t rsmi_dev_power_cap_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *cap)
Get the cap on power which, when reached, causes the system to take action to reduce power.
rsmi_status_t rsmi_dev_power_cap_range_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *max, uint64_t *min)
Get the range of valid values for the power cap.
rsmi_status_t rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices, uint32_t *num_devices)
Get the device indices currently being used by a process.
rsmi_status_t rsmi_compute_process_info_by_pid_get(uint32_t pid, rsmi_process_info_t *proc)
Get process information about a specific process.
rsmi_status_t rsmi_compute_process_info_get(rsmi_process_info_t *procs, uint32_t *num_items)
Get process information about processes currently using GPU.
rsmi_status_t rsmi_version_get(rsmi_version_t *version)
Get the build version information for the currently running build of RSMI.
rsmi_status_t rsmi_dev_vbios_version_get(uint32_t dv_ind, char *vbios, uint32_t len)
Get the VBIOS identifier string.
rsmi_status_t rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block, uint64_t *fw_version)
Get the firmware versions for a device.
rsmi_status_t rsmi_version_str_get(rsmi_sw_component_t component, char *ver_str, uint32_t len)
Get the driver version string for the current system.
rsmi_status_t rsmi_dev_xgmi_error_status(uint32_t dv_ind, rsmi_xgmi_status_t *status)
Retrieve the XGMI error status for a device.
rsmi_status_t rsmi_dev_xgmi_error_reset(uint32_t dv_ind)
Reset the XGMI error status for a device.
rsmi_status_t rsmi_dev_xgmi_hive_id_get(uint32_t dv_ind, uint64_t *hive_id)
Retrieve the XGMI hive id for a device.
struct rsmi_func_id_iter_handle * rsmi_func_id_iter_handle_t
Opaque handle to function-support object.
Definition: rocm_smi.h:931
rsmi_memory_page_status_t
Reserved Memory Page States.
Definition: rocm_smi.h:603
@ RSMI_MEM_PAGE_STATUS_UNRESERVABLE
Unable to reserve this page.
Definition: rocm_smi.h:609
@ RSMI_MEM_PAGE_STATUS_PENDING
Definition: rocm_smi.h:606
@ RSMI_MEM_PAGE_STATUS_RESERVED
Definition: rocm_smi.h:604
rsmi_status_t
Error codes returned by rocm_smi_lib functions.
Definition: rocm_smi.h:85
@ RSMI_STATUS_UNEXPECTED_DATA
Definition: rocm_smi.h:122
@ RSMI_STATUS_REFCOUNT_OVERFLOW
exceeded INT32_MAX
Definition: rocm_smi.h:127
@ RSMI_STATUS_NOT_FOUND
Definition: rocm_smi.h:112
@ RSMI_STATUS_UNKNOWN_ERROR
An unknown error occurred.
Definition: rocm_smi.h:130
@ RSMI_STATUS_INIT_ERROR
Definition: rocm_smi.h:104
@ RSMI_STATUS_INSUFFICIENT_SIZE
Definition: rocm_smi.h:114
@ RSMI_STATUS_INVALID_ARGS
Passed in arguments are not valid.
Definition: rocm_smi.h:87
@ RSMI_STATUS_NOT_SUPPORTED
Definition: rocm_smi.h:88
@ RSMI_STATUS_NO_DATA
Definition: rocm_smi.h:120
@ RSMI_STATUS_UNEXPECTED_SIZE
Definition: rocm_smi.h:118
@ RSMI_STATUS_FILE_ERROR
Definition: rocm_smi.h:91
@ RSMI_STATUS_BUSY
Definition: rocm_smi.h:124
@ RSMI_STATUS_NOT_YET_IMPLEMENTED
Definition: rocm_smi.h:108
@ RSMI_STATUS_OUT_OF_RESOURCES
Definition: rocm_smi.h:99
@ RSMI_STATUS_INTERRUPT
Definition: rocm_smi.h:116
@ RSMI_STATUS_INTERNAL_EXCEPTION
An internal exception was caught.
Definition: rocm_smi.h:101
@ RSMI_STATUS_SUCCESS
Operation was successful.
Definition: rocm_smi.h:86
@ RSMI_STATUS_INPUT_OUT_OF_BOUNDS
Definition: rocm_smi.h:102
@ RSMI_STATUS_PERMISSION
Definition: rocm_smi.h:96
RSMI_UTILIZATION_COUNTER_TYPE
The utilization counter type.
Definition: rocm_smi.h:626
@ RSMI_COARSE_GRAIN_MEM_ACTIVITY
Memory Activity.
Definition: rocm_smi.h:630
@ RSMI_UTILIZATION_COUNTER_FIRST
GFX Activity.
Definition: rocm_smi.h:627
_RSMI_IO_LINK_TYPE
Types for IO Link.
Definition: rocm_smi.h:615
@ RSMI_IOLINK_TYPE_XGMI
XGMI.
Definition: rocm_smi.h:618
@ RSMI_IOLINK_TYPE_UNDEFINED
unknown type.
Definition: rocm_smi.h:616
@ RSMI_IOLINK_TYPE_PCIEXPRESS
PCI Express.
Definition: rocm_smi.h:617
@ RSMI_IOLINK_TYPE_SIZE
Max of IO Link types.
Definition: rocm_smi.h:620
@ RSMI_IOLINK_TYPE_NUMIOLINKTYPES
Number of IO Link types.
Definition: rocm_smi.h:619
rsmi_sw_component_t
Software components.
Definition: rocm_smi.h:185
@ RSMI_SW_COMP_DRIVER
Driver.
Definition: rocm_smi.h:188
rsmi_event_group_t
Enum denoting an event group. The value of the enum is the base value for all the event enums in the ...
Definition: rocm_smi.h:208
@ RSMI_EVNT_GRP_XGMI
Data Fabric (XGMI) related events.
Definition: rocm_smi.h:209
@ RSMI_EVNT_GRP_XGMI_DATA_OUT
XGMI Outbound data.
Definition: rocm_smi.h:210
#define MAX_EVENT_NOTIFICATION_MSG_SIZE
Maximum number of characters in an event notification message.
Definition: rocm_smi.h:322
rsmi_dev_perf_level_t
PowerPlay performance levels.
Definition: rocm_smi.h:152
@ RSMI_DEV_PERF_LEVEL_LOW
Definition: rocm_smi.h:156
@ RSMI_DEV_PERF_LEVEL_MANUAL
Definition: rocm_smi.h:160
@ RSMI_DEV_PERF_LEVEL_STABLE_MIN_MCLK
Definition: rocm_smi.h:165
@ RSMI_DEV_PERF_LEVEL_UNKNOWN
Unknown performance level.
Definition: rocm_smi.h:173
@ RSMI_DEV_PERF_LEVEL_DETERMINISM
Performance determinism state.
Definition: rocm_smi.h:169
@ RSMI_DEV_PERF_LEVEL_HIGH
Definition: rocm_smi.h:158
@ RSMI_DEV_PERF_LEVEL_STABLE_STD
Definition: rocm_smi.h:162
@ RSMI_DEV_PERF_LEVEL_STABLE_PEAK
Stable power state with peak clocks.
Definition: rocm_smi.h:164
@ RSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK
Definition: rocm_smi.h:167
@ RSMI_DEV_PERF_LEVEL_AUTO
Performance level is "auto".
Definition: rocm_smi.h:153
rsmi_voltage_metric_t
Voltage Metrics. This enum is used to identify various Voltage metrics. Corresponding values will be ...
Definition: rocm_smi.h:424
@ RSMI_VOLT_LOWEST
Historical minimum voltage.
Definition: rocm_smi.h:433
@ RSMI_VOLT_MIN_CRIT
Voltage critical min value.
Definition: rocm_smi.h:429
@ RSMI_VOLT_MAX_CRIT
Voltage critical max value.
Definition: rocm_smi.h:431
@ RSMI_VOLT_CURRENT
Voltage current value.
Definition: rocm_smi.h:425
@ RSMI_VOLT_MIN
Voltage min value.
Definition: rocm_smi.h:430
@ RSMI_VOLT_AVERAGE
Average voltage.
Definition: rocm_smi.h:432
@ RSMI_VOLT_MAX
Voltage max value.
Definition: rocm_smi.h:428
@ RSMI_VOLT_HIGHEST
Historical maximum voltage.
Definition: rocm_smi.h:434
rsmi_evt_notification_type_t
Definition: rocm_smi.h:306
@ RSMI_EVT_NOTIF_VMFAULT
VM page fault.
Definition: rocm_smi.h:307
#define RSMI_NUM_VOLTAGE_CURVE_POINTS
The number of points that make up a voltage-frequency curve definition.
Definition: rocm_smi.h:80
rsmi_xgmi_status_t
XGMI Status.
Definition: rocm_smi.h:586
rsmi_voltage_type_t
This enumeration is used to indicate which type of voltage reading should be obtained.
Definition: rocm_smi.h:443
@ RSMI_VOLT_TYPE_VDDGFX
Definition: rocm_smi.h:446
@ RSMI_VOLT_TYPE_INVALID
Invalid type.
Definition: rocm_smi.h:449
rsmi_temperature_metric_t
Temperature Metrics. This enum is used to identify various temperature metrics. Corresponding values ...
Definition: rocm_smi.h:359
@ RSMI_TEMP_CURRENT
Temperature current value.
Definition: rocm_smi.h:360
@ RSMI_TEMP_LOWEST
Historical minimum temperature.
Definition: rocm_smi.h:391
@ RSMI_TEMP_CRIT_MIN
Definition: rocm_smi.h:383
@ RSMI_TEMP_CRIT_MIN_HYST
Definition: rocm_smi.h:386
@ RSMI_TEMP_MIN
Temperature min value.
Definition: rocm_smi.h:364
@ RSMI_TEMP_MAX
Temperature max value.
Definition: rocm_smi.h:363
@ RSMI_TEMP_EMERGENCY
Definition: rocm_smi.h:376
@ RSMI_TEMP_MAX_HYST
Definition: rocm_smi.h:365
@ RSMI_TEMP_MIN_HYST
Definition: rocm_smi.h:368
@ RSMI_TEMP_CRITICAL_HYST
Definition: rocm_smi.h:373
@ RSMI_TEMP_EMERGENCY_HYST
Definition: rocm_smi.h:380
@ RSMI_TEMP_HIGHEST
Historical maximum temperature.
Definition: rocm_smi.h:392
@ RSMI_TEMP_CRITICAL
Definition: rocm_smi.h:371
@ RSMI_TEMP_OFFSET
Definition: rocm_smi.h:389
rsmi_ras_err_state_t
The current ECC state.
Definition: rocm_smi.h:511
@ RSMI_RAS_ERR_STATE_MULT_UC
Multiple uncorrectable errors.
Definition: rocm_smi.h:516
@ RSMI_RAS_ERR_STATE_POISON
Definition: rocm_smi.h:517
@ RSMI_RAS_ERR_STATE_DISABLED
ECC is disabled.
Definition: rocm_smi.h:513
@ RSMI_RAS_ERR_STATE_PARITY
ECC errors present, but type unknown.
Definition: rocm_smi.h:514
@ RSMI_RAS_ERR_STATE_SING_C
Single correctable error.
Definition: rocm_smi.h:515
@ RSMI_RAS_ERR_STATE_ENABLED
ECC is enabled.
Definition: rocm_smi.h:519
@ RSMI_RAS_ERR_STATE_NONE
No current errors.
Definition: rocm_smi.h:512
rsmi_event_type_t
Event type enum. Events belonging to a particular event group rsmi_event_group_t should begin enumera...
Definition: rocm_smi.h:220
@ RSMI_EVNT_XGMI_1_RESPONSE_TX
Definition: rocm_smi.h:247
@ RSMI_EVNT_XGMI_DATA_OUT_5
Outbound beats to neighbor 5.
Definition: rocm_smi.h:277
@ RSMI_EVNT_XGMI_1_BEATS_TX
Definition: rocm_smi.h:249
@ RSMI_EVNT_XGMI_1_NOP_TX
NOPs sent to neighbor 1.
Definition: rocm_smi.h:244
@ RSMI_EVNT_XGMI_0_NOP_TX
NOPs sent to neighbor 0.
Definition: rocm_smi.h:224
@ RSMI_EVNT_XGMI_1_REQUEST_TX
neighbor 1
Definition: rocm_smi.h:245
@ RSMI_EVNT_XGMI_DATA_OUT_3
Outbound beats to neighbor 3.
Definition: rocm_smi.h:275
@ RSMI_EVNT_XGMI_DATA_OUT_4
Outbound beats to neighbor 4.
Definition: rocm_smi.h:276
@ RSMI_EVNT_XGMI_DATA_OUT_2
Outbound beats to neighbor 2.
Definition: rocm_smi.h:274
@ RSMI_EVNT_XGMI_0_RESPONSE_TX
Definition: rocm_smi.h:227
@ RSMI_EVNT_XGMI_DATA_OUT_1
Outbound beats to neighbor 1.
Definition: rocm_smi.h:273
@ RSMI_EVNT_XGMI_0_BEATS_TX
Data beats sent to neighbor 0; Each beat represents 32 bytes.
Definition: rocm_smi.h:243
@ RSMI_EVNT_XGMI_0_REQUEST_TX
Definition: rocm_smi.h:225
rsmi_freq_ind_t
The values of this enum are used as frequency identifiers.
Definition: rocm_smi.h:541
@ RSMI_FREQ_IND_MAX
Index used for the maximum frequency value.
Definition: rocm_smi.h:543
@ RSMI_FREQ_IND_MIN
Index used for the minimum frequency value.
Definition: rocm_smi.h:542
@ RSMI_FREQ_IND_INVALID
An invalid frequency index.
Definition: rocm_smi.h:544
rsmi_memory_type_t
Types of memory.
Definition: rocm_smi.h:528
@ RSMI_MEM_TYPE_VRAM
VRAM memory.
Definition: rocm_smi.h:531
@ RSMI_MEM_TYPE_GTT
GTT memory.
Definition: rocm_smi.h:533
@ RSMI_MEM_TYPE_VIS_VRAM
VRAM memory that is visible.
Definition: rocm_smi.h:532
rsmi_power_profile_preset_masks_t
Pre-set Profile Selections. These bitmasks can be AND'd with the rsmi_power_profile_status_t....
Definition: rocm_smi.h:458
@ RSMI_PWR_PROF_PRST_LAST
Invalid power profile.
Definition: rocm_smi.h:468
@ RSMI_PWR_PROF_PRST_VIDEO_MASK
Video Power Profile.
Definition: rocm_smi.h:460
@ RSMI_PWR_PROF_PRST_VR_MASK
VR Power Profile.
Definition: rocm_smi.h:463
@ RSMI_PWR_PROF_PRST_CUSTOM_MASK
Custom Power Profile.
Definition: rocm_smi.h:459
@ RSMI_PWR_PROF_PRST_POWER_SAVING_MASK
Power Saving Profile.
Definition: rocm_smi.h:461
@ RSMI_PWR_PROF_PRST_COMPUTE_MASK
Compute Power Profile.
Definition: rocm_smi.h:462
@ RSMI_PWR_PROF_PRST_BOOTUP_DEFAULT
Default Boot Up Profile.
Definition: rocm_smi.h:467
rsmi_gpu_block_t
This enum is used to identify different GPU blocks.
Definition: rocm_smi.h:480
@ RSMI_GPU_BLOCK_SMN
SMN block.
Definition: rocm_smi.h:494
@ RSMI_GPU_BLOCK_ATHUB
ATHUB block.
Definition: rocm_smi.h:489
@ RSMI_GPU_BLOCK_GFX
GFX block.
Definition: rocm_smi.h:487
@ RSMI_GPU_BLOCK_MMHUB
MMHUB block.
Definition: rocm_smi.h:488
@ RSMI_GPU_BLOCK_FUSE
Fuse block.
Definition: rocm_smi.h:498
@ RSMI_GPU_BLOCK_HDP
HDP block.
Definition: rocm_smi.h:491
@ RSMI_GPU_BLOCK_DF
DF block.
Definition: rocm_smi.h:493
@ RSMI_GPU_BLOCK_SEM
SEM block.
Definition: rocm_smi.h:495
@ RSMI_GPU_BLOCK_INVALID
Definition: rocm_smi.h:481
@ RSMI_GPU_BLOCK_MP1
MP1 block.
Definition: rocm_smi.h:497
@ RSMI_GPU_BLOCK_XGMI_WAFL
XGMI block.
Definition: rocm_smi.h:492
@ RSMI_GPU_BLOCK_UMC
UMC block.
Definition: rocm_smi.h:485
@ RSMI_GPU_BLOCK_LAST
for supported blocks
Definition: rocm_smi.h:500
@ RSMI_GPU_BLOCK_PCIE_BIF
PCIE_BIF block.
Definition: rocm_smi.h:490
@ RSMI_GPU_BLOCK_MP0
MP0 block.
Definition: rocm_smi.h:496
@ RSMI_GPU_BLOCK_SDMA
SDMA block.
Definition: rocm_smi.h:486
rsmi_fw_block_t
The values of this enum are used to identify the various firmware blocks.
Definition: rocm_smi.h:555
rsmi_init_flags_t
Initialization flags.
Definition: rocm_smi.h:139
@ RSMI_INIT_FLAG_RESRV_TEST1
Reserved for test.
Definition: rocm_smi.h:146
@ RSMI_INIT_FLAG_ALL_GPUS
Definition: rocm_smi.h:140
enum _RSMI_IO_LINK_TYPE RSMI_IO_LINK_TYPE
Types for IO Link.
#define RSMI_MAX_NUM_FREQUENCIES
Guaranteed maximum possible number of supported frequencies.
Definition: rocm_smi.h:73
uintptr_t rsmi_event_handle_t
Handle to performance event counter.
Definition: rocm_smi.h:200
rsmi_counter_command_t
Definition: rocm_smi.h:286
@ RSMI_CNTR_CMD_START
Start the counter.
Definition: rocm_smi.h:287
@ RSMI_CNTR_CMD_STOP
Definition: rocm_smi.h:288
uint64_t rsmi_bit_field_t
Bitfield used in various RSMI calls.
Definition: rocm_smi.h:595
rsmi_clk_type_t
Definition: rocm_smi.h:336
@ RSMI_CLK_TYPE_MEM
Memory clock.
Definition: rocm_smi.h:343
@ RSMI_CLK_TYPE_DCEF
Display Controller Engine clock.
Definition: rocm_smi.h:341
@ RSMI_CLK_TYPE_SOC
SOC clock.
Definition: rocm_smi.h:342
@ RSMI_CLK_TYPE_DF
Definition: rocm_smi.h:339
@ RSMI_CLK_TYPE_SYS
System clock.
Definition: rocm_smi.h:337
union id rsmi_func_id_value_t
This union holds the value of an rsmi_func_id_iter_handle_t. The value may be a function name,...
rsmi_temperature_type_t
This enumeration is used to indicate from which part of the device a temperature reading should be ob...
Definition: rocm_smi.h:404
@ RSMI_TEMP_TYPE_HBM_2
HBM temperature instance 2.
Definition: rocm_smi.h:413
@ RSMI_TEMP_TYPE_HBM_0
HBM temperature instance 0.
Definition: rocm_smi.h:411
@ RSMI_TEMP_TYPE_HBM_1
HBM temperature instance 1.
Definition: rocm_smi.h:412
@ RSMI_TEMP_TYPE_MEMORY
VRAM temperature.
Definition: rocm_smi.h:410
@ RSMI_TEMP_TYPE_INVALID
Invalid type.
Definition: rocm_smi.h:416
@ RSMI_TEMP_TYPE_EDGE
Edge GPU temperature.
Definition: rocm_smi.h:407
@ RSMI_TEMP_TYPE_JUNCTION
Definition: rocm_smi.h:408
@ RSMI_TEMP_TYPE_HBM_3
HBM temperature instance 3.
Definition: rocm_smi.h:414
The following structures hold the gpu metrics values for a device.
Definition: rocm_smi.h:816
Definition: rocm_smi.h:295
uint64_t value
Counter value.
Definition: rocm_smi.h:296
uint64_t time_enabled
Definition: rocm_smi.h:297
uint64_t time_running
Definition: rocm_smi.h:299
This structure holds error counts.
Definition: rocm_smi.h:911
uint64_t correctable_err
Accumulated correctable errors.
Definition: rocm_smi.h:912
uint64_t uncorrectable_err
Accumulated uncorrectable errors.
Definition: rocm_smi.h:913
Definition: rocm_smi.h:327
rsmi_evt_notification_type_t event
Event type.
Definition: rocm_smi.h:329
uint32_t dv_ind
Index of device that corresponds to the event.
Definition: rocm_smi.h:328
This structure holds 2 rsmi_range_t's, one for frequency and one for voltage. These 2 ranges indicate...
Definition: rocm_smi.h:766
rsmi_range_t volt_range
The voltage range for this VDDC Curve point.
Definition: rocm_smi.h:768
rsmi_range_t freq_range
The frequency range for this VDDC Curve point.
Definition: rocm_smi.h:767
This structure holds information about clock frequencies.
Definition: rocm_smi.h:684
uint32_t current
Definition: rocm_smi.h:693
uint32_t num_supported
Definition: rocm_smi.h:688
Definition: rocm_smi.h:847
This structure represents a point on the frequency-voltage plane.
Definition: rocm_smi.h:753
uint64_t frequency
Frequency coordinate (in Hz)
Definition: rocm_smi.h:754
uint64_t voltage
Voltage coordinate (in mV)
Definition: rocm_smi.h:755
Definition: rocm_smi.h:777
This structure holds the frequency-voltage values for a device.
Definition: rocm_smi.h:791
rsmi_range_t mclk_freq_limits
The range of possible MCLK values.
Definition: rocm_smi.h:796
uint32_t num_regions
The number of voltage curve regions.
Definition: rocm_smi.h:802
rsmi_range_t curr_mclk_range
Definition: rocm_smi.h:793
rsmi_range_t sclk_freq_limits
The range of possible SCLK values.
Definition: rocm_smi.h:795
rsmi_range_t curr_sclk_range
The current SCLK frequency range.
Definition: rocm_smi.h:792
rsmi_od_volt_curve_t curve
The current voltage curve.
Definition: rocm_smi.h:801
This structure holds information about the possible PCIe bandwidths. Specifically,...
Definition: rocm_smi.h:710
rsmi_frequencies_t transfer_rate
Definition: rocm_smi.h:714
This structure contains information about which power profiles are supported by the system for a give...
Definition: rocm_smi.h:661
uint32_t num_profiles
Definition: rocm_smi.h:675
rsmi_bit_field_t available_profiles
Definition: rocm_smi.h:665
rsmi_power_profile_preset_masks_t current
Definition: rocm_smi.h:670
This structure contains information specific to a process.
Definition: rocm_smi.h:919
uint32_t cu_occupancy
Compute Unit usage in percent.
Definition: rocm_smi.h:924
uint32_t pasid
PASID.
Definition: rocm_smi.h:921
uint32_t process_id
Process ID.
Definition: rocm_smi.h:920
uint64_t sdma_usage
SDMA usage in microseconds.
Definition: rocm_smi.h:923
uint64_t vram_usage
VRAM usage.
Definition: rocm_smi.h:922
This structure represents a range (e.g., frequencies or voltages).
Definition: rocm_smi.h:742
uint64_t upper_bound
Upper bound of range.
Definition: rocm_smi.h:744
uint64_t lower_bound
Lower bound of range.
Definition: rocm_smi.h:743
Reserved Memory Page Record.
Definition: rocm_smi.h:645
uint64_t page_size
Page size.
Definition: rocm_smi.h:647
rsmi_memory_page_status_t status
Page "reserved" status.
Definition: rocm_smi.h:648
uint64_t page_address
Start address of page.
Definition: rocm_smi.h:646
The utilization counter data.
Definition: rocm_smi.h:637
RSMI_UTILIZATION_COUNTER_TYPE type
Utilization counter type.
Definition: rocm_smi.h:638
uint64_t value
Utilization counter value.
Definition: rocm_smi.h:639
This structure holds version information.
Definition: rocm_smi.h:730
uint32_t patch
Patch, build or stepping version.
Definition: rocm_smi.h:733
const char * build
Build string.
Definition: rocm_smi.h:734
uint32_t minor
Minor version.
Definition: rocm_smi.h:732
uint32_t major
Major version.
Definition: rocm_smi.h:731
This union holds the value of an rsmi_func_id_iter_handle_t. The value may be a function name,...
Definition: rocm_smi.h:942
rsmi_temperature_metric_t temp_metric
Used for rsmi_temperature_metric_t variants.
Definition: rocm_smi.h:949
rsmi_memory_type_t memory_type
Used for rsmi_memory_type_t variants.
Definition: rocm_smi.h:947
rsmi_event_group_t evnt_group
Used for rsmi_event_group_t variants.
Definition: rocm_smi.h:953
rsmi_clk_type_t clk_type
Used for rsmi_clk_type_t variants.
Definition: rocm_smi.h:955
uint64_t id
uint64_t representation of value
Definition: rocm_smi.h:943
rsmi_event_type_t evnt_type
Used for rsmi_event_type_t variants.
Definition: rocm_smi.h:951
rsmi_fw_block_t fw_block
Used for rsmi_fw_block_t variants.
Definition: rocm_smi.h:957
const char * name
name string (applicable to functions only)
Definition: rocm_smi.h:944