ROCm Communication Collectives Library (RCCL) test configuration files#
2025-11-12
7 min read time
These RCCL tests are comprehensive benchmarks that validate distributed GPU communication performance across AMD GPU clusters. These tests ensure optimal performance for AI training, HPC workloads, and distributed computing.
rccl_config.json#
Here’s a code snippet of the rccl_config.json file for reference:
rccl_config.json
{
"rccl":
{
"no_of_nodes": "2",
"no_of_global_ranks": "16",
"no_of_local_ranks": "8",
"ranks_per_node": "8",
"rccl_dir": "/root/cache/INSTALL/rccl-tests",
"rccl_tests_dir": "/root/cache/INSTALL/rccl-tests/build",
"mpi_dir": "/root/cache/INSTALL/ompi-4.1.6/install/bin",
"mpi_path_var": "/root/cache/INSTALL/ompi-4.1.6/install",
"rocm_path_var": "/opt/rocm-7.0.2/",
"rccl_path_var": "/root/cache/INSTALL/rccl-tests",
"cluster_snapshot_debug": "False",
"env_source_script": "None",
"rccl_collective": [ "all_reduce_perf", "all_gather_perf", "scatter_perf", "gather_perf", "reduce_scatter_perf", "sendrecv_perf", "alltoall_perf", "alltoallv_perf", "reduce_scatter_perf", "broadcast_perf" ],
"rccl_algo": [ "ring" ],
"rccl_protocol": [ "simple" ],
"qp_scale": [ "1" ],
"ib_hca_list": "bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7",
"net_dev_list": "ens11np0,ens12np0,ens21np0,ens22np0,ens31np0,ens32np0,ens41np0,ens42np0",
"oob_port": "ens51f0np0",
"gid_index": "1",
"qp_count": "1",
"start_msg_size": "1024",
"end_msg_size": "16g",
"step_function": "2",
"threads_per_gpu": "1",
"warmup_iterations": "10",
"no_of_iterations": "1",
"check_iteration_count": "1",
"nccl_ib_timeout": "30",
"ib_rx_queue_len": "8192",
"ucx_tls": "tcp",
"hcoll_enable_mcast_all": "0",
"nccl_cumem_enable": "0",
"nccl_ib_sl": "0",
"nccl_ib_tc": "0",
"nccl_ib_split_data_on_qps": "0",
"nccl_pxn_disable": "0",
"nccl_net_plugin": "none",
"verify_bus_bw": "False",
"debug_level": "ERROR",
"rccl_result_file": "/tmp/rccl_result_file.json",
"_comments_results": "expected results below are for 2 node cluster, will vary based on cluster size",
"results":
{
"all_reduce_perf":
{
"bus_bw":
{
"8589934592": "330.00",
"17179869184": "350.00"
}
},
"all_gather_perf":
{
"bus_bw":
{
"8589934592": "330.00",
"17179869184": "350.00"
}
},
"gather_perf":
{
"bus_bw":
{
"8589934592": "330.00",
"17179869184": "350.00"
}
},
"scatter_perf":
{
"bus_bw":
{
"8589934592": "330.0",
"17179869184": "350.0"
}
},
"reduce_perf":
{
"bus_bw":
{
"8589934592": "300.0",
"17179869184": "310.00"
}
},
"reduce_scatter_perf":
{
"bus_bw":
{
"8589934592": "340.0",
"17179869184": "360.00"
}
},
"alltoall_perf":
{
"bus_bw":
{
"8589934592": "45.00",
"17179869184": "50.00"
}
},
"alltoallv_perf":
{
"bus_bw":
{
"8589934592": "45.00",
"17179869184": "50.00"
}
},
"sendrecv_perf":
{
"bus_bw":
{
"8589934592": "47.00",
"17179869184": "48.00"
}
},
"broadcast_perf":
{
"bus_bw":
{
"8589934592": "310.00",
"17179869184": "312.00"
}
}
}
}
}
Parameters#
Here’s an exhaustive list of the available parameters in the rccl_config.json RCCL configuration file:
Configuration parameters |
Default values |
Description |
|---|---|---|
|
2 |
Number of nodes in a cluster |
|
16 |
Total MPI ranks across the entire cluster |
|
8 |
MPI ranks running on each individual node |
|
8 |
Confirms GPUs per node |
|
|
Directory where RCCL is installed |
|
|
Directory where RCCL tests are installed |
|
|
Directory where mpi bin is located |
|
|
Directory where mpi is located |
|
|
Path to ROCm installation |
|
|
Directory where RCCL tests are located |
|
False |
Enables/disables cluster debug snapshot |
|
None |
Path to environment setup script |
|
|
RCCL tests list |
|
|
Communication algorithms |
|
Simple |
Communication protocol |
|
1, 2 |
Queue pair scaling factors |
|
|
List of IB hosts |
|
|
List of network hosts |
|
eth0 |
Out of band port |
|
1 |
Global ID index for InfiniBand |
|
1024 |
Start with 1KB messages |
|
16g |
End with 16GB messages |
|
2 |
Double message size each step |
|
1 |
One thread per GPU |
|
10 |
Warmup runs |
|
1 |
Number of iterations to run the RCCL tests |
|
1 |
Verification iteration |
|
30 |
InfiniBand timeout |
|
8192 |
Receive queue length |
|
Tcp |
UCX transport layer |
|
0 |
Disable multicast |
|
0 |
CUDA memory optimizations |
|
0 |
InfiniBand service level |
|
0 |
InfiniBand traffic class |
|
0 |
Don’t split data across queue pairs |
|
0, 1 |
PXN disable options |
|
None |
Network plugin |
|
False |
Verify bus bandwidth |
|
True |
Check for bandwidth drops |
|
True |
Check for latency spikes |
|
ERROR |
Set the debug level |
|
|
Path where RCCL results are captured |
|
|
Expected results are for the two-node cluster and vary based on cluster size |
|
|
Global reduction: sum/max/min across all GPUs |
|
|
All GPUs receive the complete combined dataset from all ranks |
|
|
Collect data from all ranks to one root rank |
|
|
Distribute data from one rank to all ranks |
|
|
Reduce operation |
|
|
Reduce operation followed by scatter |
|
|
Every rank sends unique data to every other rank |
|
|
Variable-size all-to-all exchange |
|
|
Point-to-point communication between rank pairs |
|
|
One-to-all communication |
single_node_mi355_rccl.json#
Here’s a code snippet of the single_node_mi355_rccl.json file for reference:
single_node_mi355_rccl.json
{
"rccl":
{
"no_of_local_ranks": "8",
"rccl_dir": "/opt/rccl-tests/",
"rccl_tests_dir": "/opt/rccl-tests/build",
"rocm_path_var": "/opt/rocm-7.0.2/",
"rccl_path_var": "/opt/rccl-tests/",
"env_source_script": "/root/env_source_file.sh",
"rccl_collective": [ "all_reduce_perf", "all_gather_perf", "scatter_perf", "gather_perf", "reduce_scatter_perf", "sendrecv_perf", "alltoall_perf", "alltoallv_perf", "reduce_scatter_perf", "broadcast_perf" ],
"start_msg_size": "1024",
"end_msg_size": "16g",
"step_function": "2",
"warmup_iterations": "10",
"no_of_iterations": "1",
"check_iteration_count": "1",
"verify_bus_bw": "False",
"verify_bw_dip": "True",
"verify_lat_dip": "True",
"debug_level": "ERROR",
"rccl_result_file": "/tmp/rccl_result_file.json",
"_comments_results": "expected results below are for 2 node cluster, will vary based on cluster size",
"results":
{
"all_reduce_perf":
{
"bus_bw":
{
"8589934592": "390.00",
"17179869184": "393.00"
}
},
"all_gather_perf":
{
"bus_bw":
{
"8589934592": "380.00",
"17179869184": "383.00"
}
},
"gather_perf":
{
"bus_bw":
{
"8589934592": "430.00",
"17179869184": "430.00"
}
},
"reduce_scatter_perf":
{
"bus_bw":
{
"8589934592": "400.0",
"17179869184": "400.00"
}
},
"alltoall_perf":
{
"bus_bw":
{
"8589934592": "340.00",
"17179869184": "345.00"
}
},
"alltoallv_perf":
{
"bus_bw":
{
"8589934592": "340.00",
"17179869184": "345.00"
}
},
"broadcast_perf":
{
"bus_bw":
{
"8589934592": "370.00",
"17179869184": "370.00"
}
}
}
}
}
Parameters#
Here’s an exhaustive list of the available parameters in the single_node_mi355_rccl.json RCCL configuration file:
Configuration parameters |
Default values |
Description |
|---|---|---|
|
8 |
MPI ranks running on each individual node |
|
|
Directory where RCCL is installed |
|
|
Directory where RCCL tests are installed |
|
|
Path to ROCm installation |
|
|
Directory where RCCL tests are located |
|
|
Path to environment setup script |
|
|
RCCL tests list |
|
1024 |
Start with 1KB messages |
|
16g |
End with 16GB messages |
|
2 |
Double message size each step |
|
10 |
Warmup runs |
|
1 |
Number of iterations to run the RCCL tests |
|
1 |
Verification iteration |
|
False |
Verify bus bandwidth |
|
True |
Check for bandwidth drops |
|
True |
Check for latency spikes |
|
ERROR |
Set the debug level |
|
|
Path where RCCL results are captured |
|
|
Expected results are for the two-node cluster and vary based on cluster size |
|
|
Global reduction: sum/max/min across all GPUs |
|
|
All GPUs receive the complete combined dataset from all ranks |
|
|
Collect data from all ranks to one root rank |
|
|
Reduce operation followed by scatter |
|
|
Every rank sends unique data to every other rank |
|
|
Variable-size all-to-all exchange |
|
|
One-to-all communication |