使用 PyTorch 测试 CUDA 设备的性能（单卡或多卡并行）

以下CUDA设备泛指NVIDIA显卡或启用ROCm的AMD显卡

测试环境：
- Distributor ID: Ubuntu
- Description: Ubuntu 22.04.3 LTS
- Release: 22.04
- Codename: jammy

1.首先，简单使用torch.ones测试CUDA设备

import torch
import time


def cuda_benchmark(device_id, N=1000000, iterations=10000):
    """Time repeated in-place additions on a single CUDA device.

    A tensor of ``N`` ones is created on the selected device and incremented
    ``iterations`` times. The first ten resulting values and the elapsed
    wall-clock time are printed; nothing is returned.

    Args:
        device_id: CUDA device to benchmark (anything accepted by
            ``torch.cuda.set_device``).
        N: Number of elements in the test tensor.
        iterations: Number of ``data += 1`` operations inside the timed
            region. Defaults to the value that used to be hard-coded.
    """
    # Make the requested card the current device for this process.
    torch.cuda.set_device(device_id)

    # Allocate directly on the current GPU rather than building the tensor
    # on the host and copying it across.
    data = torch.ones(N, device="cuda")

    # The fill above is asynchronous; drain it so that it is not charged to
    # the timed region.
    torch.cuda.synchronize()
    start_time = time.perf_counter()
    for _ in range(iterations):
        data += 1
    # Kernel launches return immediately, so wait for the queued work to
    # finish before stopping the clock.
    torch.cuda.synchronize()
    end_time = time.perf_counter()

    # Copy the result back to host memory for printing.
    result = data.cpu().numpy()

    print(f"Benchmark结果：{result[:10]}")
    print(f"执行时间：{end_time - start_time} 秒")


if __name__ == '__main__':
    # Benchmark the first card.
    device_id = 0
    cuda_benchmark(device_id, 10000000)

2.使用自定义的 CUDABenchmarkModel（通过 nn.DataParallel）测试一块或多块 CUDA 设备

import torch
import torch.nn as nn
import time


class CUDABenchmarkModel(nn.Module):
    """Minimal workload for the benchmark: a single 10 -> 10 linear layer.

    Args:
        device: Device the layer's parameters are placed on. Defaults to
            ``"cuda"`` (the current CUDA device), which matches the previous
            hard-coded ``.cuda()`` call.
    """

    def __init__(self, device="cuda"):
        super().__init__()
        self.fc = nn.Linear(10, 10).to(device)

    def forward(self, x):
        """Apply the linear layer to ``x`` (last dimension must be 10)."""
        return self.fc(x)


def cuda_benchmark(device_ids, N=10000000, iterations=10000):
    """Time repeated forward passes of the model under ``nn.DataParallel``.

    Args:
        device_ids: CUDA devices to spread each batch over (ints or
            ``torch.device`` objects). ``None`` selects every visible device.
        N: Number of rows in the ``(N, 10)`` input batch.
        iterations: Number of forward passes inside the timed region.
            Defaults to the value that used to be hard-coded.
    """
    if device_ids is None:
        device_ids = list(range(torch.cuda.device_count()))

    # nn.DataParallel requires the module's parameters to live on
    # device_ids[0]; placing them on the *current* device fails whenever the
    # list does not start with that device.
    first = device_ids[0]
    primary = first if isinstance(first, torch.device) else torch.device("cuda", first)

    model = CUDABenchmarkModel(device=primary)
    model = nn.DataParallel(model, device_ids=device_ids)

    # Keep the input on the primary device; DataParallel scatters it from there.
    data = torch.ones(N, 10, device=primary)

    # Drain setup work on every participating device before timing.
    for dev in device_ids:
        torch.cuda.synchronize(dev)
    start_time = time.perf_counter()
    for _ in range(iterations):
        output = model(data)
    # A bare synchronize() only waits for the current device, so wait on each
    # device that took part in the computation.
    for dev in device_ids:
        torch.cuda.synchronize(dev)
    end_time = time.perf_counter()

    print(f"执行时间：{end_time - start_time} 秒")


if __name__ == '__main__':
    # Benchmark three cards at once.
    device_ids = [0, 1, 2]
    cuda_benchmark(device_ids=device_ids)

3.使用 NCCL 后端以多进程方式（每个进程对应一块显卡）测试 CUDA 设备

import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
import time


def cuda_benchmark(device_id, N=10000000, iterations=10000):
    """Time repeated in-place additions on a single CUDA device.

    Prints the device's multiprocessor count, the first ten resulting values
    and the elapsed wall-clock time; nothing is returned.

    Args:
        device_id: CUDA device to benchmark (anything accepted by
            ``torch.cuda.set_device``).
        N: Number of elements in the test tensor.
        iterations: Number of ``data += 1`` operations inside the timed
            region. Defaults to the value that used to be hard-coded.
    """
    # Make the requested card the current device for this process.
    torch.cuda.set_device(device_id)
    # NOTE: multi_processor_count is the number of multiprocessors reported
    # by the device properties, not the number of individual cores.
    print(f"该GPU的核心数量为：{torch.cuda.get_device_properties(device_id).multi_processor_count}")

    # Allocate directly on the current GPU rather than building the tensor
    # on the host and copying it across.
    data = torch.ones(N, device="cuda")

    # The fill above is asynchronous; drain it so that it is not charged to
    # the timed region.
    torch.cuda.synchronize()
    start_time = time.perf_counter()
    for _ in range(iterations):
        data += 1
    # Kernel launches return immediately, so wait for the queued work to
    # finish before stopping the clock.
    torch.cuda.synchronize()
    end_time = time.perf_counter()

    # Copy the result back to host memory for printing.
    result = data.cpu().numpy()

    print(f"Benchmark结果：{result[:10]}")
    print(f"执行时间：{end_time - start_time} 秒")


def main(num):
    """Spawn ``num`` worker processes, one per GPU, and wait for them."""
    mp.spawn(run, args=(num,), nprocs=num)


def run(rank, world_size):
    """Entry point of each spawned process.

    Args:
        rank: Index of this process, supplied by ``mp.spawn``; also used as
            the CUDA device index.
        world_size: Total number of processes taking part.
    """
    dist.init_process_group(
        "nccl",
        init_method="tcp://127.0.0.1:23456",
        rank=rank,
        world_size=world_size,
    )
    try:
        # One process drives one card: rank N benchmarks GPU N.
        device_id = rank
        cuda_benchmark(device_id)
    finally:
        # Release the process group even if the benchmark raises.
        dist.destroy_process_group()


if __name__ == '__main__':
    # Launch three processes, one per card.
    device_numbers = 3
    main(device_numbers)