"""Profile a few SGD training steps of a small linear model with torch.profiler.

Running this file as a script prints a table of the most expensive operators
and writes a Chrome trace to ``./trace.json``.
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torch.profiler import profile, record_function, ProfilerActivity


def pick_device() -> torch.device:
    """Return the CUDA device when one is available, otherwise the CPU."""
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")


def profiler_activities(device: torch.device) -> list:
    """Return the profiler activities to record for a run on *device*.

    CPU activity is always recorded; CUDA activity is only requested when
    the run actually uses a CUDA device.
    """
    if device.type == "cuda":
        return [ProfilerActivity.CUDA, ProfilerActivity.CPU]
    return [ProfilerActivity.CPU]


def sort_key_for(device: torch.device) -> str:
    """Return the ``key_averages().table`` sort column matching *device*."""
    return "cuda_time_total" if device.type == "cuda" else "cpu_time_total"


def main(steps: int = 10, trace_path: str = "./trace.json") -> None:
    """Profile *steps* training iterations and report the results.

    Args:
        steps: Number of forward/backward/optimizer iterations to profile.
        trace_path: Destination file for the exported Chrome trace.
    """
    device = pick_device()

    # Define the model and the optimizer.
    model = nn.Linear(100, 10).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    # Start the profiler.
    with profile(
        activities=profiler_activities(device),  # monitor GPU (if any) and CPU
        record_shapes=True,  # record tensor shapes
        profile_memory=True,  # analyse memory usage
        with_stack=True,  # record call stacks
    ) as prof:
        for _ in range(steps):
            # The input is created on the host and then moved, exactly as in
            # the original workload, so the profiled operations are unchanged.
            x = torch.randn(64, 100).to(device)
            y = model(x)
            loss = y.sum()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    # Output the profiling results.
    print(prof.key_averages().table(sort_by=sort_key_for(device), row_limit=10))
    prof.export_chrome_trace(trace_path)


if __name__ == "__main__":
    main()