84 lines
2.2 KiB
Python
84 lines
2.2 KiB
Python
# coding=utf-8
|
|
|
|
import torch
|
|
from torch_cuda_ext import core
|
|
|
|
# Smoke test: element-wise addition through the custom CUDA extension must
# match PyTorch's native `+` operator on the same data.
lhs = torch.randn(size=[100, 50], dtype=torch.float32, device="cuda")
rhs = torch.randn(size=[100, 50], dtype=torch.float32, device="cuda")
out = torch.empty_like(lhs)
print(out.size())

# The extension writes its result into the pre-allocated output tensor.
core.add_two_tensors(lhs, rhs, out)

expected = lhs + rhs
assert torch.allclose(out, expected), "result is not equal"
|
|
|
|
|
|
# Benchmark: custom CUDA matmul kernel vs. torch.matmul on 1024x1024 floats.
x = torch.randn(size=(1024, 1024)).float().cuda()
y = torch.randn(size=(1024, 1024)).float().cuda()
z = torch.zeros(size=(1024, 1024)).float().cuda()

import time

# BUGFIX: CUDA kernel launches are asynchronous. Without synchronizing before
# and after the timed region, time.time() measures only launch overhead, not
# actual kernel execution time, making both numbers (and the ratio) meaningless.
torch.cuda.synchronize()
start_time = time.time()
for _ in range(1000):
    core.matmul(x, y, z)
torch.cuda.synchronize()
end_time = time.time()
gpu_cost_time = end_time - start_time
print("gpu cost time is :", gpu_cost_time / 1000)

torch.cuda.synchronize()
start_time = time.time()
for _ in range(1000):
    gpu_out = torch.matmul(x, y)
torch.cuda.synchronize()
torch_cost_time = time.time() - start_time
print("torch cost time is: ", torch_cost_time / 1000)

print("accelerate rate:", torch_cost_time / gpu_cost_time)

# BUGFIX: report the worst *absolute* deviation — max of a signed diff would
# silently ignore the case where the custom kernel underestimates everywhere.
diff = z - gpu_out
print("max diff is :", torch.max(torch.abs(diff)))
|
|
|
|
|
|
# start_time = time.time()
|
|
# for _ in range(1000):
|
|
# core.matmul_sigmoid(x, y, z)
|
|
# own_cost_time = time.time() - start_time
|
|
# print("matmul_sigmoid cost time is :", own_cost_time)
|
|
|
|
|
|
# start_time = time.time()
|
|
# for _ in range(1000):
|
|
# torch_sigmoid_output = torch.sigmoid(torch.matmul(x, y))
|
|
# torch_cost_time = time.time() - start_time
|
|
# print("matmul sigmoid torch cost time is:", torch_cost_time)
|
|
# print(torch.max(z - torch_sigmoid_output))
|
|
# assert torch.allclose(z, torch_sigmoid_output), "not equal"
|
|
|
|
# x = x.cpu()
|
|
# y = y.cpu()
|
|
# start_time = time.time()
|
|
# for _ in range(1000):
|
|
# torch_out = torch.matmul(x, y)
|
|
# end_time = time.time()
|
|
# cpu_cost_time = end_time - start_time
|
|
# print("cpu cost time is :", end_time - start_time)
|
|
# print("accelerate rate:", cpu_cost_time / gpu_cost_time)
|
|
|
|
|
|
# print(z)
|
|
# print(torch_out)
|
|
|
|
# diff = torch_out - z
|
|
# print(torch.max(diff))
|
|
# assert torch.allclose(torch_out, z), "result is not equal"
|
|
|
|
# Still not right — strange, this looks no different from the company's code; why does it fail?
|
|
# Correctness check for the shared-memory matmul kernel: compute x @ x with
# the extension and compare against torch.matmul on the same input.
x = torch.randn(size=(512, 512)).float().cuda()
y = torch.empty_like(x)
core.matmul_shared(x, x, y)

real_y = torch.matmul(x, x)
# BUGFIX(formatting): the pasted source lost the indentation under this `if`,
# which is a syntax error as written; the diagnostic prints clearly belong to
# the mismatch branch — printing the full diff tensor unconditionally on every
# successful run makes no sense.
if not torch.allclose(y, real_y):
    print("not equal")
    print(real_y - y)
|