# coding=utf-8
"""Smoke tests and micro-benchmarks for the torch_cuda_ext CUDA kernels.

Checks core.add_two_tensors against torch's elementwise add, then times
core.matmul against torch.matmul on 1024x1024 float32 inputs and reports
the speed ratio and the largest absolute deviation.

Requires a CUDA device and the torch_cuda_ext extension.
"""
import time

import torch

from torch_cuda_ext import core

# --- correctness check: elementwise add kernel ------------------------------
x = torch.randn(size=[100, 50], dtype=torch.float32, device="cuda")
y = torch.randn(size=[100, 50], dtype=torch.float32, device="cuda")
z = torch.empty_like(x)
print(z.size())
core.add_two_tensors(x, y, z)
compare_result = x + y
assert torch.allclose(z, compare_result), "result is not equal"
# print(compare_result, z)

# --- benchmark: custom matmul kernel vs torch.matmul ------------------------
x = torch.randn(size=(1024, 1024)).float().cuda()
y = torch.randn(size=(1024, 1024)).float().cuda()
z = torch.zeros(size=(1024, 1024)).float().cuda()

# CUDA kernel launches are asynchronous: without synchronizing before
# reading the clock and after the loop, we would only measure launch
# overhead, not kernel execution time.
torch.cuda.synchronize()
start_time = time.time()
for _ in range(1000):
    core.matmul(x, y, z)
torch.cuda.synchronize()
gpu_cost_time = time.time() - start_time
print("gpu cost time is :", gpu_cost_time / 1000)

torch.cuda.synchronize()
start_time = time.time()
for _ in range(1000):
    gpu_out = torch.matmul(x, y)
torch.cuda.synchronize()
torch_cost_time = time.time() - start_time
print("torch cost time is: ", torch_cost_time / 1000)
print("accelerate rate:", torch_cost_time / gpu_cost_time)

# Report the largest *absolute* deviation; the signed max would hide
# large negative errors.
diff = z - gpu_out
print("max diff is :", torch.max(torch.abs(diff)))

# NOTE(review): removed dead commented-out experiments that were here
# (matmul_sigmoid check and a CPU-side matmul benchmark). The original
# author noted (translated from Chinese) that matmul_sigmoid "still gives
# wrong results, which is strange — it looks no different from the
# reference code". TODO: re-verify the matmul_sigmoid kernel before
# re-enabling that check.
# Verify the shared-memory matmul kernel against torch.matmul by
# computing X @ X for a random 512x512 float32 CUDA matrix.
mat = torch.randn(size=(512, 512)).float().cuda()
kernel_out = torch.empty_like(mat)
core.matmul_shared(mat, mat, kernel_out)
reference = torch.matmul(mat, mat)
# On mismatch, dump the elementwise difference for debugging.
if not torch.allclose(kernel_out, reference):
    print("not equal")
    print(reference - kernel_out)