From 920ebe0f8821d5e39e07759d4b48b752636e1623 Mon Sep 17 00:00:00 2001
From: long0x0
Date: Sat, 4 Jan 2025 13:47:42 +0800
Subject: [PATCH] Minor changes: stub out attention kernel, add test module,
 remove debug output.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 csrc/attention.cu | 30 ++++++++++++++++++++++++++++++
 csrc/core.cu      |  2 +-
 csrc/fp8_vec.cu   |  5 ++++-
 csrc/layernorm.cu |  1 -
 fi/test_module.py | 28 ++++++++++++++++++++++++++++
 5 files changed, 63 insertions(+), 3 deletions(-)
 create mode 100644 csrc/attention.cu
 create mode 100644 fi/test_module.py

diff --git a/csrc/attention.cu b/csrc/attention.cu
new file mode 100644
index 0000000..7e76de5
--- /dev/null
+++ b/csrc/attention.cu
@@ -0,0 +1,30 @@
+#include "core.h"
+
+// calculate the vec sum (dot product) of a matrix row and column.
+template <typename scalar_t>
+__device__ scalar_t vecsum(scalar_t *q, scalar_t *k)
+{
+    return scalar_t(0); // TODO: accumulate the row-column dot product.
+}
+
+template <typename scalar_t>
+__global__ void attention_kernel(const scalar_t *q,
+                                 const scalar_t *k,
+                                 const scalar_t *v,
+                                 int head_num,
+                                 int head_dim,
+                                 int seq_len,
+                                 int batch_size,
+                                 int hidden_dim,
+                                 scalar_t *output)
+{
+    // calculate the gemm.
+    int tid = threadIdx.x;
+    // calculate the offsets.
+    int q_offset = blockIdx.x * head_num * 1 * head_dim;
+    int k_offset = blockIdx.x * head_num * seq_len * head_dim;
+    int v_offset = blockIdx.x * head_num * seq_len * head_dim;
+    // calculate the sum.
+    // calculate the softmax.
+    // calculate the weighted sum.
+}
\ No newline at end of file
diff --git a/csrc/core.cu b/csrc/core.cu
index f05bf75..8cc4b6b 100644
--- a/csrc/core.cu
+++ b/csrc/core.cu
@@ -60,7 +60,7 @@ __global__ void matmul_sigmoid_cuda(const T *in1, const T *in2, T *output, int r
 #define BASE_BLOCK 256
 #define CALL_ADD_FUNCTION \
-add_two_tensors_cuda<<<(input1.size(0) * input1.size(1) + BASE_BLOCK - 1) / BASE_BLOCK, BASE_BLOCK>>>(src, src1, dest, input1.size(0) * input1.size(1));
+    add_two_tensors_cuda<<<(input1.size(0) * input1.size(1) + BASE_BLOCK - 1) / BASE_BLOCK, BASE_BLOCK>>>(src, src1, dest, input1.size(0) * input1.size(1));
 
 void add_two_tensors(const torch::Tensor &input1, const torch::Tensor &input2, torch::Tensor &output)
 {
     // cout << input1.dtype() << " the size 1 is : " << input1.size(0) << " size 2 is " << input1.size(1) << "output dim is :" << output.size(0) << output.size(1) << endl;
diff --git a/csrc/fp8_vec.cu b/csrc/fp8_vec.cu
index db7b3a4..7f67993 100644
--- a/csrc/fp8_vec.cu
+++ b/csrc/fp8_vec.cu
@@ -1 +1,4 @@
-#include <cuda_fp8.h>
\ No newline at end of file
+#include "core.h"
+#include <cuda_fp8.h>
+
+#define __nv_fp8_e4m3 fp8_e4m3
diff --git a/csrc/layernorm.cu b/csrc/layernorm.cu
index 9462069..c6b3791 100644
--- a/csrc/layernorm.cu
+++ b/csrc/layernorm.cu
@@ -77,7 +77,6 @@ void rms_norm(torch::Tensor &states, float eps, float gamma)
     int block_size = 1024;
     dim3 block(h);
     dim3 grid(block_size);
-    cout << states.scalar_type() << endl;
     TYPING_DISPATCH(states.scalar_type(), [&]
                     { rms_norm_kernel<<<grid, block>>>(reinterpret_cast<scalar_t *>(states.data_ptr()), hidden_dim, eps, gamma); });
 }
\ No newline at end of file
diff --git a/fi/test_module.py b/fi/test_module.py
new file mode 100644
index 0000000..2073d2b
--- /dev/null
+++ b/fi/test_module.py
@@ -0,0 +1,28 @@
+# coding=utf-8
+
+import torch.nn as nn
+
+
+class TestModule(nn.Module):
+    def __init__(self, start_layer_index: int, end_layer_index: int, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.model = DecodeLayer()
+
+    def forward(self, x):
+        for module in self.model.layers:
+            x = module(x)
+        return x
+
+
+class DecodeLayer(nn.Module):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.layers = nn.ModuleList()
+        for _ in range(10):
+            self.layers.append(nn.Linear(10, 10))
+
+
+if __name__ == "__main__":
+    test_module = TestModule(0, 3)
+    for x in test_module.named_parameters():
+        print(x[0])
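
Note (not part of the patch): for review context, a minimal sketch of how the attention_kernel stub in csrc/attention.cu could be filled in. It assumes a single-query decode step (matching the hard-coded 1 in q_offset), a [batch, head, seq, dim] layout for k and v, one thread block per (batch, head) pair, seq_len >= 1, and a float- or double-like scalar_t. attention_kernel_sketch and the launch shape at the bottom are illustrative names, not code from this repository.

#include <cuda_runtime.h>
#include <math.h>

// Sketch only. One thread block handles one (batch, head) pair; the single
// query vector attends over seq_len cached key/value vectors.
template <typename scalar_t>
__global__ void attention_kernel_sketch(const scalar_t *q, // [batch, head, 1, head_dim]
                                        const scalar_t *k, // [batch, head, seq_len, head_dim]
                                        const scalar_t *v, // [batch, head, seq_len, head_dim]
                                        int head_num,
                                        int head_dim,
                                        int seq_len,
                                        scalar_t *output)  // [batch, head, head_dim]
{
    extern __shared__ float scores[]; // seq_len floats, sized at launch.

    int batch = blockIdx.x / head_num;
    int head = blockIdx.x % head_num;
    const scalar_t *q_ptr = q + (batch * head_num + head) * head_dim;
    const scalar_t *k_ptr = k + (batch * head_num + head) * seq_len * head_dim;
    const scalar_t *v_ptr = v + (batch * head_num + head) * seq_len * head_dim;
    scalar_t *out_ptr = output + (batch * head_num + head) * head_dim;

    // 1. the sum: q . k_pos for every cached position (the vecsum of the
    //    stub), scaled by 1/sqrt(head_dim).
    float scale = rsqrtf((float)head_dim);
    for (int pos = threadIdx.x; pos < seq_len; pos += blockDim.x)
    {
        float dot = 0.f;
        for (int d = 0; d < head_dim; ++d)
            dot += (float)q_ptr[d] * (float)k_ptr[pos * head_dim + d];
        scores[pos] = dot * scale;
    }
    __syncthreads();

    // 2. the softmax: numerically stable (max-subtracted), serialized on
    //    thread 0 for clarity; a production kernel would use a block-wide
    //    reduction instead.
    if (threadIdx.x == 0)
    {
        float max_score = scores[0];
        for (int pos = 1; pos < seq_len; ++pos)
            max_score = fmaxf(max_score, scores[pos]);
        float sum = 0.f;
        for (int pos = 0; pos < seq_len; ++pos)
        {
            scores[pos] = expf(scores[pos] - max_score);
            sum += scores[pos];
        }
        for (int pos = 0; pos < seq_len; ++pos)
            scores[pos] /= sum;
    }
    __syncthreads();

    // 3. the weighted sum: each thread accumulates a slice of the output
    //    vector across all cached positions.
    for (int d = threadIdx.x; d < head_dim; d += blockDim.x)
    {
        float acc = 0.f;
        for (int pos = 0; pos < seq_len; ++pos)
            acc += scores[pos] * (float)v_ptr[pos * head_dim + d];
        out_ptr[d] = (scalar_t)acc;
    }
}

// Hypothetical launch, one block per (batch, head) pair:
//   attention_kernel_sketch<float>
//       <<<batch_size * head_num, 128, seq_len * sizeof(float)>>>(
//           q, k, v, head_num, head_dim, seq_len, output);

Keeping the score row in shared memory caps seq_len at the block's shared-memory budget (48 KB by default, roughly 12K positions at 4 bytes each); a streaming, flash-attention-style pass over the keys would lift that limit.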