From 920ebe0f8821d5e39e07759d4b48b752636e1623 Mon Sep 17 00:00:00 2001
From: long0x0
Date: Sat, 4 Jan 2025 13:47:42 +0800
Subject: [PATCH] Minor changes: stub out attention kernel, add test module,
 remove debug output.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 csrc/attention.cu | 30 ++++++++++++++++++++++++++++++
 csrc/core.cu      |  2 +-
 csrc/fp8_vec.cu   |  5 ++++-
 csrc/layernorm.cu |  1 -
 fi/test_module.py | 28 ++++++++++++++++++++++++++++
 5 files changed, 63 insertions(+), 3 deletions(-)
 create mode 100644 csrc/attention.cu
 create mode 100644 fi/test_module.py

diff --git a/csrc/attention.cu b/csrc/attention.cu
new file mode 100644
index 0000000..7e76de5
--- /dev/null
+++ b/csrc/attention.cu
@@ -0,0 +1,30 @@
+#include "core.h"
+
+// calculate the vec sum (dot product) of a matrix row and column.
+template <typename scalar_t>
+__device__ scalar_t vecsum(scalar_t *q, scalar_t *k)
+{
+    return scalar_t(0); // TODO: accumulate the row-column dot product.
+}
+
+template <typename scalar_t>
+__global__ void attention_kernel(const scalar_t *q,
+                                 const scalar_t *k,
+                                 const scalar_t *v,
+                                 int head_num,
+                                 int head_dim,
+                                 int seq_len,
+                                 int batch_size,
+                                 int hidden_dim,
+                                 scalar_t *output)
+{
+    // calculate the gemm.
+    int tid = threadIdx.x;
+    // calculate the offsets.
+    int q_offset = blockIdx.x * head_num * 1 * head_dim;
+    int k_offset = blockIdx.x * head_num * seq_len * head_dim;
+    int v_offset = blockIdx.x * head_num * seq_len * head_dim;
+    // calculate the sum.
+    // calculate the softmax.
+    // calculate the weighted sum.
+}
\ No newline at end of file
diff --git a/csrc/core.cu b/csrc/core.cu
index f05bf75..8cc4b6b 100644
--- a/csrc/core.cu
+++ b/csrc/core.cu
@@ -60,7 +60,7 @@ __global__ void matmul_sigmoid_cuda(const T *in1, const T *in2, T *output, int r
 #define BASE_BLOCK 256
 #define CALL_ADD_FUNCTION \
-add_two_tensors_cuda<<<(input1.size(0) * input1.size(1) + BASE_BLOCK - 1) / BASE_BLOCK, BASE_BLOCK>>>(src, src1, dest, input1.size(0) * input1.size(1));
+    add_two_tensors_cuda<<<(input1.size(0) * input1.size(1) + BASE_BLOCK - 1) / BASE_BLOCK, BASE_BLOCK>>>(src, src1, dest, input1.size(0) * input1.size(1));
 
 void add_two_tensors(const torch::Tensor &input1, const torch::Tensor &input2, torch::Tensor &output)
 {
     // cout << input1.dtype() << " the size 1 is : " << input1.size(0) << " size 2 is " << input1.size(1) << "output dim is :" << output.size(0) << output.size(1) << endl;
diff --git a/csrc/fp8_vec.cu b/csrc/fp8_vec.cu
index db7b3a4..7f67993 100644
--- a/csrc/fp8_vec.cu
+++ b/csrc/fp8_vec.cu
@@ -1 +1,4 @@
-#include <cuda_fp8.h>
\ No newline at end of file
+#include "core.h"
+#include <cuda_fp8.h>
+
+#define __nv_fp8_e4m3 fp8_e4m3
diff --git a/csrc/layernorm.cu b/csrc/layernorm.cu
index 9462069..c6b3791 100644
--- a/csrc/layernorm.cu
+++ b/csrc/layernorm.cu
@@ -77,7 +77,6 @@ void rms_norm(torch::Tensor &states, float eps, float gamma)
     int block_size = 1024;
     dim3 block(h);
     dim3 grid(block_size);
-    cout << states.scalar_type() << endl;
     TYPING_DISPATCH(states.scalar_type(), [&]
                     { rms_norm_kernel<<<grid, block>>>(reinterpret_cast<scalar_t *>(states.data_ptr()), hidden_dim, eps, gamma); });
 }
\ No newline at end of file
diff --git a/fi/test_module.py b/fi/test_module.py
new file mode 100644
index 0000000..2073d2b
--- /dev/null
+++ b/fi/test_module.py
@@ -0,0 +1,28 @@
+# coding=utf-8
+
+import torch.nn as nn
+
+
+class TestModule(nn.Module):
+    def __init__(self, start_layer_index: int, end_layer_index: int, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.model = DecodeLayer()
+
+    def forward(self, x):
+        for module in self.model.layers:
+            x = module(x)
+        return x
+
+
+class DecodeLayer(nn.Module):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.layers = nn.ModuleList()
+        for _ in range(10):
+            self.layers.append(nn.Linear(10, 10))
+
+
+if __name__ == "__main__":
+    test_module = TestModule(0, 3)
+    for x in test_module.named_parameters():
+        print(x[0])
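
Note (not part of the patch): for review context, a minimal sketch of how the attention_kernel stub in csrc/attention.cu could be filled in. It assumes a single-query decode step (matching the hard-coded 1 in q_offset), a [batch, head, seq, dim] layout for k and v, one thread block per (batch, head) pair, seq_len >= 1, and a float- or double-like scalar_t. attention_kernel_sketch and the launch shape at the bottom are illustrative names, not code from this repository.

#include <cuda_runtime.h>
#include <math.h>

// Sketch only. One thread block handles one (batch, head) pair; the single
// query vector attends over seq_len cached key/value vectors.
template <typename scalar_t>
__global__ void attention_kernel_sketch(const scalar_t *q, // [batch, head, 1, head_dim]
                                        const scalar_t *k, // [batch, head, seq_len, head_dim]
                                        const scalar_t *v, // [batch, head, seq_len, head_dim]
                                        int head_num,
                                        int head_dim,
                                        int seq_len,
                                        scalar_t *output)  // [batch, head, head_dim]
{
    extern __shared__ float scores[]; // seq_len floats, sized at launch.

    int batch = blockIdx.x / head_num;
    int head = blockIdx.x % head_num;
    const scalar_t *q_ptr = q + (batch * head_num + head) * head_dim;
    const scalar_t *k_ptr = k + (batch * head_num + head) * seq_len * head_dim;
    const scalar_t *v_ptr = v + (batch * head_num + head) * seq_len * head_dim;
    scalar_t *out_ptr = output + (batch * head_num + head) * head_dim;

    // 1. the sum: q . k_pos for every cached position (the vecsum of the
    //    stub), scaled by 1/sqrt(head_dim).
    float scale = rsqrtf((float)head_dim);
    for (int pos = threadIdx.x; pos < seq_len; pos += blockDim.x)
    {
        float dot = 0.f;
        for (int d = 0; d < head_dim; ++d)
            dot += (float)q_ptr[d] * (float)k_ptr[pos * head_dim + d];
        scores[pos] = dot * scale;
    }
    __syncthreads();

    // 2. the softmax: numerically stable (max-subtracted), serialized on
    //    thread 0 for clarity; a production kernel would use a block-wide
    //    reduction instead.
    if (threadIdx.x == 0)
    {
        float max_score = scores[0];
        for (int pos = 1; pos < seq_len; ++pos)
            max_score = fmaxf(max_score, scores[pos]);
        float sum = 0.f;
        for (int pos = 0; pos < seq_len; ++pos)
        {
            scores[pos] = expf(scores[pos] - max_score);
            sum += scores[pos];
        }
        for (int pos = 0; pos < seq_len; ++pos)
            scores[pos] /= sum;
    }
    __syncthreads();

    // 3. the weighted sum: each thread accumulates a slice of the output
    //    vector across all cached positions.
    for (int d = threadIdx.x; d < head_dim; d += blockDim.x)
    {
        float acc = 0.f;
        for (int pos = 0; pos < seq_len; ++pos)
            acc += scores[pos] * (float)v_ptr[pos * head_dim + d];
        out_ptr[d] = (scalar_t)acc;
    }
}

// Hypothetical launch, one block per (batch, head) pair:
//   attention_kernel_sketch<float>
//       <<<batch_size * head_num, 128, seq_len * sizeof(float)>>>(
//           q, k, v, head_num, head_dim, seq_len, output);

Keeping the score row in shared memory caps seq_len at the block's shared-memory budget (48 KB by default, roughly 12K positions at 4 bytes each); a streaming, flash-attention-style pass over the keys would lift that limit.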