Merge branch 'main' of http://192.168.0.100:3000/squall/torch_ext
commit a1aa7fd0d6
29 csrc/attention.cu Normal file
@@ -0,0 +1,29 @@
#include "core.h"

// calculate the dot product of a query row and a key row.
template <typename scalar_t>
__device__ scalar_t vecsum(const scalar_t *q, const scalar_t *k, int dim)
{
    scalar_t acc = scalar_t(0);
    for (int i = 0; i < dim; ++i)
        acc += q[i] * k[i];
    return acc;
}

template <typename scalar_t>
__global__ void attention_kernel(const scalar_t *q,
                                 const scalar_t *k,
                                 const scalar_t *v,
                                 int head_num,
                                 int head_dim,
                                 int seq_len,
                                 int batch_size,
                                 int hidden_dim,
                                 scalar_t *output)
{
    // calculate the gemm.
    int tid = threadIdx.x;
    // calculate the offsets; the fixed factor of 1 marks a single query token per block.
    int q_offset = blockIdx.x * head_num * 1 * head_dim;
    int k_offset = blockIdx.x * head_num * seq_len * head_dim;
    int v_offset = blockIdx.x * head_num * seq_len * head_dim;
    // calculate the sum.
    // calculate the softmax.
    // calculate the weighted sum.
}
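The three trailing comments are the unimplemented steps of a single-query (decode) attention pass. A rough sketch of what they could expand to for one head, assuming seq_len <= 1024 and blockDim.x >= max(seq_len, head_dim) — layout and sizing here are assumptions, not the repository's implementation:

    __shared__ float scores[1024];
    if (tid < seq_len)
        scores[tid] = (float)vecsum(q + q_offset, k + k_offset + tid * head_dim, head_dim);
    __syncthreads();
    // softmax over the scores, serial on thread 0 for clarity rather than speed.
    if (tid == 0)
    {
        float m = scores[0];
        for (int i = 1; i < seq_len; ++i)
            m = fmaxf(m, scores[i]);
        float s = 0.0f;
        for (int i = 0; i < seq_len; ++i)
        {
            scores[i] = expf(scores[i] - m);
            s += scores[i];
        }
        for (int i = 0; i < seq_len; ++i)
            scores[i] /= s;
    }
    __syncthreads();
    // weighted sum of the value rows; one output element per thread.
    if (tid < head_dim)
    {
        float acc = 0.0f;
        for (int i = 0; i < seq_len; ++i)
            acc += scores[i] * (float)v[v_offset + i * head_dim + tid];
        output[blockIdx.x * head_dim + tid] = (scalar_t)acc;
    }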
@@ -1,6 +1,7 @@
#include "core.h"
#include <iostream>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
// #include <mma.h>
#include <cuda_runtime.h>

@@ -58,7 +59,8 @@ __global__ void matmul_sigmoid_cuda(const T *in1, const T *in2, T *output, int r
}

#define BASE_BLOCK 256
#define CALL_ADD_FUNCTION add_two_tensors_cuda<<<(input1.size(0) * input1.size(1) + BASE_BLOCK - 1) / BASE_BLOCK, BASE_BLOCK>>>(src, src1, dest, input1.size(0) * input1.size(1));
#define CALL_ADD_FUNCTION \
    add_two_tensors_cuda<<<(input1.size(0) * input1.size(1) + BASE_BLOCK - 1) / BASE_BLOCK, BASE_BLOCK>>>(src, src1, dest, input1.size(0) * input1.size(1));
void add_two_tensors(const torch::Tensor &input1, const torch::Tensor &input2, torch::Tensor &output)
{
    // cout << input1.dtype() << " the size 1 is : " << input1.size(0) << " size 2 is " << input1.size(1) << "output dim is :" << output.size(0) << output.size(1) << endl;
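CALL_ADD_FUNCTION uses the standard ceiling-division launch: (n + BASE_BLOCK - 1) / BASE_BLOCK blocks of BASE_BLOCK threads cover n elements, with the last block partially full. The kernel it launches is not part of this hunk; a minimal sketch consistent with the macro's call site (signature assumed, not the repository's definition):

    template <typename T>
    __global__ void add_two_tensors_cuda(const T *src, const T *src1, T *dest, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) // guard the partially full tail block
            dest[i] = src[i] + src1[i];
    }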
31 csrc/core.h
@@ -11,6 +11,36 @@
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
    AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

#define TYPING_DISPATCH(scalar_t, ...)        \
    switch (scalar_t)                         \
    {                                         \
    case at::ScalarType::Float:               \
    {                                         \
        using fi_type = float;                \
        __VA_ARGS__();                        \
        break;                                \
    }                                         \
    case at::ScalarType::BFloat16:            \
    {                                         \
        using fi_type = __nv_bfloat16;        \
        __VA_ARGS__();                        \
        break;                                \
    }                                         \
    case at::ScalarType::Half:                \
    {                                         \
        using fi_type = __half;               \
        __VA_ARGS__();                        \
        break;                                \
    }                                         \
    default:                                  \
        printf("do not support such type\n"); \
    }
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)            \
    AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)  \
    AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)   \
    AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)

#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
    AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

void add_two_tensors(const torch::Tensor &input1, const torch::Tensor &input2, torch::Tensor &output);

void rope_tensors(const torch::Tensor &input, torch::Tensor &output, int rope_index_start);

@@ -29,4 +59,5 @@ void md_mm(const torch::Tensor &src);
void block_sum(const torch::Tensor &src, torch::Tensor &dest);
void md_block_sum(const torch::Tensor &src, torch::Tensor &dest);
void softmax(const torch::Tensor &src, torch::Tensor &dest);
void rms_norm(torch::Tensor &states, float eps, float gamma);
#endif
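TYPING_DISPATCH pastes the trailing lambda into each case, so fi_type is already bound to the matching CUDA type when the lambda body is compiled, and the kernel launch is written only once. A usage sketch (my_kernel, grid, block, and n are placeholders, not repository names):

    TYPING_DISPATCH(input.scalar_type(), [&]
                    { my_kernel<fi_type><<<grid, block>>>(reinterpret_cast<fi_type *>(input.data_ptr()), n); });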
@@ -19,4 +19,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
    m.def("block_sum", &block_sum, "test block sum");
    m.def("md_block_sum", &md_block_sum, "multi dimension block sum");
    m.def("softmax", &softmax, "test softmax example");
    m.def("rms_norm", &rms_norm, "rms norm");
}
4 csrc/fp8_vec.cu Normal file
@@ -0,0 +1,4 @@
#include "core.h"
#include <cuda_fp8.h>

// short alias for the CUDA fp8 type, matching the FP16/BF16 aliases in type_utils.h.
#define fp8_e4m3 __nv_fp8_e4m3
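A round-trip through the e4m3 format using this alias might look like the sketch below (kernel name is mine; the conversions are the cuda_fp8.h float constructor and float conversion operator):

    __global__ void fp8_roundtrip(const float *in, float *out, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
        {
            fp8_e4m3 q = fp8_e4m3(in[i]); // quantize to e4m3 (lossy)
            out[i] = float(q);            // dequantize back to fp32
        }
    }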
82 csrc/layernorm.cu (Normal file; filename inferred from the setup.py hunk below)
@@ -0,0 +1,82 @@
#include "core.h"
#include <cub/cub.cuh>
#include <cub/util_device.cuh>

#include <cuda_fp16.h>
#include <cuda_fp8.h>
#include <cuda_bf16.h>
#include <torch/torch.h>
#include <torch/all.h>
using namespace std;

// fallback for trivially convertible pairs (float -> float is hit by the
// Float dispatch case); half/bf16 use the intrinsic specializations below.
template <typename src_type, typename dest_type>
__device__ dest_type fi_cast(src_type a)
{
    return static_cast<dest_type>(a);
}
template <>
__device__ float fi_cast<__nv_bfloat16, float>(__nv_bfloat16 a)
{
    return __bfloat162float(a);
}

template <>
__device__ float fi_cast<__half, float>(__half a)
{
    return __half2float(a);
}

template <>
__device__ __nv_bfloat16 fi_cast<float, __nv_bfloat16>(float a)
{
    return __float2bfloat16(a);
}

template <>
__device__ __half fi_cast<float, __half>(float a)
{
    return __float2half(a);
}

// rms_norm(x) = x / sqrt(mean(x^2) + eps) * gamma, applied to each row in place.
template <typename scalar_t, int BLOCK_SIZE = 1024>
__global__ void rms_norm_kernel(scalar_t *states, int hidden_dim, float eps, float gamma)
{
    int idx = threadIdx.x;
    int offset = blockIdx.x * hidden_dim;
    // each thread accumulates the squared sum of its strided slice of the row.
    float local_sum = 0.0f;
    for (int i = idx; i < hidden_dim; i += blockDim.x)
    {
        int local_offset = offset + i;
        float tmp = fi_cast<scalar_t, float>(states[local_offset]);
        local_sum += tmp * tmp;
    }
    typedef cub::BlockReduce<float, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // BlockReduce returns the total only on thread 0, so broadcast the
    // normalization factor through shared memory.
    __shared__ float inv_rms;
    float sum_res = BlockReduce(temp_storage).Sum(local_sum);
    if (idx == 0)
        inv_rms = rsqrtf(sum_res / hidden_dim + eps);
    __syncthreads();
    for (int i = idx; i < hidden_dim; i += blockDim.x)
    {
        int local_offset = offset + i;
        float tmp = fi_cast<scalar_t, float>(states[local_offset]);
        tmp = tmp * inv_rms * gamma;
        states[local_offset] = fi_cast<float, scalar_t>(tmp);
    }
}

void rms_norm(torch::Tensor &states, float eps, float gamma)
{
    int h = states.size(0);
    int hidden_dim = states.size(1);
    int block_size = 1024;
    dim3 grid(h);           // one block per row
    dim3 block(block_size); // matches the kernel's BLOCK_SIZE default
    TYPING_DISPATCH(states.scalar_type(), [&]
                    { rms_norm_kernel<fi_type><<<grid, block>>>(reinterpret_cast<fi_type *>(states.data_ptr()), hidden_dim, eps, gamma); });
}
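The same entry point driven from libtorch C++, mirroring test_layernorm.py below (a sketch, assuming the calling translation unit links against this file):

    torch::Tensor states = torch::randn({100, 1024}, torch::device(torch::kCUDA).dtype(torch::kHalf));
    rms_norm(states, 0.01f, 1.0f); // normalizes each 1024-wide row in place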
33 csrc/md.cu
@@ -69,6 +69,7 @@ __global__ void row_sum_kernel(const float *src, float *dest, int hidden_dim)
    if (tid == 0)
    {
        dest[blockIdx.x] = sum;
        printf("blockIdx.x: %d, blockIdx.y %d, blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);
    }
}

@@ -109,6 +110,7 @@ __global__ void md_row_sum_kernel(const float *src, float *dest, int stride_a, i
    if (tid == 0 && block_offset < all_len)
    {
        dest[block_offset] = sum;
        printf("blockIdx.x %d, blockIdx.y %d, blockIdx.z %d, blockDim.x %d\n", blockIdx.x, blockIdx.y, blockIdx.z, blockDim.x);
    }
}

@@ -243,3 +245,34 @@ void softmax(const torch::Tensor &src, torch::Tensor &dest)
        dest.data_ptr<scalar_t>(),
        hidden_dim); });
}

template <int head_num = 8>
__global__ void test_head_dim_kernel()
{
    int idx = threadIdx.x; // placeholder body; head_num is a compile-time constant here
}

#define LAUNCH(head_num) test_head_dim_kernel<head_num><<<grid, block>>>();

void test_head_dim(int head_num)
{
    dim3 grid(10);
    dim3 block(1024);
    switch (head_num)
    {
    case 1:
        LAUNCH(1);
        break;
    case 8:
        LAUNCH(8);
        break;
    case 16:
        LAUNCH(16);
        break;
    case 32:
        LAUNCH(32);
        break;
    case 48:
        LAUNCH(48);
        break;
    case 64:
        LAUNCH(64);
        break;
    default:
        printf("do not support head num\n");
    }
}
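The switch is the usual runtime-to-template bridge: head_num must be a compile-time constant to instantiate the kernel, so each case instantiates one specialization. Calling it is just:

    test_head_dim(8); // launches test_head_dim_kernel<8> on 10 blocks of 1024 threads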
35 csrc/random_env.cu Normal file
@@ -0,0 +1,35 @@
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <cub/cub.cuh>
#include <cub/util_device.cuh>

__global__ void initRandom(curandState *state, unsigned long seed)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, id, 0, &state[id]);
}

__global__ void random_generate(float *out, curandState *state)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x; // global thread id into the RNG state array
    curandState localState = state[id];
    __shared__ float shared_data[1024];
    int idx = threadIdx.x;
    typedef cub::BlockReduce<float, 1024> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __shared__ float block_sum;
    shared_data[idx] = 0.0f; // shared memory is uninitialized on entry
    for (int i = 0; i < 1024; i++)
    {
        shared_data[idx] += curand_uniform(&localState);
        // the reduction result is only valid on thread 0; broadcast it.
        float sum = BlockReduce(temp_storage).Sum(shared_data[idx]);
        if (idx == 0)
            block_sum = sum;
        __syncthreads();
        shared_data[idx] += shared_data[idx] / block_sum;
        __syncthreads(); // temp_storage is reused on the next iteration
    }
    out[idx] = shared_data[idx];
}

void random_invoke()
{
    curandState *devStates;
    float *out;
    int thread_num = 1024;
    cudaMalloc(&devStates, thread_num * sizeof(curandState)); // device storage for RNG states
    cudaMalloc(&out, thread_num * sizeof(float));             // kernels need device, not stack, memory
    initRandom<<<1, thread_num>>>(devStates, 1234);
    random_generate<<<1, thread_num>>>(out, devStates);
    cudaFree(out);
    cudaFree(devStates);
}
35 csrc/type_utils.h Normal file
@@ -0,0 +1,35 @@
#ifndef TYPE_UTILS_H
#define TYPE_UTILS_H
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#define FP16 __half
#define BF16 __nv_bfloat16
// fallback for trivially convertible pairs (e.g. float -> float);
// half/bf16 conversions use the intrinsic specializations below.
// specializations are inline so the header can be included in several
// translation units without duplicate definitions.
template <typename src_type, typename dest_type>
__device__ dest_type fi_cast(src_type a)
{
    return static_cast<dest_type>(a);
}
template <>
__device__ inline float fi_cast<BF16, float>(BF16 a)
{
    return __bfloat162float(a);
}

template <>
__device__ inline float fi_cast<FP16, float>(FP16 a)
{
    return __half2float(a);
}

template <>
__device__ inline BF16 fi_cast<float, BF16>(float a)
{
    return __float2bfloat16(a);
}

template <>
__device__ inline FP16 fi_cast<float, FP16>(float a)
{
    return __float2half(a);
}

#endif
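fi_cast spells out the usual mixed-precision pattern: widen to float, compute, narrow back. A sketch of a generic kernel built on it (kernel name and shapes are mine, not the repository's):

    template <typename scalar_t>
    __global__ void scale_kernel(scalar_t *data, float factor, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
        {
            float x = fi_cast<scalar_t, float>(data[i]);    // widen
            data[i] = fi_cast<float, scalar_t>(x * factor); // compute in fp32, narrow back
        }
    }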
28 fi/test_module.py Normal file
@@ -0,0 +1,28 @@
# coding=utf-8

import torch.nn as nn


class TestModule(nn.Module):
    def __init__(self, start_layer_index: int, end_layer_index: int, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = DecodeLayer()

    def forward(self, x):
        # DecodeLayer itself is not iterable; walk its ModuleList.
        for module in self.model.layers:
            x = module(x)
        return x


class DecodeLayer(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.layers = nn.ModuleList()
        for _ in range(10):
            self.layers.append(nn.Linear(10, 10))


if __name__ == "__main__":
    test_module = TestModule(0, 3)
    for name, _ in test_module.named_parameters():
        print(name)
3 setup.py
@@ -12,12 +12,13 @@ files = [
    "csrc/max.cu",
    "csrc/md.cu",
    "csrc/quantize.cu",
    "csrc/layernorm.cu",
]
extension = CUDAExtension(
    name="torch_cuda_ext.core",
    sources=files,
    extra_compile_args={"cxx": ["-g"], "nvcc": ["-O2"]},
    include_dirs=["/home/squall/program/cutlass/include"],
    include_dirs=["/home/squall/quant_data/program/cutlass/include"],
)

cuda_exts.append(extension)
13 test_layernorm.py Normal file
@@ -0,0 +1,13 @@
# coding=utf-8
import torch
import torch_cuda_ext.core as core
import torch.nn.functional as F

eps = float(0.01)
gamma = float(1)
states = torch.randn(size=(100, 1024)).half().cuda()
res_states = F.rms_norm(states, [1024], eps=eps)
print(res_states)

core.rms_norm(states, eps, gamma)
print(states)