Commit everything.

long0x0 2024-12-14 13:34:30 +08:00
parent 4da12fd0c2
commit 0a6b5493fa
10 changed files with 28891 additions and 10 deletions

csrc/core.cu

@@ -1,6 +1,7 @@
#include "core.h"
#include <iostream>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
// #include <mma.h>
#include <cuda_runtime.h>
@@ -58,7 +59,8 @@ __global__ void matmul_sigmoid_cuda(const T *in1, const T *in2, T *output, int r
}
#define BASE_BLOCK 256
#define CALL_ADD_FUNCTION add_two_tensors_cuda<<<(input1.size(0) * input1.size(1) + BASE_BLOCK - 1) / BASE_BLOCK, BASE_BLOCK>>>(src, src1, dest, input1.size(0) * input1.size(1));
#define CALL_ADD_FUNCTION \
add_two_tensors_cuda<<<(input1.size(0) * input1.size(1) + BASE_BLOCK - 1) / BASE_BLOCK, BASE_BLOCK>>>(src, src1, dest, input1.size(0) * input1.size(1));
void add_two_tensors(const torch::Tensor &input1, const torch::Tensor &input2, torch::Tensor &output)
{
// cout << input1.dtype() << " the size 1 is : " << input1.size(0) << " size 2 is " << input1.size(1) << "output dim is :" << output.size(0) << output.size(1) << endl;

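Note: the grid size in the reworked CALL_ADD_FUNCTION is the usual ceiling division over a flat element count. A minimal self-contained sketch of that launch pattern (the kernel name and n are illustrative, not taken from this diff):

#include <cuda_runtime.h>
#define BASE_BLOCK 256
// Elementwise add with the guard that makes ceiling division safe:
// the last block may be only partially full.
template <typename T>
__global__ void add_sketch(const T *a, const T *b, T *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = a[i] + b[i];
}
// Host side, mirroring the macro: n = input1.size(0) * input1.size(1).
// int blocks = (n + BASE_BLOCK - 1) / BASE_BLOCK; // ceil(n / BASE_BLOCK)
// add_sketch<<<blocks, BASE_BLOCK>>>(src, src1, dest, n);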
csrc/core.h

@@ -2,6 +2,36 @@
#define CORE_H
#include <torch/extension.h>
// Each case needs its own break: without it, Float falls through and runs
// the lambda again for BFloat16 and Half.
#define TYPING_DISPATCH(scalar_t, ...)        \
    switch (scalar_t)                         \
    {                                         \
    case at::ScalarType::Float:               \
    {                                         \
        using fi_type = float;                \
        __VA_ARGS__();                        \
        break;                                \
    }                                         \
    case at::ScalarType::BFloat16:            \
    {                                         \
        using fi_type = __nv_bfloat16;        \
        __VA_ARGS__();                        \
        break;                                \
    }                                         \
    case at::ScalarType::Half:                \
    {                                         \
        using fi_type = __half;               \
        __VA_ARGS__();                        \
        break;                                \
    }                                         \
    default:                                  \
        printf("do not support such type\n"); \
    }
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
void add_two_tensors(const torch::Tensor &input1, const torch::Tensor &input2, torch::Tensor &output);
void rope_tensors(const torch::Tensor &input, torch::Tensor &output, int rope_index_start);
@@ -19,4 +49,5 @@ void test_cute_tensor();
void md_mm(const torch::Tensor &src);
void block_sum(const torch::Tensor &src, torch::Tensor &dest);
void md_block_sum(const torch::Tensor &src, torch::Tensor &dest);
void rms_norm(torch::Tensor &states, float eps, float gamma);
#endif
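For clarity, the intended call pattern for the new TYPING_DISPATCH macro — a minimal sketch where my_kernel and my_op are illustrative names (the real usage is in layernorm.cu below):

template <typename T>
__global__ void my_kernel(T *data, int n)
{
    // ... per-element work on data[0..n) ...
}
void my_op(torch::Tensor &t)
{
    dim3 grid(((int)t.numel() + 255) / 256), block(256);
    // fi_type is bound by the matching case inside TYPING_DISPATCH.
    TYPING_DISPATCH(t.scalar_type(), [&]
                    { my_kernel<fi_type><<<grid, block>>>(
                          reinterpret_cast<fi_type *>(t.data_ptr()), (int)t.numel()); });
}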

csrc/core_bind.cpp

@@ -18,4 +18,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
m.def("md_mm", &md_mm, "just a test of multi dimension mm");
m.def("block_sum", &block_sum, "test block sum");
m.def("md_block_sum", &md_block_sum, "multi dimension block sum");
m.def("rms_norm", &rms_norm, "rms noram");
}

csrc/layernorm.cu (new file, 83 lines)

@@ -0,0 +1,83 @@
#include "core.h"
#include <cub/cub.cuh>
#include <cub/util_device.cuh>
#include <cuda_fp16.h>
#include <cuda_fp8.h>
#include <cuda_bf16.h>
#include <torch/torch.h>
#include <torch/all.h>
using namespace std;
template <typename src_type, typename dest_type>
__device__ dest_type fi_cast(src_type a)
{
    // Fallback covers the identity case (float -> float); the half/bf16
    // paths go through the explicit specializations below.
    return static_cast<dest_type>(a);
}
template <>
__device__ float fi_cast<__nv_bfloat16, float>(__nv_bfloat16 a)
{
return __bfloat162float(a);
}
template <>
__device__ float fi_cast<__half, float>(__half a)
{
return __half2float(a);
}
template <>
__device__ __nv_bfloat16 fi_cast<float, __nv_bfloat16>(float a)
{
return __float2bfloat16(a);
}
template <>
__device__ __half fi_cast<float, __half>(float a)
{
return __float2half(a);
}
template <typename scalar_t, int BLOCK_SIZE = 1024>
__global__ void rms_norm_kernel(scalar_t *states, int hidden_dim, float eps, float gamma)
{
    int idx = threadIdx.x;
    int offset = blockIdx.x * hidden_dim; // one block per row
    float local_sum = 0.0f;
    for (int i = idx; i < hidden_dim; i += blockDim.x)
    {
        float tmp = fi_cast<scalar_t, float>(states[offset + i]);
        local_sum += tmp * tmp;
    }
    typedef cub::BlockReduce<float, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    float sum_res = BlockReduce(temp_storage).Sum(local_sum);
    // The reduction result is only valid in thread 0, so broadcast the
    // reciprocal RMS through shared memory. rms = sqrt(mean(x^2) + eps),
    // which is what F.rms_norm computes.
    __shared__ float inv_rms;
    if (idx == 0)
        inv_rms = rsqrtf(sum_res / hidden_dim + eps);
    __syncthreads();
    for (int i = idx; i < hidden_dim; i += blockDim.x)
    {
        float tmp = fi_cast<scalar_t, float>(states[offset + i]);
        states[offset + i] = fi_cast<float, scalar_t>(tmp * inv_rms * gamma);
    }
}
void rms_norm(torch::Tensor &states, float eps, float gamma)
{
    int h = states.size(0); // rows: one block each
    int hidden_dim = states.size(1);
    dim3 grid(h);
    dim3 block(1024); // must match the kernel's BLOCK_SIZE default
    // cout << states.scalar_type() << endl;
    TYPING_DISPATCH(states.scalar_type(), [&]
                    { rms_norm_kernel<fi_type><<<grid, block>>>(reinterpret_cast<fi_type *>(states.data_ptr()), hidden_dim, eps, gamma); });
}
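For a quick host-side sanity check, a minimal reference of what the kernel should compute, assuming the standard RMSNorm definition matched by F.rms_norm in the test below (plain C++; rms_norm_ref is an illustrative name):

#include <cmath>
#include <vector>
// Reference: y[i] = x[i] / sqrt(mean(x^2) + eps) * gamma, one row at a time.
std::vector<float> rms_norm_ref(const std::vector<float> &x, float eps, float gamma)
{
    float sum_sq = 0.0f;
    for (float v : x)
        sum_sq += v * v;
    float inv_rms = 1.0f / std::sqrt(sum_sq / x.size() + eps);
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        y[i] = x[i] * inv_rms * gamma;
    return y;
}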

csrc/max.cu

@@ -15,13 +15,6 @@
#include "core.h"
using namespace cute;
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
template <int BLOCK_SIZE = 1024, typename scalar_t>
__global__ void reducemax_kernel(const scalar_t *src, scalar_t *dest, int len)

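These macros were not deleted but moved into core.h (see above), and max.cu includes core.h, so the definitions still reach this file. For reference, a sketch of how VLLM_DISPATCH_FLOATING_TYPES drives a kernel like reducemax_kernel — the wrapper name and launch geometry are illustrative:

void reduce_max(const torch::Tensor &src, torch::Tensor &dest)
{
    // AT_DISPATCH_CASE binds the scalar_t alias inside the lambda.
    VLLM_DISPATCH_FLOATING_TYPES(src.scalar_type(), "reduce_max", [&]
                                 { reducemax_kernel<1024, scalar_t><<<1, 1024>>>(
                                       src.data_ptr<scalar_t>(), dest.data_ptr<scalar_t>(), (int)src.numel()); });
}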
csrc/md.cu

@@ -69,6 +69,7 @@ __global__ void row_sum_kernel(const float *src, float *dest, int hidden_dim)
if (tid == 0)
{
dest[blockIdx.x] = sum;
printf("blockidx.x: %d, blockIdx.y %d, blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);
}
}
@@ -109,6 +110,7 @@ __global__ void md_row_sum_kernel(const float *src, float *dest, int stride_a, i
if (tid == 0 && block_offset < all_len)
{
dest[block_offset] = sum;
printf("blockIdx.x %d, blockIdx.y %d, blockIdx.z %d, blockDim.x %d\n", blockIdx.x, blockIdx.y, blockIdx.z, blockDim.x);
}
}
@@ -125,3 +127,34 @@ void md_block_sum(const torch::Tensor &src, torch::Tensor &dest)
src.size(1),
src.size(2));
}
template <int head_num = 8>
__global__ void test_head_dim_kernel()
{
    int idx = threadIdx.x; // placeholder body; head_num is compile-time here
    (void)idx;
}
#define LAUNCH(head_num) test_head_dim_kernel<head_num><<<grid, block>>>();
void test_head_dim(int head_num)
{
    dim3 grid(10);
    dim3 block(1024);
    // Map the runtime head_num onto a template instantiation. Each case
    // needs a break; otherwise every kernel below it launches as well.
    switch (head_num)
    {
    case 1:
        LAUNCH(1);
        break;
    case 8:
        LAUNCH(8);
        break;
    case 16:
        LAUNCH(16);
        break;
    case 32:
        LAUNCH(32);
        break;
    case 48:
        LAUNCH(48);
        break;
    case 64:
        LAUNCH(64);
        break;
    default:
        printf("do not support head num\n");
    }
}
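The switch/LAUNCH pattern maps a runtime head_num onto a compile-time template argument. A macro-free sketch of the same dispatch, in case the list of supported sizes grows (dispatch_head_num is an illustrative name; the candidate list is taken from the switch above):

#include <initializer_list>
#include <type_traits>
// Calls f(std::integral_constant<int, H>{}) for the matching H, if any.
template <int... Hs, typename F>
bool dispatch_head_num(int head_num, F &&f)
{
    bool matched = false;
    (void)std::initializer_list<int>{
        (head_num == Hs ? (f(std::integral_constant<int, Hs>{}), matched = true, 0) : 0)...};
    return matched;
}
// Usage:
// dispatch_head_num<1, 8, 16, 32, 48, 64>(head_num, [&](auto h)
//     { test_head_dim_kernel<decltype(h)::value><<<grid, block>>>(); });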

csrc/random_env.cu (new file, 35 lines)

@@ -0,0 +1,35 @@
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <cub/cub.cuh>
#include <cub/util_device.cuh>
__global__ void initRandom(curandState *state, unsigned long seed)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, id, 0, &state[id]);
}
__global__ void random_generate(float *out, curandState *state)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x; // was missing: id was never declared
    curandState localState = state[id];
    __shared__ float shared_data[1024];
    int idx = threadIdx.x;
    typedef cub::BlockReduce<float, 1024> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    shared_data[idx] = 0.0f; // shared memory is not zero-initialized
    for (int i = 0; i < 1024; i++)
        shared_data[idx] += curand_uniform(&localState);
    // Reduce once, after accumulation; the sum is only valid in thread 0,
    // so broadcast it before normalizing each thread's total.
    float sum = BlockReduce(temp_storage).Sum(shared_data[idx]);
    __shared__ float total;
    if (idx == 0)
        total = sum;
    __syncthreads();
    out[idx] = shared_data[idx] / total;
    state[id] = localState; // persist the RNG state for later launches
}
void random_invoke()
{
    const int thread_num = 1024;
    curandState *devStates; // device allocations were missing: the original
    float *devOut;          // passed an unallocated pointer and a host array
    cudaMalloc(&devStates, thread_num * sizeof(curandState));
    cudaMalloc(&devOut, thread_num * sizeof(float));
    initRandom<<<1, thread_num>>>(devStates, 1234);
    random_generate<<<1, thread_num>>>(devOut, devStates);
    float out[1024];
    cudaMemcpy(out, devOut, thread_num * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(devOut);
    cudaFree(devStates);
}
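As a usage reference, a smaller sketch of the cuRAND-plus-cub::BlockReduce combination this file exercises — one sample per thread, one block of 1024 threads; uniform_block_mean and mean_out are illustrative names:

__global__ void uniform_block_mean(float *mean_out, curandState *state)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    curandState local = state[id];
    float x = curand_uniform(&local); // uniform sample in (0, 1]
    typedef cub::BlockReduce<float, 1024> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp;
    float sum = BlockReduce(temp).Sum(x); // result valid in thread 0 only
    if (threadIdx.x == 0)
        *mean_out = sum / blockDim.x; // should hover around 0.5
    state[id] = local;
}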

setup.py

@@ -11,12 +11,13 @@ files = [
"csrc/core_bind.cpp",
"csrc/max.cu",
"csrc/md.cu",
"csrc/layernorm.cu",
]
extension = CUDAExtension(
name="torch_cuda_ext.core",
sources=files,
extra_compile_args={"cxx": ["-g"], "nvcc": ["-O2"]},
include_dirs=["/home/squall/program/cutlass/include"],
include_dirs=["/home/squall/quant_data/program/cutlass/include"],
)
cuda_exts.append(extension)

test_layernorm.py (new file, 13 lines)

@@ -0,0 +1,13 @@
# coding=utf-8
import torch
import torch_cuda_ext.core as core
import torch.nn.functional as F
eps = 0.01
gamma = 1.0
states = torch.randn(size=(100, 1024)).half().cuda()
res_states = F.rms_norm(states, [1024], eps=eps)  # out-of-place reference
print(res_states)
core.rms_norm(states, eps, gamma)  # in-place: overwrites states
print(states)
print(torch.allclose(res_states, states, atol=1e-2, rtol=1e-2))  # fp16 tolerances

tt (new file, 28689 lines)

File diff suppressed because it is too large.