Commit everything.

long0x0 2024-12-14 13:34:30 +08:00
parent 4da12fd0c2
commit 0a6b5493fa
10 changed files with 28891 additions and 10 deletions

csrc/core.cu

@@ -1,6 +1,7 @@
#include "core.h"
#include <iostream>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
// #include <mma.h>
#include <cuda_runtime.h>
@@ -58,7 +59,8 @@ __global__ void matmul_sigmoid_cuda(const T *in1, const T *in2, T *output, int r
}
#define BASE_BLOCK 256
#define CALL_ADD_FUNCTION add_two_tensors_cuda<<<(input1.size(0) * input1.size(1) + BASE_BLOCK - 1) / BASE_BLOCK, BASE_BLOCK>>>(src, src1, dest, input1.size(0) * input1.size(1));
#define CALL_ADD_FUNCTION \
add_two_tensors_cuda<<<(input1.size(0) * input1.size(1) + BASE_BLOCK - 1) / BASE_BLOCK, BASE_BLOCK>>>(src, src1, dest, input1.size(0) * input1.size(1));
void add_two_tensors(const torch::Tensor &input1, const torch::Tensor &input2, torch::Tensor &output)
{
// cout << input1.dtype() << " the size 1 is : " << input1.size(0) << " size 2 is " << input1.size(1) << "output dim is :" << output.size(0) << output.size(1) << endl;

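Note: the grid size in the reworked CALL_ADD_FUNCTION is the usual ceiling division over a flat element count. A minimal self-contained sketch of that launch pattern (the kernel name and n are illustrative, not taken from this diff):

#include <cuda_runtime.h>
#define BASE_BLOCK 256
// Elementwise add with the guard that makes ceiling division safe:
// the last block may be only partially full.
template <typename T>
__global__ void add_sketch(const T *a, const T *b, T *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = a[i] + b[i];
}
// Host side, mirroring the macro: n = input1.size(0) * input1.size(1).
// int blocks = (n + BASE_BLOCK - 1) / BASE_BLOCK; // ceil(n / BASE_BLOCK)
// add_sketch<<<blocks, BASE_BLOCK>>>(src, src1, dest, n);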
csrc/core.h

@@ -2,6 +2,36 @@
#define CORE_H
#include <torch/extension.h>
// Each case needs its own break: without it, Float falls through and runs
// the lambda again for BFloat16 and Half.
#define TYPING_DISPATCH(scalar_t, ...)        \
    switch (scalar_t)                         \
    {                                         \
    case at::ScalarType::Float:               \
    {                                         \
        using fi_type = float;                \
        __VA_ARGS__();                        \
        break;                                \
    }                                         \
    case at::ScalarType::BFloat16:            \
    {                                         \
        using fi_type = __nv_bfloat16;        \
        __VA_ARGS__();                        \
        break;                                \
    }                                         \
    case at::ScalarType::Half:                \
    {                                         \
        using fi_type = __half;               \
        __VA_ARGS__();                        \
        break;                                \
    }                                         \
    default:                                  \
        printf("do not support such type\n"); \
    }
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
void add_two_tensors(const torch::Tensor &input1, const torch::Tensor &input2, torch::Tensor &output);
void rope_tensors(const torch::Tensor &input, torch::Tensor &output, int rope_index_start);
@@ -19,4 +49,5 @@ void test_cute_tensor();
void md_mm(const torch::Tensor &src);
void block_sum(const torch::Tensor &src, torch::Tensor &dest);
void md_block_sum(const torch::Tensor &src, torch::Tensor &dest);
void rms_norm(torch::Tensor &states, float eps, float gamma);
#endif
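For clarity, the intended call pattern for the new TYPING_DISPATCH macro — a minimal sketch where my_kernel and my_op are illustrative names (the real usage is in layernorm.cu below):

template <typename T>
__global__ void my_kernel(T *data, int n)
{
    // ... per-element work on data[0..n) ...
}
void my_op(torch::Tensor &t)
{
    dim3 grid(((int)t.numel() + 255) / 256), block(256);
    // fi_type is bound by the matching case inside TYPING_DISPATCH.
    TYPING_DISPATCH(t.scalar_type(), [&]
                    { my_kernel<fi_type><<<grid, block>>>(
                          reinterpret_cast<fi_type *>(t.data_ptr()), (int)t.numel()); });
}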

csrc/core_bind.cpp

@@ -18,4 +18,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
m.def("md_mm", &md_mm, "just a test of multi dimension mm");
m.def("block_sum", &block_sum, "test block sum");
m.def("md_block_sum", &md_block_sum, "multi dimension block sum");
m.def("rms_norm", &rms_norm, "rms noram");
}

csrc/layernorm.cu (new file, 83 lines)

@@ -0,0 +1,83 @@
#include "core.h"
#include <cub/cub.cuh>
#include <cub/util_device.cuh>
#include <cuda_fp16.h>
#include <cuda_fp8.h>
#include <cuda_bf16.h>
#include <torch/torch.h>
#include <torch/all.h>
using namespace std;
template <typename src_type, typename dest_type>
__device__ dest_type fi_cast(src_type a)
{
    // Fallback covers the identity case (float -> float); the half/bf16
    // paths go through the explicit specializations below.
    return static_cast<dest_type>(a);
}
template <>
__device__ float fi_cast<__nv_bfloat16, float>(__nv_bfloat16 a)
{
return __bfloat162float(a);
}
template <>
__device__ float fi_cast<__half, float>(__half a)
{
return __half2float(a);
}
template <>
__device__ __nv_bfloat16 fi_cast<float, __nv_bfloat16>(float a)
{
return __float2bfloat16(a);
}
template <>
__device__ __half fi_cast<float, __half>(float a)
{
return __float2half(a);
}
template <typename scalar_t, int BLOCK_SIZE = 1024>
__global__ void rms_norm_kernel(scalar_t *states, int hidden_dim, float eps, float gamma)
{
    int idx = threadIdx.x;
    int offset = blockIdx.x * hidden_dim; // one block per row
    float local_sum = 0.0f;
    for (int i = idx; i < hidden_dim; i += blockDim.x)
    {
        float tmp = fi_cast<scalar_t, float>(states[offset + i]);
        local_sum += tmp * tmp;
    }
    typedef cub::BlockReduce<float, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    float sum_res = BlockReduce(temp_storage).Sum(local_sum);
    // The reduction result is only valid in thread 0, so broadcast the
    // reciprocal RMS through shared memory. rms = sqrt(mean(x^2) + eps),
    // which is what F.rms_norm computes.
    __shared__ float inv_rms;
    if (idx == 0)
        inv_rms = rsqrtf(sum_res / hidden_dim + eps);
    __syncthreads();
    for (int i = idx; i < hidden_dim; i += blockDim.x)
    {
        float tmp = fi_cast<scalar_t, float>(states[offset + i]);
        states[offset + i] = fi_cast<float, scalar_t>(tmp * inv_rms * gamma);
    }
}
void rms_norm(torch::Tensor &states, float eps, float gamma)
{
    int h = states.size(0); // rows: one block each
    int hidden_dim = states.size(1);
    dim3 grid(h);
    dim3 block(1024); // must match the kernel's BLOCK_SIZE default
    // cout << states.scalar_type() << endl;
    TYPING_DISPATCH(states.scalar_type(), [&]
                    { rms_norm_kernel<fi_type><<<grid, block>>>(reinterpret_cast<fi_type *>(states.data_ptr()), hidden_dim, eps, gamma); });
}
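For a quick host-side sanity check, a minimal reference of what the kernel should compute, assuming the standard RMSNorm definition matched by F.rms_norm in the test below (plain C++; rms_norm_ref is an illustrative name):

#include <cmath>
#include <vector>
// Reference: y[i] = x[i] / sqrt(mean(x^2) + eps) * gamma, one row at a time.
std::vector<float> rms_norm_ref(const std::vector<float> &x, float eps, float gamma)
{
    float sum_sq = 0.0f;
    for (float v : x)
        sum_sq += v * v;
    float inv_rms = 1.0f / std::sqrt(sum_sq / x.size() + eps);
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        y[i] = x[i] * inv_rms * gamma;
    return y;
}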

csrc/max.cu

@@ -15,13 +15,6 @@
#include "core.h"
using namespace cute;
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
template <int BLOCK_SIZE = 1024, typename scalar_t>
__global__ void reducemax_kernel(const scalar_t *src, scalar_t *dest, int len)

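These macros were not deleted but moved into core.h (see above), and max.cu includes core.h, so the definitions still reach this file. For reference, a sketch of how VLLM_DISPATCH_FLOATING_TYPES drives a kernel like reducemax_kernel — the wrapper name and launch geometry are illustrative:

void reduce_max(const torch::Tensor &src, torch::Tensor &dest)
{
    // AT_DISPATCH_CASE binds the scalar_t alias inside the lambda.
    VLLM_DISPATCH_FLOATING_TYPES(src.scalar_type(), "reduce_max", [&]
                                 { reducemax_kernel<1024, scalar_t><<<1, 1024>>>(
                                       src.data_ptr<scalar_t>(), dest.data_ptr<scalar_t>(), (int)src.numel()); });
}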
csrc/md.cu

@@ -69,6 +69,7 @@ __global__ void row_sum_kernel(const float *src, float *dest, int hidden_dim)
if (tid == 0)
{
dest[blockIdx.x] = sum;
printf("blockidx.x: %d, blockIdx.y %d, blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);
}
}
@@ -109,6 +110,7 @@ __global__ void md_row_sum_kernel(const float *src, float *dest, int stride_a, i
if (tid == 0 && block_offset < all_len)
{
dest[block_offset] = sum;
printf("blockIdx.x %d, blockIdx.y %d, blockIdx.z %d, blockDim.x %d\n", blockIdx.x, blockIdx.y, blockIdx.z, blockDim.x);
}
}
@@ -125,3 +127,34 @@ void md_block_sum(const torch::Tensor &src, torch::Tensor &dest)
src.size(1),
src.size(2));
}
template <int head_num = 8>
__global__ void test_head_dim_kernel()
{
    int idx = threadIdx.x; // placeholder body; head_num is compile-time here
    (void)idx;
}
#define LAUNCH(head_num) test_head_dim_kernel<head_num><<<grid, block>>>();
void test_head_dim(int head_num)
{
    dim3 grid(10);
    dim3 block(1024);
    // Map the runtime head_num onto a template instantiation. Each case
    // needs a break; otherwise every kernel below it launches as well.
    switch (head_num)
    {
    case 1:
        LAUNCH(1);
        break;
    case 8:
        LAUNCH(8);
        break;
    case 16:
        LAUNCH(16);
        break;
    case 32:
        LAUNCH(32);
        break;
    case 48:
        LAUNCH(48);
        break;
    case 64:
        LAUNCH(64);
        break;
    default:
        printf("do not support head num\n");
    }
}
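The switch/LAUNCH pattern maps a runtime head_num onto a compile-time template argument. A macro-free sketch of the same dispatch, in case the list of supported sizes grows (dispatch_head_num is an illustrative name; the candidate list is taken from the switch above):

#include <initializer_list>
#include <type_traits>
// Calls f(std::integral_constant<int, H>{}) for the matching H, if any.
template <int... Hs, typename F>
bool dispatch_head_num(int head_num, F &&f)
{
    bool matched = false;
    (void)std::initializer_list<int>{
        (head_num == Hs ? (f(std::integral_constant<int, Hs>{}), matched = true, 0) : 0)...};
    return matched;
}
// Usage:
// dispatch_head_num<1, 8, 16, 32, 48, 64>(head_num, [&](auto h)
//     { test_head_dim_kernel<decltype(h)::value><<<grid, block>>>(); });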

csrc/random_env.cu (new file, 35 lines)

@@ -0,0 +1,35 @@
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <cub/cub.cuh>
#include <cub/util_device.cuh>
__global__ void initRandom(curandState *state, unsigned long seed)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, id, 0, &state[id]);
}
__global__ void random_generate(float *out, curandState *state)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x; // was missing: id was never declared
    curandState localState = state[id];
    __shared__ float shared_data[1024];
    int idx = threadIdx.x;
    typedef cub::BlockReduce<float, 1024> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    shared_data[idx] = 0.0f; // shared memory is not zero-initialized
    for (int i = 0; i < 1024; i++)
        shared_data[idx] += curand_uniform(&localState);
    // Reduce once, after accumulation; the sum is only valid in thread 0,
    // so broadcast it before normalizing each thread's total.
    float sum = BlockReduce(temp_storage).Sum(shared_data[idx]);
    __shared__ float total;
    if (idx == 0)
        total = sum;
    __syncthreads();
    out[idx] = shared_data[idx] / total;
    state[id] = localState; // persist the RNG state for later launches
}
void random_invoke()
{
    const int thread_num = 1024;
    curandState *devStates; // device allocations were missing: the original
    float *devOut;          // passed an unallocated pointer and a host array
    cudaMalloc(&devStates, thread_num * sizeof(curandState));
    cudaMalloc(&devOut, thread_num * sizeof(float));
    initRandom<<<1, thread_num>>>(devStates, 1234);
    random_generate<<<1, thread_num>>>(devOut, devStates);
    float out[1024];
    cudaMemcpy(out, devOut, thread_num * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(devOut);
    cudaFree(devStates);
}
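As a usage reference, a smaller sketch of the cuRAND-plus-cub::BlockReduce combination this file exercises — one sample per thread, one block of 1024 threads; uniform_block_mean and mean_out are illustrative names:

__global__ void uniform_block_mean(float *mean_out, curandState *state)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    curandState local = state[id];
    float x = curand_uniform(&local); // uniform sample in (0, 1]
    typedef cub::BlockReduce<float, 1024> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp;
    float sum = BlockReduce(temp).Sum(x); // result valid in thread 0 only
    if (threadIdx.x == 0)
        *mean_out = sum / blockDim.x; // should hover around 0.5
    state[id] = local;
}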

setup.py

@@ -11,12 +11,13 @@ files = [
"csrc/core_bind.cpp",
"csrc/max.cu",
"csrc/md.cu",
"csrc/layernorm.cu",
]
extension = CUDAExtension(
name="torch_cuda_ext.core",
sources=files,
extra_compile_args={"cxx": ["-g"], "nvcc": ["-O2"]},
include_dirs=["/home/squall/program/cutlass/include"],
include_dirs=["/home/squall/quant_data/program/cutlass/include"],
)
cuda_exts.append(extension)

test_layernorm.py (new file, 13 lines)

@@ -0,0 +1,13 @@
# coding=utf-8
import torch
import torch_cuda_ext.core as core
import torch.nn.functional as F
eps = 0.01
gamma = 1.0
states = torch.randn(size=(100, 1024)).half().cuda()
res_states = F.rms_norm(states, [1024], eps=eps)  # out-of-place reference
print(res_states)
core.rms_norm(states, eps, gamma)  # in-place: overwrites states
print(states)
print(torch.allclose(res_states, states, atol=1e-2, rtol=1e-2))  # fp16 tolerances

tt (new file, 28689 lines)

File diff suppressed because it is too large.