#include <cstdio>
#include <cmath>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cub/block/block_reduce.cuh>

#include <cute/tensor.hpp>
#include <cute/atom/copy_atom.hpp>

#include <torch/extension.h>
#include <ATen/Dispatch.h>

#include "core.h"

using namespace cute;

#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)  \
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)

#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

// NOTE: despite the name, this kernel computes a per-block *sum*; dest must
// hold one element per block.
template <int BLOCK_SIZE, typename scalar_t>
__global__ void reducemax_kernel(const scalar_t *src, scalar_t *dest, int len) {
  int tid = threadIdx.x;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  // Grid-stride loop: each thread accumulates the elements it owns. Threads
  // that never enter the loop keep local_sum == 0, so no bounds guard is
  // needed before the block reduction. (The original indexed
  // src[blockIdx.x * blockDim.x + i] with i < len, which reads out of bounds
  // for every block after the first and double-counts elements.)
  float local_sum = 0.0f;
  for (int i = idx; i < len; i += gridDim.x * blockDim.x) {
    local_sum += (float)src[i];
  }

  // Reduce each thread's partial sum directly; the shared staging buffer in
  // the original was redundant, since BlockReduce takes a register value.
  typedef cub::BlockReduce<float, BLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  float sum = BlockReduce(temp_storage).Sum(local_sum);
  if (tid == 0) {
    dest[blockIdx.x] = (scalar_t)sum;
  }
}

// Stub for a fused RoPE kernel; only the index bookkeeping exists so far.
// (The original swapped the warp/lane arithmetic: lane is tid % 32, warp is
// tid / 32.)
__global__ void fuse_rope_kernel() {
  int lane_idx = threadIdx.x % 32;  // lane within the warp
  int warp_idx = threadIdx.x / 32;  // warp within the block
  int head_idx = blockIdx.x;
  int batch_idx = blockIdx.y;
  (void)lane_idx; (void)warp_idx; (void)head_idx; (void)batch_idx;
}

void reducemax(const torch::Tensor &src, torch::Tensor &dest) {
  int len = src.size(0);
  constexpr int block_size = 1024;  // must match the kernel's template argument
  dim3 grid((len + block_size - 1) / block_size);
  dim3 block(block_size);
  VLLM_DISPATCH_FLOATING_TYPES(src.scalar_type(), "reducemax", [&] {
    reducemax_kernel<block_size, scalar_t><<<grid, block>>>(
        src.data_ptr<scalar_t>(), dest.data_ptr<scalar_t>(), len);
  });
}

__global__ void test_cute_tensor_kernel() {
  // Register-memory tensors. The element types below are reconstructed
  // (float/bool); the original template arguments were lost.
  Tensor rmem_4x8_col = make_tensor<float>(Shape<_4, _8>{});
  Tensor ind_tensor = make_identity_tensor(make_shape(16, 16));
  Tensor bool_tensor = make_tensor<bool>(shape(rmem_4x8_col));
  Tensor rmem_4x8_pad = make_tensor<float>(Shape<_4, _8>{}, Stride<_32, _2>{});

  auto smem_layout = make_layout(make_shape(Int<4>{}, Int<8>{}));
  __shared__ float smem[decltype(cosize(smem_layout))::value];  // (static-only allocation)
  printf("smem size is: %d\n", decltype(cosize(smem_layout))::value);
  Tensor stensor = make_tensor(make_smem_ptr(smem), smem_layout);

  printf("tensor size is: %d, ind size is: %d, rmem size is: %d, rmem4x8 is: %d, smem size is: %d\n",
         (int)bool_tensor.size(), (int)ind_tensor.size(), (int)rmem_4x8_col.size(),
         (int)rmem_4x8_pad.size(), (int)stensor.size());

  auto TA = make_layout(make_shape(Int<32>{}, Int<8>{}), LayoutRight{});  // (m,k) -> thr_idx; k-major
  TiledCopy copyA = make_tiled_copy(
      Copy_Atom<UniversalCopy<uint128_t>, float>{},  // Atom: copy floats as if they were uint128_t
      Layout<Shape<_32, _8>>{},                      // Thr layout 32x8 m-major
      Layout<Shape<_4, _1>>{});                      // Val layout 4x1 m-major
  printf("stensor size 1 is %d\n", (int)cute::size<1>(stensor));
  (void)TA; (void)copyA;
#if 0
  print_latex(copyA);
#endif
}

// template <int head>
// void test_template() {
//   if (head % 2 == 0) {
//     std::cout << "head is even" << std::endl;
//   }
// }

void test_cute_tensor() {
  dim3 thread_block(16, 16);
  dim3 grid(16);
  // Launch configuration reconstructed: 16 blocks of 16x16 threads.
  test_cute_tensor_kernel<<<grid, thread_block>>>();
}

// Stub; only the index bookkeeping exists so far.
__global__ void md_op(const float *a) {
  int tidx = threadIdx.x;
  int bid = blockIdx.x;
  int hid = blockIdx.y;
  int offset = blockDim.x * blockDim.y;
  // bind to its own ...
  (void)a; (void)tidx; (void)bid; (void)hid; (void)offset;
}
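
// A minimal correctness sketch for the two-pass reduction, assuming this file
// is built as a PyTorch C++ extension. reducemax_check, the 4096-element input,
// and the tolerance are illustrative additions, not part of the original file:
// pass 1 produces one partial sum per block, pass 2 folds the partials.
void reducemax_check() {
  auto src = torch::randn({4096}, torch::dtype(torch::kFloat).device(torch::kCUDA));
  int blocks = (4096 + 1023) / 1024;          // matches the launch math in reducemax()
  auto partial = torch::empty({blocks}, src.options());
  reducemax(src, partial);                    // pass 1: one partial sum per block
  float total = partial.sum().item<float>();  // pass 2: fold the partials
  TORCH_CHECK(std::abs(total - src.sum().item<float>()) < 1e-2f,
              "reducemax partial sums do not match torch::sum");
}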
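
// Sketch of how a TiledCopy like copyA in test_cute_tensor_kernel would be
// used. The kernel, the (128, 8) tile shape, and the gptr/out parameters are
// hypothetical stand-ins; the tile matches the 32x8-thread / 4x1-value tiling,
// so it must be launched with 256 threads and 16-byte-aligned gptr.
__global__ void tiled_copy_demo(const float *gptr, float *out) {
  auto layout = make_layout(make_shape(Int<128>{}, Int<8>{}));
  __shared__ alignas(16) float smem_buf[decltype(cosize(layout))::value];
  Tensor gA = make_tensor(make_gmem_ptr(gptr), layout);   // global-memory tile
  Tensor sA = make_tensor(make_smem_ptr(smem_buf), layout);
  TiledCopy copyA = make_tiled_copy(Copy_Atom<UniversalCopy<uint128_t>, float>{},
                                    Layout<Shape<_32, _8>>{},
                                    Layout<Shape<_4, _1>>{});
  ThrCopy thr_copy = copyA.get_slice(threadIdx.x);  // this thread's slice
  Tensor tAgA = thr_copy.partition_S(gA);           // source fragment
  Tensor tAsA = thr_copy.partition_D(sA);           // destination fragment
  copy(copyA, tAgA, tAsA);                          // 128-bit vectorized copy
  __syncthreads();
  if (threadIdx.x == 0) out[0] = sA(0, 0);
}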
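
// Hypothetical extension binding so the host wrappers above are callable from
// Python; the exported names are assumptions, not from the original file.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("reducemax", &reducemax, "per-block sum reduction (CUDA)");
  m.def("test_cute_tensor", &test_cute_tensor, "launch the CuTe tensor demo kernel");
}
// Example (Python):
//   out = torch.empty(num_blocks, device="cuda", dtype=src.dtype)
//   ext.reducemax(src, out)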