#include "core.h" #include #include #include // #include #include __global__ void add_two_tensors_cuda(const float *input0, const float *input2, float *output, int size) { int index = blockIdx.x * blockDim.x + threadIdx.x; // printf("index is : %d, blockidx is : %d, blockdim is %d, threadidx is %d\n", index, blockIdx.x, blockDim.x, threadIdx.x); if (index < size) { output[index] = input0[index] + input2[index]; } __syncthreads(); } __device__ float sigmoid(float x) { return 1.0f / (1.0f + exp(-x)); } template __global__ void matmul_cuda(const T *in1, const T *in2, T *output, int row, int col, int col2) { int index = blockIdx.x * blockDim.x + threadIdx.x; int row_index = index / col2; int col_index = index % col2; if (index < row * col2) { float sum = 0; for (int i = 0; i < col; i++) { sum += in1[row_index * col + i] * in2[i * col2 + col_index]; } output[index] = sum; } __syncthreads(); } template __global__ void matmul_sigmoid_cuda(const T *in1, const T *in2, T *output, int row, int col, int col2) { int index = blockIdx.x * blockDim.x + threadIdx.x; int row_index = index / col2; int col_index = index % col2; if (index < row * col2) { float sum = 0; output[index] = 0; for (int i = 0; i < col; i++) { sum += in1[row_index * col + i] * in2[i * col2 + col_index]; } output[index] = sigmoid(sum); } __syncthreads(); } #define BASE_BLOCK 256 #define CALL_ADD_FUNCTION \ add_two_tensors_cuda<<<(input1.size(0) * input1.size(1) + BASE_BLOCK - 1) / BASE_BLOCK, BASE_BLOCK>>>(src, src1, dest, input1.size(0) * input1.size(1)); void add_two_tensors(const torch::Tensor &input1, const torch::Tensor &input2, torch::Tensor &output) { // cout << input1.dtype() << " the size 1 is : " << input1.size(0) << " size 2 is " << input1.size(1) << "output dim is :" << output.size(0) << output.size(1) << endl; float *src = input1.data_ptr(); float *src1 = input2.data_ptr(); float *dest = output.data_ptr(); CALL_ADD_FUNCTION; } // // write to the columns of the output tensor void rope_tensors(const torch::Tensor &input, torch::Tensor &output, int rope_index_start) { printf("what the fuck"); } void matmul(const torch::Tensor &input1, const torch::Tensor &input2, torch::Tensor &output) { float *in1 = input1.data_ptr(); float *in2 = input2.data_ptr(); float *in3 = output.data_ptr(); int row = input1.size(0); int col = input1.size(1); int col2 = input2.size(1); matmul_cuda<<<(row * col2 + BASE_BLOCK - 1) / BASE_BLOCK, BASE_BLOCK>>>(in1, in2, in3, row, col, col2); } void matmul_sigmoid(const torch::Tensor &input1, const torch::Tensor &input2, torch::Tensor &output) { float *in1 = input1.data_ptr(); float *in2 = input2.data_ptr(); float *in3 = output.data_ptr(); int row = input1.size(0); int col = input1.size(1); int col2 = input2.size(1); matmul_sigmoid_cuda<<<(row * col2 + BASE_BLOCK - 1) / BASE_BLOCK, BASE_BLOCK>>>(in1, in2, in3, row, col, col2); } // template // __global__ void flash_attention(const dtype *q, // const dtype *k, const dytype *v, dtype *output, // int batch_size, int seq_len, int head_num, int head_dim) // { // int index = blockIdx.x * blockDim.x + threadIdx.x; // int batch_index = index / (head_num * seq_len); // int head_index = (index % (head_num * seq_len)) / seq_len; // int seq_index = index % seq_len; // if (index < batch_size * head_num * seq_len) // { // float sum = 0; // for (int i = 0; i < head_dim; i++) // { // sum += q[batch_index * head_num * seq_len * head_dim + head_index * seq_len * head_dim + i] * k[batch_index * head_num * seq_len * head_dim + head_index * seq_len * head_dim + i]; // } // 
#define TILE_WIDTH 32

// Tiled matrix multiply using shared memory: each block computes a TILE_WIDTH x TILE_WIDTH tile
// of C, streaming matching tiles of A and B through shared memory. Assumes square matrices
// whose width is a multiple of TILE_WIDTH.
__global__ void matrixMul(float *C, float *A, float *B, int width)
{
    __shared__ float As[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Bs[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;

    float Pvalue = 0;
    for (int m = 0; m < width / TILE_WIDTH; ++m)
    {
        As[ty][tx] = A[Row * width + (m * TILE_WIDTH + tx)];
        Bs[ty][tx] = B[(m * TILE_WIDTH + ty) * width + Col];
        __syncthreads();

        for (int k = 0; k < TILE_WIDTH; ++k)
        {
            Pvalue += As[ty][k] * Bs[k][tx];
        }
        __syncthreads();
    }
    C[Row * width + Col] = Pvalue;
}

void matmul_shared(const torch::Tensor &input1, const torch::Tensor &input2, torch::Tensor &output)
{
    float *in1 = input1.data_ptr<float>();
    float *in2 = input2.data_ptr<float>();
    float *in3 = output.data_ptr<float>();
    int width = input1.size(0);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    dim3 dimGrid(width / TILE_WIDTH, width / TILE_WIDTH);
    // matrixMul expects the output pointer first, then the two inputs.
    matrixMul<<<dimGrid, dimBlock>>>(in3, in1, in2, width);
}
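// A minimal self-test sketch for matmul_shared, not part of the original file: it builds random
// float32 CUDA tensors whose width is a multiple of TILE_WIDTH, runs the shared-memory kernel,
// and compares against PyTorch's own mm. The function name test_matmul_shared is illustrative only.
void test_matmul_shared()
{
    torch::Tensor a = torch::rand({1024, 1024}, torch::device(torch::kCUDA));
    torch::Tensor b = torch::rand({1024, 1024}, torch::device(torch::kCUDA));
    torch::Tensor c = torch::empty({1024, 1024}, torch::device(torch::kCUDA));

    matmul_shared(a, b, c);
    cudaDeviceSynchronize(); // kernel launches are asynchronous

    torch::Tensor ref = torch::mm(a, b);
    printf("matmul_shared max abs error: %f\n", (c - ref).abs().max().item<float>());
}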
// // Define the matrix dimensions
// #define WMMA_M 16
// #define WMMA_N 16
// #define WMMA_K 16
// __global__ void matrixMul_mma(half *A, half *B, float *C, int M, int N, int K)
// {
//     // Use shared memory to hold the input matrices
//     __shared__ half a[WMMA_M][WMMA_K];
//     __shared__ half b[WMMA_K][WMMA_N];
//     int warpId = threadIdx.x / warpSize;
//     int laneId = threadIdx.x % warpSize;
//     // Declare the matrix fragments used by WMMA
//     wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
//     wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
//     wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;
//     wmma::fill_fragment(c_frag, 0.0f);
//     // Load data into the fragments
//     wmma::load_matrix_sync(a_frag, a[warpId], WMMA_K);
//     wmma::load_matrix_sync(b_frag, b[warpId], WMMA_N);
//     // Perform the matrix multiply
//     wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
//     // Store the result back to global memory
//     wmma::store_matrix_sync(&C[warpId * WMMA_M * WMMA_N], c_frag, WMMA_N, wmma::mem_row_major);
// }
// at::Tensor gemm_mma(const at::Tensor &a, const at::Tensor &b)
// {
//     __half *a_p = reinterpret_cast<__half *>(a.data_ptr());
//     __half *b_p = reinterpret_cast<__half *>(b.data_ptr());
//     at::Tensor c = torch::empty({a.size(0), b.size(1)}, torch::kFloat32);
//     float *c_p = c.data_ptr<float>();
//     matrixMul_mma<<<1, 1>>>(a_p, b_p, c_p, a.size(0), b.size(1), 16);
//     return c;
// }

/*
// Define the matrix dimensions
#define MM 1024
#define NN 1024
#define KK 1024

// Define the dimensions of the WMMA matrix fragments
constexpr int WMMA_M = 16;
constexpr int WMMA_N = 16;
constexpr int WMMA_K = 16;

// Define the matrix-multiplication kernel
__global__ void wmma_gemm(__half *a, __half *b, __half *c)
{
    // Compute the position of the matrix tile this thread should handle
    int wmma_m_index = (blockIdx.x * blockDim.x + threadIdx.x) / WMMA_M;
    int wmma_n_index = (blockIdx.y * blockDim.y + threadIdx.y) / WMMA_N;

    // Declare the WMMA matrix fragments
    nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, __half, nvcuda::wmma::row_major> a_frag;
    nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, __half, nvcuda::wmma::row_major> b_frag;
    nvcuda::wmma::fragment<nvcuda::wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, __half> c_frag;

    // Initialize the C matrix fragment
    // nvcuda::wmma::fill_fragment(c_frag, __float2half(0.0f));

    // For each K-slice of the A and B matrices
    for (int k = 0; k < KK; k += WMMA_K)
    {
        // Load fragments of A and B
        nvcuda::wmma::load_matrix_sync(a_frag, a + wmma_m_index * WMMA_M + k * MM, MM);
        nvcuda::wmma::load_matrix_sync(b_frag, b + k * KK + wmma_n_index * WMMA_N, KK);
        // Perform the matrix multiply-accumulate
        nvcuda::wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
    }
    // Store the result back into the C matrix
    nvcuda::wmma::store_matrix_sync(c + wmma_m_index * WMMA_M + wmma_n_index * WMMA_N, c_frag, MM, nvcuda::wmma::mem_row_major);
}

at::Tensor gemm_mma(const at::Tensor &A, const at::Tensor &B)
{
    // Get the input Tensors' data pointers, reinterpreted as half
    __half *A_data = reinterpret_cast<__half *>(A.data_ptr());
    __half *B_data = reinterpret_cast<__half *>(B.data_ptr());

    // Create the output Tensor
    torch::Tensor C = torch::zeros({A.size(0), B.size(1)}, A.options());
    dim3 grid(MM / WMMA_M, NN / WMMA_N);
    dim3 block(WMMA_M, WMMA_N);

    // Get the output Tensor's data pointer
    __half *C_data = reinterpret_cast<__half *>(C.data_ptr());

    // Launch the CUDA kernel
    wmma_gemm<<<grid, block>>>(A_data, B_data, C_data);
    return C;
}
*/
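// How these wrappers are exposed to Python is not shown in this file; the bindings may live in
// core.h or another translation unit. If this file were built as a standard torch extension,
// the registration block would look roughly like the commented sketch below (TORCH_EXTENSION_NAME
// is supplied by torch/extension.h); this is an assumed setup, not part of the original code.
//
// PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
// {
//     m.def("add_two_tensors", &add_two_tensors, "element-wise add of two float32 CUDA tensors");
//     m.def("matmul", &matmul, "naive CUDA matrix multiply");
//     m.def("matmul_sigmoid", &matmul_sigmoid, "matrix multiply fused with a sigmoid activation");
//     m.def("matmul_shared", &matmul_shared, "tiled shared-memory matrix multiply");
//     m.def("rope_tensors", &rope_tensors, "rotary position embedding (stub)");
// }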