diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..23830fb
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "git.ignoreLimitWarning": true
+}
diff --git a/csrc/core.h b/csrc/core.h
index b523ebe..4c0b4f9 100644
--- a/csrc/core.h
+++ b/csrc/core.h
@@ -16,4 +16,5 @@ void org_mm_shared_half(const at::Tensor &a, const at::Tensor &b, at::Tensor &c)
 void print_idx();
 void reducemax(const torch::Tensor &src, torch::Tensor &dest);
 void test_cute_tensor();
+void md_mm(const torch::Tensor &src);
 #endif
\ No newline at end of file
diff --git a/csrc/core_bind.cpp b/csrc/core_bind.cpp
index ee7f396..1672e6a 100644
--- a/csrc/core_bind.cpp
+++ b/csrc/core_bind.cpp
@@ -15,4 +15,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("print_idx", &print_idx, "just_printidx");
     m.def("reducemax", &reducemax, "reduce max");
     m.def("test_cute_tensor", &test_cute_tensor, "just test cute tensor");
+    m.def("md_mm", &md_mm, "just a test of multi dimension mm");
 }
diff --git a/csrc/max.cu b/csrc/max.cu
index 6788b70..344b62a 100644
--- a/csrc/max.cu
+++ b/csrc/max.cu
@@ -81,17 +81,17 @@ __global__ void test_cute_tensor_kernel()
                                 Stride<_32, _2>{});
    Layout smem_layout = make_layout(make_shape(Int<4>{}, Int<8>{}));
    __shared__ float smem[decltype(cosize(smem_layout))::value]; // (static-only allocation)
-   printf("smem size is :%d\n", decltype(cosize(smem_layout))::value);
+   // printf("smem size is :%d\n", decltype(cosize(smem_layout))::value);
    Tensor stensor = make_tensor(make_smem_ptr(smem), smem_layout);
    printf("tensor size is: %d, ind size is: %d, rmem size is: %d , rmem4x8 is: %d, smem size is: %d\n",
           bool_tensor.size(), ind_tensor.size(), rmem_4x8_col.size(),
           rmem_4x8_pad.size(), stensor.size());

-   auto TA = make_layout(make_shape(Int<32>{}, Int<8>{}), LayoutRight{}); // (m,k) -> thr_idx; k-major
-   TiledCopy copyA = make_tiled_copy(Copy_Atom<UniversalCopy<uint128_t>, TA>{}, // Atom: Copy TAs as if they were uint128_t
-                                     Layout<Shape<_32, _8>>{},                  // Thr layout 32x8 m-major
-                                     Layout<Shape<_4, _1>>{});                  // Val layout 4x1 m-major
+   auto TA = make_layout(make_shape(Int<32>{}, Int<8>{}), LayoutRight{});          // (m,k) -> thr_idx; k-major
+   TiledCopy copyA = make_tiled_copy(Copy_Atom<UniversalCopy<uint128_t>, float>{}, // Atom: Copy TAs as if they were uint128_t
+                                     Layout<Shape<_32, _8>>{},                     // Thr layout 32x8 m-major
+                                     Layout<Shape<_4, _1>>{});                     // Val layout 4x1 m-major
    print_latex(copyA);
 }
@@ -108,4 +108,13 @@ void test_cute_tensor()
    dim3 thread_block(16, 16);
    dim3 block(16);
    test_cute_tensor_kernel<<<block, thread_block>>>();
+}
+
+__global__ void md_op(const float *a)
+{
+   int tidx = threadIdx.x;
+   int bid = blockIdx.x;
+   int hid = blockIdx.y;
+   int offset = blockDim.x * blockDim.y;
+   // bind to its own ...
 }
\ No newline at end of file
diff --git a/csrc/md.cu b/csrc/md.cu
new file mode 100644
index 0000000..7f62793
--- /dev/null
+++ b/csrc/md.cu
@@ -0,0 +1,29 @@
+#include "core.h"
+
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+__global__ void md_mm_kernel(const float *src, int stride_a, int stride_b, int stride_c, int thread_num)
+{
+    int batch_idx = blockIdx.x;
+    int head_idx = blockIdx.y;
+    int sequence_idx = blockIdx.z;
+    int tidx = threadIdx.x;
+}
+
+void md_mm(const torch::Tensor &src)
+{
+    int batch_size = src.size(0);
+    int head_size = src.size(1);
+    int sequence_size = src.size(2);
+    int head_dim = src.size(3);
+    int data_block = sequence_size * head_dim;
+    int thread_num = 256;
+    dim3 grid(batch_size, head_size, (data_block + thread_num - 1) / thread_num);
+    dim3 block(thread_num);
+    md_mm_kernel<<<grid, block>>>(reinterpret_cast<const float *>(src.data_ptr()),
+                                  src.stride(0), src.stride(1), src.stride(2),
+                                  thread_num);
+}
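The md_mm host wrapper above launches a (batch_size, head_size, ceil(seq*dim/256)) grid, but md_mm_kernel is still an empty stub. Below is a minimal sketch of the per-thread indexing that grid implies, assuming the last two tensor dimensions are contiguous; the name md_mm_kernel_sketch and the data_block bounds parameter are illustrative additions, not part of this diff.

// Sketch only: map each thread of the md_mm grid to one element of a
// (batch, head, seq, head_dim) float tensor. data_block = seq * head_dim
// is an assumed extra argument, used only as a bounds guard.
__global__ void md_mm_kernel_sketch(const float *src, int stride_a, int stride_b,
                                    int data_block)
{
    int batch_idx = blockIdx.x;                           // grid.x = batch_size
    int head_idx = blockIdx.y;                            // grid.y = head_size
    int elem_idx = blockIdx.z * blockDim.x + threadIdx.x; // flat index into seq * head_dim
    if (elem_idx >= data_block)                           // last z-block may be ragged
        return;
    float v = src[batch_idx * stride_a + head_idx * stride_b + elem_idx];
    (void)v; // a real kernel would compute with v here
}

Passing strides instead of sizes keeps the batch/head indexing correct for non-contiguous views, but the flat elem_idx still assumes src.stride(3) == 1.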
diff --git a/fi/load_model.py b/fi/load_model.py
index 32679b1..1f9a5a4 100644
--- a/fi/load_model.py
+++ b/fi/load_model.py
@@ -1,14 +1,44 @@
 # coding=utf-8
+import os
 import torch
 import transformers
 import torch.nn as nn
-from transformers.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
-from transformers.models.llama import LlamaConfig, LlamaForCausalLM
-from transformers import AutoModel, AutoConfig
+from transformers import AutoModelForCausalLM, AutoConfig
+from transformers.models import qwen2, gemma2, llama, gemma
+
+decode_layers = {
+    "gemma": gemma.modeling_gemma.GemmaDecoderLayer,
+    "gemma2": gemma2.modeling_gemma2.Gemma2DecoderLayer,
+    "qwen2": qwen2.modeling_qwen2.Qwen2DecoderLayer,
+}
+
+MODELS = {
+    "gemma": gemma.GemmaForCausalLM,
+    "gemma2": gemma2.Gemma2ForCausalLM,
+    "llama": llama.LlamaForCausalLM,
+    "qwen2": qwen2.Qwen2ForCausalLM,
+}


 class ModelLoader:
-    def __init__(self, model_path: str):
-        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+    def __init__(self, model_path: str, pipeline_num: int = 1):
+        self.config_path = os.path.join(model_path, "config.json")
+        self.model_config = AutoConfig.from_pretrained(self.config_path)
+        hidden_layers = getattr(self.model_config, "num_hidden_layers", -1)
+        if hidden_layers == -1:
+            raise ValueError("config has no num_hidden_layers attribute")
+        self.hidden_layers = hidden_layers
+        self.pipeline_num = pipeline_num
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path, trust_remote_code=True
+        )
+        self.model_type = self.model_config.model_type
+        self.per_pipeline_layers = self.hidden_layers // self.pipeline_num
+        module_list = None
+        for x in self.model.modules():
+            if isinstance(x, torch.nn.modules.container.ModuleList):
+                module_list = x
+        if module_list is None:
+            raise ValueError("model does not contain a ModuleList")
diff --git a/setup.py b/setup.py
index fc3902f..3b9bf4d 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@ files = [
     "csrc/matrix.cu",
     "csrc/core_bind.cpp",
     "csrc/max.cu",
+    "csrc/md.cu",
 ]
 extension = CUDAExtension(
     name="torch_cuda_ext.core",
diff --git a/test b/test
new file mode 100755
index 0000000..c1aeb71
Binary files /dev/null and b/test differ
diff --git a/test.cc b/test.cc
new file mode 100644
index 0000000..0345cb5
--- /dev/null
+++ b/test.cc
@@ -0,0 +1,22 @@
+#include <iostream>
+using namespace std;
+
+template <auto v>
+struct C
+{
+    using type = C;
+    static constexpr auto value = v;
+    using value_type = decltype(v);
+    inline constexpr operator value_type() const noexcept { return value; }
+    inline constexpr value_type operator()() const noexcept { return value; }
+};
+
+int main()
+{
+    using _1 = C<10>;
+    auto x = _1{};
+    cout << _1::value << endl;
+    cout << _1::value_type() << endl;
+    cout << x.value << endl;
+    return 0;
+}
\ No newline at end of file
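test.cc is a standalone recreation of the compile-time integer wrapper behind CuTe's Int<N>: the value lives in the type, and the constexpr conversion operator makes an instance usable wherever a constant expression is required, which is the same mechanism as the static shared-memory allocation smem[decltype(cosize(smem_layout))::value] in max.cu. A small sketch of that property, reusing the C<v> definition from test.cc (the names n and buf are illustrative, not from the diff):

#include <iostream>

// Same wrapper as in test.cc, trimmed to the members this sketch needs.
template <auto v>
struct C
{
    static constexpr auto value = v;
    using value_type = decltype(v);
    constexpr operator value_type() const noexcept { return value; }
};

int main()
{
    constexpr C<8> n{}; // assumed compile-time extent
    float buf[n];       // legal: n converts to int in a constant expression,
                        // mirroring max.cu's smem[...::value] allocation
    std::cout << sizeof(buf) / sizeof(buf[0]) << std::endl; // prints 8
    return 0;
}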