[Kernel] Split Marlin MoE kernels into multiple files (#8661)

Co-authored-by: mgoin <michael@neuralmagic.com>
2024-09-24 18:31:42 +02:00 · 2024-09-24 18:31:42 +02:00 · a928ded995
commit a928ded995
parent cc4325b66a
7 changed files with 1551 additions and 1426 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -316,6 +316,11 @@ set(VLLM_MOE_EXT_SRC
 # For CUDA builds only, extend the MoE extension source list with the Marlin
 # MoE kernel files (the shared kernel header plus one header/.cu pair per
 # quantization type) and the marlin_moe_ops.cu entry.
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC
      "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
      "csrc/moe/marlin_moe_ops.cu")
 endif()
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu
@ -0,0 +1,29 @@
 #include "marlin_moe_kernel_ku4b8.h"
 namespace marlin_moe {
 // Tries each vllm::kU4B8 kernel configuration listed in the body.
 //
 // We return bool so we can create these different kernel calls as a sequence
 // of if-elseif's. Returns false when the trailing `else` is reached (none of
 // the listed configurations applied); otherwise control falls through to
 // `return true`.
 //
 // None of the parameters are referenced by name in the visible body.
 // NOTE(review): they are presumably consumed inside the GPTQ_CALL_IF_MOE
 // expansion, which is defined outside this file -- confirm there before
 // renaming or reordering any of them.
 bool call_marlin_moe_kernel_ku4b8(
    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
    bool has_act_order, int group_blocks, int num_threads, int blocks,
    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
    const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
    int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
    int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
    bool replicate_input, bool apply_weights, int m_block, int max_par,
    int cfg_max_m_blocks) {
  // Empty always-false branch: gives the macro invocations below an `if` to
  // chain onto (each one presumably expands to `else if (...) { ... }` --
  // TODO confirm against the macro definition).
  if (false) {
  }
  // One entry per supported configuration for vllm::kU4B8. The three integers
  // presumably correspond to thread_n_blocks, thread_k_blocks and num_threads
  // -- verify against the macro definition.
  GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
  GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256)
  GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128)
  GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128)
  // No listed configuration applied: report that to the caller.
  else {
    return false;
  }
  return true;
 }
 }  // namespace marlin_moe
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h
@ -0,0 +1,20 @@
 #pragma once
 #include "marlin_moe_kernel.h"
 namespace marlin_moe {
 // Declaration of the vllm::kU4B8 kernel-call entry point.
 //
 // We return bool so we can create these different kernel calls as a sequence
 // of if-elseif's.
 // NOTE(review): the definition is expected in marlin_moe_kernel_ku4b8.cu;
 // presumably true means a kernel configuration was selected and false means
 // none applied -- confirm against the definition.
 bool call_marlin_moe_kernel_ku4b8(
    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
    bool has_act_order, int group_blocks, int num_threads, int blocks,
    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
    const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
    int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
    int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
    bool replicate_input, bool apply_weights, int m_block, int max_par,
    int cfg_max_m_blocks);
 }  // namespace marlin_moe
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu
@ -0,0 +1,29 @@
 #include "marlin_moe_kernel_ku8b128.h"
 namespace marlin_moe {
 // Tries each vllm::kU8B128 kernel configuration listed in the body.
 //
 // We return bool so we can create these different kernel calls as a sequence
 // of if-elseif's. Returns false when the trailing `else` is reached (none of
 // the listed configurations applied); otherwise control falls through to
 // `return true`.
 //
 // None of the parameters are referenced by name in the visible body.
 // NOTE(review): they are presumably consumed inside the GPTQ_CALL_IF_MOE
 // expansion, which is defined outside this file -- confirm there before
 // renaming or reordering any of them.
 bool call_marlin_moe_kernel_ku8b128(
    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
    bool has_act_order, int group_blocks, int num_threads, int blocks,
    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
    const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
    int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
    int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
    bool replicate_input, bool apply_weights, int m_block, int max_par,
    int cfg_max_m_blocks) {
  // Empty always-false branch: gives the macro invocations below an `if` to
  // chain onto (each one presumably expands to `else if (...) { ... }` --
  // TODO confirm against the macro definition).
  if (false) {
  }
  // One entry per supported configuration for vllm::kU8B128. The three
  // integers presumably correspond to thread_n_blocks, thread_k_blocks and
  // num_threads -- verify against the macro definition.
  GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
  GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256)
  GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128)
  GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128)
  // No listed configuration applied: report that to the caller.
  else {
    return false;
  }
  return true;
 }
 }  // namespace marlin_moe
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h
@ -0,0 +1,18 @@
 #pragma once
 #include "marlin_moe_kernel.h"
 namespace marlin_moe {
 // Declaration of the vllm::kU8B128 kernel-call entry point.
 //
 // Returns bool so that the per-type kernel calls can be combined as a
 // sequence of if-elseif's.
 // NOTE(review): the definition is expected in marlin_moe_kernel_ku8b128.cu;
 // presumably true means a kernel configuration was selected and false means
 // none applied -- confirm against the definition.
 bool call_marlin_moe_kernel_ku8b128(
    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
    bool has_act_order, int group_blocks, int num_threads, int blocks,
    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
    const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
    int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
    int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
    bool replicate_input, bool apply_weights, int m_block, int max_par,
    int cfg_max_m_blocks);
 }  // namespace marlin_moe
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu