[Bugfix] Fix illegal memory access in FP8 MoE kernel (#6382)

2024-07-12 14:33:33 -07:00 · 2024-07-12 14:33:33 -07:00 · 75f64d8b94
commit 75f64d8b94
parent 21b2dcedab
1 changed file with 5 additions and 3 deletions
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -492,12 +492,14 @@ def fused_experts(hidden_states: torch.Tensor,
        if tokens_in_chunk == 0:
            break
-        if tokens_in_chunk < CHUNK_SIZE:
+        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
-            # will only happen in the last chunk
+            # Adjust the intermediate cache size and config for the last
+            # chunk. Note that in most cases we only have one chunk
+            # so the cache size and config are already set correctly and
+            # do not need to be adjusted.
            intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
            intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
            intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
            # reload config to get better performance on the last chunk
            config = get_config_func(tokens_in_chunk)
        curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]