From b2e0ad3b598ed0e022cdbd678a20821d411873c2 Mon Sep 17 00:00:00 2001
From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com>
Date: Thu, 14 Nov 2024 16:38:20 -0800
Subject: [PATCH] [Perf] Reduce peak memory usage of llama (#10339)

Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com>
---
 vllm/model_executor/models/llama.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 8aed0fea..e53631ef 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -90,8 +90,8 @@ class LlamaMLP(nn.Module):
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
-        gate_up, _ = self.gate_up_proj(x)
-        x = self.act_fn(gate_up)
+        x, _ = self.gate_up_proj(x)
+        x = self.act_fn(x)
         x, _ = self.down_proj(x)
         return x
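
Note (editorial, not part of the original patch): the two-line change lowers peak memory because rebinding the name `x` drops the last Python reference to each intermediate tensor as soon as the next one is assigned, so PyTorch's caching allocator can reuse that memory immediately; with a separate `gate_up` name, both the MLP input and the projection output stay alive through the activation and the down projection. The sketch below illustrates the idea under stated assumptions: `SketchMLP`, `silu_and_mul`, and the layer sizes are hypothetical stand-ins for vLLM's MergedColumnParallelLinear / RowParallelLinear (which return an (output, bias) tuple, hence the `x, _ = ...` unpacking in the diff) and its SiluAndMul op.

    # Illustrative sketch only -- not the vLLM implementation.
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    def silu_and_mul(y: torch.Tensor) -> torch.Tensor:
        # Split the fused gate/up projection and apply SiLU(gate) * up,
        # mimicking what vLLM's SiluAndMul activation computes.
        gate, up = y.chunk(2, dim=-1)
        return F.silu(gate) * up

    class SketchMLP(nn.Module):
        def __init__(self, hidden: int = 4096, intermediate: int = 11008):
            super().__init__()
            self.gate_up_proj = nn.Linear(hidden, 2 * intermediate, bias=False)
            self.down_proj = nn.Linear(intermediate, hidden, bias=False)

        def forward_before(self, x):
            # `x` (the input) stays bound while the activation output is
            # allocated, and `gate_up` stays bound during down_proj, so
            # three large buffers coexist at peak.
            gate_up = self.gate_up_proj(x)
            x = silu_and_mul(gate_up)
            return self.down_proj(x)

        def forward_after(self, x):
            # Rebinding `x` at every step frees the previous tensor as soon
            # as the new one is assigned; at most two large buffers coexist.
            x = self.gate_up_proj(x)
            x = silu_and_mul(x)
            return self.down_proj(x)

    if __name__ == "__main__":
        mlp = SketchMLP(hidden=64, intermediate=128)
        with torch.inference_mode():  # no autograd refs keep tensors alive
            out = mlp.forward_after(torch.randn(4, 64))
        print(out.shape)  # torch.Size([4, 64])

The saving is most visible in inference (vLLM's setting): with autograd enabled, tensors saved for backward would stay alive regardless of Python-level references.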