From 02a43f82a97e37581b48f1c177d3393aca4fe3f2 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Sat, 23 Nov 2024 00:14:19 -0500
Subject: [PATCH] Update default max_num_batch_tokens for chunked prefill to 2048 (#10544)

---
 vllm/config.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 730b069e..42a44f54 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1133,9 +1133,9 @@ class SchedulerConfig:
                     # max_num_batched_tokens.
                     self.max_num_batched_tokens = max(self.max_model_len, 2048)
                 else:
-                    # It is the values that have the best balance between ITL
-                    # and TTFT on A100. Note it is not optimized for throughput.
-                    self.max_num_batched_tokens = 512
+                    # This value is chosen to have a balance between ITL
+                    # and TTFT. Note it is not optimized for throughput.
+                    self.max_num_batched_tokens = 2048
             else:
                 # If max_model_len is too short, use 2048 as the default value
                 # for higher throughput.
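
For reference, a minimal usage sketch of how this default interacts with explicit configuration. It assumes the public `LLM` entrypoint and its `enable_chunked_prefill` / `max_num_batched_tokens` engine arguments, which flow through to `SchedulerConfig`; the model name and prompt below are illustrative only, not part of this patch.

    from vllm import LLM, SamplingParams

    # With chunked prefill enabled and max_num_batched_tokens left unset,
    # SchedulerConfig now falls back to 2048 tokens per batch instead of 512.
    llm = LLM(
        model="facebook/opt-125m",        # illustrative model choice
        enable_chunked_prefill=True,
        max_num_batched_tokens=2048,      # explicit value matching the new default
    )

    outputs = llm.generate(
        ["Chunked prefill splits long prompts across scheduler steps."],
        SamplingParams(max_tokens=32),
    )
    print(outputs[0].outputs[0].text)

Passing the value explicitly, as above, overrides the default either way; the patch only changes what happens when the argument is omitted.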