Support max-model-len argument for throughput benchmark (#1858)

aisensiy 2023-12-01 00:10:24 +08:00 committed by GitHub
parent 51d3cb951d
commit 8d8c2f6ffe


@@ -69,6 +69,7 @@ def run_vllm(
     use_beam_search: bool,
     trust_remote_code: bool,
     dtype: str,
+    max_model_len: Optional[int] = None,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(
@@ -79,6 +80,7 @@ def run_vllm(
         seed=seed,
         trust_remote_code=trust_remote_code,
         dtype=dtype,
+        max_model_len=max_model_len,
     )
     # Add the requests to the engine.
@@ -201,7 +203,8 @@ def main(args: argparse.Namespace):
         elapsed_time = run_vllm(requests, args.model, args.tokenizer,
                                 args.quantization, args.tensor_parallel_size,
                                 args.seed, args.n, args.use_beam_search,
-                                args.trust_remote_code, args.dtype)
+                                args.trust_remote_code, args.dtype,
+                                args.max_model_len)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@ -261,6 +264,12 @@ if __name__ == "__main__":
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
type=str,
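
The new flag maps directly onto the engine's own max_model_len keyword, so the same cap can be set when constructing LLM yourself. A minimal sketch of that path (the model name and lengths here are illustrative, not part of this commit):

    from vllm import LLM, SamplingParams

    # Cap total sequence length (prompt + generated tokens) at 2048 rather
    # than the value derived from the model config; this is the same kwarg
    # that run_vllm() now forwards from --max-model-len.
    llm = LLM(
        model="facebook/opt-125m",  # illustrative model choice
        max_model_len=2048,
    )

    sampling_params = SamplingParams(temperature=0.0, max_tokens=64)
    outputs = llm.generate(["The quick brown fox"], sampling_params)
    print(outputs[0].outputs[0].text)

A smaller cap is the usual way to benchmark models whose default context length would not fit in the available KV-cache memory on the GPU.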