From b3376e5c76c199acb216addec7c32ac5299bef31 Mon Sep 17 00:00:00 2001
From: Benjamin Kitor
Date: Fri, 7 Jun 2024 18:20:16 -0700
Subject: [PATCH] [Misc] Add args for selecting distributed executor to benchmarks (#5335)

---
 benchmarks/benchmark_latency.py    | 10 +++++++++-
 benchmarks/benchmark_throughput.py | 13 +++++++++++--
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index f69d91a0..1a41b66b 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -36,7 +36,8 @@ def main(args: argparse.Namespace):
         enable_chunked_prefill=args.enable_chunked_prefill,
         download_dir=args.download_dir,
         block_size=args.block_size,
-        gpu_memory_utilization=args.gpu_memory_utilization)
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        distributed_executor_backend=args.distributed_executor_backend)
 
     sampling_params = SamplingParams(
         n=args.n,
@@ -221,5 +222,12 @@ if __name__ == '__main__':
         help='the fraction of GPU memory to be used for '
         'the model executor, which can range from 0 to 1.'
         'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
     args = parser.parse_args()
     main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 7c8cb5ee..90f7433e 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -78,6 +78,7 @@ def run_vllm(
     enable_prefix_caching: bool,
     enable_chunked_prefill: bool,
     max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     download_dir: Optional[str] = None,
 ) -> float:
@@ -100,6 +101,7 @@ def run_vllm(
         download_dir=download_dir,
         enable_chunked_prefill=enable_chunked_prefill,
         max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
     )
 
     # Add the requests to the engine.
@@ -225,8 +227,8 @@ def main(args: argparse.Namespace):
             args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.gpu_memory_utilization,
-            args.download_dir)
+            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.gpu_memory_utilization, args.download_dir)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -368,6 +370,13 @@ if __name__ == "__main__":
         type=str,
         default=None,
         help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
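
Note (not part of the patch): a minimal sketch of how the new flag reaches the
engine, mirroring the pattern the patch adds to benchmarks/benchmark_latency.py.
The model name, prompt, and tensor-parallel size below are illustrative
assumptions, not values taken from the patch.

    import argparse

    from vllm import LLM, SamplingParams

    parser = argparse.ArgumentParser()
    # Illustrative defaults; not part of the patch.
    parser.add_argument('--model', type=str, default='facebook/opt-125m')
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--distributed-executor-backend',
                        choices=['ray', 'mp'],
                        default=None)
    args = parser.parse_args()

    # When more than 1 GPU is used and the flag is left unset, vLLM picks
    # "ray" if it is installed, otherwise "mp" (multiprocessing).
    llm = LLM(model=args.model,
              tensor_parallel_size=args.tensor_parallel_size,
              distributed_executor_backend=args.distributed_executor_backend)

    sampling_params = SamplingParams(max_tokens=16)
    outputs = llm.generate(["Hello, my name is"], sampling_params)
    print(outputs[0].outputs[0].text)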