From 616e600e0b092050213e79fd2a10baabb30dcf6d Mon Sep 17 00:00:00 2001
From: Marut Pandya
Date: Tue, 28 May 2024 17:16:18 -0700
Subject: [PATCH] [Misc] add gpu_memory_utilization arg (#5079)

Signed-off-by: pandyamarut
---
 benchmarks/benchmark_latency.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 3146fb33..f69d91a0 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -35,7 +35,8 @@ def main(args: argparse.Namespace):
               use_v2_block_manager=args.use_v2_block_manager,
               enable_chunked_prefill=args.enable_chunked_prefill,
               download_dir=args.download_dir,
-              block_size=args.block_size)
+              block_size=args.block_size,
+              gpu_memory_utilization=args.gpu_memory_utilization)
 
     sampling_params = SamplingParams(
         n=args.n,
@@ -214,5 +215,11 @@ if __name__ == '__main__':
         type=str,
         default=None,
         help='Path to save the latency results in JSON format.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1.'
+                        'If unspecified, will use the default value of 0.9.')
     args = parser.parse_args()
     main(args)