diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 2fdc08c5..0f223571 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -16,18 +16,17 @@ def main(args: argparse.Namespace):
 
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
-    llm = LLM(
-        model=args.model,
-        tokenizer=args.tokenizer,
-        quantization=args.quantization,
-        tensor_parallel_size=args.tensor_parallel_size,
-        trust_remote_code=args.trust_remote_code,
-        dtype=args.dtype,
-        enforce_eager=args.enforce_eager,
-        kv_cache_dtype=args.kv_cache_dtype,
-        device=args.device,
-        ray_workers_use_nsight=args.ray_workers_use_nsight,
-    )
+    llm = LLM(model=args.model,
+              tokenizer=args.tokenizer,
+              quantization=args.quantization,
+              tensor_parallel_size=args.tensor_parallel_size,
+              trust_remote_code=args.trust_remote_code,
+              dtype=args.dtype,
+              enforce_eager=args.enforce_eager,
+              kv_cache_dtype=args.kv_cache_dtype,
+              device=args.device,
+              ray_workers_use_nsight=args.ray_workers_use_nsight,
+              download_dir=args.download_dir)
 
     sampling_params = SamplingParams(
         n=args.n,
@@ -151,5 +150,10 @@ if __name__ == '__main__':
         action='store_true',
         help="If specified, use nsight to profile ray workers",
     )
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
     args = parser.parse_args()
     main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index fae4776b..6ccdd865 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -75,6 +75,7 @@ def run_vllm(
     device: str,
     enable_prefix_caching: bool,
     gpu_memory_utilization: float = 0.9,
+    download_dir: Optional[str] = None,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(model=model,
@@ -89,7 +90,8 @@ def run_vllm(
               enforce_eager=enforce_eager,
               kv_cache_dtype=kv_cache_dtype,
               device=device,
-              enable_prefix_caching=enable_prefix_caching)
+              enable_prefix_caching=enable_prefix_caching,
+              download_dir=download_dir)
 
     # Add the requests to the engine.
     for prompt, _, output_len in requests:
@@ -208,12 +210,14 @@ def main(args: argparse.Namespace):
                                    args.output_len)
 
     if args.backend == "vllm":
-        elapsed_time = run_vllm(
-            requests, args.model, args.tokenizer, args.quantization,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-            args.trust_remote_code, args.dtype, args.max_model_len,
-            args.enforce_eager, args.kv_cache_dtype, args.device,
-            args.enable_prefix_caching, args.gpu_memory_utilization)
+        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
+                                args.quantization, args.tensor_parallel_size,
+                                args.seed, args.n, args.use_beam_search,
+                                args.trust_remote_code, args.dtype,
+                                args.max_model_len, args.enforce_eager,
+                                args.kv_cache_dtype, args.device,
+                                args.enable_prefix_caching,
+                                args.gpu_memory_utilization, args.download_dir)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -314,6 +318,11 @@ if __name__ == "__main__":
         "--enable-prefix-caching",
         action='store_true',
         help="enable automatic prefix caching for vLLM backend.")
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
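
A minimal sketch of how the new option could be exercised once the patch is applied; the model id and cache path below are placeholders, not part of the diff:

    # Hypothetical invocation of the patched benchmark script:
    #   python benchmarks/benchmark_latency.py --model facebook/opt-125m --download-dir /data/hf-cache
    # which the script forwards to the engine roughly as:
    from vllm import LLM

    llm = LLM(model="facebook/opt-125m",      # placeholder model id
              download_dir="/data/hf-cache")  # weights downloaded/loaded here instead of the default HF cache dir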