From a5675d348b126e53928e139d1ed5b2c00a0044e8 Mon Sep 17 00:00:00 2001
From: zifeitong
Date: Wed, 15 May 2024 07:22:09 -0700
Subject: [PATCH] [Bugfix] Properly set distributed_executor_backend in
 ParallelConfig (#4816)

---
 vllm/config.py           |  1 +
 vllm/engine/arg_utils.py | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 26edd456..2eb5bdd1 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -531,6 +531,7 @@ class ParallelConfig:
             If None, will use synchronous tokenization.
         ray_workers_use_nsight: Whether to profile Ray workers with nsight, see
             https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.
+        placement_group: ray distributed model workers placement group.
         distributed_executor_backend: Backend to use for distributed model
             workers, either "ray" or "mp" (multiprocessing). If either
             pipeline_parallel_size or tensor_parallel_size is greater than 1,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index bd44c247..dab86b7c 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -548,14 +548,18 @@ class EngineArgs:
                                    model_config.get_sliding_window(),
                                    self.enable_prefix_caching)
         parallel_config = ParallelConfig(
-            self.pipeline_parallel_size, self.tensor_parallel_size,
-            self.worker_use_ray, self.max_parallel_loading_workers,
+            self.pipeline_parallel_size,
+            self.tensor_parallel_size,
+            self.worker_use_ray,
+            self.max_parallel_loading_workers,
             self.disable_custom_all_reduce,
             TokenizerPoolConfig.create_config(
                 self.tokenizer_pool_size,
                 self.tokenizer_pool_type,
                 self.tokenizer_pool_extra_config,
-            ), self.ray_workers_use_nsight)
+            ),
+            self.ray_workers_use_nsight,
+            distributed_executor_backend=self.distributed_executor_backend)
 
         speculative_config = SpeculativeConfig.maybe_create_spec_config(
             target_model_config=model_config,
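
Note (not part of the patch): the arg_utils.py hunk makes EngineArgs pass the user-supplied distributed_executor_backend through to ParallelConfig instead of silently dropping it. The minimal sketch below shows how that surfaces at the API level; the EngineArgs.create_engine_config() method name and the parallel_config attribute on its return value are assumed from the vLLM code base around this commit and may differ in other versions.

    # Sketch only, assuming vLLM around this commit (not part of the patch).
    from vllm.engine.arg_utils import EngineArgs

    args = EngineArgs(
        model="facebook/opt-125m",          # example model id
        tensor_parallel_size=2,
        distributed_executor_backend="mp",  # "ray" or "mp"
    )
    engine_config = args.create_engine_config()  # assumed method name for this era

    # Before this fix the keyword never reached ParallelConfig, so the engine
    # fell back to its default backend selection; after it, the value propagates.
    assert engine_config.parallel_config.distributed_executor_backend == "mp"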