from typing import Optional, Tuple, TYPE_CHECKING

from vllm.config import ParallelConfig
from vllm.logger import init_logger
from vllm.utils import get_open_port, is_hip

logger = init_logger(__name__)

try:
    import ray
    from ray.air.util.torch_dist import TorchDistributedWorker

    class RayWorkerVllm(TorchDistributedWorker):
        """Ray wrapper for vllm.worker.Worker, allowing Worker to be
        lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""

        def __init__(self, init_cached_hf_modules=False) -> None:
            if init_cached_hf_modules:
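                # Initialize the Hugging Face dynamic-module cache in this
                # Ray worker process (used, e.g., for trust_remote_code
                # models) before the Worker is created.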
                from transformers.dynamic_module_utils import init_hf_modules
                init_hf_modules()
            self.worker = None
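
        # The wrapped Worker is created lazily (see init_worker) so that it
        # is only constructed after Ray has set CUDA_VISIBLE_DEVICES for
        # this actor.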
        def init_worker(self, worker_init_fn):
            self.worker = worker_init_fn()
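
        # Delegate attribute lookups to the wrapped Worker so this actor can
        # be used as a stand-in for vllm.worker.Worker.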
        def __getattr__(self, name):
            return getattr(self.worker, name)
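
        # Generic remote entry point: resolve a method by name (on this
        # wrapper or, via __getattr__, on the wrapped Worker) and call it.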
        def execute_method(self, method, *args, **kwargs):
            executor = getattr(self, method)
            return executor(*args, **kwargs)

except ImportError as e:
    logger.warning(f"Failed to import Ray with {e!r}. "
                   "For distributed inference, please install Ray with "
                   "`pip install ray pandas pyarrow`.")
    ray = None
    TorchDistributedWorker = None
    RayWorkerVllm = None

if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup


def initialize_cluster(
    parallel_config: ParallelConfig,
    engine_use_ray: bool = False,
    ray_address: Optional[str] = None,
) -> Tuple[Optional[str], Optional["PlacementGroup"]]:
    """Initialize the distributed cluster, using Ray if required.

    Args:
        parallel_config: The configurations for parallel execution.
        engine_use_ray: Whether to use Ray for the async engine.
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.

    Returns:
        A tuple of (`distributed_init_method`, `placement_group`). The
        `distributed_init_method` is the address for initializing the
        distributed backend. `placement_group` includes the specification
        of the resources for each distributed worker.
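
    Example:
        >>> # Illustrative sketch; `parallel_config` is an existing
        >>> # vllm.config.ParallelConfig instance.
        >>> distributed_init_method, placement_group = initialize_cluster(
        ...     parallel_config)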
    """
    if parallel_config.worker_use_ray or engine_use_ray:
        if ray is None:
            raise ImportError(
                "Ray is not installed. Please install Ray to use distributed "
                "serving.")
        # Connect to a Ray cluster.
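        # On ROCm (HIP), the GPU count is passed to ray.init() explicitly,
        # since Ray may not auto-detect AMD GPUs.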
        if is_hip():
            ray.init(address=ray_address,
                     ignore_reinit_error=True,
                     num_gpus=parallel_config.world_size)
        else:
            ray.init(address=ray_address, ignore_reinit_error=True)

    if not parallel_config.worker_use_ray:
        # Initialize the cluster locally.
        port = get_open_port()
        # We need to set up the distributed init method to make sure
        # the distributed Megatron code (e.g., get world size) works correctly.
        distributed_init_method = f"tcp://localhost:{port}"
        return distributed_init_method, None
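
    # If this process is already running inside a Ray placement group (e.g.,
    # launched from another Ray job), reuse it instead of creating a new one.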
    current_placement_group = ray.util.get_current_placement_group()
    if current_placement_group:
        # We are in a placement group
        bundles = current_placement_group.bundle_specs
        # Verify that we can use the placement group.
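        # Each bundle may reserve at most one GPU, and there must be at least
        # as many single-GPU bundles as workers (world_size).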
        gpu_bundles = 0
        for bundle in bundles:
            bundle_gpus = bundle.get("GPU", 0)
            if bundle_gpus > 1:
                raise ValueError(
                    "Placement group bundle cannot have more than 1 GPU.")
            if bundle_gpus:
                gpu_bundles += 1
        if parallel_config.world_size > gpu_bundles:
            raise ValueError(
                "The number of required GPUs exceeds the total number of "
                "available GPUs in the placement group.")
    else:
        num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0)
        if parallel_config.world_size > num_gpus_in_cluster:
            raise ValueError(
                "The number of required GPUs exceeds the total number of "
                "available GPUs in the cluster.")
        # Create a new placement group with one 1-GPU bundle per worker.
        current_placement_group = ray.util.placement_group([{
            "GPU": 1
        }] * parallel_config.world_size)
        # Wait until the placement group is ready. This blocks until all
        # requested resources are available and will time out if they
        # cannot be provisioned.
        ray.get(current_placement_group.ready(), timeout=1800)

    return None, current_placement_group