from typing import Optional, Tuple, TYPE_CHECKING

from vllm.config import ParallelConfig
from vllm.logger import init_logger
from vllm.utils import get_open_port, is_hip

logger = init_logger(__name__)

try:
    import ray
    from ray.air.util.torch_dist import TorchDistributedWorker

    class RayWorkerVllm(TorchDistributedWorker):
        """Ray wrapper for vllm.worker.Worker, allowing Worker to be
        lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""

        def __init__(self, init_cached_hf_modules=False) -> None:
            if init_cached_hf_modules:
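                # Initialize the Hugging Face dynamic-module cache in this
                # Ray worker process (used, e.g., for trust_remote_code
                # models) before the Worker is created.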
                from transformers.dynamic_module_utils import init_hf_modules
                init_hf_modules()
            self.worker = None
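
        # The wrapped Worker is created lazily (see init_worker) so that it
        # is only constructed after Ray has set CUDA_VISIBLE_DEVICES for
        # this actor.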
        def init_worker(self, worker_init_fn):
            self.worker = worker_init_fn()
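
        # Delegate attribute lookups to the wrapped Worker so this actor can
        # be used as a stand-in for vllm.worker.Worker.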
        def __getattr__(self, name):
            return getattr(self.worker, name)
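
        # Generic remote entry point: resolve a method by name (on this
        # wrapper or, via __getattr__, on the wrapped Worker) and call it.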
        def execute_method(self, method, *args, **kwargs):
            executor = getattr(self, method)
            return executor(*args, **kwargs)

except ImportError as e:
    logger.warning(f"Failed to import Ray with {e!r}. "
                   "For distributed inference, please install Ray with "
                   "`pip install ray pandas pyarrow`.")
    ray = None
    TorchDistributedWorker = None
    RayWorkerVllm = None

if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup


def initialize_cluster(
    parallel_config: ParallelConfig,
    engine_use_ray: bool = False,
    ray_address: Optional[str] = None,
) -> Tuple[Optional[str], Optional["PlacementGroup"]]:
    """Initialize the distributed cluster, using Ray if required.

    Args:
        parallel_config: The configurations for parallel execution.
        engine_use_ray: Whether to use Ray for the async engine.
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.

    Returns:
        A tuple of (`distributed_init_method`, `placement_group`). The
        `distributed_init_method` is the address for initializing the
        distributed backend. `placement_group` includes the specification
        of the resources for each distributed worker.
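
    Example:
        >>> # Illustrative sketch; `parallel_config` is an existing
        >>> # vllm.config.ParallelConfig instance.
        >>> distributed_init_method, placement_group = initialize_cluster(
        ...     parallel_config)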
    """
    if parallel_config.worker_use_ray or engine_use_ray:
        if ray is None:
            raise ImportError(
                "Ray is not installed. Please install Ray to use distributed "
                "serving.")
        # Connect to a Ray cluster.
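        # On ROCm (HIP), the GPU count is passed to ray.init() explicitly,
        # since Ray may not auto-detect AMD GPUs.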
        if is_hip():
            ray.init(address=ray_address,
                     ignore_reinit_error=True,
                     num_gpus=parallel_config.world_size)
        else:
            ray.init(address=ray_address, ignore_reinit_error=True)

    if not parallel_config.worker_use_ray:
        # Initialize the cluster locally.
        port = get_open_port()
        # We need to set up the distributed init method to make sure
        # the distributed Megatron code (e.g., get world size) works correctly.
        distributed_init_method = f"tcp://localhost:{port}"
        return distributed_init_method, None
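
    # If this process is already running inside a Ray placement group (e.g.,
    # launched from another Ray job), reuse it instead of creating a new one.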
    current_placement_group = ray.util.get_current_placement_group()
    if current_placement_group:
        # We are in a placement group
        bundles = current_placement_group.bundle_specs
        # Verify that we can use the placement group.
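        # Each bundle may reserve at most one GPU, and there must be at least
        # as many single-GPU bundles as workers (world_size).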
        gpu_bundles = 0
        for bundle in bundles:
            bundle_gpus = bundle.get("GPU", 0)
            if bundle_gpus > 1:
                raise ValueError(
                    "Placement group bundle cannot have more than 1 GPU.")
            if bundle_gpus:
                gpu_bundles += 1
        if parallel_config.world_size > gpu_bundles:
            raise ValueError(
                "The number of required GPUs exceeds the total number of "
                "available GPUs in the placement group.")
    else:
        num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0)
        if parallel_config.world_size > num_gpus_in_cluster:
            raise ValueError(
                "The number of required GPUs exceeds the total number of "
                "available GPUs in the cluster.")
        # Create a new placement group with one 1-GPU bundle per worker.
        current_placement_group = ray.util.placement_group([{
            "GPU": 1
        }] * parallel_config.world_size)
        # Wait until the placement group is ready. This blocks until all
        # requested resources are available and will time out if they
        # cannot be provisioned.
        ray.get(current_placement_group.ready(), timeout=1800)

    return None, current_placement_group