diff --git a/vllm/envs.py b/vllm/envs.py
index e4cf6a02..4f7a7ad7 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -137,7 +137,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
         os.path.join(get_default_cache_root(), "vllm"),
     )),
 
-    # used in distributed environment to determine the master address
+    # used in a distributed environment to determine the IP address
+    # of the current node when the node has multiple network interfaces.
+    # If you are using multi-node inference, you should set this
+    # differently on each node.
     'VLLM_HOST_IP':
     lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
 
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index aec6998d..760c06cb 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -218,6 +218,19 @@ class RayGPUExecutor(DistributedGPUExecutor):
         for node_id, gpu_ids in node_gpus.items():
             node_gpus[node_id] = sorted(gpu_ids)
 
+        all_ips = set(worker_ips + [driver_ip])
+        n_ips = len(all_ips)
+        n_nodes = len(node_workers)
+
+        if n_nodes != n_ips:
+            raise RuntimeError(
+                f"Every node should have a unique IP address. Got {n_nodes}"
+                f" nodes with node ids {list(node_workers.keys())} and "
+                f"{n_ips} unique IP addresses {all_ips}. Please check your"
+                " network configuration. If you set the `VLLM_HOST_IP` or"
+                " `HOST_IP` environment variable, make sure it is unique on"
+                " each node.")
+
         VLLM_INSTANCE_ID = get_vllm_instance_id()
 
         # Set environment variables for the driver and workers.
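For context, the new guard in `RayGPUExecutor` boils down to the following standalone sketch. The helper name `check_unique_node_ips` and the sample IPs and node ids are illustrative, not part of the patch; the patch itself runs this logic inline after collecting worker IPs from Ray.

```python
# Minimal sketch of the uniqueness check added in this patch.
# `worker_ips`, `driver_ip`, and `node_workers` mirror the values the
# executor gathers from its Ray workers; the helper name is hypothetical.
from typing import Dict, List


def check_unique_node_ips(worker_ips: List[str], driver_ip: str,
                          node_workers: Dict[str, List[int]]) -> None:
    """Raise if two Ray nodes resolve to the same IP address."""
    all_ips = set(worker_ips + [driver_ip])
    n_ips = len(all_ips)
    n_nodes = len(node_workers)
    if n_nodes != n_ips:
        raise RuntimeError(
            f"Every node should have a unique IP address. Got {n_nodes}"
            f" nodes with node ids {list(node_workers.keys())} and "
            f"{n_ips} unique IP addresses {all_ips}.")


# Two nodes that both report 10.0.0.1 (e.g. both resolved the same
# interface, or VLLM_HOST_IP was copied verbatim to every node) fail fast
# instead of hanging later during distributed initialization:
check_unique_node_ips(
    worker_ips=["10.0.0.1", "10.0.0.1"],
    driver_ip="10.0.0.1",
    node_workers={"node-a": [0], "node-b": [1]},
)  # raises RuntimeError
```

Per the updated `VLLM_HOST_IP` docstring above, the typical remedy on a multi-node deployment is to export a distinct `VLLM_HOST_IP` on each node before starting the workers, pointing at the interface the nodes use to reach each other.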