[misc][distributed] error on invalid state (#6092)

This commit is contained in:
youkaichao 2024-07-02 23:37:29 -07:00 committed by GitHub
parent d830656a97
commit f666207161
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 29 additions and 1 deletions

View File

@ -10,6 +10,7 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
from vllm.logger import init_logger
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (cuda_device_count_stateless,
error_on_invalid_device_count_status,
get_distributed_init_method, get_open_port,
get_vllm_instance_id, make_async,
update_environment_variables)
@ -39,6 +40,8 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
assert world_size <= cuda_device_count_stateless(), (
"please set tensor_parallel_size to less than max local gpu count")
error_on_invalid_device_count_status()
# Multiprocessing-based executor does not support multi-node setting.
# Since it only works for single node, we can use the loopback address
# 127.0.0.1 for communication.

View File

@ -11,7 +11,8 @@ from vllm.executor.distributed_gpu_executor import ( # yapf: disable
from vllm.executor.ray_utils import RayWorkerWrapper, ray
from vllm.logger import init_logger
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
from vllm.utils import (error_on_invalid_device_count_status,
get_distributed_init_method, get_ip, get_open_port,
get_vllm_instance_id, make_async)
if ray is not None:
@ -175,6 +176,8 @@ class RayGPUExecutor(DistributedGPUExecutor):
distributed_init_method = get_distributed_init_method(
driver_ip, get_open_port())
error_on_invalid_device_count_status()
# Initialize the actual workers inside worker wrapper.
init_worker_all_kwargs = [
self._get_worker_kwargs(

View File

@ -1,5 +1,6 @@
import argparse
import asyncio
import contextlib
import datetime
import enum
import gc
@ -816,6 +817,27 @@ def cuda_device_count_stateless() -> int:
return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
def error_on_invalid_device_count_status():
    """Raise if the visible CUDA device count shrank after
    ``torch.cuda.device_count()`` was first called.

    Some pytorch builds memoize ``torch.cuda.device_count`` with
    ``functools.lru_cache``; if the environment later exposes fewer GPUs
    (e.g. ``CUDA_VISIBLE_DEVICES`` changed), the stale cached value can
    cause undefined behavior. This check errors out early in that case.

    Raises:
        RuntimeError: if the cached device count exceeds the current
            stateless device count.
    """
    cached_entries = 0
    try:
        # Future pytorch versions will drop the caching; then
        # `.cache_info()` does not exist and this lookup raises,
        # which we deliberately tolerate (check becomes a no-op).
        cached_entries = torch.cuda.device_count.cache_info().currsize
    except Exception:
        pass
    if cached_entries == 0:
        # device_count() was never called (or is uncached) — nothing
        # stale to compare against.
        return
    # device_count() already ran and memoized its result; compare it
    # with the current, cache-free count.
    remembered = torch.cuda.device_count()
    current = cuda_device_count_stateless()
    if remembered > current:
        raise RuntimeError(
            "The number of CUDA devices has changed since the first "
            "call to torch.cuda.device_count(). This is not allowed "
            "and may result in undefined behavior. Please check out "
            "https://github.com/vllm-project/vllm/issues/6056 to "
            "find the first call to torch.cuda.device_count() "
            "and defer it until the engine is up. Or you can set "
            "CUDA_VISIBLE_DEVICES to the GPUs you want to use.")
# NVML utils
# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
# all the related functions work on real physical device ids.