[cuda][misc] remove error_on_invalid_device_count_status (#7069)
parent cf2a1a4d9d
commit 660dea1235
@@ -17,7 +17,6 @@ from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.triton_utils import maybe_set_triton_cache_manager
 from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
-                        error_on_invalid_device_count_status,
                         get_distributed_init_method, get_open_port,
                         get_vllm_instance_id, make_async,
                         update_environment_variables)
@@ -79,8 +78,6 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
             f"please ensure that world_size ({world_size}) "
             f"is less than than max local gpu count ({cuda_device_count})")
 
-        error_on_invalid_device_count_status()
-
         # Multiprocessing-based executor does not support multi-node setting.
         # Since it only works for single node, we can use the loopback address
         # 127.0.0.1 for communication.
@@ -10,10 +10,9 @@ from vllm.executor.distributed_gpu_executor import ( # yapf: disable
 from vllm.executor.ray_utils import RayWorkerWrapper, ray
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (_run_task_with_lock,
-                        error_on_invalid_device_count_status,
-                        get_distributed_init_method, get_ip, get_open_port,
-                        get_vllm_instance_id, make_async)
+from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
+                        get_ip, get_open_port, get_vllm_instance_id,
+                        make_async)
 
 if ray is not None:
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -216,8 +215,6 @@ class RayGPUExecutor(DistributedGPUExecutor):
         distributed_init_method = get_distributed_init_method(
             driver_ip, get_open_port())
 
-        error_on_invalid_device_count_status()
-
         # Initialize the actual workers inside worker wrapper.
         init_worker_all_kwargs = [
             self._get_worker_kwargs(
@@ -1,6 +1,5 @@
 import argparse
 import asyncio
-import contextlib
 import datetime
 import enum
 import gc
@@ -923,28 +922,6 @@ def cuda_device_count_stateless() -> int:
     return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
 
 
-def error_on_invalid_device_count_status():
-    cache_entries = 0
-    with contextlib.suppress(Exception):
-        # future pytorch will fix the issue, device_count will not be cached
-        # at that time, `.cache_info().currsize` will error out
-        cache_entries = torch.cuda.device_count.cache_info(  # type: ignore
-        ).currsize
-    if cache_entries != 0:
-        # the function is already called, and the result is cached
-        remembered = torch.cuda.device_count()
-        current = cuda_device_count_stateless()
-        if remembered > current:
-            raise RuntimeError(
-                "The number of CUDA devices has changed since the first "
-                "call to torch.cuda.device_count(). This is not allowed "
-                "and may result in undefined behavior. Please check out "
-                "https://github.com/vllm-project/vllm/issues/6056 to "
-                "find the first call to torch.cuda.device_count() "
-                "and defer it until the engine is up. Or you can set "
-                "CUDA_VISIBLE_DEVICES to the GPUs you want to use.")
-
-
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
 # all the related functions work on real physical device ids.
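For context, the guard removed above existed because older PyTorch builds cache the result of torch.cuda.device_count(), so a later change to CUDA_VISIBLE_DEVICES is not reflected by that call, while vLLM's cuda_device_count_stateless re-evaluates against the current environment (see the issue linked in the removed error message). A minimal sketch of the stale-count scenario the check defended against, assuming a PyTorch version that still caches the count and a machine with two visible GPUs:

    # Sketch only: illustrates the stale-count problem behind the removed check.
    # Assumes an older PyTorch that caches device_count() and a 2-GPU machine.
    import os

    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

    import torch

    from vllm.utils import cuda_device_count_stateless

    print(torch.cuda.device_count())      # 2 -- result is now cached internally

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    print(torch.cuda.device_count())      # still 2 on caching PyTorch builds
    print(cuda_device_count_stateless())  # 1 -- re-reads CUDA_VISIBLE_DEVICES

The removed error_on_invalid_device_count_status() raised as soon as the cached torch value exceeded the stateless one; the rest of vLLM continues to rely on cuda_device_count_stateless directly.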