[Core][Distributed] use absolute path for library file (#4271)
This commit is contained in:
parent
ceaf4ed003
commit
c1b4e4157c
@ -553,6 +553,34 @@ def nccl_integrity_check(filepath):
|
|||||||
return version.value
|
return version.value
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=None)
|
||||||
|
def find_library(lib_name: str) -> str:
|
||||||
|
"""
|
||||||
|
Find the library file in the system.
|
||||||
|
`lib_name` is full filename, with both prefix and suffix.
|
||||||
|
This function resolves `lib_name` to the full path of the library.
|
||||||
|
"""
|
||||||
|
# Adapted from https://github.com/openai/triton/blob/main/third_party/nvidia/backend/driver.py#L19 # noqa
|
||||||
|
# According to https://en.wikipedia.org/wiki/Filesystem_Hierarchy_Standard
|
||||||
|
# `/sbin/ldconfig` should exist in all Linux systems.
|
||||||
|
# `/sbin/ldconfig` searches the library in the system
|
||||||
|
libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
|
||||||
|
# each line looks like the following:
|
||||||
|
# libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
|
||||||
|
locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line]
|
||||||
|
# `LD_LIBRARY_PATH` searches the library in the user-defined paths
|
||||||
|
env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
|
||||||
|
if not locs and env_ld_library_path:
|
||||||
|
locs = [
|
||||||
|
os.path.join(dir, lib_name)
|
||||||
|
for dir in env_ld_library_path.split(":")
|
||||||
|
if os.path.exists(os.path.join(dir, lib_name))
|
||||||
|
]
|
||||||
|
if not locs:
|
||||||
|
raise ValueError(f"Cannot find {lib_name} in the system.")
|
||||||
|
return locs[0]
|
||||||
|
|
||||||
|
|
||||||
def find_nccl_library():
|
def find_nccl_library():
|
||||||
so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
|
so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
|
||||||
|
|
||||||
@ -572,9 +600,9 @@ def find_nccl_library():
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if torch.version.cuda is not None:
|
if torch.version.cuda is not None:
|
||||||
so_file = vllm_nccl_path or "libnccl.so.2"
|
so_file = vllm_nccl_path or find_library("libnccl.so.2")
|
||||||
elif torch.version.hip is not None:
|
elif torch.version.hip is not None:
|
||||||
so_file = "librccl.so.1"
|
so_file = find_library("librccl.so.1")
|
||||||
else:
|
else:
|
||||||
raise ValueError("NCCL only supports CUDA and ROCm backends.")
|
raise ValueError("NCCL only supports CUDA and ROCm backends.")
|
||||||
logger.info(f"Found nccl from library {so_file}")
|
logger.info(f"Found nccl from library {so_file}")
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user