[misc][distributed] improve libcudart.so finding (#7127)
This commit is contained in:
parent
b1c9aa3daa
commit
16a1cc9bb2
@ -4,9 +4,6 @@ convenient for use when we just need to call a few functions.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import ctypes
|
import ctypes
|
||||||
import glob
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
@ -36,24 +33,25 @@ class Function:
|
|||||||
argtypes: List[Any]
|
argtypes: List[Any]
|
||||||
|
|
||||||
|
|
||||||
def get_pytorch_default_cudart_library_path() -> str:
|
def find_loaded_library(lib_name) -> Optional[str]:
|
||||||
# code borrowed from https://github.com/pytorch/pytorch/blob/1cae60a87e5bdda8bcf55724a862eeed98a9747e/torch/__init__.py#L284 # noqa
|
"""
|
||||||
lib_folder = "cuda_runtime"
|
According to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
|
||||||
lib_name = "libcudart.so.*[0-9]"
|
the file `/proc/self/maps` contains the memory maps of the process, which includes the
|
||||||
lib_path = None
|
shared libraries loaded by the process. We can use this file to find the path of the
|
||||||
for path in sys.path:
|
loaded library.
|
||||||
nvidia_path = os.path.join(path, "nvidia")
|
""" # noqa
|
||||||
if not os.path.exists(nvidia_path):
|
found = False
|
||||||
continue
|
with open("/proc/self/maps") as f:
|
||||||
candidate_lib_paths = glob.glob(
|
for line in f:
|
||||||
os.path.join(nvidia_path, lib_folder, "lib", lib_name))
|
if lib_name in line:
|
||||||
if candidate_lib_paths and not lib_path:
|
found = True
|
||||||
lib_path = candidate_lib_paths[0]
|
break
|
||||||
if lib_path:
|
if not found:
|
||||||
break
|
# the library is not loaded in the current process
|
||||||
if not lib_path:
|
return None
|
||||||
raise ValueError(f"{lib_name} not found in the system path {sys.path}")
|
start = line.index("/")
|
||||||
return lib_path
|
path = line[start:].strip()
|
||||||
|
return path
|
||||||
|
|
||||||
|
|
||||||
class CudaRTLibrary:
|
class CudaRTLibrary:
|
||||||
@ -100,7 +98,9 @@ class CudaRTLibrary:
|
|||||||
|
|
||||||
def __init__(self, so_file: Optional[str] = None):
|
def __init__(self, so_file: Optional[str] = None):
|
||||||
if so_file is None:
|
if so_file is None:
|
||||||
so_file = get_pytorch_default_cudart_library_path()
|
so_file = find_loaded_library("libcudart.so")
|
||||||
|
assert so_file is not None, \
|
||||||
|
"libcudart.so is not loaded in the current process"
|
||||||
if so_file not in CudaRTLibrary.path_to_library_cache:
|
if so_file not in CudaRTLibrary.path_to_library_cache:
|
||||||
lib = ctypes.CDLL(so_file)
|
lib = ctypes.CDLL(so_file)
|
||||||
CudaRTLibrary.path_to_library_cache[so_file] = lib
|
CudaRTLibrary.path_to_library_cache[so_file] = lib
|
||||||
|
|||||||
@ -145,6 +145,7 @@ def can_actually_p2p(
|
|||||||
p_tgt.start()
|
p_tgt.start()
|
||||||
p_src.join()
|
p_src.join()
|
||||||
p_tgt.join()
|
p_tgt.join()
|
||||||
|
assert p_src.exitcode == 0 and p_tgt.exitcode == 0
|
||||||
result: List[bool] = []
|
result: List[bool] = []
|
||||||
for src, tgt in zip(batch_src, batch_tgt):
|
for src, tgt in zip(batch_src, batch_tgt):
|
||||||
a = result_queue.get()
|
a = result_queue.get()
|
||||||
@ -221,7 +222,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
|
|||||||
# wrap raised exception to provide more information
|
# wrap raised exception to provide more information
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Error happened when batch testing "
|
f"Error happened when batch testing "
|
||||||
f"peer-to-peer access from {batch_src} to {batch_tgt}") from e
|
f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
|
||||||
|
f"{returned.stderr.decode()}") from e
|
||||||
result = pickle.loads(returned.stdout)
|
result = pickle.loads(returned.stdout)
|
||||||
for _i, _j, r in zip(batch_src, batch_tgt, result):
|
for _i, _j, r in zip(batch_src, batch_tgt, result):
|
||||||
cache[f"{_i}->{_j}"] = r
|
cache[f"{_i}->{_j}"] = r
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user