[misc][distributed] improve libcudart.so finding (#7127)

This commit is contained in:
youkaichao 2024-08-04 11:31:51 -07:00 committed by GitHub
parent b1c9aa3daa
commit 16a1cc9bb2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 25 additions and 23 deletions

View File

@ -4,9 +4,6 @@ convenient for use when we just need to call a few functions.
""" """
import ctypes import ctypes
import glob
import os
import sys
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
@ -36,24 +33,25 @@ class Function:
argtypes: List[Any] argtypes: List[Any]
def get_pytorch_default_cudart_library_path() -> str: def find_loaded_library(lib_name) -> Optional[str]:
# code borrowed from https://github.com/pytorch/pytorch/blob/1cae60a87e5bdda8bcf55724a862eeed98a9747e/torch/__init__.py#L284 # noqa """
lib_folder = "cuda_runtime" According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
lib_name = "libcudart.so.*[0-9]" the file `/proc/self/maps` contains the memory maps of the process, which includes the
lib_path = None shared libraries loaded by the process. We can use this file to find the path of
for path in sys.path: a loaded library.
nvidia_path = os.path.join(path, "nvidia") """ # noqa
if not os.path.exists(nvidia_path): found = False
continue with open("/proc/self/maps") as f:
candidate_lib_paths = glob.glob( for line in f:
os.path.join(nvidia_path, lib_folder, "lib", lib_name)) if lib_name in line:
if candidate_lib_paths and not lib_path: found = True
lib_path = candidate_lib_paths[0] break
if lib_path: if not found:
break # the library is not loaded in the current process
if not lib_path: return None
raise ValueError(f"{lib_name} not found in the system path {sys.path}") start = line.index("/")
return lib_path path = line[start:].strip()
return path
class CudaRTLibrary: class CudaRTLibrary:
@ -100,7 +98,9 @@ class CudaRTLibrary:
def __init__(self, so_file: Optional[str] = None): def __init__(self, so_file: Optional[str] = None):
if so_file is None: if so_file is None:
so_file = get_pytorch_default_cudart_library_path() so_file = find_loaded_library("libcudart.so")
assert so_file is not None, \
"libcudart.so is not loaded in the current process"
if so_file not in CudaRTLibrary.path_to_library_cache: if so_file not in CudaRTLibrary.path_to_library_cache:
lib = ctypes.CDLL(so_file) lib = ctypes.CDLL(so_file)
CudaRTLibrary.path_to_library_cache[so_file] = lib CudaRTLibrary.path_to_library_cache[so_file] = lib

View File

@ -145,6 +145,7 @@ def can_actually_p2p(
p_tgt.start() p_tgt.start()
p_src.join() p_src.join()
p_tgt.join() p_tgt.join()
assert p_src.exitcode == 0 and p_tgt.exitcode == 0
result: List[bool] = [] result: List[bool] = []
for src, tgt in zip(batch_src, batch_tgt): for src, tgt in zip(batch_src, batch_tgt):
a = result_queue.get() a = result_queue.get()
@ -221,7 +222,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
# wrap raised exception to provide more information # wrap raised exception to provide more information
raise RuntimeError( raise RuntimeError(
f"Error happened when batch testing " f"Error happened when batch testing "
f"peer-to-peer access from {batch_src} to {batch_tgt}") from e f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
f"{returned.stderr.decode()}") from e
result = pickle.loads(returned.stdout) result = pickle.loads(returned.stdout)
for _i, _j, r in zip(batch_src, batch_tgt, result): for _i, _j, r in zip(batch_src, batch_tgt, result):
cache[f"{_i}->{_j}"] = r cache[f"{_i}->{_j}"] = r