[Core] nccl integrity check and test (#4155)

[Core] Add integrity check during initialization; add test for it (#4155)
2024-04-17 22:28:52 -07:00 · 2024-04-17 22:28:52 -07:00 · 6dc1fc9cfe
commit 6dc1fc9cfe
parent 533d2a1f39
4 changed files with 107 additions and 26 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -33,6 +33,7 @@ steps:
  num_gpus: 2 # only support 1 or 2 for now.
  commands:
  - pytest -v -s test_pynccl.py
  - pytest -v -s test_pynccl_library.py
  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
--- a/tests/distributed/test_pynccl_library.py
+++ b/tests/distributed/test_pynccl_library.py
@ -0,0 +1,43 @@
 import multiprocessing
 import tempfile
 def target_fn(env, filepath):
    from vllm.utils import update_environment_variables
    update_environment_variables(env)
    from vllm.utils import nccl_integrity_check
    nccl_integrity_check(filepath)
 def test_library_file():
    # note: don't import vllm.distributed.device_communicators.pynccl
    # before running this test, otherwise the library file will be loaded
    # and it might interfere with the test
    from vllm.utils import find_nccl_library
    so_file = find_nccl_library()
    with open(so_file, 'rb') as f:
        content = f.read()
    try:
        # corrupt the library file, should raise an exception
        with open(so_file, 'wb') as f:
            f.write(content[:len(content) // 2])
        p = multiprocessing.Process(target=target_fn, args=({}, so_file))
        p.start()
        p.join()
        assert p.exitcode != 0
        # move the library file to a tmp path
        # test VLLM_NCCL_SO_PATH
        fd, path = tempfile.mkstemp()
        with open(path, 'wb') as f:
            f.write(content)
        p = multiprocessing.Process(target=target_fn,
                                    args=({
                                        "VLLM_NCCL_SO_PATH": path
                                    }, path))
        p.start()
        p.join()
        assert p.exitcode == 0
    finally:
        with open(so_file, 'wb') as f:
            f.write(content)
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@ -21,8 +21,7 @@
 import ctypes
 import datetime
-import glob
+import platform
 import os
 # ===================== import region =====================
 import torch
@ -30,40 +29,27 @@ import torch.distributed as dist
 from torch.distributed import ReduceOp
 from vllm.logger import init_logger
 from vllm.utils import find_nccl_library, nccl_integrity_check
 logger = init_logger(__name__)
-so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
+so_file = find_nccl_library()
 # check if we have vllm-managed nccl
 vllm_nccl_path = None
 if torch.version.cuda is not None:
    cuda_major = torch.version.cuda.split(".")[0]
    path = os.path.expanduser(
        f"~/.config/vllm/nccl/cu{cuda_major}/libnccl.so.*")
    files = glob.glob(path)
    vllm_nccl_path = files[0] if files else None
 # manually load the nccl library
 if so_file:
    logger.info(
        f"Loading nccl from environment variable VLLM_NCCL_SO_PATH={so_file}")
 else:
    if torch.version.cuda is not None:
        so_file = vllm_nccl_path or "libnccl.so.2"
    elif torch.version.hip is not None:
        so_file = "librccl.so.1"
    else:
        raise ValueError("NCCL only supports CUDA and ROCm backends.")
    logger.info(f"Loading nccl from library {so_file}")
 try:
    # load the library in another process.
    # if it core dumps, it will not crash the current process
    nccl_integrity_check(so_file)
    nccl = ctypes.CDLL(so_file)
 except Exception as e:
    logger.error(
        f"Failed to load NCCL library from {so_file} ."
        "It is expected if you are not running on NVIDIA/AMD GPUs."
-        "Otherwise please set the environment variable VLLM_NCCL_SO_PATH"
+        "Otherwise, the nccl library might not exist, be corrupted "
        f"or it does not support the current platform {platform.platform()}."
        f"One solution is to download libnccl2 version 2.18 from "
        f"https://developer.download.nvidia.com/compute/cuda/repos/ "
        f"and extract the libnccl.so.2 file. If you already have the "
        f"library, please set the environment variable VLLM_NCCL_SO_PATH"
        " to point to the correct nccl library path.")
    raise e
--- a/vllm/utils.py
+++ b/vllm/utils.py
@ -1,6 +1,7 @@
 import asyncio
 import enum
 import gc
 import glob
 import os
 import socket
 import subprocess
@ -517,3 +518,53 @@ def init_cached_hf_modules():
    """
    from transformers.dynamic_module_utils import init_hf_modules
    init_hf_modules()
 def nccl_integrity_check(filepath):
    """
    when the library is corrupted, we cannot catch
    the exception in python. it will crash the process.
    instead, we use the exit code of `ldd` to check
    if the library is corrupted. if not, we will return
    the version of the library.
    """
    exit_code = os.system(f"ldd {filepath} 2>&1 > /dev/null")
    if exit_code != 0:
        raise RuntimeError(f"Failed to load NCCL library from {filepath} .")
    import ctypes
    nccl = ctypes.CDLL(filepath)
    version = ctypes.c_int()
    nccl.ncclGetVersion.restype = ctypes.c_int
    nccl.ncclGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)]
    result = nccl.ncclGetVersion(ctypes.byref(version))
    assert result == 0
    return version.value
 def find_nccl_library():
    so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
    # check if we have vllm-managed nccl
    vllm_nccl_path = None
    if torch.version.cuda is not None:
        cuda_major = torch.version.cuda.split(".")[0]
        path = os.path.expanduser(
            f"~/.config/vllm/nccl/cu{cuda_major}/libnccl.so.*")
        files = glob.glob(path)
        vllm_nccl_path = files[0] if files else None
    # manually load the nccl library
    if so_file:
        logger.info(
            f"Found nccl from environment variable VLLM_NCCL_SO_PATH={so_file}"
        )
    else:
        if torch.version.cuda is not None:
            so_file = vllm_nccl_path or "libnccl.so.2"
        elif torch.version.hip is not None:
            so_file = "librccl.so.1"
        else:
            raise ValueError("NCCL only supports CUDA and ROCm backends.")
        logger.info(f"Found nccl from library {so_file}")
    return so_file