diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index a09de99f..65a36799 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -90,6 +90,6 @@ RUN cd /app \
     && cd ..
 
 RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir ray[all]
+RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3
 
 CMD ["/bin/bash"]
diff --git a/requirements-rocm.txt b/requirements-rocm.txt
index cea1183e..0dc2f0e6 100644
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -5,7 +5,7 @@ starlette
 requests
 py-cpuinfo
 psutil
-ray >= 2.9
+ray == 2.9.3
 sentencepiece  # Required for LLaMA tokenizer.
 numpy
 tokenizers>=0.15.0
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index fcd903dd..8e510f97 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -405,8 +405,8 @@ def _check_use_naive_attention() -> bool:
     if not is_hip():
         return False
     # For ROCm, check whether flash attention is installed or not.
-    has_flash_attn = importlib.util.find_spec("flash_attn") is None
-    if not has_flash_attn:
+    use_naive_attention = importlib.util.find_spec("flash_attn") is None
+    if use_naive_attention:
         logger.warning("flash_attn is not installed. Using naive attention. "
                        "This will take significantly more GPU memory.")
         return True
diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py
index a0c2921d..5d7f2fdc 100644
--- a/vllm/model_executor/parallel_utils/pynccl.py
+++ b/vllm/model_executor/parallel_utils/pynccl.py
@@ -41,7 +41,7 @@ else:
     if torch.version.cuda is not None:
         so_file = "libnccl.so.2"
     elif torch.version.hip is not None:
-        so_file = "librccl.so.2"
+        so_file = "librccl.so.1"
     else:
         raise ValueError("NCCL only supports CUDA and ROCm backends.")
     logger.debug(f"Loading nccl from library {so_file}")
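
Note on the xformers.py change: the old `has_flash_attn` flag was inverted, because `importlib.util.find_spec("flash_attn") is None` is True when the package is absent; the patch renames the flag to `use_naive_attention` so the warning fires when flash_attn is actually missing. A minimal standalone sketch of the corrected check follows (the `logging` setup is illustrative and the ROCm-only `is_hip()` guard from the real function is omitted here):

    import importlib.util
    import logging

    logger = logging.getLogger(__name__)  # illustrative stand-in for vLLM's logger

    def _check_use_naive_attention() -> bool:
        # find_spec() returns None when flash_attn is NOT importable,
        # so a None result means we must fall back to naive attention.
        use_naive_attention = importlib.util.find_spec("flash_attn") is None
        if use_naive_attention:
            logger.warning("flash_attn is not installed. Using naive attention. "
                           "This will take significantly more GPU memory.")
            return True
        return False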