diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index b83aec7..e295743 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -88,8 +88,12 @@ jobs:
       - name: Free up disk space
         if: ${{ runner.os == 'Linux' }}
+        # https://github.com/easimon/maximize-build-space/blob/master/action.yml
+        # https://github.com/easimon/maximize-build-space/tree/test-report
        run: |
           sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
 
       - name: Install CUDA ${{ matrix.cuda-version }}
         if: ${{ matrix.cuda-version != 'cpu' }}
 
@@ -137,6 +141,10 @@ jobs:
           pip install ninja packaging wheel
           export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
           export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+          # Currently for this setting the runner goes OOM if we pass --threads 4 to nvcc
+          if [[ ${MATRIX_CUDA_VERSION} == "12.1" && ${MATRIX_TORCH_VERSION} == "2.1" ]]; then
+            export FLASH_ATTENTION_FORCE_SINGLE_THREAD="TRUE"
+          fi
           # Limit MAX_JOBS otherwise the github runner goes OOM
           MAX_JOBS=1 FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
           tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
diff --git a/flash_attn/__init__.py b/flash_attn/__init__.py
index 9f10f0b..8472f0b 100644
--- a/flash_attn/__init__.py
+++ b/flash_attn/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.0.6"
+__version__ = "2.0.6.post1"
 
 from flash_attn.flash_attn_interface import flash_attn_func
 from flash_attn.flash_attn_interface import flash_attn_kvpacked_func
diff --git a/setup.py b/setup.py
index c63586a..6ee03f0 100644
--- a/setup.py
+++ b/setup.py
@@ -36,6 +36,8 @@ FORCE_BUILD = os.getenv("FLASH_ATTENTION_FORCE_BUILD", "FALSE") == "TRUE"
 SKIP_CUDA_BUILD = os.getenv("FLASH_ATTENTION_SKIP_CUDA_BUILD", "FALSE") == "TRUE"
 # For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI
 FORCE_CXX11_ABI = os.getenv("FLASH_ATTENTION_FORCE_CXX11_ABI", "FALSE") == "TRUE"
+# For CI, we want the option to not add "--threads 4" to nvcc, since the runner can OOM
+FORCE_SINGLE_THREAD = os.getenv("FLASH_ATTENTION_FORCE_SINGLE_THREAD", "FALSE") == "TRUE"
 
 
 def get_platform():
@@ -91,8 +93,7 @@ def raise_if_cuda_home_none(global_option: str) -> None:
 
 
 def append_nvcc_threads(nvcc_extra_args):
-    _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
-    if bare_metal_version >= Version("11.2"):
+    if not FORCE_SINGLE_THREAD:
         return nvcc_extra_args + ["--threads", "4"]
     return nvcc_extra_args
 