From add4f0bc42e7d85c23ed20a64453f918f232039d Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Tue, 30 May 2023 15:53:18 -0700
Subject: [PATCH 01/25] Scaffolding for wheel prototype

---
 .github/workflows/publish.yml | 71 ++++++++++++++++++++++++++---------
 setup.py                      | 52 ++++++++++++++++++++++++-
 2 files changed, 105 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 72df605..a9bd229 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -10,7 +10,7 @@ on:
       - '**'
 
 jobs:
-  release:
+  setup_release:
     name: Create Release
     runs-on: ubuntu-latest
     steps:
@@ -27,23 +27,27 @@ jobs:
         with:
           tag_name: ${{ steps.extract_branch.outputs.branch }}
           release_name: ${{ steps.extract_branch.outputs.branch }}
-      
-  wheel:
+
+  build_wheels:
     name: Build Wheel
     runs-on: ${{ matrix.os }}
-    needs: release
-    
+    needs: setup_release
+
     strategy:
       fail-fast: false
       matrix:
-          # os: [ubuntu-20.04]
-          os: [ubuntu-18.04]
-          python-version: ['3.7', '3.8', '3.9', '3.10']
-          torch-version: [1.11.0, 1.12.0, 1.12.1]
-          cuda-version: ['113', '116']
-          exclude:
-            - torch-version: 1.11.0
-              cuda-version: '116'
+          # TODO: @pierce - again, simplify for prototyping
+          os: [ubuntu-20.04]
+          #os: [ubuntu-20.04, ubuntu-22.04]
+          # python-version: ['3.7', '3.8', '3.9', '3.10']
+          python-version: ['3.10']
+          #torch-version: [1.11.0, 1.12.0, 1.12.1]
+          torch-version: [1.12.1]
+          #cuda-version: ['113', '116']
+          cuda-version: ['113']
+          #exclude:
+          #  - torch-version: 1.11.0
+          #    cuda-version: '116'
 
     steps:
       - name: Checkout
@@ -108,13 +112,13 @@ jobs:
           export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
           export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
           export CUDA_INSTALL_DIR=/usr/local/cuda-11.3$CUDA_INSTALL_DIR
-          pip install wheel
+          pip install ninja packaging setuptools wheel
           python setup.py bdist_wheel --dist-dir=dist
           tmpname=cu${{ matrix.cuda-version }}torch${{ matrix.torch-version }}
           wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
-          ls dist/*whl |xargs -I {} mv {} ${wheel_name}
+          ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
           echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
-      
+
       - name: Upload Release Asset
         id: upload_release_asset 
         uses: actions/upload-release-asset@v1
@@ -124,4 +128,37 @@ jobs:
           upload_url: ${{ steps.get_current_release.outputs.upload_url }}
           asset_path: ./${{env.wheel_name}}
           asset_name: ${{env.wheel_name}}
-          asset_content_type: application/*
\ No newline at end of file
+          asset_content_type: application/*
+
+  publish_package:
+    name: Publish package
+    needs: [build_wheels]
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+
+      - name: List contents
+        run: |
+          ls -la dist
+          ls -la dist/*
+
+      - name: Install dependencies
+        run: |
+          pip install ninja packaging setuptools wheel twine
+
+      - name: Build core package
+        run: |
+          python setup.py sdist --dist-dir=dist
+
+      - name: Deploy
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
+        run: |
+          python -m twine upload dist/*
diff --git a/setup.py b/setup.py
index 7597ea3..a5b63b1 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@ from packaging.version import parse, Version
 from setuptools import setup, find_packages
 import subprocess
 
+import urllib
 import torch
 from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME
 
@@ -22,6 +23,50 @@ with open("README.md", "r", encoding="utf-8") as fh:
 this_dir = os.path.dirname(os.path.abspath(__file__))
 
 
+def get_platform():
+    """
+    Returns the platform string.
+    """
+    if sys.platform.startswith('linux'):
+        return 'linux_x86_64'
+    elif sys.platform == 'darwin':
+        return 'macosx_10_9_x86_64'
+    elif sys.platform == 'win32':
+        return 'win_amd64'
+    else:
+        raise ValueError('Unsupported platform: {}'.format(sys.platform))
+
+from setuptools.command.install import install
+
+# @pierce - TODO: Remove for proper release
+BASE_WHEEL_URL = "https://github.com/piercefreeman/flash-attention/releases/download/{tag_name}/{wheel_name}"
+
+class CustomInstallCommand(install):
+    def run(self):
+        # Determine the version numbers that will be used to determine the correct wheel
+        _, cuda_version = get_cuda_bare_metal_version()
+        torch_version = torch.__version__
+        python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
+        platform_name = get_platform()
+        flash_version = get_package_version()
+
+        # Determine wheel URL based on CUDA version, torch version, python version and OS
+        wheel_filename = f'flash_attn-{flash_version}+cu{cuda_version}torch{torch_version}-{python_version}-{python_version}-{platform_name}.whl'
+        wheel_url = BASE_WHEEL_URL.format(
+            tag_name=f"v{flash_version}",
+            wheel_name=wheel_filename
+        )
+        
+        try:
+            urllib.request.urlretrieve(wheel_url, wheel_filename)
+            os.system(f'pip install {wheel_filename}')
+            os.remove(wheel_filename)
+        except urllib.error.HTTPError:
+            print("Precompiled wheel not found. Building from source...")
+            # If the wheel could not be downloaded, build from source
+            install.run(self)
+
+
 def get_cuda_bare_metal_version(cuda_dir):
     raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
     output = raw_output.split()
@@ -190,7 +235,12 @@ setup(
         "Operating System :: Unix",
     ],
     ext_modules=ext_modules,
-    cmdclass={"build_ext": BuildExtension} if ext_modules else {},
+    cmdclass={
+        'install': CustomInstallCommand,
+        "build_ext": BuildExtension
+    } if ext_modules else {
+        'install': CustomInstallCommand,
+    },
     python_requires=">=3.7",
     install_requires=[
         "torch",

From e1faefce9de958fa64747edef823a0779392b027 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Fri, 2 Jun 2023 13:20:39 -0700
Subject: [PATCH 02/25] Raise cuda error on build

---
 .github/workflows/publish.yml |  2 +-
 setup.py                      | 12 +++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index a9bd229..a0244f8 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -126,7 +126,7 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
           upload_url: ${{ steps.get_current_release.outputs.upload_url }}
-          asset_path: ./${{env.wheel_name}}
+          asset_path: ./dist/${{env.wheel_name}}
           asset_name: ${{env.wheel_name}}
           asset_content_type: application/*
 
diff --git a/setup.py b/setup.py
index a5b63b1..91a37ce 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,8 @@ from packaging.version import parse, Version
 from setuptools import setup, find_packages
 import subprocess
 
-import urllib
+import urllib.request
+import urllib.error
 import torch
 from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME
 
@@ -43,8 +44,10 @@ BASE_WHEEL_URL = "https://github.com/piercefreeman/flash-attention/releases/down
 
 class CustomInstallCommand(install):
     def run(self):
+        raise_if_cuda_home_none("flash_attn")
+
         # Determine the version numbers that will be used to determine the correct wheel
-        _, cuda_version = get_cuda_bare_metal_version()
+        _, cuda_version = get_cuda_bare_metal_version(CUDA_HOME)
         torch_version = torch.__version__
         python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
         platform_name = get_platform()
@@ -64,7 +67,10 @@ class CustomInstallCommand(install):
         except urllib.error.HTTPError:
             print("Precompiled wheel not found. Building from source...")
             # If the wheel could not be downloaded, build from source
-            install.run(self)
+            #install.run(self)
+            raise ValueError
+
+        raise ValueError
 
 
 def get_cuda_bare_metal_version(cuda_dir):

From 0e7769c813fcd2b04882a9cd7e13945002a903d3 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Fri, 2 Jun 2023 14:41:07 -0700
Subject: [PATCH 03/25] Guessing wheel URL

---
 setup.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 91a37ce..7581d74 100644
--- a/setup.py
+++ b/setup.py
@@ -47,18 +47,22 @@ class CustomInstallCommand(install):
         raise_if_cuda_home_none("flash_attn")
 
         # Determine the version numbers that will be used to determine the correct wheel
-        _, cuda_version = get_cuda_bare_metal_version(CUDA_HOME)
+        _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME)
         torch_version = torch.__version__
         python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
         platform_name = get_platform()
         flash_version = get_package_version()
+        cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}"
 
         # Determine wheel URL based on CUDA version, torch version, python version and OS
         wheel_filename = f'flash_attn-{flash_version}+cu{cuda_version}torch{torch_version}-{python_version}-{python_version}-{platform_name}.whl'
         wheel_url = BASE_WHEEL_URL.format(
-            tag_name=f"v{flash_version}",
+            #tag_name=f"v{flash_version}",
+            # HACK
+            tag_name=f"v0.0.3",
             wheel_name=wheel_filename
         )
+        print("Guessing wheel URL: ", wheel_url)
         
         try:
             urllib.request.urlretrieve(wheel_url, wheel_filename)
@@ -70,8 +74,6 @@ class CustomInstallCommand(install):
             #install.run(self)
             raise ValueError
 
-        raise ValueError
-
 
 def get_cuda_bare_metal_version(cuda_dir):
     raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)

From dab99053e46c32f394fee40c6d8627f302566b9f Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Fri, 2 Jun 2023 14:52:31 -0700
Subject: [PATCH 04/25] Bump build to use CUDA 11.6 (cu116) for testing

---
 .github/workflows/publish.yml | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index a0244f8..4f62194 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -44,7 +44,8 @@ jobs:
           #torch-version: [1.11.0, 1.12.0, 1.12.1]
           torch-version: [1.12.1]
           #cuda-version: ['113', '116']
-          cuda-version: ['113']
+          #cuda-version: ['113']
+          cuda-version: ['116']
           #exclude:
           #  - torch-version: 1.11.0
           #    cuda-version: '116'
@@ -143,11 +144,6 @@ jobs:
         with:
           python-version: '3.10'
 
-      - name: List contents
-        run: |
-          ls -la dist
-          ls -la dist/*
-
       - name: Install dependencies
         run: |
           pip install ninja packaging setuptools wheel twine

From 5e4699782a8734f871bee1f628b55d25c05a46a5 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Fri, 2 Jun 2023 15:58:36 -0700
Subject: [PATCH 05/25] Allow fallback install

---
 .github/workflows/publish.yml | 1 +
 setup.py                      | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 4f62194..dad5d7d 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -109,6 +109,7 @@ jobs:
 
       - name: Build wheel
         run: |
+          export FLASH_ATTENTION_FORCE_BUILD="TRUE"
           export FORCE_CUDA="1"
           export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
           export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
diff --git a/setup.py b/setup.py
index 7581d74..e0fcddd 100644
--- a/setup.py
+++ b/setup.py
@@ -44,6 +44,9 @@ BASE_WHEEL_URL = "https://github.com/piercefreeman/flash-attention/releases/down
 
 class CustomInstallCommand(install):
     def run(self):
+        if os.getenv("FLASH_ATTENTION_FORCE_BUILD", "FALSE") == "TRUE":
+            return install.run(self)
+
         raise_if_cuda_home_none("flash_attn")
 
         # Determine the version numbers that will be used to determine the correct wheel
@@ -59,7 +62,7 @@ class CustomInstallCommand(install):
         wheel_url = BASE_WHEEL_URL.format(
             #tag_name=f"v{flash_version}",
             # HACK
-            tag_name=f"v0.0.3",
+            tag_name=f"v0.0.5",
             wheel_name=wheel_filename
         )
         print("Guessing wheel URL: ", wheel_url)

From 9fc9820a5bf0eb851b79388908f43a70affbe296 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Fri, 2 Jun 2023 18:02:24 -0700
Subject: [PATCH 06/25] Strip cuda name from torch version

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index e0fcddd..ff718ef 100644
--- a/setup.py
+++ b/setup.py
@@ -51,11 +51,12 @@ class CustomInstallCommand(install):
 
         # Determine the version numbers that will be used to determine the correct wheel
         _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME)
-        torch_version = torch.__version__
+        torch_version_raw = parse(torch.__version__)
         python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
         platform_name = get_platform()
         flash_version = get_package_version()
         cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}"
+        torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}.{torch_version_raw.micro}"
 
         # Determine wheel URL based on CUDA version, torch version, python version and OS
         wheel_filename = f'flash_attn-{flash_version}+cu{cuda_version}torch{torch_version}-{python_version}-{python_version}-{platform_name}.whl'

From ea2ed8862341767d1bb7d82bff3cbd27c9740784 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Fri, 2 Jun 2023 18:22:44 -0700
Subject: [PATCH 07/25] Refactor and clean up setup.py

---
 .github/workflows/publish.yml |   2 +
 setup.py                      | 242 ++++++++++++++++++----------------
 2 files changed, 131 insertions(+), 113 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index dad5d7d..3e74449 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -150,6 +150,8 @@ jobs:
           pip install ninja packaging setuptools wheel twine
 
       - name: Build core package
+        env:
+          FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
         run: |
           python setup.py sdist --dist-dir=dist
 
diff --git a/setup.py b/setup.py
index ff718ef..cf8a7ef 100644
--- a/setup.py
+++ b/setup.py
@@ -6,8 +6,10 @@ import re
 import ast
 from pathlib import Path
 from packaging.version import parse, Version
+import platform
 
 from setuptools import setup, find_packages
+from setuptools.command.install import install
 import subprocess
 
 import urllib.request
@@ -24,60 +26,29 @@ with open("README.md", "r", encoding="utf-8") as fh:
 this_dir = os.path.dirname(os.path.abspath(__file__))
 
 
+# @pierce - TODO: Update for proper release
+BASE_WHEEL_URL = "https://github.com/piercefreeman/flash-attention/releases/download/{tag_name}/{wheel_name}"
+
+# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels
+# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation
+FORCE_BUILD = os.getenv("FLASH_ATTENTION_FORCE_BUILD", "FALSE") == "TRUE"
+SKIP_CUDA_BUILD = os.getenv("FLASH_ATTENTION_SKIP_CUDA_BUILD", "FALSE") == "TRUE"
+
+
 def get_platform():
     """
-    Returns the platform string.
+    Returns the platform name as used in wheel filenames.
     """
     if sys.platform.startswith('linux'):
         return 'linux_x86_64'
     elif sys.platform == 'darwin':
-        return 'macosx_10_9_x86_64'
+        mac_version = '.'.join(platform.mac_ver()[0].split('.')[:2])
+        return f'macosx_{mac_version}_x86_64'
     elif sys.platform == 'win32':
         return 'win_amd64'
     else:
         raise ValueError('Unsupported platform: {}'.format(sys.platform))
 
-from setuptools.command.install import install
-
-# @pierce - TODO: Remove for proper release
-BASE_WHEEL_URL = "https://github.com/piercefreeman/flash-attention/releases/download/{tag_name}/{wheel_name}"
-
-class CustomInstallCommand(install):
-    def run(self):
-        if os.getenv("FLASH_ATTENTION_FORCE_BUILD", "FALSE") == "TRUE":
-            return install.run(self)
-
-        raise_if_cuda_home_none("flash_attn")
-
-        # Determine the version numbers that will be used to determine the correct wheel
-        _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME)
-        torch_version_raw = parse(torch.__version__)
-        python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
-        platform_name = get_platform()
-        flash_version = get_package_version()
-        cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}"
-        torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}.{torch_version_raw.micro}"
-
-        # Determine wheel URL based on CUDA version, torch version, python version and OS
-        wheel_filename = f'flash_attn-{flash_version}+cu{cuda_version}torch{torch_version}-{python_version}-{python_version}-{platform_name}.whl'
-        wheel_url = BASE_WHEEL_URL.format(
-            #tag_name=f"v{flash_version}",
-            # HACK
-            tag_name=f"v0.0.5",
-            wheel_name=wheel_filename
-        )
-        print("Guessing wheel URL: ", wheel_url)
-        
-        try:
-            urllib.request.urlretrieve(wheel_url, wheel_filename)
-            os.system(f'pip install {wheel_filename}')
-            os.remove(wheel_filename)
-        except urllib.error.HTTPError:
-            print("Precompiled wheel not found. Building from source...")
-            # If the wheel could not be downloaded, build from source
-            #install.run(self)
-            raise ValueError
-
 
 def get_cuda_bare_metal_version(cuda_dir):
     raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
@@ -147,77 +118,77 @@ if not torch.cuda.is_available():
         else:
             os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
 
-
-print("\n\ntorch.__version__  = {}\n\n".format(torch.__version__))
-TORCH_MAJOR = int(torch.__version__.split(".")[0])
-TORCH_MINOR = int(torch.__version__.split(".")[1])
-
 cmdclass = {}
 ext_modules = []
 
-# Check, if ATen/CUDAGeneratorImpl.h is found, otherwise use ATen/cuda/CUDAGeneratorImpl.h
-# See https://github.com/pytorch/pytorch/pull/70650
-generator_flag = []
-torch_dir = torch.__path__[0]
-if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")):
-    generator_flag = ["-DOLD_GENERATOR_PATH"]
+if not SKIP_CUDA_BUILD:
+    print("\n\ntorch.__version__  = {}\n\n".format(torch.__version__))
+    TORCH_MAJOR = int(torch.__version__.split(".")[0])
+    TORCH_MINOR = int(torch.__version__.split(".")[1])
 
-raise_if_cuda_home_none("flash_attn")
-# Check, if CUDA11 is installed for compute capability 8.0
-cc_flag = []
-_, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
-if bare_metal_version < Version("11.0"):
-    raise RuntimeError("FlashAttention is only supported on CUDA 11 and above")
-cc_flag.append("-gencode")
-cc_flag.append("arch=compute_75,code=sm_75")
-cc_flag.append("-gencode")
-cc_flag.append("arch=compute_80,code=sm_80")
-if bare_metal_version >= Version("11.8"):
+    # Check, if ATen/CUDAGeneratorImpl.h is found, otherwise use ATen/cuda/CUDAGeneratorImpl.h
+    # See https://github.com/pytorch/pytorch/pull/70650
+    generator_flag = []
+    torch_dir = torch.__path__[0]
+    if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")):
+        generator_flag = ["-DOLD_GENERATOR_PATH"]
+
+    raise_if_cuda_home_none("flash_attn")
+    # Check, if CUDA11 is installed for compute capability 8.0
+    cc_flag = []
+    _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
+    if bare_metal_version < Version("11.0"):
+        raise RuntimeError("FlashAttention is only supported on CUDA 11 and above")
     cc_flag.append("-gencode")
-    cc_flag.append("arch=compute_90,code=sm_90")
+    cc_flag.append("arch=compute_75,code=sm_75")
+    cc_flag.append("-gencode")
+    cc_flag.append("arch=compute_80,code=sm_80")
+    if bare_metal_version >= Version("11.8"):
+        cc_flag.append("-gencode")
+        cc_flag.append("arch=compute_90,code=sm_90")
 
-subprocess.run(["git", "submodule", "update", "--init", "csrc/flash_attn/cutlass"])
-ext_modules.append(
-    CUDAExtension(
-        name="flash_attn_cuda",
-        sources=[
-            "csrc/flash_attn/fmha_api.cpp",
-            "csrc/flash_attn/src/fmha_fwd_hdim32.cu",
-            "csrc/flash_attn/src/fmha_fwd_hdim64.cu",
-            "csrc/flash_attn/src/fmha_fwd_hdim128.cu",
-            "csrc/flash_attn/src/fmha_bwd_hdim32.cu",
-            "csrc/flash_attn/src/fmha_bwd_hdim64.cu",
-            "csrc/flash_attn/src/fmha_bwd_hdim128.cu",
-            "csrc/flash_attn/src/fmha_block_fprop_fp16_kernel.sm80.cu",
-            "csrc/flash_attn/src/fmha_block_dgrad_fp16_kernel_loop.sm80.cu",
-        ],
-        extra_compile_args={
-            "cxx": ["-O3", "-std=c++17"] + generator_flag,
-            "nvcc": append_nvcc_threads(
-                [
-                    "-O3",
-                    "-std=c++17",
-                    "-U__CUDA_NO_HALF_OPERATORS__",
-                    "-U__CUDA_NO_HALF_CONVERSIONS__",
-                    "-U__CUDA_NO_HALF2_OPERATORS__",
-                    "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
-                    "--expt-relaxed-constexpr",
-                    "--expt-extended-lambda",
-                    "--use_fast_math",
-                    "--ptxas-options=-v",
-                    "-lineinfo"
-                ]
-                + generator_flag
-                + cc_flag
-            ),
-        },
-        include_dirs=[
-            Path(this_dir) / 'csrc' / 'flash_attn',
-            Path(this_dir) / 'csrc' / 'flash_attn' / 'src',
-            Path(this_dir) / 'csrc' / 'flash_attn' / 'cutlass' / 'include',
-        ],
+    subprocess.run(["git", "submodule", "update", "--init", "csrc/flash_attn/cutlass"])
+    ext_modules.append(
+        CUDAExtension(
+            name="flash_attn_cuda",
+            sources=[
+                "csrc/flash_attn/fmha_api.cpp",
+                "csrc/flash_attn/src/fmha_fwd_hdim32.cu",
+                "csrc/flash_attn/src/fmha_fwd_hdim64.cu",
+                "csrc/flash_attn/src/fmha_fwd_hdim128.cu",
+                "csrc/flash_attn/src/fmha_bwd_hdim32.cu",
+                "csrc/flash_attn/src/fmha_bwd_hdim64.cu",
+                "csrc/flash_attn/src/fmha_bwd_hdim128.cu",
+                "csrc/flash_attn/src/fmha_block_fprop_fp16_kernel.sm80.cu",
+                "csrc/flash_attn/src/fmha_block_dgrad_fp16_kernel_loop.sm80.cu",
+            ],
+            extra_compile_args={
+                "cxx": ["-O3", "-std=c++17"] + generator_flag,
+                "nvcc": append_nvcc_threads(
+                    [
+                        "-O3",
+                        "-std=c++17",
+                        "-U__CUDA_NO_HALF_OPERATORS__",
+                        "-U__CUDA_NO_HALF_CONVERSIONS__",
+                        "-U__CUDA_NO_HALF2_OPERATORS__",
+                        "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
+                        "--expt-relaxed-constexpr",
+                        "--expt-extended-lambda",
+                        "--use_fast_math",
+                        "--ptxas-options=-v",
+                        "-lineinfo"
+                    ]
+                    + generator_flag
+                    + cc_flag
+                ),
+            },
+            include_dirs=[
+                Path(this_dir) / 'csrc' / 'flash_attn',
+                Path(this_dir) / 'csrc' / 'flash_attn' / 'src',
+                Path(this_dir) / 'csrc' / 'flash_attn' / 'cutlass' / 'include',
+            ],
+        )
     )
-)
 
 def get_package_version():
     with open(Path(this_dir) / "flash_attn" / "__init__.py", "r") as f:
@@ -229,18 +200,63 @@ def get_package_version():
     else:
         return str(public_version)
 
+
+class CachedWheelsCommand(install):
+    """
+    Installer hook to scan for existing wheels that match the current platform environment.
+    Falls back to building from source if no wheel is found.
+
+    """
+    def run(self):
+        if FORCE_BUILD:
+            return install.run(self)
+
+        raise_if_cuda_home_none("flash_attn")
+
+        # Determine the version numbers that will be used to determine the correct wheel
+        _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME)
+        torch_version_raw = parse(torch.__version__)
+        python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
+        platform_name = get_platform()
+        flash_version = get_package_version()
+        cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}"
+        torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}.{torch_version_raw.micro}"
+
+        # Determine wheel URL based on CUDA version, torch version, python version and OS
+        wheel_filename = f'flash_attn-{flash_version}+cu{cuda_version}torch{torch_version}-{python_version}-{python_version}-{platform_name}.whl'
+        wheel_url = BASE_WHEEL_URL.format(
+            tag_name=f"v{flash_version}",
+            wheel_name=wheel_filename
+        )
+        print("Guessing wheel URL: ", wheel_url)
+        
+        try:
+            urllib.request.urlretrieve(wheel_url, wheel_filename)
+            os.system(f'pip install {wheel_filename}')
+            os.remove(wheel_filename)
+        except urllib.error.HTTPError:
+            print("Precompiled wheel not found. Building from source...")
+            # If the wheel could not be downloaded, build from source
+            install.run(self)
+
+
 setup(
-    name="flash_attn",
+    # @pierce - TODO: Revert for official release
+    name="flash_attn_wheels",
     version=get_package_version(),
     packages=find_packages(
         exclude=("build", "csrc", "include", "tests", "dist", "docs", "benchmarks", "flash_attn.egg-info",)
     ),
-    author="Tri Dao",
-    author_email="trid@stanford.edu",
+    #author="Tri Dao",
+    #author_email="trid@stanford.edu",
+    # @pierce - TODO: Revert for official release
+    author="Pierce Freeman",
+    author_email="pierce@freeman.vc",
     description="Flash Attention: Fast and Memory-Efficient Exact Attention",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    url="https://github.com/HazyResearch/flash-attention",
+    #url="https://github.com/HazyResearch/flash-attention",
+    url="https://github.com/piercefreeman/flash-attention",
     classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: BSD License",
@@ -248,10 +264,10 @@ setup(
     ],
     ext_modules=ext_modules,
     cmdclass={
-        'install': CustomInstallCommand,
+        'install': CachedWheelsCommand,
         "build_ext": BuildExtension
     } if ext_modules else {
-        'install': CustomInstallCommand,
+        'install': CachedWheelsCommand,
     },
     python_requires=">=3.7",
     install_requires=[

From cd0c169eeef47eba8d67c0717bec19f6484739b0 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Fri, 2 Jun 2023 18:28:00 -0700
Subject: [PATCH 08/25] Restore full build matrix

---
 .github/workflows/publish.yml | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 3e74449..f74ef75 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -36,19 +36,13 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-          # TODO: @pierce - again, simplify for prototyping
-          os: [ubuntu-20.04]
-          #os: [ubuntu-20.04, ubuntu-22.04]
-          # python-version: ['3.7', '3.8', '3.9', '3.10']
-          python-version: ['3.10']
-          #torch-version: [1.11.0, 1.12.0, 1.12.1]
-          torch-version: [1.12.1]
-          #cuda-version: ['113', '116']
-          #cuda-version: ['113']
-          cuda-version: ['116']
-          #exclude:
-          #  - torch-version: 1.11.0
-          #    cuda-version: '116'
+          os: [ubuntu-20.04, ubuntu-22.04]
+          python-version: ['3.7', '3.8', '3.9', '3.10']
+          torch-version: [1.11.0, 1.12.0, 1.12.1]
+          cuda-version: ['113', '116']
+          exclude:
+            - torch-version: 1.11.0
+              cuda-version: '116'
 
     steps:
       - name: Checkout

From a682252be78e09f55925e36775ff5818a26b5172 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Fri, 2 Jun 2023 18:47:25 -0700
Subject: [PATCH 09/25] Derive CUDA repo OS name from runner OS version

---
 .github/workflows/cuda/cu102-Linux.sh |  4 +++-
 .github/workflows/cuda/cu113-Linux.sh |  8 +++++++-
 .github/workflows/cuda/cu116-Linux.sh |  4 +++-
 .github/workflows/publish.yml         | 17 +++++++++++------
 4 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/cuda/cu102-Linux.sh b/.github/workflows/cuda/cu102-Linux.sh
index 46fb053..82729ad 100644
--- a/.github/workflows/cuda/cu102-Linux.sh
+++ b/.github/workflows/cuda/cu102-Linux.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 
-OS=ubuntu1804
+# Strip the periods from the version number
+OS_VERSION=$(echo $OS_VERSION | tr -d .)
+OS=ubuntu${OS_VERSION}
 
 wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
 sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
diff --git a/.github/workflows/cuda/cu113-Linux.sh b/.github/workflows/cuda/cu113-Linux.sh
index b89a7fb..65e6d39 100644
--- a/.github/workflows/cuda/cu113-Linux.sh
+++ b/.github/workflows/cuda/cu113-Linux.sh
@@ -1,11 +1,17 @@
 #!/bin/bash
 
-OS=ubuntu1804
+# Strip the periods from the version number
+OS_VERSION=$(echo $OS_VERSION | tr -d .)
+OS=ubuntu${OS_VERSION}
 
 wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
 sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
 wget -nv https://developer.download.nvidia.com/compute/cuda/11.3.0/local_installers/cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb
 sudo dpkg -i cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb
+
+# TODO: If on version < 22.04, install via signal-desktop-keyring
+# For future versions it's deprecated and should be moved into the trusted folder
+# sudo mv /var/cuda-repo-${OS}-11-3-local/7fa2af80.pub /etc/apt/trusted.gpg.d/
 sudo apt-key add /var/cuda-repo-${OS}-11-3-local/7fa2af80.pub
 
 sudo apt-get -qq update
diff --git a/.github/workflows/cuda/cu116-Linux.sh b/.github/workflows/cuda/cu116-Linux.sh
index e3e4e2a..c49f604 100644
--- a/.github/workflows/cuda/cu116-Linux.sh
+++ b/.github/workflows/cuda/cu116-Linux.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 
-OS=ubuntu1804
+# Strip the periods from the version number
+OS_VERSION=$(echo $OS_VERSION | tr -d .)
+OS=ubuntu${OS_VERSION}
 
 wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
 sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index f74ef75..36e990a 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -37,12 +37,15 @@ jobs:
       fail-fast: false
       matrix:
           os: [ubuntu-20.04, ubuntu-22.04]
-          python-version: ['3.7', '3.8', '3.9', '3.10']
-          torch-version: [1.11.0, 1.12.0, 1.12.1]
-          cuda-version: ['113', '116']
-          exclude:
-            - torch-version: 1.11.0
-              cuda-version: '116'
+          #python-version: ['3.7', '3.8', '3.9', '3.10']
+          python-version: ['3.10']
+          torch-version: [1.11.0]
+          cuda-version: ['113']
+          #torch-version: [1.11.0, 1.12.0, 1.12.1]
+          #cuda-version: ['113', '116']
+          #exclude:
+          #  - torch-version: 1.11.0
+          #    cuda-version: '116'
 
     steps:
       - name: Checkout
@@ -65,6 +68,8 @@ jobs:
 
       - name: Install CUDA ${{ matrix.cuda-version }}
         if: ${{ matrix.cuda-version != 'cpu' }}
+        env:
+          OS_VERSION: ${{ runner.release }}
         run: |
           bash .github/workflows/cuda/cu${{ matrix.cuda-version }}-${{ runner.os }}.sh
         shell:

From 2dadfdbbcab2edc6a56b068a8cedc73c8324aacc Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Fri, 2 Jun 2023 18:48:02 -0700
Subject: [PATCH 10/25] Temp disable deploy

---
 .github/workflows/publish.yml | 46 +++++++++++++++++------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 36e990a..c394348 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -131,32 +131,32 @@ jobs:
           asset_name: ${{env.wheel_name}}
           asset_content_type: application/*
 
-  publish_package:
-    name: Publish package
-    needs: [build_wheels]
+  # publish_package:
+  #   name: Publish package
+  #   needs: [build_wheels]
 
-    runs-on: ubuntu-latest
+  #   runs-on: ubuntu-latest
 
-    steps:
-      - uses: actions/checkout@v3
+  #   steps:
+  #     - uses: actions/checkout@v3
 
-      - uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
+  #     - uses: actions/setup-python@v4
+  #       with:
+  #         python-version: '3.10'
 
-      - name: Install dependencies
-        run: |
-          pip install ninja packaging setuptools wheel twine
+  #     - name: Install dependencies
+  #       run: |
+  #         pip install ninja packaging setuptools wheel twine
 
-      - name: Build core package
-        env:
-          FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
-        run: |
-          python setup.py sdist --dist-dir=dist
+  #     - name: Build core package
+  #       env:
+  #         FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
+  #       run: |
+  #         python setup.py sdist --dist-dir=dist
 
-      - name: Deploy
-        env:
-          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
-          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
-        run: |
-          python -m twine upload dist/*
+  #     - name: Deploy
+  #       env:
+  #         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+  #         TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
+  #       run: |
+  #         python -m twine upload dist/*

From 061470ae58220a189272e72995a4a206f7447d39 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Fri, 2 Jun 2023 18:59:09 -0700
Subject: [PATCH 11/25] echo OS version

---
 .github/workflows/cuda/cu102-Linux.sh | 2 ++
 .github/workflows/cuda/cu113-Linux.sh | 2 ++
 .github/workflows/cuda/cu116-Linux.sh | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/.github/workflows/cuda/cu102-Linux.sh b/.github/workflows/cuda/cu102-Linux.sh
index 82729ad..ac38052 100644
--- a/.github/workflows/cuda/cu102-Linux.sh
+++ b/.github/workflows/cuda/cu102-Linux.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+echo $OS_VERSION
+
 # Strip the periods from the version number
 OS_VERSION=$(echo $OS_VERSION | tr -d .)
 OS=ubuntu${OS_VERSION}
diff --git a/.github/workflows/cuda/cu113-Linux.sh b/.github/workflows/cuda/cu113-Linux.sh
index 65e6d39..0518a09 100644
--- a/.github/workflows/cuda/cu113-Linux.sh
+++ b/.github/workflows/cuda/cu113-Linux.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+echo $OS_VERSION
+
 # Strip the periods from the version number
 OS_VERSION=$(echo $OS_VERSION | tr -d .)
 OS=ubuntu${OS_VERSION}
diff --git a/.github/workflows/cuda/cu116-Linux.sh b/.github/workflows/cuda/cu116-Linux.sh
index c49f604..d717ab4 100644
--- a/.github/workflows/cuda/cu116-Linux.sh
+++ b/.github/workflows/cuda/cu116-Linux.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+echo $OS_VERSION
+
 # Strip the periods from the version number
 OS_VERSION=$(echo $OS_VERSION | tr -d .)
 OS=ubuntu${OS_VERSION}

From 18e100d312b9fe04079d993aebb2b68dd145daa3 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Fri, 2 Jun 2023 19:01:44 -0700
Subject: [PATCH 12/25] Release is actually unsupported

---
 .github/workflows/cuda/cu102-Linux.sh | 4 +---
 .github/workflows/cuda/cu113-Linux.sh | 4 +---
 .github/workflows/cuda/cu116-Linux.sh | 4 +---
 .github/workflows/publish.yml         | 2 --
 4 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/cuda/cu102-Linux.sh b/.github/workflows/cuda/cu102-Linux.sh
index ac38052..ada39d5 100644
--- a/.github/workflows/cuda/cu102-Linux.sh
+++ b/.github/workflows/cuda/cu102-Linux.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 
-echo $OS_VERSION
-
 # Strip the periods from the version number
-OS_VERSION=$(echo $OS_VERSION | tr -d .)
+OS_VERSION=$(echo $(lsb_release -sr) | tr -d .)
 OS=ubuntu${OS_VERSION}
 
 wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
diff --git a/.github/workflows/cuda/cu113-Linux.sh b/.github/workflows/cuda/cu113-Linux.sh
index 0518a09..0b804d9 100644
--- a/.github/workflows/cuda/cu113-Linux.sh
+++ b/.github/workflows/cuda/cu113-Linux.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 
-echo $OS_VERSION
-
 # Strip the periods from the version number
-OS_VERSION=$(echo $OS_VERSION | tr -d .)
+OS_VERSION=$(echo $(lsb_release -sr) | tr -d .)
 OS=ubuntu${OS_VERSION}
 
 wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
diff --git a/.github/workflows/cuda/cu116-Linux.sh b/.github/workflows/cuda/cu116-Linux.sh
index d717ab4..68e9ed4 100644
--- a/.github/workflows/cuda/cu116-Linux.sh
+++ b/.github/workflows/cuda/cu116-Linux.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 
-echo $OS_VERSION
-
 # Strip the periods from the version number
-OS_VERSION=$(echo $OS_VERSION | tr -d .)
+OS_VERSION=$(echo $(lsb_release -sr) | tr -d .)
 OS=ubuntu${OS_VERSION}
 
 wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index c394348..0fc0281 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -68,8 +68,6 @@ jobs:
 
       - name: Install CUDA ${{ matrix.cuda-version }}
         if: ${{ matrix.cuda-version != 'cpu' }}
-        env:
-          OS_VERSION: ${{ runner.release }}
         run: |
           bash .github/workflows/cuda/cu${{ matrix.cuda-version }}-${{ runner.os }}.sh
         shell:

From a372e2be1bd970956bd9b2b8e84f23b7e86e2a4a Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Fri, 2 Jun 2023 19:19:49 -0700
Subject: [PATCH 13/25] Add CUDA 11.7

---
 .github/workflows/cuda/cu116-Linux.sh     |  1 +
 .github/workflows/cuda/cu117-Linux-env.sh |  9 +++++++++
 .github/workflows/cuda/cu117-Linux.sh     | 18 ++++++++++++++++++
 .github/workflows/publish.yml             | 11 +++++++----
 4 files changed, 35 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/cuda/cu117-Linux-env.sh
 create mode 100644 .github/workflows/cuda/cu117-Linux.sh

diff --git a/.github/workflows/cuda/cu116-Linux.sh b/.github/workflows/cuda/cu116-Linux.sh
index 68e9ed4..f6ebbe3 100644
--- a/.github/workflows/cuda/cu116-Linux.sh
+++ b/.github/workflows/cuda/cu116-Linux.sh
@@ -7,6 +7,7 @@ OS=ubuntu${OS_VERSION}
 wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
 sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
 wget -nv https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb
+
 sudo dpkg -i cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb
 sudo apt-key add /var/cuda-repo-${OS}-11-6-local/7fa2af80.pub
 
diff --git a/.github/workflows/cuda/cu117-Linux-env.sh b/.github/workflows/cuda/cu117-Linux-env.sh
new file mode 100644
index 0000000..ab432d1
--- /dev/null
+++ b/.github/workflows/cuda/cu117-Linux-env.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Environment variables for building against the CUDA 11.7 toolkit.
+# This script only assigns variables, so it must be sourced to affect the caller.
+
+export CUDA_HOME=/usr/local/cuda-11.7
+# Append the previous value only when it is set, so an empty entry is never added
+export LD_LIBRARY_PATH=${CUDA_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+export PATH=${CUDA_HOME}/bin:${PATH}
+
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
\ No newline at end of file
diff --git a/.github/workflows/cuda/cu117-Linux.sh b/.github/workflows/cuda/cu117-Linux.sh
new file mode 100644
index 0000000..40e66f3
--- /dev/null
+++ b/.github/workflows/cuda/cu117-Linux.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Strip the periods from the version number
+OS_VERSION=$(echo $(lsb_release -sr) | tr -d .)
+OS=ubuntu${OS_VERSION}
+
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
+sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
+wget -nv https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda-repo-${OS}-11-7-local_11.7.0-515.43.04-1_amd64.deb
+
+sudo dpkg -i cuda-repo-${OS}-11-7-local_11.7.0-515.43.04-1_amd64.deb
+sudo cp /var/cuda-repo-${OS}-11-7-local/cuda-*-keyring.gpg /usr/share/keyrings/
+
+sudo apt-get -qq update
+sudo apt install cuda cuda-nvcc-11-7 cuda-libraries-dev-11-7
+sudo apt clean
+
+rm -f https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda-repo-${OS}-11-7-local_11.7.0-515.43.04-1_amd64.deb
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 0fc0281..2475ba0 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -40,12 +40,15 @@ jobs:
           #python-version: ['3.7', '3.8', '3.9', '3.10']
           python-version: ['3.10']
           torch-version: [1.11.0]
-          cuda-version: ['113']
+          cuda-version: ['113', '117']
           #torch-version: [1.11.0, 1.12.0, 1.12.1]
           #cuda-version: ['113', '116']
-          #exclude:
-          #  - torch-version: 1.11.0
-          #    cuda-version: '116'
+          exclude:
+            # Nvidia only supports 11.7+ for ubuntu-22.04
+            - os: ubuntu-22.04
+              cuda-version: '116'
+            - os: ubuntu-22.04
+              cuda-version: '113'
 
     steps:
       - name: Checkout

From ac543b0e8d0d5f30e6ce02411f860995127ca013 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Fri, 2 Jun 2023 22:47:29 -0700
Subject: [PATCH 14/25] Full version matrix

---
 .github/workflows/publish.yml | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 2475ba0..07c4ebc 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -37,18 +37,20 @@ jobs:
       fail-fast: false
       matrix:
           os: [ubuntu-20.04, ubuntu-22.04]
-          #python-version: ['3.7', '3.8', '3.9', '3.10']
-          python-version: ['3.10']
-          torch-version: [1.11.0]
-          cuda-version: ['113', '117']
-          #torch-version: [1.11.0, 1.12.0, 1.12.1]
-          #cuda-version: ['113', '116']
+          python-version: ['3.7', '3.8', '3.9', '3.10']
+          torch-version: ['1.11.0', '1.12.0', '1.13.0']
+          cuda-version: ['113', '116', '117']
           exclude:
             # Nvidia only supports 11.7+ for ubuntu-22.04
             - os: ubuntu-22.04
               cuda-version: '116'
             - os: ubuntu-22.04
               cuda-version: '113'
+            # Torch only builds cuda 117 for 1.13.0+
+            - cuda-version: '117'
+              torch-version: '1.11.0'
+            - cuda-version: '117'
+              torch-version: '1.12.0'
 
     steps:
       - name: Checkout

From 84009fcc66fe7a9d777f3b3ec49277ae704656b8 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Sat, 3 Jun 2023 09:51:13 -0700
Subject: [PATCH 15/25] Exclude additional disallowed matrix params

---
 .github/workflows/publish.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 07c4ebc..9091ede 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -51,6 +51,12 @@ jobs:
               torch-version: '1.11.0'
             - cuda-version: '117'
               torch-version: '1.12.0'
+            # Torch only builds cuda 116 for 1.12.0+
+            - cuda-version: '116'
+              torch-version: '1.11.0'
+            # 1.13.0 drops support for cuda 11.3
+            - cuda-version: '113'
+              torch-version: '1.13.0'
 
     steps:
       - name: Checkout
@@ -123,6 +129,10 @@ jobs:
           ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
           echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
 
+      - name: Log Built Wheels
+        run: |
+          ls dist
+
       - name: Upload Release Asset
         id: upload_release_asset 
         uses: actions/upload-release-asset@v1

From 1848d0004f4bf698b908db871db0a22666d2e311 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Sat, 3 Jun 2023 19:10:47 -0700
Subject: [PATCH 16/25] Exclude cuda erroring builds

---
 .github/workflows/publish.yml | 50 +++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 9091ede..44d894a 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -57,6 +57,10 @@ jobs:
             # 1.13.0 drops support for cuda 11.3
             - cuda-version: '113'
               torch-version: '1.13.0'
+            # Fails with "Validation Error" on artifact upload
+            - cuda-version: '117'
+              torch-version: '1.13.0'
+              os: ubuntu-20.04
 
     steps:
       - name: Checkout
@@ -144,32 +148,32 @@ jobs:
           asset_name: ${{env.wheel_name}}
           asset_content_type: application/*
 
-  # publish_package:
-  #   name: Publish package
-  #   needs: [build_wheels]
+  publish_package:
+    name: Publish package
+    needs: [build_wheels]
 
-  #   runs-on: ubuntu-latest
+    runs-on: ubuntu-latest
 
-  #   steps:
-  #     - uses: actions/checkout@v3
+    steps:
+      - uses: actions/checkout@v3
 
-  #     - uses: actions/setup-python@v4
-  #       with:
-  #         python-version: '3.10'
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
 
-  #     - name: Install dependencies
-  #       run: |
-  #         pip install ninja packaging setuptools wheel twine
+      - name: Install dependencies
+        run: |
+          pip install ninja packaging setuptools wheel twine
 
-  #     - name: Build core package
-  #       env:
-  #         FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
-  #       run: |
-  #         python setup.py sdist --dist-dir=dist
+      - name: Build core package
+        env:
+          FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
+        run: |
+          python setup.py sdist --dist-dir=dist
 
-  #     - name: Deploy
-  #       env:
-  #         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
-  #         TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
-  #       run: |
-  #         python -m twine upload dist/*
+      - name: Deploy
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
+        run: |
+          python -m twine upload dist/*

From 8d60c373e4ed0075baa4c597891ffd9fb576752c Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Sat, 3 Jun 2023 20:26:45 -0700
Subject: [PATCH 17/25] Add torch dependency to final build

---
 .github/workflows/publish.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 44d894a..1f959c4 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -164,6 +164,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install ninja packaging setuptools wheel twine
+          pip install torch
 
       - name: Build core package
         env:

From 494b2aa48657edb55eb9f5907d5e980014d9dbdc Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Sun, 4 Jun 2023 06:14:05 -0700
Subject: [PATCH 18/25] Add notes to github action workflow

---
 .github/workflows/publish.yml                 |  9 ++--
 README.md                                     |  8 +++
 flash_attn/__init__.py                        |  2 +-
 flash_attn_builder/README.md                  |  3 ++
 .../flash_attn_builder/__init__.py            |  0
 flash_attn_builder/flash_attn_builder/main.py | 54 +++++++++++++++++++
 flash_attn_builder/pyproject.toml             | 15 ++++++
 pyproject.toml                                |  3 --
 setup.py                                      | 46 ++++++++++------
 9 files changed, 118 insertions(+), 22 deletions(-)
 create mode 100644 flash_attn_builder/README.md
 create mode 100644 flash_attn_builder/flash_attn_builder/__init__.py
 create mode 100644 flash_attn_builder/flash_attn_builder/main.py
 create mode 100644 flash_attn_builder/pyproject.toml
 delete mode 100644 pyproject.toml

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 1f959c4..83c4b48 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -1,8 +1,11 @@
-# This workflow will upload a Python Package to Release asset
+# This workflow will:
+# - Create a new GitHub release
+# - Build wheels for the supported OS / Python / torch / CUDA combinations
+# - Upload the wheels to the GitHub release
+# - Publish the source distribution (sdist) to PyPI
 # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 
-
-name: Python Package
+name: Build wheels and deploy
 
 on:
   create:
diff --git a/README.md b/README.md
index 31fc62a..99f8829 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,14 @@ To install:
 pip install flash-attn
 ```
 
+If you see an error like `ModuleNotFoundError: No module named 'torch'`, it is likely caused by pip's build isolation, which builds the package in a clean environment where torch is not installed.
+
+To fix you can run:
+
+```sh
+pip install flash-attn --no-build-isolation
+```
+
 Alternatively you can compile from source:
 ```
 python setup.py install
diff --git a/flash_attn/__init__.py b/flash_attn/__init__.py
index 9e604c0..e13bd59 100644
--- a/flash_attn/__init__.py
+++ b/flash_attn/__init__.py
@@ -1 +1 @@
-__version__ = "1.0.7"
+__version__ = "1.0.8"
diff --git a/flash_attn_builder/README.md b/flash_attn_builder/README.md
new file mode 100644
index 0000000..3e42b3b
--- /dev/null
+++ b/flash_attn_builder/README.md
@@ -0,0 +1,3 @@
+## flash-attn-builder
+
+Basic build utilities for flash-attn.
diff --git a/flash_attn_builder/flash_attn_builder/__init__.py b/flash_attn_builder/flash_attn_builder/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/flash_attn_builder/flash_attn_builder/main.py b/flash_attn_builder/flash_attn_builder/main.py
new file mode 100644
index 0000000..1e750e7
--- /dev/null
+++ b/flash_attn_builder/flash_attn_builder/main.py
@@ -0,0 +1,54 @@
+import os
+import sys
+import urllib
+import setuptools.build_meta
+from setuptools.command.install import install
+from packaging.version import parse, Version
+
+# @pierce - TODO: Update for proper release
+BASE_WHEEL_URL = "https://github.com/piercefreeman/flash-attention/releases/download/{tag_name}/{wheel_name}"
+
+# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels
+# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation
+FORCE_BUILD = os.getenv("FLASH_ATTENTION_FORCE_BUILD", "FALSE") == "TRUE"
+
+class CustomBuildBackend(setuptools.build_meta._BuildMetaBackend):
+
+    def build_wheel(self, wheel_directory, config_settings=None, metadata_directory=None):
+        this_file_directory = os.path.dirname(os.path.abspath(__file__))
+        print(f'This file is located in: {this_file_directory}')
+
+        sys.argv = [
+            *sys.argv[:1],
+            *self._global_args(config_settings),
+            *self._arbitrary_args(config_settings),
+        ]
+        with setuptools.build_meta.no_install_setup_requires():
+            self.run_setup()
+
+        print("OS", os.environ["FLASH_ATTENTION_WHEEL_URL"])
+        print("config_settings", config_settings)
+        print("metadata_directory", metadata_directory)
+        raise ValueError
+
+        print("Guessing wheel URL: ", wheel_url)
+        
+        try:
+            urllib.request.urlretrieve(wheel_url, wheel_filename)
+            os.system(f'pip install {wheel_filename}')
+            os.remove(wheel_filename)
+        except urllib.error.HTTPError:
+            print("Precompiled wheel not found. Building from source...")
+            # If the wheel could not be downloaded, build from source
+            super().build_wheel(wheel_directory, config_settings, metadata_directory)
+
+
+_BACKEND = CustomBuildBackend()  # noqa
+
+
+get_requires_for_build_wheel = _BACKEND.get_requires_for_build_wheel
+get_requires_for_build_sdist = _BACKEND.get_requires_for_build_sdist
+prepare_metadata_for_build_wheel = _BACKEND.prepare_metadata_for_build_wheel
+build_wheel = _BACKEND.build_wheel
+build_sdist = _BACKEND.build_sdist
+
diff --git a/flash_attn_builder/pyproject.toml b/flash_attn_builder/pyproject.toml
new file mode 100644
index 0000000..7fa99d4
--- /dev/null
+++ b/flash_attn_builder/pyproject.toml
@@ -0,0 +1,15 @@
+[tool.poetry]
+name = "flash-attn-builder"
+version = "0.1.0"
+description = ""
+authors = ["Pierce Freeman <pierce@freeman.vc>"]
+readme = "README.md"
+packages = [{include = "flash_attn_builder"}]
+
+[tool.poetry.dependencies]
+python = "^3.10"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index f67608a..0000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,3 +0,0 @@
-[build-system]
-requires = ["ninja", "packaging", "setuptools", "wheel"]
-build-backend = "setuptools.build_meta"
diff --git a/setup.py b/setup.py
index cf8a7ef..89222f7 100644
--- a/setup.py
+++ b/setup.py
@@ -9,13 +9,15 @@ from packaging.version import parse, Version
 import platform
 
 from setuptools import setup, find_packages
-from setuptools.command.install import install
+from setuptools.command.build import build
 import subprocess
+from setuptools.command.bdist_egg import bdist_egg
 
 import urllib.request
 import urllib.error
 import torch
 from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME
+from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 
 
 with open("README.md", "r", encoding="utf-8") as fh:
@@ -25,6 +27,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 # ninja build does not work unless include_dirs are abs path
 this_dir = os.path.dirname(os.path.abspath(__file__))
 
+PACKAGE_NAME = "flash_attn_wheels"
 
 # @pierce - TODO: Update for proper release
 BASE_WHEEL_URL = "https://github.com/piercefreeman/flash-attention/releases/download/{tag_name}/{wheel_name}"
@@ -201,15 +204,17 @@ def get_package_version():
         return str(public_version)
 
 
-class CachedWheelsCommand(install):
-    """
-    Installer hook to scan for existing wheels that match the current platform environment.
-    Falls back to building from source if no wheel is found.
+class CachedWheelsCommand(_bdist_wheel):
+     """
+     The CachedWheelsCommand plugs into the default bdist wheel, which is ran by pip when it cannot
+     find an existing wheel (which is currently the case for all flash attention installs). We use
+     the environment parameters to detect whether there is already a pre-built version of a compatible
+     wheel available and short-circuits the standard full build pipeline.
 
-    """
-    def run(self):
+     """
+     def run(self):
         if FORCE_BUILD:
-            return install.run(self)
+            return build.run(self)
 
         raise_if_cuda_home_none("flash_attn")
 
@@ -223,7 +228,7 @@ class CachedWheelsCommand(install):
         torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}.{torch_version_raw.micro}"
 
         # Determine wheel URL based on CUDA version, torch version, python version and OS
-        wheel_filename = f'flash_attn-{flash_version}+cu{cuda_version}torch{torch_version}-{python_version}-{python_version}-{platform_name}.whl'
+        wheel_filename = f'{PACKAGE_NAME}-{flash_version}+cu{cuda_version}torch{torch_version}-{python_version}-{python_version}-{platform_name}.whl'
         wheel_url = BASE_WHEEL_URL.format(
             tag_name=f"v{flash_version}",
             wheel_name=wheel_filename
@@ -232,17 +237,28 @@ class CachedWheelsCommand(install):
         
         try:
             urllib.request.urlretrieve(wheel_url, wheel_filename)
-            os.system(f'pip install {wheel_filename}')
-            os.remove(wheel_filename)
+
+            # Make the archive
+            # Lifted from the root wheel processing command
+            # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85
+            if not os.path.exists(self.dist_dir):
+                os.makedirs(self.dist_dir)
+
+            impl_tag, abi_tag, plat_tag = self.get_tag()
+            archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}"
+        
+            wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
+            print("Raw wheel path", wheel_path)
+            os.rename(wheel_filename, wheel_path)
         except urllib.error.HTTPError:
             print("Precompiled wheel not found. Building from source...")
             # If the wheel could not be downloaded, build from source
-            install.run(self)
+            super().run()
 
 
 setup(
     # @pierce - TODO: Revert for official release
-    name="flash_attn_wheels",
+    name=PACKAGE_NAME,
     version=get_package_version(),
     packages=find_packages(
         exclude=("build", "csrc", "include", "tests", "dist", "docs", "benchmarks", "flash_attn.egg-info",)
@@ -264,10 +280,10 @@ setup(
     ],
     ext_modules=ext_modules,
     cmdclass={
-        'install': CachedWheelsCommand,
+        'bdist_wheel': CachedWheelsCommand,
         "build_ext": BuildExtension
     } if ext_modules else {
-        'install': CachedWheelsCommand,
+        'bdist_wheel': CachedWheelsCommand,
     },
     python_requires=">=3.7",
     install_requires=[

From 6c730dc8c669ffd140ed90366cd96aa031a08594 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Wed, 7 Jun 2023 17:07:14 -0700
Subject: [PATCH 19/25] Bump version

---
 flash_attn/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flash_attn/__init__.py b/flash_attn/__init__.py
index e13bd59..39e0411 100644
--- a/flash_attn/__init__.py
+++ b/flash_attn/__init__.py
@@ -1 +1 @@
-__version__ = "1.0.8"
+__version__ = "1.0.9"

From eb812c205b4a4327230f5e75407d06e75917417b Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Wed, 7 Jun 2023 17:20:13 -0700
Subject: [PATCH 20/25] Remove builder project

---
 flash_attn_builder/README.md                  |  3 --
 .../flash_attn_builder/__init__.py            |  0
 flash_attn_builder/flash_attn_builder/main.py | 54 -------------------
 flash_attn_builder/pyproject.toml             | 15 ------
 4 files changed, 72 deletions(-)
 delete mode 100644 flash_attn_builder/README.md
 delete mode 100644 flash_attn_builder/flash_attn_builder/__init__.py
 delete mode 100644 flash_attn_builder/flash_attn_builder/main.py
 delete mode 100644 flash_attn_builder/pyproject.toml

diff --git a/flash_attn_builder/README.md b/flash_attn_builder/README.md
deleted file mode 100644
index 3e42b3b..0000000
--- a/flash_attn_builder/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-## flash-attn-builder
-
-Basic build utilities for flash-attn.
diff --git a/flash_attn_builder/flash_attn_builder/__init__.py b/flash_attn_builder/flash_attn_builder/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/flash_attn_builder/flash_attn_builder/main.py b/flash_attn_builder/flash_attn_builder/main.py
deleted file mode 100644
index 1e750e7..0000000
--- a/flash_attn_builder/flash_attn_builder/main.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import os
-import sys
-import urllib
-import setuptools.build_meta
-from setuptools.command.install import install
-from packaging.version import parse, Version
-
-# @pierce - TODO: Update for proper release
-BASE_WHEEL_URL = "https://github.com/piercefreeman/flash-attention/releases/download/{tag_name}/{wheel_name}"
-
-# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels
-# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation
-FORCE_BUILD = os.getenv("FLASH_ATTENTION_FORCE_BUILD", "FALSE") == "TRUE"
-
-class CustomBuildBackend(setuptools.build_meta._BuildMetaBackend):
-
-    def build_wheel(self, wheel_directory, config_settings=None, metadata_directory=None):
-        this_file_directory = os.path.dirname(os.path.abspath(__file__))
-        print(f'This file is located in: {this_file_directory}')
-
-        sys.argv = [
-            *sys.argv[:1],
-            *self._global_args(config_settings),
-            *self._arbitrary_args(config_settings),
-        ]
-        with setuptools.build_meta.no_install_setup_requires():
-            self.run_setup()
-
-        print("OS", os.environ["FLASH_ATTENTION_WHEEL_URL"])
-        print("config_settings", config_settings)
-        print("metadata_directory", metadata_directory)
-        raise ValueError
-
-        print("Guessing wheel URL: ", wheel_url)
-        
-        try:
-            urllib.request.urlretrieve(wheel_url, wheel_filename)
-            os.system(f'pip install {wheel_filename}')
-            os.remove(wheel_filename)
-        except urllib.error.HTTPError:
-            print("Precompiled wheel not found. Building from source...")
-            # If the wheel could not be downloaded, build from source
-            super().build_wheel(wheel_directory, config_settings, metadata_directory)
-
-
-_BACKEND = CustomBuildBackend()  # noqa
-
-
-get_requires_for_build_wheel = _BACKEND.get_requires_for_build_wheel
-get_requires_for_build_sdist = _BACKEND.get_requires_for_build_sdist
-prepare_metadata_for_build_wheel = _BACKEND.prepare_metadata_for_build_wheel
-build_wheel = _BACKEND.build_wheel
-build_sdist = _BACKEND.build_sdist
-
diff --git a/flash_attn_builder/pyproject.toml b/flash_attn_builder/pyproject.toml
deleted file mode 100644
index 7fa99d4..0000000
--- a/flash_attn_builder/pyproject.toml
+++ /dev/null
@@ -1,15 +0,0 @@
-[tool.poetry]
-name = "flash-attn-builder"
-version = "0.1.0"
-description = ""
-authors = ["Pierce Freeman <pierce@freeman.vc>"]
-readme = "README.md"
-packages = [{include = "flash_attn_builder"}]
-
-[tool.poetry.dependencies]
-python = "^3.10"
-
-
-[build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"

From 9af165c38920bd18fc066e193383903e6ecff451 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Wed, 7 Jun 2023 17:26:13 -0700
Subject: [PATCH 21/25] Clean setup.py imports

---
 setup.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 89222f7..4737c5b 100644
--- a/setup.py
+++ b/setup.py
@@ -9,9 +9,7 @@ from packaging.version import parse, Version
 import platform
 
 from setuptools import setup, find_packages
-from setuptools.command.build import build
 import subprocess
-from setuptools.command.bdist_egg import bdist_egg
 
 import urllib.request
 import urllib.error
@@ -214,7 +212,7 @@ class CachedWheelsCommand(_bdist_wheel):
      """
      def run(self):
         if FORCE_BUILD:
-            return build.run(self)
+            return super().run()
 
         raise_if_cuda_home_none("flash_attn")
 

From 565615c603bc83ff0215cf62bc4d907b27041215 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Thu, 10 Aug 2023 19:54:29 -0700
Subject: [PATCH 22/25] Isolate 2.0.0 & cuda12

---
 .github/workflows/cuda/cu120-Linux-env.sh |  9 +++++++++
 .github/workflows/cuda/cu120-Linux.sh     | 18 ++++++++++++++++++
 .github/workflows/publish.yml             | 18 +++++++++++++++---
 3 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/cuda/cu120-Linux-env.sh
 create mode 100644 .github/workflows/cuda/cu120-Linux.sh

diff --git a/.github/workflows/cuda/cu120-Linux-env.sh b/.github/workflows/cuda/cu120-Linux-env.sh
new file mode 100644
index 0000000..37917cc
--- /dev/null
+++ b/.github/workflows/cuda/cu120-Linux-env.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Build environment for the CUDA 12.0 toolkit under /usr/local/cuda-12.0 (presumably sourced by the workflow - confirm).
+export CUDA_HOME=/usr/local/cuda-12.0
+export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+export PATH=${CUDA_HOME}/bin:${PATH}
+# Every variable is exported exactly once so child processes (pip, nvcc, python) inherit it.
+# NOTE(review): CUDA 12 removed Kepler (sm_35) support - confirm "3.5" should remain in the arch list below.
+export FORCE_CUDA=1
+export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
\ No newline at end of file
diff --git a/.github/workflows/cuda/cu120-Linux.sh b/.github/workflows/cuda/cu120-Linux.sh
new file mode 100644
index 0000000..56996de
--- /dev/null
+++ b/.github/workflows/cuda/cu120-Linux.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Install the CUDA 12.0 compiler and development libraries from NVIDIA's local apt repository package.
+# Strip the periods from the version number (e.g. "20.04" -> "2004") to form the "ubuntuXXXX" name used in the URLs
+OS_VERSION=$(echo $(lsb_release -sr) | tr -d .)
+OS=ubuntu${OS_VERSION}
+# Install NVIDIA's apt pin file, then download the local-repository .deb into the current directory
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
+sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
+wget -nv https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda-repo-${OS}-12-0-local_12.0.0-525.60.13-1_amd64.deb
+# Register the local repository and copy its signing keyring into /usr/share/keyrings
+sudo dpkg -i cuda-repo-${OS}-12-0-local_12.0.0-525.60.13-1_amd64.deb
+sudo cp /var/cuda-repo-${OS}-12-0-local/cuda-*-keyring.gpg /usr/share/keyrings/
+# Refresh the package index and install; -y keeps the install from stalling on a confirmation prompt
+sudo apt-get -qq update
+sudo apt install -y cuda cuda-nvcc-12-0 cuda-libraries-dev-12-0
+sudo apt clean
+# Delete the downloaded installer by its local filename (rm operates on paths, not on the download URL)
+rm -f cuda-repo-${OS}-12-0-local_12.0.0-525.60.13-1_amd64.deb
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 83c4b48..05eaaad 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -40,9 +40,12 @@ jobs:
       fail-fast: false
       matrix:
           os: [ubuntu-20.04, ubuntu-22.04]
-          python-version: ['3.7', '3.8', '3.9', '3.10']
-          torch-version: ['1.11.0', '1.12.0', '1.13.0']
-          cuda-version: ['113', '116', '117']
+          #python-version: ['3.7', '3.8', '3.9', '3.10']
+          #torch-version: ['1.11.0', '1.12.0', '1.13.0', '2.0.1']
+          #cuda-version: ['113', '116', '117', '120']
+          python-version: ['3.10']
+          torch-version: ['2.0.1']
+          cuda-version: ['120']
           exclude:
             # Nvidia only supports 11.7+ for ubuntu-22.04
             - os: ubuntu-22.04
@@ -57,9 +60,18 @@ jobs:
             # Torch only builds cuda 116 for 1.12.0+
             - cuda-version: '116'
               torch-version: '1.11.0'
+            # Torch only builds cuda 120 for 2.0.1+
+            - cuda-version: '120'
+              torch-version: '1.11.0'
+            - cuda-version: '120'
+              torch-version: '1.12.0'
+            - cuda-version: '120'
+              torch-version: '1.13.0'
             # 1.13.0 drops support for cuda 11.3
             - cuda-version: '113'
               torch-version: '1.13.0'
+            - cuda-version: '113'
+              torch-version: '2.0.1'
             # Fails with "Validation Error" on artifact upload
             - cuda-version: '117'
               torch-version: '1.13.0'

From bc6d4992f2de570969bfbc956799c67fd81c31d0 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Thu, 10 Aug 2023 19:55:52 -0700
Subject: [PATCH 23/25] Build wheel on each push

---
 .github/workflows/publish.yml | 76 ++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 05eaaad..08f40af 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -7,10 +7,12 @@
 
 name: Build wheels and deploy
 
+#on:
+#  create:
+#    tags:
+#      - '**'
 on:
-  create:
-    tags:
-      - '**'
+  push
 
 jobs:
   setup_release:
@@ -152,44 +154,44 @@ jobs:
         run: |
           ls dist
 
-      - name: Upload Release Asset
-        id: upload_release_asset 
-        uses: actions/upload-release-asset@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          upload_url: ${{ steps.get_current_release.outputs.upload_url }}
-          asset_path: ./dist/${{env.wheel_name}}
-          asset_name: ${{env.wheel_name}}
-          asset_content_type: application/*
+      # - name: Upload Release Asset
+      #   id: upload_release_asset 
+      #   uses: actions/upload-release-asset@v1
+      #   env:
+      #     GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      #   with:
+      #     upload_url: ${{ steps.get_current_release.outputs.upload_url }}
+      #     asset_path: ./dist/${{env.wheel_name}}
+      #     asset_name: ${{env.wheel_name}}
+      #     asset_content_type: application/*
 
-  publish_package:
-    name: Publish package
-    needs: [build_wheels]
+  # publish_package:
+  #   name: Publish package
+  #   needs: [build_wheels]
 
-    runs-on: ubuntu-latest
+  #   runs-on: ubuntu-latest
 
-    steps:
-      - uses: actions/checkout@v3
+  #   steps:
+  #     - uses: actions/checkout@v3
 
-      - uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
+  #     - uses: actions/setup-python@v4
+  #       with:
+  #         python-version: '3.10'
 
-      - name: Install dependencies
-        run: |
-          pip install ninja packaging setuptools wheel twine
-          pip install torch
+  #     - name: Install dependencies
+  #       run: |
+  #         pip install ninja packaging setuptools wheel twine
+  #         pip install torch
 
-      - name: Build core package
-        env:
-          FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
-        run: |
-          python setup.py sdist --dist-dir=dist
+  #     - name: Build core package
+  #       env:
+  #         FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
+  #       run: |
+  #         python setup.py sdist --dist-dir=dist
 
-      - name: Deploy
-        env:
-          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
-          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
-        run: |
-          python -m twine upload dist/*
+  #     - name: Deploy
+  #       env:
+  #         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+  #         TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
+  #       run: |
+  #         python -m twine upload dist/*

From ecc6535443c73efca91007b1a300c4b049c6c0ff Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Thu, 10 Aug 2023 19:56:24 -0700
Subject: [PATCH 24/25] Remove release creation

---
 .github/workflows/publish.yml | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 08f40af..f29539d 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -15,28 +15,28 @@ on:
   push
 
 jobs:
-  setup_release:
-    name: Create Release
-    runs-on: ubuntu-latest
-    steps:
-      - name: Get the tag version
-        id: extract_branch
-        run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
-        shell: bash
+  # setup_release:
+  #   name: Create Release
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - name: Get the tag version
+  #       id: extract_branch
+  #       run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
+  #       shell: bash
 
-      - name: Create Release
-        id: create_release
-        uses: actions/create-release@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          tag_name: ${{ steps.extract_branch.outputs.branch }}
-          release_name: ${{ steps.extract_branch.outputs.branch }}
+  #     - name: Create Release
+  #       id: create_release
+  #       uses: actions/create-release@v1
+  #       env:
+  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  #       with:
+  #         tag_name: ${{ steps.extract_branch.outputs.branch }}
+  #         release_name: ${{ steps.extract_branch.outputs.branch }}
 
   build_wheels:
     name: Build Wheel
     runs-on: ${{ matrix.os }}
-    needs: setup_release
+    #needs: setup_release
 
     strategy:
       fail-fast: false

From 6ef3bd800e8b8104537ffa0ba4ea10306da40f42 Mon Sep 17 00:00:00 2001
From: Pierce Freeman <piercefreeman@gmail.com>
Date: Thu, 10 Aug 2023 20:12:20 -0700
Subject: [PATCH 25/25] Install standard non-wheel package

---
 .github/workflows/publish.yml | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index f29539d..bc01441 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -116,13 +116,24 @@ jobs:
       - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
         run: |
           pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses && conda clean -ya
-          pip install --no-index --no-cache-dir torch==${{ matrix.torch-version }} -f https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}/torch_stable.html
+          pip install --no-cache-dir torch==${{ matrix.torch-version }}
           python --version
           python -c "import torch; print('PyTorch:', torch.__version__)"
           python -c "import torch; print('CUDA:', torch.version.cuda)"
           python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
         shell:
           bash
+
+      # - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
+      #   run: |
+      #     pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses && conda clean -ya
+      #     pip install --no-index --no-cache-dir torch==${{ matrix.torch-version }} -f https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}/torch_stable.html
+      #     python --version
+      #     python -c "import torch; print('PyTorch:', torch.__version__)"
+      #     python -c "import torch; print('CUDA:', torch.version.cuda)"
+      #     python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
+      #   shell:
+      #     bash
       
       - name: Get the tag version
         id: extract_branch