From dec4f2e9101f88f8beffabc9d0f0379323748973 Mon Sep 17 00:00:00 2001
From: Tri Dao
Date: Thu, 6 Apr 2023 23:40:15 -0700
Subject: [PATCH 1/4] [FusedDense] Set workspace size to 32M for Hopper and 4M for others

---
 csrc/fused_dense_lib/fused_dense_cuda.cu | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/csrc/fused_dense_lib/fused_dense_cuda.cu b/csrc/fused_dense_lib/fused_dense_cuda.cu
index 7b6f392..023e74c 100644
--- a/csrc/fused_dense_lib/fused_dense_cuda.cu
+++ b/csrc/fused_dense_lib/fused_dense_cuda.cu
@@ -122,7 +122,9 @@ int gemm_bias_act_lt(
     reinterpret_cast<cublasLtHandle_t>(at::cuda::getCurrentCUDABlasHandle());
   // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind
   // setting this to 1M.
-  size_t workspaceSize = 1024 * 1024;
+  // However, Apex sets it to 4M and TransformerEngine sets to 32M for Hopper and 4M for other GPUs
+  // https://github.com/NVIDIA/TransformerEngine/blob/a0f0065498bbcfc1da78cf9e8b166f5381613fbc/transformer_engine/pytorch/module.py#L91
+  size_t workspaceSize = 1024 * 1024 * (at::cuda::getCurrentDeviceProperties()->major >= 9 ? 32 : 4);
   void* workspace = at::empty(
     {static_cast<int64_t>(workspaceSize)}, at::device({at::kCUDA, at::cuda::current_device()}).dtype(at::kByte)).data_ptr();
 
@@ -296,7 +298,8 @@ int gemm_bgradb_lt(
     reinterpret_cast<cublasLtHandle_t>(at::cuda::getCurrentCUDABlasHandle());
   // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind
   // setting this to 1M.
-  size_t workspaceSize = 1024 * 1024;
+  // However, Apex sets it to 4M and TransformerEngine sets to 32M for Hopper and 4M for other GPUs
+  size_t workspaceSize = 1024 * 1024 * (at::cuda::getCurrentDeviceProperties()->major >= 9 ? 32 : 4);
   void* workspace = at::empty(
     {static_cast<int64_t>(workspaceSize)}, at::device({at::kCUDA, at::cuda::current_device()}).dtype(at::kByte)).data_ptr();
 
@@ -449,7 +452,8 @@ int gemm_dact_bgradb_lt(
     reinterpret_cast<cublasLtHandle_t>(at::cuda::getCurrentCUDABlasHandle());
   // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind
   // setting this to 1M.
-  size_t workspaceSize = 1024 * 1024;
+  // However, Apex sets it to 4M and TransformerEngine sets to 32M for Hopper and 4M for other GPUs
+  size_t workspaceSize = 1024 * 1024 * (at::cuda::getCurrentDeviceProperties()->major >= 9 ? 32 : 4);
   void* workspace = at::empty(
     {static_cast<int64_t>(workspaceSize)}, at::device({at::kCUDA, at::cuda::current_device()}).dtype(at::kByte)).data_ptr();
 
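The hunks above hard-code the cuBLASLt workspace size as 32 MiB when the device's compute-capability major version is at least 9 (Hopper) and 4 MiB otherwise. As a minimal sketch of the same selection rule, expressed with PyTorch's device-property API purely for illustration (the helper name `cublaslt_workspace_size` is not part of the patch; the real logic lives in the CUDA file above):

```python
# Sketch only: mirrors the workspace-size rule from PATCH 1/4 in Python.
# The actual implementation is in csrc/fused_dense_lib/fused_dense_cuda.cu.
import torch

def cublaslt_workspace_size(device: int = 0) -> int:
    """32 MiB on Hopper (SM 9.x and newer), 4 MiB on earlier GPUs."""
    major = torch.cuda.get_device_properties(device).major
    return 1024 * 1024 * (32 if major >= 9 else 4)

if torch.cuda.is_available():
    # e.g. 33554432 bytes on an SM 9.0 (H100) GPU, 4194304 bytes on SM 8.0 (A100)
    print(cublaslt_workspace_size(0))
```

The larger Hopper default follows TransformerEngine's choice; presumably the extra scratch space lets cuBLASLt's heuristics pick faster algorithms on Hopper, while 32 MiB per handle remains a negligible amount of GPU memory.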
From 74af0233166583e58b38c50241831c6114dfea0b Mon Sep 17 00:00:00 2001
From: Tri Dao
Date: Tue, 11 Apr 2023 23:32:35 -0700
Subject: [PATCH 2/4] Bump version to 1.0.0

---
 README.md           | 2 +-
 setup.py            | 2 +-
 training/Dockerfile | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index f02a38a..9748327 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@
 and experiment with. The notations in the Triton implementation are also
 closer to what's used in our paper.
 
-## Beta release (0.2).
+## Installation and features
 
 Requirements:
 - CUDA 11.4 and above.
diff --git a/setup.py b/setup.py
index 3d9712b..a2d18ba 100644
--- a/setup.py
+++ b/setup.py
@@ -162,7 +162,7 @@ ext_modules.append(
 
 setup(
     name="flash_attn",
-    version="0.2.8",
+    version="1.0.0",
     packages=find_packages(
         exclude=("build", "csrc", "include", "tests", "dist", "docs", "benchmarks", "flash_attn.egg-info",)
     ),
diff --git a/training/Dockerfile b/training/Dockerfile
index b2c746d..c5c935f 100644
--- a/training/Dockerfile
+++ b/training/Dockerfile
@@ -85,11 +85,11 @@ RUN pip install transformers==4.25.1 datasets==2.8.0 pytorch-lightning==1.8.6 tr
 RUN pip install git+https://github.com/mlcommons/logging.git@2.1.0
 
 # Install FlashAttention
-RUN pip install flash-attn==0.2.8
+RUN pip install flash-attn==1.0.0
 
 # Install CUDA extensions for cross-entropy, fused dense, layer norm
 RUN git clone https://github.com/HazyResearch/flash-attention \
-    && cd flash-attention && git checkout v0.2.8 \
+    && cd flash-attention && git checkout v1.0.0 \
     && cd csrc/fused_softmax && pip install . && cd ../../ \
     && cd csrc/rotary && pip install . && cd ../../ \
     && cd csrc/xentropy && pip install . && cd ../../ \

From 853ff72963e73456c2a318bde1ebfa292ce935e9 Mon Sep 17 00:00:00 2001
From: Tri Dao
Date: Wed, 12 Apr 2023 10:05:01 -0700
Subject: [PATCH 3/4] Bump version to v1.0.1, fix Cutlass version

---
 setup.py            | 2 +-
 training/Dockerfile | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index a2d18ba..bb6d104 100644
--- a/setup.py
+++ b/setup.py
@@ -162,7 +162,7 @@ ext_modules.append(
 
 setup(
     name="flash_attn",
-    version="1.0.0",
+    version="1.0.1",
     packages=find_packages(
         exclude=("build", "csrc", "include", "tests", "dist", "docs", "benchmarks", "flash_attn.egg-info",)
     ),
diff --git a/training/Dockerfile b/training/Dockerfile
index c5c935f..4ed06fd 100644
--- a/training/Dockerfile
+++ b/training/Dockerfile
@@ -85,11 +85,11 @@ RUN pip install transformers==4.25.1 datasets==2.8.0 pytorch-lightning==1.8.6 tr
 RUN pip install git+https://github.com/mlcommons/logging.git@2.1.0
 
 # Install FlashAttention
-RUN pip install flash-attn==1.0.0
+RUN pip install flash-attn==1.0.1
 
 # Install CUDA extensions for cross-entropy, fused dense, layer norm
 RUN git clone https://github.com/HazyResearch/flash-attention \
-    && cd flash-attention && git checkout v1.0.0 \
+    && cd flash-attention && git checkout v1.0.1 \
     && cd csrc/fused_softmax && pip install . && cd ../../ \
     && cd csrc/rotary && pip install . && cd ../../ \
     && cd csrc/xentropy && pip install . && cd ../../ \

From 8c424156641ceadc9cd1f5de71c8ae144b4db113 Mon Sep 17 00:00:00 2001
From: Zhiyuan Chen
Date: Thu, 13 Apr 2023 11:08:21 +0800
Subject: [PATCH 4/4] make mlp hidden_features defaults to 4*in_features

---
 flash_attn/modules/mlp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flash_attn/modules/mlp.py b/flash_attn/modules/mlp.py
index 5240e3f..902bd3b 100644
--- a/flash_attn/modules/mlp.py
+++ b/flash_attn/modules/mlp.py
@@ -17,7 +17,7 @@ class Mlp(nn.Module):
         factory_kwargs = {'device': device, 'dtype': dtype}
         super().__init__()
         out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
+        hidden_features = hidden_features or in_features * 4
         self.return_residual = return_residual
         self.fc1 = nn.Linear(in_features, hidden_features, **factory_kwargs)
         self.activation = activation
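PATCH 4/4 changes the `hidden_features` fallback from `in_features` to `in_features * 4`, the conventional Transformer MLP expansion ratio. Below is a self-contained sketch of the resulting default behavior; it is an illustration only, not the actual `flash_attn.modules.mlp.Mlp` class (which also carries the `return_residual` handling and `factory_kwargs` seen in the diff):

```python
# Sketch only: a stripped-down MLP showing the default from PATCH 4/4,
# where hidden_features falls back to 4 * in_features when not specified.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyMlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None,
                 activation=F.gelu):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features * 4  # patched default
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.activation = activation
        self.fc2 = nn.Linear(hidden_features, out_features)

    def forward(self, x):
        return self.fc2(self.activation(self.fc1(x)))

mlp = TinyMlp(256)                 # hidden_features defaults to 4 * 256 = 1024
assert mlp.fc1.out_features == 1024
out = mlp(torch.randn(2, 256))     # -> shape (2, 256)
```

Note that the `or` idiom means an explicit `hidden_features=0` also triggers the 4x default, and callers relying on the old behavior now have to pass `hidden_features=in_features` explicitly.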