From 364a5b4a71203b9977def9829a5c1a6af45468c8 Mon Sep 17 00:00:00 2001
From: Tri Dao
Date: Thu, 10 Aug 2023 00:04:38 -0700
Subject: [PATCH] [MLP] Change the check for out_features being None

---
 flash_attn/modules/mlp.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/flash_attn/modules/mlp.py b/flash_attn/modules/mlp.py
index d237253..762fd09 100644
--- a/flash_attn/modules/mlp.py
+++ b/flash_attn/modules/mlp.py
@@ -22,8 +22,8 @@ class Mlp(nn.Module):
                  bias1=True, bias2=True, return_residual=False, device=None, dtype=None):
         factory_kwargs = {'device': device, 'dtype': dtype}
         super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features * 4
+        out_features = out_features if out_features is not None else in_features
+        hidden_features = hidden_features if hidden_features is not None else in_features * 4
         self.return_residual = return_residual
         self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs)
         self.activation = activation
@@ -45,8 +45,8 @@ class ParallelMLP(nn.Module):
         super().__init__()
         assert ColumnParallelLinear is not None, "Need to install fused_dense"
         assert RowParallelLinear is not None, "Need to install fused_dense"
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features * 4
+        out_features = out_features if out_features is not None else in_features
+        hidden_features = hidden_features if hidden_features is not None else in_features * 4
         self.fc1 = ColumnParallelLinear(in_features, hidden_features, process_group,
                                         bias=bias1, sequence_parallel=sequence_parallel, **factory_kwargs)
         self.activation = activation
@@ -67,8 +67,9 @@ class GatedMlp(nn.Module):
                  device=None, dtype=None):
         factory_kwargs = {'device': device, 'dtype': dtype}
         super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or int(8 * in_features / 3)
+        out_features = out_features if out_features is not None else in_features
+        hidden_features = (hidden_features if hidden_features is not None
+                           else int(8 * in_features / 3))
         hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
         self.return_residual = return_residual
         self.fc1 = nn.Linear(in_features, 2 * hidden_features, bias=bias1, **factory_kwargs)
@@ -94,8 +95,9 @@ class ParallelGatedMlp(nn.Module):
                  sequence_parallel=True, device=None, dtype=None):
         factory_kwargs = {'device': device, 'dtype': dtype}
         super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or int(8 * in_features / 3)
+        out_features = out_features if out_features is not None else in_features
+        hidden_features = (hidden_features if hidden_features is not None
+                           else int(8 * in_features / 3))
         hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
         if ColumnParallelLinear is None or RowParallelLinear is None:
             raise ImportError('fused_dense is not installed')
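
Note on the change: `x or default` falls back to the default for *any* falsy value,
so a caller passing an intentional falsy argument (e.g. `out_features=0`) would be
silently overridden; `x if x is not None else default` only falls back on an explicit
None. A minimal standalone sketch of the two idioms (the helper names `resolve_or`
and `resolve_is_none` are hypothetical, for illustration only, not part of the patch):

    def resolve_or(out_features, in_features):
        # Old idiom: any falsy value (0, '', []) falls through to the default.
        return out_features or in_features

    def resolve_is_none(out_features, in_features):
        # New idiom: only an explicit None triggers the default.
        return out_features if out_features is not None else in_features

    print(resolve_or(0, 512))          # 512 -- explicit 0 is silently discarded
    print(resolve_is_none(0, 512))     # 0   -- explicit 0 is respected
    print(resolve_or(None, 512))       # 512
    print(resolve_is_none(None, 512))  # 512 -- both agree when the argument is None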