# Copyright (c) 2022, Tri Dao.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributed import ProcessGroup

# ColumnParallelLinear / RowParallelLinear and the fused MLPs come from the optional
# fused_dense extension; fall back to None when it is not available so that the
# pure-PyTorch modules below still work.
try:
    from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear
except ImportError:
    ColumnParallelLinear, RowParallelLinear = None, None

try:
    from flash_attn.ops.fused_dense import FusedMLP, ParallelFusedMLP
except ImportError:
    FusedMLP, ParallelFusedMLP = None, None


class Mlp(nn.Module):
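    """Standard transformer MLP: fc1 -> activation -> fc2, implemented with plain
    nn.Linear layers. If return_residual is True, forward returns (output, input)
    so the caller can handle the residual connection."""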

    def __init__(self, in_features, hidden_features=None, out_features=None, activation=F.gelu,
                 bias1=True, bias2=True, return_residual=False, device=None, dtype=None):
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features * 4
        self.return_residual = return_residual
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs)
        self.activation = activation
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)

    def forward(self, x):
        y = self.fc1(x)
        y = self.activation(y)
        y = self.fc2(y)
        return y if not self.return_residual else (y, x)


class ParallelMLP(nn.Module):
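    """Tensor-parallel MLP: fc1 is a ColumnParallelLinear and fc2 a RowParallelLinear,
    so the hidden activations stay sharded across the process group between the two
    matmuls. Requires the fused_dense extension."""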

    def __init__(self, in_features, hidden_features=None, out_features=None, activation=F.gelu,
                 process_group: ProcessGroup = None, sequence_parallel=True,
                 bias1=True, bias2=True, device=None, dtype=None):
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        assert ColumnParallelLinear is not None, "Need to install fused_dense"
        assert RowParallelLinear is not None, "Need to install fused_dense"
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features * 4
        self.fc1 = ColumnParallelLinear(in_features, hidden_features, process_group, bias=bias1,
                                        sequence_parallel=sequence_parallel, **factory_kwargs)
        self.activation = activation
        self.fc2 = RowParallelLinear(hidden_features, out_features, process_group, bias=bias2,
                                     sequence_parallel=sequence_parallel, **factory_kwargs)

    def forward(self, x):
        y = self.fc1(x)
        y = self.activation(y)
        y = self.fc2(y)
        return y


class GatedMlp(nn.Module):
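    """Gated MLP (GLU family): fc1 projects to 2 * hidden_features, split into a value
    and a gate that are combined as value * activation(gate) before fc2. With the
    default activation=F.sigmoid this is plain GLU; with F.silu it is SwiGLU. The
    hidden size defaults to 8/3 * in_features, rounded up to a multiple of
    multiple_of."""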

    def __init__(self, in_features, hidden_features=None, out_features=None, activation=F.sigmoid,
                 bias1=True, bias2=True, multiple_of=256, return_residual=False,
                 device=None, dtype=None):
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or int(8 * in_features / 3)
        hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
        self.return_residual = return_residual
        self.fc1 = nn.Linear(in_features, 2 * hidden_features, bias=bias1, **factory_kwargs)
        self.activation = activation
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)

    def forward(self, x):
        y = self.fc1(x)
        if self.activation == F.sigmoid:  # Special case for GLU
            y = F.glu(y, dim=-1)
        else:  # Generic gating, e.g. SwiGLU when activation is F.silu
            y, gate = y.chunk(2, dim=-1)
            y = y * self.activation(gate)
        y = self.fc2(y)
        return y if not self.return_residual else (y, x)


class ParallelGatedMlp(nn.Module):
    """Tensor-parallel version of GatedMlp: fc1 is column-parallel and fc2 is
    row-parallel, combining the gating of GatedMlp with the sharding of ParallelMLP."""

    def __init__(self, in_features, process_group, hidden_features=None, out_features=None,
                 activation=F.sigmoid, bias1=True, bias2=True, multiple_of=256,
                 sequence_parallel=True, device=None, dtype=None):
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or int(8 * in_features / 3)
        hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
        if ColumnParallelLinear is None or RowParallelLinear is None:
            raise ImportError('fused_dense is not installed')
        self.fc1 = ColumnParallelLinear(in_features, 2 * hidden_features, process_group, bias=bias1,
                                        sequence_parallel=sequence_parallel, **factory_kwargs)
        self.activation = activation
        self.fc2 = RowParallelLinear(hidden_features, out_features, process_group, bias=bias2,
                                     sequence_parallel=sequence_parallel, **factory_kwargs)

    def forward(self, x):
        y = self.fc1(x)
        if self.activation == F.sigmoid:  # Special case for GLU
            y = F.glu(y, dim=-1)
        else:
            y, gate = y.chunk(2, dim=-1)
            y = y * self.activation(gate)
        y = self.fc2(y)
        return y
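

# Minimal usage sketch (illustrative only): runs the non-parallel modules on CPU with
# arbitrary small shapes. The parallel variants additionally need the fused_dense
# extension and an initialized torch.distributed process group.
if __name__ == "__main__":
    x = torch.randn(2, 16, 128)
    mlp = Mlp(128)  # fc1: 128 -> 512, GELU, fc2: 512 -> 128
    swiglu = GatedMlp(128, activation=F.silu)  # SwiGLU-style gating, hidden size 512
    print(mlp(x).shape, swiglu(x).shape)  # torch.Size([2, 16, 128]) for both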