"""Custom activation functions."""
import math
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

from vllm.distributed import (divide, get_tensor_model_parallel_rank,
                              get_tensor_model_parallel_world_size)
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.utils import set_weight_attrs


class SiluAndMul(CustomOp):
    """An activation function for SwiGLU.

    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        return F.silu(x[..., :d]) * x[..., d:]

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        from vllm import _custom_ops as ops

        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        ops.silu_and_mul(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        from vllm._ipex_ops import ipex_ops as ops

        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        ops.silu_and_mul(out, x)
        return out
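

# A minimal usage sketch (illustrative, not part of the original module):
# SiluAndMul halves the last dimension, so the output of a fused gate/up
# projection of width 2 * d becomes an activation of width d. The shapes
# below are assumptions for illustration only.
#
#     layer = SiluAndMul()
#     x = torch.randn(16, 2 * 11008)       # (num_tokens, 2 * d)
#     y = layer.forward_native(x)          # (num_tokens, d) == (16, 11008)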


class GeluAndMul(CustomOp):
    """An activation function for GeGLU.

    The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.

    Shapes:
        x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
        return: (batch_size, seq_len, d) or (num_tokens, d)
    """

    def __init__(self, approximate: str = "none"):
        super().__init__()
        self.approximate = approximate
        if approximate not in ("none", "tanh"):
            raise ValueError(f"Unknown approximate mode: {approximate}")

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        from vllm import _custom_ops as ops

        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        if self.approximate == "none":
            ops.gelu_and_mul(out, x)
        elif self.approximate == "tanh":
            ops.gelu_tanh_and_mul(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        from vllm._ipex_ops import ipex_ops as ops

        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        if self.approximate == "none":
            ops.gelu_and_mul(out, x)
        elif self.approximate == "tanh":
            ops.gelu_tanh_and_mul(out, x)
        return out

    def extra_repr(self) -> str:
        return f'approximate={repr(self.approximate)}'
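

# Illustrative sketch (not part of the original module): the ``approximate``
# flag selects between exact GELU and its tanh approximation, mirroring the
# semantics of torch.nn.functional.gelu. Shapes are assumptions for the
# example only.
#
#     layer = GeluAndMul(approximate="tanh")
#     x = torch.randn(4, 128, 2 * 4096)    # (batch_size, seq_len, 2 * d)
#     y = layer.forward_native(x)          # (batch_size, seq_len, d)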


class NewGELU(CustomOp):

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        c = math.sqrt(2.0 / math.pi)
        return 0.5 * x * (1.0 + torch.tanh(c *
                                           (x + 0.044715 * torch.pow(x, 3.0))))

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        from vllm import _custom_ops as ops

        out = torch.empty_like(x)
        ops.gelu_new(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        from vllm._ipex_ops import ipex_ops as ops

        out = torch.empty_like(x)
        ops.gelu_new(out, x)
        return out
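

# For reference, NewGELU is the tanh approximation of GELU used by GPT-2-style
# models:
#
#     gelu_new(x) = 0.5 * x * (1 + tanh(sqrt(2 / pi)
#                                       * (x + 0.044715 * x**3)))
#
# which is the same formula evaluated by
# torch.nn.functional.gelu(x, approximate="tanh").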


class FastGELU(CustomOp):

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
                                           (1.0 + 0.044715 * x * x)))

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        from vllm import _custom_ops as ops

        out = torch.empty_like(x)
        ops.gelu_fast(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        from vllm._ipex_ops import ipex_ops as ops

        out = torch.empty_like(x)
        ops.gelu_fast(out, x)
        return out
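

# The constant 0.7978845608 is sqrt(2 / pi), so FastGELU evaluates the same
# tanh approximation as NewGELU with the constant pre-folded:
#
#     fast_gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / pi) * x
#                                        * (1 + 0.044715 * x**2)))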


class ScaledActivation(nn.Module):
    """An activation function with post-scale parameters.

    This is used for some quantization methods like AWQ.
    """

    def __init__(
        self,
        act_module: nn.Module,
        intermediate_size: int,
        input_is_parallel: bool = True,
        params_dtype: Optional[torch.dtype] = None,
    ):
        super().__init__()
        self.act = act_module
        self.input_is_parallel = input_is_parallel
        if input_is_parallel:
            tp_size = get_tensor_model_parallel_world_size()
            intermediate_size_per_partition = divide(intermediate_size,
                                                     tp_size)
        else:
            intermediate_size_per_partition = intermediate_size
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.scales = nn.Parameter(
            torch.empty(intermediate_size_per_partition, dtype=params_dtype))
        set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.act(x) / self.scales

    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
        param_data = param.data
        if self.input_is_parallel:
            tp_rank = get_tensor_model_parallel_rank()
            shard_size = param_data.shape[0]
            start_idx = tp_rank * shard_size
            loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
        assert param_data.shape == loaded_weight.shape
        param_data.copy_(loaded_weight)
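

# Note: when ``input_is_parallel`` is True, each tensor-parallel rank holds a
# 1 / tp_size slice of the scales, and ``weight_loader`` narrows the full
# checkpoint tensor down to the shard owned by the current rank, i.e.
#
#     loaded_weight.narrow(0, tp_rank * shard_size, shard_size)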


_ACTIVATION_REGISTRY = {
    "gelu": nn.GELU(),
    "gelu_fast": FastGELU(),
    "gelu_new": NewGELU(),
    "gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
    "relu": nn.ReLU(),
}
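

# Note: the registry only holds element-wise activations looked up by name.
# The gated variants defined above (SiluAndMul, GeluAndMul) are not registered
# here and are typically instantiated directly by the model code that needs
# them.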


def get_act_fn(
    act_fn_name: str,
    quant_config: Optional[QuantizationConfig] = None,
    intermediate_size: Optional[int] = None,
    input_is_parallel: bool = True,
    params_dtype: Optional[torch.dtype] = None,
) -> nn.Module:
    """Get an activation function by name."""
    act_fn_name = act_fn_name.lower()
    if act_fn_name not in _ACTIVATION_REGISTRY:
        raise ValueError(
            f"Activation function {act_fn_name!r} is not supported.")

    act_fn = _ACTIVATION_REGISTRY[act_fn_name]
    if (quant_config is not None
            and act_fn_name in quant_config.get_scaled_act_names()):
        if intermediate_size is None:
            raise ValueError("intermediate_size must be specified for scaled "
                             "activation functions.")
        return ScaledActivation(act_fn, intermediate_size, input_is_parallel,
                                params_dtype)
    return act_fn
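

# A minimal usage sketch (illustrative, not part of the original module):
#
#     act = get_act_fn("gelu_new")            # plain activation module
#     act = get_act_fn("gelu_fast",           # wrapped in ScaledActivation
#                      quant_config=my_awq_config,
#                      intermediate_size=11008)
#
# ``my_awq_config`` is a hypothetical placeholder for a QuantizationConfig
# whose get_scaled_act_names() includes the requested activation.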