# coding=utf-8
# adapted from https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/configuration_olmo.py
"""OLMo configuration"""

from transformers import PretrainedConfig


class OLMoConfig(PretrainedConfig):
    model_type = "olmo"

    # Map the standard Hugging Face attribute names onto OLMo's field names.
    attribute_map = {
        "num_attention_heads": "n_heads",
        "hidden_size": "d_model",
        "num_hidden_layers": "n_layers",
    }

    # Note that the defaults for these attributes are equivalent to the base GPT2 model.
    def __init__(
        self,
        d_model=768,
        n_heads=12,
        n_layers=12,
        mlp_ratio=4,
        mlp_hidden_size=None,
        activation_type="swiglu",
        block_type="sequential",
        block_group_size=1,
        alibi=False,
        alibi_bias_max=8.0,
        rope=False,
        rope_full_precision=True,
        multi_query_attention=False,
        attention_layer_norm=False,
        layer_norm_type="default",
        layer_norm_with_affine=True,
        attention_layer_norm_with_affine=True,
        max_sequence_length=1024,
        include_bias=True,
        bias_for_layer_norm=None,
        scale_logits=False,
        vocab_size=50257,
        embedding_size=50304,
        weight_tying=True,
        eos_token_id=50256,
        pad_token_id=50256,
        **kwargs,
    ):
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.mlp_ratio = mlp_ratio
        self.mlp_hidden_size = mlp_hidden_size
        self.activation_type = activation_type
        self.block_type = block_type
        self.block_group_size = block_group_size
        self.alibi = alibi
        self.alibi_bias_max = alibi_bias_max
        self.rope = rope
        self.rope_full_precision = rope_full_precision
        self.multi_query_attention = multi_query_attention
        self.attention_layer_norm = attention_layer_norm
        self.layer_norm_type = layer_norm_type
        self.layer_norm_with_affine = layer_norm_with_affine
        self.attention_layer_norm_with_affine = attention_layer_norm_with_affine
        self.max_sequence_length = max_sequence_length
        self.include_bias = include_bias
        self.bias_for_layer_norm = bias_for_layer_norm
        self.scale_logits = scale_logits
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.weight_tying = weight_tying
        # Pass the special token ids through to PretrainedConfig rather than
        # assigning them directly: the base __init__ pops "eos_token_id" and
        # "pad_token_id" from kwargs (defaulting to None), so direct assignments
        # made before super().__init__() would be overwritten.
        super().__init__(eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)
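

# Usage sketch (not part of the adapted upstream file): builds a config with a
# few non-default architecture settings and exercises the attribute_map aliases.
# The AutoConfig registration is an optional assumption; it is only needed when
# the installed transformers version does not already know the "olmo" model type.
if __name__ == "__main__":
    from transformers import AutoConfig

    config = OLMoConfig(d_model=2048, n_heads=16, n_layers=16, rope=True, alibi=False)

    # attribute_map lets the standard Hugging Face names resolve to OLMo's fields.
    assert config.hidden_size == config.d_model == 2048
    assert config.num_attention_heads == config.n_heads == 16
    assert config.num_hidden_layers == config.n_layers == 16

    # Make AutoConfig aware of this class; registration raises ValueError if the
    # "olmo" model type is already taken, so guard it.
    try:
        AutoConfig.register("olmo", OLMoConfig)
    except ValueError:
        pass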