flash-attention/training/configs/experiment/owt/gpt2xl.yaml
# @package _global_
defaults:
  - /experiment/owt/gpt2m.yaml
  - override /model/gpt2model: gpt2-xlarge
  - override /optimizer: adamw-zero

datamodule:
  batch_size: 2  # Per GPU

trainer:
  strategy:
    _target_: src.utils.ddp_zero1.DDPStrategyZero1
    find_unused_parameters: False
    gradient_as_bucket_view: True
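
For context: the `adamw-zero` optimizer override together with the `src.utils.ddp_zero1.DDPStrategyZero1` strategy points at ordinary DDP combined with ZeRO stage-1 style optimizer-state sharding. The sketch below is not the repo's implementation, just a minimal PyTorch illustration of that combination using `DistributedDataParallel` and `torch.distributed.optim.ZeroRedundancyOptimizer`; the model and hyperparameters are placeholders, and a torchrun-style launcher is assumed.

```python
# Illustrative only: DDP + ZeRO-1 optimizer-state sharding, the combination the
# config above selects. Model and hyperparameters are placeholders, not values
# taken from the repo.
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.optim import ZeroRedundancyOptimizer


def setup_model_and_optimizer(local_rank: int):
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(local_rank)

    # Placeholder module standing in for the GPT-2 XL model.
    model = torch.nn.Linear(1600, 1600).cuda(local_rank)

    # These kwargs mirror the strategy settings in the YAML above.
    ddp_model = DDP(
        model,
        device_ids=[local_rank],
        find_unused_parameters=False,
        gradient_as_bucket_view=True,
    )

    # ZeRO stage 1: each rank holds only a shard of the AdamW optimizer state.
    optimizer = ZeroRedundancyOptimizer(
        ddp_model.parameters(),
        optimizer_class=torch.optim.AdamW,
        lr=2e-4,
    )
    return ddp_model, optimizer
```

Under torchrun, `local_rank` would come from the `LOCAL_RANK` environment variable set by the launcher.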