picotron/template/base_config.json
{
    "distributed": {
        "tp_size": 1,
        "cp_size": 1,
        "pp_size": 1,
        "dp_size": 1,
        "pp_engine": "1f1b",
        "backend": "nccl",
        "use_cpu": false
    },
    "model": {
        "name": "HuggingFaceTB/SmolLM-360M-Instruct",
        "num_hidden_layers": 16,
        "num_attention_heads": 16,
        "num_key_value_heads": 4,
        "dtype": "bfloat16",
        "use_flash_attention": true,
        "use_fused_adam": true
    },
    "training": {
        "seed": 42,
        "learning_rate": 3e-4,
        "total_train_steps": 200,
        "seq_length": 1024,
        "micro_batch_size": 32,
        "gradient_accumulation_steps": 1,
        "num_samples": 400000,
        "max_tokens": null
    },
    "dataset": {
        "name": "roneneldan/TinyStories",
        "num_workers": 0,
        "num_proc": 4
    },
    "checkpoint": {
        "save_dir": "ckpt",
        "save_frequency": 300,
        "load_path": "",
        "hf_hub_safetensors_path": ""
    },
    "logging": {
        "use_wandb": false,
        "project_name": "picotron",
        "run_name": null
    },
    "environment": {
        "OMP_NUM_THREADS": "1",
        "TOKENIZERS_PARALLELISM": "false",
        "FLASH_ATTEN": "1"
    }
}
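
For reference, below is a minimal sketch of how a template like this might be consumed: load the JSON, check that the four parallelism degrees multiply to the launched world size, and derive the tokens processed per optimizer step. The helper name, the config path, and the WORLD_SIZE check are illustrative assumptions, not picotron's actual entry point.

import json
import os

def load_config(path: str) -> dict:
    """Load a picotron-style JSON config (hypothetical helper, for illustration)."""
    with open(path) as f:
        return json.load(f)

if __name__ == "__main__":
    cfg = load_config("template/base_config.json")

    dist = cfg["distributed"]
    train = cfg["training"]

    # In a 4D-parallel setup, tp * cp * pp * dp should equal the number of
    # launched ranks (a general assumption, not a picotron-specific API).
    parallel_world = dist["tp_size"] * dist["cp_size"] * dist["pp_size"] * dist["dp_size"]
    assert parallel_world == int(os.environ.get("WORLD_SIZE", parallel_world))

    # Tokens per optimizer step:
    # micro_batch_size * gradient_accumulation_steps * dp_size * seq_length.
    tokens_per_step = (
        train["micro_batch_size"]
        * train["gradient_accumulation_steps"]
        * dist["dp_size"]
        * train["seq_length"]
    )
    print(f"tokens per optimizer step: {tokens_per_step}")  # 32 * 1 * 1 * 1024 = 32768

With the default template above (all parallelism degrees set to 1), this corresponds to single-GPU training with 32,768 tokens per step.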