{ "distributed": { "tp_size": 1, "cp_size": 1, "pp_size": 1, "dp_size": 1, "pp_engine": "1f1b", "backend": "nccl", "use_cpu": false }, "model": { "name": "HuggingFaceTB/SmolLM-360M-Instruct", "num_hidden_layers": 16, "num_attention_heads": 16, "num_key_value_heads": 4, "dtype": "bfloat16", "use_flash_attention": true, "use_fused_adam": true }, "training": { "seed": 42, "learning_rate": 3e-4, "total_train_steps": 200, "seq_length": 1024, "micro_batch_size": 32, "gradient_accumulation_steps": 1, "num_samples": 400000, "max_tokens": null }, "dataset": { "name": "roneneldan/TinyStories", "subset_name": null, "num_workers": 0, "num_proc": 1 }, "checkpoint": { "save_dir": "ckpt", "save_frequency": 300, "load_path": "" }, "logging": { "use_wandb": false, "project_name": "picotron", "run_name": null }, "environment": { "OMP_NUM_THREADS": "1", "TOKENIZERS_PARALLELISM": "false", "FLASH_ATTEN": "1", "HF_TOKEN": null } }