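"""Generate a single nanotron training config (YAML) for a chosen model preset
and parallelism layout (dp/tp/pp), with optional lighteval and S3-upload sections.
"""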
import argparse
import os
import shutil
from copy import deepcopy
from typing import Optional

import yaml
from transformers import AutoTokenizer

from template.template_base_configs import template_base_config


def update_config_based_on_model(model: str, config: dict):
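    """Fill the architecture fields of config["model"]["model_config"] for a
    named model preset, sync max_position_embeddings with the configured
    sequence length, and set vocab_size from the tokenizer.
    """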
    # Set num_attention_heads == num_key_value_heads for every model,
    # i.e. use plain multi-head attention (MHA) on all layers.
    if model == "small-llama":
        config["model"]["model_config"]["hidden_size"] = 512
        config["model"]["model_config"]["intermediate_size"] = 1024
        config["model"]["model_config"]["num_attention_heads"] = 16
        config["model"]["model_config"]["num_hidden_layers"] = 10
        config["model"]["model_config"]["num_key_value_heads"] = 16
        config["model"]["model_config"]["max_position_embeddings"] = config["tokens"]["sequence_length"]
    elif model == "llama-1M":
        config["model"]["model_config"]["hidden_size"] = 768
        config["model"]["model_config"]["intermediate_size"] = 3072
        config["model"]["model_config"]["num_attention_heads"] = 16
        config["model"]["model_config"]["num_hidden_layers"] = 12
        config["model"]["model_config"]["num_key_value_heads"] = 16
        config["model"]["model_config"]["max_position_embeddings"] = config["tokens"]["sequence_length"]
    elif model == "llama-1B":
        # HuggingFaceFW/ablation-model-fineweb-v1
        config["model"]["model_config"]["hidden_size"] = 2048
        config["model"]["model_config"]["intermediate_size"] = 4096
        config["model"]["model_config"]["num_attention_heads"] = 32
        config["model"]["model_config"]["num_hidden_layers"] = 24
        config["model"]["model_config"]["num_key_value_heads"] = 32
        config["model"]["model_config"]["max_position_embeddings"] = config["tokens"]["sequence_length"]
    else:
        raise ValueError(f"Unknown model preset: {model!r}")

    tokenizer = AutoTokenizer.from_pretrained(config["tokenizer"]["tokenizer_name_or_path"])
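    # Note: tokenizer.vocab_size excludes added special tokens; len(tokenizer)
    # would be needed if the embedding table has to cover those as well.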
    config["model"]["model_config"]["vocab_size"] = tokenizer.vocab_size


def create_single_config(
    out_dir: str,
    model: str,
    gpus: int,
    dp: int,
    tp: int,
    pp: int,
    bapr: int,
    mbs: int,
    no_profiler: bool = False,
    cluster: str = "hf",
    exp_name: Optional[str] = None,
    seq_len: int = 4096,
    lighteval: bool = False,
    s3: bool = False,
    # recompute_layer: bool = False,
    dry_run: bool = False,
):
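    """Write one nanotron YAML config for a single (dp, tp, pp, bapr, mbs)
    parallelism point to <out_dir>/<exp_name>/config.yaml (skipped on dry_run).
    """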
    run_path = os.path.join(out_dir, exp_name)

    os.makedirs(out_dir, exist_ok=True)

    print(f"Creating single config for {model} given {gpus} GPUs")
    config_content = deepcopy(template_base_config)
    config_content["tokens"]["sequence_length"] = seq_len
    # config_content["parallelism"]["recompute_layer"] = recompute_layer
    config_content["checkpoints"]["checkpoints_path"] = run_path
    update_config_based_on_model(model, config_content)
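
    # Upper bound on tensor parallelism per cluster (presumably GPUs per node).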
    if cluster == "hf":
        tp_max_cluster = 8
    elif cluster == "swiss-ai":
        tp_max_cluster = 4  # GH200
    else:
        raise ValueError(f"Unknown cluster: {cluster!r}")
    assert tp <= tp_max_cluster, f"tp={tp} exceeds tp_max_cluster={tp_max_cluster} for cluster {cluster!r}"

    config_content["parallelism"]["dp"] = dp
    config_content["parallelism"]["tp"] = tp
    config_content["parallelism"]["pp"] = pp

    # Compute the global batch size, in samples and in tokens.
    gbs = dp * mbs * bapr
    gbs_token = gbs * seq_len
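    # Worked example (illustrative numbers): dp=8, mbs=4, bapr=16 gives
    # gbs = 8 * 4 * 16 = 512 samples per step; with seq_len=4096 that is
    # 512 * 4096 = 2,097,152 tokens per optimizer step.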
    # Print in a human-readable format.
    print(f"Gbs_token: {gbs_token:,}, Gbs: {gbs}, dp: {dp}, seq_len: {seq_len}, bapr: {bapr}, mbs: {mbs}")

    config_content["tokens"]["batch_accumulation_per_replica"] = bapr
    config_content["tokens"]["micro_batch_size"] = mbs

    # Create a directory for each combination of parallelism.
    # if recompute_layer:
    #     run_path += "_recompute_layer"

    # The profiler export path must be absolute.
    if no_profiler:
        config_content["profiler"] = None
    else:
        config_content["profiler"]["profiler_export_path"] = os.path.abspath(run_path)
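
    # Optionally stream checkpoints to S3 via s5cmd.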
    if s3:
        config_content["general"]["is_s3_available"] = True
        config_content["s3_upload"] = {
            "remove_after_upload": True,
            "s5cmd_concurrency": 5,
            "s5cmd_numworkers": 16,
            "s5cmd_path": "/fsx/elie_bakouch/miniconda3/envs/smollm/bin/s5cmd",
            "upload_s3_path": f"s3://huggingface-brrr-us-east-1/fmom/nanotron_pr/{exp_name}",
        }
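
    # Optional in-training evaluation with lighteval on the "early-signal" task set.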
    if lighteval:
        config_content["lighteval"] = {
            "batch_size": 16,
            "generation": None,
            "logging": {
                "output_dir": None,
                "public_run": False,
                "push_to_hub": True,
                "push_to_tensorboard": True,
                "results_org": "HuggingFaceSmol",
                "save_details": True,
                "tensorboard_metric_prefix": "eval",
            },
            "parallelism": {
                "dp": dp,
                "expert_parallel_size": 1,
                "pp": pp,
                "pp_engine": "1f1b",
                "recompute_layer": False,
                "tp": tp,
                "tp_linear_async_communication": False,
                "tp_mode": "ALL_REDUCE",
                "tp_recompute_allgather": True,
            },
            "tasks": {
                "custom_tasks": "nanotron.lighteval.evaluation_tasks",
                "dataset_loading_processes": 8,
                "max_samples": 1000,
                "multichoice_continuations_start_space": None,
                "num_fewshot_seeds": None,
                "pair_wise_tokenization": False,
                "tasks": "early-signal",
            },
        }

    if not dry_run:
        # Start from a clean run directory so stale configs never linger.
        if os.path.exists(run_path):
            shutil.rmtree(run_path)
        os.makedirs(run_path)
        with open(os.path.join(run_path, "config.yaml"), "w") as new_config:
            yaml.dump(config_content, new_config, default_flow_style=False, sort_keys=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--out_dir", type=str, required=True, help="Output directory to store the configs")
    parser.add_argument("--model", type=str, required=True, help="Model preset to create a config for")
    parser.add_argument("--gpus", type=int, help="Number of GPUs")
    parser.add_argument("--dp", type=int, required=True, help="Data-parallel degree")
    parser.add_argument("--tp", type=int, required=True, help="Tensor-parallel degree")
    parser.add_argument("--pp", type=int, required=True, help="Pipeline-parallel degree")
    parser.add_argument("--bapr", type=int, required=True, help="Batch accumulation per replica")
    parser.add_argument("--mbs", type=int, required=True, help="Micro batch size")
    parser.add_argument("--seq_len", type=int, default=4096, help="Sequence length")
    parser.add_argument("--exp_name", type=str, required=True, help="Experiment name")
    parser.add_argument("--recompute_layer", action="store_true", help="Enable layer recomputation (currently unused)")
    parser.add_argument("--use_async", action="store_true", help="Enable async communication for tensor parallelism (currently unused)")
    parser.add_argument("--lighteval", action="store_true", help="Enable lighteval evaluation")
    parser.add_argument("--s3", action="store_true", help="Enable checkpoint upload to S3")

    args = parser.parse_args()

    create_single_config(
        out_dir=args.out_dir,
        model=args.model,
        gpus=args.gpus,
        dp=args.dp,
        tp=args.tp,
        pp=args.pp,
        bapr=args.bapr,
        mbs=args.mbs,
        cluster="hf",
        exp_name=args.exp_name,
        seq_len=args.seq_len,
        # recompute_layer=args.recompute_layer,
        lighteval=args.lighteval,
        s3=args.s3,
        dry_run=False,
        no_profiler=True,
    )
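
# Example invocation (illustrative values; the script filename is hypothetical):
#   python create_single_config.py --out_dir configs --model llama-1B \
#       --dp 8 --tp 2 --pp 1 --bapr 16 --mbs 4 --seq_len 4096 --exp_name llama-1B-baseline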