From 519b506b2bba120a851b09d2c08342f97a475f0a Mon Sep 17 00:00:00 2001
From: "ferdinand.mom"
Date: Mon, 4 Nov 2024 14:32:44 +0000
Subject: [PATCH] add option to switch between pp engines

---
 create_config.py          |  8 ++++----
 template/base_config.json |  3 +--
 train.py                  | 10 +++++++---
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/create_config.py b/create_config.py
index 2522414..f736db7 100644
--- a/create_config.py
+++ b/create_config.py
@@ -15,7 +15,7 @@ def create_single_config(
     tp: int,
     cp: int,
     pp: int,
-    dp: int,
+    pp_engine: str,
     model_name: str,
     num_hidden_layers: Optional[int],
     num_attention_heads: Optional[int],
@@ -49,7 +49,7 @@ def create_single_config(
     config_content['distributed']['tp_size'] = tp
     config_content['distributed']['cp_size'] = cp
     config_content['distributed']['pp_size'] = pp
-    config_content['distributed']['dp_size'] = dp
+    config_content['distributed']['pp_engine'] = pp_engine
 
     config_content['logging']['use_wandb'] = use_wandb
     config_content['logging']['run_name'] = exp_name
@@ -75,7 +75,7 @@ if __name__ == "__main__":
     parser.add_argument("--tp", type=int, help="number of tensor parallelism", default=1)
     parser.add_argument("--cp", type=int, help="number of context parallelism", default=1)
     parser.add_argument("--pp", type=int, help="number of pipeline parallelism", default=1)
-    parser.add_argument("--dp", type=int, help="number of data parallelism", default=1)
+    parser.add_argument("--pp_engine", type=str, help="pipeline parallel engine", default="afab")
     parser.add_argument("--model_name", type=str, help="Model name to create configs for", default="HuggingFaceTB/SmolLM-360M-Instruct")
     parser.add_argument("--num_hidden_layers", type=int, help="Number of hidden layers", default=None)
     parser.add_argument("--num_attention_heads", type=int, help="Number of attention heads", default=None)
@@ -94,6 +94,6 @@ if __name__ == "__main__":
         cp=args.cp,
-        dp=args.dp,
         pp=args.pp,
+        pp_engine=args.pp_engine,
         model_name=args.model_name,
         num_hidden_layers=args.num_hidden_layers,
         num_attention_heads=args.num_attention_heads,
diff --git a/template/base_config.json b/template/base_config.json
index 5d86fe3..6f025ef 100644
--- a/template/base_config.json
+++ b/template/base_config.json
@@ -4,8 +4,7 @@
         "cp_size": 1,
         "pp_size": 1,
         "dp_size": 1,
-        "master_addr": "localhost",
-        "master_port": 29500,
+        "pp_engine": "afab",
         "backend": "nccl",
         "use_cpu": false
     },
diff --git a/train.py b/train.py
index 263a86a..52bf930 100644
--- a/train.py
+++ b/train.py
@@ -88,8 +88,7 @@ if __name__ == "__main__":
     USE_WANDB = config["logging"]["use_wandb"]
     TP_SIZE = config["distributed"]["tp_size"]
     PP_SIZE = config["distributed"]["pp_size"]
-    DP_SIZE = config["distributed"]["dp_size"]
-    CP_SIZE = config["distributed"]["cp_size"]
+    PP_ENGINE = config["distributed"]["pp_engine"]
     LOAD_PATH = config["checkpoint"]["load_path"]
     CHECKPOINT_DIR = config["checkpoint"]["save_dir"]
     CHECKPOINT_FREQ = config["checkpoint"]["save_frequency"]
@@ -204,7 +203,12 @@ if __name__ == "__main__":
         optimizer.zero_grad()
 
         if pgm.process_group_manager.pp_world_size > 1:
-            loss = train_step_pipeline_afab(model, data_loader, tensor_shapes, device, dtype)
+            if PP_ENGINE == "afab":
+                loss = train_step_pipeline_afab(model, data_loader, tensor_shapes, device, dtype)
+            elif PP_ENGINE == "1f1b":
+                loss = train_step_pipeline_1f1b(model, data_loader, tensor_shapes, device, dtype)
+            else:
+                raise ValueError(f"Invalid pipeline parallel engine: {PP_ENGINE}")
         else:
             loss = train_step(model, data_loader, device)
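
---
Usage sketch (editor's note, not part of the commit; flag names are taken from
the argparse changes above, and any other required flags, e.g. an experiment
name, are assumed to keep their existing defaults):

    # generate a config that runs pipeline parallelism with the 1F1B schedule
    python create_config.py --pp 2 --pp_engine 1f1b

    # the default remains the AFAB (all-forward-all-backward) schedule
    python create_config.py --pp 2

At train time, train.py reads config["distributed"]["pp_engine"] and picks
train_step_pipeline_afab for "afab", train_step_pipeline_1f1b for "1f1b", and
raises ValueError for anything else; the setting only takes effect when
pp_world_size > 1.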