add option to switch between pp engine

ferdinand.mom 2024-11-04 14:32:44 +00:00
parent f6c9a39d17
commit 519b506b2b
3 changed files with 12 additions and 8 deletions

View File

@@ -15,7 +15,7 @@ def create_single_config(
     tp: int,
     cp: int,
     pp: int,
-    dp: int,
+    pp_engine: str,
     model_name: str,
     num_hidden_layers: Optional[int],
     num_attention_heads: Optional[int],
@@ -49,7 +49,7 @@ def create_single_config(
     config_content['distributed']['tp_size'] = tp
     config_content['distributed']['cp_size'] = cp
     config_content['distributed']['pp_size'] = pp
-    config_content['distributed']['dp_size'] = dp
+    config_content['distributed']['pp_engine'] = pp_engine
     config_content['logging']['use_wandb'] = use_wandb
     config_content['logging']['run_name'] = exp_name
@@ -75,7 +75,7 @@ if __name__ == "__main__":
     parser.add_argument("--tp", type=int, help="number of tensor parallelism", default=1)
     parser.add_argument("--cp", type=int, help="number of context parallelism", default=1)
     parser.add_argument("--pp", type=int, help="number of pipeline parallelism", default=1)
-    parser.add_argument("--dp", type=int, help="number of data parallelism", default=1)
+    parser.add_argument("--pp_engine", type=str, help="pipeline parallel engine", default="afab")
     parser.add_argument("--model_name", type=str, help="Model name to create configs for", default="HuggingFaceTB/SmolLM-360M-Instruct")
     parser.add_argument("--num_hidden_layers", type=int, help="Number of hidden layers", default=None)
     parser.add_argument("--num_attention_heads", type=int, help="Number of attention heads", default=None)
@@ -94,6 +94,7 @@ if __name__ == "__main__":
         cp=args.cp,
         dp=args.dp,
         pp=args.pp,
+        pp_engine=args.pp_engine,
         model_name=args.model_name,
         num_hidden_layers=args.num_hidden_layers,
         num_attention_heads=args.num_attention_heads,
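
The training script in the last file below only accepts "afab" or "1f1b" and raises on anything else, so the new flag could additionally be validated at parse time. A minimal sketch of that alternative, not part of this commit, using argparse's built-in choices:

import argparse

# Sketch only: restricting --pp_engine to the two values the training loop
# understands surfaces typos when the config is created instead of mid-run.
parser = argparse.ArgumentParser()
parser.add_argument("--pp_engine", type=str, choices=["afab", "1f1b"], default="afab",
                    help="pipeline parallel engine")
args = parser.parse_args()
print(args.pp_engine)  # "afab" unless --pp_engine 1f1b is passed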

View File

@@ -4,8 +4,7 @@
         "cp_size": 1,
         "pp_size": 1,
         "dp_size": 1,
-        "master_addr": "localhost",
-        "master_port": 29500,
+        "pp_engine": "afab",
         "backend": "nccl",
         "use_cpu": false
     },
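
The template above is what create_single_config rewrites per run. Roughly, that round trip looks like the sketch below, with placeholder file names since the repository's actual paths are not shown in this diff:

import json

# Sketch with hypothetical file names: load the template, set the new key the
# same way create_single_config does above, and write the run-specific config.
with open("template.json") as f:
    config_content = json.load(f)

config_content["distributed"]["pp_engine"] = "1f1b"  # or keep the "afab" default

with open("config.json", "w") as f:
    json.dump(config_content, f, indent=4)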

View File

@@ -88,8 +88,7 @@ if __name__ == "__main__":
     USE_WANDB = config["logging"]["use_wandb"]
     TP_SIZE = config["distributed"]["tp_size"]
     PP_SIZE = config["distributed"]["pp_size"]
-    DP_SIZE = config["distributed"]["dp_size"]
-    CP_SIZE = config["distributed"]["cp_size"]
+    PP_ENGINE = config["distributed"]["pp_engine"]
     LOAD_PATH = config["checkpoint"]["load_path"]
     CHECKPOINT_DIR = config["checkpoint"]["save_dir"]
     CHECKPOINT_FREQ = config["checkpoint"]["save_frequency"]
@@ -204,7 +203,12 @@ if __name__ == "__main__":
         optimizer.zero_grad()
         if pgm.process_group_manager.pp_world_size > 1:
-            loss = train_step_pipeline_afab(model, data_loader, tensor_shapes, device, dtype)
+            if PP_ENGINE == "afab":
+                loss = train_step_pipeline_afab(model, data_loader, tensor_shapes, device, dtype)
+            elif PP_ENGINE == "1f1b":
+                loss = train_step_pipeline_1f1b(model, data_loader, tensor_shapes, device, dtype)
+            else:
+                raise ValueError(f"Invalid pipeline parallel engine: {PP_ENGINE}")
         else:
             loss = train_step(model, data_loader, device)
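
The two engines map to the standard AFAB (all-forward-all-backward) and 1F1B (one-forward-one-backward) pipeline schedules behind train_step_pipeline_afab and train_step_pipeline_1f1b. As a rough illustration of the difference, and not the repository's implementation, the sketch below prints the order in which a single stage would run forward (F) and backward (B) passes over its micro-batches under each schedule:

def afab_schedule(num_microbatches: int) -> list[str]:
    # All-forward-all-backward: every forward finishes before any backward,
    # so activations for all micro-batches are held in memory at once.
    return [f"F{i}" for i in range(num_microbatches)] + \
           [f"B{i}" for i in range(num_microbatches)]

def one_f_one_b_schedule(num_microbatches: int, stage_rank: int, num_stages: int) -> list[str]:
    # 1F1B: a short warm-up of forwards, then the stage alternates one forward
    # with one backward, capping live activations at roughly the stage count.
    warmup = min(num_stages - stage_rank - 1, num_microbatches)
    order = [f"F{i}" for i in range(warmup)]
    fwd, bwd = warmup, 0
    while bwd < num_microbatches:
        if fwd < num_microbatches:
            order.append(f"F{fwd}")
            fwd += 1
        order.append(f"B{bwd}")
        bwd += 1
    return order

if __name__ == "__main__":
    print("afab:", afab_schedule(4))                                     # F0 F1 F2 F3 B0 B1 B2 B3
    print("1f1b:", one_f_one_b_schedule(4, stage_rank=0, num_stages=2))  # F0 F1 B0 F2 B1 F3 B2 B3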