add option to switch between pp engine
This commit is contained in:
parent f6c9a39d17
commit 519b506b2b
@@ -15,7 +15,7 @@ def create_single_config(
     tp: int,
     cp: int,
     pp: int,
     dp: int,
+    pp_engine: str,
     model_name: str,
     num_hidden_layers: Optional[int],
     num_attention_heads: Optional[int],
@@ -49,7 +49,7 @@ def create_single_config(
     config_content['distributed']['tp_size'] = tp
     config_content['distributed']['cp_size'] = cp
     config_content['distributed']['pp_size'] = pp
     config_content['distributed']['dp_size'] = dp
+    config_content['distributed']['pp_engine'] = pp_engine
 
     config_content['logging']['use_wandb'] = use_wandb
     config_content['logging']['run_name'] = exp_name
@@ -75,7 +75,7 @@ if __name__ == "__main__":
     parser.add_argument("--tp", type=int, help="number of tensor parallelism", default=1)
     parser.add_argument("--cp", type=int, help="number of context parallelism", default=1)
     parser.add_argument("--pp", type=int, help="number of pipeline parallelism", default=1)
     parser.add_argument("--dp", type=int, help="number of data parallelism", default=1)
+    parser.add_argument("--pp_engine", type=str, help="pipeline parallel engine", default="afab")
     parser.add_argument("--model_name", type=str, help="Model name to create configs for", default="HuggingFaceTB/SmolLM-360M-Instruct")
     parser.add_argument("--num_hidden_layers", type=int, help="Number of hidden layers", default=None)
     parser.add_argument("--num_attention_heads", type=int, help="Number of attention heads", default=None)
@@ -94,6 +94,7 @@ if __name__ == "__main__":
         cp=args.cp,
         dp=args.dp,
         pp=args.pp,
+        pp_engine=args.pp_engine,
         model_name=args.model_name,
         num_hidden_layers=args.num_hidden_layers,
         num_attention_heads=args.num_attention_heads,
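For orientation, here is a minimal, self-contained sketch of how the new --pp_engine flag travels from argparse into the generated JSON config that train.py later reads. It is not the repository's actual create_single_config; the output path config.json and the set of valid engine names are assumptions made for illustration.

# Hypothetical sketch: mirrors the flow added by this commit
# (argparse flag -> config_content['distributed']['pp_engine'] -> JSON on disk).
import argparse
import json

VALID_PP_ENGINES = {"afab", "1f1b"}  # assumed: the two schedules train.py dispatches on

parser = argparse.ArgumentParser()
parser.add_argument("--pp", type=int, default=1, help="number of pipeline parallelism")
parser.add_argument("--pp_engine", type=str, default="afab", help="pipeline parallel engine")
args = parser.parse_args()

if args.pp_engine not in VALID_PP_ENGINES:
    raise ValueError(f"Invalid pipeline parallel engine: {args.pp_engine}")

config_content = {"distributed": {"pp_size": args.pp, "pp_engine": args.pp_engine}}
with open("config.json", "w") as f:  # assumed output path
    json.dump(config_content, f, indent=4)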
@@ -4,8 +4,7 @@
     "cp_size": 1,
     "pp_size": 1,
     "dp_size": 1,
-    "master_addr": "localhost",
-    "master_port": 29500,
+    "pp_engine": "afab",
     "backend": "nccl",
     "use_cpu": false
 },
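As a quick sanity check on the generated file, a hedged loading-and-validation sketch follows; the path config.json and the assertion are illustrative assumptions, while train.py itself simply indexes the same fields as shown below.

# Illustrative only: load the generated config and check the new field before launch.
import json

with open("config.json") as f:  # assumed path
    config = json.load(f)

dist = config["distributed"]
assert dist["pp_engine"] in ("afab", "1f1b"), f"unknown pp_engine: {dist['pp_engine']}"
# total ranks implied by the parallelism degrees
world_size = dist["tp_size"] * dist["cp_size"] * dist["pp_size"] * dist["dp_size"]
print(f"pp_engine={dist['pp_engine']}, expected world size={world_size}")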
train.py (10 changed lines)
@@ -88,8 +88,7 @@ if __name__ == "__main__":
     USE_WANDB = config["logging"]["use_wandb"]
     TP_SIZE = config["distributed"]["tp_size"]
     PP_SIZE = config["distributed"]["pp_size"]
     DP_SIZE = config["distributed"]["dp_size"]
     CP_SIZE = config["distributed"]["cp_size"]
+    PP_ENGINE = config["distributed"]["pp_engine"]
     LOAD_PATH = config["checkpoint"]["load_path"]
     CHECKPOINT_DIR = config["checkpoint"]["save_dir"]
     CHECKPOINT_FREQ = config["checkpoint"]["save_frequency"]
@@ -204,7 +203,12 @@ if __name__ == "__main__":
         optimizer.zero_grad()
 
         if pgm.process_group_manager.pp_world_size > 1:
-            loss = train_step_pipeline_afab(model, data_loader, tensor_shapes, device, dtype)
+            if PP_ENGINE == "afab":
+                loss = train_step_pipeline_afab(model, data_loader, tensor_shapes, device, dtype)
+            elif PP_ENGINE == "1f1b":
+                loss = train_step_pipeline_1f1b(model, data_loader, tensor_shapes, device, dtype)
+            else:
+                raise ValueError(f"Invalid pipeline parallel engine: {PP_ENGINE}")
         else:
             loss = train_step(model, data_loader, device)
 
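For context on the two engines: "afab" (all-forward-all-backward) runs the forward pass for every microbatch before any backward pass, while "1f1b" (one-forward-one-backward) interleaves forwards and backwards in the steady state, which lowers peak activation memory on pipeline stages. The if/elif chain above could equivalently be written as a table-driven dispatch; here is a sketch, under the assumption that the two train-step functions are defined in train.py with the signature used in the diff.

# Sketch only, not part of the commit: equivalent dispatch for the engine choice.
PP_TRAIN_STEPS = {
    "afab": train_step_pipeline_afab,
    "1f1b": train_step_pipeline_1f1b,
}

def pipeline_train_step(pp_engine, model, data_loader, tensor_shapes, device, dtype):
    try:
        step_fn = PP_TRAIN_STEPS[pp_engine]
    except KeyError:
        raise ValueError(f"Invalid pipeline parallel engine: {pp_engine}")
    return step_fn(model, data_loader, tensor_shapes, device, dtype)

# Usage inside the training loop would then be:
# loss = pipeline_train_step(PP_ENGINE, model, data_loader, tensor_shapes, device, dtype)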