From 41f49bb15f5e22a250bdf5ec872cb3f3dfe966fc Mon Sep 17 00:00:00 2001
From: "ferdinand.mom"
Date: Mon, 4 Nov 2024 15:06:29 +0000
Subject: [PATCH] rename to grad_acc_steps

---
 create_config.py | 14 +++++++-------
 train.py         | 10 +++++-----
 utils.py         | 17 ++++++-----------
 3 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/create_config.py b/create_config.py
index ad55038..fb080e4 100644
--- a/create_config.py
+++ b/create_config.py
@@ -1,6 +1,6 @@
 """
-python create_config.py --out_dir tmp --exp_name test_2_node --tp 2 --cp 2 --pp 2 --dp 2 --model_name HuggingFaceTB/SmolLM-360M-Instruct --num_attention_heads 16 --num_key_value_heads 4 --grad_acc 1 --mbs 32 --seq_len 4096 --use_wandb
+python create_config.py --out_dir tmp --exp_name test_2_node --tp 2 --cp 2 --pp 2 --dp 2 --model_name HuggingFaceTB/SmolLM-360M-Instruct --num_attention_heads 16 --num_key_value_heads 4 --grad_acc_steps 1 --mbs 32 --seq_len 4096 --use_wandb
 """
 from copy import deepcopy
 from transformers import AutoConfig
@@ -21,7 +21,7 @@ def create_single_config(
     num_hidden_layers: Optional[int],
     num_attention_heads: Optional[int],
     num_key_value_heads: Optional[int],
-    grad_acc: int,
+    grad_acc_steps: int,
     mbs: int,
     seq_len: int,
     exp_name: str,
@@ -58,11 +58,11 @@ def create_single_config(
     config_content['logging']['use_wandb'] = use_wandb
     config_content['logging']['run_name'] = exp_name
 
-    gbs = dp * mbs * grad_acc
+    gbs = dp * mbs * grad_acc_steps
     gbs_token = gbs * seq_len
-    print(f"Gbs_token: {gbs_token:,}, Gbs: {gbs}, dp: {dp}, seq_len: {seq_len}, grad_acc: {grad_acc}, mbs: {mbs}")
+    print(f"Gbs_token: {gbs_token:,}, Gbs: {gbs}, dp: {dp}, seq_len: {seq_len}, grad_acc_steps: {grad_acc_steps}, mbs: {mbs}")
 
-    config_content['training']['gradient_accumulation_steps'] = grad_acc
+    config_content['training']['gradient_accumulation_steps'] = grad_acc_steps
     config_content['training']['micro_batch_size'] = mbs
 
     if os.path.exists(run_path):
@@ -85,7 +85,7 @@ if __name__ == "__main__":
     parser.add_argument("--num_hidden_layers", type=int, help="Number of hidden layers", default=None)
     parser.add_argument("--num_attention_heads", type=int, help="Number of attention heads", default=None)
     parser.add_argument("--num_key_value_heads", type=int, help="Number of key value heads", default=None)
-    parser.add_argument("--grad_acc", type=int, help="grad accumulation", default=1)
+    parser.add_argument("--grad_acc_steps", type=int, help="grad accumulation", default=1)
     parser.add_argument("--mbs", type=int, help="micro batch size", default=1)
     parser.add_argument("--seq_len", type=int, help="Sequence length", default=1024)
     parser.add_argument("--exp_name", type=str, help="Experiment name", default="dummy_exp")
@@ -105,7 +105,7 @@ if __name__ == "__main__":
         num_hidden_layers=args.num_hidden_layers,
         num_attention_heads=args.num_attention_heads,
         num_key_value_heads=args.num_key_value_heads,
-        grad_acc=args.grad_acc,
+        grad_acc_steps=args.grad_acc_steps,
         mbs=args.mbs,
         seq_len=args.seq_len,
         exp_name=args.exp_name,
diff --git a/train.py b/train.py
index 1137ef5..309653a 100644
--- a/train.py
+++ b/train.py
@@ -34,7 +34,7 @@ def train_step(model, data_loader, device):
     acc_loss = 0.0
 
     requires_grad_sync = pgm.process_group_manager.cp_dp_world_size > 1
 
-    for i in range(data_loader.grad_acc):
+    for i in range(data_loader.grad_acc_steps):
         # get the next batch
         batch = next(data_loader)
         input_ids = batch["input_ids"].to(device)
 
         # disable gradient synchronization for all but the last micro-batch
         if requires_grad_sync:
-            model.require_backward_grad_sync = (i == data_loader.grad_acc - 1)
+            model.require_backward_grad_sync = (i == data_loader.grad_acc_steps - 1)
 
         outputs = model(input_ids=input_ids)
@@ -50,7 +50,7 @@
         batch_size, seq_len = input_ids.shape
         target_ids = target_ids.reshape(-1)
         outputs = outputs.view(seq_len*batch_size, -1)
-        loss = F.cross_entropy(outputs, target_ids, reduction='mean') / data_loader.grad_acc
+        loss = F.cross_entropy(outputs, target_ids, reduction='mean') / data_loader.grad_acc_steps
 
         loss.backward()
@@ -82,7 +82,7 @@ if __name__ == "__main__":
     MAX_TOKENS = config["training"]["max_tokens"]
     SEED = config["training"]["seed"]
     TOTAL_TRAIN_STEPS = config["training"]["total_train_steps"]
-    GRAD_ACC = config["training"]["gradient_accumulation_steps"]
+    GRAD_ACC_STEPS = config["training"]["gradient_accumulation_steps"]
     MODEL_NAME = config["model"]["name"]
     DATASET_NAME = config["dataset"]["name"]
     NUM_WORKERS = config["dataset"]["num_workers"]
@@ -168,7 +168,7 @@ if __name__ == "__main__":
             "seed": SEED,
             "micro_batch_size": data_loader.micro_batch_size,
            "global_batch_size": data_loader.global_batch_size,
-            "gradient_accumulation": GRAD_ACC,
+            "gradient_accumulation": data_loader.grad_acc_steps,
         },
     )
diff --git a/utils.py b/utils.py
index ad4a355..6a1cf4a 100644
--- a/utils.py
+++ b/utils.py
@@ -5,7 +5,7 @@ import numpy as np
 import builtins
 import fcntl
 import src.distributed.process_group_manager as pgm
-import torch, torch.distributed as dist
+import torch
 from torch.utils.data import DataLoader, DistributedSampler
 from functools import partial
 from datasets import Features, Sequence, Value, load_dataset
@@ -75,15 +75,12 @@ def load_checkpoint(model, optimizer, out_dir):
     return checkpoint['trained_steps'], checkpoint['trained_tokens']
 
 class MicroBatchDataLoader(DataLoader):
-    def __init__(self, micro_batch_size, seq_length, dataset_name, tokenizer_name, num_workers, num_proc, grad_acc, split="train", num_samples=None):
+    def __init__(self, micro_batch_size, seq_length, dataset_name, tokenizer_name, num_workers, num_proc, grad_acc_steps, split="train", num_samples=None):
         self.micro_batch_size = micro_batch_size
         self.seq_length = seq_length
-        self.grad_acc = grad_acc
-
-        self.local_batch_size = micro_batch_size * grad_acc
-        self.global_batch_size = self.local_batch_size * pgm.process_group_manager.dp_world_size
-        self.num_local_micro_batches = self.local_batch_size // self.micro_batch_size
+        self.grad_acc_steps = grad_acc_steps
+        self.global_batch_size = micro_batch_size * grad_acc_steps * pgm.process_group_manager.dp_world_size
         self.num_global_micro_batches = self.global_batch_size // self.micro_batch_size
 
         self.seq_length_per_gpu = seq_length // pgm.process_group_manager.cp_world_size
@@ -93,8 +90,6 @@ class MicroBatchDataLoader(DataLoader):
         if num_samples:
             self.dataset = self.dataset.select(range(min(num_samples, len(self.dataset))))
 
-        dist.barrier()
-
         # Tokenize and chunk the dataset
         self.tokenized_dataset = self.tokenize_dataset(self.dataset, "text", self.seq_length, num_proc)
@@ -106,8 +101,8 @@ class MicroBatchDataLoader(DataLoader):
         )
 
         super().__init__(
-            self.tokenized_dataset,
-            batch_size=micro_batch_size if pgm.process_group_manager.pp_world_size > 1 else self.local_batch_size, # in PP we split a single batch into multiple micro-batches
+            self.tokenized_dataset,
+            batch_size=micro_batch_size,
             collate_fn=self.collate_batch,
             pin_memory=True,
             num_workers=num_workers,
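
Note (illustration, not part of the patch): a minimal, self-contained Python sketch of the batch-size arithmetic that the renamed grad_acc_steps parameter drives in create_config.py and MicroBatchDataLoader. All values below are made-up examples, and the plain variables stand in for the real config fields and pgm.process_group_manager sizes.

# Hypothetical example values; in the real code these come from the YAML config
# and from pgm.process_group_manager.
dp_world_size = 2       # number of data-parallel replicas
mbs = 32                # micro_batch_size per replica
grad_acc_steps = 4      # micro-batches accumulated before each optimizer step
seq_len = 4096

gbs = dp_world_size * mbs * grad_acc_steps  # global batch size in samples
gbs_tokens = gbs * seq_len                  # tokens consumed per optimizer step
print(f"global_batch_size={gbs}, tokens_per_step={gbs_tokens:,}")

# Each replica runs grad_acc_steps forward/backward passes and divides every
# micro-batch loss by grad_acc_steps, so the accumulated gradient matches a
# single batch of gbs samples (this mirrors the division done in train_step).
micro_losses = [2.0, 1.8, 1.9, 1.7]         # dummy per-micro-batch losses
step_loss = sum(l / grad_acc_steps for l in micro_losses)
print(f"loss reported for this optimizer step: {step_loss:.3f}")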