diff --git a/convert_hf_to_picotron.py b/convert_hf_to_picotron.py
index 58871ec..0fb9ee9 100644
--- a/convert_hf_to_picotron.py
+++ b/convert_hf_to_picotron.py
@@ -1,5 +1,5 @@
 """
-torchrun --nproc_per_node=1 convert_hf_to_picotron.py --save_path llama_weights.pth
+torchrun --nproc_per_node=1 convert_hf_to_picotron.py --save_path smollm.pth
 """
 import os
 import argparse
diff --git a/generate.py b/generate.py
index 6a7a2a3..bb6ac4f 100644
--- a/generate.py
+++ b/generate.py
@@ -1,4 +1,4 @@
-#VERBOSE=0 torchrun --nproc_per_node 3 generate.py --pp_size 3
+#VERBOSE=0 torchrun --nproc_per_node 3 generate.py --pp_size 3 --load_path smollm.pth
 import os
 import argparse
 import torch, torch.distributed as dist
@@ -41,18 +41,6 @@ def run_one_inference_step(model, batch, device, config) -> torch.Tensor:
     return logits
 
 
-def load_weights(model: Llama, save_path: str) -> None:
-    state_dict = torch.load(save_path)
-    #TODO: add check that we are not missing any weights
-    for name, param in model.named_parameters():
-        # This assume that the model has only weight parameters
-        new_name = name.split(".weight")[0]
-        module = model.get_submodule(new_name)
-        if name in state_dict:
-            param.data.copy_(state_dict[name])
-
-    dist.barrier()
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--load_path", type=str)
@@ -62,7 +50,7 @@ if __name__ == "__main__":
 
     local_rank, world_size = int(os.environ["LOCAL_RANK"]), int(os.environ["WORLD_SIZE"])
 
-    #TODO: add gloo backend for generation
+    #TODO(fmom): add gloo backend for generation
     dist.init_process_group(backend="nccl")
     torch.cuda.set_device(local_rank)
     device = torch.device("cuda", local_rank)
@@ -73,23 +61,21 @@ if __name__ == "__main__":
 
     model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
     config = AutoConfig.from_pretrained(model_name)
 
-    model = Llama(
+    base_model = Llama(
         config=config,
         device=device,
     )
-    model.load_state_dict(torch.load(args.load_path))
-    # model = PipelineParallel(base_model, config).to(device)
-
-    # del base_model
-
+    base_model.load_state_dict(torch.load(args.load_path))
+    model = PipelineParallel(base_model, config).to(device)
+    del base_model
     model.eval()
 
     # Tokenize the input
     prompts = [
         "My name is",
-        # "How old are you ?",
-        # "What is your favorite color?",
+        "How old are you ?",
+        "What is your favorite color?",
     ]
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
diff --git a/pipeline_parallel.py b/pipeline_parallel.py
index dbfb1f7..1ac3005 100644
--- a/pipeline_parallel.py
+++ b/pipeline_parallel.py
@@ -13,11 +13,12 @@ def reduce_loss_across_dp_ranks(loss, device):
 class PipelineParallel(nn.Module):
     def __init__(self, model, config):
         super().__init__()
+        #TODO(fmom): find a better way to distribute layers without instantiating a base_model first
         layer_distribution = self.distribute_layers(config.num_hidden_layers)
-        self.embed_tokens = model.model.embed_tokens if pgm.process_group_manager.pp_is_first_stage else nn.Identity()
-        self.decoder_layers = nn.ModuleDict({str(i): model.model.layers[i] for i in layer_distribution})
-        self.norm = model.model.norm if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
-        self.lm_head = model.lm_head if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
+        self.embedding = model.embedding if pgm.process_group_manager.pp_is_first_stage else nn.Identity()
+        self.decoder_layers = nn.ModuleDict({str(i): model.decoder_layers[i] for i in layer_distribution})
+        self.final_norm = model.final_norm if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
+        self.final_proj = model.final_proj if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
 
     def distribute_layers(self, num_layers):
         layers_per_gpu = [num_layers // pgm.process_group_manager.pp_world_size + (1 if i < num_layers % pgm.process_group_manager.pp_world_size else 0) for i in range(pgm.process_group_manager.pp_world_size)]
@@ -26,11 +27,11 @@ class PipelineParallel(nn.Module):
 
     def forward(self, batch, device):
         x = batch["hidden_states"].to(device) if batch["hidden_states"] is not None else batch["input_ids"].to(device)
-        x = self.embed_tokens(x)
+        x = self.embedding(x)
         for layer in self.decoder_layers.values():
-            x = layer(x, position_ids=batch["position_index"].to(device))[0]
-        x = self.norm(x)
-        return self.lm_head(x)
+            x = layer(x, position_ids=batch["position_index"].to(device))
+        x = self.final_norm(x)
+        return self.final_proj(x)
 
     def backward(self, input_tensor, output_tensor, output_tensor_grad):
         if input_tensor is not None: input_tensor.retain_grad()
diff --git a/train.py b/train.py
index 7b74286..7c7bb59 100644
--- a/train.py
+++ b/train.py
@@ -13,6 +13,7 @@ from process_group_manager import setup_process_group_manager
 from pipeline_parallel import train_step_pipeline_1f1b, train_step_pipeline_afab, PipelineParallel
 from data_parallel import DataParallel
 from context_parallel import ContextParallel
+from model import Llama
 from dataset import MicroBatchDataLoader
 import wandb
 
@@ -108,7 +109,15 @@ if __name__ == "__main__":
         },
     )
 
-    model = AutoModelForCausalLM.from_pretrained(model_name, config=config).to(device)
+    #TODO: find a better way (should only need to specify model_name + path to .pth)
+    model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
+    config = AutoConfig.from_pretrained(model_name)
+
+    model = Llama(
+        config=config,
+        device=device,
+    ).to(device)
+    model.load_state_dict(torch.load("smollm.pth"))
 
     if pgm.process_group_manager.cp_size > 1:
         model = ContextParallel(model, config).to(device)