make new modeling compatible with training

ferdinand.mom 2024-10-10 15:08:23 +00:00
parent 770800b978
commit 47581d29e9
4 changed files with 28 additions and 32 deletions

View File

@@ -1,5 +1,5 @@
 """
-torchrun --nproc_per_node=1 convert_hf_to_picotron.py --save_path llama_weights.pth
+torchrun --nproc_per_node=1 convert_hf_to_picotron.py --save_path smollm.pth
 """
 import os
 import argparse
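Note: this hunk only updates the example save path; the conversion logic itself is outside the diff. Below is a minimal sketch of the kind of key remapping such a converter performs, assuming the picotron Llama uses the attribute names introduced later in this commit (embedding, decoder_layers, final_norm, final_proj); the actual mapping in convert_hf_to_picotron.py may differ.

    # Illustrative only, not the script's real code: re-save HF weights under
    # picotron-style parameter names so load_state_dict matches directly.
    import torch
    from transformers import AutoModelForCausalLM

    hf_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-360M-Instruct")
    converted = {}
    for name, tensor in hf_model.state_dict().items():
        # Hypothetical renaming; only illustrates matching picotron's names.
        new_name = (name.replace("model.embed_tokens", "embedding")
                        .replace("model.layers", "decoder_layers")
                        .replace("model.norm", "final_norm")
                        .replace("lm_head", "final_proj"))
        converted[new_name] = tensor
    torch.save(converted, "smollm.pth")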

View File

@@ -1,4 +1,4 @@
-#VERBOSE=0 torchrun --nproc_per_node 3 generate.py --pp_size 3
+#VERBOSE=0 torchrun --nproc_per_node 3 generate.py --pp_size 3 --load_path smollm.pth
 import os
 import argparse
 import torch, torch.distributed as dist
@@ -41,18 +41,6 @@ def run_one_inference_step(model, batch, device, config) -> torch.Tensor:
     return logits
-def load_weights(model: Llama, save_path: str) -> None:
-    state_dict = torch.load(save_path)
-    #TODO: add check that we are not missing any weights
-    for name, param in model.named_parameters():
-        # This assume that the model has only weight parameters
-        new_name = name.split(".weight")[0]
-        module = model.get_submodule(new_name)
-        if name in state_dict:
-            param.data.copy_(state_dict[name])
-    dist.barrier()
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--load_path", type=str)
@@ -62,7 +50,7 @@ if __name__ == "__main__":
     local_rank, world_size = int(os.environ["LOCAL_RANK"]), int(os.environ["WORLD_SIZE"])
-    #TODO: add gloo backend for generation
+    #TODO(fmom): add gloo backend for generation
     dist.init_process_group(backend="nccl")
     torch.cuda.set_device(local_rank)
     device = torch.device("cuda", local_rank)
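Note: the gloo TODO stays open in this commit. One common way to let generation fall back to CPU-only runs, sketched here as a suggestion rather than project code:

    # Pick the backend at runtime; torchrun still provides LOCAL_RANK and the
    # rendezvous env vars either way.
    import os
    import torch
    import torch.distributed as dist

    local_rank = int(os.environ["LOCAL_RANK"])
    backend = "nccl" if torch.cuda.is_available() else "gloo"
    dist.init_process_group(backend=backend)
    if backend == "nccl":
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
    else:
        device = torch.device("cpu")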
@@ -73,23 +61,21 @@ if __name__ == "__main__":
     model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
     config = AutoConfig.from_pretrained(model_name)
-    model = Llama(
+    base_model = Llama(
         config=config,
         device=device,
     )
-    model.load_state_dict(torch.load(args.load_path))
-    # model = PipelineParallel(base_model, config).to(device)
-    # del base_model
+    base_model.load_state_dict(torch.load(args.load_path))
+    model = PipelineParallel(base_model, config).to(device)
+    del base_model
     model.eval()
     # Tokenize the input
     prompts = [
         "My name is",
-        # "How old are you ?",
-        # "What is your favorite color?",
+        "How old are you ?",
+        "What is your favorite color?",
     ]
     tokenizer = AutoTokenizer.from_pretrained(model_name)

View File

@@ -13,11 +13,12 @@ def reduce_loss_across_dp_ranks(loss, device):
 class PipelineParallel(nn.Module):
     def __init__(self, model, config):
         super().__init__()
+        #TODO(fmom): find a better model to distributed layers without instantiating a base_model first
         layer_distribution = self.distribute_layers(config.num_hidden_layers)
-        self.embed_tokens = model.model.embed_tokens if pgm.process_group_manager.pp_is_first_stage else nn.Identity()
-        self.decoder_layers = nn.ModuleDict({str(i): model.model.layers[i] for i in layer_distribution})
-        self.norm = model.model.norm if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
-        self.lm_head = model.lm_head if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
+        self.embedding = model.embedding if pgm.process_group_manager.pp_is_first_stage else nn.Identity()
+        self.decoder_layers = nn.ModuleDict({str(i): model.decoder_layers[i] for i in layer_distribution})
+        self.final_norm = model.final_norm if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
+        self.final_proj = model.final_proj if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
     def distribute_layers(self, num_layers):
         layers_per_gpu = [num_layers // pgm.process_group_manager.pp_world_size + (1 if i < num_layers % pgm.process_group_manager.pp_world_size else 0) for i in range(pgm.process_group_manager.pp_world_size)]
@@ -26,11 +27,11 @@ class PipelineParallel(nn.Module):
     def forward(self, batch, device):
         x = batch["hidden_states"].to(device) if batch["hidden_states"] is not None else batch["input_ids"].to(device)
-        x = self.embed_tokens(x)
+        x = self.embedding(x)
         for layer in self.decoder_layers.values():
-            x = layer(x, position_ids=batch["position_index"].to(device))[0]
-        x = self.norm(x)
-        return self.lm_head(x)
+            x = layer(x, position_ids=batch["position_index"].to(device))
+        x = self.final_norm(x)
+        return self.final_proj(x)
     def backward(self, input_tensor, output_tensor, output_tensor_grad):
         if input_tensor is not None: input_tensor.retain_grad()
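Note: distribute_layers splits the decoder evenly across pipeline ranks, giving one extra layer to the first num_layers % pp_world_size ranks. A worked example of that split follows; the start-offset part is an assumption, since the rest of the method lies outside this hunk.

    def layer_indices(num_layers, pp_world_size, pp_rank):
        # Same split as the list comprehension above: each rank gets
        # num_layers // pp_world_size layers, the first ranks one more.
        layers_per_gpu = [num_layers // pp_world_size + (1 if i < num_layers % pp_world_size else 0)
                          for i in range(pp_world_size)]
        start = sum(layers_per_gpu[:pp_rank])
        return list(range(start, start + layers_per_gpu[pp_rank]))

    # e.g. 32 decoder layers over the --pp_size 3 setup from generate.py:
    # rank 0 owns layers 0-10, rank 1 owns 11-21, rank 2 owns 22-31.
    print(layer_indices(32, 3, 1))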

View File

@@ -13,6 +13,7 @@ from process_group_manager import setup_process_group_manager
 from pipeline_parallel import train_step_pipeline_1f1b, train_step_pipeline_afab, PipelineParallel
 from data_parallel import DataParallel
 from context_parallel import ContextParallel
+from model import Llama
 from dataset import MicroBatchDataLoader
 import wandb
@@ -108,7 +109,15 @@ if __name__ == "__main__":
         },
     )
-    model = AutoModelForCausalLM.from_pretrained(model_name, config=config).to(device)
+    #TODO: find a better way (should need to specify model_name + path to .pth)
+    model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
+    config = AutoConfig.from_pretrained(model_name)
+    model = Llama(
+        config=config,
+        device=device,
+    ).to(device)
+    model.load_state_dict(torch.load("smollm.pth"))
     if pgm.process_group_manager.cp_size > 1:
         model = ContextParallel(model, config).to(device)
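Note: train.py now builds the full Llama on every rank and loads smollm.pth before any parallel wrapper is applied, so a key mismatch fails fast (load_state_dict is strict by default). An optional, more verbose check in the spirit of the removed TODO in generate.py, not part of this commit and assuming it runs right after the load_state_dict call above:

    # Report exactly which keys disagree between the converted checkpoint and
    # the new Llama, instead of relying on load_state_dict's error message.
    checkpoint = torch.load("smollm.pth")
    model_keys, ckpt_keys = set(model.state_dict()), set(checkpoint)
    print("missing from checkpoint:", sorted(model_keys - ckpt_keys))
    print("unexpected in checkpoint:", sorted(ckpt_keys - model_keys))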