diff --git a/convert_hf_to_picotron.py b/convert_hf_to_picotron.py
index 58871ec..0fb9ee9 100644
--- a/convert_hf_to_picotron.py
+++ b/convert_hf_to_picotron.py
@@ -1,5 +1,5 @@
 """
-torchrun --nproc_per_node=1 convert_hf_to_picotron.py --save_path llama_weights.pth
+torchrun --nproc_per_node=1 convert_hf_to_picotron.py --save_path smollm.pth
 """
 import os
 import argparse
diff --git a/generate.py b/generate.py
index 6a7a2a3..bb6ac4f 100644
--- a/generate.py
+++ b/generate.py
@@ -1,4 +1,4 @@
-#VERBOSE=0 torchrun --nproc_per_node 3 generate.py --pp_size 3
+#VERBOSE=0 torchrun --nproc_per_node 3 generate.py --pp_size 3 --load_path smollm.pth
 import os
 import argparse
 import torch, torch.distributed as dist
@@ -41,18 +41,6 @@ def run_one_inference_step(model, batch, device, config) -> torch.Tensor:
     return logits
 
 
-def load_weights(model: Llama, save_path: str) -> None:
-    state_dict = torch.load(save_path)
-    #TODO: add check that we are not missing any weights
-    for name, param in model.named_parameters():
-        # This assume that the model has only weight parameters
-        new_name = name.split(".weight")[0]
-        module = model.get_submodule(new_name)
-        if name in state_dict:
-            param.data.copy_(state_dict[name])
-
-    dist.barrier()
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--load_path", type=str)
@@ -62,7 +50,7 @@ if __name__ == "__main__":
 
     local_rank, world_size = int(os.environ["LOCAL_RANK"]), int(os.environ["WORLD_SIZE"])
 
-    #TODO: add gloo backend for generation
+    #TODO(fmom): add gloo backend for generation
     dist.init_process_group(backend="nccl")
     torch.cuda.set_device(local_rank)
     device = torch.device("cuda", local_rank)
@@ -73,23 +61,21 @@ if __name__ == "__main__":
 
     model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
     config = AutoConfig.from_pretrained(model_name)
 
-    model = Llama(
+    base_model = Llama(
         config=config,
         device=device,
     )
-    model.load_state_dict(torch.load(args.load_path))
-    # model = PipelineParallel(base_model, config).to(device)
-
-    # del base_model
-
+    base_model.load_state_dict(torch.load(args.load_path))
+    model = PipelineParallel(base_model, config).to(device)
+    del base_model
     model.eval()
 
     # Tokenize the input
     prompts = [
         "My name is",
-        # "How old are you ?",
-        # "What is your favorite color?",
+        "How old are you ?",
+        "What is your favorite color?",
     ]
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
diff --git a/pipeline_parallel.py b/pipeline_parallel.py
index dbfb1f7..1ac3005 100644
--- a/pipeline_parallel.py
+++ b/pipeline_parallel.py
@@ -13,11 +13,12 @@ def reduce_loss_across_dp_ranks(loss, device):
 class PipelineParallel(nn.Module):
     def __init__(self, model, config):
         super().__init__()
+        #TODO(fmom): find a better way to distribute layers without instantiating a base_model first
         layer_distribution = self.distribute_layers(config.num_hidden_layers)
-        self.embed_tokens = model.model.embed_tokens if pgm.process_group_manager.pp_is_first_stage else nn.Identity()
-        self.decoder_layers = nn.ModuleDict({str(i): model.model.layers[i] for i in layer_distribution})
-        self.norm = model.model.norm if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
-        self.lm_head = model.lm_head if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
+        self.embedding = model.embedding if pgm.process_group_manager.pp_is_first_stage else nn.Identity()
+        self.decoder_layers = nn.ModuleDict({str(i): model.decoder_layers[i] for i in layer_distribution})
+        self.final_norm = model.final_norm if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
+        self.final_proj = model.final_proj if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
 
     def distribute_layers(self, num_layers):
         layers_per_gpu = [num_layers // pgm.process_group_manager.pp_world_size + (1 if i < num_layers % pgm.process_group_manager.pp_world_size else 0) for i in range(pgm.process_group_manager.pp_world_size)]
@@ -26,11 +27,11 @@ class PipelineParallel(nn.Module):
 
     def forward(self, batch, device):
         x = batch["hidden_states"].to(device) if batch["hidden_states"] is not None else batch["input_ids"].to(device)
-        x = self.embed_tokens(x)
+        x = self.embedding(x)
         for layer in self.decoder_layers.values():
-            x = layer(x, position_ids=batch["position_index"].to(device))[0]
-        x = self.norm(x)
-        return self.lm_head(x)
+            x = layer(x, position_ids=batch["position_index"].to(device))
+        x = self.final_norm(x)
+        return self.final_proj(x)
 
     def backward(self, input_tensor, output_tensor, output_tensor_grad):
         if input_tensor is not None: input_tensor.retain_grad()
diff --git a/train.py b/train.py
index 7b74286..7c7bb59 100644
--- a/train.py
+++ b/train.py
@@ -13,6 +13,7 @@ from process_group_manager import setup_process_group_manager
 from pipeline_parallel import train_step_pipeline_1f1b, train_step_pipeline_afab, PipelineParallel
 from data_parallel import DataParallel
 from context_parallel import ContextParallel
+from model import Llama
 from dataset import MicroBatchDataLoader
 import wandb
 
@@ -108,7 +109,15 @@ if __name__ == "__main__":
         },
     )
 
-    model = AutoModelForCausalLM.from_pretrained(model_name, config=config).to(device)
+    #TODO: find a better way (should only need to specify model_name + path to .pth)
+    model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
+    config = AutoConfig.from_pretrained(model_name)
+
+    model = Llama(
+        config=config,
+        device=device,
+    ).to(device)
+    model.load_state_dict(torch.load("smollm.pth"))
 
     if pgm.process_group_manager.cp_size > 1:
         model = ContextParallel(model, config).to(device)