make new modeling compatible with training

ferdinand.mom 2024-10-10 15:08:23 +00:00
parent 770800b978
commit 47581d29e9
4 changed files with 28 additions and 32 deletions

View File

@@ -1,5 +1,5 @@
 """
-torchrun --nproc_per_node=1 convert_hf_to_picotron.py --save_path llama_weights.pth
+torchrun --nproc_per_node=1 convert_hf_to_picotron.py --save_path smollm.pth
 """
 import os
 import argparse
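Note: this hunk only updates the example save path; the conversion logic itself is outside the diff. Below is a minimal sketch of the kind of key remapping such a converter performs, assuming the picotron Llama uses the attribute names introduced later in this commit (embedding, decoder_layers, final_norm, final_proj); the actual mapping in convert_hf_to_picotron.py may differ.

    # Illustrative only, not the script's real code: re-save HF weights under
    # picotron-style parameter names so load_state_dict matches directly.
    import torch
    from transformers import AutoModelForCausalLM

    hf_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-360M-Instruct")
    converted = {}
    for name, tensor in hf_model.state_dict().items():
        # Hypothetical renaming; only illustrates matching picotron's names.
        new_name = (name.replace("model.embed_tokens", "embedding")
                        .replace("model.layers", "decoder_layers")
                        .replace("model.norm", "final_norm")
                        .replace("lm_head", "final_proj"))
        converted[new_name] = tensor
    torch.save(converted, "smollm.pth")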

View File

@@ -1,4 +1,4 @@
-#VERBOSE=0 torchrun --nproc_per_node 3 generate.py --pp_size 3
+#VERBOSE=0 torchrun --nproc_per_node 3 generate.py --pp_size 3 --load_path smollm.pth
 import os
 import argparse
 import torch, torch.distributed as dist
@@ -41,18 +41,6 @@ def run_one_inference_step(model, batch, device, config) -> torch.Tensor:
     return logits
-def load_weights(model: Llama, save_path: str) -> None:
-    state_dict = torch.load(save_path)
-    #TODO: add check that we are not missing any weights
-    for name, param in model.named_parameters():
-        # This assume that the model has only weight parameters
-        new_name = name.split(".weight")[0]
-        module = model.get_submodule(new_name)
-        if name in state_dict:
-            param.data.copy_(state_dict[name])
-    dist.barrier()
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--load_path", type=str)
@@ -62,7 +50,7 @@ if __name__ == "__main__":
     local_rank, world_size = int(os.environ["LOCAL_RANK"]), int(os.environ["WORLD_SIZE"])
-    #TODO: add gloo backend for generation
+    #TODO(fmom): add gloo backend for generation
     dist.init_process_group(backend="nccl")
     torch.cuda.set_device(local_rank)
     device = torch.device("cuda", local_rank)
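Note: the gloo TODO stays open in this commit. One common way to let generation fall back to CPU-only runs, sketched here as a suggestion rather than project code:

    # Pick the backend at runtime; torchrun still provides LOCAL_RANK and the
    # rendezvous env vars either way.
    import os
    import torch
    import torch.distributed as dist

    local_rank = int(os.environ["LOCAL_RANK"])
    backend = "nccl" if torch.cuda.is_available() else "gloo"
    dist.init_process_group(backend=backend)
    if backend == "nccl":
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
    else:
        device = torch.device("cpu")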
@@ -73,23 +61,21 @@ if __name__ == "__main__":
     model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
     config = AutoConfig.from_pretrained(model_name)
-    model = Llama(
+    base_model = Llama(
         config=config,
         device=device,
     )
-    model.load_state_dict(torch.load(args.load_path))
-    # model = PipelineParallel(base_model, config).to(device)
-    # del base_model
+    base_model.load_state_dict(torch.load(args.load_path))
+    model = PipelineParallel(base_model, config).to(device)
+    del base_model
     model.eval()
     # Tokenize the input
     prompts = [
         "My name is",
-        # "How old are you ?",
-        # "What is your favorite color?",
+        "How old are you ?",
+        "What is your favorite color?",
     ]
     tokenizer = AutoTokenizer.from_pretrained(model_name)

View File

@@ -13,11 +13,12 @@ def reduce_loss_across_dp_ranks(loss, device):
 class PipelineParallel(nn.Module):
     def __init__(self, model, config):
         super().__init__()
+        #TODO(fmom): find a better model to distributed layers without instantiating a base_model first
         layer_distribution = self.distribute_layers(config.num_hidden_layers)
-        self.embed_tokens = model.model.embed_tokens if pgm.process_group_manager.pp_is_first_stage else nn.Identity()
-        self.decoder_layers = nn.ModuleDict({str(i): model.model.layers[i] for i in layer_distribution})
-        self.norm = model.model.norm if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
-        self.lm_head = model.lm_head if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
+        self.embedding = model.embedding if pgm.process_group_manager.pp_is_first_stage else nn.Identity()
+        self.decoder_layers = nn.ModuleDict({str(i): model.decoder_layers[i] for i in layer_distribution})
+        self.final_norm = model.final_norm if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
+        self.final_proj = model.final_proj if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
     def distribute_layers(self, num_layers):
         layers_per_gpu = [num_layers // pgm.process_group_manager.pp_world_size + (1 if i < num_layers % pgm.process_group_manager.pp_world_size else 0) for i in range(pgm.process_group_manager.pp_world_size)]
@@ -26,11 +27,11 @@ class PipelineParallel(nn.Module):
     def forward(self, batch, device):
         x = batch["hidden_states"].to(device) if batch["hidden_states"] is not None else batch["input_ids"].to(device)
-        x = self.embed_tokens(x)
+        x = self.embedding(x)
         for layer in self.decoder_layers.values():
-            x = layer(x, position_ids=batch["position_index"].to(device))[0]
-        x = self.norm(x)
-        return self.lm_head(x)
+            x = layer(x, position_ids=batch["position_index"].to(device))
+        x = self.final_norm(x)
+        return self.final_proj(x)
     def backward(self, input_tensor, output_tensor, output_tensor_grad):
         if input_tensor is not None: input_tensor.retain_grad()
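Note: distribute_layers splits the decoder evenly across pipeline ranks, giving one extra layer to the first num_layers % pp_world_size ranks. A worked example of that split follows; the start-offset part is an assumption, since the rest of the method lies outside this hunk.

    def layer_indices(num_layers, pp_world_size, pp_rank):
        # Same split as the list comprehension above: each rank gets
        # num_layers // pp_world_size layers, the first ranks one more.
        layers_per_gpu = [num_layers // pp_world_size + (1 if i < num_layers % pp_world_size else 0)
                          for i in range(pp_world_size)]
        start = sum(layers_per_gpu[:pp_rank])
        return list(range(start, start + layers_per_gpu[pp_rank]))

    # e.g. 32 decoder layers over the --pp_size 3 setup from generate.py:
    # rank 0 owns layers 0-10, rank 1 owns 11-21, rank 2 owns 22-31.
    print(layer_indices(32, 3, 1))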

View File

@@ -13,6 +13,7 @@ from process_group_manager import setup_process_group_manager
 from pipeline_parallel import train_step_pipeline_1f1b, train_step_pipeline_afab, PipelineParallel
 from data_parallel import DataParallel
 from context_parallel import ContextParallel
+from model import Llama
 from dataset import MicroBatchDataLoader
 import wandb
@@ -108,7 +109,15 @@ if __name__ == "__main__":
         },
     )
-    model = AutoModelForCausalLM.from_pretrained(model_name, config=config).to(device)
+    #TODO: find a better way (should need to specify model_name + path to .pth)
+    model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
+    config = AutoConfig.from_pretrained(model_name)
+    model = Llama(
+        config=config,
+        device=device,
+    ).to(device)
+    model.load_state_dict(torch.load("smollm.pth"))
     if pgm.process_group_manager.cp_size > 1:
         model = ContextParallel(model, config).to(device)
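Note: train.py now builds the full Llama on every rank and loads smollm.pth before any parallel wrapper is applied, so a key mismatch fails fast (load_state_dict is strict by default). An optional, more verbose check in the spirit of the removed TODO in generate.py, not part of this commit and assuming it runs right after the load_state_dict call above:

    # Report exactly which keys disagree between the converted checkpoint and
    # the new Llama, instead of relying on load_state_dict's error message.
    checkpoint = torch.load("smollm.pth")
    model_keys, ckpt_keys = set(model.state_dict()), set(checkpoint)
    print("missing from checkpoint:", sorted(model_keys - ckpt_keys))
    print("unexpected in checkpoint:", sorted(ckpt_keys - model_keys))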