make new modeling compatible with training
commit 47581d29e9
parent 770800b978

@@ -1,5 +1,5 @@
 """
-torchrun --nproc_per_node=1 convert_hf_to_picotron.py --save_path llama_weights.pth
+torchrun --nproc_per_node=1 convert_hf_to_picotron.py --save_path smollm.pth
 """
 import os
 import argparse
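Note (not part of the commit): the hunk above appears to be the usage docstring of convert_hf_to_picotron.py, whose default output name changes from llama_weights.pth to smollm.pth. Below is a minimal sketch of what such a conversion script typically does, assuming the picotron Llama attribute names that appear later in this diff (embedding, decoder_layers, final_norm, final_proj); the helper name and the exact key remapping are assumptions, not the commit's code.

import argparse
import torch
from transformers import AutoModelForCausalLM

def convert_hf_state_dict(model_name: str, save_path: str) -> None:
    # Load the Hugging Face checkpoint and rename its keys to the (assumed)
    # picotron layout before saving a plain state dict with torch.save.
    hf_state = AutoModelForCausalLM.from_pretrained(model_name).state_dict()
    picotron_state = {}
    for name, tensor in hf_state.items():
        name = name.replace("model.embed_tokens", "embedding")
        name = name.replace("model.layers", "decoder_layers")
        name = name.replace("model.norm", "final_norm")
        name = name.replace("lm_head", "final_proj")
        picotron_state[name] = tensor
    torch.save(picotron_state, save_path)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--save_path", type=str, default="smollm.pth")
    args = parser.parse_args()
    convert_hf_state_dict("HuggingFaceTB/SmolLM-360M-Instruct", args.save_path)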
generate.py (30 changed lines)
@@ -1,4 +1,4 @@
-#VERBOSE=0 torchrun --nproc_per_node 3 generate.py --pp_size 3
+#VERBOSE=0 torchrun --nproc_per_node 3 generate.py --pp_size 3 --load_path smollm.pth
 import os
 import argparse
 import torch, torch.distributed as dist
@@ -41,18 +41,6 @@ def run_one_inference_step(model, batch, device, config) -> torch.Tensor:
 
     return logits
 
-def load_weights(model: Llama, save_path: str) -> None:
-    state_dict = torch.load(save_path)
-    #TODO: add check that we are not missing any weights
-    for name, param in model.named_parameters():
-        # This assume that the model has only weight parameters
-        new_name = name.split(".weight")[0]
-        module = model.get_submodule(new_name)
-        if name in state_dict:
-            param.data.copy_(state_dict[name])
-
-    dist.barrier()
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--load_path", type=str)
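Note (not part of the commit): load_weights is removed here; a later hunk in this file loads the checkpoint directly with load_state_dict. The deleted TODO asked for a check that no weights are missing. A short hedged sketch of such a check, using the missing_keys/unexpected_keys that torch.nn.Module.load_state_dict returns (the helper name is hypothetical):

def load_weights_checked(model: torch.nn.Module, save_path: str) -> None:
    # strict=False lets us inspect what did not match instead of raising immediately.
    state_dict = torch.load(save_path, map_location="cpu")
    result = model.load_state_dict(state_dict, strict=False)
    assert not result.missing_keys, f"checkpoint is missing weights: {result.missing_keys}"
    assert not result.unexpected_keys, f"checkpoint has unexpected weights: {result.unexpected_keys}"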
@@ -62,7 +50,7 @@ if __name__ == "__main__":
 
     local_rank, world_size = int(os.environ["LOCAL_RANK"]), int(os.environ["WORLD_SIZE"])
 
-    #TODO: add gloo backend for generation
+    #TODO(fmom): add gloo backend for generation
     dist.init_process_group(backend="nccl")
     torch.cuda.set_device(local_rank)
     device = torch.device("cuda", local_rank)
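Note (not part of the commit): the TODO above asks for a gloo backend for generation. One hedged way to do that is to fall back to gloo and CPU when CUDA is unavailable; this fallback logic is an assumption, not something the commit adds:

use_cuda = torch.cuda.is_available()
dist.init_process_group(backend="nccl" if use_cuda else "gloo")
if use_cuda:
    torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank) if use_cuda else torch.device("cpu")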
@@ -73,23 +61,21 @@ if __name__ == "__main__":
     model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
     config = AutoConfig.from_pretrained(model_name)
 
-    model = Llama(
+    base_model = Llama(
         config=config,
         device=device,
     )
 
-    model.load_state_dict(torch.load(args.load_path))
-    # model = PipelineParallel(base_model, config).to(device)
-
-    # del base_model
-
+    base_model.load_state_dict(torch.load(args.load_path))
+    model = PipelineParallel(base_model, config).to(device)
+    del base_model
     model.eval()
 
     # Tokenize the input
     prompts = [
         "My name is",
-        # "How old are you ?",
-        # "What is your favorite color?",
+        "How old are you ?",
+        "What is your favorite color?",
     ]
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
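Note (not part of the commit): the new flow loads the full Llama first, wraps it in PipelineParallel (which keeps only this stage's modules, see the hunks below), then drops the base_model reference. If the layers this stage does not own should also be released from GPU memory right away, one hedged variant is to collect explicitly; the gc.collect/empty_cache calls below are an assumption, not part of this commit:

import gc

base_model = Llama(config=config, device=device)
base_model.load_state_dict(torch.load(args.load_path))
model = PipelineParallel(base_model, config).to(device)  # keeps references only to this stage's modules
del base_model               # drop the last reference to layers this stage does not own
gc.collect()                 # assumption: free them eagerly rather than waiting for garbage collection
torch.cuda.empty_cache()
model.eval()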
@@ -13,11 +13,12 @@ def reduce_loss_across_dp_ranks(loss, device):
 class PipelineParallel(nn.Module):
     def __init__(self, model, config):
         super().__init__()
+        #TODO(fmom): find a better way to distribute layers without instantiating a base_model first
         layer_distribution = self.distribute_layers(config.num_hidden_layers)
-        self.embed_tokens = model.model.embed_tokens if pgm.process_group_manager.pp_is_first_stage else nn.Identity()
-        self.decoder_layers = nn.ModuleDict({str(i): model.model.layers[i] for i in layer_distribution})
-        self.norm = model.model.norm if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
-        self.lm_head = model.lm_head if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
+        self.embedding = model.embedding if pgm.process_group_manager.pp_is_first_stage else nn.Identity()
+        self.decoder_layers = nn.ModuleDict({str(i): model.decoder_layers[i] for i in layer_distribution})
+        self.final_norm = model.final_norm if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
+        self.final_proj = model.final_proj if pgm.process_group_manager.pp_is_last_stage else nn.Identity()
 
     def distribute_layers(self, num_layers):
         layers_per_gpu = [num_layers // pgm.process_group_manager.pp_world_size + (1 if i < num_layers % pgm.process_group_manager.pp_world_size else 0) for i in range(pgm.process_group_manager.pp_world_size)]
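Note (not part of the commit): distribute_layers above computes how many decoder layers each pipeline rank owns. A hedged sketch of how such a method typically finishes, turning those counts into the layer indices held by the current stage; pp_rank as an attribute of process_group_manager and the exact return shape are assumptions:

def distribute_layers(self, num_layers):
    pp_world_size = pgm.process_group_manager.pp_world_size
    pp_rank = pgm.process_group_manager.pp_rank  # assumed attribute name
    # The first (num_layers % pp_world_size) ranks get one extra layer.
    layers_per_gpu = [num_layers // pp_world_size + (1 if i < num_layers % pp_world_size else 0)
                      for i in range(pp_world_size)]
    start_layer = sum(layers_per_gpu[:pp_rank])
    return list(range(start_layer, start_layer + layers_per_gpu[pp_rank]))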
@@ -26,11 +27,11 @@ class PipelineParallel(nn.Module):
 
     def forward(self, batch, device):
         x = batch["hidden_states"].to(device) if batch["hidden_states"] is not None else batch["input_ids"].to(device)
-        x = self.embed_tokens(x)
+        x = self.embedding(x)
         for layer in self.decoder_layers.values():
-            x = layer(x, position_ids=batch["position_index"].to(device))[0]
-        x = self.norm(x)
-        return self.lm_head(x)
+            x = layer(x, position_ids=batch["position_index"].to(device))
+        x = self.final_norm(x)
+        return self.final_proj(x)
 
     def backward(self, input_tensor, output_tensor, output_tensor_grad):
         if input_tensor is not None: input_tensor.retain_grad()
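Note (not part of the commit): only the first line of backward is visible in this hunk. A hedged sketch of how a pipeline-parallel backward of this shape usually continues: run autograd from the stage output with the gradient received from the next stage, then return the input gradient to send to the previous stage. This is the common pattern, not necessarily picotron's exact code:

def backward(self, input_tensor, output_tensor, output_tensor_grad):
    if input_tensor is not None:
        input_tensor.retain_grad()
    if output_tensor_grad is None:
        # Last stage: output_tensor is assumed to be the scalar loss.
        output_tensor_grad = torch.ones_like(output_tensor)
    torch.autograd.backward(output_tensor, grad_tensors=output_tensor_grad)
    return input_tensor.grad if input_tensor is not None else None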
train.py (11 changed lines)
@@ -13,6 +13,7 @@ from process_group_manager import setup_process_group_manager
 from pipeline_parallel import train_step_pipeline_1f1b, train_step_pipeline_afab, PipelineParallel
 from data_parallel import DataParallel
 from context_parallel import ContextParallel
+from model import Llama
 from dataset import MicroBatchDataLoader
 import wandb
 
@@ -108,7 +109,15 @@ if __name__ == "__main__":
         },
     )
 
-    model = AutoModelForCausalLM.from_pretrained(model_name, config=config).to(device)
+    #TODO: find a better way (should need to specify model_name + path to .pth)
+    model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
+    config = AutoConfig.from_pretrained(model_name)
+
+    model = Llama(
+        config=config,
+        device=device,
+    ).to(device)
+    model.load_state_dict(torch.load("smollm.pth"))
 
     if pgm.process_group_manager.cp_size > 1:
         model = ContextParallel(model, config).to(device)
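Note (not part of the commit): the TODO above says the model name and the .pth path should not be hard-coded in train.py. One hedged way to do that is to take both from the command line; the flag names below are assumptions and the snippet relies on train.py's existing parser, Llama, and device:

parser.add_argument("--model_name", type=str, default="HuggingFaceTB/SmolLM-360M-Instruct")
parser.add_argument("--load_path", type=str, default="smollm.pth")
args = parser.parse_args()

config = AutoConfig.from_pretrained(args.model_name)
model = Llama(config=config, device=device).to(device)
model.load_state_dict(torch.load(args.load_path))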