diff --git a/picotron/checkpoint.py b/picotron/checkpoint.py
index c59b74c..b063fd6 100644
--- a/picotron/checkpoint.py
+++ b/picotron/checkpoint.py
@@ -52,7 +52,7 @@ def init_model_with_materialized_weights(model, model_config, save_dir):
     initialization_manager = InitializationManager(model, model_config)
     layer_names = initialization_manager.get_layer_names_in_sft_format()
-    print(f"Rank {pgm.process_group_manager.global_rank} responsible for {len(layer_names)} layers")
+    # print(f"Rank {pgm.process_group_manager.global_rank} responsible for {len(layer_names)} layers")
 
     if len(layer_names) == 0:
         raise Exception("Some ranks has no layers. There are too many ranks and not enough layers to distribute.")
 
diff --git a/train.py b/train.py
index 36232da..b8cac4b 100644
--- a/train.py
+++ b/train.py
@@ -143,9 +143,10 @@ if __name__ == "__main__":
     if pgm.process_group_manager.global_rank == 0:
         print(f"rank {pgm.process_group_manager.global_rank}: Creating model config")
         model_config = AutoConfig.from_pretrained(config["model"]["name"])
-        model_config.num_hidden_layers = config["model"]["num_hidden_layers"]
-        model_config.num_attention_heads = config["model"]["num_attention_heads"]
-        model_config.num_key_value_heads = config["model"]["num_key_value_heads"]
+        # Override the model structure only if specified in the config file
+        model_config.num_hidden_layers = config["model"].get("num_hidden_layers", model_config.num_hidden_layers)
+        model_config.num_attention_heads = config["model"].get("num_attention_heads", model_config.num_attention_heads)
+        model_config.num_key_value_heads = config["model"].get("num_key_value_heads", model_config.num_key_value_heads)
         model_config.max_position_embeddings = config["training"]["seq_length"]
         objects = [model_config]
     else: