breaking: add slurm stuff

ferdinand.mom 2024-10-29 15:44:35 +00:00
parent 987a7c5c99
commit 47c00be8c7
7 changed files with 682 additions and 132 deletions

bench/check_status.sh Executable file

@@ -0,0 +1,32 @@
#!/bin/bash
# Initialize counters
declare -A counts
statuses=("init" "pending" "running" "fail" "oom" "timeout" "completed")
for status in "${statuses[@]}"; do
counts[$status]=0
done
# Find and process all status.txt files
while IFS= read -r -d '' file; do
status=$(tr -d '[:space:]' < "$file")
if [[ " ${statuses[*]} " =~ " ${status} " ]]; then
((counts[$status]++))
fi
done < <(find "$1" -name "status.txt" -print0)
# Calculate total
total=0
for count in "${counts[@]}"; do
((total += count))
done
# Print the results
echo "Status | Count"
echo "-----------|---------"
for status in "${statuses[@]}"; do
printf "%-10s | %d\n" "$status" "${counts[$status]}"
done
echo "-----------|---------"
echo "Total | $total"

bench/create_configs.py Normal file

@@ -0,0 +1,192 @@
from copy import deepcopy
from template.template_base_configs import template_base_config as base_config
import yaml
import os
from transformers import AutoTokenizer
import shutil
import argparse
def update_config_based_on_model(model: str, config: dict):
# Setting num_attention_heads = num_key_value_heads for all models <=> using MHA for all layers
if model == "small-llama":
config["model"]["model_config"]["hidden_size"] = 512
config["model"]["model_config"]["intermediate_size"] = 1024
config["model"]["model_config"]["num_attention_heads"] = 16
config["model"]["model_config"]["num_hidden_layers"] = 10
config["model"]["model_config"]["num_key_value_heads"] = 16
config["model"]["model_config"]["max_position_embeddings"] = config["tokens"]["sequence_length"]
elif model == "llama-1M":
config["model"]["model_config"]["hidden_size"] = 768
config["model"]["model_config"]["intermediate_size"] = 3072
config["model"]["model_config"]["num_attention_heads"] = 16
config["model"]["model_config"]["num_hidden_layers"] = 12
config["model"]["model_config"]["num_key_value_heads"] = 16
config["model"]["model_config"]["max_position_embeddings"] = config["tokens"]["sequence_length"]
elif model == "llama-1B":
# HuggingFaceFW/ablation-model-fineweb-v1
config["model"]["model_config"]["hidden_size"] = 2048
config["model"]["model_config"]["intermediate_size"] = 4096
config["model"]["model_config"]["num_attention_heads"] = 32
config["model"]["model_config"]["num_hidden_layers"] = 24
config["model"]["model_config"]["num_key_value_heads"] = 32
config["model"]["model_config"]["max_position_embeddings"] = config["tokens"]["sequence_length"]
tokenizer = AutoTokenizer.from_pretrained(config["tokenizer"]["tokenizer_name_or_path"])
config["model"]["model_config"]["vocab_size"] = tokenizer.vocab_size
def create_single_config(
out_dir: str,
model: str,
gpus: int,
dp: int,
tp: int,
pp: int,
bapr: int,
mbs: int,
no_profiler: bool = False,
cluster: str = "hf",
exp_name: str = None,
seq_len: int = 4096,
lighteval: bool = False,
s3: bool = False,
# recompute_layer: bool = False,
dry_run: bool = False
):
run_path = os.path.join(out_dir, exp_name)
if not os.path.exists(out_dir):
os.makedirs(out_dir)
print(f"Creating single config for {model} given {gpus} GPUs")
config_content = deepcopy(base_config)
config_content["tokens"]["sequence_length"] = seq_len
# config_content["parallelism"]["recompute_layer"] = recompute_layer
config_content["checkpoints"]["checkpoints_path"] = run_path
update_config_based_on_model(model, config_content)
if cluster == "hf":
tp_max_cluster = 8
elif cluster == "swiss-ai":
tp_max_cluster = 4 # GH200
config_content['parallelism']['dp'] = dp
config_content['parallelism']['tp'] = tp
config_content['parallelism']['pp'] = pp
# Compute global batch_size and print
gbs = dp * mbs * bapr
gbs_token = gbs * seq_len
# Print in human readable format
print(f"Gbs_token: {gbs_token:,}, Gbs: {gbs}, dp: {dp}, seq_len: {seq_len}, bapr: {bapr}, mbs: {mbs}")
config_content['tokens']['batch_accumulation_per_replica'] = bapr
config_content['tokens']['micro_batch_size'] = mbs
# Create a directory for each combination of parallelism
# if recompute_layer:
# run_path += "_recompute_layer"
# Get absolute path for run_path
if no_profiler:
config_content['profiler'] = None
else:
config_content['profiler']['profiler_export_path'] = os.path.abspath(run_path)
if s3:
config_content["general"]["is_s3_available"] = True
config_content['s3_upload'] = {
"remove_after_upload": True,
"s5cmd_concurrency": 5,
"s5cmd_numworkers": 16,
"s5cmd_path": "/fsx/elie_bakouch/miniconda3/envs/smollm/bin/s5cmd",
"upload_s3_path": f"s3://huggingface-brrr-us-east-1/fmom/nanotron_pr/{exp_name}"
}
if lighteval:
config_content['lighteval'] = {
"batch_size": 16,
"generation": None,
"logging": {
"output_dir": None,
"public_run": False,
"push_to_hub": True,
"push_to_tensorboard": True,
"results_org": "HuggingFaceSmol",
"save_details": True,
"tensorboard_metric_prefix": "eval"
},
"parallelism": {
"dp": dp,
"expert_parallel_size": 1,
"pp": pp,
"pp_engine": "1f1b",
"recompute_layer": False,
"tp": tp,
"tp_linear_async_communication": False,
"tp_mode": "ALL_REDUCE",
"tp_recompute_allgather": True
},
"tasks": {
"custom_tasks": "nanotron.lighteval.evaluation_tasks",
"dataset_loading_processes": 8,
"max_samples": 1000,
"multichoice_continuations_start_space": None,
"num_fewshot_seeds": None,
"pair_wise_tokenization": False,
"tasks": "early-signal"
}
}
if os.path.exists(run_path):
shutil.rmtree(run_path)
if not dry_run:
os.makedirs(run_path)
with open(os.path.join(run_path, "config.yaml"), "w") as new_config:
yaml.dump(config_content, new_config, default_flow_style=False, sort_keys=False)
del config_content
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--out_dir", type=str, help="Output directory to store the configs")
parser.add_argument("--model", type=str, help="Model to create configs for")
parser.add_argument("--gpus", type=int, help="Number of GPUs")
parser.add_argument("--dp", type=int, required=True, help="Max number of data parallelism")
parser.add_argument("--tp", type=int, required=True, help="Max number of tensor parallelism")
parser.add_argument("--pp", type=int, required=True, help="Max number of pipeline parallelism")
parser.add_argument("--bapr", type=int, help="Max batch accumulation per replica")
parser.add_argument("--mbs", type=int, help="Max micro batch size")
parser.add_argument("--seq_len", type=int, help="Sequence length", default=4096)
parser.add_argument("--exp_name", type=str, help="Experiment name")
parser.add_argument("--recompute_layer", action="store_true", help="Enable recompute allgather for tensor parallelism")
parser.add_argument("--use_async", action="store_true", help="Enable async communication for tensor parallelism")
parser.add_argument("--lighteval", action="store_true", help="Enable light evaluation")
parser.add_argument("--s3", action="store_true", help="Enable light evaluation")
args=parser.parse_args()
create_single_config(
out_dir=args.out_dir,
model=args.model,
gpus=args.gpus,
dp=args.dp,
tp=args.tp,
pp=args.pp,
bapr=args.bapr,
mbs=args.mbs,
cluster="hf",
exp_name=args.exp_name,
seq_len=args.seq_len,
# recompute_layer=args.recompute_layer,
lighteval=args.lighteval,
s3=args.s3,
dry_run=False,
no_profiler=True
)
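For orientation, a minimal sketch of how this script could be invoked; the output directory, GPU count, and parallelism values below are placeholders, not taken from this commit:

python bench/create_configs.py \
    --out_dir /tmp/bench_configs \
    --model llama-1B \
    --gpus 16 \
    --dp 2 --tp 4 --pp 2 \
    --bapr 8 --mbs 2 \
    --seq_len 4096 \
    --exp_name llama-1B_dp2_tp4_pp2

This writes /tmp/bench_configs/llama-1B_dp2_tp4_pp2/config.yaml, which submit_jobs.py then picks up when generating the Slurm scripts.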

bench/submit_jobs.py Normal file

@@ -0,0 +1,232 @@
from enum import Enum
import os
from jinja2 import Template
import subprocess
import yaml
from typing import List
class Status(Enum):
# INIT -> PENDING -> [RUNNING | FAIL | TIMEOUT | OOM] -> COMPLETED
INIT = "init" # Job is created
PENDING = "pending" # Job is waiting for resources
RUNNING = "running" # Job is running
FAIL = "fail" # Job failed
OOM = "oom" # Job failed due to out of memory (expected behavior)
TIMEOUT = "timeout" # Job failed due to timeout
COMPLETED = "completed" # Job is completed
class Job:
def __init__(self, root_path: str, qos: str) -> None:
self.root_path = root_path
self.name = os.path.basename(root_path)
self.config = os.path.join(root_path, "config.yaml")
self.qos = qos
# Check if the status.txt file exists
status_file_path = os.path.join(self.root_path, "status.txt")
if not os.path.exists(status_file_path):
# Create the status.txt file with INIT status
with open(status_file_path, 'w') as f:
f.write(Status.INIT.value)
self.status = self.get_status()
def get_status(self) -> Status:
"""
Read the status of the job from `status.txt` and return it
"""
is_existing = lambda value_to_check: any(value.value == value_to_check for value in Status.__members__.values())
status_file_path = os.path.join(self.root_path, "status.txt")
with open(status_file_path, 'r') as f:
status = f.read().strip()
if not is_existing(status):
raise ValueError("Invalid status")
return Status(status)
def set_status(self, status: Status) -> Status:
"""
Update the status of the job in `status.txt` and return the new status
"""
status_file_path = os.path.join(self.root_path, "status.txt")
with open(status_file_path, 'w') as f:
f.write(status.value)
return status
class Scheduler:
def __init__(self, inp_dir: str, qos: str) -> None:
jobs_directory_paths = [os.path.abspath(root) for root, dirs, _ in os.walk(inp_dir) if not dirs]
jobs_directory_paths = [job_path.replace("/profiler", "") if "profiler" in job_path else job_path for job_path in jobs_directory_paths]
self.job_lists = [Job(job_path, qos) for job_path in jobs_directory_paths]
def keep_only_jobs(self, status: Status):
return [job for job in self.job_lists if job.status == status]
def filter_out_jobs(self, status: Status):
return [job for job in self.job_lists if job.status != status]
def create_slurm_script(self, job: Job, cluster: str):
# Submit job to the cluster (edit jinja)
# load yaml config.yaml
with open(job.config, 'r') as file:
config = yaml.load(file, Loader=yaml.FullLoader)
if cluster == "hf":
max_nodes = 8
elif cluster == "swiss-ai":
max_nodes = 4
else:
raise ValueError("Invalid cluster")
# Pick the right number of nodes and n_proc_per_node
world_size = config['parallelism']['pp'] * config['parallelism']['dp'] * config['parallelism']['tp']
assert world_size <= max_nodes or world_size % max_nodes == 0
nodes = max(1, world_size // max_nodes)
n_proc_per_node = min(8, world_size // nodes)
assert nodes * n_proc_per_node == world_size
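# Worked example with hypothetical values: on the "hf" cluster (8 GPUs per node),
# a config with dp=4, tp=8, pp=1 gives world_size = 32, hence nodes = 4 and n_proc_per_node = 8.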
target_path_hf_hub = os.path.join(os.path.basename(os.path.dirname(os.path.dirname(job.root_path))), os.path.basename(os.path.dirname(job.root_path)), os.path.basename(job.root_path))
context_bench = {
'nodes': nodes,
'n_proc_per_node': n_proc_per_node,
'root_path': job.root_path,
'target_path_hf_hub': target_path_hf_hub,
"config": job.config,
"qos": job.qos,
}
#TODO: don't hardcode the base_bench.slurm path. It should be $HOME/bench_cluster/template/base_bench.slurm
if cluster == "hf":
base_path = "/fsx/ferdinandmom/ferdinand-hf/nanotron/debug/template/base_bench.slurm"
else:
raise ValueError("Invalid cluster")
with open(base_path, 'r') as file:
base_bench_file = file.read()
base_bench_template = Template(base_bench_file)
# Write the rendered script to a new file located at the job root_path
output_file_path = os.path.join(job.root_path, "bench.slurm")
with open(output_file_path, 'w') as file:
file.write(base_bench_template.render(context_bench))
print(f"Slurm script created at {output_file_path}")
def launch_dependency(self, job_array: List[Job], env_vars):
prev_job_id = None
for job in job_array:
if prev_job_id is None:
result = subprocess.run(["sbatch", '--parsable', os.path.join(job.root_path, "bench.slurm")], env=env_vars, capture_output=True, text=True)
else:
result = subprocess.run(["sbatch", '--parsable', '--dependency=afterany:'+prev_job_id, os.path.join(job.root_path, "bench.slurm")], env=env_vars, capture_output=True, text=True)
job.set_status(Status.PENDING)
prev_job_id = result.stdout.strip()
def check_status(self):
# find all status files using self.jobs_directory_paths
status_files = [os.path.join(job.root_path, "status.txt") for job in self.job_lists]
status_counts = {
"init": 0,
"pending": 0,
"running": 0,
"fail": 0,
"oom": 0,
"timeout": 0,
"completed": 0
}
for status_file in status_files:
with open(status_file, 'r') as f:
status = f.read().strip()
if status in status_counts:
status_counts[status] += 1
else:
raise ValueError(f"Invalid status: {status}")
total = sum(status_counts.values())
# Print the status counts in a formatted table
print(f"{'Status':<10} | {'Count':<6}")
print(f"{'-'*10}-|-{'-'*6}")
for status, count in status_counts.items():
print(f"{status.capitalize():<10} | {count:<6}")
print(f"{'-'*10}-|-{'-'*6}")
print(f"{'Total':<10} | {total:<6}")
def submit_jobs(inp_dir, qos, nb_slurm_array, cluster: str, only: str = None):
scheduler = Scheduler(inp_dir, qos)
#TODO: batch into job arrays
env_vars = os.environ.copy()
total_jobs = len(scheduler.job_lists)
if only == "fail":
scheduler.job_lists = scheduler.keep_only_jobs(Status.FAIL)
elif only == "pending":
scheduler.job_lists = scheduler.keep_only_jobs(Status.PENDING)
elif only == "timeout":
scheduler.job_lists = scheduler.keep_only_jobs(Status.TIMEOUT)
elif only == "running":
scheduler.job_lists = scheduler.keep_only_jobs(Status.RUNNING)
if only is not None:
filtered_jobs = len(scheduler.job_lists)
if filtered_jobs == 0:
print(f"No '{only}' jobs to resubmit")
return
print(f"Only {filtered_jobs}/{total_jobs} jobs with status '{only}' will be resubmitted")
scheduler.job_lists = scheduler.filter_out_jobs(Status.COMPLETED)
if nb_slurm_array > 0:
# Use job dependencies
# Distribute the jobs into the arrays
base_jobs_per_array = len(scheduler.job_lists) // nb_slurm_array
extra_jobs = len(scheduler.job_lists) % nb_slurm_array
distribution = [base_jobs_per_array] * nb_slurm_array
for i in range(extra_jobs):
distribution[i] += 1
start = 0
for i, nb_jobs in enumerate(distribution):
previous_job_id = None
end = start + nb_jobs
job_array = scheduler.job_lists[start:end]
print(f"Launching job Dependency array {i+1} with {nb_jobs} jobs")
for job in job_array:
scheduler.create_slurm_script(job, cluster)
scheduler.launch_dependency(job_array, env_vars)
start = end
else:
# Don't use job dependencies
for job in scheduler.job_lists:
scheduler.create_slurm_script(job, cluster)
print(os.path.join(job.root_path, "bench.slurm"))
subprocess.run(["sbatch", os.path.join(job.root_path, "bench.slurm")], env=env_vars)
job.set_status(Status.PENDING)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Submit jobs to the cluster')
parser.add_argument('--inp_dir', type=str, help='Input directory containing the jobs')
parser.add_argument('--qos', type=str, help='QOS of the jobs')
parser.add_argument('--nb_slurm_array', type=int, default=0, help='Number of slurm arrays')
parser.add_argument('--cluster', type=str, default='hf', help='Cluster to submit the jobs')
parser.add_argument('--only', type=str, default=None, help='Filter the jobs to submit')
args = parser.parse_args()
submit_jobs(args.inp_dir, args.qos, args.nb_slurm_array, cluster=args.cluster, only=args.only)
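A sketch of how the submission step might be driven; the input directory, QOS, and filter value are placeholders:

# Submit every non-completed job, chained into 4 dependency arrays
python bench/submit_jobs.py --inp_dir /tmp/bench_configs --qos normal --nb_slurm_array 4 --cluster hf

# Resubmit only the jobs whose status.txt reads "fail"
python bench/submit_jobs.py --inp_dir /tmp/bench_configs --qos normal --only fail --cluster hf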

base_bench.slurm template (new file)

@@ -0,0 +1,91 @@
#!/bin/bash
#SBATCH --job-name=bench-picotron
#SBATCH --time=00:30:00
#SBATCH --partition=hopper-prod
#SBATCH --nodes={{ nodes }}
#SBATCH --gres=gpu:{{ n_proc_per_node }}
#SBATCH --qos={{ qos }}
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --exclusive
#SBATCH --output={{ root_path }}/log_%j.out
#SBATCH --error={{ root_path }}/log_%j.out
# Function to update status based on squeue output
update_status() {
job_id=$1
status_file=$2
# For unknown reasons, this doesn't update the status to pending; it only works for running
while true; do
job_status=$(squeue --job $job_id --noheader --format=%T)
echo "Job status: $job_status"
if [ -z "$job_status" ]; then
# Job has finished or is not found
break
elif [ "$job_status" = "RUNNING" ]; then
printf "running" > $status_file
break
fi
sleep 10
done
}
# Misc initializations.
echo "========================"
echo "START TIME: $(date)"
source /etc/profile.d/modules.sh
source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
conda activate /fsx/ferdinandmom/miniforge3/envs/env-picotron
echo python3 version = $(python3 --version)
echo "========================"
# Slurm stuff
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=$((1024 + RANDOM % 64511))
export TMPDIR=/scratch
export TORCH_HOME=/fsx/$USER/.cache/torch
export HF_HOME=/fsx/$USER/.cache/huggingface
export WANDB_DIR=/fsx/$USER/.cache/wandb
export CUBLAS_WORKSPACE_CONFIG=":4096:8"
export CUDA_DEVICE_MAX_CONNECTIONS="1"
module load cuda/12.1
huggingface-cli login --token $HUGGINGFACE_TOKEN
GIT_REPO="/fsx/ferdinandmom/ferdinand-hf/picotron/"
CMD="$GIT_REPO/run_train.py --config-path {{ config }} --logs-path {{ root_path }} --run output --slurm --nodes {{ nodes }}"
LAUNCHER="python"
# Move into the repository root
cd $GIT_REPO
# Get the current job ID
job_id=${SLURM_JOB_ID}
# Update status to "pending" or "running" in the background
update_status $job_id {{ root_path }}/status.txt &
# Run the main command
echo "Running command: $CMD"
srun -u $LAUNCHER $CMD
exit_status=$?
job_id=$SLURM_JOB_ID
# Update status based on the exit status of `srun`
if [ $exit_status -eq 0 ]; then
printf "completed" > {{ root_path }}/status.txt
else
if grep -q "OutOfMemoryError" {{ root_path }}/log_${job_id}.out; then
printf "oom" > {{ root_path }}/status.txt
elif grep -q " CUDA error: an illegal memory access" {{ root_path }}/log_${job_id}.out; then
printf "oom" > {{ root_path }}/status.txt
elif grep -q "Timeout" {{ root_path }}/log_${job_id}.out; then
printf "timeout" > {{ root_path }}/status.txt
else
printf "fail" > {{ root_path }}/status.txt
fi
fi
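After Scheduler.create_slurm_script renders this template, the header of a generated bench.slurm would look roughly as follows (the node count, QOS, and job path are illustrative):

#SBATCH --job-name=bench-picotron
#SBATCH --time=00:30:00
#SBATCH --partition=hopper-prod
#SBATCH --nodes=4
#SBATCH --gres=gpu:8
#SBATCH --qos=normal
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --exclusive
#SBATCH --output=/path/to/job_dir/log_%j.out
#SBATCH --error=/path/to/job_dir/log_%j.out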

picotron training config (JSON, new file)

@@ -0,0 +1,51 @@
{
"distributed": {
"tp_size": 1,
"cp_size": 1,
"pp_size": 2,
"dp_size": 2,
"master_addr": "localhost",
"master_port": 29500,
"backend": "nccl",
"use_cpu": false
},
"model": {
"name": "HuggingFaceTB/SmolLM-360M-Instruct",
"num_hidden_layers": 16,
"num_attention_heads": 16,
"num_key_value_heads": 4,
"dtype": "bfloat16",
"use_flash_attention": true
},
"training": {
"seed": 42,
"learning_rate": 3e-4,
"total_train_steps": 200,
"seq_length": 1024,
"local_batch_size": 64,
"micro_batch_size": 32,
"gradient_accumulation_steps": 1,
"num_samples": 400000,
"max_tokens": null
},
"dataset": {
"name": "roneneldan/TinyStories",
"num_workers": 4,
"num_proc": 4
},
"checkpoint": {
"save_dir": "ckpt",
"save_frequency": 300,
"load_path": ""
},
"logging": {
"use_wandb": false,
"project_name": "picotron",
"run_name": null
},
"environment": {
"OMP_NUM_THREADS": "1",
"TOKENIZERS_PARALLELISM": "false",
"FLASH_ATTEN": "1"
}
}
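Based on the torchrun invocation in the train.py docstring and the new --config flag, a config like this one could be launched as follows (the JSON path is a placeholder); note that --nproc_per_node=4 matches tp_size * cp_size * pp_size * dp_size = 1 * 1 * 2 * 2 above:

CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 --nnodes=1 \
    train.py --config path/to/config.json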

train.py

@@ -10,6 +10,7 @@ CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 --nnodes=1 --
"""
import os
import json
import time
import argparse
from src.parallel.context_parallel import parallel_input
@@ -57,76 +58,80 @@ def train_step(model, data_loader, device):
return acc_loss
if __name__ == "__main__":
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--tp_size", type=int, default=1)
parser.add_argument("--cp_size", type=int, default=1)
parser.add_argument("--pp_size", type=int, default=1)
parser.add_argument("--dp_size", type=int, default=1)
parser.add_argument("--use_wandb", action="store_true", default=False)
parser.add_argument("--use_cpu", action="store_true", default=False)
parser.add_argument("--master_addr", type=str, default="localhost")
parser.add_argument("--master_port", type=int, default=29500)
parser.add_argument("--load_path", type=str, default="", help="Path to load the model from")
parser.add_argument("--ckpt_dir", type=str, default="ckpt", help="Directory to save checkpoints")
parser.add_argument("--ckpt_freq", type=int, default=300, help="Frequency to save checkpoints")
parser.add_argument("--config", type=str, default="", help="Path to config file")
args = parser.parse_args()
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["FLASH_ATTEN"] = "1" # Use cuda kernels from flash attention repo to accelerate the training. Model dtype should be torch.float16!
os.environ["DEVICE"] = "cuda" if not args.use_cpu else "cpu"
dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() and not args.use_cpu else torch.float32 # if GPU is not available or not supported, use torch.float32
assert (dtype == torch.bfloat16 and os.getenv("FLASH_ATTEN") == "1") or os.getenv("FLASH_ATTEN") != "1", "Kernel operations requires dtype=torch.bfloat16"
with open(args.config, "r") as f:
config = json.load(f)
os.environ["OMP_NUM_THREADS"] = config["environment"]["OMP_NUM_THREADS"]
os.environ["TOKENIZERS_PARALLELISM"] = config["environment"]["TOKENIZERS_PARALLELISM"]
os.environ["FLASH_ATTEN"] = config["environment"]["FLASH_ATTEN"] # Use cuda kernels from flash attention repo to accelerate the training. Model dtype should be torch.float16!
os.environ["DEVICE"] = "cpu" if config["distributed"]["use_cpu"] else "cuda"
dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() and not config["distributed"]["use_cpu"] else torch.float32 # if GPU is not available or not supported, use torch.float32
assert (dtype == torch.bfloat16 and os.getenv("FLASH_ATTEN") == "1") or os.getenv("FLASH_ATTEN") != "1", "Kernel operations requires dtype=torch.bfloat16"
# hyperparameters
SEQ_LEN = config["training"]["seq_length"]
LOCAL_BATCH_SIZE = config["training"]["local_batch_size"]
MICRO_BATCH_SIZE = config["training"]["micro_batch_size"]
LEARNING_RATE = config["training"]["learning_rate"]
NUM_SAMPLES = config["training"]["num_samples"]
MAX_TOKENS = config["training"]["max_tokens"]
SEED = config["training"]["seed"]
TOTAL_TRAIN_STEPS = config["training"]["total_train_steps"]
GRAD_ACC = config["training"]["gradient_accumulation_steps"]
MODEL_NAME = config["model"]["name"]
DATASET_NAME = config["dataset"]["name"]
NUM_WORKERS = config["dataset"]["num_workers"]
NUM_PROC = config["dataset"]["num_proc"]
USE_WANDB = config["logging"]["use_wandb"]
TP_SIZE = config["distributed"]["tp_size"]
PP_SIZE = config["distributed"]["pp_size"]
DP_SIZE = config["distributed"]["dp_size"]
CP_SIZE = config["distributed"]["cp_size"]
LOAD_PATH = config["checkpoint"]["load_path"]
CHECKPOINT_DIR = config["checkpoint"]["save_dir"]
CHECKPOINT_FREQ = config["checkpoint"]["save_frequency"]
local_rank = int(os.environ["LOCAL_RANK"])
world_size = int(os.environ["WORLD_SIZE"])
host = os.environ["MASTER_ADDR"]
port = int(os.environ["MASTER_PORT"])
backend = "gloo" if config["distributed"]["use_cpu"] else "nccl"
## hyperparameters
SEQ_LEN, LOCAL_BATCH_SIZE, MICRO_BATCH_SIZE, LEARNING_RATE, NUM_SAMPLES, MAX_TOKENS, SEED = 1024, 64, 32, 3e-4, 400000, None, 42
total_train_steps = 200
grad_acc = 1
assert SEQ_LEN % args.cp_size == 0, "SEQ_LEN must be divisible by cp_size for Context Parallelism"
assert SEQ_LEN % CP_SIZE == 0, "SEQ_LEN must be divisible by cp_size for Context Parallelism"
assert world_size == TP_SIZE * PP_SIZE * DP_SIZE * CP_SIZE, "world_size must be equal to tp_size * pp_size * dp_size * cp_size"
backend = "gloo" if args.use_cpu else "nccl"
if backend == "nccl":
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
else:
device = torch.device("cpu")
dist.init_process_group(rank=local_rank, world_size=world_size, backend=backend, init_method=f"tcp://{host}:{port}")
setup_process_group_manager(tp_size=args.tp_size, cp_size=args.cp_size, pp_size=args.pp_size, dp_size=args.dp_size)
setup_process_group_manager(tp_size=TP_SIZE, cp_size=CP_SIZE, pp_size=PP_SIZE, dp_size=DP_SIZE)
is_wandb_rank = pgm.process_group_manager.tp_rank == 0 and pgm.process_group_manager.dp_rank == 0 and pgm.process_group_manager.cp_rank == 0 and pgm.process_group_manager.pp_is_last_stage
# if pgm.process_group_manager.global_rank == 0:
# display_4D_parallelism_grid()
tokens_per_step = LOCAL_BATCH_SIZE * SEQ_LEN * grad_acc * args.dp_size
tokens_per_step = LOCAL_BATCH_SIZE * SEQ_LEN * GRAD_ACC * DP_SIZE
if pgm.process_group_manager.global_rank == 0:
print("Tokens per step:", to_readable_format(tokens_per_step), is_print_rank=is_wandb_rank)
set_all_seed(SEED)
dataset_name = "roneneldan/TinyStories"
model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
# model_name = "meta-llama/Llama-2-7b-hf"
config = AutoConfig.from_pretrained(model_name)
config.num_hidden_layers = 16
config.num_attention_heads = 16
config.num_key_value_heads = 4
model_config = AutoConfig.from_pretrained(MODEL_NAME)
model_config.num_hidden_layers = config["model"]["num_hidden_layers"]
model_config.num_attention_heads = config["model"]["num_attention_heads"]
model_config.num_key_value_heads = config["model"]["num_key_value_heads"]
start_time = time.time()
model = Llama(config=config)
model = Llama(config=model_config)
print("init model time:", time.time()-start_time, is_print_rank=is_wandb_rank)
if is_wandb_rank and args.use_wandb:
if is_wandb_rank and USE_WANDB:
wandb.init(
project="picotron",
name=f"test_convergence_GBS_{tokens_per_step}_{pgm.process_group_manager}",
@@ -134,26 +139,26 @@ if __name__ == "__main__":
"tensor_parallel_size": pgm.process_group_manager.tp_size,
"pipeline_parallel_size": pgm.process_group_manager.pp_size,
"data_parallel_size": pgm.process_group_manager.dp_size,
"model": model_name,
"dataset": dataset_name,
"model": config["model"]["name"],
"dataset": config["dataset"]["name"],
"max_tokens": MAX_TOKENS,
"learning_rate": LEARNING_RATE,
"seed": SEED,
"micro_batch_size": MICRO_BATCH_SIZE,
"global_batch_size": LOCAL_BATCH_SIZE * args.dp_size * grad_acc,
"gradient_accumulation": grad_acc,
"global_batch_size": LOCAL_BATCH_SIZE * pgm.process_group_manager.dp_size * GRAD_ACC,
"gradient_accumulation": GRAD_ACC,
},
)
start_time = time.time()
model.to(dtype).to(device)
if pgm.process_group_manager.tp_world_size > 1:
TensorParallel(model)
if pgm.process_group_manager.pp_world_size > 1:
model = PipelineParallel(model, config)
model.to(dtype).to(device)
model = PipelineParallel(model, model_config)
# Context parallel and Data parallel both need gradient synchronization
if pgm.process_group_manager.cp_dp_world_size > 1:
model = DataParallel(model)
@@ -165,27 +170,34 @@ if __name__ == "__main__":
print("model to device time:", time.time()-start_time, is_print_rank=is_wandb_rank)
start_time = time.time()
data_loader = MicroBatchDataLoader(local_batch_size=LOCAL_BATCH_SIZE, micro_batch_size=MICRO_BATCH_SIZE, seq_length=SEQ_LEN, dataset_name=dataset_name, tokenizer_name=model_name, grad_acc = grad_acc,num_workers=4, num_proc=4, num_samples=NUM_SAMPLES)
data_loader = MicroBatchDataLoader(
local_batch_size=LOCAL_BATCH_SIZE,
micro_batch_size=MICRO_BATCH_SIZE,
seq_length=SEQ_LEN,
dataset_name=DATASET_NAME,
tokenizer_name=MODEL_NAME,
grad_acc = GRAD_ACC,
num_workers=NUM_WORKERS,
num_proc=NUM_PROC,
num_samples=NUM_SAMPLES
)
print("init dataloader time:", time.time()-start_time, is_print_rank=is_wandb_rank)
tensor_shapes = (data_loader.micro_batch_size, data_loader.seq_length_per_gpu, config.hidden_size)
tensor_shapes = (data_loader.micro_batch_size, data_loader.seq_length_per_gpu, model_config.hidden_size)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
trained_tokens, step = 0, 0
if args.load_path:
step, trained_tokens = load_checkpoint(model, optimizer, args.load_path)
if LOAD_PATH:
step, trained_tokens = load_checkpoint(model, optimizer, LOAD_PATH)
checkpoint_dir = args.ckpt_dir
checkpoint_freq = args.ckpt_freq
dist.barrier()
#TODO: Double-check consumed tokens after each steps (for example, MICRO_BATCH_SIZE=2 and using only dp_size=4, num_local_micro_batches=0 => division by 0)
#TODO: Check convergence
#TODO: Try multi-nodes
#TODO: Add activation checkpointing
#TODO: add gradient accumulation
# #TODO: Double-check consumed tokens after each steps (for example, MICRO_BATCH_SIZE=2 and using only dp_size=4, num_local_micro_batches=0 => division by 0)
# #TODO: Check convergence
# #TODO: Try multi-nodes
# #TODO: Add activation checkpointing
# #TODO: add gradient accumulation
while trained_tokens < MAX_TOKENS:
while MAX_TOKENS is None or trained_tokens < MAX_TOKENS:
#TODO: Add epoch support
# data_loader.set_epoch(step)
step_start_time = time.time()
@@ -217,17 +229,17 @@ if __name__ == "__main__":
f"Memory usage: {torch.cuda.memory_reserved() / 1e9:.2f}GB"
, is_print_rank=is_wandb_rank)
if args.use_wandb:
if USE_WANDB:
wandb.log({"loss": loss, "tokens_per_step": tokens_per_step, "tokens_per_second": tokens_per_step / step_duration,\
"memory_usage": torch.cuda.memory_reserved() / 1e9, "trained_tokens": trained_tokens})
if step % checkpoint_freq == 0:
save_checkpoint(model, optimizer, step, trained_tokens, checkpoint_dir+f"/{step}")
if step % CHECKPOINT_FREQ == 0:
save_checkpoint(model, optimizer, step, trained_tokens, CHECKPOINT_DIR+f"/{step}")
if step >= total_train_steps:
if step >= TOTAL_TRAIN_STEPS:
break
if is_wandb_rank and args.use_wandb:
if is_wandb_rank and USE_WANDB:
wandb.finish()
dist.destroy_process_group()

dataloader module (MicroBatchDataLoader)

@@ -190,64 +190,4 @@ class MicroBatchDataLoader(DataLoader):
except StopIteration:
self._iterator = None
raise StopIteration
return batch
## def display_4D_parallelism_grid():
# #TODO(fmom): fix me
# #TODO(fmom): add color to distinguish between different parallelism groups
# def create_gpu_box(gpu_num, tp, cp, pp):
# return [
# f"+------+",
# f"|GPU:{gpu_num:<2d}|",
# f"| TP:{tp:d} |",
# f"| CP:{cp:d} |",
# f"| PP:{pp:d} |",
# f"+------+"
# ]
#
# def create_node(start_gpu, tp_size, cp_size, pp_size, node_index):
# boxes = []
# for i in range(8): # 8 GPUs per node
# gpu_num = start_gpu + i
# tp = gpu_num % tp_size
# cp = (gpu_num // tp_size) % cp_size
# pp = (gpu_num // (tp_size * cp_size)) % pp_size
# boxes.append(create_gpu_box(gpu_num, tp, cp, pp))
# return [' '.join(row) for row in zip(*boxes)]
#
# def create_dp_box(replica_output):
# width = len(replica_output[0]) + 4
# top_bottom = f"+{'-' * (width - 2)}+"
# return [top_bottom] + [f"| {line} |" for line in replica_output] + [top_bottom]
#
# tp_size = pgm.process_group_manager.tp_size
# cp_size = pgm.process_group_manager.cp_size
# pp_size = pgm.process_group_manager.pp_size
# dp_size = pgm.process_group_manager.dp_size
# total_gpus_per_replica = tp_size * cp_size * pp_size
# num_nodes_per_replica = (total_gpus_per_replica + 7) // 8 # Round up to nearest whole node
#
# output = []
# output.append("=== Simplified Parallelism Configuration ===")
# output.append(f"TP Size: {tp_size}, CP Size: {cp_size}, PP Size: {pp_size}, DP Size: {dp_size}")
# output.append(f"Total GPUs for one replica: {total_gpus_per_replica}")
# output.append(f"Number of nodes per replica: {num_nodes_per_replica} (8 GPUs per node)")
# output.append(f"Total GPUs: {total_gpus_per_replica * dp_size}")
# output.append(f"Total nodes: {num_nodes_per_replica * dp_size}")
# output.append("")
#
# for dp in range(dp_size):
# replica_output = []
# for node in range(num_nodes_per_replica):
# start_gpu = (dp * total_gpus_per_replica) + (node * 8)
# node_output = create_node(start_gpu, tp_size, cp_size, pp_size, node)
# replica_output.append(f"Node {dp * num_nodes_per_replica + node}:")
# replica_output.extend(node_output)
# replica_output.append("")
#
# dp_box = create_dp_box(replica_output)
# output.append(f"Data Parallel Group {dp}:")
# output.extend(dp_box)
# output.append("")
#
# print("\n".join(output))
return batch