#!/bin/bash
#SBATCH --job-name=job-picotron
#SBATCH --time=00:30:00
#SBATCH --partition=hopper-prod
#SBATCH --nodes={{ nodes }}
#SBATCH --gres=gpu:{{ n_proc_per_node }}
#SBATCH --qos={{ qos }}
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --exclusive
#SBATCH --output={{ root_path }}/log_%j.out
#SBATCH --error={{ root_path }}/log_%j.out

# Poll squeue and write the job state to a status file.
# Note: for unknown reasons this never observes the PENDING state;
# it only catches the transition to RUNNING.
update_status() {
    job_id=$1
    status_file=$2
    while true; do
        job_status=$(squeue --job "$job_id" --noheader --format=%T)
        echo "Job status: $job_status"
        if [ -z "$job_status" ]; then
            # Job has finished or was not found.
            break
        elif [ "$job_status" = "RUNNING" ]; then
            printf "running" > "$status_file"
            break
        fi
        sleep 10
    done
}

# Misc initializations.
echo "========================"
echo "START TIME: $(date)"
source /etc/profile.d/modules.sh
source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
conda activate /fsx/ferdinandmom/miniforge3/envs/env-picotron
echo "python3 version = $(python3 --version)"
echo "========================"

# Slurm-related environment.
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=$((1024 + RANDOM % 64511))

export TMPDIR=/scratch
export TORCH_HOME=/fsx/$USER/.cache/torch
export HF_HOME=/fsx/$USER/.cache/huggingface
export WANDB_DIR=/fsx/$USER/.cache/wandb

export CUBLAS_WORKSPACE_CONFIG=":4096:8"
export CUDA_DEVICE_MAX_CONNECTIONS="1"
export FI_PROVIDER="efa"

module load cuda/12.1

GIT_REPO="/fsx/ferdinandmom/ferdinand-hf/picotron"
CMD="$GIT_REPO/train.py --config {{ config }}"

# Check out the branch used for this run (must happen inside the repo).
cd "$GIT_REPO"
git checkout loading_big_model
# huggingface-cli login --token $HUGGINGFACE_TOKEN

LAUNCHER="torchrun --nproc_per_node={{ n_proc_per_node }} --nnodes={{ nodes }} --node_rank=$SLURM_NODEID --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} --rdzv_backend c10d --max_restarts 0 --tee 3"

# Get the current job ID.
job_id=${SLURM_JOB_ID}

# Mark the job as "running" in the background once squeue reports it
# (PENDING is never observed; see the note in update_status).
update_status "$job_id" {{ root_path }}/status.txt &

# Run the main command.
echo "Running command: $CMD"
srun -u $LAUNCHER $CMD
exit_status=$?

# Update the status file based on the exit status of `srun`.
if [ $exit_status -eq 0 ]; then
    printf "completed" > {{ root_path }}/status.txt
else
    if grep -q "OutOfMemoryError" {{ root_path }}/log_${job_id}.out; then
        printf "oom" > {{ root_path }}/status.txt
    elif grep -q " CUDA error: an illegal memory access" {{ root_path }}/log_${job_id}.out; then
        # Illegal memory accesses are reported as OOM as well.
        printf "oom" > {{ root_path }}/status.txt
    elif grep -q "Timeout" {{ root_path }}/log_${job_id}.out; then
        printf "timeout" > {{ root_path }}/status.txt
    else
        printf "fail" > {{ root_path }}/status.txt
    fi
fi
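
# ---------------------------------------------------------------------------
# Usage sketch (comments only, not executed): the {{ ... }} placeholders above
# are Jinja-style template variables, so this file must be rendered before it
# can be submitted with sbatch. A minimal example, assuming Python with the
# jinja2 package installed; the file names and parameter values below are
# illustrative assumptions, not part of the original script:
#
#   python3 - <<'EOF'
#   from jinja2 import Template
#
#   # "job.slurm.j2" / "job.slurm" are hypothetical file names.
#   with open("job.slurm.j2") as f:
#       template = Template(f.read())
#   rendered = template.render(
#       nodes=2,                              # assumed values for the run
#       n_proc_per_node=8,
#       qos="normal",
#       root_path="/fsx/ferdinandmom/runs/exp0",
#       config="configs/config.json",
#   )
#   with open("job.slurm", "w") as f:
#       f.write(rendered)
#   EOF
#   sbatch job.slurm
# ---------------------------------------------------------------------------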