From e19f74b715d66ccd71ebb92a2a6ceb3ebb56ac11 Mon Sep 17 00:00:00 2001 From: "ferdinand.mom" Date: Mon, 4 Nov 2024 14:39:12 +0000 Subject: [PATCH] add option for HF token --- submit_slurm_jobs.py | 8 +++++--- template/base_job.slurm | 5 ++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/submit_slurm_jobs.py b/submit_slurm_jobs.py index 065d55e..e05732a 100644 --- a/submit_slurm_jobs.py +++ b/submit_slurm_jobs.py @@ -146,11 +146,12 @@ class Scheduler: print(f"{'-'*10}-|-{'-'*6}") print(f"{'Total':<10} | {total:<6}") -def submit_jobs(inp_dir, qos, nb_slurm_array, only: str = None): +def submit_jobs(inp_dir, qos, hf_token, nb_slurm_array, only: str = None): scheduler = Scheduler(inp_dir, qos) #TODO: batch into job arrays env_vars = os.environ.copy() + env_vars["HUGGINGFACE_TOKEN"] = hf_token total_jobs = len(scheduler.job_lists) if only == "fail": @@ -212,7 +213,8 @@ if __name__ == "__main__": parser.add_argument('--qos', type=str, help='QOS of the jobs') parser.add_argument('--nb_slurm_array', type=int, default=0, help='Number of slurm arrays') parser.add_argument('--only', type=str, default=None, help='Filter the jobs to submit') - + parser.add_argument('--hf_token', type=str, required=True, help='Huggingface token') + args = parser.parse_args() - submit_jobs(args.inp_dir, args.qos, args.nb_slurm_array, only=args.only) + submit_jobs(args.inp_dir, args.qos, args.hf_token, args.nb_slurm_array, only=args.only) diff --git a/template/base_job.slurm b/template/base_job.slurm index 77445e4..d8432b7 100644 --- a/template/base_job.slurm +++ b/template/base_job.slurm @@ -51,13 +51,16 @@ export HF_HOME=/fsx/$USER/.cache/huggingface export WANDB_DIR=/fsx/$USER/.cache/wandb export CUBLAS_WORKSPACE_CONFIG=":4096:8" export CUDA_DEVICE_MAX_CONNECTIONS="1" +export FI_PROVIDER="efa" module load cuda/12.1 GIT_REPO="/fsx/ferdinandmom/ferdinand-hf/picotron/" CMD="$GIT_REPO/train.py --config {{ config }}" -LAUNCHER="torchrun --nproc_per_node={{ n_proc_per_node }} --nnode={{ nodes }} --node_rank=$SLURM_NODEID --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT" +huggingface-cli login --token $HUGGINGFACE_TOKEN + +LAUNCHER="torchrun --nproc_per_node={{ n_proc_per_node }} --nnode={{ nodes }} --node_rank=$SLURM_NODEID --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} --rdzv_backend c10d --max_restarts 0 --tee 3" # Checkout the bench_cluster branch cd $GIT_REPO