From b0ea5066aded5cacb0dcf4cbcbdea0a65c36c1dd Mon Sep 17 00:00:00 2001
From: "ferdinand.mom"
Date: Tue, 17 Dec 2024 05:01:35 +0000
Subject: [PATCH] small changes

---
 .gitignore              | 3 ++-
 picotron/__init__.py    | 0
 template/base_job.slurm | 3 ++-
 3 files changed, 4 insertions(+), 2 deletions(-)
 delete mode 100644 picotron/__init__.py

diff --git a/.gitignore b/.gitignore
index 19c7345..6b13aee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ picotron.egg-info
 wandb
 tmp
 debug
-bench
\ No newline at end of file
+bench
+sandbox
\ No newline at end of file
diff --git a/picotron/__init__.py b/picotron/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/template/base_job.slurm b/template/base_job.slurm
index d8432b7..5b1416a 100644
--- a/template/base_job.slurm
+++ b/template/base_job.slurm
@@ -58,7 +58,8 @@ module load cuda/12.1
 GIT_REPO="/fsx/ferdinandmom/ferdinand-hf/picotron/"
 CMD="$GIT_REPO/train.py --config {{ config }}"
 
-huggingface-cli login --token $HUGGINGFACE_TOKEN
+git checkout loading_big_model
+# huggingface-cli login --token $HUGGINGFACE_TOKEN
 
 LAUNCHER="torchrun --nproc_per_node={{ n_proc_per_node }} --nnode={{ nodes }} --node_rank=$SLURM_NODEID --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} --rdzv_backend c10d --max_restarts 0 --tee 3"
 