diff --git a/.gitignore b/.gitignore index 19c7345..6b13aee 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ picotron.egg-info wandb tmp debug -bench \ No newline at end of file +bench +sandbox \ No newline at end of file diff --git a/picotron/__init__.py b/picotron/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/template/base_job.slurm b/template/base_job.slurm index d8432b7..5b1416a 100644 --- a/template/base_job.slurm +++ b/template/base_job.slurm @@ -58,7 +58,8 @@ module load cuda/12.1 GIT_REPO="/fsx/ferdinandmom/ferdinand-hf/picotron/" CMD="$GIT_REPO/train.py --config {{ config }}" -huggingface-cli login --token $HUGGINGFACE_TOKEN +git checkout loading_big_model +# huggingface-cli login --token $HUGGINGFACE_TOKEN LAUNCHER="torchrun --nproc_per_node={{ n_proc_per_node }} --nnode={{ nodes }} --node_rank=$SLURM_NODEID --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} --rdzv_backend c10d --max_restarts 0 --tee 3"