This commit is contained in:
ferdinand.mom 2024-10-30 14:29:22 +00:00
parent 2d198659e2
commit f74bff79e0
5 changed files with 7 additions and 39 deletions

View File

@ -1,32 +0,0 @@
#!/bin/bash
#
# Tally job statuses recorded in status.txt files below a directory tree.
#
# Usage: <script> [ROOT_DIR]
#   ROOT_DIR  directory searched recursively (defaults to the current directory)
#
# Every status.txt found is read, all whitespace is stripped from its content,
# and the result is counted if it is one of the known statuses. Files holding
# anything else (including empty files) are ignored.

# Known statuses, in the order they are printed.
statuses=("init" "pending" "running" "fail" "oom" "timeout" "completed")

#######################################
# Print a "status | count" table for the status.txt files below a directory.
# Globals:   statuses (read)
# Arguments: $1 - root directory to search
# Outputs:   the table on stdout; diagnostics on stderr
# Returns:   0 on success, 1 if $1 is not a directory
#######################################
count_statuses() {
  local root=$1
  local -A counts=()
  local status file
  local total=0

  if [[ ! -d "$root" ]]; then
    printf 'error: not a directory: %s\n' "$root" >&2
    return 1
  fi

  # Seed every known status so that it is printed even when its count is zero.
  for status in "${statuses[@]}"; do
    counts[$status]=0
  done

  # NUL-delimited find output keeps paths with spaces or newlines intact.
  while IFS= read -r -d '' file; do
    status=$(tr -d '[:space:]' < "$file")
    # The -n guard comes first: an empty associative-array key is an error.
    # A key is "known" exactly when it was seeded above.
    if [[ -n "$status" && -n "${counts[$status]+x}" ]]; then
      counts[$status]=$(( ${counts[$status]} + 1 ))
    fi
  done < <(find "$root" -name "status.txt" -print0)

  # Header, rows and total share one column width so the table lines up.
  printf '%-10s | %s\n' "Status" "Count"
  printf '%s\n' "-----------|---------"
  for status in "${statuses[@]}"; do
    printf '%-10s | %d\n' "$status" "${counts[$status]}"
    total=$(( total + ${counts[$status]} ))
  done
  printf '%s\n' "-----------|---------"
  printf '%-10s | %d\n' "Total" "$total"
}

count_statuses "${1:-.}"

View File

@ -91,21 +91,21 @@ class Scheduler:
"qos": job.qos,
}
#TODO: don't hardcode the base_bench.slurm path. Should be #HOME/bench_cluster/template/base_bench.slurm
#TODO: don't hardcode the base_job.slurm path. Should be #HOME/bench_cluster/template/base_job.slurm
if cluster == "hf":
base_path = "/fsx/ferdinandmom/ferdinand-hf/picotron/bench/template/base_bench.slurm"
base_path = "/fsx/ferdinandmom/ferdinand-hf/picotron/bench/template/base_job.slurm"
else:
raise ValueError("Invalid cluster")
with open(base_path, 'r') as file:
base_bench_file = file.read()
base_job_file = file.read()
base_bench_template = Template(base_bench_file)
base_job_template = Template(base_job_file)
# Write the rendered script to a new file located at the job root_path
output_file_path = os.path.join(job.root_path, "bench.slurm")
output_file_path = os.path.join(job.root_path, "job.slurm")
with open(output_file_path, 'w') as file:
file.write(base_bench_template.render(context_bench))
file.write(base_job_template.render(context_bench))
print(f"Slurm script created at {output_file_path}")

View File

@ -1,6 +1,6 @@
#!/bin/bash
#SBATCH --job-name=bench-picotron
#SBATCH --job-name=job-picotron
#SBATCH --time=00:30:00
#SBATCH --partition=hopper-prod
#SBATCH --nodes={{ nodes }}