From 78ea484a951894924fde1f37359c12b4f4179416 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 13 Feb 2023 15:29:47 -0600 Subject: Work on octopus --- topics/systems/hpc/octopus-maintenance.gmi | 36 ++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 topics/systems/hpc/octopus-maintenance.gmi (limited to 'topics/systems/hpc/octopus-maintenance.gmi') diff --git a/topics/systems/hpc/octopus-maintenance.gmi b/topics/systems/hpc/octopus-maintenance.gmi new file mode 100644 index 0000000..6f44433 --- /dev/null +++ b/topics/systems/hpc/octopus-maintenance.gmi @@ -0,0 +1,36 @@ +# Octopus Maintenance + +## Slurm + +Check the status of Slurm: + +``` +sinfo +sinfo -R +squeue +``` + +We have draining nodes, but no jobs are running on them. + +Reviving a draining node (as root): + +``` +scontrol + update NodeName=octopus05 State=DOWN Reason="undraining" + update NodeName=octopus05 State=RESUME + show node octopus05 +``` + +A job step that cannot be killed within UnkillableStepTimeout can leave a node in the drain state: + +``` +scontrol show config | grep kill +UnkillableStepProgram = (null) +UnkillableStepTimeout = 60 sec +``` + +Check the valid configuration with `slurmd -C` and update the nodes with: + +``` +scontrol reconfigure +``` -- cgit v1.2.3