Automatic SLURM Build Script
Building and installing SLURM is a partially manual process. We have compiled a script which automatically builds and installs SLURM on Redhat/CentOS 7.x and 8.x.
In case you want to install SLURM on Ubuntu, you can follow our guide on installing SLURM on Ubuntu (in WSL). Please see below for a container-based setup of a SLURM cluster.
You can execute the script in one go or step by step to see what happens. We recommend executing the script as root, but it should also work when run as a standard user with sudo privileges.
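If you want to keep a record of what the script does, here is a minimal sketch of a full run (assuming you saved the script under the name SLURM_installation.sh used for the download further below):

chmod +x SLURM_installation.sh
sudo bash -x ./SLURM_installation.sh 2>&1 | tee slurm_install.log   # -x echoes every command, tee keeps a log of the run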
Here is the automatic SLURM build and installation script (below you can find the script for download as well):
################################################################################
# Copyright (C) 2019-2020 NI SP GmbH
# All Rights Reserved
#
# info@ni-sp.com / www.ni-sp.com
#
# We provide the information on an as is basis.
# We provide no warranties, express or implied, related to the
# accuracy, completeness, timeliness, useability, and/or merchantability
# of the data and are not liable for any loss, damage, claim, liability,
# expense, or penalty, or for any direct, indirect, special, secondary,
# incidental, consequential, or exemplary damages or lost profit
# deriving from the use or misuse of this information.
################################################################################
# Version v1.1
#
# SLURM 20.02.3 Build and Installation script for Redhat/CentOS EL7 and EL8
#
# See also https://www.slothparadise.com/how-to-install-slurm-on-centos-7-cluster/
# https://slurm.schedmd.com/quickstart_admin.html
# https://wiki.fysik.dtu.dk/niflheim/Slurm_installation
# https://slurm.schedmd.com/faq.html

# In case of version 7 "Compute Node" was the base for the installation
# In case of version 8 "Server" was the base for the installation

# SLURM accounting support
yum install mariadb-server mariadb-devel -y

# For all the nodes, before you install Slurm or Munge:

# check for RH/CentOS Version
OSVERSION="7"
if [ "`hostnamectl | grep Kernel | grep el8`" != "" ] ; then
    OSVERSION="8"
fi

# sudo su -
export MUNGEUSER=966
sudo groupadd -g $MUNGEUSER munge
sudo useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGEUSER -g munge -s /sbin/nologin munge
export SLURMUSER=967
sudo groupadd -g $SLURMUSER slurm
sudo useradd -m -c "SLURM workload manager" -d /var/lib/slurm -u $SLURMUSER -g slurm -s /bin/bash slurm
# exit

# For CentOS 7: need to get the latest EPEL repository.
sudo yum install epel-release -y
if [ "$OSVERSION" == "7" ] ; then
    sudo yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm -y
fi
if [ "$OSVERSION" == "8" ] ; then
    sudo yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm -y
fi

# install munge
if [ "$OSVERSION" == "7" ] ; then
    sudo yum install munge munge-libs munge-devel -y
fi
if [ "$OSVERSION" == "8" ] ; then
    sudo yum install munge munge-libs -y
    dnf --enablerepo=PowerTools install munge-devel -y
fi
sudo yum install rng-tools -y
sudo rngd -r /dev/urandom

sudo /usr/sbin/create-munge-key -r -f
sudo sh -c "dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key"
sudo chown munge: /etc/munge/munge.key
sudo chmod 400 /etc/munge/munge.key

sudo systemctl enable munge
sudo systemctl start munge

# build and install SLURM
sudo yum install python3 gcc openssl openssl-devel pam-devel numactl numactl-devel hwloc lua readline-devel ncurses-devel man2html libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch -y
if [ "$OSVERSION" == "7" ] ; then
    sudo yum install rrdtool-devel lua-devel hwloc-devel -y
fi
if [ "$OSVERSION" == "8" ] ; then
    yum install rpm-build make -y
    dnf --enablerepo=PowerTools install rrdtool-devel lua-devel hwloc-devel -y
    # dnf group install "Development Tools"
fi

mkdir slurm-tmp
cd slurm-tmp
export VER=20.02.6   # latest 20.02
export VER=20.11.0

# https://download.schedmd.com/slurm/slurm-20.02.3.tar.bz2
wget https://download.schedmd.com/slurm/slurm-$VER.tar.bz2

rpmbuild -ta slurm-$VER.tar.bz2    # and wait a few minutes until SLURM has been compiled
# if [ "$OSVERSION" == "7" ] ; then
# fi
# if [ "$OSVERSION" == "8" ] ; then
#     rpm-build -ta slurm-$VER.tar.bz2    # and wait a few minutes until SLURM has been compiled
# fi

rm slurm-$VER.tar.bz2
cd ..
rmdir slurm-tmp

# get perl-Switch
# sudo yum install cpan -y

cd ~/rpmbuild/RPMS/x86_64/

# skipping slurm-openlava and slurm-torque because of missing perl-Switch
sudo yum --nogpgcheck localinstall slurm-[0-9]*.el?.x86_64.rpm slurm-contribs-*.el?.x86_64.rpm slurm-devel-*.el?.x86_64.rpm \
    slurm-example-configs-*.el?.x86_64.rpm slurm-libpmi-*.el?.x86_64.rpm \
    slurm-pam_slurm-*.el?.x86_64.rpm slurm-perlapi-*.el?.x86_64.rpm slurm-slurmctld-*.el?.x86_64.rpm \
    slurm-slurmd-*.el?.x86_64.rpm slurm-slurmdbd-*.el?.x86_64.rpm -y

# create the SLURM default configuration with
# compute nodes called "NodeName=linux[1-32]"
# in a cluster called "cluster"
# and a partition name called "test"
# Feel free to adapt to your needs

HOST=`hostname`

# use tee so the redirection also works when the script runs as a standard user with sudo
sudo tee /etc/slurm/slurm.conf > /dev/null << EOF
# slurm.conf file generated by configurator easy.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
SlurmctldHost=localhost
#
#MailProg=/bin/mail
MpiDefault=none
#MpiParams=ports=#-#
ProctrackType=proctrack/cgroup
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
#SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
#SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
StateSaveLocation=/var/spool
SwitchType=switch/none
TaskPlugin=task/affinity
#
#
# TIMERS
#KillWait=30
#MinJobAge=300
#SlurmctldTimeout=120
#SlurmdTimeout=300
#
#
# SCHEDULING
# Obsolete: FastSchedule=1
SchedulerType=sched/backfill
SelectType=select/cons_res
SelectTypeParameters=CR_Core
#
#
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/none
ClusterName=cluster
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
#SlurmctldDebug=info
#SlurmctldLogFile=
#SlurmdDebug=info
#SlurmdLogFile=
#
#
# COMPUTE NODES
NodeName=$HOST State=idle Feature=dcv2,other
# NodeName=linux[1-32] CPUs=1 State=UNKNOWN
# NodeName=linux1 NodeAddr=128.197.115.158 CPUs=4 State=UNKNOWN
# NodeName=linux2 NodeAddr=128.197.115.7 CPUs=4 State=UNKNOWN
PartitionName=test Nodes=$HOST Default=YES MaxTime=INFINITE State=UP
# PartitionName=test Nodes=$HOST,linux[1-32] Default=YES MaxTime=INFINITE State=UP
# DefMemPerNode=1000
# MaxMemPerNode=1000
# DefMemPerCPU=4000
# MaxMemPerCPU=4096
EOF

sudo tee /etc/slurm/cgroup.conf > /dev/null << EOF
###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupAutomount=yes
ConstrainCores=no
ConstrainRAMSpace=no
EOF

sudo mkdir /var/spool/slurmctld
sudo chown slurm:slurm /var/spool/slurmctld
sudo chmod 755 /var/spool/slurmctld
sudo mkdir -p /var/spool/slurm/cluster_state
sudo chown slurm:slurm /var/spool/slurm/cluster_state
sudo touch /var/log/slurmctld.log
sudo chown slurm:slurm /var/log/slurmctld.log
sudo touch /var/log/slurm_jobacct.log /var/log/slurm_jobcomp.log
sudo chown slurm: /var/log/slurm_jobacct.log /var/log/slurm_jobcomp.log

# firewall will block connections between nodes so in case of cluster
# with multiple nodes adapt the firewall on the compute nodes
#
# sudo systemctl stop firewalld
# sudo systemctl disable firewalld

# on the master node
#sudo firewall-cmd --permanent --zone=public --add-port=6817/udp
#sudo firewall-cmd --permanent --zone=public --add-port=6817/tcp
#sudo firewall-cmd --permanent --zone=public --add-port=6818/tcp
#sudo firewall-cmd --permanent --zone=public --add-port=6818/tcp
#sudo firewall-cmd --permanent --zone=public --add-port=7321/tcp
#sudo firewall-cmd --permanent --zone=public --add-port=7321/tcp
#sudo firewall-cmd --reload

# sync clock on master and every compute node
#sudo yum install ntp -y
#sudo chkconfig ntpd on
#sudo ntpdate pool.ntp.org
#sudo systemctl start ntpd

sudo systemctl enable slurmctld
sudo systemctl enable slurmdbd

# on compute nodes
sudo systemctl enable slurmd.service
sudo systemctl start slurmd.service
echo Sleep for a few seconds for slurmd to come up ...
sleep 3

# on master
chmod 777 /var/spool   # hack for now as otherwise slurmctld is complaining
sudo systemctl start slurmctld.service
echo Sleep for a few seconds for slurmctld to come up ...
sleep 3

# checking
# sudo systemctl status slurmd.service
# sudo journalctl -xe

# if you experience an error with starting up slurmd.service
# like "fatal: Incorrect permissions on state save loc: /var/spool"
# then you might want to adapt with chmod 777 /var/spool

# more checking
# sudo slurmd -Dvvv -N YOUR_HOSTNAME
# sudo slurmctld -D vvvvvvvv
# or tracing with sudo strace slurmctld -D vvvvvvvv

# echo Compute node bugs: tail /var/log/slurmd.log
# echo Server node bugs: tail /var/log/slurmctld.log

# show cluster
echo
echo Output from: \"sinfo\"
sinfo
# sinfo -Nle
echo
echo Output from: \"scontrol show partition\"
scontrol show partition

# show host info as slurm sees it
echo
echo Output from: \"slurmd -C\"
slurmd -C

# in case host is in drain status
# scontrol update nodename=$HOST state=idle
echo
echo Output from: \"scontrol show nodes\"
scontrol show nodes

# If jobs are running on the node:
# scontrol update nodename=$HOST state=resume

# lets run our first job
echo
echo Output from: \"srun hostname\"
srun hostname

# if there are issues in scheduling
# turn on debugging
# sudo scontrol setdebug 6   # or up to 9
# check the journal
# journalctl -xe
# turn off debugging
# sudo scontrol setdebug 3

# scontrol
# scontrol: show node $HOST
# scontrol show jobs
# scontrol update NodeName=ip-172-31-23-216 State=RESUME
# scancel JOB_ID
# srun -N5 /bin/hostname

# after changing the configuration:
# scontrol reconfigure

# more resources
# https://slurm.schedmd.com/quickstart.html
# https://slurm.schedmd.com/quickstart_admin.html
#
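Once sinfo reports the node as idle, you can also submit a small batch job in addition to the interactive srun at the end of the script. A minimal sketch (the file name test_job.sh is just an example):

# a minimal batch job sketch - adapt names and resources to your setup
cat > test_job.sh << 'EOF'
#!/bin/bash
#SBATCH --job-name=test
#SBATCH --output=test_%j.out
#SBATCH --ntasks=1
srun hostname
EOF
sbatch test_job.sh   # submit the job to the default "test" partition
squeue               # check the queue while the job is pending/running
cat test_*.out       # job output once it has finished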
The SLURM build and installation script can be downloaded here as well: SLURM_installation.sh.
You can download pre-compiled RPMs for EL7 and EL8 here (after extracting the tarball and setting up MariaDB and Munge, you can basically resume the script above at “cd ~/rpmbuild/RPMS/x86_64/”; see the sketch after the list):
- SLURM Version 20.02.3
- SLURM Version 20.11.0
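As a rough sketch of that shortcut (the tarball and directory names below are placeholders for whatever you actually downloaded; MariaDB and Munge have to be set up first as in the full script above):

tar xf slurm-rpms-el8.tar.bz2   # placeholder file name - use the tarball you downloaded
cd slurm-rpms-el8               # placeholder directory name - use the extracted directory
sudo yum --nogpgcheck localinstall slurm-*.el?.x86_64.rpm -y
# then continue with the slurm.conf / cgroup.conf creation and the service startup from the script above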
In case you are interested in HPC in the Cloud head over to our overview article HPC in the Cloud – Pros and Cons.
Build RPMs only
In case you want to build only the RPMs, here is the script for EL7:
sudo yum install epel-release -y
sudo yum install python3 gcc openssl openssl-devel pam-devel numactl \
     numactl-devel hwloc lua readline-devel ncurses-devel man2html \
     libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch \
     rrdtool-devel lua-devel hwloc-devel munge munge-libs munge-devel \
     mariadb-server mariadb-devel -y

mkdir slurm-tmp
cd slurm-tmp
export VER=20.02.6   # latest 20.02
export VER=20.11.0

wget https://download.schedmd.com/slurm/slurm-$VER.tar.bz2
rpmbuild -ta slurm-$VER.tar.bz2

echo Your RPMs are at $HOME/rpmbuild/RPMS/x86_64:
ls -al $HOME/rpmbuild/RPMS/x86_64
And here is the automatic RPM builder for EL8:
sudo yum install epel-release -y
sudo yum install --enablerepo=PowerTools python3 gcc openssl \
     openssl-devel pam-devel numactl wget make numactl-devel \
     hwloc lua readline-devel ncurses-devel man2html \
     libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch \
     rrdtool-devel lua-devel hwloc-devel munge munge-libs munge-devel \
     mariadb-server mariadb-devel -y

mkdir slurm-tmp
cd slurm-tmp
export VER=20.02.6   # latest 20.02
export VER=20.11.0

wget https://download.schedmd.com/slurm/slurm-$VER.tar.bz2
rpmbuild -ta slurm-$VER.tar.bz2

echo Your RPMs are at $HOME/rpmbuild/RPMS/x86_64:
ls -al $HOME/rpmbuild/RPMS/x86_64
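Either script leaves the finished RPMs in $HOME/rpmbuild/RPMS/x86_64. They can then be copied to the other cluster nodes and installed there; a minimal sketch (the node names linux1 and linux2 are just examples):

for node in linux1 linux2 ; do    # example node names - adapt to your cluster
    scp $HOME/rpmbuild/RPMS/x86_64/slurm-*.rpm $node:/tmp/
    ssh $node "sudo yum --nogpgcheck localinstall /tmp/slurm-*.rpm -y"
done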
SLURM Cluster in Docker Containers
SciDAS has created an easy-to-use container-based SLURM setup to jump-start a small SLURM cluster. The automatic container build creates two SLURM compute workers with OpenMPI integration as well as a controller and a database container, as per this graph from the GitHub page:

Here is an overview of how the straightforward installation looks, with input from the GitHub page:
> git clone https://github.com/SciDAS/slurm-in-docker
Cloning into 'slurm-in-docker'...
remote: Enumerating objects: 549, done.
remote: Total 549 (delta 0), reused 0 (delta 0), pack-reused 549
Receiving objects: 100% (549/549), 144.72 KiB | 682.00 KiB/s, done.
Resolving deltas: 100% (310/310), done.

# BEGIN - install docker in case not yet done - in our case for Ubuntu
> sudo apt-get install -y apt-transport-https \
       ca-certificates curl gnupg-agent \
       software-properties-common
> curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
> sudo add-apt-repository \
       "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
       $(lsb_release -cs) \
       stable"
> sudo apt-get update
> sudo apt-get install -y docker-ce docker-ce-cli containerd.io
> sudo apt-get install -y docker-compose
> sudo groupadd docker
> sudo usermod -aG docker $USER
# END of Docker installation
# You might need to logout and login to activate the docker group access rights

#
# Create the SLURM 19.05.1 containers (SLURM version can be adapted)
#
> cd slurm-in-docker/
> make      # building will take some minutes
# ... lots of output ;) .............................

> docker images
REPOSITORY                TAG       IMAGE ID       CREATED         SIZE
scidas/slurm.database     19.05.1   035a7fb27574   3 days ago      828MB
scidas/slurm.worker       19.05.1   6faf0d7804f7   3 days ago      1.31GB
scidas/slurm.controller   19.05.1   e2445edbad54   3 days ago      1.31GB
scidas/slurm.base         19.05.1   668e97c1fb7b   3 days ago      805MB
scidas/slurm.rpms         19.05.1   8b5682048fee   3 days ago      885MB
centos                    7         7e6257c9f8d8   6 weeks ago     203MB
krallin/centos-tini       7         748636d1c058   16 months ago   226MB

> docker-compose up -d      # start the environment
Creating network "slurmindocker_slurm" with the default driver
Creating controller ...
Creating controller ... done
Creating worker01 ...
Creating database ...
Creating worker02 ...
Creating worker01
Creating database
Creating worker02 ... done

> docker exec -ti controller sinfo -lN
NODELIST   NODES PARTITION   STATE  CPUS  S:C:T  MEMORY  TMP_DISK  WEIGHT  AVAIL_FE  REASON
worker01       1   docker*    idle     1  1:1:1    1800         0       1    (null)  none
worker02       1   docker*    idle     1  1:1:1    1800         0       1    (null)  none

> docker exec -ti controller srun -N 2 hostname
worker02
worker01

> docker exec -ti controller srun --mpi=list
srun: MPI types are...
srun: pmi2
srun: openmpi
srun: none

> docker exec -ti controller ompi_info
# ......... OpenMPI info output .......

# Test OpenMPI
> cat > home/worker/mpi_hello.c << EOF
/******************************************************************************
*
* FILE: mpi_hello.c
*
* DESCRIPTION: MPI tutorial example code: Simple hello world program
*
* AUTHOR: Blaise Barney
*
* LAST REVISED: 03/05/10
*
******************************************************************************/
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define MASTER 0

int main (int argc, char *argv[])
{
    int numtasks, taskid, len;
    char hostname[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Get_processor_name(hostname, &len);
    printf ("Hello from task %d on %s!\n", taskid, hostname);
    if (taskid == MASTER)
        printf("MASTER: Number of MPI tasks is: %d\n", numtasks);
    // while(1) {}
    MPI_Finalize();
}
EOF

> docker exec -ti worker01 mpicc mpi_hello.c -o mpi_hello.out
> docker exec -ti worker01 srun -N 2 --mpi=openmpi mpi_hello.out
Hello from task 1 on worker02!
Hello from task 0 on worker01!
MASTER: Number of MPI tasks is: 2

# disable message about missing openib in case with the following setting
# docker exec -ti worker01 bash -c "export \
#       OMPI_MCA_btl_base_warn_component_unused=0; srun -N 2 --mpi=openmpi mpi_hello.out"

# login to a worker container
> docker exec -ti worker01 bash

# and finally shutdown the SLURM container environment
> sh teardown.sh
# docker-compose stop
# docker-compose rm -f
# docker volume rm slurmindocker_home slurmindocker_secret
# docker network rm slurmindocker_slurm
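You can also submit batch jobs through the controller container and, since the setup includes a database container for accounting, query the job history afterwards. A minimal sketch (assuming accounting is wired to the database container as in this compose setup):

> docker exec -ti controller sbatch -N 2 --wrap="srun hostname"                   # submit a small batch job
> docker exec -ti controller squeue                                               # watch the queue
> docker exec -ti controller sacct --format=JobID,JobName,NNodes,State,Elapsed    # accounting history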
In case the controller constantly restarts with messages like
sacctmgr: error: Malformed RPC of type PERSIST_RC(1433) received
sacctmgr: error: slurm_persist_conn_open: Failed to unpack persistent connection init resp message from database:6819 :
you can reset the environment: tear it down, remove the generated keys, and bring it up again:
sh teardown.sh
rm -rf home/worker/.ssh/*
sudo rm -rf secret/*
docker-compose up -d
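Afterwards you can verify that the controller stays up, for example:

docker ps                          # the controller should no longer be in a restart loop
docker logs controller             # inspect the controller output for remaining errors
docker exec -ti controller sinfo   # the workers should report as idle again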
Have a look at our other technical guides related to NICE DCV and the EnginFrame HPC and session management portal. If there are any questions, please let us know.
Commercial Support for SLURM
Our experienced technical team offers professional SLURM support for commercial and academic customers. We help you solve issues with your SLURM installation via email, phone and web conference. In case you are interested, let us know at info@ni-sp.com or via our contact form.