diff --git a/ci/LICENSE.txt b/ci/LICENSE.txt
new file mode 100644
index 0000000..b5872b0
--- /dev/null
+++ b/ci/LICENSE.txt
@@ -0,0 +1,28 @@
+Copyright (c) 2018, Anaconda, Inc. and contributors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+Neither the name of Anaconda nor the names of any contributors may be used to
+endorse or promote products derived from this software without specific prior
+written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/ci/conda_setup.sh b/ci/conda_setup.sh
new file mode 100755
index 0000000..c9849d6
--- /dev/null
+++ b/ci/conda_setup.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+# Install miniconda
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
+bash ~/miniconda.sh -b -p $HOME/miniconda
+export PATH="$HOME/miniconda/bin:$PATH"
+conda update conda --yes
+conda clean -tipy
+conda config --set always_yes yes --set changeps1 no
+conda --version
diff --git a/ci/environment.yml b/ci/environment.yml
new file mode 100644
index 0000000..b6a4c9c
--- /dev/null
+++ b/ci/environment.yml
@@ -0,0 +1,12 @@
+name: dask-jobqueue
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - python=3.8
+  - dask
+  - distributed
+  - flake8
+  - black
+  - pytest
+  - psutil
diff --git a/ci/htcondor.sh b/ci/htcondor.sh
new file mode 100755
index 0000000..ec2c841
--- /dev/null
+++ b/ci/htcondor.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+function jobqueue_before_install {
+    docker version
+    docker-compose version
+
+    # start htcondor cluster
+    cd ./ci/htcondor
+    docker-compose pull
+    ./start-htcondor.sh
+    docker-compose exec -T submit /bin/bash -c "condor_status"
+    docker-compose exec -T submit /bin/bash -c "condor_q"
+
+    #Set shared space permissions
+    docker-compose exec -T submit /bin/bash -c "chmod -R 777 /shared_space"
+    cd -
+
+    docker ps -a
+    docker images
+}
+
+function jobqueue_install {
+    cd ./ci/htcondor
+    docker-compose exec -T submit /bin/bash -c "cd /dask-jobqueue; pip3 install -e .; chown -R submituser ."
+    cd -
+}
+
+function jobqueue_script {
+    cd ./ci/htcondor
+    docker-compose exec -T --user submituser submit /bin/bash -c "cd; pytest /dask-jobqueue/dask_jobqueue --log-cli-level DEBUG --capture=tee-sys --verbose -E htcondor "
+    cd -
+}
+
+function jobqueue_after_script {
+    cd ./ci/htcondor
+    docker-compose exec -T --user submituser submit /bin/bash -c "condor_q"
+    docker-compose exec -T submit /bin/bash -c "condor_status"
+    docker-compose exec -T --user submituser submit /bin/bash -c "condor_history"
+    docker-compose exec -T --user submituser submit /bin/bash -c "cd; cat logs/*"
+    docker-compose exec -T cm /bin/bash -c " grep -R \"\" /var/log/condor/ "
+    cd -
+}
diff --git a/ci/htcondor/Dockerfile b/ci/htcondor/Dockerfile
new file mode 100644
index 0000000..06e0d70
--- /dev/null
+++ b/ci/htcondor/Dockerfile
@@ -0,0 +1,15 @@
+FROM htcondor/submit:el7 as submit
+
+RUN curl -o miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash miniconda.sh -f -b -p /opt/anaconda && \
+    /opt/anaconda/bin/conda clean -tipy && \
+    rm -f miniconda.sh
+ENV PATH /opt/anaconda/bin:$PATH
+# The environment.yml file is copied in by the CI script. If building manually, copy it from the parent directory first.
+COPY environment.yml .
+RUN conda env update -n base --file environment.yml
+
+FROM htcondor/execute:el7 as execute
+
+COPY --from=submit /opt/anaconda /opt/anaconda
+ENV PATH /opt/anaconda/bin:$PATH
diff --git a/ci/htcondor/condor_config.local b/ci/htcondor/condor_config.local
new file mode 100644
index 0000000..688eeae
--- /dev/null
+++ b/ci/htcondor/condor_config.local
@@ -0,0 +1 @@
+NEGOTIATOR_INTERVAL=10
\ No newline at end of file
diff --git a/ci/htcondor/docker-compose.yml b/ci/htcondor/docker-compose.yml
new file mode 100644
index 0000000..358aafb
--- /dev/null
+++ b/ci/htcondor/docker-compose.yml
@@ -0,0 +1,66 @@
+version: "3.4"
+
+services:
+  cm:
+    image: htcondor/cm:el7
+    hostname: cm.htcondor
+    environment:
+      - USE_POOL_PASSWORD=yes
+    volumes:
+      - secrets:/root/secrets
+      - ./condor_config.local:/etc/condor/condor_config.local
+    command: bash -c 'condor_store_cred -p password -f /root/secrets/pool_password ; exec bash -x /start.sh'
+
+  submit:
+    image: daskdev/dask-jobqueue:htcondor-submit
+    build:
+      context: .
+      target: submit
+    hostname: submit.htcondor
+    environment:
+      - CONDOR_HOST=cm
+      - USE_POOL_PASSWORD=yes
+      - CI_SHARED_SPACE=/shared_space
+    depends_on:
+      - cm
+    volumes:
+      - secrets:/root/secrets
+      - ../..:/dask-jobqueue
+      - ./condor_config.local:/etc/condor/condor_config.local
+      - shared_space:/shared_space
+
+  execute1:
+    image: daskdev/dask-jobqueue:htcondor-execute
+    build:
+      context: .
+      target: execute
+    hostname: execute1.htcondor
+    environment:
+      - CONDOR_HOST=cm
+      - USE_POOL_PASSWORD=yes
+    depends_on:
+      - cm
+    volumes:
+      - secrets:/root/secrets
+      - ./condor_config.local:/etc/condor/condor_config.local
+      - shared_space:/shared_space
+
+  execute2:
+    image: daskdev/dask-jobqueue:htcondor-execute
+    build:
+      context: .
+      target: execute
+    hostname: execute2.htcondor
+    environment:
+      - CONDOR_HOST=cm
+      - USE_POOL_PASSWORD=yes
+    depends_on:
+      - cm
+    volumes:
+      - secrets:/root/secrets
+      - ./condor_config.local:/etc/condor/condor_config.local
+      - shared_space:/shared_space
+
+volumes:
+  secrets:
+  shared_space:
diff --git a/ci/htcondor/start-htcondor.sh b/ci/htcondor/start-htcondor.sh
new file mode 100755
index 0000000..1afe454
--- /dev/null
+++ b/ci/htcondor/start-htcondor.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+docker-compose up -d --no-build
+
+while [ `docker-compose exec -T submit condor_status -af activity|grep Idle|wc -l` -ne 2 ]
+  do
+    echo "Waiting for cluster to become ready";
+    sleep 2
+  done
+echo "HTCondor properly configured"
diff --git a/ci/none.sh b/ci/none.sh
new file mode 100644
index 0000000..2cb8a9f
--- /dev/null
+++ b/ci/none.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+function jobqueue_before_install {
+    true # Pass
+}
+
+function jobqueue_install {
+    which python
+    pip install --no-deps -e .
+}
+
+function jobqueue_script {
+    flake8 -j auto dask_jobqueue
+    black --exclude versioneer.py --check .
+    pytest --verbose
+}
+
+function jobqueue_after_script {
+    echo "Done."
+}
diff --git a/ci/pbs.sh b/ci/pbs.sh
new file mode 100644
index 0000000..b4af112
--- /dev/null
+++ b/ci/pbs.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+function jobqueue_before_install {
+    docker version
+    docker-compose version
+
+    # start pbs cluster
+    cd ./ci/pbs
+    docker-compose pull
+    ./start-pbs.sh
+    cd -
+
+    #Set shared space permissions
+    docker exec pbs_master /bin/bash -c "chmod -R 777 /shared_space"
+    docker exec pbs_master /bin/bash -c "chown -R pbsuser:pbsuser /home/pbsuser"
+
+    docker exec -u pbsuser pbs_master pbsnodes -a
+    docker ps -a
+    docker images
+}
+
+function jobqueue_install {
+    docker exec pbs_master /bin/bash -c "cd /dask-jobqueue; pip install -e .; chown -R pbsuser ."
+}
+
+function jobqueue_script {
+    docker exec -u pbsuser pbs_master /bin/bash -c "cd; pytest /dask-jobqueue/dask_jobqueue --verbose -s -E pbs"
+}
+
+function jobqueue_after_script {
+    docker exec -u pbsuser pbs_master qstat -fx
+    docker exec pbs_master bash -c 'cat /var/spool/pbs/sched_logs/*|| true'
+    docker exec pbs_master bash -c 'cat /var/spool/pbs/server_logs/*|| true'
+    docker exec pbs_master bash -c 'cat /var/spool/pbs/server_priv/accounting/*|| true'
+    docker exec pbs_slave_1 bash -c 'cat /var/spool/pbs/mom_logs/*|| true'
+    docker exec pbs_slave_1 bash -c 'cat /var/spool/pbs/spool/*|| true'
+    docker exec pbs_slave_1 bash -c 'cat /tmp/*.e*|| true'
+    docker exec pbs_slave_1 bash -c 'cat /tmp/*.o*|| true'
+    docker exec pbs_slave_2 bash -c 'cat /var/spool/pbs/mom_logs/*|| true'
+    docker exec pbs_slave_2 bash -c 'cat /var/spool/pbs/spool/*|| true'
+    docker exec pbs_slave_2 bash -c 'cat /tmp/*.e*|| true'
+    docker exec pbs_slave_2 bash -c 'cat /tmp/*.o*|| true'
+}
diff --git a/ci/pbs/Dockerfile b/ci/pbs/Dockerfile
new file mode 100644
index 0000000..bb338a0
--- /dev/null
+++ b/ci/pbs/Dockerfile
@@ -0,0 +1,42 @@
+# inspired from https://github.com/PBSPro/pbspro/blob/v18.1.beta/docker/centos7/
+# multi-stage build
+# build script will be triggered
+FROM centos:7.5.1804 AS builder
+# install dependencies for building
+RUN yum install -y gcc make rpm-build libtool hwloc-devel libX11-devel \
+    libXt-devel libedit-devel libical-devel ncurses-devel perl \
+    postgresql-devel python-devel tcl-devel tk-devel swig expat-devel \
+    openssl-devel libXext libXft git postgresql-contrib
+# get known PBS Pro source code
+RUN git clone --branch release_18_1_branch https://github.com/pbspro/pbspro.git /src/pbspro
+COPY build.sh /
+RUN bash /build.sh
+
+# base image
+FROM centos:7.5.1804
+LABEL description="PBS Professional Open Source and conda"
+
+#The pbs master node name, can be overridden if needed
+ENV PBS_MASTER pbs_master
+ENV PATH /opt/pbs/bin:/opt/anaconda/bin:$PATH
+ENV LANG en_US.UTF-8
+ENV LC_ALL en_US.UTF-8
+
+COPY --from=builder /root/rpmbuild/RPMS/x86_64/pbspro-server-*.rpm .
+# install pbspro and useful packages
+RUN yum install -y pbspro-server-*.rpm curl bzip2 git gcc sudo openssh-server && yum clean all
+# install python
+RUN curl -o miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash miniconda.sh -f -b -p /opt/anaconda && \
+    /opt/anaconda/bin/conda clean -tipy && \
+    rm -f miniconda.sh
+# The environment.yml file is copied in by the CI script. If building manually, copy it from the parent directory first.
+COPY environment.yml .
+RUN conda env update -n base --file environment.yml
+
+# Copy entrypoint and other needed scripts
+COPY ./*.sh /
+RUN chmod a+x ./*.sh
+
+# default entrypoint launch pbs master
+ENTRYPOINT ["bash", "/master-entrypoint.sh"]
diff --git a/ci/pbs/build.sh b/ci/pbs/build.sh
new file mode 100644
index 0000000..11b5571
--- /dev/null
+++ b/ci/pbs/build.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+cd /src/pbspro
+./autogen.sh
+./configure -prefix=/opt/pbs
+make dist
+mkdir /root/rpmbuild /root/rpmbuild/SOURCES /root/rpmbuild/SPECS
+cp pbspro-*.tar.gz /root/rpmbuild/SOURCES
+cp pbspro.spec /root/rpmbuild/SPECS
+cp pbspro-rpmlintrc /root/rpmbuild/SOURCES
+cd /root/rpmbuild/SPECS
+rpmbuild -ba pbspro.spec
diff --git a/ci/pbs/docker-compose.yml b/ci/pbs/docker-compose.yml
new file mode 100644
index 0000000..550c7ae
--- /dev/null
+++ b/ci/pbs/docker-compose.yml
@@ -0,0 +1,54 @@
+version: "2"
+
+services:
+
+  master:
+    image: daskdev/dask-jobqueue:pbs
+    build: .
+    container_name: pbs_master
+    hostname: pbs_master
+    environment:
+      - CI_SHARED_SPACE=/shared_space
+    volumes:
+      - ../..:/dask-jobqueue
+      - userhome:/home/pbsuser
+      - shared_space:/shared_space
+    command: bash /run-master.sh
+
+  slave_one:
+    image: daskdev/dask-jobqueue:pbs
+    build: .
+    container_name: pbs_slave_1
+    hostname: pbs_slave_1
+    volumes:
+      - userhome:/home/pbsuser
+      - shared_space:/shared_space
+    entrypoint: "bash /slave-entrypoint.sh"
+    command: bash /run-slave.sh
+    links:
+      - "master:pbs_master"
+    environment:
+      - PBS_MASTER=pbs_master
+    depends_on:
+      - master
+
+  slave_two:
+    image: daskdev/dask-jobqueue:pbs
+    build: .
+    container_name: pbs_slave_2
+    hostname: pbs_slave_2
+    volumes:
+      - userhome:/home/pbsuser
+      - shared_space:/shared_space
+    entrypoint: "bash /slave-entrypoint.sh"
+    command: bash /run-slave.sh
+    links:
+      - "master:pbs_master"
+    environment:
+      - PBS_MASTER=pbs_master
+    depends_on:
+      - master
+
+volumes:
+  userhome:
+  shared_space:
diff --git a/ci/pbs/master-entrypoint.sh b/ci/pbs/master-entrypoint.sh
new file mode 100644
index 0000000..7a2669c
--- /dev/null
+++ b/ci/pbs/master-entrypoint.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+pbs_conf_file=/etc/pbs.conf
+mom_conf_file=/var/spool/pbs/mom_priv/config
+hostname=$(hostname)
+
+# replace hostname in pbs.conf and mom_priv/config
+sed -i "s/PBS_SERVER=.*/PBS_SERVER=$hostname/" $pbs_conf_file
+sed -i "s/\$clienthost .*/\$clienthost $hostname/" $mom_conf_file
+
+# start PBS Pro
+/etc/init.d/pbs start
+
+# create default non-root user
+adduser pbsuser
+
+exec "$@"
diff --git a/ci/pbs/run-master.sh b/ci/pbs/run-master.sh
new file mode 100755
index 0000000..143a97f
--- /dev/null
+++ b/ci/pbs/run-master.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Reduce time between PBS scheduling and add history
+qmgr -c "set server scheduler_iteration = 20"
+qmgr -c "set server job_history_enable = True"
+qmgr -c "set server job_history_duration = 24:00:00"
+
+# add two slaves to pbs
+qmgr -c "create node pbs_slave_1"
+qmgr -c "create node pbs_slave_2"
+
+# Start hanging process to leave the container up and running
+sleep infinity
diff --git a/ci/pbs/run-slave.sh b/ci/pbs/run-slave.sh
new file mode 100755
index 0000000..1247a59
--- /dev/null
+++ b/ci/pbs/run-slave.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+# Start hanging process to leave the container up and running
+sleep infinity
diff --git a/ci/pbs/slave-entrypoint.sh b/ci/pbs/slave-entrypoint.sh
new file mode 100644
index 0000000..b723904
--- /dev/null
+++ b/ci/pbs/slave-entrypoint.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+pbs_conf_file=/etc/pbs.conf
+mom_conf_file=/var/spool/pbs/mom_priv/config
+hostname=$(hostname)
+
+# replace hostname in pbs.conf and mom_priv/config
+sed -i "s/PBS_SERVER=.*/PBS_SERVER=$PBS_MASTER/" $pbs_conf_file
+sed -i "s/\$clienthost .*/\$clienthost $hostname/" $mom_conf_file
+sed -i "s/PBS_START_SERVER=.*/PBS_START_SERVER=0/" $pbs_conf_file
+sed -i "s/PBS_START_SCHED=.*/PBS_START_SCHED=0/" $pbs_conf_file
+sed -i "s/PBS_START_COMM=.*/PBS_START_COMM=0/" $pbs_conf_file
+sed -i "s/PBS_START_MOM=.*/PBS_START_MOM=1/" $pbs_conf_file
+
+# Prevent PBS from trying to use scp between hosts for the stdout and stderr files of jobs
+# On a standard PBS deployment, you would use a shared mount or correctly configured passwordless scp
+echo "\$usecp *:/home/ /home/" >> $mom_conf_file
+echo "\$usecp *:/dask-jobqueue/ /tmp/" >> $mom_conf_file
+
+# start PBS Pro
+/etc/init.d/pbs start
+
+# create default non-root user
+adduser pbsuser
+
+exec "$@"
diff --git a/ci/pbs/start-pbs.sh b/ci/pbs/start-pbs.sh
new file mode 100755
index 0000000..138d5f6
--- /dev/null
+++ b/ci/pbs/start-pbs.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+docker-compose up -d --no-build
+while [ `docker exec -u pbsuser pbs_master pbsnodes -a | grep "Mom = pbs_slave" | wc -l` -ne 2 ]
+do
+    echo "Waiting for PBS slave nodes to become available";
+    sleep 2
+done
+echo "PBS properly configured"
diff --git a/ci/sge.sh b/ci/sge.sh
new file mode 100644
index 0000000..227e8db
--- /dev/null
+++ b/ci/sge.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+function jobqueue_before_install {
+    docker version
+    docker-compose version
+
+    # start sge cluster
+    cd ./ci/sge
+    docker-compose pull
+    ./start-sge.sh
+    cd -
+
+    #Set shared space permissions
+    docker exec sge_master /bin/bash -c "chmod -R 777 /shared_space"
+
+    docker ps -a
+    docker images
+    docker exec sge_master qconf -sq dask.q
+}
+
+function jobqueue_install {
+    docker exec sge_master /bin/bash -c "cd /dask-jobqueue; pip install -e ."
+}
+
+function jobqueue_script {
+    docker exec sge_master /bin/bash -c "cd; pytest /dask-jobqueue/dask_jobqueue --verbose -s -E sge"
+}
+
+function jobqueue_after_script {
+    echo "Daemon logs"
+    docker exec sge_master bash -c 'cat /tmp/sge*' || echo "No sge_master logs"
+    docker exec slave_one bash -c 'cat /tmp/exec*' || echo "No slave_one logs"
+    docker exec slave_two bash -c 'cat /tmp/exec*' || echo "No slave_two logs"
+}
diff --git a/ci/sge/Dockerfile b/ci/sge/Dockerfile
new file mode 100644
index 0000000..dfc60a2
--- /dev/null
+++ b/ci/sge/Dockerfile
@@ -0,0 +1,32 @@
+FROM ubuntu:14.04 as base
+
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install curl bzip2 git gcc -y --fix-missing
+
+RUN curl -o miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash miniconda.sh -f -b -p /opt/anaconda && \
+    /opt/anaconda/bin/conda clean -tipy && \
+    rm -f miniconda.sh
+ENV PATH /opt/anaconda/bin:$PATH
+# The environment.yml file is copied in by the CI script. If building manually, copy it from the parent directory first.
+COPY environment.yml .
+RUN conda env update -n base --file environment.yml
+
+COPY ./*.sh /
+COPY ./*.txt /
+
+FROM base as slave
+RUN bash ./setup-slave.sh
+
+FROM base as master
+RUN bash ./setup-master.sh
+
+# expose ports
+EXPOSE 8000
+EXPOSE 6444
+EXPOSE 6445
+EXPOSE 6446
+
+ENV SGE_ROOT /var/lib/gridengine/
+ENV SGE_CELL default
diff --git a/ci/sge/add_worker.sh b/ci/sge/add_worker.sh
new file mode 100644
index 0000000..d48c620
--- /dev/null
+++ b/ci/sge/add_worker.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+QUEUE=$1
+HOSTNAME=$2
+SLOTS=$3
+
+# add to the execution host list
+TMPFILE=/tmp/sge.hostname-$HOSTNAME
+echo -e "hostname $HOSTNAME\nload_scaling NONE\ncomplex_values NONE\nuser_lists NONE\nxuser_lists NONE\nprojects NONE\nxprojects NONE\nusage_scaling NONE\nreport_variables NONE" > $TMPFILE
+qconf -Ae $TMPFILE
+rm $TMPFILE
+
+# add to the all hosts list
+qconf -aattr hostgroup hostlist $HOSTNAME @allhosts
+
+# enable the host for the queue, in case it was disabled and not removed
+qmod -e $QUEUE@$HOSTNAME
+
+# Add memory resource
+qconf -mattr exechost complex_values h_vmem=100G $HOSTNAME
+
+if [ "$SLOTS" ]; then
+    qconf -aattr queue slots "[$HOSTNAME=$SLOTS]" $QUEUE
+fi
diff --git a/ci/sge/docker-compose.yml b/ci/sge/docker-compose.yml
new file mode 100644
index 0000000..ed0de0e
--- /dev/null
+++ b/ci/sge/docker-compose.yml
@@ -0,0 +1,61 @@
+version: "3.4"
+
+services:
+
+  master:
+    image: daskdev/dask-jobqueue:sge
+    build:
+      context: .
+      target: master
+    container_name: sge_master
+    hostname: sge_master
+    #network_mode: host
+    environment:
+      - CI_SHARED_SPACE=/shared_space
+    volumes:
+      - ../..:/dask-jobqueue
+      - userhome:/root
+      - shared_space:/shared_space
+    command: bash /dask-jobqueue/ci/sge/run-master.sh
+
+  slave-one:
+    image: daskdev/dask-jobqueue:sge-slave
+    build:
+      context: .
+      target: slave
+    container_name: slave_one
+    hostname: slave_one
+    #network_mode: host
+    volumes:
+      - ../..:/dask-jobqueue
+      - userhome:/root
+      - shared_space:/shared_space
+    command: bash /dask-jobqueue/ci/sge/run-slave.sh
+    links:
+      - "master:sge_master"
+    depends_on:
+      - master
+
+  slave-two:
+    image: daskdev/dask-jobqueue:sge-slave
+    build:
+      context: .
+      target: slave
+      args:
+        PYTHON_VERSION: 3.8
+    container_name: slave_two
+    hostname: slave_two
+    #network_mode: host
+    volumes:
+      - ../..:/dask-jobqueue
+      - userhome:/root
+      - shared_space:/shared_space
+    command: bash /dask-jobqueue/ci/sge/run-slave.sh
+    links:
+      - "master:sge_master"
+    depends_on:
+      - master
+
+volumes:
+  userhome:
+  shared_space:
diff --git a/ci/sge/hosts.txt b/ci/sge/hosts.txt
new file mode 100644
index 0000000..5aee646
--- /dev/null
+++ b/ci/sge/hosts.txt
@@ -0,0 +1,2 @@
+group_name @allhosts
+hostlist NONE
diff --git a/ci/sge/queue.txt b/ci/sge/queue.txt
new file mode 100644
index 0000000..91ee22b
--- /dev/null
+++ b/ci/sge/queue.txt
@@ -0,0 +1,50 @@
+qname dask.q
+hostlist @allhosts
+seq_no 0
+load_thresholds NONE
+suspend_thresholds NONE
+nsuspend 1
+suspend_interval 00:00:01
+priority 0
+min_cpu_interval 00:00:01
+processors UNDEFINED
+qtype BATCH INTERACTIVE
+ckpt_list NONE
+pe_list make
+rerun FALSE
+slots 2
+tmpdir /tmp
+shell /bin/csh
+prolog NONE
+epilog NONE
+shell_start_mode posix_compliant
+starter_method NONE
+suspend_method NONE
+resume_method NONE
+terminate_method NONE
+notify 00:00:01
+owner_list NONE
+user_lists NONE
+xuser_lists NONE
+subordinate_list NONE
+complex_values NONE
+projects NONE
+xprojects NONE
+calendar NONE
+initial_state default
+s_rt INFINITY
+h_rt INFINITY
+s_cpu INFINITY
+h_cpu INFINITY
+s_fsize INFINITY
+h_fsize INFINITY
+s_data INFINITY
+h_data INFINITY
+s_stack INFINITY
+h_stack INFINITY
+s_core INFINITY
+h_core INFINITY
+s_rss INFINITY
+h_rss INFINITY
+s_vmem INFINITY
+h_vmem INFINITY
diff --git a/ci/sge/run-master.sh b/ci/sge/run-master.sh
new file mode 100755
index 0000000..6b42dc6
--- /dev/null
+++ b/ci/sge/run-master.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+
+# start sge
+sudo service gridengine-master restart
+
+while ! ping -c1 slave_one &>/dev/null; do :; done
+#Sometimes conf is inaccessible at first
+while ! qconf -sconf &>/dev/null; do sleep 0.1; done
+cat /var/lib/gridengine//default/common/act_qmaster
+
+qconf -Msconf /scheduler.txt
+qconf -Ahgrp /hosts.txt
+qconf -Aq /queue.txt
+
+qconf -ah slave_one
+qconf -ah slave_two
+qconf -ah slave_three
+
+qconf -as $HOSTNAME
+bash add_worker.sh dask.q slave_one 4
+bash add_worker.sh dask.q slave_two 4
+
+sudo service gridengine-master restart
+
+sleep infinity
diff --git a/ci/sge/run-slave.sh b/ci/sge/run-slave.sh
new file mode 100755
index 0000000..e7a0088
--- /dev/null
+++ b/ci/sge/run-slave.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# start sge
+#wait a bit for master configuration
+sleep 3
+
+sudo service gridengine-exec restart
+
+sleep infinity
diff --git a/ci/sge/scheduler.txt b/ci/sge/scheduler.txt
new file mode 100644
index 0000000..5718eba
--- /dev/null
+++ b/ci/sge/scheduler.txt
@@ -0,0 +1,35 @@
+algorithm default
+schedule_interval 0:0:1
+maxujobs 0
+queue_sort_method load
+job_load_adjustments np_load_avg=0.50
+load_adjustment_decay_time 0:7:30
+load_formula np_load_avg
+schedd_job_info true
+flush_submit_sec 0
+flush_finish_sec 0
+params none
+reprioritize_interval 0:0:0
+halftime 168
+usage_weight_list cpu=1.000000,mem=0.000000,io=0.000000
+compensation_factor 5.000000
+weight_user 0.250000
+weight_project 0.250000
+weight_department 0.250000
+weight_job 0.250000
+weight_tickets_functional 0
+weight_tickets_share 0
+share_override_tickets TRUE
+share_functional_shares TRUE
+max_functional_jobs_to_schedule 200
+report_pjob_tickets TRUE
+max_pending_tasks_per_job 50
+halflife_decay_list none
+policy_hierarchy OFS
+weight_ticket 0.500000
+weight_waiting_time 0.278000
+weight_deadline 3600000.000000
+weight_urgency 0.500000
+weight_priority 0.000000
+max_reservation 0
+default_duration INFINITY
diff --git a/ci/sge/setup-master.sh b/ci/sge/setup-master.sh
new file mode 100755
index 0000000..1ea7852
--- /dev/null
+++ b/ci/sge/setup-master.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Configure the master hostname for Grid Engine
+echo "gridengine-master shared/gridenginemaster string $HOSTNAME" | sudo debconf-set-selections
+echo "gridengine-master shared/gridenginecell string default" | sudo debconf-set-selections
+echo "gridengine-master shared/gridengineconfig boolean false" | sudo debconf-set-selections
+echo "gridengine-common shared/gridenginemaster string $HOSTNAME" | sudo debconf-set-selections
+echo "gridengine-common shared/gridenginecell string default" | sudo debconf-set-selections
+echo "gridengine-common shared/gridengineconfig boolean false" | sudo debconf-set-selections
+echo "gridengine-client shared/gridenginemaster string $HOSTNAME" | sudo debconf-set-selections
+echo "gridengine-client shared/gridenginecell string default" | sudo debconf-set-selections
+echo "gridengine-client shared/gridengineconfig boolean false" | sudo debconf-set-selections
+# Postfix mail server is also installed as a dependency
+echo "postfix postfix/main_mailer_type select No configuration" | sudo debconf-set-selections
+
+# Install Grid Engine
+sudo DEBIAN_FRONTEND=noninteractive apt-get install -y gridengine-master gridengine-client gridengine-drmaa-dev -qq
+
+# Set up Grid Engine
+sudo -u sgeadmin /usr/share/gridengine/scripts/init_cluster /var/lib/gridengine default /var/spool/gridengine/spooldb sgeadmin
+sudo service gridengine-master restart
+
+# Disable Postfix
+sudo service postfix stop
+sudo update-rc.d postfix disable
diff --git a/ci/sge/setup-slave.sh b/ci/sge/setup-slave.sh
new file mode 100755
index 0000000..58d9d87
--- /dev/null
+++ b/ci/sge/setup-slave.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+export MASTER_HOSTNAME=sge_master
+echo "gridengine-common shared/gridenginemaster string $MASTER_HOSTNAME" | sudo debconf-set-selections
+echo "gridengine-common shared/gridenginecell string default" | sudo debconf-set-selections
+echo "gridengine-common shared/gridengineconfig boolean false" | sudo debconf-set-selections
+echo "gridengine-client shared/gridenginemaster string $MASTER_HOSTNAME" | sudo debconf-set-selections
+echo "gridengine-client shared/gridenginecell string default" | sudo debconf-set-selections
+echo "gridengine-client shared/gridengineconfig boolean false" | sudo debconf-set-selections
+echo "postfix postfix/main_mailer_type select No configuration" | sudo debconf-set-selections
+
+sudo DEBIAN_FRONTEND=noninteractive apt-get install -y gridengine-exec gridengine-client gridengine-drmaa-dev -qq
+
+sudo service postfix stop
+sudo update-rc.d postfix disable
+echo $MASTER_HOSTNAME | sudo tee /var/lib/gridengine/default/common/act_qmaster
diff --git a/ci/sge/start-sge.sh b/ci/sge/start-sge.sh
new file mode 100755
index 0000000..9563253
--- /dev/null
+++ b/ci/sge/start-sge.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+docker-compose up -d --no-build
+
+START=$(date +%s)
+MAX_WAIT_SECONDS=300
+
+while [ `docker exec sge_master qhost | grep lx26-amd64 | wc -l` -ne 2 ]
+do
+    if [[ $(($(date +%s) - $START)) -gt $MAX_WAIT_SECONDS ]]; then
+        echo "Exiting after failing to start the cluster in $MAX_WAIT_SECONDS seconds"
+        exit 1
+    fi
+    echo "Waiting for SGE slots to become available";
+    sleep 1
+done
+
+echo "SGE properly configured"
diff --git a/ci/slurm.sh b/ci/slurm.sh
new file mode 100644
index 0000000..1ea0e44
--- /dev/null
+++ b/ci/slurm.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+function jobqueue_before_install {
+    docker version
+    docker-compose version
+
+    # start slurm cluster
+    cd ./ci/slurm
+    docker-compose pull
+    ./start-slurm.sh
+    cd -
+
+    #Set shared space permissions
+    docker exec slurmctld /bin/bash -c "chmod -R 777 /shared_space"
+
+    docker ps -a
+    docker images
+    show_network_interfaces
+}
+
+function show_network_interfaces {
+    for c in slurmctld c1 c2; do
+        echo '------------------------------------------------------------'
+        echo docker container: $c
+        docker exec $c python -c 'import psutil; print(psutil.net_if_addrs().keys())'
+        echo '------------------------------------------------------------'
+    done
+}
+
+function jobqueue_install {
+    docker exec slurmctld /bin/bash -c "cd /dask-jobqueue; pip install -e ."
+}
+
+function jobqueue_script {
+    docker exec slurmctld /bin/bash -c "cd; pytest /dask-jobqueue/dask_jobqueue --verbose -s -E slurm"
+}
+
+function jobqueue_after_script {
+    docker exec slurmctld bash -c 'sinfo'
+    docker exec slurmctld bash -c 'squeue'
+    docker exec slurmctld bash -c 'sacct -l'
+}
diff --git a/ci/slurm/Dockerfile b/ci/slurm/Dockerfile
new file mode 100644
index 0000000..0eaeaea
--- /dev/null
+++ b/ci/slurm/Dockerfile
@@ -0,0 +1,16 @@
+FROM giovtorres/slurm-docker-cluster
+
+RUN yum install -y iproute
+
+RUN curl -o miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash miniconda.sh -f -b -p /opt/anaconda && \
+    /opt/anaconda/bin/conda clean -tipy && \
+    rm -f miniconda.sh
+ENV PATH /opt/anaconda/bin:$PATH
+# The environment.yml file is copied in by the CI script. If building manually, copy it from the parent directory first.
+COPY environment.yml .
+RUN conda env update -n base --file environment.yml
+
+ENV LC_ALL en_US.UTF-8
+
+COPY slurm.conf /etc/slurm/slurm.conf
diff --git a/ci/slurm/docker-compose.yml b/ci/slurm/docker-compose.yml
new file mode 100644
index 0000000..cdb9475
--- /dev/null
+++ b/ci/slurm/docker-compose.yml
@@ -0,0 +1,118 @@
+version: "2.2"
+
+services:
+  mysql:
+    image: mysql:5.7.29
+    hostname: mysql
+    container_name: mysql
+    environment:
+      MYSQL_RANDOM_ROOT_PASSWORD: "yes"
+      MYSQL_DATABASE: slurm_acct_db
+      MYSQL_USER: slurm
+      MYSQL_PASSWORD: password
+    volumes:
+      - var_lib_mysql:/var/lib/mysql
+    networks:
+      common-network:
+
+  slurmdbd:
+    image: daskdev/dask-jobqueue:slurm
+    build: .
+    command: ["slurmdbd"]
+    container_name: slurmdbd
+    hostname: slurmdbd
+    volumes:
+      - etc_munge:/etc/munge
+      - etc_slurm:/etc/slurm
+      - var_log_slurm:/var/log/slurm
+    expose:
+      - "6819"
+    depends_on:
+      - mysql
+    networks:
+      common-network:
+
+  slurmctld:
+    image: daskdev/dask-jobqueue:slurm
+    build: .
+    command: ["slurmctld"]
+    container_name: slurmctld
+    hostname: slurmctld
+    environment:
+      - CI_SHARED_SPACE=/shared_space
+    volumes:
+      - etc_munge:/etc/munge
+      - etc_slurm:/etc/slurm
+      - slurm_jobdir:/data
+      - var_log_slurm:/var/log/slurm
+      - ../..:/dask-jobqueue
+      - shared_space:/shared_space
+    expose:
+      - "6817"
+    depends_on:
+      - "slurmdbd"
+    networks:
+      common-network:
+        ipv4_address: 10.1.1.10
+    cap_add:
+      - NET_ADMIN
+
+  c1:
+    image: daskdev/dask-jobqueue:slurm
+    build: .
+    command: ["slurmd"]
+    hostname: c1
+    container_name: c1
+    volumes:
+      - etc_munge:/etc/munge
+      - etc_slurm:/etc/slurm
+      - slurm_jobdir:/data
+      - var_log_slurm:/var/log/slurm
+      - shared_space:/shared_space
+    expose:
+      - "6818"
+    depends_on:
+      - "slurmctld"
+    networks:
+      common-network:
+        ipv4_address: 10.1.1.11
+    cap_add:
+      - NET_ADMIN
+
+  c2:
+    image: daskdev/dask-jobqueue:slurm
+    build: .
+    command: ["slurmd"]
+    hostname: c2
+    container_name: c2
+    volumes:
+      - etc_munge:/etc/munge
+      - etc_slurm:/etc/slurm
+      - slurm_jobdir:/data
+      - var_log_slurm:/var/log/slurm
+      - shared_space:/shared_space
+    expose:
+      - "6818"
+    depends_on:
+      - "slurmctld"
+    networks:
+      common-network:
+        ipv4_address: 10.1.1.12
+    cap_add:
+      - NET_ADMIN
+
+volumes:
+  etc_munge:
+  etc_slurm:
+  slurm_jobdir:
+  var_lib_mysql:
+  var_log_slurm:
+  shared_space:
+
+networks:
+  common-network:
+    driver: bridge
+    ipam:
+      driver: default
+      config:
+        - subnet: 10.1.1.0/24
diff --git a/ci/slurm/register_cluster.sh b/ci/slurm/register_cluster.sh
new file mode 100755
index 0000000..ef3d4d0
--- /dev/null
+++ b/ci/slurm/register_cluster.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -e
+
+docker exec slurmctld bash -c "/usr/bin/sacctmgr --immediate add cluster name=linux" && \
+docker-compose restart slurmdbd slurmctld
diff --git a/ci/slurm/slurm.conf b/ci/slurm/slurm.conf
new file mode 100644
index 0000000..0aad9f1
--- /dev/null
+++ b/ci/slurm/slurm.conf
@@ -0,0 +1,94 @@
+# slurm.conf
+#
+# See the slurm.conf man page for more information.
+#
+ClusterName=linux
+ControlMachine=slurmctld
+ControlAddr=slurmctld
+#BackupController=
+#BackupAddr=
+#
+SlurmUser=slurm
+#SlurmdUser=root
+SlurmctldPort=6817
+SlurmdPort=6818
+AuthType=auth/munge
+#JobCredentialPrivateKey=
+#JobCredentialPublicCertificate=
+StateSaveLocation=/var/lib/slurmd
+SlurmdSpoolDir=/var/spool/slurmd
+SwitchType=switch/none
+MpiDefault=none
+SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
+SlurmdPidFile=/var/run/slurmd/slurmd.pid
+ProctrackType=proctrack/linuxproc
+#PluginDir=
+CacheGroups=0
+#FirstJobId=
+ReturnToService=0
+#MaxJobCount=
+#PlugStackConfig=
+#PropagatePrioProcess=
+#PropagateResourceLimits=
+#PropagateResourceLimitsExcept=
+#Prolog=
+#Epilog=
+#SrunProlog=
+#SrunEpilog=
+#TaskProlog=
+#TaskEpilog=
+#TaskPlugin=
+#TrackWCKey=no
+#TreeWidth=50
+#TmpFS=
+#UsePAM=
+#
+# TIMERS
+SlurmctldTimeout=300
+SlurmdTimeout=300
+InactiveLimit=0
+MinJobAge=300
+KillWait=30
+Waittime=0
+#
+# SCHEDULING
+SchedulerType=sched/backfill
+#SchedulerAuth=
+#SchedulerPort=
+#SchedulerRootFilter=
+SelectType=select/cons_res
+SelectTypeParameters=CR_CPU_Memory
+FastSchedule=1
+#PriorityType=priority/multifactor
+#PriorityDecayHalfLife=14-0
+#PriorityUsageResetPeriod=14-0
+#PriorityWeightFairshare=100000
+#PriorityWeightAge=1000
+#PriorityWeightPartition=10000
+#PriorityWeightJobSize=1000
+#PriorityMaxAge=1-0
+#
+# LOGGING
+SlurmctldDebug=3
+SlurmctldLogFile=/var/log/slurm/slurmctld.log
+SlurmdDebug=3
+SlurmdLogFile=/var/log/slurm/slurmd.log
+JobCompType=jobcomp/filetxt
+JobCompLoc=/var/log/slurm/jobcomp.log
+#
+# ACCOUNTING
+JobAcctGatherType=jobacct_gather/linux
+JobAcctGatherFrequency=30
+#
+AccountingStorageType=accounting_storage/slurmdbd
+AccountingStorageHost=slurmdbd
+AccountingStoragePort=6819
+AccountingStorageLoc=slurm_acct_db
+#AccountingStoragePass=
+#AccountingStorageUser=
+#
+# COMPUTE NODES
+NodeName=c[1-2] RealMemory=4096 CPUs=2 State=UNKNOWN
+#
+# PARTITIONS
+PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=2048 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
diff --git a/ci/slurm/start-slurm.sh b/ci/slurm/start-slurm.sh
new file mode 100755
index 0000000..6cdce2d
--- /dev/null
+++ b/ci/slurm/start-slurm.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+docker-compose up -d --no-build
+
+while [ `./register_cluster.sh 2>&1 | grep "sacctmgr: error" | wc -l` -ne 0 ]
+  do
+    echo "Waiting for SLURM cluster to become ready";
+    sleep 2
+  done
+echo "SLURM properly configured"
+
+# On some clusters the login node does not have the same interface as the
+# compute nodes. The next three lines allow testing this edge case by adding
+# separate interfaces on the worker and on the scheduler nodes.
+docker exec slurmctld ip addr add 10.1.1.20/24 dev eth0 label eth0:scheduler
+docker exec c1 ip addr add 10.1.1.21/24 dev eth0 label eth0:worker
+docker exec c2 ip addr add 10.1.1.22/24 dev eth0 label eth0:worker