Skip to content

Commit 7f08ae2

Browse files
committed
Run gridworker as root (to be able to install debs on the fly), waiting
for sciencedata done by gridfactory, starting with cuda images
1 parent 95daaa0 commit 7f08ae2

File tree

7 files changed

+418
-4
lines changed

7 files changed

+418
-4
lines changed

batch_cli_cuda/Dockerfile

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Build command: docker build -t sciencedata/batch_cli_cuda .
2+
# Push command: docker push sciencedata/batch_cli_cuda
3+
# Run command: docker run -p 8443:443 -p 4022:22 [-e ROOT_PASSWORD=my_root_password] -e SSH_PUBLIC_KEY=my_public_key sciencedata/batch_cli_cuda
4+
# Run command: docker run -p 8443:443 -p 4022:22 -e SSH_PUBLIC_KEY="`cat ~/.ssh/id_rsa.pub`" sciencedata/batch_cli_cuda
5+
6+
# Base image: Ubuntu 22.04 (Jammy).
FROM ubuntu:22.04

# Image metadata. MAINTAINER is deprecated, so the maintainer is recorded as a
# label together with the other descriptive labels.
LABEL maintainer="Frederik Orellana https://github.com/deltafunction" \
      vendor="sciencedata.dk" \
      version="1.0" \
      description="Ubuntu Jammy with GridFactory batch cli for deployment on sciencedata.dk"
12+
13+
# TLS root certificates first, so that the HTTPS repositories added below can be reached.
RUN apt-get update \
 && DEBIAN_FRONTEND=noninteractive apt-get install -y ca-certificates

# Tools needed to register the additional apt repositories.
RUN apt-get update \
 && DEBIAN_FRONTEND=noninteractive apt-get install -y \
      apt-transport-https \
      apt-utils \
      curl \
      gpg \
      gpgv1 \
      software-properties-common

# Add cyberduck, get key manually due to apt-key deprecation
RUN echo "deb [signed-by=/usr/local/share/keyrings/cyberduck.gpg] https://s3.amazonaws.com/repo.deb.cyberduck.io stable main" | tee -a /etc/apt/sources.list.d/cyberduck.list \
 && mkdir -p /usr/local/share/keyrings \
 && gpg --keyserver keyserver.ubuntu.com --recv-keys FE7097963FEFBE72 \
 && gpg --export FE7097963FEFBE72 > /usr/local/share/keyrings/cyberduck.gpg \
 && rm -rf /root/.gnupg

# Register NVIDIA's CUDA apt repository. The cuda-* packages installed below are
# not provided by the Ubuntu, Canonical partner or Cyberduck repositories
# configured in this file. The keyring .deb is removed in the same layer.
RUN curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
 && dpkg -i cuda-keyring_1.1-1_all.deb \
 && rm -f cuda-keyring_1.1-1_all.deb

RUN add-apt-repository "deb http://archive.canonical.com/ jammy partner"

# Refresh the package lists and install in the same layer, so that a cached
# "update" layer can never be combined with a newer package list.
# The apt lists are kept on purpose: a later step installs further packages.
RUN apt-get update --fix-missing \
 && DEBIAN_FRONTEND=noninteractive apt-get install -y \
      apt-transport-https \
      bind9-dnsutils \
      bind9-host \
      cron \
      cuda-cudart-12-5 \
      cuda-toolkit-12-5 \
      curl \
      default-jre \
      dkms \
      dropbear \
      duck \
      git \
      gnupg \
      golang \
      inetutils-tools \
      iputils-ping \
      jq \
      libxml2-utils \
      net-tools \
      nvidia-utils-555 \
      openssh-client \
      openssh-sftp-server \
      pciutils \
      php-yaml \
      psmisc \
      traceroute \
      unicode-data \
      vim \
      vlan \
      wget
33+
34+
# "unminimize" script provided by ubuntu:22.04 container restores expected shell features like installing
35+
# man pages. The script itself is interactive, so this is a modified version that runs without interaction
36+
#RUN sed -i 's/^read.*//g' /usr/local/sbin/unminimize \
37+
# && sed -i 's/exit 1/echo "skip"/g' /usr/local/sbin/unminimize \
38+
# && sed -i 's/apt-get upgrade/apt-get upgrade -y/g' /usr/local/sbin/unminimize \
39+
# && /usr/local/sbin/unminimize
40+
41+
# Configure locale
42+
# Install the locales package and generate the three UTF-8 locales used here.
RUN DEBIAN_FRONTEND=noninteractive apt-get install --yes locales && \
    locale-gen --purge en_US.UTF-8 en_DK.UTF-8 da_DK.UTF-8

# Configure keyboard - not necessary
#RUN DEBIAN_FRONTEND=noninteractive apt-get -y install console-data

#RUN echo "www:secret" | chpasswd
# Sudoers drop-in granting password-less sudo to "www"; readable by owner and group only.
RUN echo "www ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/www && \
    chmod u=r,g=r,o= /etc/sudoers.d/www
50+
51+
# GridFactory debs
52+
# GridFactory debs
# Download, install and delete the package in one layer so the .deb does not
# stay behind in the image. --fail makes curl stop on an HTTP error instead of
# saving the error page under the .deb name. The package is named explicitly
# rather than via "*.deb", so nothing else lying in / gets installed.
#RUN dpkg --add-architecture i386
RUN curl -fLO https://sciencedata.dk/sites/frederik.orellana/blog/files/gridfactory-2024/gridfactory_cli_linux_0_1_5.deb \
 && dpkg --force-all -i gridfactory_cli_linux_0_1_5.deb \
 && rm -f gridfactory_cli_linux_0_1_5.deb
56+
57+
# Container start script (a plain local file, so COPY rather than ADD).
COPY start.sh /usr/local/sbin/start.sh

# Configure ssh access
# Both shell settings go to root's ~/.bashrc. A bare ".bashrc" would be resolved
# against the build working directory, which is / because no WORKDIR is set.
# mkdir -p keeps the step from failing if /root/.ssh already exists.
RUN echo "alias ls='ls --color=auto'" >> /root/.bashrc \
 && echo "PATH=${PATH}:/sbin/:/usr/sbin:/usr/local/bin:/usr/local/sbin:~/bin" >> /root/.bashrc \
 && mkdir -p /root/.ssh \
 && touch /root/.ssh/authorized_keys \
 && chmod -R go-rw /root/.ssh
63+
64+
# Drop packages that are no longer needed, the apt caches and temporary files.
RUN apt-get autoremove -y \
 && apt-get clean -y \
 && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Container ports mapped by the run commands documented at the top of this file.
EXPOSE 22 443

CMD ["/usr/local/sbin/start.sh"]

batch_cli_cuda/start.sh

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#!/bin/bash
2+
3+
###
4+
5+
# Prepare root login, host name resolution and cron, then run dropbear in the
# foreground. Reads SSH_PUBLIC_KEY, ROOT_PASSWORD, HOME_SERVER,
# PUBLIC_HOME_SERVER and SETUP_SCRIPT from the environment; each one is optional.
runSSH(){
  # Root access: append the supplied public key and/or set the root password.
  [[ -z "$SSH_PUBLIC_KEY" ]] || echo "$SSH_PUBLIC_KEY" >> /root/.ssh/authorized_keys
  [[ -z "$ROOT_PASSWORD" ]] || echo "root:$ROOT_PASSWORD" | chpasswd

  # Resolve sciencedata to the 10.2.0.0/24 address of the silo of the user,
  # and let cron re-add the entry every five minutes if it has gone missing.
  if [[ -n $HOME_SERVER ]]; then
    echo "$HOME_SERVER sciencedata" >> /etc/hosts
    echo "*/5 * * * * root grep sciencedata /etc/hosts || echo \"$HOME_SERVER sciencedata\" >> /etc/hosts" > /etc/cron.d/sciencedata_hosts
  fi

  # Record the public home server in a file under /tmp.
  if [[ -n $PUBLIC_HOME_SERVER ]]; then
    echo "$PUBLIC_HOME_SERVER" >> /tmp/public_home_server
  fi

  # Source the setup script when one is named and the file exists.
  if [[ -n $SETUP_SCRIPT && -f "$SETUP_SCRIPT" ]]; then
    . "$SETUP_SCRIPT"
  fi

  service cron start

  # -F keeps dropbear in the foreground, so this call blocks for as long as it runs.
  /usr/sbin/dropbear -p 22 -W 65536 -F -E
}
24+
25+
# Change to the home directory; the relative ".bashrc" below refers to it.
cd

# Append the SD_UID and HOME_SERVER settings found in the environment to
# .bashrc, then mark both variables for export.
for name in SD_UID HOME_SERVER; do
  env | grep "$name" >> .bashrc
done
export SD_UID HOME_SERVER
31+
32+
# Parse $PEERS - which will be of the form hostname1:ip1,hostname2:ip2,...
#
# parsePeers reads $PEERS and sets two space-separated lists in matching order:
#   GRIDFACTORY_SERVERS    - the host names
#   GRIDFACTORY_SERVER_IPS - the IP addresses
# Blanks around "," and ":" are tolerated. Both lists are empty when $PEERS is
# empty or unset.
parsePeers(){
  local peers
  GRIDFACTORY_SERVERS=""
  GRIDFACTORY_SERVER_IPS=""
  # $PEERS has to be quoted in the test. Unquoted, an empty value leaves
  # "[ -n ]", which is true, and a value containing blanks makes the test fail
  # with a syntax error so that the list is silently not parsed.
  if [ -n "$PEERS" ]; then
    # Normalise to "host1:ip1 host2:ip2 ...". The echo is left unquoted on
    # purpose: word splitting trims and collapses surrounding blanks.
    peers=`echo $PEERS | sed -E -e 's| *, *| |g' -e 's| *: *([0-9.]+)|:\1|g'`
    # Host names: drop everything from each ":" up to the next blank.
    GRIDFACTORY_SERVERS=`echo "$peers" | sed -E 's|:[^ ]*||g'`
    # IPs: a ":" directly followed by a blank gets "-" as placeholder, then
    # the "host:" prefixes are dropped.
    GRIDFACTORY_SERVER_IPS=`echo "$peers" | sed -E -e 's|: |:- |g' -e 's|[^ ]+:||g'`
  fi
}

parsePeers

export GRIDFACTORY_SERVERS
export GRIDFACTORY_SERVER_IPS
43+
44+
# Append the GRIDFACTORY* settings found in the environment to .bashrc.
env | grep GRIDFACTORY >> .bashrc

# Wait 30 seconds for sciencedata silo to refresh its cache of pod IPs (lib_chooser.php: $IPS_TTL_SECONDS = 30)
sleep 30

# Run the GridFactory cli configuration script with its settings passed as
# per-command environment variables.
HOME_SERVER="$HOME_SERVER" \
KEY_PASSWORD=grid \
GRIDFACTORY_SERVERS="$GRIDFACTORY_SERVERS" \
GRIDFACTORY_SERVER_IPS="$GRIDFACTORY_SERVER_IPS" \
  /usr/share/gridfactory/cli/configure_cli.sh -y

# Hand over to the ssh daemon; dropbear runs in the foreground.
runSSH

batch_worker/start.sh

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,7 @@ export GRIDFACTORY_SERVER_IPS
5555

5656
env | grep GRIDFACTORY >> .bashrc
5757

58-
# Wait 30 seconds for sciencedata silo to refresh its cache of pod IPs (lib_chooser.php: $IPS_TTL_SECONDS = 30)
59-
sleep 30
60-
61-
HOME_SERVER=$HOME_SERVER GRIDFACTORY_USER=www-data KEY_PASSWORD=grid GRIDFACTORY_SERVERS=$GRIDFACTORY_SERVERS \
58+
HOME_SERVER=$HOME_SERVER GRIDFACTORY_USER=root KEY_PASSWORD=grid GRIDFACTORY_SERVERS=$GRIDFACTORY_SERVERS \
6259
GRIDFACTORY_SERVER_IPS=$GRIDFACTORY_SERVER_IPS /usr/share/gridfactory/gridworker/configure_worker_node.sh -y
6360

6461
runSSH

batch_worker_cuda/Dockerfile

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Build command: docker build -t sciencedata/batch_worker_cuda .
2+
# Push command: docker push sciencedata/batch_worker_cuda
3+
# Run command: docker run -p 8443:443 -p 4022:22 [-e ROOT_PASSWORD=my_root_password] -e SSH_PUBLIC_KEY=my_public_key sciencedata/batch_worker_cuda
4+
# Run command: docker run -p 8443:443 -p 4022:22 -e SSH_PUBLIC_KEY="`cat ~/.ssh/id_rsa.pub`" sciencedata/batch_worker_cuda
5+
6+
# Base image: Ubuntu 22.04 (Jammy).
FROM ubuntu:22.04

# Image metadata. MAINTAINER is deprecated, so the maintainer is recorded as a
# label together with the other descriptive labels.
LABEL maintainer="Frederik Orellana https://github.com/deltafunction" \
      vendor="sciencedata.dk" \
      version="1.0" \
      description="Ubuntu Jammy with GridFactory batch worker for deployment on sciencedata.dk"
12+
13+
# TLS root certificates first, so that the HTTPS repositories added below can be reached.
RUN apt-get update \
 && DEBIAN_FRONTEND=noninteractive apt-get install -y ca-certificates

# Tools needed to register the additional apt repositories.
RUN apt-get update \
 && DEBIAN_FRONTEND=noninteractive apt-get install -y \
      apt-transport-https \
      apt-utils \
      curl \
      gpg \
      gpgv1 \
      software-properties-common

# Add cyberduck, get key manually due to apt-key deprecation
RUN echo "deb [signed-by=/usr/local/share/keyrings/cyberduck.gpg] https://s3.amazonaws.com/repo.deb.cyberduck.io stable main" | tee -a /etc/apt/sources.list.d/cyberduck.list \
 && mkdir -p /usr/local/share/keyrings \
 && gpg --keyserver keyserver.ubuntu.com --recv-keys FE7097963FEFBE72 \
 && gpg --export FE7097963FEFBE72 > /usr/local/share/keyrings/cyberduck.gpg \
 && rm -rf /root/.gnupg

# Register NVIDIA's CUDA apt repository. The cuda-* packages installed below are
# not provided by the Ubuntu, Canonical partner or Cyberduck repositories
# configured in this file. The keyring .deb is removed in the same layer.
RUN curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
 && dpkg -i cuda-keyring_1.1-1_all.deb \
 && rm -f cuda-keyring_1.1-1_all.deb

RUN add-apt-repository "deb http://archive.canonical.com/ jammy partner"

# Refresh the package lists and install in the same layer, so that a cached
# "update" layer can never be combined with a newer package list.
# The apt lists are kept on purpose: a later step installs further packages.
RUN apt-get update --fix-missing \
 && DEBIAN_FRONTEND=noninteractive apt-get install -y \
      apt-transport-https \
      bind9-dnsutils \
      bind9-host \
      cron \
      cuda-cudart-12-5 \
      cuda-toolkit-12-5 \
      curl \
      default-jre \
      dkms \
      dropbear \
      duck \
      git \
      gnupg \
      golang \
      inetutils-tools \
      iputils-ping \
      jq \
      libxml2-utils \
      net-tools \
      nvidia-utils-555 \
      openssh-client \
      openssh-sftp-server \
      pciutils \
      php-yaml \
      psmisc \
      traceroute \
      unicode-data \
      vim \
      vlan \
      wget
33+
34+
# "unminimize" script provided by ubuntu:22.04 container restores expected shell features like installing
35+
# man pages. The script itself is interactive, so this is a modified version that runs without interaction
36+
#RUN sed -i 's/^read.*//g' /usr/local/sbin/unminimize \
37+
# && sed -i 's/exit 1/echo "skip"/g' /usr/local/sbin/unminimize \
38+
# && sed -i 's/apt-get upgrade/apt-get upgrade -y/g' /usr/local/sbin/unminimize \
39+
# && /usr/local/sbin/unminimize
40+
41+
# Configure locale
42+
# Install the locales package and generate the three UTF-8 locales used here.
RUN DEBIAN_FRONTEND=noninteractive apt-get install --yes locales && \
    locale-gen --purge en_US.UTF-8 en_DK.UTF-8 da_DK.UTF-8

# Configure keyboard - not necessary
#RUN DEBIAN_FRONTEND=noninteractive apt-get -y install console-data

#RUN echo "www:secret" | chpasswd
# Sudoers drop-in granting password-less sudo to "www"; readable by owner and group only.
RUN echo "www ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/www && \
    chmod u=r,g=r,o= /etc/sudoers.d/www
50+
51+
# GridFactory debs
52+
# GridFactory debs
# Download, install and delete the packages in one layer so the .deb files do
# not stay behind in the image. --fail makes curl stop on an HTTP error instead
# of saving the error page under the .deb name. The packages are named
# explicitly rather than via "*.deb", so nothing else lying in / gets installed.
#RUN dpkg --add-architecture i386
RUN curl -fLO https://sciencedata.dk/sites/frederik.orellana/blog/files/gridfactory-2024/gridfactory_cli_linux_0_1_5.deb \
 && curl -fLO https://sciencedata.dk/sites/frederik.orellana/blog/files/gridfactory-2024/gridworker_daemon_linux_0_1_5.deb \
 && dpkg --force-all -i gridfactory_cli_linux_0_1_5.deb gridworker_daemon_linux_0_1_5.deb \
 && rm -f gridfactory_cli_linux_0_1_5.deb gridworker_daemon_linux_0_1_5.deb
57+
58+
# Container start script (a plain local file, so COPY rather than ADD).
COPY start.sh /usr/local/sbin/start.sh

# Configure ssh access
# Both shell settings go to root's ~/.bashrc. A bare ".bashrc" would be resolved
# against the build working directory, which is / because no WORKDIR is set.
# mkdir -p keeps the step from failing if /root/.ssh already exists.
RUN echo "alias ls='ls --color=auto'" >> /root/.bashrc \
 && echo "PATH=${PATH}:/sbin/:/usr/sbin:/usr/local/bin:/usr/local/sbin:~/bin" >> /root/.bashrc \
 && mkdir -p /root/.ssh \
 && touch /root/.ssh/authorized_keys \
 && chmod -R go-rw /root/.ssh
64+
65+
# Drop packages that are no longer needed, the apt caches and temporary files.
RUN apt-get autoremove -y \
 && apt-get clean -y \
 && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Container ports mapped by the run commands documented at the top of this file.
EXPOSE 22 443

CMD ["/usr/local/sbin/start.sh"]

batch_worker_cuda/start.sh

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#!/bin/bash
2+
3+
###
4+
5+
# Prepare root login, host name resolution and cron, then run dropbear in the
# foreground. Reads SSH_PUBLIC_KEY, ROOT_PASSWORD, HOME_SERVER,
# PUBLIC_HOME_SERVER and SETUP_SCRIPT from the environment; each one is optional.
runSSH(){
  # SSH access to account with sudo rights - or just set password for root
  if [[ -n "$SSH_PUBLIC_KEY" ]]; then
    echo "$SSH_PUBLIC_KEY" >> /root/.ssh/authorized_keys
  fi
  if [[ -n "$ROOT_PASSWORD" ]]; then
    echo "root:$ROOT_PASSWORD" | chpasswd
  fi

  # Resolve sciencedata to the 10.2.0.0/24 address of the silo of the user
  [[ -n $HOME_SERVER ]] && echo "$HOME_SERVER sciencedata" >> /etc/hosts

  # Route traffic to ScienceData over the internal network.
  # Resolve all fully qualified silo names to local addresses - this in order to
  # allow proper SSL handshake and client certificate auth
  printf '%s\n' \
    "10.2.0.13 sciencedata.dk" \
    "10.2.0.14 silo1.sciencedata.dk" \
    "10.2.0.15 silo2.sciencedata.dk" \
    "10.2.0.16 silo3.sciencedata.dk" \
    "10.2.0.17 silo4.sciencedata.dk" \
    "10.2.0.18 silo5.sciencedata.dk" \
    "10.2.0.19 silo6.sciencedata.dk" \
    "10.2.0.20 silo7.sciencedata.dk" \
    "10.2.0.21 silo8.sciencedata.dk" \
    "10.2.0.22 silo9.sciencedata.dk" \
    >> /etc/hosts

  # Every five minutes, re-add the "sciencedata" alias if it has gone missing.
  # The pattern is anchored to lines ending in " sciencedata": a plain
  # "grep sciencedata" also matches the *.sciencedata.dk lines written above,
  # so it would never notice that the alias is missing. -q keeps the matched
  # lines out of cron's output.
  [[ -n $HOME_SERVER ]] && echo "*/5 * * * * root grep -q ' sciencedata\$' /etc/hosts || echo \"$HOME_SERVER sciencedata\" >> /etc/hosts" > /etc/cron.d/sciencedata_hosts

  # Record the public home server in a file under /tmp.
  [[ -n $PUBLIC_HOME_SERVER ]] && echo "$PUBLIC_HOME_SERVER" >> /tmp/public_home_server

  # Source the setup script when one is named and the file exists.
  [[ -n $SETUP_SCRIPT && -f "$SETUP_SCRIPT" ]] && . "$SETUP_SCRIPT"

  service cron start

  # -F keeps dropbear in the foreground, so this call blocks for as long as it runs.
  /usr/sbin/dropbear -p 22 -W 65536 -F -E
}
36+
37+
# Change to the home directory; the relative ".bashrc" below refers to it.
cd

# Append the SD_UID and HOME_SERVER settings found in the environment to
# .bashrc, then mark both variables for export.
for name in SD_UID HOME_SERVER; do
  env | grep "$name" >> .bashrc
done
export SD_UID HOME_SERVER
43+
44+
# Parse $PEERS - which will be of the form hostname1:ip1,hostname2:ip2,...
#
# parsePeers reads $PEERS and sets two space-separated lists in matching order:
#   GRIDFACTORY_SERVERS    - the host names
#   GRIDFACTORY_SERVER_IPS - the IP addresses
# Blanks around "," and ":" are tolerated. Both lists are empty when $PEERS is
# empty or unset.
parsePeers(){
  local peers
  GRIDFACTORY_SERVERS=""
  GRIDFACTORY_SERVER_IPS=""
  # $PEERS has to be quoted in the test. Unquoted, an empty value leaves
  # "[ -n ]", which is true, and a value containing blanks makes the test fail
  # with a syntax error so that the list is silently not parsed.
  if [ -n "$PEERS" ]; then
    # Normalise to "host1:ip1 host2:ip2 ...". The echo is left unquoted on
    # purpose: word splitting trims and collapses surrounding blanks.
    peers=`echo $PEERS | sed -E -e 's| *, *| |g' -e 's| *: *([0-9.]+)|:\1|g'`
    # Host names: drop everything from each ":" up to the next blank.
    GRIDFACTORY_SERVERS=`echo "$peers" | sed -E 's|:[^ ]*||g'`
    # IPs: a ":" directly followed by a blank gets "-" as placeholder, then
    # the "host:" prefixes are dropped.
    GRIDFACTORY_SERVER_IPS=`echo "$peers" | sed -E -e 's|: |:- |g' -e 's|[^ ]+:||g'`
  fi
}

parsePeers

export GRIDFACTORY_SERVERS
export GRIDFACTORY_SERVER_IPS
55+
56+
# Append the GRIDFACTORY* settings found in the environment to .bashrc.
env | grep GRIDFACTORY >> .bashrc

# Run the GridFactory worker-node configuration script with its settings passed
# as per-command environment variables.
# NOTE(review): the commit that adds this file switches batch_worker/start.sh
# from GRIDFACTORY_USER=www-data to root - confirm that www-data is intended here.
HOME_SERVER="$HOME_SERVER" \
GRIDFACTORY_USER=www-data \
KEY_PASSWORD=grid \
GRIDFACTORY_SERVERS="$GRIDFACTORY_SERVERS" \
GRIDFACTORY_SERVER_IPS="$GRIDFACTORY_SERVER_IPS" \
  /usr/share/gridfactory/gridworker/configure_worker_node.sh -y

# Hand over to the ssh daemon; dropbear runs in the foreground.
runSSH
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# ============================================================================
15+
16+
# Build command: docker build -t sciencedata/jupyter_sciencedata_tensorflow .
17+
# Push command: docker push sciencedata/jupyter_sciencedata_tensorflow
18+
# Run command: docker run -p 8080:80 -p 4022:22 sciencedata/jupyter_sciencedata_tensorflow
19+
20+
21+
# Base stage: CUDA 12.3 runtime image plus the CUDA, cuDNN/TensorRT and build
# packages listed below.
FROM nvidia/cuda:12.3.0-base-ubuntu22.04 AS base
# DEBIAN_FRONTEND stays an ENV as before, so the apt steps that follow in this
# file keep running without prompts.
ENV DEBIAN_FRONTEND=noninteractive
ENV LANG=C.UTF-8

# Register NVIDIA's CUDA apt repository. Package lists are refreshed in the
# same layer that uses them, and the keyring .deb and the lists are removed
# again before the layer is committed.
RUN apt-get update \
 && apt-get install -y ca-certificates curl gnupg wget \
 && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
 && dpkg -i cuda-keyring_1.1-1_all.deb \
 && rm -f cuda-keyring_1.1-1_all.deb \
 && rm -rf /var/lib/apt/lists/*

# Package groups, in order:
#  - all required CUDA packages
#  - CuDNN: https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#ubuntu-network-installation
#    (cuDNN and TensorRT are pinned to exact versions)
#  - other packages
# Cleanup happens in the same layer; removing files in a later layer would not
# make the image any smaller.
RUN apt-get update \
 && apt-get install -y \
      cuda-command-line-tools-12-3 \
      cuda-cudart-dev-12-3 \
      cuda-nvcc-12-3 \
      cuda-cupti-12-3 \
      cuda-nvprune-12-3 \
      cuda-libraries-12-3 \
      cuda-nvrtc-12-3 \
      libcufft-12-3 \
      libcurand-12-3 \
      libcusolver-12-3 \
      libcusparse-12-3 \
      libcublas-12-3 \
      libcudnn8=8.9.6.50-1+cuda12.2 \
      libnvinfer-plugin8=8.6.1.6-1+cuda12.0 \
      libnvinfer8=8.6.1.6-1+cuda12.0 \
      build-essential \
      pkg-config \
      software-properties-common \
      unzip \
      bash \
      git \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
60+
61+
# Interpreter to install, given as the apt package / binary name.
ARG PYTHON_VERSION=python3.11

# Install the interpreter with its venv, distutils and dev packages.
#  - The apt lists were deleted by an earlier step, so they are refreshed here
#    first; without that the install cannot find any package.
#  - The venv package is "<python>-venv".
RUN apt-get update \
 && apt-get install -y \
      $PYTHON_VERSION \
      $PYTHON_VERSION-venv \
      $PYTHON_VERSION-distutils \
      $PYTHON_VERSION-dev \
 && rm -rf /var/lib/apt/lists/*

# Make "python3" and "python" resolve to the selected interpreter. Both links
# use PYTHON_VERSION, the only version variable defined in this file.
RUN ln -sf /usr/bin/$PYTHON_VERSION /usr/bin/python3 \
 && ln -sf /usr/bin/$PYTHON_VERSION /usr/bin/python

# Link the interpreter's include directory under /usr/local/include unless a
# regular file of that name already exists there.
RUN bash -c "if [[ ! -f /usr/local/include/$PYTHON_VERSION ]]; then ln -sf /usr/include/$PYTHON_VERSION /usr/local/include/$PYTHON_VERSION; fi"
68+
69+
# Bootstrap pip for the selected interpreter. --fail makes curl stop on an HTTP
# error instead of saving an error page that python would then try to run; the
# installer script is deleted in the same layer.
RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py \
 && python3 get-pip.py \
 && rm -f get-pip.py \
 && python3 -m pip install --no-cache-dir --upgrade pip

# TensorFlow nightly build.
RUN python3 -m pip install --no-cache-dir tf-nightly
73+
74+
# Delete the static CUDA libraries (libcudart_static.a is kept) and the static
# cuDNN archives.
RUN find /usr/local/cuda-*/lib*/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete \
 && rm -f /usr/lib/x86_64-linux-gnu/libcudnn_static_v*.a

# Link the libcuda stub to the location where tensorflow is searching for it and
# reconfigure dynamic linker run-time bindings
RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \
 && echo /usr/local/cuda/lib64/stubs > /etc/ld.so.conf.d/z-cuda-stubs.conf \
 && ldconfig
82+
83+
# Jupyter stage, built on top of the CUDA/TensorFlow base stage.
FROM base AS jupyter

# Jupyter, matplotlib and the http-over-websocket server extension.
RUN python3 -m pip install --no-cache-dir --upgrade jupyter matplotlib jupyter_http_over_ws
RUN jupyter serverextension enable --py jupyter_http_over_ws

# World-writable /.local directory.
RUN mkdir /.local \
 && chmod a+rwx /.local

RUN python3 -m ipykernel.kernelspec
90+
91+
# Keep notebooks in sciencedata homedir
# This file defines no ENV HOME, and WORKDIR only expands variables declared in
# the Dockerfile, so "WORKDIR $HOME" is not usable here.
# NOTE(review): /root assumes the container runs as root (no USER is set in
# this file) - confirm the intended directory.
WORKDIR /root
RUN python3 -m pip install --no-cache-dir pycurl webdavclient3

# Fetching the branch ref makes the build cache expire whenever "main" moves,
# so that the clone below picks up new commits.
ADD https://api.github.com/repos/deic-dk/jupyter_sciencedata/git/refs/heads/main version.json
RUN git clone -b main https://github.com/deic-dk/jupyter_sciencedata.git
RUN python3 -m pip install --no-cache-dir jupyter_sciencedata/

# Spinning wheel on ajax calls
# The static/custom directories are looked up in whatever location notebook and
# nbclassic are installed at, instead of a hard-coded /opt/conda path that is
# not created anywhere in this file. As before, a missing package or directory
# is only reported and does not fail the build.
RUN for pkg in notebook nbclassic; do \
      dest="$(python3 -c "import importlib.util, os, sys; spec = importlib.util.find_spec(sys.argv[1]); print(os.path.join(os.path.dirname(spec.origin), 'static', 'custom') if spec and spec.origin else '')" "$pkg")"; \
      { [ -n "$dest" ] && cp jupyter_sciencedata/custom/* "$dest/"; } || echo "No $pkg/static"; \
    done

# Keep notebooks in sciencedata homedir ('/files')
# /etc/jupyter is created first; nothing earlier in this file creates it.
RUN mkdir -p /etc/jupyter \
 && echo "from jupyter_sciencedata import JupyterScienceData" >> /etc/jupyter/jupyter_notebook_config.py \
 && echo "c.NotebookApp.contents_manager_class = 'jupyter_sciencedata.JupyterScienceData'" >> /etc/jupyter/jupyter_notebook_config.py
105+
106+
# Container port mapped by the run command documented at the top of this file.
EXPOSE 80

# Search paths for the NVIDIA / CUDA binaries and libraries.
ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64

COPY start-notebook.sh /usr/local/sbin/start-notebook.sh

# Exec form: the script is started directly instead of through "/bin/sh -c",
# so it is the container's main process and receives the stop signal itself.
# NOTE(review): exec form needs start-notebook.sh to be executable and to start
# with a "#!" line - the script is not part of this view, confirm.
CMD ["/usr/local/sbin/start-notebook.sh"]
113+

0 commit comments

Comments
 (0)