Commit b4f2113

Author: nghoanglong
Commit message: change
1 parent e506361 commit b4f2113

20 files changed: +93 -68 lines changed

Dockerfile (+14 -10)
@@ -3,14 +3,18 @@ FROM ubuntu:18.04
 WORKDIR /root

 RUN apt-get update && apt-get install -y \
-    python3-pip \
     openssh-server \
     nano \
-    openjdk-8-jdk \
-    python3.7
+    openjdk-8-jdk

-RUN pip3 install jupyter && \
-    pip3 install pyspark
+RUN apt-get install -y python3 && \
+    apt-get install -y python3-pip && \
+    ln -s /usr/bin/python3 /usr/bin/python && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install --upgrade setuptools && \
+    pip3 install pyspark && \
+    pip3 install jupyter

 # download hadoop
 RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-2.7.7/hadoop-2.7.7.tar.gz && \
@@ -19,10 +23,10 @@ RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-2.7.7/hadoop-2.7.7
     rm hadoop-2.7.7.tar.gz

 # download spark
-RUN wget https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop2.7.tgz && \
-    tar -xzf spark-3.2.1-bin-hadoop2.7.tgz && \
-    mv spark-3.2.1-bin-hadoop2.7 /usr/local/spark && \
-    rm spark-3.2.1-bin-hadoop2.7.tgz
+RUN wget https://archive.apache.org/dist/spark/spark-3.2.4/spark-3.2.4-bin-hadoop2.7.tgz && \
+    tar -xzf spark-3.2.4-bin-hadoop2.7.tgz && \
+    mv spark-3.2.4-bin-hadoop2.7 /usr/local/spark && \
+    rm spark-3.2.4-bin-hadoop2.7.tgz

 # set environment vars
 ENV HADOOP_HOME=/usr/local/hadoop
@@ -40,7 +44,7 @@ RUN ssh-keygen -t rsa -f ~/.ssh/id_rsa -P '' && \
     chmod 0600 ~/.ssh/authorized_keys

 # copy hadoop configs
-COPY config/* /tmp/
+COPY /spark_configs/* /tmp/

 RUN mv /tmp/ssh_config ~/.ssh/config && \
     mv /tmp/hadoop-env.sh $HADOOP_HOME/etc/hadoop/hadoop-env.sh && \
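
The reworked install symlinks /usr/bin/python3 to /usr/bin/python, so the image can be smoke-tested directly. A minimal sketch, assuming the image is tagged spark-cluster (the tag is hypothetical, since build-image.sh is deleted below):

$ docker build -t spark-cluster .
$ docker run --rm spark-cluster python --version    # should report the Ubuntu 18.04 python3
$ docker run --rm spark-cluster python -c "import pyspark; print(pyspark.__version__)"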

build-image.sh (-3): This file was deleted.
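
With build-image.sh gone, image builds presumably flow through the build: directives added to docker-compose.yaml below, so the old script's role reduces to:

$ docker compose build    # or docker-compose build on older clients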

config/core-site.xml (-6): This file was deleted.

config/hdfs-site.xml (-14): This file was deleted.

config/slaves (-2): This file was deleted.

config/spark-default.conf (-12): This file was deleted.

config/workers (-2): This file was deleted.

docker-compose.yaml (+30 -14)

@@ -1,37 +1,53 @@
 version: '3'

 services:
-  hadoop-spark-master:
-    container_name: hadoop-spark-master
-    image: ghcr.io/nghoanglong/spark-cluster-with-docker/spark-cluster:1.0
+  hadoop-namenode:
+    container_name: hadoop-namenode
+    build:
+      context: .
+      dockerfile: Dockerfile
     networks:
       - hadoop-spark
+    mem_limit: 2g
+    cpus: 1
     ports:
       - "50070:50070"
       - "8088:8088"
       - "8080:8080"
       - "7077:7077"
       - "8888:8888"
       - "18080:18080"
-    hostname: hadoop-spark-master
+      - "4040:4040"
+      - "10000:10000"
+    hostname: hadoop-namenode
     command: ["sh", "-c", "service ssh start; ./start-cluster.sh; tail -f /dev/null"]
-  hadoop-spark-slave1:
-    container_name: hadoop-spark-slave1
-    image: ghcr.io/nghoanglong/spark-cluster-with-docker/spark-cluster:1.0
+  hadoop-datanode1:
+    container_name: hadoop-datanode1
+    build:
+      context: .
+      dockerfile: Dockerfile
+    mem_limit: 1.5g
+    cpus: 2
+    ports:
+      - "8081:8081"
     networks:
       - hadoop-spark
-    hostname: hadoop-spark-slave1
+    hostname: hadoop-datanode1
     depends_on:
-      - hadoop-spark-master
+      - hadoop-namenode
     command: ["sh", "-c", "service ssh start; tail -f /dev/null"]
-  hadoop-spark-slave2:
-    container_name: hadoop-spark-slave2
-    image: ghcr.io/nghoanglong/spark-cluster-with-docker/spark-cluster:1.0
+  hadoop-datanode2:
+    container_name: hadoop-datanode2
+    build:
+      context: .
+      dockerfile: Dockerfile
     networks:
       - hadoop-spark
-    hostname: hadoop-spark-slave2
+    mem_limit: 1.5g
+    cpus: 2
+    hostname: hadoop-datanode2
     depends_on:
-      - hadoop-spark-master
+      - hadoop-namenode
     command: ["sh", "-c", "service ssh start; tail -f /dev/null"]

 networks:
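
A bring-up sketch for the renamed services. The host-port associations follow the stock defaults of each component (50070 NameNode UI, 8088 YARN ResourceManager UI, 8888 Jupyter, 4040 a running Spark application, 18080 the Spark history server); 10000 is commonly a Thrift/JDBC endpoint, though nothing in this commit confirms that:

$ docker compose up -d --build
$ docker compose ps    # expect hadoop-namenode, hadoop-datanode1, hadoop-datanode2
# NameNode UI:  http://localhost:50070
# YARN RM UI:   http://localhost:8088
# Jupyter:      http://localhost:8888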

host_configs/hosts (+10)

@@ -0,0 +1,10 @@
+127.0.0.1 localhost
+172.18.0.2 hadoop-namenode
+172.18.0.3 hadoop-datanode2
+172.18.0.4 hadoop-datanode1
+
+::1 localhost ip6-localhost ip6-loopback
+fe00::0 ip6-localnet
+ff00::0 ip6-mcastprefix
+ff02::1 ip6-allnodes
+ff02::2 ip6-allrouters
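
The hard-coded 172.18.0.x entries assume Docker gives the hadoop-spark network that subnet and assigns addresses in this order, neither of which is guaranteed across hosts. The actual assignment can be checked per container:

$ docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' hadoop-namenode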

spark_configs/core-site.xml (+10)

@@ -0,0 +1,10 @@
+<configuration>
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://hadoop-namenode:9000/</value>
+  </property>
+  <property>
+    <name>hadoop.tmp.dir</name>
+    <value>/home/${user.name}/hadoop</value>
+  </property>
+</configuration>
File renamed without changes.
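
The fs.defaultFS value can be verified from inside the running namenode container, assuming the Hadoop binaries are on PATH there:

$ docker exec hadoop-namenode hdfs getconf -confKey fs.defaultFS
# expected: hdfs://hadoop-namenode:9000/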

spark_configs/hdfs-site.xml (+10)

@@ -0,0 +1,10 @@
+<configuration>
+  <property>
+    <name>dfs.namenode.heartbeat.recheck-interval</name>
+    <value>3000</value>
+  </property>
+  <property>
+    <name>dfs.replication</name>
+    <value>2</value>
+  </property>
+</configuration>
File renamed without changes.
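
With dfs.replication at 2 and two datanodes, each block should end up on both workers. Whether both registered can be checked from the namenode (a sketch, not part of this commit):

$ docker exec hadoop-namenode hdfs dfsadmin -report    # should show 2 live datanodes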

spark_configs/slaves (+2)

@@ -0,0 +1,2 @@
+hadoop-datanode1
+hadoop-datanode2
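
Hadoop 2.x's start scripts read this slaves file to decide where to launch DataNodes and NodeManagers over SSH; presumably the start-cluster.sh invoked from docker-compose.yaml wraps something like:

$ $HADOOP_HOME/bin/hdfs namenode -format    # first run only
$ $HADOOP_HOME/sbin/start-dfs.sh
$ $HADOOP_HOME/sbin/start-yarn.sh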

spark_configs/spark-default.conf (+12)

@@ -0,0 +1,12 @@
+spark.master                     yarn
+spark.driver.memory              2G
+spark.yarn.am.memory             1G
+spark.executor.memory            1G
+spark.executor.cores             1
+spark.history.ui.port            18080
+
+spark.ui.enabled                 true
+spark.ui.port                    4040
+spark.eventLog.enabled           true
+spark.eventLog.dir               file:///tmp/spark-events
+spark.history.fs.logDirectory    file:///tmp/spark-events
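
Both event-log settings point at file:///tmp/spark-events, a local directory Spark will not create by itself; a sketch of preparing it and starting the history server on the configured port 18080:

$ mkdir -p /tmp/spark-events
$ $SPARK_HOME/sbin/start-history-server.sh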

config/spark-env.sh renamed to spark_configs/spark-env.sh (+1 -3)

@@ -40,8 +40,6 @@
 # - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
 # - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
 # - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)
-export SPARK_DRIVER_MEMORY=2G
-export SPARK_EXECUTOR_MEMORY=1G

 # Options for the daemons used in the standalone deploy mode
 # - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname

@@ -68,6 +66,6 @@ export SPARK_EXECUTOR_MEMORY=1G
 # - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file.

 export SPARK_HOME=/usr/local/spark
-export PYSPARK_PYTHON=python3.7
+export PYSPARK_PYTHON=python3
 export PYSPARK_DRIVER_PYTHON=jupyter
 export PYSPARK_DRIVER_PYTHON_OPTS='notebook'
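
With PYSPARK_DRIVER_PYTHON=jupyter and PYSPARK_DRIVER_PYTHON_OPTS='notebook', running pyspark starts a notebook server instead of a shell. To reach it through the 8888 port mapping in docker-compose.yaml it would also have to bind 0.0.0.0; the extra flags below are an assumption, not part of this commit:

$ PYSPARK_DRIVER_PYTHON_OPTS='notebook --ip=0.0.0.0 --no-browser --port=8888' pyspark --master yarn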

config/ssh_config renamed to spark_configs/ssh_config (+1 -1)

@@ -4,6 +4,6 @@ Host localhost
 Host 0.0.0.0
   StrictHostKeyChecking no

-Host hadoop-spark-*
+Host hadoop-*
   StrictHostKeyChecking no
   UserKnownHostsFile=/dev/null
File renamed without changes.
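
The widened Host hadoop-* pattern covers the renamed namenode/datanode hostnames, so the passwordless SSH the Hadoop start scripts depend on should go through without host-key prompts. A quick check from the namenode container:

$ docker exec hadoop-namenode ssh hadoop-datanode1 hostname    # should print hadoop-datanode1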

spark_configs/workers (+2)

@@ -0,0 +1,2 @@
+hadoop-datanode1
+hadoop-datanode2
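
workers is Spark 3.x's successor to the 2.x slaves file and is read only by the standalone deploy scripts; since spark-default.conf above sets spark.master to yarn, starting standalone daemons is optional:

$ $SPARK_HOME/sbin/start-all.sh    # standalone master + workers; not needed when running on YARN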

config/yarn-site.xml renamed to spark_configs/yarn-site.xml (+1 -1)

@@ -5,6 +5,6 @@
   </property>
   <property>
     <name>yarn.resourcemanager.hostname</name>
-    <value>hadoop-spark-master</value>
+    <value>hadoop-namenode</value>
   </property>
 </configuration>
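
Pointing yarn.resourcemanager.hostname at hadoop-namenode co-locates the ResourceManager with the NameNode. Once YARN is up, both NodeManagers should register, which can be checked from inside that container:

$ docker exec hadoop-namenode yarn node -list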
