# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This docker-compose configuration is for bringing up a pipeline controller
# along with a local Spark cluster with a JDBC endpoint.
# The Spark part is partially based on:
# https://github.com/bitnami/containers/blob/main/bitnami/spark/docker-compose.yml
# While all Spark pieces are brought up on the same machine here, this config
# can serve as an example of how to deploy Spark in a cluster environment.
# Obviously, extra configuration is needed when the underlying file-system is
# distributed.
# The Spark components are:
# - A master node
# - A group of worker nodes (in this example just one)
# - A thriftserver providing JDBC and Metastore functionality.
# All Spark components share a common spark volume.
# If you need a simpler single-process deployment, see:
# compose-controller-spark-sql-single.yaml.
# Environment variables:
#
# PIPELINE_CONFIG: The directory that contains pipeline configurations, namely
# the application.yaml and flink-conf.yaml files.
#
# DWH_ROOT: The directory where Parquet files are written. This is shared
# between all containers; the pipeline writes to it and the Spark containers
# read from it.
#
# Note that if local paths are used, they should start with `./` or `../`.
# Also, the mounted files should be readable by the containers, e.g.,
# world-readable.
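#
# A minimal invocation sketch (the relative paths below are hypothetical;
# point them at your actual config and output directories):
#
#   PIPELINE_CONFIG=./config DWH_ROOT=./dwh \
#     docker-compose -f compose-controller-spark-sql.yaml up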
# NOTES ON SPARK VOLUME:
# In the config below, a `spark_vol` volume is used and mapped to
# /opt/bitnami/spark, which is the default WORKDIR of the Spark containers
# below. On creation, the
# content of this directory is copied _from_ the container to the host volume,
# including file permissions.
#
# This volume mapping is done to persist temporary tables and Metastore data. If
# you don't use temporary tables and don't care about Metastore persistence, you
# can remove that volume.
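#
# To start over from a clean slate, the named volume can be removed, e.g.,
# with `docker-compose down -v` (note that this removes _all_ named volumes
# declared in this file).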
# NOTES ON METASTORE:
# This configuration uses the default embedded Derby database as the Metastore
# for the thriftserver. Example config lines are provided (but commented out)
# that show how to use an external DB instead.
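#
# For reference, a sketch of the relevant hive-site.xml properties for, e.g.,
# a PostgreSQL-backed Metastore (the host, port, and database name below are
# placeholders):
#
#   <property>
#     <name>javax.jdo.option.ConnectionURL</name>
#     <value>jdbc:postgresql://my-postgres:5432/metastore_db</value>
#   </property>
#   <property>
#     <name>javax.jdo.option.ConnectionDriverName</name>
#     <value>org.postgresql.Driver</value>
#   </property>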
# OTHER CONFIGS:
# If you want to change Spark's default configs, you can mount your config
# files to /opt/bitnami/spark/conf/; see:
# https://spark.apache.org/docs/latest/configuration.html
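#
# For example, to override Spark defaults you could add a mount like the
# following to the Spark services (the host-side file name is just an
# illustration):
#
#   volumes:
#     - ./spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf:ro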
# Compose file version 2.4 supports the healthcheck feature.
version: '2.4'
services:
  pipeline-controller:
    # To force a build, use the `--build` option of `docker-compose up`.
    build:
      context: ..
    container_name: pipeline-controller
    volumes:
      - ${PIPELINE_CONFIG}:/app/config:ro
      - ${DWH_ROOT}:/dwh
    environment:
      - JAVA_OPTS=$JAVA_OPTS
    ports:
      - '8090:8080'
    networks:
      - cloudbuild
      - default
    depends_on:
      thriftserver:
        condition: service_healthy
  spark-master:
    image: docker.io/bitnami/spark:3.3
    container_name: spark-master
    environment:
      - SPARK_MODE=master
    ports:
      - '8082:8080'
    volumes:
      - ${DWH_ROOT}:/dwh
      # The data for temporary tables ends up in the spark-warehouse
      # sub-directory of this volume, which needs to be shared between all
      # Spark nodes if temporary tables are used.
      - spark_vol:/opt/bitnami/spark
      # NON-EMBEDDED METASTORE CONFIG:
      # If you want to persist the Metastore data, e.g., table and view
      # definitions, you can use an external database by adjusting
      # hive-site.xml:
      #- ./hive-site_example.xml:/opt/bitnami/spark/conf/hive-site.xml
      # Note that to use an external DB, you also need to provide its driver
      # jar:
      #- ./postgresql-42.6.0.jar:/opt/bitnami/spark/jars/postgresql-42.6.0.jar
    networks:
      - cloudbuild
      - default
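    # The healthcheck below simply verifies that a Spark shell can start in
    # the container; dependent services wait on it via
    # `condition: service_healthy`.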
    healthcheck:
      test: bin/spark-shell || exit 1
      interval: 30s
      retries: 10
      start_period: 10s
      timeout: 60s
  spark-worker:
    image: docker.io/bitnami/spark:3.3
    container_name: spark-worker
    depends_on:
      spark-master:
        condition: service_healthy
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=20G
      - SPARK_WORKER_CORES=20
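      # Note: the memory/cores values above are just this example's defaults;
      # adjust them to the resources actually available on your host.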
    volumes:
      - ${DWH_ROOT}:/dwh
    volumes_from:
      - spark-master
    networks:
      - cloudbuild
      - default
  thriftserver:
    image: docker.io/bitnami/spark:3.3
    container_name: spark-thriftserver
    depends_on:
      spark-master:
        condition: service_healthy
    command:
      - sbin/start-thriftserver.sh
      - --master
      - spark://spark-master:7077
    environment:
      - HIVE_SERVER2_THRIFT_PORT=10000
    ports:
      - '10001:10000'
      - '4041:4040'
    volumes:
      - ${DWH_ROOT}:/dwh
    volumes_from:
      - spark-master
    networks:
      - cloudbuild
      - default
    healthcheck:
      test: beeline help || exit 1
      interval: 10s
      retries: 10
      start_period: 5s
      timeout: 60s
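    # Once the thriftserver is healthy, the JDBC endpoint can be smoke-tested
    # from the host (a sketch, assuming a local `beeline` client; 10001 is the
    # host port published above):
    #   beeline -u jdbc:hive2://localhost:10001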
volumes:
  spark_vol:
networks:
  cloudbuild:
    external: true
    name: cloudbuild # Needed for continuous integration.
# TODO add a link to a Jupyter notebook to show how to use JDBC.