# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This docker-compose configuration is for bringing up a pipeline controller
# along with a local Spark cluster with a JDBC endpoint.
# The Spark part is partially based on:
# https://github.com/bitnami/containers/blob/main/bitnami/spark/docker-compose.yml
# While all Spark pieces are brought up on the same machine here, this config
# can serve as an example of how to deploy Spark in a cluster environment.
# Obviously, extra configuration is needed when the underlying file-system is
# distributed.
# The Spark components are:
# - A master node
# - A group of worker nodes (in this example just one)
# - A thriftserver providing JDBC and Metastore functionality.
# All Spark components share a common spark volume.
# If you need a simpler single-process deployment, see:
# compose-controller-spark-sql-single.yaml.
# Environment variables:
#
# PIPELINE_CONFIG: The directory that contains pipeline configurations, namely
# the application.yaml and flink-conf.yaml files.
#
# DWH_ROOT: The directory where Parquet files are written. This is shared
# between all containers; the pipeline writes to it and the Spark containers
# read from it.
#
# Note that if local paths are used, they should start with `./` or `../`.
# Also, the mounted files should be readable by the containers, e.g.,
# world-readable.
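#
# A minimal invocation sketch (the relative paths below are hypothetical;
# point them at your actual config and output directories):
#
#   PIPELINE_CONFIG=./config DWH_ROOT=./dwh \
#     docker-compose -f compose-controller-spark-sql.yaml up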
# NOTES ON SPARK VOLUME:
# In the config below, a `spark_vol` volume is used and mapped to
# /opt/bitnami/spark, which is the default WORKDIR of the Spark containers
# below. On creation, the
# content of this directory is copied _from_ the container to the host volume,
# including file permissions.
#
# This volume mapping is done to persist temporary tables and Metastore data. If
# you don't use temporary tables and don't care about Metastore persistence, you
# can remove that volume.
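#
# To start over from a clean slate, the named volume can be removed, e.g.,
# with `docker-compose down -v` (note that this removes _all_ named volumes
# declared in this file).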
# NOTES ON METASTORE:
# This configuration uses the default embedded Derby database as the Metastore
# for the thriftserver. Example config lines are provided (but commented out)
# that show how to use an external DB instead.
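#
# For reference, a sketch of the relevant hive-site.xml properties for, e.g.,
# a PostgreSQL-backed Metastore (the host, port, and database name below are
# placeholders):
#
#   <property>
#     <name>javax.jdo.option.ConnectionURL</name>
#     <value>jdbc:postgresql://my-postgres:5432/metastore_db</value>
#   </property>
#   <property>
#     <name>javax.jdo.option.ConnectionDriverName</name>
#     <value>org.postgresql.Driver</value>
#   </property>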
# OTHER CONFIGS:
# If you want to change Spark's default configs, you can mount your config
# files to /opt/bitnami/spark/conf/; see:
# https://spark.apache.org/docs/latest/configuration.html
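#
# For example, to override Spark defaults you could add a mount like the
# following to the Spark services (the host-side file name is just an
# illustration):
#
#   volumes:
#     - ./spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf:ro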
# Compose file version 2.4 supports the healthcheck feature.
version: '2.4'
services:
  pipeline-controller:
    # To force a build, use the `--build` option of `docker-compose up`.
    build:
      context: ..
    container_name: pipeline-controller
    volumes:
      - ${PIPELINE_CONFIG}:/app/config:ro
      - ${DWH_ROOT}:/dwh
    environment:
      - JAVA_OPTS=$JAVA_OPTS
    ports:
      - '8090:8080'
    networks:
      - cloudbuild
      - default
    depends_on:
      thriftserver:
        condition: service_healthy
  spark-master:
    image: docker.io/bitnami/spark:3.3
    container_name: spark-master
    environment:
      - SPARK_MODE=master
    ports:
      - '8082:8080'
    volumes:
      - ${DWH_ROOT}:/dwh
      # The data for temporary tables ends up in the spark-warehouse
      # sub-directory of this volume, which needs to be shared between all
      # Spark nodes if temporary tables are used.
      - spark_vol:/opt/bitnami/spark
      # NON-EMBEDDED METASTORE CONFIG:
      # If you want to persist the Metastore data, e.g., table and view
      # definitions, you can use an external database by adjusting
      # hive-site.xml:
      #- ./hive-site_example.xml:/opt/bitnami/spark/conf/hive-site.xml
      # Note that to use an external DB, you also need to provide its driver
      # jar:
      #- ./postgresql-42.6.0.jar:/opt/bitnami/spark/jars/postgresql-42.6.0.jar
    networks:
      - cloudbuild
      - default
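    # The healthcheck below simply verifies that a Spark shell can start in
    # the container; dependent services wait on it via
    # `condition: service_healthy`.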
    healthcheck:
      test: bin/spark-shell || exit 1
      interval: 30s
      retries: 10
      start_period: 10s
      timeout: 60s
  spark-worker:
    image: docker.io/bitnami/spark:3.3
    container_name: spark-worker
    depends_on:
      spark-master:
        condition: service_healthy
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=20G
      - SPARK_WORKER_CORES=20
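      # Note: the memory/cores values above are just this example's defaults;
      # adjust them to the resources actually available on your host.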
    volumes:
      - ${DWH_ROOT}:/dwh
    volumes_from:
      - spark-master
    networks:
      - cloudbuild
      - default
  thriftserver:
    image: docker.io/bitnami/spark:3.3
    container_name: spark-thriftserver
    depends_on:
      spark-master:
        condition: service_healthy
    command:
      - sbin/start-thriftserver.sh
      - --master
      - spark://spark-master:7077
    environment:
      - HIVE_SERVER2_THRIFT_PORT=10000
    ports:
      - '10001:10000'
      - '4041:4040'
    volumes:
      - ${DWH_ROOT}:/dwh
    volumes_from:
      - spark-master
    networks:
      - cloudbuild
      - default
    healthcheck:
      test: beeline help || exit 1
      interval: 10s
      retries: 10
      start_period: 5s
      timeout: 60s
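    # Once the thriftserver is healthy, the JDBC endpoint can be smoke-tested
    # from the host (a sketch, assuming a local `beeline` client; 10001 is the
    # host port published above):
    #   beeline -u jdbc:hive2://localhost:10001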
volumes:
  spark_vol:
networks:
  cloudbuild:
    external: true
    name: cloudbuild # Needed for continuous integration.
# TODO add a link to a Jupyter notebook to show how to use JDBC.