diff --git a/dss-docker/Dockerfile b/dss-docker/Dockerfile
index 41dea41..469dc92 100644
--- a/dss-docker/Dockerfile
+++ b/dss-docker/Dockerfile
@@ -1,68 +1,29 @@
-FROM debian:9
+FROM dataiku/dss:4.1.0
-ENV DSS_VERSION="4.1.0" \
-    DSS_DATADIR="/home/dataiku/dss" \
-    DSS_PORT=10000
+# Entry point
+WORKDIR /home/dataiku
-# Dataiku account and data dir setup
-RUN useradd -s /bin/bash dataiku \
-    && mkdir -p /home/dataiku ${DSS_DATADIR} \
-    && chown -Rh dataiku:dataiku /home/dataiku ${DSS_DATADIR}
+EXPOSE $DSS_PORT
-# System dependencies
-# TODO - much could be removed by building externally the required R packages
-RUN apt-get update \
-    && DEBIAN_FRONTEND=noninteractive apt-get upgrade -y \
-    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
-        locales \
-        acl \
-        curl \
-        git \
-        libexpat1 \
-        nginx-full \
-        unzip \
-        zip \
-        default-jre-headless \
-        python2.7 \
-        libpython2.7 \
-        libfreetype6 \
-        libgfortran3 \
-        libgomp1 \
-        r-base-dev \
-        libicu-dev \
-        libcurl4-openssl-dev \
-        libssl-dev \
-        libxml2-dev \
-        libzmq3-dev \
-        pkg-config \
-    && rm -rf /var/lib/apt/lists/* \
-    && localedef -f UTF-8 -i en_US en_US.UTF-8
+USER root
-# Download and extract DSS kit
-RUN DSSKIT="dataiku-dss-$DSS_VERSION" \
-    && cd /home/dataiku \
-    && echo "+ Downloading kit" \
-    && curl -OsS "http://downloads.dataiku.com/public/studio/$DSS_VERSION/$DSSKIT.tar.gz" \
-    && echo "+ Extracting kit" \
-    && tar xf "$DSSKIT.tar.gz" \
-    && rm "$DSSKIT.tar.gz" \
-    && echo "+ Compiling Python code" \
-    && python2.7 -O -m compileall -q "$DSSKIT"/python "$DSSKIT"/dku-jupyter \
-    && { python2.7 -O -m compileall -q "$DSSKIT"/python.packages >/dev/null || true; } \
-    && chown -Rh dataiku:dataiku "$DSSKIT"
+ADD http://archive.apache.org/dist/hadoop/core/hadoop-2.8.3/hadoop-2.8.3.tar.gz /home/dataiku/
-# Install required R packages
-RUN R --slave --no-restore \
-    -e "install.packages(c('httr', 'RJSONIO', 'dplyr', 'IRkernel', 'sparklyr', 'ggplot2', 'tidyr', 'rmarkdown'), \
-        repos=c('file:///home/dataiku/dataiku-dss-$DSS_VERSION/dku-jupyter/R', \
-        'http://cloud.r-project.org'))"
+ADD conf/ /home/dataiku/hadoop-2.8.3/conf
-# Entry point
-WORKDIR /home/dataiku
-USER dataiku
+COPY run-with-hadoop.sh /home/dataiku/
-COPY run.sh /home/dataiku/
+ADD hadoop /usr/bin/
-EXPOSE $DSS_PORT
+ADD http://www-eu.apache.org/dist/spark/spark-2.2.0/spark-2.2.0-bin-hadoop2.7.tgz /home/dataiku/
+
+ADD spark-env.sh /home/dataiku/spark-2.2.0-bin-hadoop2.7/conf/
+
+ADD spark-submit /usr/bin/
+
+RUN chown dataiku:dataiku /usr/bin/hadoop \
+    && chown dataiku:dataiku /usr/bin/spark-submit \
+    && chown -R dataiku:dataiku /home/dataiku/spark-2.2.0-bin-hadoop2.7/ \
+    && chown -R dataiku:dataiku /home/dataiku/hadoop-2.8.3/
-CMD [ "/home/dataiku/run.sh" ]
+CMD [ "/home/dataiku/run-with-hadoop.sh" ]
\ No newline at end of file
diff --git a/dss-docker/build.sh b/dss-docker/build.sh
new file mode 100755
index 0000000..8cba4f7
--- /dev/null
+++ b/dss-docker/build.sh
@@ -0,0 +1,6 @@
+#!/bin/bash -e
+
+docker stop dataiku && docker rm -v dataiku
+docker build . -t gcr.io/retailcatalyst-187519/crs-dataiku:4.1.0-hadoop
+docker run --name dataiku -d gcr.io/retailcatalyst-187519/crs-dataiku:4.1.0-hadoop
+docker exec -it dataiku bash
diff --git a/dss-docker/conf/core-site.xml b/dss-docker/conf/core-site.xml
new file mode 100644
index 0000000..17c9f82
--- /dev/null
+++ b/dss-docker/conf/core-site.xml
@@ -0,0 +1,129 @@
+ + + + + + + hadoop.proxyuser.hive.hosts + * + + + hadoop.tmp.dir + /hadoop/tmp + A base for other temporary directories.
+ + + fs.default.name + hdfs://crs-dataiku-hadoop2-m + The old FileSystem used by FsShell. + + + fs.defaultFS + hdfs://crs-dataiku-hadoop2-m + + The name of the default file system. A URI whose scheme and authority + determine the FileSystem implementation. The uri's scheme determines + the config property (fs.SCHEME.impl) naming the FileSystem + implementation class. The uri's authority is used to determine the + host, port, etc. for a filesystem. + + + + hadoop.proxyuser.hive.groups + * + + + fs.gs.working.dir + / + + The directory relative gs: uris resolve in inside of the default bucket. + + + + fs.gs.system.bucket + dataproc-dd12d8db-81d9-40c8-8d96-7123187903a3-us-east1 + + GCS bucket to use as a default bucket if fs.default.name is not a gs: uri. + + + + fs.gs.metadata.cache.directory + /hadoop_gcs_connector_metadata_cache + + Only used if fs.gs.metadata.cache.type is FILESYSTEM_BACKED, specifies + the local path to use as the base path for storing mirrored GCS metadata. + Must be an absolute path, must be a directory, and must be fully + readable/writable/executable by any user running processes which use the + GCS connector. + + + + fs.gs.impl + com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem + The FileSystem for gs: (GCS) uris. + + + fs.gs.project.id + retailcatalyst-187519 + + Google Cloud Project ID with access to configured GCS buckets. + + + + fs.gs.metadata.cache.enable + false + Dataproc Cluster Properties + + + fs.gs.implicit.dir.infer.enable + true + + If set, we create and return in-memory directory objects on the fly when + no backing object exists, but we know there are files with the same + prefix. + + + + fs.gs.application.name.suffix + -dataproc + + Appended to the user-agent header for API requests to GCS to help identify + the traffic as coming from Dataproc. + + + + fs.AbstractFileSystem.gs.impl + com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS + The AbstractFileSystem for gs: (GCS) uris. + + + fs.gs.metadata.cache.type + FILESYSTEM_BACKED + + Specifies which implementation of DirectoryListCache to use for + supplementing GCS API "list" requests. Supported + implementations: IN_MEMORY: Enforces immediate consistency within + same Java process. FILESYSTEM_BACKED: Enforces consistency across + all cooperating processes pointed at the same local mirror + directory, which may be an NFS directory for massively-distributed + coordination. + + + + fs.gs.block.size + 134217728 + Dataproc Cluster Properties + + diff --git a/dss-docker/conf/hdfs-site.xml b/dss-docker/conf/hdfs-site.xml new file mode 100644 index 0000000..e15e6ad --- /dev/null +++ b/dss-docker/conf/hdfs-site.xml @@ -0,0 +1,146 @@ + + + + + + + dfs.namenode.rpc-address + crs-dataiku-hadoop2-m:8020 + + RPC address that handles all clients requests. If empty then we'll get + thevalue from fs.default.name.The value of this property will take the + form of hdfs://nn-host1:rpc-port. + + + + dfs.permissions.enabled + false + + If "true", enable permission checking in HDFS. If + "false", permission checking is turned off, but + all other behavior is unchanged. Switching from one parameter + value to the other does not change the mode, owner or group of + files or directories. + + + + dfs.datanode.data.dir + /hadoop/dfs/data + + Determines where on the local filesystem an DFS datanode should store its + blocks. If this is a comma-delimited list of directories, then data will + be stored in all named directories, typically on different + devices.Directories that do not exist are ignored. 
+ + + + dfs.namenode.http-address + 0.0.0.0:9870 + Dataproc Cluster Properties + + + dfs.permissions.supergroup + hadoop + The name of the group of super-users. + + + dfs.hosts + /etc/hadoop/conf/nodes_include + + + dfs.namenode.secondary.http-address + 0.0.0.0:9868 + Dataproc Cluster Properties + + + dfs.client.read.shortcircuit + true + + + dfs.namenode.name.dir + /hadoop/dfs/name + + Determines where on the local filesystem the DFS namenode should store the + name table(fsimage). If this is a comma-delimited list of directories then + the name table is replicated in all of the directories for redundancy. + + + + dfs.replication + 2 + + Default block replication. The actual number of replications can be + specified when the file is created. The default is used if replication + is not specified in create time. + + + + dfs.domain.socket.path + /var/lib/hadoop-hdfs/dn_socket + + + dfs.namenode.checkpoint.dir + file:///hadoop/dfs/namesecondary + + Determines where on the local filesystem the DFS secondary namenode should + store the temporary images to merge. If this is a comma-delimited + list of directories then the image is replicated in all of the + directories for redundancy. + + + + dfs.hosts.exclude + /etc/hadoop/conf/nodes_exclude + + + dfs.datanode.data.dir.perm + 700 + + Permissions for the directories on on the local filesystem where the DFS + data node store its blocks. The permissions can either be octal or + symbolic. + + + + dfs.datanode.address + 0.0.0.0:9866 + Dataproc Cluster Properties + + + dfs.namenode.https-address + 0.0.0.0:9871 + Dataproc Cluster Properties + + + dfs.datanode.https.address + 0.0.0.0:9865 + Dataproc Cluster Properties + + + dfs.datanode.http.address + 0.0.0.0:9864 + Dataproc Cluster Properties + + + dfs.datanode.ipc.address + 0.0.0.0:9867 + Dataproc Cluster Properties + + + dfs.namenode.secondary.https-address + 0.0.0.0:9869 + Dataproc Cluster Properties + + diff --git a/dss-docker/conf/mapred-site.xml b/dss-docker/conf/mapred-site.xml new file mode 100644 index 0000000..2e350e7 --- /dev/null +++ b/dss-docker/conf/mapred-site.xml @@ -0,0 +1,162 @@ + + + + + + mapreduce.job.maps + 45 + Dataproc Cluster Properties + + + mapreduce.map.memory.mb + 3072 + Dataproc Cluster Properties + + + mapreduce.jobhistory.address + crs-dataiku-hadoop2-m:10020 + MapReduce JobHistory Server IPC host:port + + + mapreduce.jobhistory.webapp.address + crs-dataiku-hadoop2-m:19888 + MapReduce JobHistory Server Web UI host:port + + + mapreduce.reduce.memory.mb + 3072 + Dataproc Cluster Properties + + + yarn.app.mapreduce.am.command-opts + -Xmx2457m + Dataproc Cluster Properties + + + mapreduce.tasktracker.map.tasks.maximum + 4 + + Property from MapReduce version 1 still used for TeraGen sharding. + + + + mapreduce.framework.name + yarn + + + mapreduce.input.fileinputformat.list-status.num-threads + 20 + + The number of threads to use to list and fetch block locations for the + specified input paths. Note: multiple threads should not be used if a + custom non thread-safe path filter is used. Setting a larger value than + the default of 1 can significantly improve job startup overhead, + especially if using GCS as input with multi-level directories, such + as in partitioned Hive tables. 
+ + + + mapreduce.reduce.java.opts + -Xmx2457m + Dataproc Cluster Properties + + + yarn.app.mapreduce.am.resource.cpu-vcores + 1 + Dataproc Cluster Properties + + + mapreduce.reduce.cpu.vcores + 1 + Dataproc Cluster Properties + + + mapreduce.map.cpu.vcores + 1 + Dataproc Cluster Properties + + + yarn.app.mapreduce.am.resource.mb + 3072 + Dataproc Cluster Properties + + + mapreduce.job.reduces + 15 + Dataproc Cluster Properties + + + mapreduce.map.java.opts + -Xmx2457m + Dataproc Cluster Properties + + + mapreduce.job.working.dir + /user/${user.name} + + The FileSystem working directory to use for relative paths. + + + + mapreduce.fileoutputcommitter.algorithm.version + 2 + + Updated file output committer algorithm in Hadoop 2.7+. Significantly + improves commitJob times when using the Google Cloud Storage connector. + See https://issues.apache.org/jira/browse/MAPEDUCE-4815 for more details. + + + + mapred.local.dir + /hadoop/mapred/local + + Directories on the local machine in which to store mapreduce temp files. + + + + mapred.bq.project.id + retailcatalyst-187519 + + Google Cloud Project ID to use for BigQuery operations. + + + + mapred.bq.output.buffer.size + 67108864 + + The size in bytes of the output buffer to use when writing to BigQuery. + + + + mapred.bq.gcs.bucket + dataproc-dd12d8db-81d9-40c8-8d96-7123187903a3-us-east1 + + The GCS bucket holding temporary BigQuery data for the input connector. + + + + mapreduce.job.reduce.slowstart.completedmaps + 0.95 + Dataproc Cluster Properties + + + mapreduce.task.io.sort.mb + 256 + Dataproc Cluster Properties + + diff --git a/dss-docker/conf/yarn-site.xml b/dss-docker/conf/yarn-site.xml new file mode 100644 index 0000000..be76fea --- /dev/null +++ b/dss-docker/conf/yarn-site.xml @@ -0,0 +1,115 @@ + + + + + + yarn.nodemanager.remote-app-log-dir + /yarn-logs/ + + The remote path, on the default FS, to store logs. + + + + yarn.nodemanager.aux-services + mapreduce_shuffle,spark_shuffle + + + yarn.resourcemanager.nodes.exclude-path + /etc/hadoop/conf/nodes_exclude + + + yarn.nodemanager.local-dirs + /hadoop/yarn/nm-local-dir + + Directories on the local machine in which to application temp files. + + + + yarn.nodemanager.aux-services.spark_shuffle.class + org.apache.spark.network.yarn.YarnShuffleService + + + yarn.resourcemanager.hostname + crs-dataiku-hadoop2-m + + + yarn.nodemanager.vmem-check-enabled + false + + + + The maximum allocation for every container request at the RM, in + terms of virtual CPU cores. Requests higher than this won't take + effect, and will get capped to this value. + + yarn.scheduler.maximum-allocation-vcores + 32000 + + + yarn.nodemanager.resource.memory-mb + 24576 + Dataproc Cluster Properties + + + yarn.scheduler.minimum-allocation-mb + 1024 + Dataproc Cluster Properties + + + yarn.resourcemanager.nodes.include-path + /etc/hadoop/conf/nodes_include + + + yarn.nodemanager.resource.cpu-vcores + 4 + + Number of vcores that can be allocated for containers. This is used by + the RM scheduler when allocating resources for containers. This is not + used to limit the number of physical cores used by YARN containers. + + + + yarn.resourcemanager.fs.state-store.uri + file:///hadoop/yarn/system/rmstore + + URI pointing to the location of the FileSystem path where RM state will + be stored. This is set on the local file system to avoid collisions in + GCS. 
+ + + yarn.scheduler.maximum-allocation-mb + 24576 + Dataproc Cluster Properties + + yarn.application.classpath + $HADOOP_CONF_DIR, + $HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*, + $HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*, + $HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*, + $HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*' + + yarn.resourcemanager.recovery.enabled + true + Enable RM to recover state after starting. + + yarn.log-aggregation-enable + false + Enable remote logs aggregation to the default FS. +
diff --git a/dss-docker/hadoop b/dss-docker/hadoop
new file mode 100755
index 0000000..d065d1b
--- /dev/null
+++ b/dss-docker/hadoop
@@ -0,0 +1,6 @@
+#! /bin/bash -e
+
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64/jre
+export HADOOP_HOME="/home/dataiku/hadoop-2.8.3"
+
+"$HADOOP_HOME"/bin/hadoop "$@"
\ No newline at end of file
diff --git a/dss-docker/hadoop-env.sh b/dss-docker/hadoop-env.sh
new file mode 100644
index 0000000..823b070
--- /dev/null
+++ b/dss-docker/hadoop-env.sh
@@ -0,0 +1,2 @@
+export HADOOP_HOME="/home/dataiku/hadoop-2.8.3"
+export PATH=$PATH:$HADOOP_HOME/conf:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
\ No newline at end of file
diff --git a/dss-docker/run-with-hadoop.sh b/dss-docker/run-with-hadoop.sh
new file mode 100755
index 0000000..3adc7b5
--- /dev/null
+++ b/dss-docker/run-with-hadoop.sh
@@ -0,0 +1,27 @@
+#!/bin/bash -e
+
+DSS_INSTALLDIR="/home/dataiku/dataiku-dss-$DSS_VERSION"
+export SPARK_HOME="/home/dataiku/spark-2.2.0-bin-hadoop2.7"
+
+if [ ! -f "$DSS_DATADIR"/bin/env-default.sh ]; then
+    chown dataiku:dataiku "$DSS_DATADIR"
+    # Initialize new data directory
+    su dataiku -c "$DSS_INSTALLDIR/installer.sh -d $DSS_DATADIR -p $DSS_PORT"
+    su dataiku -c "$DSS_DATADIR/bin/dssadmin install-R-integration"
+    su dataiku -c '"$DSS_DATADIR"/bin/dssadmin install-hadoop-integration'
+    su dataiku -c '"$DSS_DATADIR"/bin/dssadmin install-spark-integration'
+    su dataiku -c 'echo "dku.registration.channel=docker-image" >>"$DSS_DATADIR"/config/dip.properties'
+
+elif [ $(bash -c 'source "$DSS_DATADIR"/bin/env-default.sh && echo "$DKUINSTALLDIR"') != "$DSS_INSTALLDIR" ]; then
+    # Upgrade existing data directory. This is not tested!!
+    "$DSS_INSTALLDIR"/installer.sh -d "$DSS_DATADIR" -u -y
+    "$DSS_DATADIR"/bin/dssadmin install-R-integration
+    "$DSS_DATADIR"/bin/dssadmin install-hadoop-integration
+    "$DSS_DATADIR"/bin/dssadmin install-spark-integration
+
+fi
+
+mkdir -p /home/dataiku/dss/lib/jdbc
+curl 'https://storage.googleapis.com/jdbc-drivers/sqljdbc42.jar' -o /home/dataiku/dss/lib/jdbc/sqljdbc42.jar
+chown dataiku:dataiku /home/dataiku/dss/lib/jdbc
+su dataiku -c 'exec "$DSS_DATADIR"/bin/dss run'
diff --git a/dss-docker/run.sh b/dss-docker/run.sh
index e245a92..dce3cfa 100755
--- a/dss-docker/run.sh
+++ b/dss-docker/run.sh
@@ -3,10 +3,13 @@ DSS_INSTALLDIR="/home/dataiku/dataiku-dss-$DSS_VERSION"
 if [ ! -f "$DSS_DATADIR"/bin/env-default.sh ]; then
+    echo "Changing the owner of $DSS_DATADIR"
+    chown dataiku:dataiku "$DSS_DATADIR"
     # Initialize new data directory
-    "$DSS_INSTALLDIR"/installer.sh -d "$DSS_DATADIR" -p "$DSS_PORT"
-    "$DSS_DATADIR"/bin/dssadmin install-R-integration
-    echo "dku.registration.channel=docker-image" >>"$DSS_DATADIR"/config/dip.properties
+    su dataiku -c "$DSS_INSTALLDIR/installer.sh -d $DSS_DATADIR -p $DSS_PORT"
+    su dataiku -c "$DSS_DATADIR/bin/dssadmin install-R-integration"
+    echo "Reached here.."
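+    # Presumably marks this install as coming from the Docker image (Dataiku registration channel)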
+    su dataiku -c 'echo "dku.registration.channel=docker-image" >>"$DSS_DATADIR"/config/dip.properties'
 
 elif [ $(bash -c 'source "$DSS_DATADIR"/bin/env-default.sh && echo "$DKUINSTALLDIR"') != "$DSS_INSTALLDIR" ]; then
     # Upgrade existing data directory
@@ -15,4 +18,11 @@ elif [ $(bash -c 'source "$DSS_DATADIR"/bin/env-default.sh && echo "$DKUINSTALLD
 fi
 
-exec "$DSS_DATADIR"/bin/dss run
+mkdir -p /home/dataiku/dss/lib/jdbc
+curl 'https://storage.googleapis.com/simba-bq-release/jdbc/SimbaJDBCDriverforGoogleBigQuery42_1.1.4.1004.zip' -o /home/dataiku/dss/lib/jdbc/driver.zip
+cd /home/dataiku/dss/lib/jdbc && unzip -o driver.zip
+curl 'https://storage.googleapis.com/jdbc-drivers/sqljdbc42.jar' -o /home/dataiku/dss/lib/jdbc/sqljdbc42.jar
+
+chown dataiku:dataiku /home/dataiku/dss/lib/jdbc
+
+su dataiku -c 'exec "$DSS_DATADIR"/bin/dss run'
diff --git a/dss-docker/spark-env.sh b/dss-docker/spark-env.sh
new file mode 100755
index 0000000..0097985
--- /dev/null
+++ b/dss-docker/spark-env.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This file is sourced when running various Spark programs.
+# Copy it as spark-env.sh and edit that to configure Spark for your site.
+
+# Options read when launching programs locally with
+# ./bin/run-example or ./bin/spark-submit
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
+
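+# Point Spark at the Hadoop config bundled in this image (copied from the Dataproc
+# cluster) so spark-submit picks up the YARN ResourceManager and GCS connector settings.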
+HADOOP_CONF_DIR=/home/dataiku/hadoop-2.8.3/conf
+
+# Options read by executors and drivers running inside the cluster
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
+# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
+# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos
+
+# Options read in YARN client mode
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
+# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
+# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)
+
+# Options for the daemons used in the standalone deploy mode
+# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname
+# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
+# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
+# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
+# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
+# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
+# - SPARK_WORKER_DIR, to set the working directory of worker processes
+# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
+# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g).
+# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
+# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y")
+# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
+# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
+
+# Generic options for the daemons used in the standalone deploy mode
+# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf)
+# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs)
+# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp)
+# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER)
+# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0)
+# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file.
\ No newline at end of file
diff --git a/dss-docker/spark-submit b/dss-docker/spark-submit
new file mode 100755
index 0000000..af594fd
--- /dev/null
+++ b/dss-docker/spark-submit
@@ -0,0 +1,9 @@
+#!/bin/bash -e
+
+export SPARK_HOME=/home/dataiku/spark-2.2.0-bin-hadoop2.7
+export HADOOP_CONF_DIR=/home/dataiku/hadoop-2.8.3/conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64/jre
+export PATH=$PATH:/var/dataiku/data/pyenv/bin:/home/dataiku/dss/pyenv/bin
+export SPARK_LOCAL_HOSTNAME=127.0.0.1
+
+"$SPARK_HOME"/bin/spark-submit "$@"
\ No newline at end of file