diff --git a/dss-docker/Dockerfile b/dss-docker/Dockerfile
index 41dea41..469dc92 100644
--- a/dss-docker/Dockerfile
+++ b/dss-docker/Dockerfile
@@ -1,68 +1,29 @@
-FROM debian:9
+FROM dataiku/dss:4.1.0
-ENV DSS_VERSION="4.1.0" \
- DSS_DATADIR="/home/dataiku/dss" \
- DSS_PORT=10000
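+# DSS_VERSION, DSS_DATADIR and DSS_PORT are assumed to be provided as ENV by
+# the dataiku/dss:4.1.0 base image.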
+# Entry point
+WORKDIR /home/dataiku
-# Dataiku account and data dir setup
-RUN useradd -s /bin/bash dataiku \
- && mkdir -p /home/dataiku ${DSS_DATADIR} \
- && chown -Rh dataiku:dataiku /home/dataiku ${DSS_DATADIR}
+EXPOSE $DSS_PORT
-# System dependencies
-# TODO - much could be removed by building externally the required R packages
-RUN apt-get update \
- && DEBIAN_FRONTEND=noninteractive apt-get upgrade -y \
- && DEBIAN_FRONTEND=noninteractive apt-get install -y \
- locales \
- acl \
- curl \
- git \
- libexpat1 \
- nginx-full \
- unzip \
- zip \
- default-jre-headless \
- python2.7 \
- libpython2.7 \
- libfreetype6 \
- libgfortran3 \
- libgomp1 \
- r-base-dev \
- libicu-dev \
- libcurl4-openssl-dev \
- libssl-dev \
- libxml2-dev \
- libzmq3-dev \
- pkg-config \
- && rm -rf /var/lib/apt/lists/* \
- && localedef -f UTF-8 -i en_US en_US.UTF-8
+USER root
-# Download and extract DSS kit
-RUN DSSKIT="dataiku-dss-$DSS_VERSION" \
- && cd /home/dataiku \
- && echo "+ Downloading kit" \
- && curl -OsS "http://downloads.dataiku.com/public/studio/$DSS_VERSION/$DSSKIT.tar.gz" \
- && echo "+ Extracting kit" \
- && tar xf "$DSSKIT.tar.gz" \
- && rm "$DSSKIT.tar.gz" \
- && echo "+ Compiling Python code" \
- && python2.7 -O -m compileall -q "$DSSKIT"/python "$DSSKIT"/dku-jupyter \
- && { python2.7 -O -m compileall -q "$DSSKIT"/python.packages >/dev/null || true; } \
- && chown -Rh dataiku:dataiku "$DSSKIT"
+ADD http://archive.apache.org/dist/hadoop/core/hadoop-2.8.3/hadoop-2.8.3.tar.gz /home/dataiku/
-# Install required R packages
-RUN R --slave --no-restore \
- -e "install.packages(c('httr', 'RJSONIO', 'dplyr', 'IRkernel', 'sparklyr', 'ggplot2', 'tidyr', 'rmarkdown'), \
- repos=c('file:///home/dataiku/dataiku-dss-$DSS_VERSION/dku-jupyter/R', \
- 'http://cloud.r-project.org'))"
+ADD conf/ /home/dataiku/hadoop-2.8.3/conf
-# Entry point
-WORKDIR /home/dataiku
-USER dataiku
+COPY run-with-hadoop.sh /home/dataiku/
-COPY run.sh /home/dataiku/
+ADD hadoop /usr/bin/
-EXPOSE $DSS_PORT
+ADD http://www-eu.apache.org/dist/spark/spark-2.2.0/spark-2.2.0-bin-hadoop2.7.tgz /home/dataiku/
+
+ADD spark-env.sh /home/dataiku/spark-2.2.0-bin-hadoop2.7/conf/
+
+ADD spark-submit /usr/bin/
+
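+# Note: ADD with a remote URL does not auto-extract archives, so the Hadoop and
+# Spark tarballs above are unpacked explicitly here (a sketch, assuming in-place
+# extraction under /home/dataiku is the intent).
+RUN tar xzf /home/dataiku/hadoop-2.8.3.tar.gz -C /home/dataiku \
+ && tar xzf /home/dataiku/spark-2.2.0-bin-hadoop2.7.tgz -C /home/dataiku \
+ && rm /home/dataiku/hadoop-2.8.3.tar.gz /home/dataiku/spark-2.2.0-bin-hadoop2.7.tgz
+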
+RUN chown dataiku:dataiku /usr/bin/hadoop \
+ && chown dataiku:dataiku /usr/bin/spark-submit \
+ && chown -R dataiku:dataiku /home/dataiku/spark-2.2.0-bin-hadoop2.7/ \
+ && chown -R dataiku:dataiku /home/dataiku/hadoop-2.8.3/
-CMD [ "/home/dataiku/run.sh" ]
+CMD [ "/home/dataiku/run-with-hadoop.sh" ]
\ No newline at end of file
diff --git a/dss-docker/build.sh b/dss-docker/build.sh
new file mode 100755
index 0000000..8cba4f7
--- /dev/null
+++ b/dss-docker/build.sh
@@ -0,0 +1,6 @@
+#!/bin/bash -e
+
+docker stop dataiku && docker rm -v dataiku || true  # tolerate a missing container on the first run
+docker build . -t gcr.io/retailcatalyst-187519/crs-dataiku:4.1.0-hadoop
+docker run --name dataiku -d gcr.io/retailcatalyst-187519/crs-dataiku:4.1.0-hadoop
+docker exec -it dataiku bash
diff --git a/dss-docker/conf/core-site.xml b/dss-docker/conf/core-site.xml
new file mode 100644
index 0000000..17c9f82
--- /dev/null
+++ b/dss-docker/conf/core-site.xml
@@ -0,0 +1,129 @@
+<?xml version="1.0" ?>
+<configuration>
+  <property>
+    <name>hadoop.proxyuser.hive.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.tmp.dir</name>
+    <value>/hadoop/tmp</value>
+    <description>A base for other temporary directories.</description>
+  </property>
+  <property>
+    <name>fs.default.name</name>
+    <value>hdfs://crs-dataiku-hadoop2-m</value>
+    <description>The old FileSystem used by FsShell.</description>
+  </property>
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://crs-dataiku-hadoop2-m</value>
+    <description>
+      The name of the default file system. A URI whose scheme and authority
+      determine the FileSystem implementation. The uri's scheme determines
+      the config property (fs.SCHEME.impl) naming the FileSystem
+      implementation class. The uri's authority is used to determine the
+      host, port, etc. for a filesystem.
+    </description>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hive.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>fs.gs.working.dir</name>
+    <value>/</value>
+    <description>
+      The directory relative gs: uris resolve in inside of the default bucket.
+    </description>
+  </property>
+  <property>
+    <name>fs.gs.system.bucket</name>
+    <value>dataproc-dd12d8db-81d9-40c8-8d96-7123187903a3-us-east1</value>
+    <description>
+      GCS bucket to use as a default bucket if fs.default.name is not a gs: uri.
+    </description>
+  </property>
+  <property>
+    <name>fs.gs.metadata.cache.directory</name>
+    <value>/hadoop_gcs_connector_metadata_cache</value>
+    <description>
+      Only used if fs.gs.metadata.cache.type is FILESYSTEM_BACKED, specifies
+      the local path to use as the base path for storing mirrored GCS metadata.
+      Must be an absolute path, must be a directory, and must be fully
+      readable/writable/executable by any user running processes which use the
+      GCS connector.
+    </description>
+  </property>
+  <property>
+    <name>fs.gs.impl</name>
+    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value>
+    <description>The FileSystem for gs: (GCS) uris.</description>
+  </property>
+  <property>
+    <name>fs.gs.project.id</name>
+    <value>retailcatalyst-187519</value>
+    <description>
+      Google Cloud Project ID with access to configured GCS buckets.
+    </description>
+  </property>
+  <property>
+    <name>fs.gs.metadata.cache.enable</name>
+    <value>false</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>fs.gs.implicit.dir.infer.enable</name>
+    <value>true</value>
+    <description>
+      If set, we create and return in-memory directory objects on the fly when
+      no backing object exists, but we know there are files with the same
+      prefix.
+    </description>
+  </property>
+  <property>
+    <name>fs.gs.application.name.suffix</name>
+    <value>-dataproc</value>
+    <description>
+      Appended to the user-agent header for API requests to GCS to help identify
+      the traffic as coming from Dataproc.
+    </description>
+  </property>
+  <property>
+    <name>fs.AbstractFileSystem.gs.impl</name>
+    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
+    <description>The AbstractFileSystem for gs: (GCS) uris.</description>
+  </property>
+  <property>
+    <name>fs.gs.metadata.cache.type</name>
+    <value>FILESYSTEM_BACKED</value>
+    <description>
+      Specifies which implementation of DirectoryListCache to use for
+      supplementing GCS API "list" requests. Supported
+      implementations: IN_MEMORY: Enforces immediate consistency within
+      same Java process. FILESYSTEM_BACKED: Enforces consistency across
+      all cooperating processes pointed at the same local mirror
+      directory, which may be an NFS directory for massively-distributed
+      coordination.
+    </description>
+  </property>
+  <property>
+    <name>fs.gs.block.size</name>
+    <value>134217728</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+</configuration>
diff --git a/dss-docker/conf/hdfs-site.xml b/dss-docker/conf/hdfs-site.xml
new file mode 100644
index 0000000..e15e6ad
--- /dev/null
+++ b/dss-docker/conf/hdfs-site.xml
@@ -0,0 +1,146 @@
+<?xml version="1.0" ?>
+<configuration>
+  <property>
+    <name>dfs.namenode.rpc-address</name>
+    <value>crs-dataiku-hadoop2-m:8020</value>
+    <description>
+      RPC address that handles all clients requests. If empty then we'll get
+      the value from fs.default.name. The value of this property will take the
+      form of hdfs://nn-host1:rpc-port.
+    </description>
+  </property>
+  <property>
+    <name>dfs.permissions.enabled</name>
+    <value>false</value>
+    <description>
+      If "true", enable permission checking in HDFS. If
+      "false", permission checking is turned off, but
+      all other behavior is unchanged. Switching from one parameter
+      value to the other does not change the mode, owner or group of
+      files or directories.
+    </description>
+  </property>
+  <property>
+    <name>dfs.datanode.data.dir</name>
+    <value>/hadoop/dfs/data</value>
+    <description>
+      Determines where on the local filesystem a DFS datanode should store its
+      blocks. If this is a comma-delimited list of directories, then data will
+      be stored in all named directories, typically on different devices.
+      Directories that do not exist are ignored.
+    </description>
+  </property>
+  <property>
+    <name>dfs.namenode.http-address</name>
+    <value>0.0.0.0:9870</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.permissions.supergroup</name>
+    <value>hadoop</value>
+    <description>The name of the group of super-users.</description>
+  </property>
+  <property>
+    <name>dfs.hosts</name>
+    <value>/etc/hadoop/conf/nodes_include</value>
+  </property>
+  <property>
+    <name>dfs.namenode.secondary.http-address</name>
+    <value>0.0.0.0:9868</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.client.read.shortcircuit</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>dfs.namenode.name.dir</name>
+    <value>/hadoop/dfs/name</value>
+    <description>
+      Determines where on the local filesystem the DFS namenode should store the
+      name table(fsimage). If this is a comma-delimited list of directories then
+      the name table is replicated in all of the directories for redundancy.
+    </description>
+  </property>
+  <property>
+    <name>dfs.replication</name>
+    <value>2</value>
+    <description>
+      Default block replication. The actual number of replications can be
+      specified when the file is created. The default is used if replication
+      is not specified at create time.
+    </description>
+  </property>
+  <property>
+    <name>dfs.domain.socket.path</name>
+    <value>/var/lib/hadoop-hdfs/dn_socket</value>
+  </property>
+  <property>
+    <name>dfs.namenode.checkpoint.dir</name>
+    <value>file:///hadoop/dfs/namesecondary</value>
+    <description>
+      Determines where on the local filesystem the DFS secondary namenode should
+      store the temporary images to merge. If this is a comma-delimited
+      list of directories then the image is replicated in all of the
+      directories for redundancy.
+    </description>
+  </property>
+  <property>
+    <name>dfs.hosts.exclude</name>
+    <value>/etc/hadoop/conf/nodes_exclude</value>
+  </property>
+  <property>
+    <name>dfs.datanode.data.dir.perm</name>
+    <value>700</value>
+    <description>
+      Permissions for the directories on the local filesystem where the DFS
+      data node stores its blocks. The permissions can either be octal or
+      symbolic.
+    </description>
+  </property>
+  <property>
+    <name>dfs.datanode.address</name>
+    <value>0.0.0.0:9866</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.namenode.https-address</name>
+    <value>0.0.0.0:9871</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.datanode.https.address</name>
+    <value>0.0.0.0:9865</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.datanode.http.address</name>
+    <value>0.0.0.0:9864</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.datanode.ipc.address</name>
+    <value>0.0.0.0:9867</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.namenode.secondary.https-address</name>
+    <value>0.0.0.0:9869</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+</configuration>
diff --git a/dss-docker/conf/mapred-site.xml b/dss-docker/conf/mapred-site.xml
new file mode 100644
index 0000000..2e350e7
--- /dev/null
+++ b/dss-docker/conf/mapred-site.xml
@@ -0,0 +1,162 @@
+<?xml version="1.0" ?>
+<configuration>
+  <property>
+    <name>mapreduce.job.maps</name>
+    <value>45</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.map.memory.mb</name>
+    <value>3072</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.jobhistory.address</name>
+    <value>crs-dataiku-hadoop2-m:10020</value>
+    <description>MapReduce JobHistory Server IPC host:port</description>
+  </property>
+  <property>
+    <name>mapreduce.jobhistory.webapp.address</name>
+    <value>crs-dataiku-hadoop2-m:19888</value>
+    <description>MapReduce JobHistory Server Web UI host:port</description>
+  </property>
+  <property>
+    <name>mapreduce.reduce.memory.mb</name>
+    <value>3072</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>yarn.app.mapreduce.am.command-opts</name>
+    <value>-Xmx2457m</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.tasktracker.map.tasks.maximum</name>
+    <value>4</value>
+    <description>
+      Property from MapReduce version 1 still used for TeraGen sharding.
+    </description>
+  </property>
+  <property>
+    <name>mapreduce.framework.name</name>
+    <value>yarn</value>
+  </property>
+  <property>
+    <name>mapreduce.input.fileinputformat.list-status.num-threads</name>
+    <value>20</value>
+    <description>
+      The number of threads to use to list and fetch block locations for the
+      specified input paths. Note: multiple threads should not be used if a
+      custom non thread-safe path filter is used. Setting a larger value than
+      the default of 1 can significantly improve job startup overhead,
+      especially if using GCS as input with multi-level directories, such
+      as in partitioned Hive tables.
+    </description>
+  </property>
+  <property>
+    <name>mapreduce.reduce.java.opts</name>
+    <value>-Xmx2457m</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>yarn.app.mapreduce.am.resource.cpu-vcores</name>
+    <value>1</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.reduce.cpu.vcores</name>
+    <value>1</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.map.cpu.vcores</name>
+    <value>1</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>yarn.app.mapreduce.am.resource.mb</name>
+    <value>3072</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.job.reduces</name>
+    <value>15</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.map.java.opts</name>
+    <value>-Xmx2457m</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.job.working.dir</name>
+    <value>/user/${user.name}</value>
+    <description>
+      The FileSystem working directory to use for relative paths.
+    </description>
+  </property>
+  <property>
+    <name>mapreduce.fileoutputcommitter.algorithm.version</name>
+    <value>2</value>
+    <description>
+      Updated file output committer algorithm in Hadoop 2.7+. Significantly
+      improves commitJob times when using the Google Cloud Storage connector.
+      See https://issues.apache.org/jira/browse/MAPREDUCE-4815 for more details.
+    </description>
+  </property>
+  <property>
+    <name>mapred.local.dir</name>
+    <value>/hadoop/mapred/local</value>
+    <description>
+      Directories on the local machine in which to store mapreduce temp files.
+    </description>
+  </property>
+  <property>
+    <name>mapred.bq.project.id</name>
+    <value>retailcatalyst-187519</value>
+    <description>
+      Google Cloud Project ID to use for BigQuery operations.
+    </description>
+  </property>
+  <property>
+    <name>mapred.bq.output.buffer.size</name>
+    <value>67108864</value>
+    <description>
+      The size in bytes of the output buffer to use when writing to BigQuery.
+    </description>
+  </property>
+  <property>
+    <name>mapred.bq.gcs.bucket</name>
+    <value>dataproc-dd12d8db-81d9-40c8-8d96-7123187903a3-us-east1</value>
+    <description>
+      The GCS bucket holding temporary BigQuery data for the input connector.
+    </description>
+  </property>
+  <property>
+    <name>mapreduce.job.reduce.slowstart.completedmaps</name>
+    <value>0.95</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.task.io.sort.mb</name>
+    <value>256</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+</configuration>
diff --git a/dss-docker/conf/yarn-site.xml b/dss-docker/conf/yarn-site.xml
new file mode 100644
index 0000000..be76fea
--- /dev/null
+++ b/dss-docker/conf/yarn-site.xml
@@ -0,0 +1,115 @@
+<?xml version="1.0" ?>
+<configuration>
+  <property>
+    <name>yarn.nodemanager.remote-app-log-dir</name>
+    <value>/yarn-logs/</value>
+    <description>
+      The remote path, on the default FS, to store logs.
+    </description>
+  </property>
+  <property>
+    <name>yarn.nodemanager.aux-services</name>
+    <value>mapreduce_shuffle,spark_shuffle</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.nodes.exclude-path</name>
+    <value>/etc/hadoop/conf/nodes_exclude</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.local-dirs</name>
+    <value>/hadoop/yarn/nm-local-dir</value>
+    <description>
+      Directories on the local machine in which to store application temp files.
+    </description>
+  </property>
+  <property>
+    <name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
+    <value>org.apache.spark.network.yarn.YarnShuffleService</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.hostname</name>
+    <value>crs-dataiku-hadoop2-m</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.vmem-check-enabled</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>yarn.scheduler.maximum-allocation-vcores</name>
+    <value>32000</value>
+    <description>
+      The maximum allocation for every container request at the RM, in
+      terms of virtual CPU cores. Requests higher than this won't take
+      effect, and will get capped to this value.
+    </description>
+  </property>
+  <property>
+    <name>yarn.nodemanager.resource.memory-mb</name>
+    <value>24576</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>yarn.scheduler.minimum-allocation-mb</name>
+    <value>1024</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.nodes.include-path</name>
+    <value>/etc/hadoop/conf/nodes_include</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.resource.cpu-vcores</name>
+    <value>4</value>
+    <description>
+      Number of vcores that can be allocated for containers. This is used by
+      the RM scheduler when allocating resources for containers. This is not
+      used to limit the number of physical cores used by YARN containers.
+    </description>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.fs.state-store.uri</name>
+    <value>file:///hadoop/yarn/system/rmstore</value>
+    <description>
+      URI pointing to the location of the FileSystem path where RM state will
+      be stored. This is set on the local file system to avoid collisions in
+      GCS.
+    </description>
+  </property>
+  <property>
+    <name>yarn.scheduler.maximum-allocation-mb</name>
+    <value>24576</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>yarn.application.classpath</name>
+    <value>$HADOOP_CONF_DIR,
+      $HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,
+      $HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,
+      $HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,
+      $HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.recovery.enabled</name>
+    <value>true</value>
+    <description>Enable RM to recover state after starting.</description>
+  </property>
+  <property>
+    <name>yarn.log-aggregation-enable</name>
+    <value>false</value>
+    <description>Enable remote logs aggregation to the default FS.</description>
+  </property>
+</configuration>
diff --git a/dss-docker/hadoop b/dss-docker/hadoop
new file mode 100755
index 0000000..d065d1b
--- /dev/null
+++ b/dss-docker/hadoop
@@ -0,0 +1,6 @@
+#! /bin/bash -e
+
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64/jre
+export HADOOP_HOME="/home/dataiku/hadoop-2.8.3"
+
+"$HADOOP_HOME"/bin/hadoop "$@"
\ No newline at end of file
diff --git a/dss-docker/hadoop-env.sh b/dss-docker/hadoop-env.sh
new file mode 100644
index 0000000..823b070
--- /dev/null
+++ b/dss-docker/hadoop-env.sh
@@ -0,0 +1,2 @@
+export HADOOP_HOME="/home/dataiku/hadoop-2.8.3"
+export PATH=$PATH:$HADOOP_HOME/conf:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
\ No newline at end of file
diff --git a/dss-docker/run-with-hadoop.sh b/dss-docker/run-with-hadoop.sh
new file mode 100755
index 0000000..3adc7b5
--- /dev/null
+++ b/dss-docker/run-with-hadoop.sh
@@ -0,0 +1,27 @@
+#!/bin/bash -e
+
+DSS_INSTALLDIR="/home/dataiku/dataiku-dss-$DSS_VERSION"
+export SPARK_HOME="/home/dataiku/spark-2.2.0-bin-hadoop2.7"
+
+if [ ! -f "$DSS_DATADIR"/bin/env-default.sh ]; then
+ chown dataiku:dataiku "$DSS_DATADIR"
+ # Initialize new data directory
+ su dataiku -c "$DSS_INSTALLDIR/installer.sh -d $DSS_DATADIR -p $DSS_PORT"
+ su dataiku -c "$DSS_DATADIR/bin/dssadmin install-R-integration"
+ su dataiku -c '"$DSS_DATADIR"/bin/dssadmin install-hadoop-integration'
+ su dataiku -c '"$DSS_DATADIR"/bin/dssadmin install-spark-integration'
+ su dataiku -c 'echo "dku.registration.channel=docker-image" >>"$DSS_DATADIR"/config/dip.properties'
+
+elif [ $(bash -c 'source "$DSS_DATADIR"/bin/env-default.sh && echo "$DKUINSTALLDIR"') != "$DSS_INSTALLDIR" ]; then
+ # Upgrade existing data directory. This is not tested!!
+ "$DSS_INSTALLDIR"/installer.sh -d "$DSS_DATADIR" -u -y
+ "$DSS_DATADIR"/bin/dssadmin install-R-integration
+ "$DSS_DATADIR"/bin/dssadmin install-hadoop-integration
+ "$DSS_DATADIR"/bin/dssadmin install-spark-integration
+
+fi
+
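+# Fetch the Microsoft SQL Server JDBC driver into the DSS lib directory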
+mkdir -p /home/dataiku/dss/lib/jdbc
+curl 'https://storage.googleapis.com/jdbc-drivers/sqljdbc42.jar' -o /home/dataiku/dss/lib/jdbc/sqljdbc42.jar
+chown -R dataiku:dataiku /home/dataiku/dss/lib/jdbc
+su dataiku -c 'exec "$DSS_DATADIR"/bin/dss run'
diff --git a/dss-docker/run.sh b/dss-docker/run.sh
index e245a92..dce3cfa 100755
--- a/dss-docker/run.sh
+++ b/dss-docker/run.sh
@@ -3,10 +3,13 @@
DSS_INSTALLDIR="/home/dataiku/dataiku-dss-$DSS_VERSION"
if [ ! -f "$DSS_DATADIR"/bin/env-default.sh ]; then
+ echo "Changing the owner of $DSS_DATADIR"
+ chown dataiku:dataiku "$DSS_DATADIR"
# Initialize new data directory
- "$DSS_INSTALLDIR"/installer.sh -d "$DSS_DATADIR" -p "$DSS_PORT"
- "$DSS_DATADIR"/bin/dssadmin install-R-integration
- echo "dku.registration.channel=docker-image" >>"$DSS_DATADIR"/config/dip.properties
+ su dataiku -c "$DSS_INSTALLDIR/installer.sh -d $DSS_DATADIR -p $DSS_PORT"
+ su dataiku -c "$DSS_DATADIR/bin/dssadmin install-R-integration"
+ echo "Reached here.."
+ su dataiku -c 'echo "dku.registration.channel=docker-image" >>"$DSS_DATADIR"/config/dip.properties'
elif [ $(bash -c 'source "$DSS_DATADIR"/bin/env-default.sh && echo "$DKUINSTALLDIR"') != "$DSS_INSTALLDIR" ]; then
# Upgrade existing data directory
@@ -15,4 +18,11 @@ elif [ $(bash -c 'source "$DSS_DATADIR"/bin/env-default.sh && echo "$DKUINSTALLD
fi
-exec "$DSS_DATADIR"/bin/dss run
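+# Download the Simba BigQuery and Microsoft SQL Server JDBC drivers into the DSS lib directory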
+mkdir -p /home/dataiku/dss/lib/jdbc
+curl 'https://storage.googleapis.com/simba-bq-release/jdbc/SimbaJDBCDriverforGoogleBigQuery42_1.1.4.1004.zip' -o /home/dataiku/dss/lib/jdbc/driver.zip
+cd /home/dataiku/dss/lib/jdbc && unzip -o driver.zip
+curl 'https://storage.googleapis.com/jdbc-drivers/sqljdbc42.jar' -o /home/dataiku/dss/lib/jdbc/sqljdbc42.jar
+
+chown -R dataiku:dataiku /home/dataiku/dss/lib/jdbc
+
+su dataiku -c 'exec "$DSS_DATADIR"/bin/dss run'
diff --git a/dss-docker/spark-env.sh b/dss-docker/spark-env.sh
new file mode 100755
index 0000000..0097985
--- /dev/null
+++ b/dss-docker/spark-env.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This file is sourced when running various Spark programs.
+# Copy it as spark-env.sh and edit that to configure Spark for your site.
+
+# Options read when launching programs locally with
+# ./bin/run-example or ./bin/spark-submit
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
+
+HADOOP_CONF_DIR=/home/dataiku/hadoop-2.8.3/conf
+
+# Options read by executors and drivers running inside the cluster
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
+# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
+# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos
+
+# Options read in YARN client mode
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
+# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
+# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)
+
+# Options for the daemons used in the standalone deploy mode
+# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname
+# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
+# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
+# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
+# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
+# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
+# - SPARK_WORKER_DIR, to set the working directory of worker processes
+# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
+# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g).
+# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
+# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y")
+# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
+# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
+
+# Generic options for the daemons used in the standalone deploy mode
+# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf)
+# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs)
+# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp)
+# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER)
+# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0)
+# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file.
\ No newline at end of file
diff --git a/dss-docker/spark-submit b/dss-docker/spark-submit
new file mode 100755
index 0000000..af594fd
--- /dev/null
+++ b/dss-docker/spark-submit
@@ -0,0 +1,9 @@
+#!/bin/bash -e
+
+export SPARK_HOME=/home/dataiku/spark-2.2.0-bin-hadoop2.7
+export HADOOP_CONF_DIR=/home/dataiku/hadoop-2.8.3/conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64/jre
+export PATH=$PATH:/var/dataiku/data/pyenv/bin:/home/dataiku/dss/pyenv/bin
+export SPARK_LOCAL_HOSTNAME=127.0.0.1
+
+"$SPARK_HOME"/bin/spark-submit "$@"
\ No newline at end of file