diff --git a/dss-docker/Dockerfile b/dss-docker/Dockerfile
index 41dea41..469dc92 100644
--- a/dss-docker/Dockerfile
+++ b/dss-docker/Dockerfile
@@ -1,68 +1,29 @@
-FROM debian:9
+FROM dataiku/dss:4.1.0
-ENV DSS_VERSION="4.1.0" \
- DSS_DATADIR="/home/dataiku/dss" \
- DSS_PORT=10000
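+# DSS_VERSION, DSS_DATADIR and DSS_PORT are assumed to be provided as ENV by
+# the dataiku/dss:4.1.0 base image.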
+# Entry point
+WORKDIR /home/dataiku
-# Dataiku account and data dir setup
-RUN useradd -s /bin/bash dataiku \
- && mkdir -p /home/dataiku ${DSS_DATADIR} \
- && chown -Rh dataiku:dataiku /home/dataiku ${DSS_DATADIR}
+EXPOSE $DSS_PORT
-# System dependencies
-# TODO - much could be removed by building externally the required R packages
-RUN apt-get update \
- && DEBIAN_FRONTEND=noninteractive apt-get upgrade -y \
- && DEBIAN_FRONTEND=noninteractive apt-get install -y \
- locales \
- acl \
- curl \
- git \
- libexpat1 \
- nginx-full \
- unzip \
- zip \
- default-jre-headless \
- python2.7 \
- libpython2.7 \
- libfreetype6 \
- libgfortran3 \
- libgomp1 \
- r-base-dev \
- libicu-dev \
- libcurl4-openssl-dev \
- libssl-dev \
- libxml2-dev \
- libzmq3-dev \
- pkg-config \
- && rm -rf /var/lib/apt/lists/* \
- && localedef -f UTF-8 -i en_US en_US.UTF-8
+USER root
-# Download and extract DSS kit
-RUN DSSKIT="dataiku-dss-$DSS_VERSION" \
- && cd /home/dataiku \
- && echo "+ Downloading kit" \
- && curl -OsS "http://downloads.dataiku.com/public/studio/$DSS_VERSION/$DSSKIT.tar.gz" \
- && echo "+ Extracting kit" \
- && tar xf "$DSSKIT.tar.gz" \
- && rm "$DSSKIT.tar.gz" \
- && echo "+ Compiling Python code" \
- && python2.7 -O -m compileall -q "$DSSKIT"/python "$DSSKIT"/dku-jupyter \
- && { python2.7 -O -m compileall -q "$DSSKIT"/python.packages >/dev/null || true; } \
- && chown -Rh dataiku:dataiku "$DSSKIT"
+ADD http://archive.apache.org/dist/hadoop/core/hadoop-2.8.3/hadoop-2.8.3.tar.gz /home/dataiku/
-# Install required R packages
-RUN R --slave --no-restore \
- -e "install.packages(c('httr', 'RJSONIO', 'dplyr', 'IRkernel', 'sparklyr', 'ggplot2', 'tidyr', 'rmarkdown'), \
- repos=c('file:///home/dataiku/dataiku-dss-$DSS_VERSION/dku-jupyter/R', \
- 'http://cloud.r-project.org'))"
+ADD conf/ /home/dataiku/hadoop-2.8.3/conf
-# Entry point
-WORKDIR /home/dataiku
-USER dataiku
+COPY run-with-hadoop.sh /home/dataiku/
-COPY run.sh /home/dataiku/
+ADD hadoop /usr/bin/
-EXPOSE $DSS_PORT
+ADD http://www-eu.apache.org/dist/spark/spark-2.2.0/spark-2.2.0-bin-hadoop2.7.tgz /home/dataiku/
+
+ADD spark-env.sh /home/dataiku/spark-2.2.0-bin-hadoop2.7/conf/
+
+ADD spark-submit /usr/bin/
+
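+# Note: ADD with a remote URL does not auto-extract archives, so the Hadoop and
+# Spark tarballs above are unpacked explicitly here (a sketch, assuming in-place
+# extraction under /home/dataiku is the intent).
+RUN tar xzf /home/dataiku/hadoop-2.8.3.tar.gz -C /home/dataiku \
+ && tar xzf /home/dataiku/spark-2.2.0-bin-hadoop2.7.tgz -C /home/dataiku \
+ && rm /home/dataiku/hadoop-2.8.3.tar.gz /home/dataiku/spark-2.2.0-bin-hadoop2.7.tgz
+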
+RUN chown dataiku:dataiku /usr/bin/hadoop \
+ && chown dataiku:dataiku /usr/bin/spark-submit \
+ && chown -R dataiku:dataiku /home/dataiku/spark-2.2.0-bin-hadoop2.7/ \
+ && chown -R dataiku:dataiku /home/dataiku/hadoop-2.8.3/
-CMD [ "/home/dataiku/run.sh" ]
+CMD [ "/home/dataiku/run-with-hadoop.sh" ]
\ No newline at end of file
diff --git a/dss-docker/build.sh b/dss-docker/build.sh
new file mode 100755
index 0000000..8cba4f7
--- /dev/null
+++ b/dss-docker/build.sh
@@ -0,0 +1,6 @@
+#!/bin/bash -e
+
+docker stop dataiku && docker rm -v dataiku || true  # tolerate a missing container on the first run
+docker build . -t gcr.io/retailcatalyst-187519/crs-dataiku:4.1.0-hadoop
+docker run --name dataiku -d gcr.io/retailcatalyst-187519/crs-dataiku:4.1.0-hadoop
+docker exec -it dataiku bash
diff --git a/dss-docker/conf/core-site.xml b/dss-docker/conf/core-site.xml
new file mode 100644
index 0000000..17c9f82
--- /dev/null
+++ b/dss-docker/conf/core-site.xml
@@ -0,0 +1,129 @@
+<?xml version="1.0" ?>
+<configuration>
+  <property>
+    <name>hadoop.proxyuser.hive.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.tmp.dir</name>
+    <value>/hadoop/tmp</value>
+    <description>A base for other temporary directories.</description>
+  </property>
+  <property>
+    <name>fs.default.name</name>
+    <value>hdfs://crs-dataiku-hadoop2-m</value>
+    <description>The old FileSystem used by FsShell.</description>
+  </property>
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://crs-dataiku-hadoop2-m</value>
+    <description>
+      The name of the default file system. A URI whose scheme and authority
+      determine the FileSystem implementation. The uri's scheme determines
+      the config property (fs.SCHEME.impl) naming the FileSystem
+      implementation class. The uri's authority is used to determine the
+      host, port, etc. for a filesystem.
+    </description>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hive.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>fs.gs.working.dir</name>
+    <value>/</value>
+    <description>
+      The directory relative gs: uris resolve in inside of the default bucket.
+    </description>
+  </property>
+  <property>
+    <name>fs.gs.system.bucket</name>
+    <value>dataproc-dd12d8db-81d9-40c8-8d96-7123187903a3-us-east1</value>
+    <description>
+      GCS bucket to use as a default bucket if fs.default.name is not a gs: uri.
+    </description>
+  </property>
+  <property>
+    <name>fs.gs.metadata.cache.directory</name>
+    <value>/hadoop_gcs_connector_metadata_cache</value>
+    <description>
+      Only used if fs.gs.metadata.cache.type is FILESYSTEM_BACKED, specifies
+      the local path to use as the base path for storing mirrored GCS metadata.
+      Must be an absolute path, must be a directory, and must be fully
+      readable/writable/executable by any user running processes which use the
+      GCS connector.
+    </description>
+  </property>
+  <property>
+    <name>fs.gs.impl</name>
+    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value>
+    <description>The FileSystem for gs: (GCS) uris.</description>
+  </property>
+  <property>
+    <name>fs.gs.project.id</name>
+    <value>retailcatalyst-187519</value>
+    <description>
+      Google Cloud Project ID with access to configured GCS buckets.
+    </description>
+  </property>
+  <property>
+    <name>fs.gs.metadata.cache.enable</name>
+    <value>false</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>fs.gs.implicit.dir.infer.enable</name>
+    <value>true</value>
+    <description>
+      If set, we create and return in-memory directory objects on the fly when
+      no backing object exists, but we know there are files with the same
+      prefix.
+    </description>
+  </property>
+  <property>
+    <name>fs.gs.application.name.suffix</name>
+    <value>-dataproc</value>
+    <description>
+      Appended to the user-agent header for API requests to GCS to help identify
+      the traffic as coming from Dataproc.
+    </description>
+  </property>
+  <property>
+    <name>fs.AbstractFileSystem.gs.impl</name>
+    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
+    <description>The AbstractFileSystem for gs: (GCS) uris.</description>
+  </property>
+  <property>
+    <name>fs.gs.metadata.cache.type</name>
+    <value>FILESYSTEM_BACKED</value>
+    <description>
+      Specifies which implementation of DirectoryListCache to use for
+      supplementing GCS API "list" requests. Supported
+      implementations: IN_MEMORY: Enforces immediate consistency within
+      same Java process. FILESYSTEM_BACKED: Enforces consistency across
+      all cooperating processes pointed at the same local mirror
+      directory, which may be an NFS directory for massively-distributed
+      coordination.
+    </description>
+  </property>
+  <property>
+    <name>fs.gs.block.size</name>
+    <value>134217728</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+</configuration>
diff --git a/dss-docker/conf/hdfs-site.xml b/dss-docker/conf/hdfs-site.xml
new file mode 100644
index 0000000..e15e6ad
--- /dev/null
+++ b/dss-docker/conf/hdfs-site.xml
@@ -0,0 +1,146 @@
+<?xml version="1.0" ?>
+<configuration>
+  <property>
+    <name>dfs.namenode.rpc-address</name>
+    <value>crs-dataiku-hadoop2-m:8020</value>
+    <description>
+      RPC address that handles all clients requests. If empty then we'll get
+      the value from fs.default.name. The value of this property will take the
+      form of hdfs://nn-host1:rpc-port.
+    </description>
+  </property>
+  <property>
+    <name>dfs.permissions.enabled</name>
+    <value>false</value>
+    <description>
+      If "true", enable permission checking in HDFS. If
+      "false", permission checking is turned off, but
+      all other behavior is unchanged. Switching from one parameter
+      value to the other does not change the mode, owner or group of
+      files or directories.
+    </description>
+  </property>
+  <property>
+    <name>dfs.datanode.data.dir</name>
+    <value>/hadoop/dfs/data</value>
+    <description>
+      Determines where on the local filesystem a DFS datanode should store its
+      blocks. If this is a comma-delimited list of directories, then data will
+      be stored in all named directories, typically on different devices.
+      Directories that do not exist are ignored.
+    </description>
+  </property>
+  <property>
+    <name>dfs.namenode.http-address</name>
+    <value>0.0.0.0:9870</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.permissions.supergroup</name>
+    <value>hadoop</value>
+    <description>The name of the group of super-users.</description>
+  </property>
+  <property>
+    <name>dfs.hosts</name>
+    <value>/etc/hadoop/conf/nodes_include</value>
+  </property>
+  <property>
+    <name>dfs.namenode.secondary.http-address</name>
+    <value>0.0.0.0:9868</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.client.read.shortcircuit</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>dfs.namenode.name.dir</name>
+    <value>/hadoop/dfs/name</value>
+    <description>
+      Determines where on the local filesystem the DFS namenode should store the
+      name table(fsimage). If this is a comma-delimited list of directories then
+      the name table is replicated in all of the directories for redundancy.
+    </description>
+  </property>
+  <property>
+    <name>dfs.replication</name>
+    <value>2</value>
+    <description>
+      Default block replication. The actual number of replications can be
+      specified when the file is created. The default is used if replication
+      is not specified at create time.
+    </description>
+  </property>
+  <property>
+    <name>dfs.domain.socket.path</name>
+    <value>/var/lib/hadoop-hdfs/dn_socket</value>
+  </property>
+  <property>
+    <name>dfs.namenode.checkpoint.dir</name>
+    <value>file:///hadoop/dfs/namesecondary</value>
+    <description>
+      Determines where on the local filesystem the DFS secondary namenode should
+      store the temporary images to merge. If this is a comma-delimited
+      list of directories then the image is replicated in all of the
+      directories for redundancy.
+    </description>
+  </property>
+  <property>
+    <name>dfs.hosts.exclude</name>
+    <value>/etc/hadoop/conf/nodes_exclude</value>
+  </property>
+  <property>
+    <name>dfs.datanode.data.dir.perm</name>
+    <value>700</value>
+    <description>
+      Permissions for the directories on the local filesystem where the DFS
+      data node stores its blocks. The permissions can either be octal or
+      symbolic.
+    </description>
+  </property>
+  <property>
+    <name>dfs.datanode.address</name>
+    <value>0.0.0.0:9866</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.namenode.https-address</name>
+    <value>0.0.0.0:9871</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.datanode.https.address</name>
+    <value>0.0.0.0:9865</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.datanode.http.address</name>
+    <value>0.0.0.0:9864</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.datanode.ipc.address</name>
+    <value>0.0.0.0:9867</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>dfs.namenode.secondary.https-address</name>
+    <value>0.0.0.0:9869</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+</configuration>
diff --git a/dss-docker/conf/mapred-site.xml b/dss-docker/conf/mapred-site.xml
new file mode 100644
index 0000000..2e350e7
--- /dev/null
+++ b/dss-docker/conf/mapred-site.xml
@@ -0,0 +1,162 @@
+<?xml version="1.0" ?>
+<configuration>
+  <property>
+    <name>mapreduce.job.maps</name>
+    <value>45</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.map.memory.mb</name>
+    <value>3072</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.jobhistory.address</name>
+    <value>crs-dataiku-hadoop2-m:10020</value>
+    <description>MapReduce JobHistory Server IPC host:port</description>
+  </property>
+  <property>
+    <name>mapreduce.jobhistory.webapp.address</name>
+    <value>crs-dataiku-hadoop2-m:19888</value>
+    <description>MapReduce JobHistory Server Web UI host:port</description>
+  </property>
+  <property>
+    <name>mapreduce.reduce.memory.mb</name>
+    <value>3072</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>yarn.app.mapreduce.am.command-opts</name>
+    <value>-Xmx2457m</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.tasktracker.map.tasks.maximum</name>
+    <value>4</value>
+    <description>
+      Property from MapReduce version 1 still used for TeraGen sharding.
+    </description>
+  </property>
+  <property>
+    <name>mapreduce.framework.name</name>
+    <value>yarn</value>
+  </property>
+  <property>
+    <name>mapreduce.input.fileinputformat.list-status.num-threads</name>
+    <value>20</value>
+    <description>
+      The number of threads to use to list and fetch block locations for the
+      specified input paths. Note: multiple threads should not be used if a
+      custom non thread-safe path filter is used. Setting a larger value than
+      the default of 1 can significantly improve job startup overhead,
+      especially if using GCS as input with multi-level directories, such
+      as in partitioned Hive tables.
+    </description>
+  </property>
+  <property>
+    <name>mapreduce.reduce.java.opts</name>
+    <value>-Xmx2457m</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>yarn.app.mapreduce.am.resource.cpu-vcores</name>
+    <value>1</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.reduce.cpu.vcores</name>
+    <value>1</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.map.cpu.vcores</name>
+    <value>1</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>yarn.app.mapreduce.am.resource.mb</name>
+    <value>3072</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.job.reduces</name>
+    <value>15</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.map.java.opts</name>
+    <value>-Xmx2457m</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.job.working.dir</name>
+    <value>/user/${user.name}</value>
+    <description>
+      The FileSystem working directory to use for relative paths.
+    </description>
+  </property>
+  <property>
+    <name>mapreduce.fileoutputcommitter.algorithm.version</name>
+    <value>2</value>
+    <description>
+      Updated file output committer algorithm in Hadoop 2.7+. Significantly
+      improves commitJob times when using the Google Cloud Storage connector.
+      See https://issues.apache.org/jira/browse/MAPREDUCE-4815 for more details.
+    </description>
+  </property>
+  <property>
+    <name>mapred.local.dir</name>
+    <value>/hadoop/mapred/local</value>
+    <description>
+      Directories on the local machine in which to store mapreduce temp files.
+    </description>
+  </property>
+  <property>
+    <name>mapred.bq.project.id</name>
+    <value>retailcatalyst-187519</value>
+    <description>
+      Google Cloud Project ID to use for BigQuery operations.
+    </description>
+  </property>
+  <property>
+    <name>mapred.bq.output.buffer.size</name>
+    <value>67108864</value>
+    <description>
+      The size in bytes of the output buffer to use when writing to BigQuery.
+    </description>
+  </property>
+  <property>
+    <name>mapred.bq.gcs.bucket</name>
+    <value>dataproc-dd12d8db-81d9-40c8-8d96-7123187903a3-us-east1</value>
+    <description>
+      The GCS bucket holding temporary BigQuery data for the input connector.
+    </description>
+  </property>
+  <property>
+    <name>mapreduce.job.reduce.slowstart.completedmaps</name>
+    <value>0.95</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>mapreduce.task.io.sort.mb</name>
+    <value>256</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+</configuration>
diff --git a/dss-docker/conf/yarn-site.xml b/dss-docker/conf/yarn-site.xml
new file mode 100644
index 0000000..be76fea
--- /dev/null
+++ b/dss-docker/conf/yarn-site.xml
@@ -0,0 +1,115 @@
+<?xml version="1.0" ?>
+<configuration>
+  <property>
+    <name>yarn.nodemanager.remote-app-log-dir</name>
+    <value>/yarn-logs/</value>
+    <description>
+      The remote path, on the default FS, to store logs.
+    </description>
+  </property>
+  <property>
+    <name>yarn.nodemanager.aux-services</name>
+    <value>mapreduce_shuffle,spark_shuffle</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.nodes.exclude-path</name>
+    <value>/etc/hadoop/conf/nodes_exclude</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.local-dirs</name>
+    <value>/hadoop/yarn/nm-local-dir</value>
+    <description>
+      Directories on the local machine in which to store application temp files.
+    </description>
+  </property>
+  <property>
+    <name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
+    <value>org.apache.spark.network.yarn.YarnShuffleService</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.hostname</name>
+    <value>crs-dataiku-hadoop2-m</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.vmem-check-enabled</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>yarn.scheduler.maximum-allocation-vcores</name>
+    <value>32000</value>
+    <description>
+      The maximum allocation for every container request at the RM, in
+      terms of virtual CPU cores. Requests higher than this won't take
+      effect, and will get capped to this value.
+    </description>
+  </property>
+  <property>
+    <name>yarn.nodemanager.resource.memory-mb</name>
+    <value>24576</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>yarn.scheduler.minimum-allocation-mb</name>
+    <value>1024</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.nodes.include-path</name>
+    <value>/etc/hadoop/conf/nodes_include</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.resource.cpu-vcores</name>
+    <value>4</value>
+    <description>
+      Number of vcores that can be allocated for containers. This is used by
+      the RM scheduler when allocating resources for containers. This is not
+      used to limit the number of physical cores used by YARN containers.
+    </description>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.fs.state-store.uri</name>
+    <value>file:///hadoop/yarn/system/rmstore</value>
+    <description>
+      URI pointing to the location of the FileSystem path where RM state will
+      be stored. This is set on the local file system to avoid collisions in
+      GCS.
+    </description>
+  </property>
+  <property>
+    <name>yarn.scheduler.maximum-allocation-mb</name>
+    <value>24576</value>
+    <description>Dataproc Cluster Properties</description>
+  </property>
+  <property>
+    <name>yarn.application.classpath</name>
+    <value>$HADOOP_CONF_DIR,
+      $HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,
+      $HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,
+      $HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,
+      $HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.recovery.enabled</name>
+    <value>true</value>
+    <description>Enable RM to recover state after starting.</description>
+  </property>
+  <property>
+    <name>yarn.log-aggregation-enable</name>
+    <value>false</value>
+    <description>Enable remote logs aggregation to the default FS.</description>
+  </property>
+</configuration>
diff --git a/dss-docker/hadoop b/dss-docker/hadoop
new file mode 100755
index 0000000..d065d1b
--- /dev/null
+++ b/dss-docker/hadoop
@@ -0,0 +1,6 @@
+#! /bin/bash -e
+
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64/jre
+export HADOOP_HOME="/home/dataiku/hadoop-2.8.3"
+
+"$HADOOP_HOME"/bin/hadoop "$@"
\ No newline at end of file
diff --git a/dss-docker/hadoop-env.sh b/dss-docker/hadoop-env.sh
new file mode 100644
index 0000000..823b070
--- /dev/null
+++ b/dss-docker/hadoop-env.sh
@@ -0,0 +1,2 @@
+export HADOOP_HOME="/home/dataiku/hadoop-2.8.3"
+export PATH=$PATH:$HADOOP_HOME/conf:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
\ No newline at end of file
diff --git a/dss-docker/run-with-hadoop.sh b/dss-docker/run-with-hadoop.sh
new file mode 100755
index 0000000..3adc7b5
--- /dev/null
+++ b/dss-docker/run-with-hadoop.sh
@@ -0,0 +1,27 @@
+#!/bin/bash -e
+
+DSS_INSTALLDIR="/home/dataiku/dataiku-dss-$DSS_VERSION"
+export SPARK_HOME="/home/dataiku/spark-2.2.0-bin-hadoop2.7"
+
+if [ ! -f "$DSS_DATADIR"/bin/env-default.sh ]; then
+ chown dataiku:dataiku "$DSS_DATADIR"
+ # Initialize new data directory
+ su dataiku -c "$DSS_INSTALLDIR/installer.sh -d $DSS_DATADIR -p $DSS_PORT"
+ su dataiku -c "$DSS_DATADIR/bin/dssadmin install-R-integration"
+ su dataiku -c '"$DSS_DATADIR"/bin/dssadmin install-hadoop-integration'
+ su dataiku -c '"$DSS_DATADIR"/bin/dssadmin install-spark-integration'
+ su dataiku -c 'echo "dku.registration.channel=docker-image" >>"$DSS_DATADIR"/config/dip.properties'
+
+elif [ $(bash -c 'source "$DSS_DATADIR"/bin/env-default.sh && echo "$DKUINSTALLDIR"') != "$DSS_INSTALLDIR" ]; then
+ # Upgrade existing data directory. This is not tested!!
+ "$DSS_INSTALLDIR"/installer.sh -d "$DSS_DATADIR" -u -y
+ "$DSS_DATADIR"/bin/dssadmin install-R-integration
+ "$DSS_DATADIR"/bin/dssadmin install-hadoop-integration
+ "$DSS_DATADIR"/bin/dssadmin install-spark-integration
+
+fi
+
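+# Fetch the Microsoft SQL Server JDBC driver into the DSS lib directory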
+mkdir -p /home/dataiku/dss/lib/jdbc
+curl 'https://storage.googleapis.com/jdbc-drivers/sqljdbc42.jar' -o /home/dataiku/dss/lib/jdbc/sqljdbc42.jar
+chown -R dataiku:dataiku /home/dataiku/dss/lib/jdbc
+su dataiku -c 'exec "$DSS_DATADIR"/bin/dss run'
diff --git a/dss-docker/run.sh b/dss-docker/run.sh
index e245a92..dce3cfa 100755
--- a/dss-docker/run.sh
+++ b/dss-docker/run.sh
@@ -3,10 +3,13 @@
DSS_INSTALLDIR="/home/dataiku/dataiku-dss-$DSS_VERSION"
if [ ! -f "$DSS_DATADIR"/bin/env-default.sh ]; then
+ echo "Changing the owner of $DSS_DATADIR"
+ chown dataiku:dataiku "$DSS_DATADIR"
# Initialize new data directory
- "$DSS_INSTALLDIR"/installer.sh -d "$DSS_DATADIR" -p "$DSS_PORT"
- "$DSS_DATADIR"/bin/dssadmin install-R-integration
- echo "dku.registration.channel=docker-image" >>"$DSS_DATADIR"/config/dip.properties
+ su dataiku -c "$DSS_INSTALLDIR/installer.sh -d $DSS_DATADIR -p $DSS_PORT"
+ su dataiku -c "$DSS_DATADIR/bin/dssadmin install-R-integration"
+ echo "Reached here.."
+ su dataiku -c 'echo "dku.registration.channel=docker-image" >>"$DSS_DATADIR"/config/dip.properties'
elif [ $(bash -c 'source "$DSS_DATADIR"/bin/env-default.sh && echo "$DKUINSTALLDIR"') != "$DSS_INSTALLDIR" ]; then
# Upgrade existing data directory
@@ -15,4 +18,11 @@ elif [ $(bash -c 'source "$DSS_DATADIR"/bin/env-default.sh && echo "$DKUINSTALLD
fi
-exec "$DSS_DATADIR"/bin/dss run
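+# Download the Simba BigQuery and Microsoft SQL Server JDBC drivers into the DSS lib directory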
+mkdir -p /home/dataiku/dss/lib/jdbc
+curl 'https://storage.googleapis.com/simba-bq-release/jdbc/SimbaJDBCDriverforGoogleBigQuery42_1.1.4.1004.zip' -o /home/dataiku/dss/lib/jdbc/driver.zip
+cd /home/dataiku/dss/lib/jdbc && unzip -o driver.zip
+curl 'https://storage.googleapis.com/jdbc-drivers/sqljdbc42.jar' -o /home/dataiku/dss/lib/jdbc/sqljdbc42.jar
+
+chown -R dataiku:dataiku /home/dataiku/dss/lib/jdbc
+
+su dataiku -c 'exec "$DSS_DATADIR"/bin/dss run'
diff --git a/dss-docker/spark-env.sh b/dss-docker/spark-env.sh
new file mode 100755
index 0000000..0097985
--- /dev/null
+++ b/dss-docker/spark-env.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This file is sourced when running various Spark programs.
+# Copy it as spark-env.sh and edit that to configure Spark for your site.
+
+# Options read when launching programs locally with
+# ./bin/run-example or ./bin/spark-submit
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
+
+HADOOP_CONF_DIR=/home/dataiku/hadoop-2.8.3/conf
+
+# Options read by executors and drivers running inside the cluster
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
+# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
+# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos
+
+# Options read in YARN client mode
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
+# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
+# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)
+
+# Options for the daemons used in the standalone deploy mode
+# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname
+# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
+# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
+# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
+# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
+# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
+# - SPARK_WORKER_DIR, to set the working directory of worker processes
+# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
+# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g).
+# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
+# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y")
+# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
+# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
+
+# Generic options for the daemons used in the standalone deploy mode
+# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf)
+# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs)
+# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp)
+# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER)
+# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0)
+# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file.
\ No newline at end of file
diff --git a/dss-docker/spark-submit b/dss-docker/spark-submit
new file mode 100755
index 0000000..af594fd
--- /dev/null
+++ b/dss-docker/spark-submit
@@ -0,0 +1,9 @@
+#!/bin/bash -e
+
+export SPARK_HOME=/home/dataiku/spark-2.2.0-bin-hadoop2.7
+export HADOOP_CONF_DIR=/home/dataiku/hadoop-2.8.3/conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64/jre
+export PATH=$PATH:/var/dataiku/data/pyenv/bin:/home/dataiku/dss/pyenv/bin
+export SPARK_LOCAL_HOSTNAME=127.0.0.1
+
+"$SPARK_HOME"/bin/spark-submit "$@"
\ No newline at end of file