Support for Google Cloud DataProc #2

Open · wants to merge 2 commits into base: master
79 changes: 20 additions & 59 deletions dss-docker/Dockerfile
@@ -1,68 +1,29 @@
-FROM debian:9
+FROM dataiku/dss:4.1.0

-ENV DSS_VERSION="4.1.0" \
-    DSS_DATADIR="/home/dataiku/dss" \
-    DSS_PORT=10000
+# Entry point
+WORKDIR /home/dataiku

-# Dataiku account and data dir setup
-RUN useradd -s /bin/bash dataiku \
-    && mkdir -p /home/dataiku ${DSS_DATADIR} \
-    && chown -Rh dataiku:dataiku /home/dataiku ${DSS_DATADIR}
+EXPOSE $DSS_PORT

-# System dependencies
-# TODO - much could be removed by building externally the required R packages
-RUN apt-get update \
-    && DEBIAN_FRONTEND=noninteractive apt-get upgrade -y \
-    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
-        locales \
-        acl \
-        curl \
-        git \
-        libexpat1 \
-        nginx-full \
-        unzip \
-        zip \
-        default-jre-headless \
-        python2.7 \
-        libpython2.7 \
-        libfreetype6 \
-        libgfortran3 \
-        libgomp1 \
-        r-base-dev \
-        libicu-dev \
-        libcurl4-openssl-dev \
-        libssl-dev \
-        libxml2-dev \
-        libzmq3-dev \
-        pkg-config \
-    && rm -rf /var/lib/apt/lists/* \
-    && localedef -f UTF-8 -i en_US en_US.UTF-8
+USER root

-# Download and extract DSS kit
-RUN DSSKIT="dataiku-dss-$DSS_VERSION" \
-    && cd /home/dataiku \
-    && echo "+ Downloading kit" \
-    && curl -OsS "http://downloads.dataiku.com/public/studio/$DSS_VERSION/$DSSKIT.tar.gz" \
-    && echo "+ Extracting kit" \
-    && tar xf "$DSSKIT.tar.gz" \
-    && rm "$DSSKIT.tar.gz" \
-    && echo "+ Compiling Python code" \
-    && python2.7 -O -m compileall -q "$DSSKIT"/python "$DSSKIT"/dku-jupyter \
-    && { python2.7 -O -m compileall -q "$DSSKIT"/python.packages >/dev/null || true; } \
-    && chown -Rh dataiku:dataiku "$DSSKIT"
+ADD http://archive.apache.org/dist/hadoop/core/hadoop-2.8.3/hadoop-2.8.3.tar.gz /home/dataiku/

-# Install required R packages
-RUN R --slave --no-restore \
-    -e "install.packages(c('httr', 'RJSONIO', 'dplyr', 'IRkernel', 'sparklyr', 'ggplot2', 'tidyr', 'rmarkdown'), \
-        repos=c('file:///home/dataiku/dataiku-dss-$DSS_VERSION/dku-jupyter/R', \
-                'http://cloud.r-project.org'))"
+ADD conf/ /home/dataiku/hadoop-2.8.3/conf

-# Entry point
-WORKDIR /home/dataiku
-USER dataiku
+COPY run-with-hadoop.sh /home/dataiku/

-COPY run.sh /home/dataiku/
+ADD hadoop /usr/bin/

-EXPOSE $DSS_PORT
+ADD http://www-eu.apache.org/dist/spark/spark-2.2.0/spark-2.2.0-bin-hadoop2.7.tgz /home/dataiku/
+
+ADD spark-env.sh /home/dataiku/spark-2.2.0-bin-hadoop2.7/conf/
+
+ADD spark-submit /usr/bin/
+
+RUN chown dataiku:dataiku /usr/bin/hadoop \
+    && chown dataiku:dataiku /usr/bin/spark-submit \
+    && chown -R dataiku:dataiku /home/dataiku/spark-2.2.0-bin-hadoop2.7/ \
+    && chown -R dataiku:dataiku /home/dataiku/hadoop-2.8.3/

-CMD [ "/home/dataiku/run.sh" ]
+CMD [ "/home/dataiku/run-with-hadoop.sh" ]
6 changes: 6 additions & 0 deletions dss-docker/build.sh
@@ -0,0 +1,6 @@
#!/bin/bash -e

docker stop dataiku && docker rm -v dataiku
docker build . -t gcr.io/retailcatalyst-187519/crs-dataiku:4.1.0-hadoop
docker run --name dataiku -d gcr.io/retailcatalyst-187519/crs-dataiku:4.1.0-hadoop
docker exec -it dataiku bash
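One caveat: with bash -e, the leading docker stop dataiku && docker rm -v dataiku aborts the script on a machine where no dataiku container exists yet; appending || true is a common workaround. To publish the image and reach the DSS UI locally, something along these lines should work (assuming Docker is already authenticated against gcr.io, for example via gcloud auth configure-docker):

# Push the freshly built image to Google Container Registry
docker push gcr.io/retailcatalyst-187519/crs-dataiku:4.1.0-hadoop

# Run with the DSS web port (10000 in the base image) published on the host
docker run --name dataiku -d -p 10000:10000 gcr.io/retailcatalyst-187519/crs-dataiku:4.1.0-hadoop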
129 changes: 129 additions & 0 deletions dss-docker/conf/core-site.xml
@@ -0,0 +1,129 @@
<?xml version="1.0" ?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>hadoop.proxyuser.hive.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/hadoop/tmp</value>
<description>A base for other temporary directories.</description>
</property>
<property>
<name>fs.default.name</name>
<value>hdfs://crs-dataiku-hadoop2-m</value>
<description>The old FileSystem used by FsShell.</description>
</property>
<property>
<name>fs.defaultFS</name>
<value>hdfs://crs-dataiku-hadoop2-m</value>
<description>
The name of the default file system. A URI whose scheme and authority
determine the FileSystem implementation. The uri's scheme determines
the config property (fs.SCHEME.impl) naming the FileSystem
implementation class. The uri's authority is used to determine the
host, port, etc. for a filesystem.
</description>
</property>
<property>
<name>hadoop.proxyuser.hive.groups</name>
<value>*</value>
</property>
<property>
<name>fs.gs.working.dir</name>
<value>/</value>
<description>
The directory inside the default bucket against which relative gs: uris resolve.
</description>
</property>
<property>
<name>fs.gs.system.bucket</name>
<value>dataproc-dd12d8db-81d9-40c8-8d96-7123187903a3-us-east1</value>
<description>
GCS bucket to use as a default bucket if fs.default.name is not a gs: uri.
</description>
</property>
<property>
<name>fs.gs.metadata.cache.directory</name>
<value>/hadoop_gcs_connector_metadata_cache</value>
<description>
Only used if fs.gs.metadata.cache.type is FILESYSTEM_BACKED, specifies
the local path to use as the base path for storing mirrored GCS metadata.
Must be an absolute path, must be a directory, and must be fully
readable/writable/executable by any user running processes which use the
GCS connector.
</description>
</property>
<property>
<name>fs.gs.impl</name>
<value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value>
<description>The FileSystem for gs: (GCS) uris.</description>
</property>
<property>
<name>fs.gs.project.id</name>
<value>retailcatalyst-187519</value>
<description>
Google Cloud Project ID with access to configured GCS buckets.
</description>
</property>
<property>
<name>fs.gs.metadata.cache.enable</name>
<value>false</value>
<source>Dataproc Cluster Properties</source>
</property>
<property>
<name>fs.gs.implicit.dir.infer.enable</name>
<value>true</value>
<description>
If set, we create and return in-memory directory objects on the fly when
no backing object exists, but we know there are files with the same
prefix.
</description>
</property>
<property>
<name>fs.gs.application.name.suffix</name>
<value>-dataproc</value>
<description>
Appended to the user-agent header for API requests to GCS to help identify
the traffic as coming from Dataproc.
</description>
</property>
<property>
<name>fs.AbstractFileSystem.gs.impl</name>
<value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
<description>The AbstractFileSystem for gs: (GCS) uris.</description>
</property>
<property>
<name>fs.gs.metadata.cache.type</name>
<value>FILESYSTEM_BACKED</value>
<description>
Specifies which implementation of DirectoryListCache to use for
supplementing GCS API &amp;amp;quot;list&amp;amp;quot; requests. Supported
implementations: IN_MEMORY: Enforces immediate consistency within
same Java process. FILESYSTEM_BACKED: Enforces consistency across
all cooperating processes pointed at the same local mirror
directory, which may be an NFS directory for massively-distributed
coordination.
</description>
</property>
<property>
<name>fs.gs.block.size</name>
<value>134217728</value>
<source>Dataproc Cluster Properties</source>
</property>
</configuration>
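With this core-site.xml copied into the Hadoop conf directory inside the image, gs: URIs should resolve through the Cloud Storage connector. A quick smoke test from inside the container, assuming the gcs-connector jar is on the classpath of the bundled Hadoop distribution and the credentials available to the container can read the bucket:

# List the Dataproc system bucket configured in fs.gs.system.bucket
hadoop fs -ls gs://dataproc-dd12d8db-81d9-40c8-8d96-7123187903a3-us-east1/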
146 changes: 146 additions & 0 deletions dss-docker/conf/hdfs-site.xml
@@ -0,0 +1,146 @@
<?xml version="1.0" ?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.namenode.rpc-address</name>
<value>crs-dataiku-hadoop2-m:8020</value>
<description>
RPC address that handles all client requests. If empty then we'll get
the value from fs.default.name. The value of this property will take the
form of hdfs://nn-host1:rpc-port.
</description>
</property>
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
<description>
If &amp;amp;quot;true&amp;amp;quot;, enable permission checking in HDFS. If
&amp;amp;quot;false&amp;amp;quot;, permission checking is turned off, but
all other behavior is unchanged. Switching from one parameter
value to the other does not change the mode, owner or group of
files or directories.
</description>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/hadoop/dfs/data</value>
<description>
Determines where on the local filesystem a DFS datanode should store its
blocks. If this is a comma-delimited list of directories, then data will
be stored in all named directories, typically on different
devices. Directories that do not exist are ignored.
</description>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>0.0.0.0:9870</value>
<source>Dataproc Cluster Properties</source>
</property>
<property>
<name>dfs.permissions.supergroup</name>
<value>hadoop</value>
<description>The name of the group of super-users.</description>
</property>
<property>
<name>dfs.hosts</name>
<value>/etc/hadoop/conf/nodes_include</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>0.0.0.0:9868</value>
<source>Dataproc Cluster Properties</source>
</property>
<property>
<name>dfs.client.read.shortcircuit</name>
<value>true</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>/hadoop/dfs/name</value>
<description>
Determines where on the local filesystem the DFS namenode should store the
name table(fsimage). If this is a comma-delimited list of directories then
the name table is replicated in all of the directories for redundancy.
</description>
</property>
<property>
<name>dfs.replication</name>
<value>2</value>
<description>
Default block replication. The actual number of replications can be
specified when the file is created. The default is used if replication
is not specified at create time.
</description>
</property>
<property>
<name>dfs.domain.socket.path</name>
<value>/var/lib/hadoop-hdfs/dn_socket</value>
</property>
<property>
<name>dfs.namenode.checkpoint.dir</name>
<value>file:///hadoop/dfs/namesecondary</value>
<description>
Determines where on the local filesystem the DFS secondary namenode should
store the temporary images to merge. If this is a comma-delimited
list of directories then the image is replicated in all of the
directories for redundancy.
</description>
</property>
<property>
<name>dfs.hosts.exclude</name>
<value>/etc/hadoop/conf/nodes_exclude</value>
</property>
<property>
<name>dfs.datanode.data.dir.perm</name>
<value>700</value>
<description>
Permissions for the directories on the local filesystem where the DFS
data node stores its blocks. The permissions can either be octal or
symbolic.
</description>
</property>
<property>
<name>dfs.datanode.address</name>
<value>0.0.0.0:9866</value>
<source>Dataproc Cluster Properties</source>
</property>
<property>
<name>dfs.namenode.https-address</name>
<value>0.0.0.0:9871</value>
<source>Dataproc Cluster Properties</source>
</property>
<property>
<name>dfs.datanode.https.address</name>
<value>0.0.0.0:9865</value>
<source>Dataproc Cluster Properties</source>
</property>
<property>
<name>dfs.datanode.http.address</name>
<value>0.0.0.0:9864</value>
<source>Dataproc Cluster Properties</source>
</property>
<property>
<name>dfs.datanode.ipc.address</name>
<value>0.0.0.0:9867</value>
<source>Dataproc Cluster Properties</source>
</property>
<property>
<name>dfs.namenode.secondary.https-address</name>
<value>0.0.0.0:9869</value>
<source>Dataproc Cluster Properties</source>
</property>
</configuration>
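These are client-side copies of the Dataproc cluster's HDFS settings; the NameNode itself runs on the cluster master (crs-dataiku-hadoop2-m, RPC port 8020). A quick connectivity check from inside the container, assuming it runs on a network that can resolve and reach the master:

# Should list the HDFS root on the Dataproc master if the RPC connection works
hadoop fs -ls hdfs://crs-dataiku-hadoop2-m:8020/

# Equivalent, relying on fs.defaultFS from core-site.xml
hadoop fs -ls /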