# syntax=docker/dockerfile:1

# Base image pinned by tag AND digest for full reproducibility.
FROM jupyter/all-spark-notebook:spark-3.1.1@sha256:b73dad39ad5c469a92764e38d7cc4321040d3fedddcad7fcebc4ddc7f9c15ff2

LABEL maintainer=analytics-platform-tech@digital.justice.gov.uk

# Make user-level pip installs (~/.local/bin) available on PATH.
ENV PATH=$PATH:$HOME/.local/bin

# To match RStudio
ENV NB_UID=1001

# Home directory contents are already owned by UID 1001, so skip the
# (slow) recursive chown the start script would otherwise perform.
ENV CHOWN_HOME=no

# NB these are sensible defaults but may need to be changed programmatically
# for non-local Spark (i.e. EMR etc.)
# The hadoop-aws version must stay compatible with the Spark/Hadoop build
# shipped in the base image.
ENV PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.11.918,org.apache.hadoop:hadoop-aws:3.0.1 pyspark-shell"

# Container must be run as root to use NB_UID: the jupyter docker-stacks
# start script needs root at launch to apply NB_UID, then drops privileges
# itself, so we intentionally do NOT switch back to a non-root USER here.
USER root

# Install OS packages
#
# The reason we have installed these has been lost. Including just in case.
#
# - gdal-bin
# - libspatialindex-dev
# - openssh-client
#
# update + install in one layer (avoids stale apt cache), skip recommended
# packages, and clean the apt lists in the same layer to keep the image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gdal-bin \
    libspatialindex-dev \
    openssh-client && \
    rm -rf /var/lib/apt/lists/*

# NOTE(review): I'm not sure this has any effect — confirm Spark actually
# reads this config location before relying on it.
COPY files/hdfs-site.xml /usr/local/spark/conf/hdfs-site.xml

# add-user-to-group.sh adds the $NB_USER to group 50 (staff) used by RStudio;
# scripts in before-notebook.d/ are run by the start script at launch.
COPY files/add-user-to-group.sh /usr/local/bin/before-notebook.d/

# Install Python packages
# - pip - python package manager
# - boto3 - python AWS library
# - nbstripout - tool for stripping sensitive data out of notebooks
#
# NOTE(review): boto3 and nbstripout are unpinned, so rebuilds are not
# reproducible — consider pinning them like the packages below.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir --upgrade \
    pip \
    boto3 \
    nbstripout \
    dataengineeringutils3==1.3.0 \
    etl-manager==7.4.0

# Vi just doesn't cut it for some people
RUN update-alternatives --set editor /bin/nano-tiny
0 commit comments