Skip to content

Commit 16e5307

Browse files
authored
Merge pull request #57 from ministryofjustice/spark-update
Updated allspark-notebook to upstream spark-3.1.1 image
2 parents c59a952 + 243bb97 commit 16e5307

File tree

5 files changed

+45
-43
lines changed

5 files changed

+45
-43
lines changed

.github/workflows/jupyter-lab-test-and-build.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ jobs:
8282
REGISTRY: ${{ steps.login-ecr.outputs.registry }}
8383
IMAGE_TAG: ${{ steps.prep.outputs.tag }}
8484
- name: Install InSpec
85-
uses: actionshub/chef-install@master
85+
uses: actionshub/chef-install@main
8686
with:
8787
channel: current
8888
project: inspec

allspark-notebook/Dockerfile

+40-19
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,54 @@
1-
FROM jupyter/all-spark-notebook:399cbb986c6b
1+
FROM jupyter/all-spark-notebook:spark-3.1.1@sha256:b73dad39ad5c469a92764e38d7cc4321040d3fedddcad7fcebc4ddc7f9c15ff2
2+
23
LABEL maintainer=analytics-platform-tech@digital.justice.gov.uk
34

4-
USER root
5+
ENV PATH=$PATH:$HOME/.local/bin
6+
7+
# To match RStudio
8+
ENV NB_UID=1001
9+
10+
# Home directory contents are already owned by UID 1001
11+
ENV CHOWN_HOME=no
512

6-
ENV PATH=$PATH:$HOME/.local/bin \
7-
CHOWN_HOME=no \
8-
PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.11.918,org.apache.hadoop:hadoop-aws:3.0.1 pyspark-shell"
9-
# `org.apache.hadoop:hadoop-aws` version must match `pyspark` version
13+
# NB these are sensible defaults but may need to be changed programmatically for
14+
# non-local Spark (i.e. EMR etc.)
15+
ENV PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.11.918,org.apache.hadoop:hadoop-aws:3.0.1 pyspark-shell"
1016

11-
RUN apt-get update && apt-get install -y \
12-
ca-certificates-java \
13-
openjdk-8-jdk \
14-
openssh-client \
15-
software-properties-common \
17+
# Container must be run as root to use NB_UID
18+
USER root
19+
20+
# Install OS packages
21+
#
22+
# The reason we have installed these has been lost. Including just in case.
23+
#
24+
# - gdal-bin
25+
# - libspatialindex-dev
26+
# - openssh-client
27+
#
28+
RUN apt-get update && \
29+
apt-get install -y \
1630
gdal-bin \
1731
libspatialindex-dev \
18-
&& rm -rf /var/lib/apt/lists/*
32+
openssh-client && \
33+
rm -rf /var/lib/apt/lists/*
1934

20-
COPY files/pyspark-s3.py /tmp/pyspark-s3.py
35+
# I'm not sure this has any effect
2136
COPY files/hdfs-site.xml /usr/local/spark/conf/hdfs-site.xml
2237

23-
RUN usermod -a -G "staff,users" "${NB_USER}" \
24-
&& update-alternatives --set editor /bin/nano-tiny
38+
# add-user-to-group.sh adds $NB_USER to group 50 (staff), which is used by RStudio
39+
COPY files/add-user-to-group.sh /usr/local/bin/before-notebook.d/
2540

26-
USER $NB_USER
41+
# Install Python packages
42+
# - pip - python package manager
43+
# - boto3 - python AWS library
44+
# - nbstripout - tool for stripping sensitive data out of notebooks
45+
#
2746
RUN pip install --upgrade \
2847
pip \
2948
boto3 \
30-
pyspark==3.0.1 \
3149
nbstripout \
32-
etl-manager==7.3.0 \
33-
gluejobutils==3.1.1
50+
dataengineeringutils3==1.3.0 \
51+
etl-manager==7.4.0
52+
53+
# Vi just doesn't cut it for some people
54+
RUN update-alternatives --set editor /bin/nano-tiny
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
2+
3+
usermod -a -G 50 "${NB_USER}"

allspark-notebook/tests/controls/pyspark_spec.rb

-22
This file was deleted.

allspark-notebook/tests/controls/user_spec.rb

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
describe user('jovyan') do
1111
it { should exist }
12-
its('uid') { should eq 1000 }
12+
its('uid') { should eq 1001 }
1313
end
1414
end
1515

0 commit comments

Comments
 (0)