From 663a7818a856675912da14974ae5d21c63efe03f Mon Sep 17 00:00:00 2001 From: Rob Levi Date: Wed, 10 Nov 2021 16:54:55 +0000 Subject: [PATCH 1/4] Updated allspark-notebook to upstream spark-3.1.1 image --- allspark-notebook/Dockerfile | 61 ++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/allspark-notebook/Dockerfile b/allspark-notebook/Dockerfile index 8cc09d5..90fdb2e 100644 --- a/allspark-notebook/Dockerfile +++ b/allspark-notebook/Dockerfile @@ -1,33 +1,54 @@ -FROM jupyter/all-spark-notebook:399cbb986c6b +FROM jupyter/all-spark-notebook:spark-3.1.1@sha256:b73dad39ad5c469a92764e38d7cc4321040d3fedddcad7fcebc4ddc7f9c15ff2 + LABEL maintainer=analytics-platform-tech@digital.justice.gov.uk -USER root +ENV PATH=$PATH:$HOME/.local/bin + +# To match RStudio +ENV NB_UID=1001 -ENV PATH=$PATH:$HOME/.local/bin \ - CHOWN_HOME=no \ - PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.11.918,org.apache.hadoop:hadoop-aws:3.0.1 pyspark-shell" -# `org.apache.hadoop:hadoop-aws` version must match `pyspark` version +# Home directory contents is already owned by UID 1001 +ENV CHOWN_HOME=no -RUN apt-get update && apt-get install -y \ - ca-certificates-java \ - openjdk-8-jdk \ - openssh-client \ - software-properties-common \ +# NB these are sensible defaults but may need to be changed programatically for +# non local spark (ie. EMR etc.) +ENV PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.11.918,org.apache.hadoop:hadoop-aws:3.0.1 pyspark-shell" + +# Container must be run as root to use NB_UID +USER root + +# Install OS pacakges +# +# The reason we have installed these has been lost. Including just in case. +# +# - gdal-bin +# - libspatialindex-dev +# - openssh-client +# +RUN apt-get update && \ + apt-get install -y \ gdal-bin \ libspatialindex-dev \ - && rm -rf /var/lib/apt/lists/* + openssh-client && \ + rm -rf /var/lib/apt/lists/* -COPY files/pyspark-s3.py /tmp/pyspark-s3.py +# I'm not sure this has any affect COPY files/hdfs-site.xml /usr/local/spark/conf/hdfs-site.xml -RUN usermod -a -G "staff,users" "${NB_USER}" \ - && update-alternatives --set editor /bin/nano-tiny - -USER $NB_USER +# Install pythong packages +# - pip - python package manager +# - boto3 - python AWS library +# - nbstripout - tool for stripping sensitive data out of notebooks +# RUN pip install --upgrade \ pip \ boto3 \ - pyspark==3.0.1 \ nbstripout \ - etl-manager==7.3.0 \ - gluejobutils==3.1.1 + dataengineeringutils3==1.3.0 \ + etl-manager==7.4.0 + +# Add user to groups used by RStudio +RUN usermod -a -G "staff,users" "${NB_USER}" + +# Vi just doesn't cut it for some people +RUN update-alternatives --set editor /bin/nano-tiny From ed57cbce8a28b5b7f8ffa38c43ef9ba42732d451 Mon Sep 17 00:00:00 2001 From: Rob Levi Date: Wed, 10 Nov 2021 17:11:27 +0000 Subject: [PATCH 2/4] Updated git references in action definition --- .github/workflows/jupyter-lab-test-and-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/jupyter-lab-test-and-build.yml b/.github/workflows/jupyter-lab-test-and-build.yml index 96753f8..ba12268 100644 --- a/.github/workflows/jupyter-lab-test-and-build.yml +++ b/.github/workflows/jupyter-lab-test-and-build.yml @@ -82,7 +82,7 @@ jobs: REGISTRY: ${{ steps.login-ecr.outputs.registry }} IMAGE_TAG: ${{ steps.prep.outputs.tag }} - name: Install InSpec - uses: actionshub/chef-install@master + uses: actionshub/chef-install@main with: channel: current project: inspec From 7a0d644a6d991b62b9a7cc5913e4ffa0857a07f8 Mon Sep 17 00:00:00 2001 From: Rob Levi Date: Wed, 10 Nov 2021 17:23:58 +0000 Subject: [PATCH 3/4] Removed out of date tests --- .../tests/controls/pyspark_spec.rb | 22 ------------------- allspark-notebook/tests/controls/user_spec.rb | 2 +- 2 files changed, 1 insertion(+), 23 deletions(-) delete mode 100644 allspark-notebook/tests/controls/pyspark_spec.rb diff --git a/allspark-notebook/tests/controls/pyspark_spec.rb b/allspark-notebook/tests/controls/pyspark_spec.rb deleted file mode 100644 index a66f919..0000000 --- a/allspark-notebook/tests/controls/pyspark_spec.rb +++ /dev/null @@ -1,22 +0,0 @@ -title 'Working pyspark' - -control "pyspark is available" do - impact "high" - title "pyspark should be installed and work" - desc "pyspark installed and can run pyspark jobs" - tag "pyspark" - tag "spark" - - describe command("echo $SPARK_HOME") do - its("stdout.strip") { should eq "/usr/local/spark" } - end - - # Can run on of the spark examples - describe command("python3 /usr/local/spark/examples/src/main/python/pi.py") do - its("exit_status") { should eq 0 } - # Pi calculated using Spark example job is very - # approximate, once it returned 3.13### - # so checking against 3.1 to avoid random failures - its("stdout") { should match /Pi is roughly 3.1/ } - end -end diff --git a/allspark-notebook/tests/controls/user_spec.rb b/allspark-notebook/tests/controls/user_spec.rb index f3a8555..aeb33d3 100644 --- a/allspark-notebook/tests/controls/user_spec.rb +++ b/allspark-notebook/tests/controls/user_spec.rb @@ -9,7 +9,7 @@ describe user('jovyan') do it { should exist } - its('uid') { should eq 1000 } + its('uid') { should eq 1001 } end end From 243bb9753af526a6f8a0cbff874dfcb35fb4602c Mon Sep 17 00:00:00 2001 From: Rob Levi Date: Thu, 11 Nov 2021 08:21:16 +0000 Subject: [PATCH 4/4] Added pre-notebook hook to add notebook user extra group --- allspark-notebook/Dockerfile | 6 +++--- allspark-notebook/files/add-user-to-group.sh | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) create mode 100755 allspark-notebook/files/add-user-to-group.sh diff --git a/allspark-notebook/Dockerfile b/allspark-notebook/Dockerfile index 90fdb2e..bf80a66 100644 --- a/allspark-notebook/Dockerfile +++ b/allspark-notebook/Dockerfile @@ -35,6 +35,9 @@ RUN apt-get update && \ # I'm not sure this has any affect COPY files/hdfs-site.xml /usr/local/spark/conf/hdfs-site.xml +# add-user-to-group.sh add the $NB_USER to group 50 (staff) used by RStudio +COPY files/add-user-to-group.sh /usr/local/bin/before-notebook.d/ + # Install pythong packages # - pip - python package manager # - boto3 - python AWS library @@ -47,8 +50,5 @@ RUN pip install --upgrade \ dataengineeringutils3==1.3.0 \ etl-manager==7.4.0 -# Add user to groups used by RStudio -RUN usermod -a -G "staff,users" "${NB_USER}" - # Vi just doesn't cut it for some people RUN update-alternatives --set editor /bin/nano-tiny diff --git a/allspark-notebook/files/add-user-to-group.sh b/allspark-notebook/files/add-user-to-group.sh new file mode 100755 index 0000000..0a80213 --- /dev/null +++ b/allspark-notebook/files/add-user-to-group.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +usermod -a -G 50 "${NB_USER}"