Skip to content

Commit

Permalink
Merge pull request #57 from ministryofjustice/spark-update
Browse files Browse the repository at this point in the history
Updated allspark-notebook to upstream spark-3.1.1 image
  • Loading branch information
roblevi authored Nov 11, 2021
2 parents c59a952 + 243bb97 commit 16e5307
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 43 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/jupyter-lab-test-and-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ jobs:
REGISTRY: ${{ steps.login-ecr.outputs.registry }}
IMAGE_TAG: ${{ steps.prep.outputs.tag }}
- name: Install InSpec
uses: actionshub/chef-install@master
uses: actionshub/chef-install@main
with:
channel: current
project: inspec
Expand Down
59 changes: 40 additions & 19 deletions allspark-notebook/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,33 +1,54 @@
FROM jupyter/all-spark-notebook:399cbb986c6b
FROM jupyter/all-spark-notebook:spark-3.1.1@sha256:b73dad39ad5c469a92764e38d7cc4321040d3fedddcad7fcebc4ddc7f9c15ff2

LABEL [email protected]

USER root
ENV PATH=$PATH:$HOME/.local/bin

# To match RStudio
ENV NB_UID=1001

# Home directory contents are already owned by UID 1001
ENV CHOWN_HOME=no

ENV PATH=$PATH:$HOME/.local/bin \
CHOWN_HOME=no \
PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.11.918,org.apache.hadoop:hadoop-aws:3.0.1 pyspark-shell"
# `org.apache.hadoop:hadoop-aws` version must match `pyspark` version
# NB these are sensible defaults but may need to be changed programmatically for
# non-local spark (i.e. EMR etc.)
ENV PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.11.918,org.apache.hadoop:hadoop-aws:3.0.1 pyspark-shell"

RUN apt-get update && apt-get install -y \
ca-certificates-java \
openjdk-8-jdk \
openssh-client \
software-properties-common \
# Container must be run as root to use NB_UID
USER root

# Install OS packages
#
# The reason we have installed these has been lost. Including just in case.
#
# - gdal-bin
# - libspatialindex-dev
# - openssh-client
#
RUN apt-get update && \
apt-get install -y \
gdal-bin \
libspatialindex-dev \
&& rm -rf /var/lib/apt/lists/*
openssh-client && \
rm -rf /var/lib/apt/lists/*

COPY files/pyspark-s3.py /tmp/pyspark-s3.py
# I'm not sure this has any effect
COPY files/hdfs-site.xml /usr/local/spark/conf/hdfs-site.xml

RUN usermod -a -G "staff,users" "${NB_USER}" \
&& update-alternatives --set editor /bin/nano-tiny
# add-user-to-group.sh adds the $NB_USER to group 50 (staff) used by RStudio
COPY files/add-user-to-group.sh /usr/local/bin/before-notebook.d/

USER $NB_USER
# Install Python packages
# - pip - python package manager
# - boto3 - python AWS library
# - nbstripout - tool for stripping sensitive data out of notebooks
#
RUN pip install --upgrade \
pip \
boto3 \
pyspark==3.0.1 \
nbstripout \
etl-manager==7.3.0 \
gluejobutils==3.1.1
dataengineeringutils3==1.3.0 \
etl-manager==7.4.0

# Vi just doesn't cut it for some people
RUN update-alternatives --set editor /bin/nano-tiny
3 changes: 3 additions & 0 deletions allspark-notebook/files/add-user-to-group.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Append ${NB_USER} to supplementary group 50 while keeping the user's
# existing group memberships (append mode, not replace).
usermod --append --groups 50 "${NB_USER}"
22 changes: 0 additions & 22 deletions allspark-notebook/tests/controls/pyspark_spec.rb

This file was deleted.

2 changes: 1 addition & 1 deletion allspark-notebook/tests/controls/user_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

describe user('jovyan') do
it { should exist }
its('uid') { should eq 1000 }
its('uid') { should eq 1001 }
end
end

Expand Down

0 comments on commit 16e5307

Please sign in to comment.