
Commit

Merge pull request #53 from ministryofjustice/ag--fix-spark-s3-access-problems

allspark: Updated Spark dependencies (PYSPARK_SUBMIT_ARGS)
xoen authored Dec 17, 2020
2 parents 128ff53 + e6a2c90 commit 8cb4286
Showing 4 changed files with 32 additions and 9 deletions.
6 changes: 4 additions & 2 deletions allspark-notebook/Dockerfile
@@ -7,11 +7,13 @@ USER root
 ENV PATH=$PATH:$HOME/.local/bin
 ENV NB_UID=1001
 ENV CHOWN_HOME=no
-ENV PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.1 pyspark-shell"
+# `org.apache.hadoop:hadoop-aws` version must match `pyspark`
+# version
+ENV PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.11.918,org.apache.hadoop:hadoop-aws:3.0.1 pyspark-shell"
 
 COPY ./files/* /tmp/
 
-RUN pip install --upgrade pip boto3 pyspark nbstripout \
+RUN pip install --upgrade pip boto3 pyspark==3.0.1 nbstripout \
 && python /tmp/pyspark-s3.py \
 && pip install etl-manager==7.3.0 \
 && pip install gluejobutils==3.1.1 \
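Note: PYSPARK_SUBMIT_ARGS is read by PySpark when the first SparkContext is launched, so the --packages coordinates baked into the image are fetched automatically and no per-notebook override is needed. A minimal sketch of a session on the updated image (the bucket and file path are placeholders, not part of the commit):

# Minimal sketch, assuming the image above: PYSPARK_SUBMIT_ARGS is already set,
# so aws-java-sdk and hadoop-aws are pulled in when the session starts.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

# hadoop-aws provides the s3a:// filesystem; credentials come from the usual
# AWS provider chain (environment variables, instance profile, etc.).
df = spark.read.csv("s3a://example-bucket/example.csv", header=True)  # placeholder path
df.show(5)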
6 changes: 0 additions & 6 deletions allspark-notebook/files/pyspark-s3.py
@@ -1,13 +1,7 @@
 #!/usr/bin/env python
 
-import os
-os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.1 pyspark-shell'
-
 import pyspark
 sc = pyspark.SparkContext("local[*]")
 
-from pyspark.sql import SQLContext
-sqlContext = SQLContext(sc)
-
 hadoopConf = sc._jsc.hadoopConfiguration()
 hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
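With the environment override and the unused SQLContext removed, the helper script only registers the S3A filesystem. Reconstructed from the hunk above, the resulting pyspark-s3.py is roughly:

#!/usr/bin/env python
# Reconstructed result of the deletions above: the PYSPARK_SUBMIT_ARGS override
# is gone because the packages now come from the Dockerfile ENV setting.
import pyspark

sc = pyspark.SparkContext("local[*]")

hadoopConf = sc._jsc.hadoopConfiguration()
hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")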
22 changes: 22 additions & 0 deletions allspark-notebook/tests/controls/pyspark_spec.rb
@@ -0,0 +1,22 @@
+title 'Working pyspark'
+
+control "pyspark is available" do
+  impact "high"
+  title "pyspark should be installed and work"
+  desc "pyspark is installed and can run pyspark jobs"
+  tag "pyspark"
+  tag "spark"
+
+  describe command("echo $SPARK_HOME") do
+    its("stdout.strip") { should eq "/usr/local/spark" }
+  end
+
+  # Can run one of the Spark examples
+  describe command("python3 /usr/local/spark/examples/src/main/python/pi.py") do
+    its("exit_status") { should eq 0 }
+    # Pi calculated by the Spark example job is very approximate
+    # (once it returned 3.13###), so check against 3.1 only
+    # to avoid random failures
+    its("stdout") { should match /Pi is roughly 3.1/ }
+  end
+end
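The bundled pi.py example estimates Pi with a Monte Carlo sample, which is why the control only matches the first two digits. A condensed sketch of what that example does (not the exact file shipped with Spark):

# Condensed sketch of Spark's Monte Carlo Pi example: sample random points in
# a 2x2 square and count how many fall inside the unit circle.
from random import random
from operator import add
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PiSketch").getOrCreate()
n = 100000

def inside(_):
    x, y = random() * 2 - 1, random() * 2 - 1
    return 1 if x * x + y * y <= 1 else 0

count = spark.sparkContext.parallelize(range(n), 2).map(inside).reduce(add)
print("Pi is roughly %f" % (4.0 * count / n))  # e.g. "Pi is roughly 3.14..."
spark.stop()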
7 changes: 6 additions & 1 deletion allspark-notebook/tests/files/spark_read_s3.py
@@ -1,3 +1,8 @@
+from pyspark.context import SparkContext
+from pyspark.sql import SparkSession
+
+sc = SparkContext.getOrCreate()
+spark = SparkSession(sc)
+
-spark.read_parquet()
 df = spark.read.csv("s3a://bucket/path/to/file.csv")
 df.limit(10).show()
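This smoke test relies on the default AWS credentials provider chain. If a test environment needed explicit configuration, the s3a settings could be applied on the Hadoop configuration instead; a hypothetical variant (the provider class is a standard hadoop-aws option, not something added by this commit):

# Hypothetical variant of the smoke test with the s3a credentials provider set
# explicitly; "fs.s3a.aws.credentials.provider" is a standard hadoop-aws key.
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set(
    "fs.s3a.aws.credentials.provider",
    "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
)

df = spark.read.csv("s3a://bucket/path/to/file.csv")  # placeholder path from the test
df.limit(10).show()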
