-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtaxi-rides-job.yaml
56 lines (54 loc) · 1.52 KB
/
taxi-rides-job.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
---
# Kubernetes Job "taxi-rides-job".
#
# Pod flow:
#   1. Init container "taxi-rides" runs first, with config.yaml mounted from
#      the "taxi-rides-config" ConfigMap and the shared data volume mounted at
#      /app/data (also exported to the process as DATA_DIR).
#      NOTE(review): presumably this image writes taxi_trips.parquet into
#      DATA_DIR - confirm against the image's entrypoint.
#   2. Container "data-handler" then checks that /app/data/taxi_trips.parquet
#      exists on the same volume and can be loaded with pandas.
apiVersion: batch/v1
kind: Job
metadata:
  name: taxi-rides-job
spec:
  template:
    spec:
      # Pod-level settings.
      restartPolicy: Never
      # Pull secret for the ghcr.io image used by the init container.
      imagePullSecrets:
        - name: ghcr-secret
      initContainers:
        - name: taxi-rides
          image: ghcr.io/serkosi/taxi-rides:latest
          workingDir: /app
          env:
            - name: DATA_DIR
              value: "/app/data"
          volumeMounts:
            # subPath mounts only the single config.yaml key as a file,
            # leaving the rest of /app from the image visible.
            - name: config-volume
              mountPath: /app/config.yaml
              subPath: config.yaml
            - name: data-volume
              mountPath: /app/data
      containers:
        - name: data-handler
          image: python:3.9-slim
          command: ["sh", "-c"]
          # The script starts with `set -e` so that a failed `pip install` or
          # a non-zero exit from the Python check aborts the shell at once.
          # Without it the trailing `echo` would make the container exit 0
          # and the Job would be reported as succeeded even when the Parquet
          # file is missing or unreadable.
          args:
            - |
              set -e
              pip install pandas pyarrow
              echo "Processing complete, Parquet data is ready"
              python3 -c '
              import pandas as pd
              import os
              import sys
              # Verify Parquet files are readable
              parquet_path = "/app/data/taxi_trips.parquet"
              if not os.path.exists(parquet_path):
                  print(f"Error: Parquet file not found at {parquet_path}")
                  sys.exit(1)
              df = pd.read_parquet(parquet_path)
              print(f"Parquet file verified: {df.shape[0]} rows")
              '
              echo "Data verification complete"
          volumeMounts:
            - name: data-volume
              mountPath: /app/data
      volumes:
        # Source of /app/config.yaml for the init container.
        - name: config-volume
          configMap:
            name: taxi-rides-config
        # Volume shared by the init container and the data-handler container.
        - name: data-volume
          persistentVolumeClaim:
            claimName: taxi-rides-pvc