submit.yml.erb
<%
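# Values computed for use throughout the template:
#   groups   - the user's Unix group names; drop(1) skips the first entry
#              (typically the primary group)
#   ntasks   - number of Spark workers, defaulting to 1 when left blank
#   walltime - requested hours converted to minutes for Slurm's --time flag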
groups = OodSupport::User.new.groups.map(&:name).drop(1)
ntasks = number_of_workers.blank? ? 1 : number_of_workers.to_i
walltime = (bc_num_hours.to_i * 60)
%>
---
batch_connect:
template: basic
websockify_cmd: '/usr/bin/websockify'
conn_params:
- "spark_master_webui_port"
- "spark_master_host"
- "spark_ui_auth_token"
script_wrapper: |
module purge
module load singularity
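# Write the OOD-generated job script (substituted into the %s placeholder below)
# to container.sh; it is run inside the container by the singularity exec at the end.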
cat << "CTRSCRIPT" > container.sh
export PATH="$PATH:/opt/TurboVNC/bin"
%s
CTRSCRIPT
# Path to this container image on Quest
export container_image="<%= spark_version %>"
# Benchmark info
echo "TIMING - Starting jupyter at: $(date)"
# Launch the Jupyter Notebook Server
JUPYTER_TMPDIR=${TMPDIR:-/tmp}/$(id -un)-jupyter/
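# Environment variables prefixed with SINGULARITYENV_ are passed into the
# container (without the prefix) by Singularity.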
## for conda
export SINGULARITYENV_CONDA_PKGS_DIRS=${HOME}/.conda/pkgs
## for pip
export SINGULARITYENV_XDG_CACHE_HOME=${JUPYTER_TMPDIR}/xdg_cache_home
## job-specific tmp in case there are conflicts between different users running on the same node
WORKDIR_PREFIX='<%= xdg_data_home == "" ? "${HOME}" : xdg_data_home %>'
WORKDIR=${WORKDIR_PREFIX}/jupyter/${SLURM_JOB_ID}
mkdir -m 700 -p ${WORKDIR}/tmp
## for specific package mne
export SINGULARITYENV_NUMBA_CACHE_DIR=$HOME/.cache
## Need to add the --nv flag if we are running on a GPU
<%- if gres_value != "" %>
export SING_GPU="--nv"
<%- else %>
export SING_GPU=""
<%- end %>
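# (--nv exposes the host's NVIDIA driver and CUDA libraries inside the container)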
# All our software lives in /software, so we have to bind that
export SING_BINDS="--bind /software:/software"
# What is the OS *of the host*
HOST_OS=`uname -r`
# Choose the Slurm-related bind paths based on the host OS
if [[ $HOST_OS =~ "el8" ]]; then
## bind some extra stuff to be able to talk to slurm from within the container
export SING_BINDS="$SING_BINDS -B /etc/passwd -B /etc/slurm -B `which sbatch ` -B `which srun ` -B `which sacct ` -B `which scontrol ` -B `which salloc ` -B `which scancel ` -B `which squeue ` -B /usr/lib64/liblua-5.3.so:/usr/lib/x86_64-linux-gnu/liblua-5.3.so -B /usr/lib64/liblua-5.3.so:/usr/lib/x86_64-linux-gnu/liblua.so -B /usr/lib64/libjson-c.so.4:/usr/lib/x86_64-linux-gnu/libjson-c.so.4 -B /usr/lib64/libjson-c.so.4.0.0:/usr/lib/x86_64-linux-gnu/libjson-c.so.4.0.0 -B /usr/lib64/slurm/ -B /usr/lib64/libmunge.so.2 -B /usr/lib64/libmunge.so.2.0.0 -B /usr/lib64/libpmi2.so -B /usr/lib64/libpmi2.so.0 -B /usr/lib64/libpmi2.so.0.0.0 -B /usr/lib64/libpmi.so -B /usr/lib64/libpmi.so.0 -B /usr/lib64/libpmi.so.0.0.0 -B /run"
elif [[ $HOST_OS =~ "el7" ]]; then
## bind some extra stuff to be able to talk to slurm from within the container
export SING_BINDS="$SING_BINDS -B /etc/passwd -B /etc/slurm -B `which sbatch ` -B `which srun ` -B `which sacct ` -B `which scontrol ` -B `which salloc ` -B `which scancel ` -B `which squeue ` -B /usr/lib64/liblua-5.1.so:/usr/lib/x86_64-linux-gnu/liblua-5.1.so -B /usr/lib64/liblua-5.1.so:/usr/lib/x86_64-linux-gnu/liblua.so -B /usr/lib64/libjson-c.so.2:/usr/lib/x86_64-linux-gnu/libjson-c.so.2 -B /usr/lib64/libjson-c.so.2.0.1:/usr/lib/x86_64-linux-gnu/libjson-c.so.2.0.1 -B /usr/lib64/slurm/ -B /usr/lib64/libmunge.so.2 -B /usr/lib64/libmunge.so.2.0.0 -B /usr/lib64/libpmi2.so -B /usr/lib64/libpmi2.so.0 -B /usr/lib64/libpmi2.so.0.0.0 -B /usr/lib64/libpmi.so -B /usr/lib64/libpmi.so.0 -B /usr/lib64/libpmi.so.0.0.0 -B /run"
else
echo "Host OS not recognized, no container found"
fi
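# The Slurm client binaries and their host library dependencies are bound above so
# that sbatch/srun/squeue keep working from inside the container. Optional sanity
# check (illustrative only):
#   singularity exec $SING_BINDS "${container_image}" squeue --version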
## bind the MPI/PMIx stack (pmix, hwloc, ucx, libevent, spack) from the host
if [ -d "/usr/local/ucx-1.13.0" ]; then
## For the RHEL7 nodes
export SING_BINDS="$SING_BINDS -B /usr/local/pmix -B /usr/local/hwloc -B /usr/local/ucx-1.13.0 -B /usr/local/ucx-1.10.0 -B /usr/local/ucx-1.8.1 -B /usr/local/spack -B /usr/local/spack_v20d1 -B /usr/local/libevent/ "
else
## For the RHEL8 nodes
export SING_BINDS="$SING_BINDS -B /usr/local/pmix -B /usr/local/hwloc -B /usr/local/libevent -B /usr/local/spack_v20d1 -B /usr/local/ucx/ "
fi
# only bind /kellogg if the individual is part of the kellogg group
export SING_BINDS="$SING_BINDS <%= groups.include?('kellogg') ? "--bind /kellogg/:/kellogg/" : "" %>"
# only bind /scratch/<netid> if the individual has a scratch space
export SING_BINDS="$SING_BINDS <%= File.directory?("/scratch/#{User.new.name}") ? "--bind /scratch/#{User.new.name}:/scratch/#{User.new.name}" : "" %>"
# Only bind project directories that the user has access to, based on their Unix groups
<%- groups.each do |group| %>
export SING_BINDS="$SING_BINDS <%= File.directory?("/projects/#{group}") ? "--bind /projects/#{group}:/projects/#{group}" : "" %>"
<%- end %>
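# For example, a user in (hypothetical) groups p10001 and p20002 would get:
#   --bind /projects/p10001:/projects/p10001 --bind /projects/p20002:/projects/p20002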
## job-specific tmp dir
export SING_BINDS=" $SING_BINDS -B ${WORKDIR}/tmp:/tmp "
export SINGULARITYENV_XDG_DATA_HOME='<%= xdg_data_home == "" ? "${HOME}" : xdg_data_home %>'
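# Finally, run the per-session script inside the container with the accumulated
# GPU flag and bind mounts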
singularity exec $SING_GPU $SING_BINDS "${container_image}" /bin/bash container.sh
header: |
#!/bin/bash
. ~/.bashrc
script:
<%- if user_email != "" %>
email_on_started: true
<%- end %>
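# email_on_started asks OnDemand to notify the user when the job starts (with the
# Slurm adapter this typically becomes --mail-type=BEGIN; the address itself is
# supplied via --mail-user under native below)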
native:
# What partition is the user submitting to
- "--partition"
- "<%= slurm_partition %>"
# Under what account is the user submitting this job
- "--account"
- "<%= slurm_account %>"
# How much time (requested hours converted to minutes)
- "--time"
- "<%= walltime %>"
# How many workers
- "--ntasks"
- "<%= ntasks %>"
# How much memory per worker
- "--mem-per-cpu"
- "<%= memory_per_worker %>G"
# Job Name
- "--job-name"
- "<%= job_name %>"
# If the user supplies an e-mail, then they will get an e-mail when the job begins
<%- if user_email != "" %>
- "--mail-user"
- "<%= user_email %>"
<%- end %>
# If the user requested a GPU, then we need to add this argument to our job submit command
<%- if gres_value != "" %>
- "--gres"
- "<%= gres_value %>"
<%- end %>
# As this may be multi-node, force all nodes onto the same Quest generation with a constraint
- "--constraint"
- "[quest9|quest10|quest11|quest12]"