Skip to content

Commit

Permalink
Adding a round of OTEL around build (#38)
Browse files Browse the repository at this point in the history
* Adding a round of OTEL around `build`

Connected to PLAT-590

* Use correct args in tracing thing

* Use tracer context correctly

* Adding more attributes to spans
  • Loading branch information
meatballhat authored Dec 17, 2024
1 parent fb146e4 commit 2dd2921
Show file tree
Hide file tree
Showing 14 changed files with 755 additions and 100 deletions.
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ description = "Replicate monolithic base dependency build friend"
readme = "README.md"
requires-python = ">=3.8"
dependencies = [
# "opentelemetry-distro",
"opentelemetry-api",
"opentelemetry-exporter-otlp",
"opentelemetry-sdk",
"structlog",
]
license = {file = "LICENSE"}
Expand Down
1 change: 1 addition & 0 deletions script/test
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ docker run --rm \
--volume "$PWD/build/monobase:/srv/r8/monobase" \
--volume "$PWD/build/cache:/var/cache/monobase" \
--volume "$PWD/build/nfd-features.d:/etc/kubernetes/node-feature-discovery/features.d" \
-e OTEL_EXPORTER_OTLP_ENDPOINT \
monobase:latest \
/opt/r8/monobase/build.sh \
--environment test \
Expand Down
71 changes: 48 additions & 23 deletions src/monobase/build.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import argparse
import datetime
import itertools
import logging
import os
import os.path
import re
import subprocess

from opentelemetry import trace

from monobase.cog import install_cogs
from monobase.cuda import install_cuda, install_cudnn
Expand All @@ -19,12 +21,17 @@
add_arguments,
desc_version,
desc_version_key,
du,
mark_done,
require_done_or_rm,
setup_logging,
setup_opentelemetry,
tracer,
)
from monobase.uv import install_venv

log = logging.getLogger(__name__)

# Command-line parser for the build script; the description is shown in --help output.
parser = argparse.ArgumentParser(description='Build monobase environment')
add_arguments(parser)
parser.add_argument(
Expand Down Expand Up @@ -105,13 +112,20 @@
)


@tracer.start_as_current_span('build_generation')
def build_generation(args: argparse.Namespace, mg: MonoGen) -> None:
span = trace.get_current_span()
span.set_attributes(mg.otel_attributes)

gdir = os.path.join(args.prefix, 'monobase', f'g{mg.id:05d}')

span.set_attribute('generation_dir', gdir)

if require_done_or_rm(gdir):
logging.info(f'Monobase generation {mg.id} is complete')
log.info(f'Monobase generation {mg.id} is complete')
return

logging.info(f'Building monobase generation {mg.id}...')
log.info(f'Building monobase generation {mg.id}...')
os.makedirs(gdir, exist_ok=True)

for k, v in desc_version_key(mg.cuda):
Expand All @@ -121,35 +135,42 @@ def build_generation(args: argparse.Namespace, mg: MonoGen) -> None:
if os.path.exists(dst):
os.remove(dst)
os.symlink(reldst, dst)
logging.info(f'CUDA symlinked in {dst}')
log.info(f'CUDA symlinked in {dst}')

cuda_major_p = re.compile(r'\.\d+$')
cuda_majors = set(cuda_major_p.sub('', k) for k in mg.cuda.keys())
for k, v in desc_version_key(mg.cudnn):
for m in desc_version(cuda_majors):
src = install_cudnn(args, v, m)
dst = f'{gdir}/cudnn{k}-cuda{m}'
reldst = os.path.relpath(src, gdir)
if os.path.exists(dst):
os.remove(dst)
os.symlink(reldst, dst)
logging.info(f'CuDNN symlinked in {dst}')
for (k, v), m in itertools.product(
desc_version_key(mg.cudnn), desc_version(cuda_majors)
):
src = install_cudnn(args, v, m)
dst = f'{gdir}/cudnn{k}-cuda{m}'
reldst = os.path.relpath(src, gdir)
if os.path.exists(dst):
os.remove(dst)
os.symlink(reldst, dst)
log.info(f'CuDNN symlinked in {dst}')

suffix = '' if args.environment == 'prod' else f'-{args.environment}'
rdir = os.path.join('/opt/r8/monobase', f'requirements{suffix}', f'g{mg.id:05d}')
for p, pf in desc_version_key(mg.python):
for t in desc_version(mg.torch):
for c in desc_version(mg.cuda.keys()):
install_venv(args, rdir, gdir, p, pf, t, c)
for (p, pf), t, c in itertools.product(
desc_version_key(mg.python),
desc_version(mg.torch),
desc_version(mg.cuda.keys()),
):
install_venv(args, rdir, gdir, p, pf, t, c)

optimize_ld_cache(args, gdir, mg)
optimize_rdfind(args, gdir, mg)

mark_done(gdir, kind='monogen', **mg.__dict__)
logging.info(f'Generation {mg.id} installed in {gdir}')
log.info(f'Generation {mg.id} installed in {gdir}')


@tracer.start_as_current_span('build')
def build(args: argparse.Namespace) -> None:
span = trace.get_current_span()
span.set_attributes({f'build_{k}': str(v) for k, v in args.__dict__.items()})

start_time = datetime.datetime.now(datetime.UTC)

monogens = sorted(MONOGENS[args.environment], reverse=True)
Expand Down Expand Up @@ -218,8 +239,10 @@ def pick(d: dict[str, str], env: str) -> dict[str, str]:

if i == 0:
latest = os.path.join(args.prefix, 'monobase', 'latest')

if os.path.exists(latest):
os.remove(latest)

os.symlink(f'g{mg.id:05d}', latest)

if args.write_node_feature_discovery_labels:
Expand All @@ -230,24 +253,25 @@ def pick(d: dict[str, str], env: str) -> dict[str, str]:

os.chmod(NODE_FEATURE_LABEL_FILE, 0o644)

logging.info(f'Wrote done={done} to {NODE_FEATURE_LABEL_FILE}')
log.info(f'Wrote done={done} to {NODE_FEATURE_LABEL_FILE}')

if args.requirements is not None:
build_user_venv(args)

if args.prune_old_gen:
prune_old_gen(args)

if args.prune_cuda:
prune_cuda(args)

if args.prune_uv_cache:
prune_uv_cache()

logging.info(f'Calculating disk usage in {args.prefix}...')
cmd = ['du', '-ch', '-d', '1', args.prefix]
subprocess.run(cmd, check=True)
log.info(f'Calculating disk usage in {args.prefix}...')
du(args.prefix)

duration = datetime.datetime.now(datetime.UTC) - start_time
logging.info(
log.info(
f'Monobase build completed: generations={sorted(gens)} duration={duration}'
)

Expand All @@ -256,4 +280,5 @@ def pick(d: dict[str, str], env: str) -> dict[str, str]:

# Script entry point: runs only when this module is executed directly.
if __name__ == '__main__':
# Set up logging and OpenTelemetry before any build work starts
# (presumably so spans and log records from build() are captured — confirm
# against the setup_* helpers, which are defined outside this view).
setup_logging()
setup_opentelemetry()
# Parse the command line with the module-level parser and run the build.
build(parser.parse_args())
2 changes: 2 additions & 0 deletions src/monobase/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

set -euo pipefail

export OTEL_EXPORTER_OTLP_ENDPOINT="${OTEL_EXPORTER_OTLP_ENDPOINT:-}"
export OTEL_SERVICE_NAME="${OTEL_SERVICE_NAME:-monobase}"
export PATH="$MONOBASE_PREFIX/bin:$PATH"

UV_URL='https://github.com/astral-sh/uv/releases/latest/download/uv-x86_64-unknown-linux-gnu.tar.gz'
Expand Down
38 changes: 31 additions & 7 deletions src/monobase/cog.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
import argparse
import hashlib
import itertools
import json
import logging
import os.path
import re
import shutil
import subprocess

from monobase.util import mark_done, require_done_or_rm
from opentelemetry import trace

from monobase.util import mark_done, require_done_or_rm, tracer

LINK_REGEX = re.compile(r'<(?P<url>https://[^>]+)>; rel="next"')

log = logging.getLogger(__name__)


def hash_str(s: str) -> str:
    """Return the lowercase hex SHA-256 digest of ``s`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(s.encode('utf-8'))
    return hasher.hexdigest()
Expand All @@ -29,9 +34,19 @@ def cog_gen_hash(
return hash_str(json.dumps(j))


@tracer.start_as_current_span('install_cog')
def install_cog(
uv: str, gdir: str, cog_version: str, is_default: bool, python_version: str
) -> None:
trace.get_current_span().set_attributes(
{
'uv': uv,
'cog_version': cog_version,
'cog_version_is_default': str(is_default),
'python_version': python_version,
}
)

if cog_version.startswith('https://'):
name = hash_str(cog_version)[:8]
spec = f'cog@{cog_version}'
Expand All @@ -57,6 +72,7 @@ def install_cog(
os.symlink(venv, default)


@tracer.start_as_current_span('install_cogs')
def install_cogs(args: argparse.Namespace, python_versions: list[str]) -> None:
cdir = os.path.join(args.prefix, 'cog')
os.makedirs(cdir, exist_ok=True)
Expand All @@ -66,20 +82,28 @@ def install_cogs(args: argparse.Namespace, python_versions: list[str]) -> None:
sha256 = cog_gen_hash(cog_versions, args.default_cog_version, python_versions)[:8]
gid = f'g{sha256}'
gdir = os.path.join(cdir, gid)

trace.get_current_span().set_attributes(
{
'generation_id': gid,
'cog_versions': str(cog_versions),
}
)

if require_done_or_rm(gdir):
logging.info(f'Cog generation {gid} is complete')
log.info(f'Cog generation {gid} is complete')
return

logging.info(f'Installing cog generation {gid} in {gdir}...')
log.info(f'Installing cog generation {gid} in {gdir}...')

# Cog * Python because Python version is required for venvs
# And Cog transitives may be Python version dependent
# Create venvs with Python major.minor only
# Since we only need the site-packages, not Python interpreters
uv = os.path.join(args.prefix, 'bin', 'uv')
for c in cog_versions:
for p in python_versions:
install_cog(uv, gdir, c, c == args.default_cog_version, p)

for c, p in itertools.product(cog_versions, python_versions):
install_cog(uv, gdir, c, c == args.default_cog_version, p)

latest = os.path.join(cdir, 'latest')
if os.path.exists(latest):
Expand All @@ -98,5 +122,5 @@ def install_cogs(args: argparse.Namespace, python_versions: list[str]) -> None:
for g in os.listdir(cdir):
if g in {'latest', gid}:
continue
logging.info(f'Deleting previous cog generation in {g}...')
log.info(f'Deleting previous cog generation in {g}...')
shutil.rmtree(os.path.join(cdir, g), ignore_errors=True)
24 changes: 13 additions & 11 deletions src/monobase/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from monobase.urls import cuda_urls, cudnn_urls
from monobase.util import Version, mark_done, require_done_or_rm

log = logging.getLogger(__name__)


@dataclass(frozen=True, order=True)
class Cuda:
Expand Down Expand Up @@ -66,23 +68,23 @@ def build_cudnns() -> dict[str, CuDNN]:
def install_cuda(args: argparse.Namespace, version: str) -> str:
cdir = os.path.join(args.prefix, 'cuda', f'cuda-{version}')
if require_done_or_rm(cdir):
logging.info(f'CUDA {version} in {cdir} is complete')
log.info(f'CUDA {version} in {cdir} is complete')
return cdir

if args.skip_cuda:
os.makedirs(cdir, exist_ok=True)
mark_done(cdir, kind='cuda', version=version, skipped=True)
logging.info(f'CUDA {version} skipped in {cdir}')
log.info(f'CUDA {version} skipped in {cdir}')
return cdir

cuda = CUDAS[version]
file = os.path.join(args.cache, cuda.filename)
if not os.path.exists(file):
logging.info(f'Downloading CUDA {version}...')
log.info(f'Downloading CUDA {version}...')
cmd = [f'{args.prefix}/bin/pget', '--pid-file', '/tmp/pget.pid', cuda.url, file]
subprocess.run(cmd, check=True)

logging.info(f'Installing CUDA {version}...')
log.info(f'Installing CUDA {version}...')
cmd = [
'/bin/sh',
file,
Expand All @@ -97,7 +99,7 @@ def install_cuda(args: argparse.Namespace, version: str) -> str:
subprocess.run(cmd, check=True)

# Remove unused files
logging.info(f'Deleting unused files for CUDA {version}...')
log.info(f'Deleting unused files for CUDA {version}...')
shutil.rmtree(os.path.join(cdir, 'compute-sanitizer'), ignore_errors=True)
shutil.rmtree(os.path.join(cdir, 'extras'), ignore_errors=True)
shutil.rmtree(os.path.join(cdir, 'gds'), ignore_errors=True)
Expand All @@ -114,27 +116,27 @@ def install_cuda(args: argparse.Namespace, version: str) -> str:
subprocess.run(cmd, check=True)

mark_done(cdir, kind='cuda', version=version, url=cuda.url)
logging.info(f'CUDA {version} installed in {cdir}')
log.info(f'CUDA {version} installed in {cdir}')
return cdir


def install_cudnn(args: argparse.Namespace, version: str, cuda_major: str) -> str:
key = f'{version}-cuda{cuda_major}'
cdir = os.path.join(args.prefix, 'cuda', f'cudnn-{key}')
if require_done_or_rm(cdir):
logging.info(f'CuDNN {key} in {cdir} is complete')
log.info(f'CuDNN {key} in {cdir} is complete')
return cdir

if args.skip_cuda:
os.makedirs(cdir, exist_ok=True)
mark_done(cdir, kind='cudnn', version=version, skipped=True)
logging.info(f'CuDNN {key} skipped in {cdir}')
log.info(f'CuDNN {key} skipped in {cdir}')
return cdir

cudnn = CUDNNS[key]
file = os.path.join(args.cache, cudnn.filename)
if not os.path.exists(file):
logging.info(f'Downloading CuDNN {key}...')
log.info(f'Downloading CuDNN {key}...')
cmd = [
f'{args.prefix}/bin/pget',
'--pid-file',
Expand All @@ -144,11 +146,11 @@ def install_cudnn(args: argparse.Namespace, version: str, cuda_major: str) -> st
]
subprocess.run(cmd, check=True)

logging.info(f'Installing CuDNN {key}...')
log.info(f'Installing CuDNN {key}...')
os.makedirs(cdir, exist_ok=True)
cmd = ['tar', '-xf', file, '--strip-components=1', '--exclude=lib*.a', '-C', cdir]
subprocess.run(cmd, check=True)

mark_done(cdir, kind='cudnn', version=version, url=cudnn.url)
logging.info(f'CuDNN {key} installed in {cdir}')
log.info(f'CuDNN {key} installed in {cdir}')
return cdir
Loading

0 comments on commit 2dd2921

Please sign in to comment.