# docker-compose.yaml

# Compose project name.
name: observability-toolkit

# Containers that make up the stack.
services:
  # Loki service, configured from the bind-mounted loki-config.yaml.
  loki:
    image: "${LOKI_IMAGE_NAME}"
    restart: unless-stopped
    command: [ "-config.file=/etc/loki/loki-config.yaml" ]
    volumes:
      # Configuration file taken from the local ./config directory.
      - "./config/loki/loki-config.yaml:/etc/loki/loki-config.yaml"
      # Named volume mounted at /loki.
      - "data-loki:/loki"
    healthcheck:
      # Probe the /ready endpoint on port 3100 from inside the container.
      test:
        - "CMD-SHELL"
        - "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"
      retries: 5
      timeout: 5s
      interval: 10s
      start_period: 20s

  # The Tempo container runs as UID 10001, whereas Docker Compose creates the
  # named volume owned by root. This helper changes the ownership of the volume
  # so that Tempo can start correctly.
  tempo-init:
    # The anchor lets the tempo service reuse exactly the same image reference.
    image: &tempoImage "${TEMPO_IMAGE_NAME}"
    user: "root"
    volumes:
      - "data-tempo:/var/tempo"
    entrypoint: [ "chown", "10001:10001", "/var/tempo" ]

  # Tempo service, configured from the bind-mounted tempo-config.yaml.
  tempo:
    image: *tempoImage
    healthcheck:
      # Probe the /ready endpoint on port 3200 from inside the container.
      test: [ "CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3200/ready || exit 1" ]
      start_period: 20s
      interval: 10s
      timeout: 5s
      retries: 5
    command: [ "-config.file=/etc/tempo.yaml" ]
    restart: unless-stopped
    volumes:
      - ./config/tempo/tempo-config.yaml:/etc/tempo.yaml
      # NOTE(review): tempo-init mounts this same volume at /var/tempo. The
      # chown still takes effect because it targets the volume root, but confirm
      # that tempo-config.yaml stores its data under /tmp/tempo.
      - data-tempo:/tmp/tempo
    depends_on:
      # Wait until the one-shot chown helper has exited successfully. The short
      # list form only waits for the helper container to be started, so Tempo
      # could come up before the volume ownership has been fixed.
      tempo-init:
        condition: service_completed_successfully

  # Prometheus service, configured through the command-line flags below.
  prometheus:
    image: "${PROMETHEUS_IMAGE_NAME}"
    restart: unless-stopped
    volumes:
      - "data-prometheus:/prometheus"
    healthcheck:
      # Probe the /-/healthy endpoint on port 9090 from inside the container.
      test:
        - "CMD"
        - "wget"
        - "-q"
        - "--spider"
        - "http://localhost:9090/-/healthy"
      retries: 5
      timeout: 5s
      interval: 10s
      start_period: 20s
    command:
      # Standard configuration.
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
      - "--web.console.templates=/usr/share/prometheus/consoles"
      # Additional configuration.
      # Time-based retention: the default is 15 days; raised to 30 days here.
      # Keeping data for longer would take too much storage.
      - "--storage.tsdb.retention.time=30d"
      # Size-based retention: the maximum number of bytes of storage blocks to
      # retain. The oldest data is removed first. Defaults to 0 (disabled).
      # Supported units: B, KB, MB, GB, TB, PB, EB (e.g. "512MB"). Units are
      # based on powers of 2, so 1KB is 1024B.
      # Only persistent blocks are deleted to honor this limit, although the WAL
      # and m-mapped chunks are counted in the total size. The minimum disk
      # requirement is therefore the peak space taken by the wal (WAL and
      # checkpoint) and chunks_head (m-mapped head chunks) directories combined
      # (this peaks every 2 hours).
      - "--storage.tsdb.retention.size=30GB"
      # Maximum time a query may take before being aborted.
      - "--query.timeout=1m"
      # Accept remote-write requests. See:
      # https://prometheus.io/docs/prometheus/latest/feature_flags/#remote-write-receiver
      # https://prometheus.io/docs/prometheus/latest/storage/#overview
      - "--web.enable-remote-write-receiver"
      # Enable exemplar storage. See:
      # https://prometheus.io/docs/prometheus/latest/feature_flags/#exemplars-storage
      - "--enable-feature=exemplar-storage"

  # Grafana service, published on the host at ${GRAFANA_HOST_PORT}.
  grafana:
    image: "${GRAFANA_IMAGE_NAME}"
    healthcheck:
      # Probe the HTTP server on port 3000 from inside the container.
      test: [ "CMD", "wget", "-q", "--spider", "http://localhost:3000" ]
      start_period: 20s
      interval: 10s
      timeout: 5s
      retries: 5
    ports:
      - "${GRAFANA_HOST_PORT}:3000"
    environment:
      # Boolean-looking values are quoted so that they reach the container as
      # the literal strings "true"/"false" instead of being converted to YAML
      # booleans by the parser.
      GF_PLUGINS_ALLOW_LOADING_UNSIGNED_PLUGINS: observability-app,ing-observability-app
      # Anonymous access with the Admin role and the login form disabled: anyone
      # who can reach the published port gets full admin rights.
      # NOTE(review): presumably intended for local development only; confirm
      # before exposing this port beyond a trusted machine.
      GF_AUTH_ANONYMOUS_ENABLED: "true"
      GF_AUTH_ANONYMOUS_ORG_ROLE: Admin
      # NOTE(review): could not match this key to a documented Grafana setting;
      # verify that it has any effect.
      GF_AUTH_ORG_ROLE: Admin
      GF_AUTH_DISABLE_LOGIN_FORM: "true"
      GF_USERS_VIEWERS_CAN_EDIT: "true"
      GF_USERS_EDITORS_CAN_ADMIN: "true"
      # Plugins installed at start-up: a "<zip url>;<plugin folder>" entry
      # followed by a plugin id, separated by commas.
      GF_INSTALL_PLUGINS: https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app,pyroscope-panel
      # Space-separated list of feature toggles to enable.
      GF_FEATURE_TOGGLES_ENABLE: tempoSearch tempoBackendSearch tempoServiceGraph tempoApmTable traceToMetrics newTraceViewHeader metricsSummary correlations traceToProfiles tracesEmbeddedFlameGraph
    volumes:
      # Provisioning files taken from the local ./config directory.
      - ./config/grafana/provisioning/:/etc/grafana/provisioning/
      # Named volume mounted at /var/lib/grafana.
      - data-grafana:/var/lib/grafana
      # Local plugin directory mounted into Grafana's plugin folder.
      - ./config/grafana/observability-app:/var/lib/grafana/plugins/observability-app
    restart: unless-stopped

  # OpenTelemetry Collector, configured from the bind-mounted
  # otel-collector-config.yaml.
  opentelemetry-collector:
    image: "${OTEL_COLLECTOR_IMAGE_NAME}"
    command:
      - "--config=/etc/otel-collector-config.yaml"
    ports:
      # Prometheus exporter metrics
      - "${OTEL_COLLECTOR_HOST_PORT_PROMETHEUS}:8889"
      # OTLP receiver (gRPC)
      - "${OTEL_COLLECTOR_HOST_PORT_GRPC}:4317"
      # OTLP receiver (HTTP)
      - "${OTEL_COLLECTOR_HOST_PORT_HTTP}:4318"
    volumes:
      - "./config/opentelemetry-collector/otel-collector-config.yaml:/etc/otel-collector-config.yaml"

  # Promtail service, configured from the bind-mounted promtail-config.yaml.
  promtail:
    image: "${PROMTAIL_IMAGE_NAME}"
    volumes:
      # The local ./promtail_input directory appears as /var/log in the container.
      - "./promtail_input:/var/log"
      - "./config/promtail/promtail-config.yaml:/etc/promtail/promtail-config.yaml"
    command: "-config.file=/etc/promtail/promtail-config.yaml"

  # Pyroscope service, published on the host at ${PYROSCOPE_PORT}.
  pyroscope:
    image: "${PYROSCOPE_IMAGE_NAME}"
    command:
      - "-config.file=/etc/pyroscope.yml"
    volumes:
      - "./config/pyroscope/pyroscope.yml:/etc/pyroscope.yml"
    ports:
      - "${PYROSCOPE_PORT}:4040"
    environment:
      # Jaeger client settings; the agent host is the tempo service.
      JAEGER_SAMPLER_PARAM: 1
      JAEGER_SAMPLER_TYPE: "const"
      JAEGER_AGENT_HOST: "tempo"
# Named volumes used by the services above; all use default settings.
volumes:
  # Mounted by grafana at /var/lib/grafana.
  data-grafana:
  # Mounted by loki at /loki.
  data-loki:
  # Mounted by prometheus at /prometheus.
  data-prometheus:
  # Mounted by tempo-init at /var/tempo and by tempo at /tmp/tempo.
  data-tempo: