Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

issue:4043385: [UFM Events Grafana Dashboard Plugin]: Collecting the telemetry and write it to Prometheus DB #240

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 21 additions & 10 deletions plugins/ufm_events_grafana_dashboard_plugin/build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,19 @@ ARG BASE_PATH=/opt/ufm/ufm_plugin_${PLUGIN_NAME}
ARG SRC_BASE_DIR=${PLUGIN_NAME}_plugin
ARG ETC_ALTERNATIVE_PATH=/var/etc
ARG SUPERVISOR_PATH=${ETC_ALTERNATIVE_PATH}/supervisor
ARG LOKI_VERSION=3.1.0
boazhaim marked this conversation as resolved.
Show resolved Hide resolved
ARG PROMETHEUS_VERSION=2.54.0
ARG GRAFANA_VERSION=11.1.0
ENV DEBIAN_FRONTEND=noninteractive
ENV REQUIRED_UFM_VERSION=6.12.0

COPY ${SRC_BASE_DIR}/ ${BASE_PATH}/${SRC_BASE_DIR}/
COPY utils/config_parser.py utils/singleton.py utils/logger.py ${BASE_PATH}/utils/
COPY ${SRC_BASE_DIR}/scripts/ /

RUN apt-get update && apt-get upgrade -y && \
# Install plugin dependencies
apt-get install -y supervisor vim tzdata wget unzip curl \
apt-get install -y supervisor vim tzdata wget unzip curl python3 python3-pip \
# Install Fluentd prerequisites
gnupg build-essential ruby ruby-dev \
# Install Grafana prerequisites
Expand All @@ -27,25 +31,32 @@ RUN apt-get update && apt-get upgrade -y && \
apt-get remove --purge -y ruby-dev build-essential && \
apt-get autoremove -y && \
# Install Loki
wget https://github.com/grafana/loki/releases/download/v3.1.0/loki-linux-amd64.zip && \
wget https://github.com/grafana/loki/releases/download/v"${LOKI_VERSION}"/loki-linux-amd64.zip && \
unzip loki-linux-amd64.zip && \
mv loki-linux-amd64 /usr/local/bin/loki && \
rm loki-linux-amd64.zip && \
# Install Prometheus
wget https://github.com/prometheus/prometheus/releases/download/v"${PROMETHEUS_VERSION}"/prometheus-"${PROMETHEUS_VERSION}".linux-amd64.tar.gz && \
tar -xvf prometheus-"${PROMETHEUS_VERSION}".linux-amd64.tar.gz && \
mv prometheus-"${PROMETHEUS_VERSION}".linux-amd64/prometheus /usr/local/bin/prometheus && \
rm -rf prometheus-"${PROMETHEUS_VERSION}".linux-amd64.tar.gz && \
# Install Grafana
wget https://dl.grafana.com/oss/release/grafana_11.1.0_amd64.deb && \
dpkg -i grafana_11.1.0_amd64.deb && \
rm grafana_11.1.0_amd64.deb && \
wget https://dl.grafana.com/oss/release/grafana_"${GRAFANA_VERSION}"_amd64.deb && \
dpkg -i grafana_"${GRAFANA_VERSION}"_amd64.deb && \
rm grafana_"${GRAFANA_VERSION}"_amd64.deb && \
# Final cleanup
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# move /etc/supervisor from the /etc, /etc dir will be overridden by the shared volume
RUN mkdir -p ${ETC_ALTERNATIVE_PATH} && mv /etc/supervisor ${ETC_ALTERNATIVE_PATH}
# install the python packages
RUN python3 -m pip install -r ${BASE_PATH}/${SRC_BASE_DIR}/src/${PLUGIN_NAME}/requirements.txt

RUN sed -i "s|/etc/supervisor/conf.d/\*.conf|${SUPERVISOR_PATH}/conf.d/\*.conf|g" ${SUPERVISOR_PATH}/supervisord.conf
# move /etc/supervisor from the /etc, /etc dir will be overridden by the shared volume
RUN mkdir -p ${ETC_ALTERNATIVE_PATH} && \
mv /etc/supervisor ${ETC_ALTERNATIVE_PATH} && \
sed -i "s|/etc/supervisor/conf.d/\*.conf|${SUPERVISOR_PATH}/conf.d/\*.conf|g" ${SUPERVISOR_PATH}/supervisord.conf

# Copy Supervisor configuration file
COPY ${SRC_BASE_DIR}/conf/supervisord.conf ${SUPERVISOR_PATH}/conf.d/

# Start services using supervisord
CMD ["/usr/bin/supervisord", "-c", "/var/etc/supervisor/supervisord.conf"]
CMD ["/bin/bash", "-c", "/entrypoint.sh"]
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ echo ${IMAGE_VERSION} > ../../${PLUGIN_NAME}_plugin/version

BUILD_DIR=$(create_out_dir)
cp Dockerfile ${BUILD_DIR}
cp -r ../../../utils ${BUILD_DIR}
cp -r ../../${PLUGIN_NAME}_plugin ${BUILD_DIR}

echo "BUILD_DIR : [${BUILD_DIR}]"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
global:
scrape_interval: 60s

scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: "prometheus"

# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.

static_configs:
- targets: ["127.0.0.1:@@PROMETHEUS_PORT@@"]

metric_relabel_configs:
# Keep only the specified metrics and drop all others.
- source_labels: [__name__]
regex: '^(prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_compaction_chunk_size_bytes_sum|prometheus_tsdb_compaction_chunk_samples_sum|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_wal_storage_size_bytes)$'
action: keep
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@ logfile=/opt/ufm/files/log/plugins/ufm_events_grafana_dashboard/supervisord.log
logfile_backups=5
logfile_maxbytes=1048576

[program:collector_service]
directory=/opt/ufm/ufm_plugin_ufm_events_grafana_dashboard
command=python3 /opt/ufm/ufm_plugin_ufm_events_grafana_dashboard/ufm_events_grafana_dashboard_plugin/src/ufm_events_grafana_dashboard/app.py
user=root
priority=100
autostart=true
autorestart=true
startretries=1
startsecs=1
killasgroup=true
stopasgroup=true

[program:loki]
command=/usr/local/bin/loki -config.file=/config/loki/loki-local-config.yaml
user=root
Expand All @@ -20,10 +32,24 @@ stderr_logfile_maxbytes=1048576
stdout_logfile_backups=5
stderr_logfile_backups=5

[program:prometheus_server]
command=/usr/local/bin/prometheus --enable-feature=memory-snapshot-on-shutdown --web.enable-remote-write-receiver --web.enable-lifecycle --storage.tsdb.retention.time=@@prometheus_db_data_retention_time@@ --storage.tsdb.retention.size=@@prometheus_db_data_retention_size@@ --storage.tsdb.path=@@prometheus_db_folder@@ --config.file=@@prometheus_config_file@@ --web.listen-address=@@prometheus_ip@@:@@prometheus_port@@
user=root
autostart=true
autorestart=true
startretries=1
startsecs=1
stdout_logfile=/opt/ufm/files/log/plugins/ufm_events_grafana_dashboard/prometheus.log
stderr_logfile=/opt/ufm/files/log/plugins/ufm_events_grafana_dashboard/prometheus.log
stdout_logfile_maxbytes=1048576
stderr_logfile_maxbytes=1048576
stdout_logfile_backups=5
stderr_logfile_backups=5

[program:fluentd]
command=/usr/local/bin/fluentd -c /config/fluentd/fluentd.conf
user=root
priority=200
priority=250
autostart=true
autorestart=true
startretries=1
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[telemetry]
url=http://127.0.0.1:9002/csv/xcset/low_freq_debug
interval=300
enabled=True
labels_to_export_to_prometheus=Node_GUID,port_guid,Port_Number,Device_ID,node_description,link_partner_node_guid,link_partner_port_num,link_partner_description
metrics_to_export_to_prometheus=Link_Down

[prometheus]
prometheus_ip=0.0.0.0
prometheus_port=9292
prometheus_db_data_retention_size=500MB
prometheus_db_data_retention_time=15d

[logs-config]
logs_file_name = /opt/ufm/files/log/plugins/ufm_events_grafana_dashboard/plugin_console.log
logs_level = INFO
log_file_max_size = 10485760
log_file_backup_count = 5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#
# Copyright © 2013-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# This software product is a proprietary product of Nvidia Corporation and its affiliates
# (the "Company") and all right, title, and interest in and to the software
# product, including all associated intellectual property rights, are and
# shall remain exclusively with the Company.
#
# This software product is governed by the End User License Agreement
# provided with the software product.
#
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#
# Copyright © 2013-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# This software product is a proprietary product of Nvidia Corporation and its affiliates
# (the "Company") and all right, title, and interest in and to the software
# product, including all associated intellectual property rights, are and
# shall remain exclusively with the Company.
#
# This software product is governed by the End User License Agreement
# provided with the software product.
#
import os
import sys
sys.path.append(os.getcwd())

import time

import data.manager as dm
from utils.logger import Logger, LOG_LEVELS
from mgr.configurations_mgr import UFMEventsGrafanaConfigParser
from data.collectors.collectors_mgr import CollectorMgr


def _init_logs(config_parser: UFMEventsGrafanaConfigParser) -> None:
    """Configure the plugin logger from the settings held by ``config_parser``.

    The log file name, log level, maximum log file size and backup count are
    read from the parser and handed, in that order, to
    ``Logger.init_logs_config``.

    :param config_parser: parser exposing the ``get_logs_*`` / ``get_log_file_*``
        accessors used below.
    """
    Logger.init_logs_config(
        config_parser.get_logs_file_name(),
        config_parser.get_logs_level(),
        config_parser.get_log_file_max_size(),
        config_parser.get_log_file_backup_count(),
    )


if __name__ == '__main__':
    # Script entry point: load the configuration, set up logging, create the
    # data manager and the collectors manager, then keep the process alive.
    conf = None
    try:
        conf = UFMEventsGrafanaConfigParser.getInstance()
        _init_logs(conf)
        data_mgr = dm.DataManager()
        # The manager is bound to a name so it stays referenced while the
        # process runs.
        collector_mgr = CollectorMgr(data_manager=data_mgr)
        # Block the main thread forever.
        # NOTE(review): presumably the collectors run in the background once
        # CollectorMgr is constructed - confirm against CollectorMgr.
        while True:
            time.sleep(1)
    except Exception as ex:
        # Any failure (including ValueError) is reported with the same
        # message, so a single handler is sufficient.
        Logger.log_message(f'Error occurred during the plugin initialization process : {str(ex)}',
                           LOG_LEVELS.ERROR)
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#
# Copyright © 2013-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# This software product is a proprietary product of Nvidia Corporation and its affiliates
# (the "Company") and all right, title, and interest in and to the software
# product, including all associated intellectual property rights, are and
# shall remain exclusively with the Company.
#
# This software product is governed by the End User License Agreement
# provided with the software product.
#
from enum import Enum


class DataType(Enum):
    """Enumeration of the data types known to the plugin."""

    # Currently the only declared data type.
    TELEMETRY = 1


class ModelListeners(Enum):
    """Enumeration of the model listener identifiers."""

    # Currently the only declared listener.
    TELEMETRY_PROMETHEUS_EXPORTER = 1


class Prometheus:
    """String constants grouped under the Prometheus name."""

    # NOTE(review): these look like dictionary keys for exported samples -
    # confirm against the code that consumes them.
    LABELS = "labels"
    COUNTER_VALUE = "counter_value"
    TIMESTAMP = "timestamp"

Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#
# Copyright © 2013-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# This software product is a proprietary product of Nvidia Corporation and its affiliates
# (the "Company") and all right, title, and interest in and to the software
# product, including all associated intellectual property rights, are and
# shall remain exclusively with the Company.
#
# This software product is governed by the End User License Agreement
# provided with the software product.
#
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#
# Copyright © 2013-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# This software product is a proprietary product of Nvidia Corporation and its affiliates
# (the "Company") and all right, title, and interest in and to the software
# product, including all associated intellectual property rights, are and
# shall remain exclusively with the Company.
#
# This software product is governed by the End User License Agreement
# provided with the software product.
#
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#
# Copyright © 2013-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# This software product is a proprietary product of Nvidia Corporation and its affiliates
# (the "Company") and all right, title, and interest in and to the software
# product, including all associated intellectual property rights, are and
# shall remain exclusively with the Company.
#
# This software product is governed by the End User License Agreement
# provided with the software product.
#
import json
import httpx

from abc import ABC, abstractmethod
from typing import Any

from mgr.configurations_mgr import UFMEventsGrafanaConfigParser
from utils.logger import Logger, LOG_LEVELS
from data.models.base_model import BaseModel


class BaseCollector(ABC):
    """Abstract collector that gathers data for a model at a given interval."""

    def __init__(self, model: BaseModel, is_enabled: bool, interval: int):
        """Store the collector settings after validating ``interval``.

        :param model: model object kept on the collector as ``self.model``.
        :param is_enabled: flag kept on the collector as ``self.is_enabled``.
        :param interval: collection interval; must be a non-negative ``int``.
            NOTE(review): the unit is not visible here - confirm with the caller.
        :raises RuntimeError: if ``interval`` is not an ``int`` or is negative.
        """
        interval_is_valid = isinstance(interval, int) and interval >= 0
        if not interval_is_valid:
            raise RuntimeError(f"Invalid interval value {interval}. Please use non-negative int values")
        self.model = model
        self.interval = interval
        self.is_enabled = is_enabled

    @abstractmethod
    async def collect(self) -> None:
        """Collect the data; every concrete collector must implement this coroutine."""


class HttpCollector(BaseCollector):
    """Collector that obtains its data with an HTTP GET request."""

    def __init__(self, model: BaseModel, is_enabled: bool,
                 interval: int, url: str, jsonify: bool = False):
        """Initialise the collector.

        :param model: model that receives the collected payload via ``on_data``.
        :param is_enabled: flag stored by the base collector.
        :param interval: collection interval, validated by the base collector.
        :param url: endpoint that is queried on every collection.
        :param jsonify: when True the response body is decoded from JSON,
            otherwise the raw response text is used.
        """
        super().__init__(model, is_enabled, interval)
        self.url = url
        self.jsonify = jsonify

    async def collect(self):
        """Fetch the payload from ``self.url`` and hand it to the model.

        When the GET request fails, ``do_http_get`` logs the error and yields
        ``None``; in that case the model is left untouched instead of being
        fed a ``None`` payload. (A JSON ``null`` body decodes to ``None`` as
        well and is therefore skipped too.)
        Errors raised while the model processes the data are logged, never
        propagated.
        """
        try:
            data = await self.do_http_get()
            if data is None:
                # Nothing was fetched; the failure has already been logged.
                return
            if self.model:
                self.model.on_data(data)
        except Exception as ex:
            error_msg = f"Failed to collect data from {self.url} : {ex}"
            Logger.log_message(error_msg, LOG_LEVELS.ERROR)

    async def do_http_get(self) -> Any:
        """Perform an HTTP GET request against ``self.url``.

        :return: the decoded JSON document when ``self.jsonify`` is set, the
            response text otherwise, or ``None`` when the request fails for
            any reason (the failure is logged, not raised).
        """
        # NOTE(review): verify=False disables TLS certificate verification;
        # confirm this is intended for every URL this collector may be given.
        async with httpx.AsyncClient(verify=False) as client:
            try:
                Logger.log_message(f'Requesting URL: {self.url}', LOG_LEVELS.DEBUG)
                response = await client.get(self.url)
                Logger.log_message(f'Requesting URL: {self.url} '
                                   f'completed with status [{str(response.status_code)}]', LOG_LEVELS.DEBUG)
                # Turn 4xx/5xx responses into an exception handled below.
                response.raise_for_status()
                if self.jsonify:
                    return json.loads(response.text)
                return response.text
            except Exception as ex:
                # Connection errors and every other failure are reported the
                # same way, so one handler covers both.
                error_msg = f"Failed to GET from {self.url} : {ex}"
                Logger.log_message(error_msg, LOG_LEVELS.ERROR)
                return None


class TelemetryHttpCollector(HttpCollector):
    """HTTP collector whose settings come from the telemetry configuration."""

    def __init__(self, model: BaseModel):
        """Build the collector from the plugin configuration singleton.

        The enabled flag, URL and interval are read from
        ``UFMEventsGrafanaConfigParser`` and forwarded to ``HttpCollector``.

        :param model: model that receives the collected telemetry data.
        """
        config = UFMEventsGrafanaConfigParser.getInstance()
        super().__init__(
            model=model,
            is_enabled=config.get_telemetry_enabled(),
            url=config.get_telemetry_url(),
            interval=config.get_telemetry_interval(),
        )
Loading