From f05ecb18370c0048d5f47362e74b684303bfe89e Mon Sep 17 00:00:00 2001 From: Stefan Mayr Date: Tue, 24 Sep 2024 20:17:31 +0200 Subject: [PATCH] Add linux plugin for nvidia-smi based checks --- agents/plugins/nvidia_smi | 27 + .../checkman/nvidia_smi_en_de_coder_util | 2 +- .../collection/checkman/nvidia_smi_gpu_util | 2 +- .../checkman/nvidia_smi_memory_util | 2 +- .../collection/checkman/nvidia_smi_power | 2 +- .../checkman/nvidia_smi_temperature | 2 +- .../agents/plugins/test_nvidia_smi.sh | 472 ++++++++++++++++++ 7 files changed, 504 insertions(+), 5 deletions(-) create mode 100755 agents/plugins/nvidia_smi create mode 100755 tests/unit-shell/agents/plugins/test_nvidia_smi.sh diff --git a/agents/plugins/nvidia_smi b/agents/plugins/nvidia_smi new file mode 100755 index 00000000000..7e81cf6f7d7 --- /dev/null +++ b/agents/plugins/nvidia_smi @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (C) 2019 Checkmk GmbH - License: GNU General Public License v2 +# This file is part of Checkmk (https://checkmk.com). It is subject to the terms and +# conditions defined in the file COPYING, which is part of this source code package. + +# Reason for this no-op: shellcheck disable=... before the first command disables the error for the +# entire script. 
+: + +# Disable unused variable error (needed to keep track of version) +# shellcheck disable=SC2034 +CMK_VERSION="2.4.0b1" + +# Function to replace "if type [somecmd]" idiom +# 'command -v' tends to be more robust vs 'which' and 'type' based tests +inpath() { + command -v "${1:?No command to test}" >/dev/null 2>&1 +} + +main() { + if inpath nvidia-smi; then + echo '<<>>' + nvidia-smi -q -x + fi +} + +[ -z "${MK_SOURCE_ONLY}" ] && main diff --git a/cmk/plugins/collection/checkman/nvidia_smi_en_de_coder_util b/cmk/plugins/collection/checkman/nvidia_smi_en_de_coder_util index a843e95f5e4..5e85cebd458 100644 --- a/cmk/plugins/collection/checkman/nvidia_smi_en_de_coder_util +++ b/cmk/plugins/collection/checkman/nvidia_smi_en_de_coder_util @@ -1,5 +1,5 @@ title: NVIDIA Graphics Card: En-/Decoder utilization -agents: windows +agents: linux, windows catalog: os/hardware license: GPLv2 distribution: check_mk diff --git a/cmk/plugins/collection/checkman/nvidia_smi_gpu_util b/cmk/plugins/collection/checkman/nvidia_smi_gpu_util index 060eaf0d2da..483ed141f43 100644 --- a/cmk/plugins/collection/checkman/nvidia_smi_gpu_util +++ b/cmk/plugins/collection/checkman/nvidia_smi_gpu_util @@ -1,5 +1,5 @@ title: NVIDIA Graphics Card: GPU utilization -agents: windows +agents: linux, windows catalog: os/hardware license: GPLv2 distribution: check_mk diff --git a/cmk/plugins/collection/checkman/nvidia_smi_memory_util b/cmk/plugins/collection/checkman/nvidia_smi_memory_util index 44c74342161..e1bd4fbea27 100644 --- a/cmk/plugins/collection/checkman/nvidia_smi_memory_util +++ b/cmk/plugins/collection/checkman/nvidia_smi_memory_util @@ -1,5 +1,5 @@ title: NVIDIA Graphics Card: Memory utilization -agents: windows +agents: linux, windows catalog: os/hardware license: GPLv2 distribution: check_mk diff --git a/cmk/plugins/collection/checkman/nvidia_smi_power b/cmk/plugins/collection/checkman/nvidia_smi_power index 0ae21771a43..4b64409a0aa 100644 --- 
a/cmk/plugins/collection/checkman/nvidia_smi_power +++ b/cmk/plugins/collection/checkman/nvidia_smi_power @@ -1,5 +1,5 @@ title: NVIDIA Graphics Card: Power usage -agents: windows +agents: linux, windows catalog: os/hardware license: GPLv2 distribution: check_mk diff --git a/cmk/plugins/collection/checkman/nvidia_smi_temperature b/cmk/plugins/collection/checkman/nvidia_smi_temperature index 1194c3c09e6..a96e3a3193b 100644 --- a/cmk/plugins/collection/checkman/nvidia_smi_temperature +++ b/cmk/plugins/collection/checkman/nvidia_smi_temperature @@ -1,5 +1,5 @@ title: NVIDIA Graphics Card: Temperature -agents: windows +agents: linux, windows catalog: os/hardware license: GPLv2 distribution: check_mk diff --git a/tests/unit-shell/agents/plugins/test_nvidia_smi.sh b/tests/unit-shell/agents/plugins/test_nvidia_smi.sh new file mode 100755 index 00000000000..c4116468e17 --- /dev/null +++ b/tests/unit-shell/agents/plugins/test_nvidia_smi.sh @@ -0,0 +1,472 @@ +#!/bin/bash +# Copyright (C) 2019 Checkmk GmbH - License: GNU General Public License v2 +# This file is part of Checkmk (https://checkmk.com). It is subject to the terms and +# conditions defined in the file COPYING, which is part of this source code package. 
+ +MK_NVIDIA_SMI_PLUGIN_PATH="${UNIT_SH_PLUGINS_DIR}/nvidia_smi" + +nvidia-smi() { + echo ' + + + + Fri Aug 4 11:44:30 2023 + 535.54.03 + 12.2 + 4 + + NVIDIA A100-SXM4-80GB + NVIDIA + Ampere + Enabled + Disabled + Disabled + None + + Enabled + Enabled + + + + 0 + 3 + 0 + + + 14 + 1 + 0 + 1 + 0 + 0 + + + + + 0 + + + + 19968 MiB + 0 MiB + 12 MiB + 19955 MiB + + + 32767 MiB + 0 MiB + 32767 MiB + + + + 1 + 4 + 0 + + + 14 + 1 + 0 + 1 + 0 + 0 + + + + + 0 + + + + 19968 MiB + 0 MiB + 12 MiB + 19955 MiB + + + 32767 MiB + 0 MiB + 32767 MiB + + + + 2 + 5 + 0 + + + 14 + 1 + 0 + 1 + 0 + 0 + + + + + 0 + + + + 19968 MiB + 0 MiB + 12 MiB + 19955 MiB + + + 32767 MiB + 0 MiB + 32767 MiB + + + + 3 + 6 + 0 + + + 14 + 1 + 0 + 1 + 0 + 0 + + + + + 0 + + + + 19968 MiB + 0 MiB + 12 MiB + 19955 MiB + + + 32767 MiB + 0 MiB + 32767 MiB + + + + Disabled + 4000 + + N/A + N/A + + 1650522003820 + GPU-513536b6-7d19-9063-b049-1e69664bb298 + 1 + 92.00.36.00.02 + No + 0x100 + 692-2G506-0212-002 + 20B2-895-A1 + N/A + 4 + + G506.0212.00.01 + 2.0 + 6.16 + N/A + + + N/A + N/A + + 535.54.03 + + None + N/A + + + No + No + + + N/A + + + 01 + 00 + 0000 + 20B210DE + 00000000:01:00.0 + 147F10DE + + + 4 + 4 + 4 + 4 + 4 + + + 16x + 16x + + + + N/A + N/A + + 0 + 0 + 4000 KB/s + 0 KB/s + N/A + N/A + + N/A + P0 + + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + + + 81920 MiB + 869 MiB + 50 MiB + 80999 MiB + + + 131072 MiB + 1 MiB + 131071 MiB + + + 0 MiB + 0 MiB + 0 MiB + + Default + + N/A + N/A + N/A + N/A + N/A + N/A + + + 0 + 0 + 0 + + + 0 + 0 + 0 + + + Enabled + Enabled + + + + 0 + 0 + 0 + 0 + + + 0 + 0 + 0 + 0 + + + + + N/A + N/A + + + N/A + N/A + + N/A + N/A + + N/A + + 27 C + N/A + 92 C + 89 C + 85 C + N/A + 44 C + 95 C + + + N/A + N/A + + + P0 + 67.03 W + 500.00 W + 500.00 W + 500.00 W + 100.00 W + 500.00 W + + + P0 + N/A + N/A + N/A + N/A + N/A + N/A + + + 1275 MHz + 1275 MHz + 1593 MHz + 1275 MHz + + + 1275 MHz + 1593 MHz + + + 1275 MHz 
+ 1593 MHz + + + N/A + + + 1410 MHz + 1410 MHz + 1593 MHz + 1290 MHz + + + 1410 MHz + + + N/A + N/A + + + 912.500 mV + + + N/A + N/A + + + + 1593 MHz + 1410 MHz + 1395 MHz + 1380 MHz + 1365 MHz + 1350 MHz + 1335 MHz + 1320 MHz + 1305 MHz + 1290 MHz + 1275 MHz + 1260 MHz + 1245 MHz + 1230 MHz + 1215 MHz + 1200 MHz + 1185 MHz + 1170 MHz + 1155 MHz + 1140 MHz + 1125 MHz + 1110 MHz + 1095 MHz + 1080 MHz + 1065 MHz + 1050 MHz + 1035 MHz + 1020 MHz + 1005 MHz + 990 MHz + 975 MHz + 960 MHz + 945 MHz + 930 MHz + 915 MHz + 900 MHz + 885 MHz + 870 MHz + 855 MHz + 840 MHz + 825 MHz + 810 MHz + 795 MHz + 780 MHz + 765 MHz + 750 MHz + 735 MHz + 720 MHz + 705 MHz + 690 MHz + 675 MHz + 660 MHz + 645 MHz + 630 MHz + 615 MHz + 600 MHz + 585 MHz + 570 MHz + 555 MHz + 540 MHz + 525 MHz + 510 MHz + 495 MHz + 480 MHz + 465 MHz + 450 MHz + 435 MHz + 420 MHz + 405 MHz + 390 MHz + 375 MHz + 360 MHz + 345 MHz + 330 MHz + 315 MHz + 300 MHz + 285 MHz + 270 MHz + 255 MHz + 240 MHz + 225 MHz + 210 MHz + + + + + +' +} + +test_nvidia_smi_plugin() { + # shellcheck source=agents/plugins/nvidia_smi + response=$(. "$MK_NVIDIA_SMI_PLUGIN_PATH") + assertEquals "XML output" "<<>> +$(nvidia-smi)" "$response" +} + +# shellcheck disable=SC1090 # Can't follow +. "$UNIT_SH_SHUNIT2"