From 3345790c8d602dd2eed03486dde5fc96175d581e Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 8 Aug 2024 19:41:09 +0200 Subject: [PATCH] Change TPU Metrics Source for Autoscaling (#770) first commit --- .../hpa.jetstream.yaml.tftpl | 6 +++++- .../templates/prometheus-adapter/values.yaml.tftpl | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/modules/jetstream-maxtext-deployment/templates/custom-metrics-stackdriver-adapter/hpa.jetstream.yaml.tftpl b/modules/jetstream-maxtext-deployment/templates/custom-metrics-stackdriver-adapter/hpa.jetstream.yaml.tftpl index b70218558..414cc2432 100644 --- a/modules/jetstream-maxtext-deployment/templates/custom-metrics-stackdriver-adapter/hpa.jetstream.yaml.tftpl +++ b/modules/jetstream-maxtext-deployment/templates/custom-metrics-stackdriver-adapter/hpa.jetstream.yaml.tftpl @@ -24,7 +24,11 @@ spec: - type: External external: metric: - name: kubernetes.io|node|accelerator|${rule.target_query} + name: prometheus.googleapis.com|${rule.target_query}|gauge + selector: + matchLabels: + metric.labels.container: jetstream-http + metric.labels.exported_namespace: default target: type: AverageValue averageValue: ${rule.average_value_target} diff --git a/modules/jetstream-maxtext-deployment/templates/prometheus-adapter/values.yaml.tftpl b/modules/jetstream-maxtext-deployment/templates/prometheus-adapter/values.yaml.tftpl index a07058dee..b1091fe9f 100644 --- a/modules/jetstream-maxtext-deployment/templates/prometheus-adapter/values.yaml.tftpl +++ b/modules/jetstream-maxtext-deployment/templates/prometheus-adapter/values.yaml.tftpl @@ -29,10 +29,10 @@ rules: matches: "" as: "jetstream_slots_used_percentage" metricsQuery: avg(<<.Series>>{<<.LabelMatchers>>,cluster="${cluster_name}"}) - - seriesQuery: 'kubernetes_io:node_accelerator_memory_used' + - seriesQuery: 'memory_used' resources: template: <<.Resource>> name: matches: "" as: "memory_used_percentage" - metricsQuery: avg(kubernetes_io:node_accelerator_memory_used{cluster_name="${cluster_name}"}) / avg(kubernetes_io:node_accelerator_memory_total{cluster_name="${cluster_name}"}) \ No newline at end of file + metricsQuery: avg(memory_used{cluster="${cluster_name}",exported_namespace="default",container="jetstream-http"}) / avg(memory_total{cluster="${cluster_name}",exported_namespace="default",container="jetstream-http"}) \ No newline at end of file