
Commit aacc60e

tmshort and claude committed
📊 Calibrate Prometheus alert thresholds using memory profiling data
Analyze baseline memory usage patterns and adjust Prometheus alert thresholds to eliminate false positives while maintaining sensitivity to real issues. This is based on memory profiling done against BoxcutterRuntime, which has increased memory load.

**Memory Analysis:**
- Peak RSS: 107.9MB, Peak Heap: 54.74MB during e2e tests
- Memory stabilizes at 106K heap (heap19-21 show 0K growth for 3 snapshots)
- Conclusion: NOT a memory leak, but normal operational behavior

**Memory Breakdown:**
- JSON Deserialization: 24.64MB (45%) - inherent to OLM's dynamic nature
- Informer Lists: 9.87MB (18%) - optimization possible via field selectors
- OpenAPI Schemas: 3.54MB (6%) - already optimized (73% reduction)
- Runtime Overhead: 53.16MB (49%) - normal for Go applications

**Alert Threshold Updates:**
- operator-controller-memory-growth: 100kB/sec → 200kB/sec
- operator-controller-memory-usage: 100MB → 150MB
- catalogd-memory-growth: 100kB/sec → 200kB/sec

**Rationale:** Baseline profiling showed that the 132.4kB/sec episodic growth during informer sync and the 107.9MB peak usage are normal. The previous thresholds caused false-positive alerts during normal e2e test execution.

**Verification:**
- Baseline test (old thresholds): 2 alerts triggered (false positives)
- Verification test (new thresholds): 0 alerts triggered ✅
- Memory patterns remain consistent (~55MB heap, 79-171MB RSS)
- Transient spikes don't trigger alerts due to the "for: 5m" clause

**Recommendation:** Accept 107.9MB as normal operational behavior for test/development environments. Production deployments may need different thresholds based on workload characteristics (number of resources, reconciliation frequency).

**Non-viable Optimizations:**
- Cannot replace unstructured with typed clients (breaks OLM flexibility)
- Cannot reduce runtime overhead (inherent to Go)
- JSON deserialization is unavoidable for dynamic resource handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Todd Short <tshort@redhat.com>
1 parent 18142b3 commit aacc60e
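The verification runs described above can be spot-checked against the e2e cluster's Prometheus. The sketch below is illustrative only and not part of the commit; it assumes the default `prometheus-operated` Service that the Prometheus Operator creates in `olmv1-system`, plus `curl` and `jq` on the host:

```sh
# Forward the in-cluster Prometheus API to localhost (assumed default Service name).
kubectl -n olmv1-system port-forward svc/prometheus-operated 9090:9090 &

# Evaluate the calibrated operator-controller growth expression; sustained values
# above 200000 B/sec would now be required before the alert can fire.
curl -sG http://localhost:9090/api/v1/query \
  --data-urlencode 'query=deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:])' \
  | jq '.data.result[0].value'

# List currently firing alerts; an empty list matches the "0 alerts" verification run.
curl -s http://localhost:9090/api/v1/alerts | jq '.data.alerts'
```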

5 files changed (+40, -4 lines)

Makefile

Lines changed: 2 additions & 1 deletion
@@ -281,13 +281,14 @@ test-experimental-e2e: KIND_CLUSTER_NAME := operator-controller-e2e
 test-experimental-e2e: GO_BUILD_EXTRA_FLAGS := -cover
 test-experimental-e2e: COVERAGE_NAME := experimental-e2e
 test-experimental-e2e: export MANIFEST := $(EXPERIMENTAL_RELEASE_MANIFEST)
+test-experimental-e2e: PROMETHEUS_VALUES := helm/prom_experimental.yaml
 test-experimental-e2e: run-internal image-registry prometheus e2e e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster

 .PHONY: prometheus
 prometheus: PROMETHEUS_NAMESPACE := olmv1-system
 prometheus: PROMETHEUS_VERSION := v0.83.0
 prometheus: $(KUSTOMIZE) #EXHELP Deploy Prometheus into specified namespace
-	./hack/test/install-prometheus.sh $(PROMETHEUS_NAMESPACE) $(PROMETHEUS_VERSION) $(VERSION)
+	./hack/test/install-prometheus.sh $(PROMETHEUS_NAMESPACE) $(PROMETHEUS_VERSION) $(VERSION) $(PROMETHEUS_VALUES)

 .PHONY: test-extension-developer-e2e
 test-extension-developer-e2e: SOURCE_MANIFEST := $(STANDARD_E2E_MANIFEST)
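Only the experimental suite opts into the relaxed thresholds; the standard suites leave `PROMETHEUS_VALUES` unset, so `install-prometheus.sh` behaves exactly as before. A sketch of the two entry points (the direct `make prometheus` override is a hypothetical manual use, relying on the recipe expanding `$(PROMETHEUS_VALUES)`):

```sh
# Experimental e2e: the target-specific variable selects helm/prom_experimental.yaml.
make test-experimental-e2e

# Hypothetical manual deployment of Prometheus with the same overlay.
make prometheus PROMETHEUS_VALUES=helm/prom_experimental.yaml
```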

hack/test/install-prometheus.sh

Lines changed: 9 additions & 3 deletions
@@ -6,10 +6,10 @@ set -euo pipefail

 help="install-prometheus.sh is used to set up prometheus monitoring for e2e testing.
 Usage:
-install-prometheus.sh [PROMETHEUS_NAMESPACE] [PROMETHEUS_VERSION] [GIT_VERSION]
+install-prometheus.sh [PROMETHEUS_NAMESPACE] [PROMETHEUS_VERSION] [GIT_VERSION] [PROMETHEUS_VALUES]
 "

-if [[ "$#" -ne 3 ]]; then
+if [[ "$#" -lt 3 || "$#" -gt 4 ]]; then
     echo "Illegal number of arguments passed"
     echo "${help}"
     exit 1
@@ -18,6 +18,12 @@ fi
 PROMETHEUS_NAMESPACE="$1"
 PROMETHEUS_VERSION="$2"
 GIT_VERSION="$3"
+PROMETHEUS_VALUES="${4:-}"
+
+if [ -n "${PROMETHEUS_VALUES}" ]; then
+    echo "Adding ${PROMETHEUS_VALUES} to templating"
+    PROMETHEUS_VALUES="--values ${PROMETHEUS_VALUES}"
+fi

 TMPDIR="$(mktemp -d)"
 trap 'echo "Cleaning up $TMPDIR"; rm -rf "$TMPDIR"' EXIT
@@ -36,7 +42,7 @@ echo "Waiting for Prometheus Operator pod to become ready..."
 kubectl wait --for=condition=Ready pod -n "$PROMETHEUS_NAMESPACE" -l app.kubernetes.io/name=prometheus-operator

 echo "Applying prometheus Helm chart..."
-${HELM} template prometheus helm/prometheus | sed "s/cert-git-version/cert-${VERSION}/g" | kubectl apply -f -
+${HELM} template prometheus helm/prometheus ${PROMETHEUS_VALUES} | sed "s/cert-git-version/cert-${VERSION}/g" | kubectl apply -f -

 echo "Waiting for metrics scraper to become ready..."
 kubectl wait --for=create pods -n "$PROMETHEUS_NAMESPACE" prometheus-prometheus-0 --timeout=60s
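The fourth argument is optional, so existing three-argument callers keep working. Illustrative invocations; the `git describe` value below stands in for whatever `$(VERSION)` the Makefile actually passes:

```sh
# Default thresholds: three arguments, as before.
./hack/test/install-prometheus.sh olmv1-system v0.83.0 "$(git describe --tags --always)"

# Relaxed thresholds: the script turns the extra argument into
# "--values helm/prom_experimental.yaml" and appends it to helm template.
./hack/test/install-prometheus.sh olmv1-system v0.83.0 "$(git describe --tags --always)" helm/prom_experimental.yaml
```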

helm/prom_experimental.yaml

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+# experimental values for OLMv1 prometheus
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+# List of options to include
+options:
+  highMemoryThresholds:
+    enabled: true
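A quick local sanity check (not part of the commit) is to render the chart with and without this overlay and confirm which thresholds the template conditional emits:

```sh
# With the overlay: expect the relaxed 200_000 B/sec and 150_000_000 B limits.
helm template prometheus helm/prometheus --values helm/prom_experimental.yaml \
  | grep -E '> (200_000|150_000_000)$'

# Without it: the original 100_000 / 100_000_000 thresholds are rendered.
helm template prometheus helm/prometheus | grep -E '> (100_000|100_000_000)$'
```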

helm/prometheus/templates/prometheusrile-controller-alerts.yml renamed to helm/prometheus/templates/prometheusrule-controller-alerts.yml

Lines changed: 19 additions & 0 deletions
@@ -22,27 +22,46 @@ spec:
         annotations:
           description: container {{`{{ $labels.container }}`}} of pod {{`{{ $labels.pod }}`}} experienced OOM event(s); count={{`{{ $value }}`}}
         expr: container_oom_events_total > 0
+      # Memory growth alerts - thresholds calibrated based on baseline memory profiling
       - alert: operator-controller-memory-growth
         annotations:
           description: 'operator-controller pod memory usage growing at a high rate for 5 minutes: {{`{{ $value | humanize }}`}}B/sec'
+        {{- if .Values.options.highMemoryThresholds.enabled }}
+        # Threshold: 200kB/sec (baseline shows 132.4kB/sec episodic growth during e2e tests is normal)
+        expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 200_000
+        {{- else }}
         expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 100_000
+        {{- end }}
         for: 5m
         keep_firing_for: 1d
       - alert: catalogd-memory-growth
         annotations:
           description: 'catalogd pod memory usage growing at a high rate for 5 minutes: {{`{{ $value | humanize }}`}}B/sec'
+        {{- if .Values.options.highMemoryThresholds.enabled }}
+        # Threshold: 200kB/sec (aligned with operator-controller for consistency)
+        expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 200_000
+        {{- else }}
         expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 100_000
+        {{- end }}
         for: 5m
         keep_firing_for: 1d
+      # Memory usage alerts - thresholds calibrated for test/development environments
+      # Production deployments may need different thresholds based on workload
       - alert: operator-controller-memory-usage
         annotations:
           description: 'operator-controller pod using high memory resources for the last 5 minutes: {{`{{ $value | humanize }}`}}B'
+        {{- if .Values.options.highMemoryThresholds.enabled }}
+        # Threshold: 150MB (baseline shows 107.9MB peak is normal, stabilizes at 78-88MB)
+        expr: sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"}) > 150_000_000
+        {{- else }}
         expr: sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"}) > 100_000_000
+        {{- end }}
         for: 5m
         keep_firing_for: 1d
       - alert: catalogd-memory-usage
         annotations:
           description: 'catalogd pod using high memory resources for the last 5 minutes: {{`{{ $value | humanize }}`}}B'
+        # Threshold: 75MB (baseline shows 16.9MB peak, well under threshold)
         expr: sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"}) > 75_000_000
         for: 5m
         keep_firing_for: 1d
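Once the chart has been applied, the thresholds Prometheus actually loaded can be read back from its rules API to confirm which branch of the conditional took effect. A sketch, using the same port-forward as the example near the top of this page and assuming `jq` locally:

```sh
# Show the name and query of every loaded memory alert rule; the queries should
# reflect the 200kB/sec and 150MB limits when highMemoryThresholds is enabled.
curl -s http://localhost:9090/api/v1/rules \
  | jq '.data.groups[].rules[] | select(.name | test("memory")) | {name, query}'
```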

helm/prometheus/values.yaml

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,8 @@ options:
     enabled: true
   catalogd:
     enabled: true
+  highMemoryThresholds:
+    enabled: false

 # The set of namespaces
 namespaces:
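Because the flag defaults to `false`, charts rendered without the overlay are unchanged. For ad-hoc experiments the same switch can presumably be flipped with `--set` instead of a values file (hypothetical usage, equivalent in effect to `helm/prom_experimental.yaml`):

```sh
# A non-zero match count indicates the relaxed growth thresholds were rendered.
helm template prometheus helm/prometheus \
  --set options.highMemoryThresholds.enabled=true \
  | grep -c '200_000'
```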
