Make the Prometheus alert thresholds configurable through Helm values.

* helm/prometheus/values.yaml gains per-component `thresholds` whose defaults
  equal the literals that were previously hard-coded in the PrometheusRule
  template; the template now reads them from .Values.
* helm/prom_experimental.yaml overrides the memory thresholds. The
  test-experimental-e2e target passes it to install-prometheus.sh as an
  optional fourth argument, which the script forwards to
  `helm template --values`.
* The alert template is renamed (prometheusrile -> prometheusrule).
* install-prometheus.sh substitutes the cert name from GIT_VERSION, the
  script's third argument.

diff --git a/Makefile b/Makefile
index cf7b1d6508..a310414f08 100644
--- a/Makefile
+++ b/Makefile
@@ -281,13 +281,14 @@ test-experimental-e2e: KIND_CLUSTER_NAME := operator-controller-e2e
 test-experimental-e2e: GO_BUILD_EXTRA_FLAGS := -cover
 test-experimental-e2e: COVERAGE_NAME := experimental-e2e
 test-experimental-e2e: export MANIFEST := $(EXPERIMENTAL_RELEASE_MANIFEST)
+test-experimental-e2e: PROMETHEUS_VALUES := helm/prom_experimental.yaml
 test-experimental-e2e: run-internal image-registry prometheus e2e e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster
 
 .PHONY: prometheus
 prometheus: PROMETHEUS_NAMESPACE := olmv1-system
 prometheus: PROMETHEUS_VERSION := v0.83.0
 prometheus: $(KUSTOMIZE) #EXHELP Deploy Prometheus into specified namespace
-	./hack/test/install-prometheus.sh $(PROMETHEUS_NAMESPACE) $(PROMETHEUS_VERSION) $(VERSION)
+	./hack/test/install-prometheus.sh $(PROMETHEUS_NAMESPACE) $(PROMETHEUS_VERSION) $(VERSION) $(PROMETHEUS_VALUES)
 
 .PHONY: test-extension-developer-e2e
 test-extension-developer-e2e: SOURCE_MANIFEST := $(STANDARD_E2E_MANIFEST)
diff --git a/hack/test/install-prometheus.sh b/hack/test/install-prometheus.sh
index c9d7e0b1c4..f458b2d012 100755
--- a/hack/test/install-prometheus.sh
+++ b/hack/test/install-prometheus.sh
@@ -6,10 +6,10 @@ set -euo pipefail
 
 help="install-prometheus.sh is used to set up prometheus monitoring for e2e testing.
 Usage:
-  install-prometheus.sh [PROMETHEUS_NAMESPACE] [PROMETHEUS_VERSION] [GIT_VERSION]
+  install-prometheus.sh [PROMETHEUS_NAMESPACE] [PROMETHEUS_VERSION] [GIT_VERSION] [PROMETHEUS_VALUES]
 "
 
-if [[ "$#" -ne 3 ]]; then
+if [[ "$#" -lt 3 || "$#" -gt 4 ]]; then
     echo "Illegal number of arguments passed"
     echo "${help}"
     exit 1
@@ -18,6 +18,12 @@ fi
 PROMETHEUS_NAMESPACE="$1"
 PROMETHEUS_VERSION="$2"
 GIT_VERSION="$3"
+PROMETHEUS_VALUES="${4:-}"
+
+if [ -n "${PROMETHEUS_VALUES}" ]; then
+    echo "Adding ${PROMETHEUS_VALUES} to templating"
+    PROMETHEUS_VALUES="--values ${PROMETHEUS_VALUES}"
+fi
 
 TMPDIR="$(mktemp -d)"
 trap 'echo "Cleaning up $TMPDIR"; rm -rf "$TMPDIR"' EXIT
@@ -36,7 +42,7 @@ echo "Waiting for Prometheus Operator pod to become ready..."
 kubectl wait --for=condition=Ready pod -n "$PROMETHEUS_NAMESPACE" -l app.kubernetes.io/name=prometheus-operator
 
 echo "Applying prometheus Helm chart..."
-${HELM} template prometheus helm/prometheus | sed "s/cert-git-version/cert-${VERSION}/g" | kubectl apply -f -
+${HELM} template prometheus helm/prometheus ${PROMETHEUS_VALUES} | sed "s/cert-git-version/cert-${GIT_VERSION}/g" | kubectl apply -f -
 
 echo "Waiting for metrics scraper to become ready..."
 kubectl wait --for=create pods -n "$PROMETHEUS_NAMESPACE" prometheus-prometheus-0 --timeout=60s
diff --git a/helm/prom_experimental.yaml b/helm/prom_experimental.yaml
new file mode 100644
index 0000000000..ed7d455645
--- /dev/null
+++ b/helm/prom_experimental.yaml
@@ -0,0 +1,14 @@
+# experimental values for OLMv1 prometheus
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+# Quote the threshold values to avoid the helm templater interpreting them
+
+# List of options to include
+options:
+  operatorController:
+    thresholds:
+      memoryGrowth: "200_000"
+      memoryUsage: "150_000_000"
+  catalogd:
+    thresholds:
+      memoryGrowth: "200_000"
diff --git a/helm/prometheus/templates/prometheusrile-controller-alerts.yml b/helm/prometheus/templates/prometheusrule-controller-alerts.yml
similarity index 81%
rename from helm/prometheus/templates/prometheusrile-controller-alerts.yml
rename to helm/prometheus/templates/prometheusrule-controller-alerts.yml
index bce2706eea..13db7b3a7d 100644
--- a/helm/prometheus/templates/prometheusrile-controller-alerts.yml
+++ b/helm/prometheus/templates/prometheusrule-controller-alerts.yml
@@ -25,48 +25,48 @@ spec:
     - alert: operator-controller-memory-growth
       annotations:
         description: 'operator-controller pod memory usage growing at a high rate for 5 minutes: {{`{{ $value | humanize }}`}}B/sec'
-      expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 100_000
+      expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > {{ .Values.options.operatorController.thresholds.memoryGrowth }}
       for: 5m
       keep_firing_for: 1d
     - alert: catalogd-memory-growth
       annotations:
         description: 'catalogd pod memory usage growing at a high rate for 5 minutes: {{`{{ $value | humanize }}`}}B/sec'
-      expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 100_000
+      expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > {{ .Values.options.catalogd.thresholds.memoryGrowth }}
       for: 5m
       keep_firing_for: 1d
     - alert: operator-controller-memory-usage
       annotations:
         description: 'operator-controller pod using high memory resources for the last 5 minutes: {{`{{ $value | humanize }}`}}B'
-      expr: sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"}) > 100_000_000
+      expr: sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"}) > {{ .Values.options.operatorController.thresholds.memoryUsage }}
       for: 5m
       keep_firing_for: 1d
     - alert: catalogd-memory-usage
       annotations:
         description: 'catalogd pod using high memory resources for the last 5 minutes: {{`{{ $value | humanize }}`}}B'
-      expr: sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"}) > 75_000_000
+      expr: sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"}) > {{ .Values.options.catalogd.thresholds.memoryUsage }}
       for: 5m
       keep_firing_for: 1d
     - alert: operator-controller-cpu-usage
       annotations:
         description: 'operator-controller using high cpu resource for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}%'
-      expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > 20
+      expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > {{ .Values.options.operatorController.thresholds.cpuUsage }}
       for: 5m
       keep_firing_for: 1d
     - alert: catalogd-cpu-usage
       annotations:
         description: 'catalogd using high cpu resources for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}%'
-      expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > 20
+      expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > {{ .Values.options.catalogd.thresholds.cpuUsage }}
       for: 5m
       keep_firing_for: 1d
     - alert: operator-controller-api-call-rate
       annotations:
         description: 'operator-controller making excessive API calls for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}/sec'
-      expr: sum(rate(rest_client_requests_total{job=~"operator-controller-service"}[5m])) > 10
+      expr: sum(rate(rest_client_requests_total{job=~"operator-controller-service"}[5m])) > {{ .Values.options.operatorController.thresholds.apiCallRate }}
       for: 5m
       keep_firing_for: 1d
     - alert: catalogd-api-call-rate
       annotations:
         description: 'catalogd making excessive API calls for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}/sec'
-      expr: sum(rate(rest_client_requests_total{job=~"catalogd-service"}[5m])) > 5
+      expr: sum(rate(rest_client_requests_total{job=~"catalogd-service"}[5m])) > {{ .Values.options.catalogd.thresholds.apiCallRate }}
       for: 5m
       keep_firing_for: 1d
diff --git a/helm/prometheus/values.yaml b/helm/prometheus/values.yaml
index d73579da8d..b38a255929 100644
--- a/helm/prometheus/values.yaml
+++ b/helm/prometheus/values.yaml
@@ -1,13 +1,24 @@
 # Default values for OLMv1.
 # This is a YAML-formatted file.
 # Declare variables to be passed into your templates.
+# Quote the threshold values to avoid the helm templater interpreting them
 
 # List of components to include
 options:
   operatorController:
     enabled: true
+    thresholds:
+      memoryGrowth: "100_000"
+      memoryUsage: "100_000_000"
+      cpuUsage: 20
+      apiCallRate: 10
   catalogd:
     enabled: true
+    thresholds:
+      memoryGrowth: "100_000"
+      memoryUsage: "75_000_000"
+      cpuUsage: 20
+      apiCallRate: 5
 
 # The set of namespaces
 namespaces: