|
71 | 71 | |||, |
72 | 72 | }, |
73 | 73 | }, |
74 | | - { |
75 | | - // We're syncing every 10mins, and this means with a 5min rate, we will have a NaN when syncs fail |
76 | | - // and we will never trigger the alert. |
77 | | - // We also have a 3h grace-period for creation of tables, which means we can fail for 3h before it's an outage. |
78 | | - alert: 'CortexTableSyncFailure', |
79 | | - expr: ||| |
80 | | - 100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m]) |
81 | | - / |
82 | | - rate(cortex_table_manager_sync_duration_seconds_count[15m]) |
83 | | - > 10 |
84 | | - |||, |
85 | | - 'for': '30m', |
86 | | - labels: { |
87 | | - severity: 'critical', |
88 | | - }, |
89 | | - annotations: { |
90 | | - message: ||| |
91 | | - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables. |
92 | | - |||, |
93 | | - }, |
94 | | - }, |
95 | 74 | { |
96 | 75 | alert: 'CortexQueriesIncorrect', |
97 | 76 | expr: ||| |
|
206 | 185 | |||, |
207 | 186 | }, |
208 | 187 | }, |
209 | | - { |
210 | | - alert: 'CortexTransferFailed', |
211 | | - expr: ||| |
212 | | - max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m]) |
213 | | - |||, |
214 | | - 'for': '5m', |
215 | | - labels: { |
216 | | - severity: 'critical', |
217 | | - }, |
218 | | - annotations: { |
219 | | - message: ||| |
220 | | - {{ $labels.job }}/{{ $labels.instance }} transfer failed. |
221 | | - |||, |
222 | | - }, |
223 | | - }, |
224 | | - { |
225 | | - alert: 'CortexOldChunkInMemory', |
226 | | - // Even though we should flush chunks after 6h, we see that the 99th percentile of the age of flushed chunks is closer |
227 | | - // to 10 hours. |
228 | | - // Ignore cortex_oldest_unflushed_chunk_timestamp_seconds that are zero (eg. distributors). |
229 | | - expr: ||| |
230 | | - (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000) |
231 | | - and |
232 | | - (cortex_oldest_unflushed_chunk_timestamp_seconds > 0) |
233 | | - |||, |
234 | | - 'for': '5m', |
235 | | - labels: { |
236 | | - severity: 'warning', |
237 | | - }, |
238 | | - annotations: { |
239 | | - message: ||| |
240 | | - {{ $labels.job }}/{{ $labels.instance }} has very old unflushed chunk in memory. |
241 | | - |||, |
242 | | - }, |
243 | | - }, |
244 | 188 | { |
245 | 189 | alert: 'CortexKVStoreFailure', |
246 | 190 | expr: ||| |
|
379 | 323 | }, |
380 | 324 | ], |
381 | 325 | }, |
382 | | - { |
383 | | - name: 'cortex_wal_alerts', |
384 | | - rules: [ |
385 | | - { |
386 | | - // Alert immediately if WAL is corrupt. |
387 | | - alert: 'CortexWALCorruption', |
388 | | - expr: ||| |
389 | | - increase(cortex_ingester_wal_corruptions_total[5m]) > 0 |
390 | | - |||, |
391 | | - labels: { |
392 | | - severity: 'critical', |
393 | | - }, |
394 | | - annotations: { |
395 | | - message: ||| |
396 | | - {{ $labels.job }}/{{ $labels.instance }} has a corrupted WAL or checkpoint. |
397 | | - |||, |
398 | | - }, |
399 | | - }, |
400 | | - { |
401 | | - // One or more failed checkpoint creation is a warning. |
402 | | - alert: 'CortexCheckpointCreationFailed', |
403 | | - expr: ||| |
404 | | - increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0 |
405 | | - |||, |
406 | | - labels: { |
407 | | - severity: 'warning', |
408 | | - }, |
409 | | - annotations: { |
410 | | - message: ||| |
411 | | - {{ $labels.job }}/{{ $labels.instance }} failed to create checkpoint. |
412 | | - |||, |
413 | | - }, |
414 | | - }, |
415 | | - { |
416 | | - // Two or more failed checkpoint creation in 1h means something is wrong. |
417 | | - alert: 'CortexCheckpointCreationFailed', |
418 | | - expr: ||| |
419 | | - increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1 |
420 | | - |||, |
421 | | - labels: { |
422 | | - severity: 'critical', |
423 | | - }, |
424 | | - annotations: { |
425 | | - message: ||| |
426 | | - {{ $labels.job }}/{{ $labels.instance }} is failing to create checkpoint. |
427 | | - |||, |
428 | | - }, |
429 | | - }, |
430 | | - { |
431 | | - // One or more failed checkpoint deletion is a warning. |
432 | | - alert: 'CortexCheckpointDeletionFailed', |
433 | | - expr: ||| |
434 | | - increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0 |
435 | | - |||, |
436 | | - labels: { |
437 | | - severity: 'warning', |
438 | | - }, |
439 | | - annotations: { |
440 | | - message: ||| |
441 | | - {{ $labels.job }}/{{ $labels.instance }} failed to delete checkpoint. |
442 | | - |||, |
443 | | - }, |
444 | | - }, |
445 | | - { |
446 | | - // Two or more failed checkpoint deletion in 2h means something is wrong. |
447 | | - // We give this more buffer than creation as this is a less critical operation. |
448 | | - alert: 'CortexCheckpointDeletionFailed', |
449 | | - expr: ||| |
450 | | - increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1 |
451 | | - |||, |
452 | | - labels: { |
453 | | - severity: 'critical', |
454 | | - }, |
455 | | - annotations: { |
456 | | - message: ||| |
457 | | - {{ $labels.instance }} is failing to delete checkpoint. |
458 | | - |||, |
459 | | - }, |
460 | | - }, |
461 | | - ], |
462 | | - }, |
463 | 326 | { |
464 | 327 | name: 'cortex-rollout-alerts', |
465 | 328 | rules: [ |
|
524 | 387 | { |
525 | 388 | name: 'cortex-provisioning', |
526 | 389 | rules: [ |
527 | | - { |
528 | | - alert: 'CortexProvisioningMemcachedTooSmall', |
529 | | - // 4 x in-memory series size = 24hrs of data. |
530 | | - expr: ||| |
531 | | - ( |
532 | | - 4 * |
533 | | - sum by (%s) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count) |
534 | | - / 1e9 |
535 | | - ) |
536 | | - > |
537 | | - ( |
538 | | - sum by (%s) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9 |
539 | | - ) |
540 | | - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], |
541 | | - 'for': '15m', |
542 | | - labels: { |
543 | | - severity: 'warning', |
544 | | - }, |
545 | | - annotations: { |
546 | | - message: ||| |
547 | | - Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB. |
548 | | - ||| % $._config, |
549 | | - }, |
550 | | - }, |
551 | 390 | { |
552 | 391 | alert: 'CortexProvisioningTooManyActiveSeries', |
553 | 392 | // We target each ingester to 1.5M in-memory series. This alert fires if the average |
|
0 commit comments