Alerts


/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > alertmanager.rules
AlertmanagerConfigInconsistent (0 active)
alert: AlertmanagerConfigInconsistent
expr: count by(namespace, service) (count_values by(namespace, service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})) != 1
for: 5m
labels:
  severity: critical
annotations:
  message: |
    The configurations of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
    {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
    Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
    {{ end }}
AlertmanagerFailedReload (0 active)
alert: AlertmanagerFailedReload
expr: alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
for: 10m
labels:
  severity: warning
annotations:
  message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.
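When this alert fires, validating the rendered configuration is the usual first step. A minimal sketch, assuming the standard alertmanager-main-0 pod name of this stack and the default config mount path used by prometheus-operator:

  kubectl -n monitoring exec alertmanager-main-0 -- amtool check-config /etc/alertmanager/config/alertmanager.yaml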
AlertmanagerMembersInconsistent (0 active)
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > general.rules
Watchdog (1 active)
alert: Watchdog
expr: vector(1)
labels:
  severity: none
annotations:
  message: |
    This alert is meant to ensure that the entire alerting pipeline is functional.
    It fires continuously by design; it should therefore always be firing in Alertmanager
    and always fire against a receiver. Integrations with various notification
    mechanisms can send a notification when this alert stops firing, for example the
    "DeadMansSnitch" integration in PagerDuty.
Labels: alertname="Watchdog" severity="none"
State: firing
Active Since: 2024-12-18 12:33:45.022073385 +0000 UTC
Value: 1
TargetDown (0 active)
alert: TargetDown
expr: 100 * (count by(job, namespace, service) (up == 0) / count by(job, namespace, service) (up)) > 10
for: 10m
labels:
  severity: warning
annotations:
  message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
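To identify the individual targets behind the percentage, query the raw up metric directly (job and namespace are placeholders):

  up{job="<job>", namespace="<namespace>"} == 0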
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > kube-apiserver-slos
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate1h) > (14.4 * 0.01) and sum(apiserver_request:burnrate5m) > (14.4 * 0.01)
for: 2m
labels:
  long: 1h
  severity: critical
  short: 5m
annotations:
  message: The API server is burning too much error budget.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate6h) > (6 * 0.01) and sum(apiserver_request:burnrate30m) > (6 * 0.01)
for: 15m
labels:
  long: 6h
  severity: critical
  short: 30m
annotations:
  message: The API server is burning too much error budget.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate1d) > (3 * 0.01) and sum(apiserver_request:burnrate2h) > (3 * 0.01)
for: 1h
labels:
  long: 1d
  severity: warning
  short: 2h
annotations:
  message: The API server is burning too much error budget.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate3d) > (1 * 0.01) and sum(apiserver_request:burnrate6h) > (1 * 0.01)
for: 3h
labels:
  long: 3d
  severity: warning
  short: 6h
annotations:
  message: The API server is burning too much error budget.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
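The four multipliers follow the multiwindow, multi-burn-rate pattern: assuming the 30-day SLO window and 1% error budget implied by the `* 0.01` factor, a burn rate of B consumes B × (long window / 720h) of the budget, so the rules correspond to:

  14.4 × (1h  / 720h) = 2%  of the budget burned in 1 hour
   6.0 × (6h  / 720h) = 5%  of the budget burned in 6 hours
   3.0 × (24h / 720h) = 10% of the budget burned in 1 day
   1.0 × (72h / 720h) = 10% of the budget burned in 3 days

The short window paired with each long window (5m, 30m, 2h, 6h) makes the alert resolve quickly once the error rate drops.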
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > kube-state-metrics
KubeStateMetricsListErrors (0 active)
alert: KubeStateMetricsListErrors
expr: (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) / sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) > 0.01
for: 15m
labels:
  severity: critical
annotations:
  message: kube-state-metrics is experiencing an elevated error rate in list operations, which likely prevents it from exposing metrics about Kubernetes objects correctly, or at all.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors
KubeStateMetricsWatchErrors (0 active)
alert: KubeStateMetricsWatchErrors
expr: (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) / sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) > 0.01
for: 15m
labels:
  severity: critical
annotations:
  message: kube-state-metrics is experiencing an elevated error rate in watch operations, which likely prevents it from exposing metrics about Kubernetes objects correctly, or at all.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > kubernetes-apps
KubeDeploymentReplicasMismatch (1 active)
alert: KubeDeploymentReplicasMismatch
expr: (kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_replicas_available{job="kube-state-metrics"}) and (changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m]) == 0)
for: 15m
labels:
  severity: warning
annotations:
  message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
Labels: alertname="KubeDeploymentReplicasMismatch" deployment="drep-database-layer" instance="10.1.134.232:8443" job="kube-state-metrics" namespace="digilex-cicd" severity="warning"
State: firing
Active Since: 2025-03-13 17:16:05.095800493 +0000 UTC
Value: 1
KubeContainerWaiting (1 active)
alert: KubeContainerWaiting
expr: sum by(namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
for: 1h
labels:
  severity: warning
annotations:
  message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has been in waiting state for longer than 1 hour.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
Labels: alertname="KubeContainerWaiting" container="database-layer" namespace="digilex-cicd" pod="drep-database-layer-6c78c9fb5-8rjw9" severity="warning"
State: pending
Active Since: 2025-03-13 17:47:05.095800493 +0000 UTC
Value: 1
KubePodCrashLooping (1 active)
alert: KubePodCrashLooping
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
for: 15m
labels:
  severity: warning
annotations:
  message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times per 5 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
Labels: alertname="KubePodCrashLooping" container="database-layer" instance="10.1.134.232:8443" job="kube-state-metrics" namespace="digilex-cicd" pod="drep-database-layer-6c78c9fb5-8rjw9" severity="warning"
State: pending
Active Since: 2025-03-13 17:46:35.095800493 +0000 UTC
Value: 1.1111111111111112
KubeDaemonSetMisScheduled (0 active)
alert: KubeDaemonSetMisScheduled
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
for: 15m
labels:
  severity: warning
annotations:
  message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
KubeDaemonSetNotScheduled (0 active)
alert: KubeDaemonSetNotScheduled
expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
for: 10m
labels:
  severity: warning
annotations:
  message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
KubeDaemonSetRolloutStuck (0 active)
alert: KubeDaemonSetRolloutStuck
expr: ((kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) or (kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} != 0) or (kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) or (kube_daemonset_status_number_available{job="kube-state-metrics"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"})) and (changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) == 0)
for: 15m
labels:
  severity: warning
annotations:
  message: Rollout of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
KubeDeploymentGenerationMismatch (0 active)
alert: KubeDeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation{job="kube-state-metrics"} != kube_deployment_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
  severity: warning
annotations:
  message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match its metadata generation; this indicates that the Deployment has failed but has not been rolled back.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
KubeHpaMaxedOut (0 active)
alert: KubeHpaMaxedOut
expr: kube_hpa_status_current_replicas{job="kube-state-metrics"} == kube_hpa_spec_max_replicas{job="kube-state-metrics"}
for: 15m
labels:
  severity: warning
annotations:
  message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
KubeHpaReplicasMismatch (0 active)
alert: KubeHpaReplicasMismatch
expr: (kube_hpa_status_desired_replicas{job="kube-state-metrics"} != kube_hpa_status_current_replicas{job="kube-state-metrics"}) and changes(kube_hpa_status_current_replicas[15m]) == 0
for: 15m
labels:
  severity: warning
annotations:
  message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
KubeJobCompletion (0 active)
alert: KubeJobCompletion
expr: kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
for: 12h
labels:
  severity: warning
annotations:
  message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
KubeJobFailed (0 active)
alert: KubeJobFailed
expr: kube_job_failed{job="kube-state-metrics"} > 0
for: 15m
labels:
  severity: warning
annotations:
  message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
KubePodNotReady (0 active)
alert: KubePodNotReady
expr: sum by(namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}))) > 0
for: 15m
labels:
  severity: warning
annotations:
  message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
KubeStatefulSetGenerationMismatch (0 active)
alert: KubeStatefulSetGenerationMismatch
expr: kube_statefulset_status_observed_generation{job="kube-state-metrics"} != kube_statefulset_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
  severity: warning
annotations:
  message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match its metadata generation; this indicates that the StatefulSet has failed but has not been rolled back.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
KubeStatefulSetReplicasMismatch (0 active)
alert: KubeStatefulSetReplicasMismatch
expr: (kube_statefulset_status_replicas_ready{job="kube-state-metrics"} != kube_statefulset_status_replicas{job="kube-state-metrics"}) and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) == 0)
for: 15m
labels:
  severity: warning
annotations:
  message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
KubeStatefulSetUpdateNotRolledOut (0 active)
alert: KubeStatefulSetUpdateNotRolledOut
expr: (max without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics"} unless kube_statefulset_status_update_revision{job="kube-state-metrics"}) * (kube_statefulset_replicas{job="kube-state-metrics"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics"})) and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) == 0)
for: 15m
labels:
  severity: warning
annotations:
  message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > kubernetes-resources
CPUThrottlingHigh (1 active)
alert: CPUThrottlingHigh
expr: sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > (25 / 100)
for: 15m
labels:
  severity: info
annotations:
  message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
Labels: alertname="CPUThrottlingHigh" container="node-exporter" namespace="monitoring" pod="node-exporter-5chcc" severity="info"
State: firing
Active Since: 2024-12-18 12:35:47.55260138 +0000 UTC
Value: 0.4852941176470588
KubeCPUOvercommit (1 active)
alert: KubeCPUOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum) / sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores) - 1) / count(kube_node_status_allocatable_cpu_cores)
for: 5m
labels:
  severity: warning
annotations:
  message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
Labels: alertname="KubeCPUOvercommit" severity="warning"
State: firing
Active Since: 2025-03-13 17:16:17.55260138 +0000 UTC
Value: 0.6817500000000001
KubeMemoryOvercommit (1 active)
alert: KubeMemoryOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum) / sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes) - 1) / count(kube_node_status_allocatable_memory_bytes)
for: 5m
labels:
  severity: warning
annotations:
  message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
Labels: alertname="KubeMemoryOvercommit" severity="warning"
State: firing
Active Since: 2025-03-13 17:16:17.55260138 +0000 UTC
Value: 0.19758578791971212
KubeCPUQuotaOvercommit (0 active)
alert: KubeCPUQuotaOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"}) / sum(kube_node_status_allocatable_cpu_cores) > 1.5
for: 5m
labels:
  severity: warning
annotations:
  message: Cluster has overcommitted CPU resource requests for Namespaces.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
KubeMemoryQuotaOvercommit (0 active)
alert: KubeMemoryQuotaOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"}) / sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) > 1.5
for: 5m
labels:
  severity: warning
annotations:
  message: Cluster has overcommitted memory resource requests for Namespaces.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
KubeQuotaFullyUsed (0 active)
alert: KubeQuotaFullyUsed
expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) >= 1
for: 15m
labels:
  severity: info
annotations:
  message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > kubernetes-storage
KubePersistentVolumeErrors (0 active)
alert: KubePersistentVolumeErrors
expr: kube_persistentvolume_status_phase{job="kube-state-metrics",phase=~"Failed|Pending"} > 0
for: 5m
labels:
  severity: critical
annotations:
  message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
KubePersistentVolumeFillingUp (0 active)
alert: KubePersistentVolumeFillingUp
expr: kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics"} < 0.03
for: 1m
labels:
  severity: critical
annotations:
  message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
KubePersistentVolumeFillingUp (0 active)
alert: KubePersistentVolumeFillingUp
expr: (kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics"}) < 0.15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
for: 1h
labels:
  severity: warning
annotations:
  message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
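predict_linear(v[6h], 4 * 24 * 3600) fits a least-squares line through the last six hours of samples and extrapolates four days (in seconds) ahead; the rule fires only if the volume is also already below 15% free. To inspect the prediction for a single claim (claim name is a placeholder):

  predict_linear(kubelet_volume_stats_available_bytes{persistentvolumeclaim="<claim>"}[6h], 4 * 24 * 3600)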
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > kubernetes-system
KubeClientErrors (0 active)
alert: KubeClientErrors
expr: (sum by(instance, job) (rate(rest_client_requests_total{code=~"5.."}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) > 0.01
for: 15m
labels:
  severity: warning
annotations:
  message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
KubeVersionMismatch (0 active)
alert: KubeVersionMismatch
expr: count(count by(gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"}, "gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
  severity: warning
annotations:
  message: There are {{ $value }} different semantic versions of Kubernetes components running.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > kubernetes-system-apiserver
AggregatedAPIDown (1 active)
alert: AggregatedAPIDown
expr: (1 - max by(name, namespace) (avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
for: 5m
labels:
  severity: warning
annotations:
  message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
Labels: alertname="AggregatedAPIDown" name="v1.packages.operators.coreos.com" namespace="default" severity="warning"
State: firing
Active Since: 2025-03-13 16:36:04.774722027 +0000 UTC
Value: 40
AggregatedAPIErrors (0 active)
alert: AggregatedAPIErrors
expr: sum by(name, namespace) (increase(aggregator_unavailable_apiservice_count[5m])) > 2
labels:
  severity: warning
annotations:
  message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors has increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
KubeAPIDown (0 active)
alert: KubeAPIDown
expr: absent(up{job="apiserver"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  message: KubeAPI has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
KubeClientCertificateExpiration (0 active)
alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
labels:
  severity: warning
annotations:
  message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
KubeClientCertificateExpiration (0 active)
alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
labels:
  severity: critical
annotations:
  message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > kubernetes-system-controller-manager
KubeControllerManagerDown (1 active)
alert: KubeControllerManagerDown
expr: absent(up{job="kube-controller-manager"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  message: KubeControllerManager has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
Labels: alertname="KubeControllerManagerDown" severity="critical"
State: firing
Active Since: 2024-03-19 09:22:14 +0000 UTC
Value: 1
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > kubernetes-system-kubelet
KubeNodeNotReady (0 active)
alert: KubeNodeNotReady
expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} == 0
for: 15m
labels:
  severity: warning
annotations:
  message: '{{ $labels.node }} has been unready for more than 15 minutes.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
KubeNodeReadinessFlapping (0 active)
alert: KubeNodeReadinessFlapping
expr: sum by(node) (changes(kube_node_status_condition{condition="Ready",status="true"}[15m])) > 2
for: 15m
labels:
  severity: warning
annotations:
  message: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
KubeNodeUnreachable (0 active)
alert: KubeNodeUnreachable
expr: (kube_node_spec_taint{effect="NoSchedule",job="kube-state-metrics",key="node.kubernetes.io/unreachable"} unless ignoring(key, value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
labels:
  severity: warning
annotations:
  message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
KubeletDown (0 active)
alert: KubeletDown
expr: absent(up{job="kubelet",metrics_path="/metrics"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  message: Kubelet has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
KubeletPlegDurationHigh (0 active)
alert: KubeletPlegDurationHigh
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
for: 5m
labels:
  severity: warning
annotations:
  message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
KubeletPodStartUpLatencyHigh (0 active)
alert: KubeletPodStartUpLatencyHigh
expr: histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet",metrics_path="/metrics"}[5m]))) * on(instance) group_left(node) kubelet_node_name{job="kubelet",metrics_path="/metrics"} > 60
for: 15m
labels:
  severity: warning
annotations:
  message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
KubeletTooManyPods (0 active)
alert: KubeletTooManyPods
expr: count by(node) ((kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance, pod, namespace, cluster) group_left(node) topk by(instance, pod, namespace, cluster) (1, kube_pod_info{job="kube-state-metrics"})) / max by(node) (kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) > 0.95
for: 15m
labels:
  severity: warning
annotations:
  message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > kubernetes-system-scheduler
KubeSchedulerDown (1 active)
alert: KubeSchedulerDown
expr: absent(up{job="kube-scheduler"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  message: KubeScheduler has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
Labels: alertname="KubeSchedulerDown" severity="critical"
State: firing
Active Since: 2024-03-19 09:22:02 +0000 UTC
Value: 1
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > node-exporter
NodeClockNotSynchronising (1 active)
alert: NodeClockNotSynchronising
expr: min_over_time(node_timex_sync_status[5m]) == 0
for: 10m
labels:
  severity: warning
annotations:
  message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising
  summary: Clock not synchronising.
Labels: alertname="NodeClockNotSynchronising" endpoint="https" instance="siccapp" job="node-exporter" namespace="monitoring" pod="node-exporter-5chcc" service="node-exporter" severity="warning"
State: firing
Active Since: 2024-09-06 18:12:46 +0000 UTC
Value: 0
NodeClockSkewDetected (0 active)
alert: NodeClockSkewDetected
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
for: 10m
labels:
  severity: warning
annotations:
  message: Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected
  summary: Clock skew detected.
NodeFilesystemAlmostOutOfFiles (0 active)
alert: NodeFilesystemAlmostOutOfFiles
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 5 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: warning
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
  summary: Filesystem has less than 5% inodes left.
NodeFilesystemAlmostOutOfFiles (0 active)
alert: NodeFilesystemAlmostOutOfFiles
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 3 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: critical
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
  summary: Filesystem has less than 3% inodes left.
NodeFilesystemAlmostOutOfSpace (0 active)
alert: NodeFilesystemAlmostOutOfSpace
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 3 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: critical
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
  summary: Filesystem has less than 3% space left.
NodeFilesystemAlmostOutOfSpace (0 active)
alert: NodeFilesystemAlmostOutOfSpace
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 5 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: warning
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
  summary: Filesystem has less than 5% space left.
NodeFilesystemFilesFillingUp (0 active)
alert: NodeFilesystemFilesFillingUp
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 40 and predict_linear(node_filesystem_files_free{fstype!="",job="node-exporter"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: warning
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
  summary: Filesystem is predicted to run out of inodes within the next 24 hours.
NodeFilesystemFilesFillingUp (0 active)
alert: NodeFilesystemFilesFillingUp
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 20 and predict_linear(node_filesystem_files_free{fstype!="",job="node-exporter"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: critical
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
  summary: Filesystem is predicted to run out of inodes within the next 4 hours.
NodeFilesystemSpaceFillingUp (0 active)
alert: NodeFilesystemSpaceFillingUp
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 15 and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: critical
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
  summary: Filesystem is predicted to run out of space within the next 4 hours.
NodeFilesystemSpaceFillingUp (0 active)
alert: NodeFilesystemSpaceFillingUp
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 40 and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: warning
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
  summary: Filesystem is predicted to run out of space within the next 24 hours.
NodeHighNumberConntrackEntriesUsed (0 active)
alert: NodeHighNumberConntrackEntriesUsed
expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
  severity: warning
annotations:
  description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused
  summary: Number of conntrack entries is getting close to the limit.
NodeNetworkReceiveErrs (0 active)
alert: NodeNetworkReceiveErrs
expr: increase(node_network_receive_errs_total[2m]) > 10
for: 1h
labels:
  severity: warning
annotations:
  description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
  summary: Network interface is reporting many receive errors.
NodeNetworkTransmitErrs (0 active)
alert: NodeNetworkTransmitErrs
expr: increase(node_network_transmit_errs_total[2m]) > 10
for: 1h
labels:
  severity: warning
annotations:
  description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
  summary: Network interface is reporting many transmit errors.
NodeTextFileCollectorScrapeError (0 active)
alert: NodeTextFileCollectorScrapeError
expr: node_textfile_scrape_error{job="node-exporter"} == 1
labels:
  severity: warning
annotations:
  description: Node Exporter text file collector failed to scrape.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror
  summary: Node Exporter text file collector failed to scrape.
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > node-network
NodeNetworkInterfaceFlapping (0 active)
alert: NodeNetworkInterfaceFlapping
expr: changes(node_network_up{device!~"veth.+",job="node-exporter"}[2m]) > 2
for: 2m
labels:
  severity: warning
annotations:
  message: Network interface "{{ $labels.device }}" is changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}.
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > prometheus
PrometheusBadConfig (0 active)
alert: PrometheusBadConfig
expr: max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0
for: 10m
labels:
  severity: critical
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
  summary: Failed Prometheus configuration reload.
PrometheusDuplicateTimestamps (0 active)
alert: PrometheusDuplicateTimestamps
expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamps.
  summary: Prometheus is dropping samples with duplicate timestamps.
PrometheusErrorSendingAlertsToAnyAlertmanager (0 active)
alert: PrometheusErrorSendingAlertsToAnyAlertmanager
expr: min without(alertmanager) (rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])) * 100 > 3
for: 15m
labels:
  severity: critical
annotations:
  description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
  summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
PrometheusErrorSendingAlertsToSomeAlertmanagers (0 active)
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
expr: (rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])) * 100 > 1
for: 15m
labels:
  severity: warning
annotations:
  description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
  summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
PrometheusMissingRuleEvaluations (0 active)
alert: PrometheusMissingRuleEvaluations
expr: increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
  summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
PrometheusNotConnectedToAlertmanagers (0 active)
alert: PrometheusNotConnectedToAlertmanagers
expr: max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
  summary: Prometheus is not connected to any Alertmanagers.
PrometheusNotIngestingSamples (0 active)
alert: PrometheusNotIngestingSamples
expr: rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
  summary: Prometheus is not ingesting samples.
PrometheusNotificationQueueRunningFull (0 active)
alert: PrometheusNotificationQueueRunningFull
expr: (predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m]))
for: 15m
labels:
  severity: warning
annotations:
  description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
  summary: Prometheus alert notification queue predicted to run full in less than 30m.
PrometheusOutOfOrderTimestamps (0 active)
alert: PrometheusOutOfOrderTimestamps
expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
  summary: Prometheus drops samples with out-of-order timestamps.
PrometheusRemoteStorageFailures (0 active)
alert: PrometheusRemoteStorageFailures
expr: (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]))) * 100 > 1
for: 15m
labels:
  severity: critical
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name }}:{{ $labels.url }}.
  summary: Prometheus fails to send samples to remote storage.
PrometheusRemoteWriteBehind (0 active)
alert: PrometheusRemoteWriteBehind
expr: (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) - on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])) > 120
for: 15m
labels:
  severity: critical
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name }}:{{ $labels.url }}.
  summary: Prometheus remote write is behind.
PrometheusRemoteWriteDesiredShards (0 active)
alert: PrometheusRemoteWriteDesiredShards
expr: (max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m]) > max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m]))
for: 15m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name }}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}.
  summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
PrometheusRuleFailures (0 active)
alert: PrometheusRuleFailures
expr: increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
  severity: critical
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
  summary: Prometheus is failing rule evaluations.
PrometheusTSDBCompactionsFailing (0 active)
alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
for: 4h
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
  summary: Prometheus has issues compacting blocks.
PrometheusTSDBReloadsFailing (0 active)
alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
for: 4h
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
  summary: Prometheus has issues reloading blocks from disk.
/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-prometheus-k8s-rules.yaml > prometheus-operator
PrometheusOperatorListErrors (0 active)
PrometheusOperatorNodeLookupErrors (0 active)
alert: PrometheusOperatorNodeLookupErrors
expr: rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
for: 10m
labels:
  severity: warning
annotations:
  message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
PrometheusOperatorReconcileErrors (0 active)
alert: PrometheusOperatorReconcileErrors
expr: rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
for: 10m
labels:
  severity: warning
annotations:
  message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.
PrometheusOperatorWatchErrors (0 active)