---
# PrometheusRule for alerting
# Requires Prometheus Operator to be installed
#
# Declares one rule group (llm-gateway.rules) holding the alerting rules for
# the LLM Gateway running in the llm-gateway namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: llm-gateway
  namespace: llm-gateway
  labels:
    app: llm-gateway
    # NOTE(review): presumably this label is what the Prometheus resource's
    # ruleSelector matches on - confirm against the target cluster's
    # Prometheus CR, otherwise these rules are silently never loaded.
    prometheus: kube-prometheus
spec:
  groups:
    - name: llm-gateway.rules
      # Evaluation interval for every rule in this group.
      interval: 30s
      rules:

# High error rate
# Warns when responses with a 5xx status_code make up more than 5% of all
# gateway requests (5-minute rate) and that has held for 5 minutes.
        - alert: LLMGatewayHighErrorRate
          expr: |
            (
              sum(rate(http_requests_total{namespace="llm-gateway",status_code=~"5.."}[5m]))
              /
              sum(rate(http_requests_total{namespace="llm-gateway"}[5m]))
            ) > 0.05
          for: 5m
          labels:
            severity: warning
            component: llm-gateway
          annotations:
            summary: "High error rate in LLM Gateway"
            description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%)"

# High latency
# Warns when the 95th-percentile request duration, estimated from the
# http_request_duration_seconds histogram buckets aggregated by `le`,
# stays above 10 seconds for 5 minutes.
        - alert: LLMGatewayHighLatency
          expr: |
            histogram_quantile(0.95,
              sum(rate(http_request_duration_seconds_bucket{namespace="llm-gateway"}[5m])) by (le)
            ) > 10
          for: 5m
          labels:
            severity: warning
            component: llm-gateway
          annotations:
            summary: "High latency in LLM Gateway"
            description: "P95 latency is {{ $value }}s (threshold: 10s)"

# Provider errors
# Warns, separately for each `provider` label value, when more than 10% of
# provider requests carry status="error" (5-minute rate) for 5 minutes.
# Both sides of the division are grouped by `provider`, so the alert and its
# summary identify the failing provider.
        - alert: LLMProviderHighErrorRate
          expr: |
            (
              sum(rate(provider_requests_total{namespace="llm-gateway",status="error"}[5m])) by (provider)
              /
              sum(rate(provider_requests_total{namespace="llm-gateway"}[5m])) by (provider)
            ) > 0.10
          for: 5m
          labels:
            severity: warning
            component: llm-gateway
          annotations:
            summary: "High error rate for provider {{ $labels.provider }}"
            description: "Error rate is {{ $value | humanizePercentage }} (threshold: 10%)"

# Pod down
# Critical alert when a scrape target of job "llm-gateway" reports up == 0
# for 2 minutes.
# NOTE(review): this only fires for targets Prometheus still knows about; if
# every target disappears from service discovery the expression returns no
# series and the alert stays silent - confirm whether an absent() companion
# rule is wanted.
# NOTE(review): the description reads $labels.pod, which assumes the scrape
# configuration attaches a `pod` label to `up` - verify.
        - alert: LLMGatewayPodDown
          expr: |
            up{job="llm-gateway",namespace="llm-gateway"} == 0
          for: 2m
          labels:
            severity: critical
            component: llm-gateway
          annotations:
            summary: "LLM Gateway pod is down"
            description: "Pod {{ $labels.pod }} has been down for more than 2 minutes"

# High memory usage
# Warns when the "gateway" container's working-set memory exceeds 85% of its
# configured memory limit for 5 minutes.
# The denominator is filtered with `> 0`: a container without a memory limit
# reports a limit of 0, which would turn the ratio into +Inf and raise a
# false alert. Such containers are now excluded from this rule.
        - alert: LLMGatewayHighMemoryUsage
          expr: |
            (
              container_memory_working_set_bytes{namespace="llm-gateway",container="gateway"}
              /
              (container_spec_memory_limit_bytes{namespace="llm-gateway",container="gateway"} > 0)
            ) > 0.85
          for: 5m
          labels:
            severity: warning
            component: llm-gateway
          annotations:
            summary: "High memory usage in LLM Gateway"
            description: "Memory usage is {{ $value | humanizePercentage }} (threshold: 85%)"

# Rate limit threshold
# Informational alert when responses with status_code="429" make up more
# than 20% of all gateway requests (5-minute rate) for 10 minutes.
        - alert: LLMGatewayHighRateLimitHitRate
          expr: |
            (
              sum(rate(http_requests_total{namespace="llm-gateway",status_code="429"}[5m]))
              /
              sum(rate(http_requests_total{namespace="llm-gateway"}[5m]))
            ) > 0.20
          for: 10m
          labels:
            severity: info
            component: llm-gateway
          annotations:
            summary: "High rate limit hit rate"
            description: "{{ $value | humanizePercentage }} of requests are being rate limited"

# Conversation store errors
# Warns when more than 5% of conversation store operations carry
# status="error" (5-minute rate) for 5 minutes.
        - alert: LLMGatewayConversationStoreErrors
          expr: |
            (
              sum(rate(conversation_store_operations_total{namespace="llm-gateway",status="error"}[5m]))
              /
              sum(rate(conversation_store_operations_total{namespace="llm-gateway"}[5m]))
            ) > 0.05
          for: 5m
          labels:
            severity: warning
            component: llm-gateway
          annotations:
            summary: "High error rate in conversation store"
            description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%)"