---
# HorizontalPodAutoscaler for the llm-gateway Deployment.
# Scales between 3 and 20 replicas on CPU/memory utilization, with
# conservative scale-down (5 min stabilization, slowest policy wins)
# and aggressive scale-up (no stabilization, fastest policy wins).
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llm-gateway
  namespace: llm-gateway
  labels:
    app: llm-gateway
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-gateway
  minReplicas: 3
  maxReplicas: 20
  behavior:
    scaleDown:
      # Wait 5 minutes of sustained low load before removing pods.
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 50
          periodSeconds: 60
        - type: Pods
          value: 2
          periodSeconds: 60
      # Min: apply whichever policy removes the fewest pods.
      selectPolicy: Min
    scaleUp:
      # React immediately to load spikes.
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 100
          periodSeconds: 30
        - type: Pods
          value: 4
          periodSeconds: 30
      # Max: apply whichever policy adds the most pods.
      selectPolicy: Max
  metrics:
    # CPU-based scaling
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    # Memory-based scaling
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
    # Custom metrics (requires metrics-server and custom metrics API)
    # - type: Pods
    #   pods:
    #     metric:
    #       name: http_requests_per_second
    #     target:
    #       type: AverageValue
    #       averageValue: "1000"