---
# Prometheus alerting rules for the FunctionalScaffold service.
#
# A single rule group, evaluated every 30s, covering HTTP error rate, HTTP
# latency, scrape-target availability, and algorithm failure rate / latency.
groups:
  - name: functional_scaffold_alerts
    interval: 30s
    rules:
      # High error rate: more than 0.05 error requests per second
      # (5m rate), sustained for 5 minutes.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status="error"}[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "检测到高错误率"
          description: "端点 {{ $labels.endpoint }} 的错误率为 {{ $value }} 请求/秒"

      # High latency: P95 of HTTP request duration above 1 second,
      # sustained for 5 minutes.
      - alert: HighLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "检测到高延迟"
          description: "端点 {{ $labels.endpoint }} 的 P95 延迟为 {{ $value }}s"

      # Service down: the functional-scaffold scrape target has reported
      # up == 0 for 1 minute.
      - alert: ServiceDown
        expr: up{job="functional-scaffold"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "服务不可用"
          description: "FunctionalScaffold 服务已停止超过 1 分钟"

      # High algorithm failure rate: failed executions exceed 10% of all
      # executions, sustained for 5 minutes.
      #
      # Both operands are aggregated with `sum without (status)`. Binary
      # operators match series on their full label set, so dividing the
      # status="error" series by the unaggregated total would pair each error
      # series only with itself and always yield 1, firing on any error at all.
      - alert: HighAlgorithmFailureRate
        expr: >-
          sum without (status) (rate(algorithm_executions_total{status="error"}[5m]))
          /
          sum without (status) (rate(algorithm_executions_total[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "算法执行失败率过高"
          description: "算法 {{ $labels.algorithm }} 的失败率超过 10%"

      # High algorithm latency: P95 of algorithm execution duration above
      # 5 seconds, sustained for 5 minutes.
      - alert: HighAlgorithmLatency
        expr: histogram_quantile(0.95, rate(algorithm_execution_duration_seconds_bucket[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "算法执行延迟过高"
          description: "算法 {{ $labels.algorithm }} 的 P95 延迟为 {{ $value }}s"