# Prometheus-style alerting rules for the FunctionalScaffold service.
groups:
  - name: functional_scaffold_alerts
    # Interval setting for this rule group.
    interval: 30s
    rules:
      # High error rate alert.
      # Fires when the per-second rate (5m window) of http_requests_total series
      # with status="error" stays above 0.05 for 5 minutes.
      - alert: HighErrorRate
        expr: >-
          rate(http_requests_total{status="error"}[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: '检测到高错误率'
          description: '端点 {{ $labels.endpoint }} 的错误率为 {{ $value }} 请求/秒'

      # High HTTP latency alert.
      # Fires when the 0.95 quantile estimated from the 5m rate of
      # http_request_duration_seconds_bucket stays above 1 for 5 minutes.
      - alert: HighLatency
        expr: >-
          histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: '检测到高延迟'
          description: '端点 {{ $labels.endpoint }} 的 P95 延迟为 {{ $value }}s'

      # Service unavailable alert.
      # Fires when `up` for job "functional-scaffold" has been 0 for 1 minute.
      # NOTE(review): this expression yields nothing when no `up` series exists
      # for the job at all — confirm whether that case needs its own alert.
      - alert: ServiceDown
        expr: >-
          up{job="functional-scaffold"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: '服务不可用'
          description: 'FunctionalScaffold 服务已停止超过 1 分钟'

      # High algorithm failure ratio alert.
      # Computes failed executions divided by all executions (5m rate window)
      # and fires when the ratio stays above 0.1 (10%) for 5 minutes.
      #
      # Both operands are summed without the `status` label before dividing.
      # The numerator only carries status="error", and PromQL's default
      # one-to-one vector matching compares every label, so dividing the raw
      # vectors pairs each error series only with itself and yields 1 (or NaN)
      # instead of the real failure ratio. All other labels (e.g. `algorithm`,
      # used in the description) are preserved.
      - alert: HighAlgorithmFailureRate
        expr: >-
          sum without (status) (rate(algorithm_executions_total{status="error"}[5m]))
          /
          sum without (status) (rate(algorithm_executions_total[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "算法执行失败率过高"
          description: "算法 {{ $labels.algorithm }} 的失败率超过 10%"

      # High algorithm execution latency alert.
      # Fires when the 0.95 quantile estimated from the 5m rate of
      # algorithm_execution_duration_seconds_bucket stays above 5 for 5 minutes.
      - alert: HighAlgorithmLatency
        expr: >-
          histogram_quantile(0.95, rate(algorithm_execution_duration_seconds_bucket[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: '算法执行延迟过高'
          description: '算法 {{ $labels.algorithm }} 的 P95 延迟为 {{ $value }}s'