Files
FunctionalScaffold/monitoring/alerts/rules.yaml
Roog (顾新培) b1077e78e9 main:删除多余文档并清理项目目录
变更内容:
- 移除冗余文档,包括 Grafana 指南、指标对比、修复总结、OpenAPI 规范等。
- 精简项目文档结构,优化 README 文件内容。
- 提升文档层次清晰度,集中核心指南。
2026-02-02 15:00:42 +08:00

94 lines
3.3 KiB
YAML

# Prometheus alerting rules for the FunctionalScaffold service.
#
# Ratio-style rules aggregate both numerator and denominator with
# `sum without (status)`. Without that, PromQL's default one-to-one matching
# pairs each failure series only with itself (same `status` label), so the
# ratio is always 1 and the percentage threshold has no effect.
groups:
  - name: functional_scaffold_alerts
    # Evaluation interval for every rule in this group.
    interval: 30s
    rules:
      # High HTTP error rate.
      # This is an absolute per-series rate (errors per second), not a ratio
      # of errors to total requests; the description reports it as such.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status="error"}[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "检测到高错误率"
          description: "端点 {{ $labels.endpoint }} 的错误率为 {{ $value }} 请求/秒"

      # High HTTP latency: P95 of request duration above 1 second.
      # The quantile is computed per label set (no aggregation), which keeps
      # the labels referenced in the description.
      - alert: HighLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "检测到高延迟"
          description: "端点 {{ $labels.endpoint }} 的 P95 延迟为 {{ $value }}s"

      # Service unavailable: the scrape target has reported up == 0 for 1 minute.
      - alert: ServiceDown
        expr: up{job="functional-scaffold"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "服务不可用"
          description: "FunctionalScaffold 服务已停止超过 1 分钟"

      # Algorithm execution failure ratio above 10%.
      # Both sides drop `status` so failures are divided by executions of
      # every status; remaining labels (e.g. algorithm) are preserved.
      - alert: HighAlgorithmFailureRate
        expr: |-
          sum without (status) (rate(algorithm_executions_total{status="error"}[5m]))
            /
          sum without (status) (rate(algorithm_executions_total[5m]))
            > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "算法执行失败率过高"
          description: "算法 {{ $labels.algorithm }} 的失败率超过 10%"

      # Algorithm execution latency: P95 above 5 seconds.
      - alert: HighAlgorithmLatency
        expr: histogram_quantile(0.95, rate(algorithm_execution_duration_seconds_bucket[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "算法执行延迟过高"
          description: "算法 {{ $labels.algorithm }} 的 P95 延迟为 {{ $value }}s"

      # Async job failure ratio above 10%.
      # Both sides drop `status` so failed jobs are divided by completed jobs
      # of every status; remaining labels (e.g. algorithm) are preserved.
      - alert: HighJobFailureRate
        expr: |-
          sum without (status) (rate(jobs_completed_total{status="failed"}[5m]))
            /
          sum without (status) (rate(jobs_completed_total[5m]))
            > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "异步任务失败率过高"
          description: "算法 {{ $labels.algorithm }} 的异步任务失败率超过 10%"

      # Async job execution latency: P95 above 60 seconds.
      - alert: HighJobLatency
        expr: histogram_quantile(0.95, rate(job_execution_duration_seconds_bucket[5m])) > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "异步任务执行延迟过高"
          description: "算法 {{ $labels.algorithm }} 的异步任务 P95 延迟为 {{ $value }}s"

      # Async job backlog: overall creation rate exceeds overall completion
      # rate by more than 10 jobs per second, sustained for 10 minutes.
      - alert: JobBacklog
        expr: sum(rate(jobs_created_total[5m])) - sum(rate(jobs_completed_total[5m])) > 10
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "异步任务积压"
          description: "任务创建速率超过完成速率,可能存在积压"

      # Webhook delivery failure ratio above 20%.
      # Both sides drop `status` so failed deliveries are divided by
      # deliveries of every status.
      - alert: HighWebhookFailureRate
        expr: |-
          sum without (status) (rate(webhook_deliveries_total{status="failed"}[5m]))
            /
          sum without (status) (rate(webhook_deliveries_total[5m]))
            > 0.2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Webhook 发送失败率过高"
          description: "Webhook 发送失败率超过 20%"