变更内容: - 移除冗余文档,包括 Grafana 指南、指标对比、修复总结、OpenAPI 规范等。 - 精简项目文档结构,优化 README 文件内容。 - 提升文档层次清晰度,集中核心指南。
94 lines
3.3 KiB
YAML
94 lines
3.3 KiB
YAML
# Prometheus alerting rules for the FunctionalScaffold service.
#
# Every rule below is evaluated as part of a single group. Thresholds and
# `for` durations are expressed directly in each rule; annotation strings are
# user-facing and intentionally left in their original language.
groups:
  - name: functional_scaffold_alerts
    # Evaluation interval for every rule in this group.
    interval: 30s
    rules:
      # High HTTP error rate.
      # Fires when a series of http_requests_total with status="error" grows
      # faster than 0.05 requests/second for 5 minutes.
      # NOTE(review): this is an absolute per-second rate, not an error
      # ratio - confirm that 0.05 req/s is the intended threshold.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status="error"}[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "检测到高错误率"
          description: "端点 {{ $labels.endpoint }} 的错误率为 {{ $value }} 请求/秒"

      # High HTTP latency.
      # Fires when the P95 of http_request_duration_seconds stays above
      # 1 second for 5 minutes.
      - alert: HighLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "检测到高延迟"
          description: "端点 {{ $labels.endpoint }} 的 P95 延迟为 {{ $value }}s"

      # Service down.
      # Fires when the scrape target of job "functional-scaffold" reports
      # up == 0 for 1 minute.
      - alert: ServiceDown
        expr: up{job="functional-scaffold"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "服务不可用"
          description: "FunctionalScaffold 服务已停止超过 1 分钟"

      # High algorithm execution failure ratio (> 10% for 5 minutes).
      # The denominator is summed over every `status` value and matched with
      # ignoring(status); without that, one-to-one vector matching would
      # divide each error series by itself and the ratio would always be 1.
      # All labels other than `status` are preserved on the result.
      - alert: HighAlgorithmFailureRate
        expr: >-
          rate(algorithm_executions_total{status="error"}[5m])
          / ignoring(status)
          sum without (status) (rate(algorithm_executions_total[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "算法执行失败率过高"
          description: "算法 {{ $labels.algorithm }} 的失败率超过 10%"

      # High algorithm execution latency.
      # Fires when the P95 of algorithm_execution_duration_seconds stays
      # above 5 seconds for 5 minutes.
      - alert: HighAlgorithmLatency
        expr: histogram_quantile(0.95, rate(algorithm_execution_duration_seconds_bucket[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "算法执行延迟过高"
          description: "算法 {{ $labels.algorithm }} 的 P95 延迟为 {{ $value }}s"

      # High async job failure ratio (> 10% for 5 minutes).
      # Failed completions divided by all completions; the denominator is
      # summed over `status` so the division yields a real fraction instead
      # of dividing the failed series by itself.
      - alert: HighJobFailureRate
        expr: >-
          rate(jobs_completed_total{status="failed"}[5m])
          / ignoring(status)
          sum without (status) (rate(jobs_completed_total[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "异步任务失败率过高"
          description: "算法 {{ $labels.algorithm }} 的异步任务失败率超过 10%"

      # High async job latency.
      # Fires when the P95 of job_execution_duration_seconds stays above
      # 60 seconds for 5 minutes.
      - alert: HighJobLatency
        expr: histogram_quantile(0.95, rate(job_execution_duration_seconds_bucket[5m])) > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "异步任务执行延迟过高"
          description: "算法 {{ $labels.algorithm }} 的异步任务 P95 延迟为 {{ $value }}s"

      # Async job backlog.
      # Fires when the total job creation rate exceeds the total completion
      # rate by more than 10 jobs/second for 10 minutes.
      # NOTE(review): this compares rates, not an actual queue depth -
      # confirm that a rate difference is the intended backlog signal.
      - alert: JobBacklog
        expr: sum(rate(jobs_created_total[5m])) - sum(rate(jobs_completed_total[5m])) > 10
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "异步任务积压"
          description: "任务创建速率超过完成速率,可能存在积压"

      # High webhook delivery failure ratio (> 20% for 5 minutes).
      # Failed deliveries divided by all deliveries; the denominator is
      # summed over `status` so the division yields a real fraction instead
      # of dividing the failed series by itself.
      - alert: HighWebhookFailureRate
        expr: >-
          rate(webhook_deliveries_total{status="failed"}[5m])
          / ignoring(status)
          sum without (status) (rate(webhook_deliveries_total[5m]))
          > 0.2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Webhook 发送失败率过高"
          description: "Webhook 发送失败率超过 20%"