main:重构指标系统并切换为 Redis 方案

变更内容:
- 重构指标系统实现,支持基于 Redis 的多实例指标管理。
- 替换原有的 Pushgateway 和 Redis Exporter 方案。
- 更新 Prometheus 配置,适配新的指标抓取方式。
- 添加 Redis 指标相关配置和告警规则文件。
- 更新 Dockerfile 和 docker-compose 文件,移除多余服务,精简配置。
- 编写 `metrics_unified.py` 模块及单元测试。
- 修复部分代码中的冗余和格式问题。
This commit is contained in:
2026-02-02 13:30:28 +08:00
parent 31af5e2286
commit 241cffebc2
11 changed files with 1047 additions and 94 deletions

View File

@@ -2,38 +2,52 @@ groups:
# Rule group `functional_scaffold_alerts`. `interval` is the evaluation
# period for every rule listed under `rules`.
- name: functional_scaffold_alerts
interval: 30s
rules:
# High error rate alert: fires when the 5-minute rate of
# http_requests_total{status="error"} stays above 0.05 requests/sec
# for 5 minutes. The threshold is an absolute rate, not a ratio.
- alert: HighErrorRate
  expr: rate(http_requests_total{status="error"}[5m]) > 0.05
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "检测到高错误率"
    description: "端点 {{ $labels.endpoint }} 的错误率为 {{ $value }} 请求/秒"
# High latency alert: fires when the 0.95 quantile computed from the
# http_request_duration_seconds histogram (5-minute rate) stays above 1
# for 5 minutes.
- alert: HighLatency
  expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "检测到高延迟"
    description: "端点 {{ $labels.endpoint }} 的 P95 延迟为 {{ $value }}s"
# Service down alert: fires when `up` for the functional-scaffold scrape
# job has been 0 for 1 minute.
- alert: ServiceDown
  expr: up{job="functional-scaffold"} == 0
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "服务不可用"
    description: "FunctionalScaffold 服务已停止超过 1 分钟"
# High memory usage alert: divides container_memory_usage_bytes by
# container_spec_memory_limit_bytes for the functional-scaffold container
# and matches when the ratio exceeds 0.9.
# NOTE(review): no `for`, `labels`, or `annotations` appear before the next
# rule begins — confirm whether this rule is still meant to be loaded.
- alert: HighMemoryUsage
expr: container_memory_usage_bytes{container="functional-scaffold"} / container_spec_memory_limit_bytes{container="functional-scaffold"} > 0.9
# Algorithm failure-rate alert: fires when, per algorithm, failed
# executions exceed 10% of all executions (5-minute rates) for 5 minutes.
#
# Both sides are aggregated with `sum by (algorithm)` before dividing.
# Without it, the default one-to-one vector matching compares all labels,
# including `status`, so only status="error" series pair up with
# themselves and the ratio is always 1 whenever any error occurs.
- alert: HighAlgorithmFailureRate
  expr: >-
    sum by (algorithm) (rate(algorithm_executions_total{status="error"}[5m]))
    / sum by (algorithm) (rate(algorithm_executions_total[5m]))
    > 0.1
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "算法执行失败率过高"
    description: "算法 {{ $labels.algorithm }} 的失败率超过 10%"
# Algorithm latency alert: fires when the 0.95 quantile computed from the
# algorithm_execution_duration_seconds histogram (5-minute rate) stays
# above 5 for 5 minutes.
- alert: HighAlgorithmLatency
  expr: 'histogram_quantile(0.95, rate(algorithm_execution_duration_seconds_bucket[5m])) > 5'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: '算法执行延迟过高'
    description: '算法 {{ $labels.algorithm }} 的 P95 延迟为 {{ $value }}s'