main:重构指标系统并切换为 Redis 方案
变更内容:
- 重构指标系统实现,支持基于 Redis 的多实例指标管理。
- 替换原有的 Pushgateway 和 Redis Exporter 方案。
- 更新 Prometheus 配置,适配新的指标抓取方式。
- 添加 Redis 指标相关配置和告警规则文件。
- 更新 Dockerfile 和 docker-compose 文件,移除多余服务,精简配置。
- 编写 `metrics_unified.py` 模块及单元测试。
- 修复部分代码中的冗余和格式问题。
This commit is contained in:
@@ -2,38 +2,52 @@ groups:
|
||||
- name: functional_scaffold_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# 高错误率告警
|
||||
- alert: HighErrorRate
|
||||
expr: rate(http_requests_total{status="error"}[5m]) > 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High error rate detected"
|
||||
description: "Error rate is {{ $value }} requests/sec for {{ $labels.endpoint }}"
|
||||
summary: "检测到高错误率"
|
||||
description: "端点 {{ $labels.endpoint }} 的错误率为 {{ $value }} 请求/秒"
|
||||
|
||||
# 高延迟告警
|
||||
- alert: HighLatency
|
||||
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High latency detected"
|
||||
description: "P95 latency is {{ $value }}s for {{ $labels.endpoint }}"
|
||||
summary: "检测到高延迟"
|
||||
description: "端点 {{ $labels.endpoint }} 的 P95 延迟为 {{ $value }}s"
|
||||
|
||||
# 服务不可用告警
|
||||
- alert: ServiceDown
|
||||
expr: up{job="functional-scaffold"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Service is down"
|
||||
description: "FunctionalScaffold service has been down for more than 1 minute"
|
||||
summary: "服务不可用"
|
||||
description: "FunctionalScaffold 服务已停止超过 1 分钟"
|
||||
|
||||
- alert: HighMemoryUsage
|
||||
expr: container_memory_usage_bytes{container="functional-scaffold"} / container_spec_memory_limit_bytes{container="functional-scaffold"} > 0.9
|
||||
# 算法执行失败率告警
|
||||
- alert: HighAlgorithmFailureRate
|
||||
# Aggregate both sides by algorithm before dividing. Without aggregation the
# status="error" series on the left only matches the identical status="error"
# series on the right (one-to-one matching on all labels), so the ratio is
# always 1 and the alert fires on any error instead of a >10% failure ratio.
expr: sum by (algorithm) (rate(algorithm_executions_total{status="error"}[5m])) / sum by (algorithm) (rate(algorithm_executions_total[5m])) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage"
|
||||
description: "Memory usage is {{ $value | humanizePercentage }} of limit"
|
||||
summary: "算法执行失败率过高"
|
||||
description: "算法 {{ $labels.algorithm }} 的失败率超过 10%"
|
||||
|
||||
# 算法执行延迟告警
|
||||
- alert: HighAlgorithmLatency
|
||||
expr: histogram_quantile(0.95, rate(algorithm_execution_duration_seconds_bucket[5m])) > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "算法执行延迟过高"
|
||||
description: "算法 {{ $labels.algorithm }} 的 P95 延迟为 {{ $value }}s"
|
||||
|
||||
@@ -8,27 +8,13 @@ global:
|
||||
|
||||
# 抓取配置
|
||||
scrape_configs:
|
||||
# 方案1:从 Pushgateway 抓取指标(推荐)
|
||||
- job_name: 'pushgateway'
|
||||
honor_labels: true
|
||||
static_configs:
|
||||
- targets: ['pushgateway:9091']
|
||||
metric_relabel_configs:
|
||||
# 保留 instance 标签
|
||||
- source_labels: [instance]
|
||||
target_label: instance
|
||||
action: replace
|
||||
|
||||
# 方案2:从 Redis Exporter 抓取指标
|
||||
- job_name: 'redis-exporter'
|
||||
static_configs:
|
||||
- targets: ['redis-exporter:8001']
|
||||
|
||||
# 直接从应用实例抓取(如果有多个实例,需要配置服务发现)
|
||||
- job_name: 'app'
|
||||
# 从应用实例抓取指标(Redis 统一指标方案)
|
||||
# 应用通过 /metrics 端点从 Redis 读取并导出 Prometheus 格式指标
|
||||
- job_name: 'functional-scaffold'
|
||||
static_configs:
|
||||
- targets: ['app:8000']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 10s
|
||||
|
||||
# Prometheus 自身监控
|
||||
- job_name: 'prometheus'
|
||||
@@ -37,7 +23,7 @@ scrape_configs:
|
||||
|
||||
# 告警规则文件
|
||||
rule_files:
|
||||
- '/etc/prometheus/rules/*.yml'
|
||||
- '/etc/prometheus/rules/*.yaml'
|
||||
|
||||
# Alertmanager 配置(可选)
|
||||
# alerting:
|
||||
|
||||
Reference in New Issue
Block a user