Files
FunctionalScaffold/config/metrics.yaml
Roog (顾新培) 7c8b96927d main:优化任务管理及队列监控性能
变更内容:
- 优化任务出队逻辑,采用 BLMOVE 提升队列操作的原子性和可靠性。
- 在 JobManager 中新增任务锁续租、超时任务回收、ACK/NACK 状态管理功能。
- 实现任务队列和死信队列监控指标收集,为系统性能分析提供数据支持。
- 扩展 Worker 模块,增加锁续租逻辑及任务回收调度。
- 更新测试用例,覆盖任务管理和队列指标的新增逻辑。
- 补充 metrics.yaml 文件,添加队列相关的监控指标定义。
- 更新依赖,补充 Redis 支持及相关库版本规范。
2026-02-03 18:38:25 +08:00

121 lines
3.0 KiB
YAML

# 指标配置文件
# 算法成员可以在此添加自定义指标
# Redis connection settings. Values are written in `${NAME:default}` form and
# can also be overridden through environment variables.
# NOTE(review): the placeholders are intentionally left unquoted, as in the
# original. Quoting them could change the resolved type of `port`/`db` if the
# loader substitutes before YAML parsing — confirm against the config loader.
redis:
  host: ${REDIS_HOST:localhost}
  port: ${REDIS_PORT:6379}
  db: ${REDIS_METRICS_DB:0}
  # Empty default; presumably means "no password" — confirm with the loader.
  password: ${REDIS_PASSWORD:}
# Global configuration.
global:
  prefix: "functional_scaffold"  # metric name prefix
  instance_label: true  # whether to add an instance label
# Built-in metrics (collected automatically by the framework).
# Each entry is keyed by an internal identifier and carries:
#   enabled      - on/off switch for the metric
#   name         - metric name
#   type         - counter | histogram | gauge
#   description  - help text
#   labels       - label names (empty list when the metric has none)
#   buckets      - histogram bucket bounds (histograms only)
builtin_metrics:
  http_requests:
    enabled: true
    name: "http_requests_total"
    type: counter
    description: "HTTP 请求总数"
    labels: [method, endpoint, status]

  http_latency:
    enabled: true
    name: "http_request_duration_seconds"
    type: histogram
    description: "HTTP 请求延迟"
    labels: [method, endpoint]
    buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]

  http_in_progress:
    enabled: true
    name: "http_requests_in_progress"
    type: gauge
    description: "当前进行中的 HTTP 请求数"
    labels: []

  algorithm_executions:
    enabled: true
    name: "algorithm_executions_total"
    type: counter
    description: "算法执行总数"
    labels: [algorithm, status]

  algorithm_latency:
    enabled: true
    name: "algorithm_execution_duration_seconds"
    type: histogram
    description: "算法执行延迟"
    labels: [algorithm]
    buckets: [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 60]
# Custom metrics (algorithm team members add their own entries here).
# Each entry is keyed by an internal identifier and carries `name`, `type`
# (counter | histogram | gauge), `description`, `labels`, and, for
# histograms, `buckets`.
custom_metrics:
  # Example: prime-check result statistics.
  prime_check_results:
    name: "prime_check_results_total"
    type: counter
    description: "质数判断结果统计"
    labels: [is_prime]

  # Example: distribution of input number sizes.
  input_number_size:
    name: "input_number_size"
    type: histogram
    description: "输入数字大小分布"
    labels: []
    buckets: [10, 100, 1000, 10000, 100000, 1000000]

  # Asynchronous job metrics.
  jobs_created:
    name: "jobs_created_total"
    type: counter
    description: "创建的异步任务总数"
    labels: [algorithm]

  jobs_completed:
    name: "jobs_completed_total"
    type: counter
    description: "完成的异步任务总数"
    labels: [algorithm, status]

  job_execution_duration:
    name: "job_execution_duration_seconds"
    type: histogram
    description: "异步任务执行时间"
    labels: [algorithm]
    buckets: [0.1, 0.5, 1, 5, 10, 30, 60, 120, 300]

  webhook_deliveries:
    name: "webhook_deliveries_total"
    type: counter
    description: "Webhook 回调发送总数"
    labels: [status]

  # Queue monitoring metrics.
  job_queue_length:
    name: "job_queue_length"
    type: gauge
    description: "待处理任务队列长度"
    labels: [queue]

  job_oldest_waiting_seconds:
    name: "job_oldest_waiting_seconds"
    type: gauge
    description: "最长任务等待时间(秒)"
    labels: []

  job_recovered_total:
    name: "job_recovered_total"
    type: counter
    description: "回收的超时任务总数"
    labels: []

  # NOTE(review): unlike every other counter in this file, `name` here has no
  # `_total` suffix and does not match the entry key; the description also
  # does not mention prime checks. Left unchanged because callers may look
  # the metric up by name — confirm the intended name and wording.
  prime_check_total:
    name: "prime_check"
    type: counter
    description: "出现问题的次数"
    labels: [status]