变更内容:
- 优化任务出队逻辑,采用 BLMOVE 提升队列操作的原子性和可靠性。
- 在 JobManager 中新增任务锁续租、超时任务回收、ACK/NACK 状态管理功能。
- 实现任务队列和死信队列监控指标收集,为系统性能分析提供数据支持。
- 扩展 Worker 模块,增加锁续租逻辑及任务回收调度。
- 更新测试用例,覆盖任务管理和队列指标的新增逻辑。
- 补充 metrics.yaml 文件,添加队列相关的监控指标定义。
- 更新依赖,补充 Redis 支持及相关库版本规范。
121 lines
3.0 KiB
YAML
121 lines
3.0 KiB
YAML
# 指标配置文件
# 算法成员可以在此添加自定义指标

# Redis connection settings (can also be overridden via environment variables).
# NOTE(review): the ${VAR:default} placeholders are kept unquoted exactly as
# written; confirm how the config loader expands them before quoting or
# retyping any of these values.
redis:
  host: ${REDIS_HOST:localhost}
  port: ${REDIS_PORT:6379}
  db: ${REDIS_METRICS_DB:0}
  password: ${REDIS_PASSWORD:}

# Global settings.
global:
  prefix: "functional_scaffold"  # metric name prefix
  instance_label: true  # whether to add an instance label

# Built-in metrics (collected automatically by the framework).
# Each entry carries an enabled flag, the metric name, its type
# (counter / histogram / gauge), a description and a label list;
# histograms additionally list their bucket boundaries.
builtin_metrics:
  # Total number of HTTP requests.
  http_requests:
    enabled: true
    name: "http_requests_total"
    type: counter
    description: "HTTP 请求总数"
    labels: [method, endpoint, status]

  # HTTP request latency (the metric name indicates seconds).
  http_latency:
    enabled: true
    name: "http_request_duration_seconds"
    type: histogram
    description: "HTTP 请求延迟"
    labels: [method, endpoint]
    buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]

  # Number of HTTP requests currently in progress.
  http_in_progress:
    enabled: true
    name: "http_requests_in_progress"
    type: gauge
    description: "当前进行中的 HTTP 请求数"
    labels: []

  # Total number of algorithm executions.
  algorithm_executions:
    enabled: true
    name: "algorithm_executions_total"
    type: counter
    description: "算法执行总数"
    labels: [algorithm, status]

  # Algorithm execution latency (the metric name indicates seconds).
  algorithm_latency:
    enabled: true
    name: "algorithm_execution_duration_seconds"
    type: histogram
    description: "算法执行延迟"
    labels: [algorithm]
    buckets: [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 60]

# Custom metrics (algorithm team members add their own entries here).
# Each entry carries the metric name, its type (counter / histogram / gauge),
# a description and a label list; histograms additionally list their bucket
# boundaries.
custom_metrics:
  # Example: counts of prime-check results.
  prime_check_results:
    name: "prime_check_results_total"
    type: counter
    description: "质数判断结果统计"
    labels: [is_prime]

  # Example: distribution of input number sizes.
  input_number_size:
    name: "input_number_size"
    type: histogram
    description: "输入数字大小分布"
    labels: []
    buckets: [10, 100, 1000, 10000, 100000, 1000000]

  # Async job metrics.
  jobs_created:
    name: "jobs_created_total"
    type: counter
    description: "创建的异步任务总数"
    labels: [algorithm]

  jobs_completed:
    name: "jobs_completed_total"
    type: counter
    description: "完成的异步任务总数"
    labels: [algorithm, status]

  # Job execution time (the metric name indicates seconds).
  job_execution_duration:
    name: "job_execution_duration_seconds"
    type: histogram
    description: "异步任务执行时间"
    labels: [algorithm]
    buckets: [0.1, 0.5, 1, 5, 10, 30, 60, 120, 300]

  # Total number of webhook callbacks sent.
  webhook_deliveries:
    name: "webhook_deliveries_total"
    type: counter
    description: "Webhook 回调发送总数"
    labels: [status]

  # Queue monitoring metrics.
  # Length of the pending-job queue, labelled per queue.
  job_queue_length:
    name: "job_queue_length"
    type: gauge
    description: "待处理任务队列长度"
    labels: [queue]

  # Longest time a job has been waiting, in seconds.
  job_oldest_waiting_seconds:
    name: "job_oldest_waiting_seconds"
    type: gauge
    description: "最长任务等待时间(秒)"
    labels: []

  # Total number of timed-out jobs that were recovered.
  job_recovered_total:
    name: "job_recovered_total"
    type: counter
    description: "回收的超时任务总数"
    labels: []

  # NOTE(review): the key ends in `_total` but the metric name is "prime_check",
  # unlike the other counters in this file whose names end in `_total`. The
  # name is left as-is because renaming changes the emitted metric; confirm
  # the intended name and whether the description (roughly "number of times a
  # problem occurred") matches what this counter records.
  prime_check_total:
    name: "prime_check"
    type: counter
    description: "出现问题的次数"
    labels: [status]