main:采用异步 Redis 客户端优化指标管理模块

变更内容:
- 将 `redis` 客户端替换为 `redis.asyncio` 实现。
- 系统中同步方法调整为异步方法,提升事件循环效率。
- 在 `MetricsManager` 中添加异步初始化及关闭逻辑,避免阻塞问题。
- 更新便捷函数以支持异步上下文,并添加同步模式的兼容方法。
- 调整 Worker、JobManager、API 路由等模块,适配异步指标操作。
- 扩展单元测试,覆盖新增的异步方法及 Redis 操作逻辑。
- 简化 Dockerfile,取消开发依赖安装命令。
This commit is contained in:
2026-02-03 19:52:24 +08:00
parent b5ca0e0593
commit a4d2ad1e93
8 changed files with 435 additions and 218 deletions

View File

@@ -32,7 +32,7 @@ class BaseAlgorithm(ABC):
Returns:
Dict[str, Any]: 包含结果和元数据的字典
"""
from ..core.metrics_unified import incr, observe
from ..core.metrics_unified import incr_sync, observe_sync
start_time = time.time()
status = "success"
@@ -71,5 +71,7 @@ class BaseAlgorithm(ABC):
finally:
# 记录算法执行指标
elapsed_time = time.time() - start_time
incr("algorithm_executions_total", {"algorithm": self.name, "status": status})
observe("algorithm_execution_duration_seconds", {"algorithm": self.name}, elapsed_time)
incr_sync("algorithm_executions_total", {"algorithm": self.name, "status": status})
observe_sync(
"algorithm_execution_duration_seconds", {"algorithm": self.name}, elapsed_time
)

View File

@@ -2,7 +2,7 @@
from typing import Dict, Any, List
from .base import BaseAlgorithm
from ..core.metrics_unified import incr
from ..core.metrics_unified import incr_sync
class PrimeChecker(BaseAlgorithm):
@@ -31,12 +31,12 @@ class PrimeChecker(BaseAlgorithm):
ValueError: 如果输入不是整数
"""
if not isinstance(number, int):
incr('prime_check',{"status":"invalid_input"})
incr_sync('prime_check', {"status": "invalid_input"})
raise ValueError(f"Input must be an integer, got {type(number).__name__}")
# 小于2的数不是质数
if number < 2:
incr('prime_check', {"status": "number_little_two"})
incr_sync('prime_check', {"status": "number_little_two"})
return {
"number": number,
"is_prime": False,
@@ -50,7 +50,7 @@ class PrimeChecker(BaseAlgorithm):
# 如果不是质数,计算因数
factors = [] if is_prime else self._get_factors(number)
incr('prime_check', {"status": "success"})
incr_sync('prime_check', {"status": "success"})
return {
"number": number,
"is_prime": is_prime,

View File

@@ -168,7 +168,7 @@ return 0
await self._redis_client.hset(key, mapping=job_data)
# 记录指标
incr("jobs_created_total", {"algorithm": algorithm})
await incr("jobs_created_total", {"algorithm": algorithm})
logger.info(f"任务已创建: job_id={job_id}, algorithm={algorithm}")
return job_id
@@ -320,8 +320,10 @@ return 0
await self._redis_client.expire(key, settings.job_result_ttl)
# 记录指标
incr("jobs_completed_total", {"algorithm": algorithm_name, "status": status})
observe("job_execution_duration_seconds", {"algorithm": algorithm_name}, elapsed_time)
await incr("jobs_completed_total", {"algorithm": algorithm_name, "status": status})
await observe(
"job_execution_duration_seconds", {"algorithm": algorithm_name}, elapsed_time
)
logger.info(
f"任务执行完成: job_id={job_id}, status={status}, elapsed={elapsed_time:.3f}s"
@@ -372,7 +374,7 @@ return 0
)
if response.status_code < 400:
incr("webhook_deliveries_total", {"status": "success"})
await incr("webhook_deliveries_total", {"status": "success"})
logger.info(
f"Webhook 发送成功: job_id={job_id}, url={webhook_url}, "
f"status_code={response.status_code}"
@@ -395,7 +397,7 @@ return 0
await asyncio.sleep(delay)
# 所有重试都失败
incr("webhook_deliveries_total", {"status": "failed"})
await incr("webhook_deliveries_total", {"status": "failed"})
logger.error(f"Webhook 发送最终失败: job_id={job_id}, url={webhook_url}")
def is_available(self) -> bool:
@@ -814,10 +816,10 @@ return 0
# 更新指标
from .metrics_unified import set as metrics_set
metrics_set("job_queue_length", {"queue": "pending"}, queue_length)
metrics_set("job_queue_length", {"queue": "processing"}, processing_length)
metrics_set("job_queue_length", {"queue": "dlq"}, dlq_length)
metrics_set("job_oldest_waiting_seconds", None, oldest_waiting_seconds)
await metrics_set("job_queue_length", {"queue": "pending"}, queue_length)
await metrics_set("job_queue_length", {"queue": "processing"}, processing_length)
await metrics_set("job_queue_length", {"queue": "dlq"}, dlq_length)
await metrics_set("job_oldest_waiting_seconds", None, oldest_waiting_seconds)
return {
"queue_length": queue_length,

View File

@@ -1,19 +1,21 @@
"""统一指标管理模块
基于 Redis 的指标收集方案,支持多实例部署和 YAML 配置。
使用异步 Redis 客户端,避免在异步请求路径中阻塞事件循环。
"""
import os
import re
import socket
import logging
import asyncio
from pathlib import Path
from typing import Any, Dict, List, Optional
from functools import wraps
import time
import yaml
import redis
import redis.asyncio as aioredis
logger = logging.getLogger(__name__)
@@ -22,7 +24,7 @@ class MetricsManager:
"""统一指标管理器
支持从 YAML 配置文件加载指标定义,使用 Redis 存储指标数据,
并导出 Prometheus 格式的指标。
并导出 Prometheus 格式的指标。使用异步 Redis 客户端。
"""
def __init__(self, config_path: Optional[str] = None):
@@ -37,16 +39,22 @@ class MetricsManager:
self.instance_id = settings.metrics_instance_id or socket.gethostname()
self.config: Dict[str, Any] = {}
self.metrics_definitions: Dict[str, Dict[str, Any]] = {}
self._redis_client: Optional[redis.Redis] = None
self._redis_client: Optional[aioredis.Redis] = None
self._redis_available = False
self._initialized = False
# 加载配置
# 加载配置(同步操作)
self._load_config()
# 初始化 Redis 连接
self._init_redis()
# 注册指标定义
# 注册指标定义(同步操作)
self._register_metrics()
async def initialize(self) -> None:
"""异步初始化 Redis 连接"""
if self._initialized:
return
await self._init_redis()
self._initialized = True
def _load_config(self) -> None:
"""加载 YAML 配置文件"""
# 尝试多个路径
@@ -138,8 +146,8 @@ class MetricsManager:
"custom_metrics": {},
}
def _init_redis(self) -> None:
"""初始化 Redis 连接"""
async def _init_redis(self) -> None:
"""异步初始化 Redis 连接"""
from ..config import settings
redis_config = self.config.get("redis", {})
@@ -149,7 +157,7 @@ class MetricsManager:
password = redis_config.get("password") or settings.redis_password
try:
self._redis_client = redis.Redis(
self._redis_client = aioredis.Redis(
host=host,
port=port,
db=db,
@@ -159,10 +167,10 @@ class MetricsManager:
socket_timeout=5,
)
# 测试连接
self._redis_client.ping()
await self._redis_client.ping()
self._redis_available = True
logger.info(f"Redis 连接成功: {host}:{port}/{db}")
except redis.ConnectionError as e:
except aioredis.ConnectionError as e:
logger.warning(f"Redis 连接失败: {e},指标将不会被收集")
self._redis_available = False
except Exception as e:
@@ -235,7 +243,9 @@ class MetricsManager:
# === 简单 API业务代码使用===
def incr(self, name: str, labels: Optional[Dict[str, str]] = None, value: int = 1) -> None:
async def incr(
self, name: str, labels: Optional[Dict[str, str]] = None, value: int = 1
) -> None:
"""增加计数器
Args:
@@ -252,11 +262,13 @@ class MetricsManager:
try:
key = f"metrics:counter:{name}"
field = self._labels_to_key(labels) or "_default_"
self._redis_client.hincrbyfloat(key, field, value)
await self._redis_client.hincrbyfloat(key, field, value)
except Exception as e:
logger.error(f"增加计数器失败: {e}")
def set(self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
async def set(
self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 0
) -> None:
"""设置仪表盘值
Args:
@@ -273,11 +285,11 @@ class MetricsManager:
try:
key = f"metrics:gauge:{name}"
field = self._labels_to_key(labels) or "_default_"
self._redis_client.hset(key, field, value)
await self._redis_client.hset(key, field, value)
except Exception as e:
logger.error(f"设置仪表盘失败: {e}")
def gauge_incr(
async def gauge_incr(
self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 1
) -> None:
"""增加仪表盘值
@@ -296,11 +308,11 @@ class MetricsManager:
try:
key = f"metrics:gauge:{name}"
field = self._labels_to_key(labels) or "_default_"
self._redis_client.hincrbyfloat(key, field, value)
await self._redis_client.hincrbyfloat(key, field, value)
except Exception as e:
logger.error(f"增加仪表盘失败: {e}")
def gauge_decr(
async def gauge_decr(
self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 1
) -> None:
"""减少仪表盘值
@@ -310,9 +322,11 @@ class MetricsManager:
labels: 标签字典
value: 减少的值
"""
self.gauge_incr(name, labels, -value)
await self.gauge_incr(name, labels, -value)
def observe(self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
async def observe(
self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 0
) -> None:
"""记录直方图观测值
Args:
@@ -348,13 +362,13 @@ class MetricsManager:
# +Inf 桶总是增加
pipe.hincrbyfloat(f"metrics:histogram:{name}:bucket:+Inf", label_key, 1)
pipe.execute()
await pipe.execute()
except Exception as e:
logger.error(f"记录直方图失败: {e}")
# === 导出方法 ===
def export(self) -> str:
async def export(self) -> str:
"""导出 Prometheus 格式指标
Returns:
@@ -375,11 +389,11 @@ class MetricsManager:
lines.append(f"# TYPE {name} {metric_type}")
if metric_type == "counter":
lines.extend(self._export_counter(name))
lines.extend(await self._export_counter(name))
elif metric_type == "gauge":
lines.extend(self._export_gauge(name))
lines.extend(await self._export_gauge(name))
elif metric_type == "histogram":
lines.extend(self._export_histogram(name, definition))
lines.extend(await self._export_histogram(name, definition))
lines.append("") # 空行分隔
@@ -389,12 +403,12 @@ class MetricsManager:
return "\n".join(lines)
def _export_counter(self, name: str) -> List[str]:
async def _export_counter(self, name: str) -> List[str]:
"""导出计数器指标"""
lines = []
key = f"metrics:counter:{name}"
data = self._redis_client.hgetall(key)
data = await self._redis_client.hgetall(key)
for field, value in data.items():
if field == "_default_":
lines.append(f"{name} {value}")
@@ -404,12 +418,12 @@ class MetricsManager:
return lines
def _export_gauge(self, name: str) -> List[str]:
async def _export_gauge(self, name: str) -> List[str]:
"""导出仪表盘指标"""
lines = []
key = f"metrics:gauge:{name}"
data = self._redis_client.hgetall(key)
data = await self._redis_client.hgetall(key)
for field, value in data.items():
if field == "_default_":
lines.append(f"{name} {value}")
@@ -419,14 +433,14 @@ class MetricsManager:
return lines
def _export_histogram(self, name: str, definition: Dict[str, Any]) -> List[str]:
async def _export_histogram(self, name: str, definition: Dict[str, Any]) -> List[str]:
"""导出直方图指标"""
lines = []
buckets = definition.get("buckets", [])
# 获取所有标签组合
count_data = self._redis_client.hgetall(f"metrics:histogram:{name}:count")
sum_data = self._redis_client.hgetall(f"metrics:histogram:{name}:sum")
count_data = await self._redis_client.hgetall(f"metrics:histogram:{name}:count")
sum_data = await self._redis_client.hgetall(f"metrics:histogram:{name}:sum")
for label_key in count_data.keys():
prom_labels = self._key_to_prometheus_labels(label_key)
@@ -434,7 +448,7 @@ class MetricsManager:
# 导出各个桶
for bucket in buckets:
bucket_key = f"metrics:histogram:{name}:bucket:{bucket}"
bucket_value = self._redis_client.hget(bucket_key, label_key) or "0"
bucket_value = await self._redis_client.hget(bucket_key, label_key) or "0"
if label_key == "_default_":
lines.append(f'{name}_bucket{{le="{bucket}"}} {bucket_value}')
else:
@@ -442,7 +456,7 @@ class MetricsManager:
# +Inf 桶
inf_key = f"metrics:histogram:{name}:bucket:+Inf"
inf_value = self._redis_client.hget(inf_key, label_key) or "0"
inf_value = await self._redis_client.hget(inf_key, label_key) or "0"
if label_key == "_default_":
lines.append(f'{name}_bucket{{le="+Inf"}} {inf_value}')
else:
@@ -464,43 +478,79 @@ class MetricsManager:
"""检查 Redis 是否可用"""
return self._redis_available
def reset(self) -> None:
async def reset(self) -> None:
"""重置所有指标(主要用于测试)"""
if not self._redis_available:
return
try:
# 删除所有指标相关的 key
keys = self._redis_client.keys("metrics:*")
keys = await self._redis_client.keys("metrics:*")
if keys:
self._redis_client.delete(*keys)
await self._redis_client.delete(*keys)
logger.info("已重置所有指标")
except Exception as e:
logger.error(f"重置指标失败: {e}")
async def close(self) -> None:
"""关闭 Redis 连接"""
if self._redis_client:
await self._redis_client.close()
self._redis_client = None
self._redis_available = False
self._initialized = False
# 全局单例
_manager: Optional[MetricsManager] = None
_manager_lock = asyncio.Lock()
def get_metrics_manager() -> MetricsManager:
"""获取指标管理器单例"""
async def get_metrics_manager() -> MetricsManager:
"""获取指标管理器单例(异步)"""
global _manager
if _manager is None:
async with _manager_lock:
if _manager is None:
_manager = MetricsManager()
await _manager.initialize()
elif not _manager._initialized:
await _manager.initialize()
return _manager
def get_metrics_manager_sync() -> MetricsManager:
"""获取指标管理器单例(同步,仅用于非异步上下文)
注意:此方法不会初始化 Redis 连接,需要在异步上下文中调用 initialize()
"""
global _manager
if _manager is None:
_manager = MetricsManager()
return _manager
def reset_metrics_manager() -> None:
async def reset_metrics_manager() -> None:
"""重置指标管理器单例(主要用于测试)"""
global _manager
if _manager is not None:
await _manager.close()
_manager = None
def reset_metrics_manager_sync() -> None:
"""同步重置指标管理器单例(主要用于测试)
注意:此方法不会关闭 Redis 连接,仅重置单例引用
"""
global _manager
_manager = None
# === 便捷函数(业务代码直接调用)===
def incr(name: str, labels: Optional[Dict[str, str]] = None, value: int = 1) -> None:
async def incr(name: str, labels: Optional[Dict[str, str]] = None, value: int = 1) -> None:
"""增加计数器 - 便捷函数
Args:
@@ -508,10 +558,11 @@ def incr(name: str, labels: Optional[Dict[str, str]] = None, value: int = 1) ->
labels: 标签字典
value: 增加的值,默认为 1
"""
get_metrics_manager().incr(name, labels, value)
manager = await get_metrics_manager()
await manager.incr(name, labels, value)
def set(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
async def set(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
"""设置仪表盘 - 便捷函数
Args:
@@ -519,10 +570,13 @@ def set(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) ->
labels: 标签字典
value: 设置的值
"""
get_metrics_manager().set(name, labels, value)
manager = await get_metrics_manager()
await manager.set(name, labels, value)
def gauge_incr(name: str, labels: Optional[Dict[str, str]] = None, value: float = 1) -> None:
async def gauge_incr(
name: str, labels: Optional[Dict[str, str]] = None, value: float = 1
) -> None:
"""增加仪表盘 - 便捷函数
Args:
@@ -530,10 +584,13 @@ def gauge_incr(name: str, labels: Optional[Dict[str, str]] = None, value: float
labels: 标签字典
value: 增加的值
"""
get_metrics_manager().gauge_incr(name, labels, value)
manager = await get_metrics_manager()
await manager.gauge_incr(name, labels, value)
def gauge_decr(name: str, labels: Optional[Dict[str, str]] = None, value: float = 1) -> None:
async def gauge_decr(
name: str, labels: Optional[Dict[str, str]] = None, value: float = 1
) -> None:
"""减少仪表盘 - 便捷函数
Args:
@@ -541,10 +598,13 @@ def gauge_decr(name: str, labels: Optional[Dict[str, str]] = None, value: float
labels: 标签字典
value: 减少的值
"""
get_metrics_manager().gauge_decr(name, labels, value)
manager = await get_metrics_manager()
await manager.gauge_decr(name, labels, value)
def observe(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
async def observe(
name: str, labels: Optional[Dict[str, str]] = None, value: float = 0
) -> None:
"""记录直方图 - 便捷函数
Args:
@@ -552,21 +612,105 @@ def observe(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0
labels: 标签字典
value: 观测值
"""
get_metrics_manager().observe(name, labels, value)
manager = await get_metrics_manager()
await manager.observe(name, labels, value)
def export() -> str:
async def export() -> str:
"""导出指标 - 便捷函数
Returns:
Prometheus 文本格式的指标字符串
"""
return get_metrics_manager().export()
manager = await get_metrics_manager()
return await manager.export()
def is_available() -> bool:
async def is_available() -> bool:
"""检查 Redis 是否可用 - 便捷函数"""
return get_metrics_manager().is_available()
manager = await get_metrics_manager()
return manager.is_available()
# === 同步便捷函数(用于同步代码中的 fire-and-forget 模式)===
def _schedule_async(coro) -> None:
"""在后台调度异步协程fire-and-forget 模式)
如果当前没有运行的事件循环,则静默忽略。
"""
try:
loop = asyncio.get_running_loop()
loop.create_task(coro)
except RuntimeError:
# 没有运行的事件循环,静默忽略
pass
def incr_sync(
name: str, labels: Optional[Dict[str, str]] = None, value: int = 1
) -> None:
"""增加计数器 - 同步便捷函数fire-and-forget
Args:
name: 指标名称
labels: 标签字典
value: 增加的值,默认为 1
"""
_schedule_async(incr(name, labels, value))
def set_sync(
name: str, labels: Optional[Dict[str, str]] = None, value: float = 0
) -> None:
"""设置仪表盘 - 同步便捷函数fire-and-forget
Args:
name: 指标名称
labels: 标签字典
value: 设置的值
"""
_schedule_async(set(name, labels, value))
def gauge_incr_sync(
name: str, labels: Optional[Dict[str, str]] = None, value: float = 1
) -> None:
"""增加仪表盘 - 同步便捷函数fire-and-forget
Args:
name: 指标名称
labels: 标签字典
value: 增加的值
"""
_schedule_async(gauge_incr(name, labels, value))
def gauge_decr_sync(
name: str, labels: Optional[Dict[str, str]] = None, value: float = 1
) -> None:
"""减少仪表盘 - 同步便捷函数fire-and-forget
Args:
name: 指标名称
labels: 标签字典
value: 减少的值
"""
_schedule_async(gauge_decr(name, labels, value))
def observe_sync(
name: str, labels: Optional[Dict[str, str]] = None, value: float = 0
) -> None:
"""记录直方图 - 同步便捷函数fire-and-forget
Args:
name: 指标名称
labels: 标签字典
value: 观测值
"""
_schedule_async(observe(name, labels, value))
# === 装饰器(兼容旧 API===
@@ -593,8 +737,11 @@ def track_algorithm_execution(algorithm_name: str):
raise e
finally:
elapsed = time.time() - start_time
incr("algorithm_executions_total", {"algorithm": algorithm_name, "status": status})
observe(
incr_sync(
"algorithm_executions_total",
{"algorithm": algorithm_name, "status": status},
)
observe_sync(
"algorithm_execution_duration_seconds",
{"algorithm": algorithm_name},
elapsed,

View File

@@ -95,7 +95,7 @@ async def track_metrics(request: Request, call_next):
if request.url.path in skip_paths:
return await call_next(request)
gauge_incr("http_requests_in_progress")
await gauge_incr("http_requests_in_progress")
start_time = time.time()
status = "success"
@@ -112,16 +112,16 @@ async def track_metrics(request: Request, call_next):
elapsed = time.time() - start_time
# 使用规范化后的路径记录指标
normalized_path = normalize_path(request.url.path)
incr(
await incr(
"http_requests_total",
{"method": request.method, "endpoint": normalized_path, "status": status},
)
observe(
await observe(
"http_request_duration_seconds",
{"method": request.method, "endpoint": normalized_path},
elapsed,
)
gauge_decr("http_requests_in_progress")
await gauge_decr("http_requests_in_progress")
# 注册路由
@@ -145,7 +145,7 @@ async def metrics():
return Response(content="Metrics disabled", status_code=404)
return Response(
content=export(),
content=await export(),
media_type="text/plain; version=0.0.4; charset=utf-8",
)
@@ -160,7 +160,7 @@ async def startup_event():
# 初始化指标管理器
if settings.metrics_enabled:
manager = get_metrics_manager()
manager = await get_metrics_manager()
if manager.is_available():
logger.info("Redis 指标收集已启用")
else:

View File

@@ -260,7 +260,7 @@ class JobWorker:
# 记录回收指标
from .core.metrics_unified import incr
incr("job_recovered_total", None, recovered)
await incr("job_recovered_total", None, recovered)
# 收集队列监控指标
await self._job_manager.collect_queue_metrics()