main:优化任务管理及队列监控性能

变更内容:
- 优化任务出队逻辑,采用 BLMOVE 提升队列操作的原子性和可靠性。
- 在 JobManager 中新增任务锁续租、超时任务回收、ACK/NACK 状态管理功能。
- 实现任务队列和死信队列监控指标收集,为系统性能分析提供数据支持。
- 扩展 Worker 模块,增加锁续租逻辑及任务回收调度。
- 更新测试用例,覆盖任务管理和队列指标的新增逻辑。
- 补充 metrics.yaml 文件,添加队列相关的监控指标定义。
- 更新依赖,补充 Redis 支持及相关库版本规范。
This commit is contained in:
2026-02-03 18:18:02 +08:00
parent 73bd66813c
commit 7b627090f3
8 changed files with 1318 additions and 46 deletions

View File

@@ -24,6 +24,8 @@ class JobWorker:
- 分布式锁防止重复执行
- 全局并发控制
- 任务重试机制
- 锁续租机制
- 超时任务回收
- 优雅关闭
"""
@@ -31,6 +33,9 @@ class JobWorker:
self._job_manager: Optional[JobManager] = None
self._running: bool = False
self._current_job_id: Optional[str] = None
self._current_lock_token: Optional[str] = None
self._lock_renewal_task: Optional[asyncio.Task] = None
self._sweeper_task: Optional[asyncio.Task] = None
async def initialize(self) -> None:
"""初始化 Worker"""
@@ -43,6 +48,22 @@ class JobWorker:
logger.info("Worker 正在关闭...")
self._running = False
# 取消回收器任务
if self._sweeper_task and not self._sweeper_task.done():
self._sweeper_task.cancel()
try:
await self._sweeper_task
except asyncio.CancelledError:
pass
# 取消锁续租任务
if self._lock_renewal_task and not self._lock_renewal_task.done():
self._lock_renewal_task.cancel()
try:
await self._lock_renewal_task
except asyncio.CancelledError:
pass
# 等待当前任务完成
if self._current_job_id:
logger.info(f"等待当前任务完成: {self._current_job_id}")
@@ -60,6 +81,11 @@ class JobWorker:
f"最大并发: {settings.max_concurrent_jobs}"
)
# 启动超时任务回收器
if settings.job_sweeper_enabled:
self._sweeper_task = asyncio.create_task(self._sweeper_loop())
logger.info(f"超时任务回收器已启动,扫描间隔: {settings.job_sweeper_interval}s")
while self._running:
try:
await self._process_next_job()
@@ -74,7 +100,7 @@ class JobWorker:
await asyncio.sleep(settings.worker_poll_interval)
return
# 从队列获取任务
# 从队列获取任务(转移式出队)
job_id = await self._job_manager.dequeue_job(timeout=int(settings.worker_poll_interval))
if not job_id:
@@ -90,16 +116,23 @@ class JobWorker:
logger.info(f"从队列获取任务: {job_id}")
# 尝试获取分布式锁
if not await self._job_manager.acquire_job_lock(job_id):
# 尝试获取分布式锁(返回 token)
lock_token = await self._job_manager.acquire_job_lock(job_id)
if not lock_token:
logger.warning(f"无法获取任务锁,任务可能正在被其他 Worker 执行: {job_id}")
# 任务留在 processing 队列,等待回收器处理
return
self._current_lock_token = lock_token
# 启动锁续租协程
self._lock_renewal_task = asyncio.create_task(self._lock_renewal_loop(job_id, lock_token))
try:
# 检查全局并发限制
if not await self._job_manager.can_execute():
logger.info(f"达到并发限制,任务重新入队: {job_id}")
await self._job_manager.enqueue_job(job_id)
logger.info(f"达到并发限制,任务 NACK 重新入队: {job_id}")
await self._job_manager.nack_job(job_id, requeue=True)
return
# 增加并发计数
@@ -108,20 +141,39 @@ class JobWorker:
try:
# 执行任务
await self._execute_with_retry(job_id)
success = await self._execute_with_retry(job_id)
if success:
await self._job_manager.ack_job(job_id)
else:
await self._job_manager.increment_job_retry(job_id)
await self._job_manager.nack_job(job_id, requeue=True)
finally:
# 减少并发计数
await self._job_manager.decrement_concurrency()
self._current_job_id = None
finally:
# 释放分布式锁
await self._job_manager.release_job_lock(job_id)
# 停止锁续租
if self._lock_renewal_task and not self._lock_renewal_task.done():
self._lock_renewal_task.cancel()
try:
await self._lock_renewal_task
except asyncio.CancelledError:
pass
self._lock_renewal_task = None
async def _execute_with_retry(self, job_id: str) -> None:
"""执行任务(带重试机制)"""
# 释放分布式锁
await self._job_manager.release_job_lock(job_id, lock_token)
self._current_lock_token = None
async def _execute_with_retry(self, job_id: str) -> bool:
"""执行任务(带重试机制)
Returns:
bool: 任务是否成功执行
"""
if not self._job_manager:
return
return False
try:
# 执行任务
@@ -129,12 +181,15 @@ class JobWorker:
self._job_manager.execute_job(job_id),
timeout=settings.job_execution_timeout,
)
return True
except asyncio.TimeoutError:
logger.error(f"任务执行超时: {job_id}")
await self._handle_job_failure(job_id, "任务执行超时")
return False
except Exception as e:
logger.error(f"任务执行异常: {job_id}, error={e}", exc_info=True)
await self._handle_job_failure(job_id, str(e))
return False
async def _handle_job_failure(self, job_id: str, error: str) -> None:
"""处理任务失败"""
@@ -160,6 +215,62 @@ class JobWorker:
},
)
async def _lock_renewal_loop(self, job_id: str, lock_token: str) -> None:
    """Keep a job's lock alive while the job is running.

    Sleeps for a fixed interval, then asks the job manager to renew the
    lock for ``job_id`` using ``lock_token``. The coroutine ends when the
    job manager is unset, a renewal is refused, any exception is raised,
    or the task is cancelled.

    Args:
        job_id: ID of the job whose lock is being renewed.
        lock_token: Token passed to ``renew_job_lock`` along with the job ID.
    """
    # Renew at half of (execution timeout + lock buffer), i.e. half of what
    # the original author describes as the lock TTL.
    renew_every = (settings.job_execution_timeout + settings.job_lock_buffer) / 2

    # Every failure path terminates the coroutine, so a single try around
    # the whole loop is enough.
    try:
        while True:
            await asyncio.sleep(renew_every)

            if not self._job_manager:
                return

            renewed = await self._job_manager.renew_job_lock(job_id, lock_token)
            if not renewed:
                logger.error(f"锁续租失败,可能已被其他进程获取: {job_id}")
                return

            logger.debug(f"锁续租成功: {job_id}")
    except asyncio.CancelledError:
        # Cancellation is absorbed here; the coroutine simply finishes.
        logger.debug(f"锁续租协程已取消: {job_id}")
    except Exception as exc:
        logger.error(f"锁续租异常: {job_id}, error={exc}")
async def _sweeper_loop(self) -> None:
"""Background loop that recovers stale jobs and collects queue metrics.

Runs while ``self._running`` is true. Each iteration sleeps for
``settings.job_sweeper_interval`` and then calls the job manager's
``recover_stale_jobs`` and ``collect_queue_metrics``.
"""
# NOTE(review): indentation is not preserved in this view, so the block
# nesting below is inferred — confirm against the real file.
while self._running:
try:
# Sleep first, so the first sweep runs one interval after start-up.
await asyncio.sleep(settings.job_sweeper_interval)
if not self._job_manager:
continue
# Recover stale jobs
recovered = await self._job_manager.recover_stale_jobs()
if recovered > 0:
logger.info(f"回收超时任务: {recovered}")
# Record the recovery metric
# NOTE(review): presumably only executed when recovered > 0 — confirm
from .core.metrics_unified import incr
incr("job_recovered_total", None, recovered)
# Collect queue monitoring metrics
await self._job_manager.collect_queue_metrics()
except asyncio.CancelledError:
logger.debug("超时任务回收协程已取消")
break
except Exception as e:
# Log and keep looping; unlike the cancellation branch there is no break.
logger.error(f"超时任务回收异常: {e}")
def setup_signal_handlers(worker: JobWorker, loop: asyncio.AbstractEventLoop) -> None:
"""设置信号处理器"""