变更内容: - 将 `redis` 客户端替换为 `redis.asyncio` 实现。 - 系统中同步方法调整为异步方法,提升事件循环效率。 - 在 `MetricsManager` 中添加异步初始化及关闭逻辑,避免阻塞问题。 - 更新便捷函数以支持异步上下文,并添加同步模式的兼容方法。 - 调整 Worker、JobManager、API 路由等模块,适配异步指标操作。 - 扩展单元测试,覆盖新增的异步方法及 Redis 操作逻辑。 - 简化 Dockerfile,取消开发依赖安装命令。
309 lines
10 KiB
Python
"""Worker 进程模块
|
||
|
||
基于 Redis 队列的任务 Worker,支持分布式锁和全局并发控制。
|
||
"""
|
||
|
||
import asyncio
|
||
import logging
|
||
import signal
|
||
import sys
|
||
from typing import Optional
|
||
|
||
from .config import settings
|
||
from .core.job_manager import JobManager
|
||
from .core.logging import setup_logging
|
||
from .core.tracing import set_request_id
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class JobWorker:
    """Job worker.

    Pulls job IDs from the Redis queue and executes them. Supports:
    - a distributed lock to prevent duplicate execution of the same job
    - a global concurrency limit across workers
    - a job retry mechanism
    - periodic lock renewal (lease extension) for long-running jobs
    - recovery (sweeping) of timed-out jobs
    - graceful shutdown
    """

    def __init__(self):
        # Lazily-created JobManager; set up in initialize().
        self._job_manager: Optional[JobManager] = None
        # Main-loop flag; cleared by shutdown() to stop run().
        self._running: bool = False
        # ID of the job currently being executed, if any.
        self._current_job_id: Optional[str] = None
        # Token of the distributed lock currently held, if any.
        self._current_lock_token: Optional[str] = None
        # Background task that periodically renews the job lock.
        self._lock_renewal_task: Optional[asyncio.Task] = None
        # Background task that sweeps timed-out jobs.
        self._sweeper_task: Optional[asyncio.Task] = None

    async def initialize(self) -> None:
        """Initialize the worker by creating and initializing its JobManager."""
        self._job_manager = JobManager()
        await self._job_manager.initialize()
        logger.info("Worker 初始化完成")

    async def shutdown(self) -> None:
        """Shut the worker down gracefully.

        Stops the main loop, cancels the sweeper and lock-renewal tasks
        (awaiting their cancellation), then shuts down the JobManager.
        """
        logger.info("Worker 正在关闭...")
        self._running = False

        # Cancel the sweeper task and wait for it to finish cancelling.
        if self._sweeper_task and not self._sweeper_task.done():
            self._sweeper_task.cancel()
            try:
                await self._sweeper_task
            except asyncio.CancelledError:
                pass

        # Cancel the lock-renewal task and wait for it to finish cancelling.
        if self._lock_renewal_task and not self._lock_renewal_task.done():
            self._lock_renewal_task.cancel()
            try:
                await self._lock_renewal_task
            except asyncio.CancelledError:
                pass

        # NOTE(review): this only logs that a job is in flight; it does not
        # actually wait for the job to complete — confirm that is intended.
        if self._current_job_id:
            logger.info(f"等待当前任务完成: {self._current_job_id}")

        if self._job_manager:
            await self._job_manager.shutdown()

        logger.info("Worker 已关闭")

    async def run(self) -> None:
        """Run the worker main loop until shutdown() clears the running flag."""
        self._running = True
        logger.info(
            f"Worker 启动,轮询间隔: {settings.worker_poll_interval}s,"
            f"最大并发: {settings.max_concurrent_jobs}"
        )

        # Start the timed-out-job sweeper if enabled.
        if settings.job_sweeper_enabled:
            self._sweeper_task = asyncio.create_task(self._sweeper_loop())
            logger.info(f"超时任务回收器已启动,扫描间隔: {settings.job_sweeper_interval}s")

        while self._running:
            try:
                await self._process_next_job()
            except Exception as e:
                logger.error(f"Worker 循环异常: {e}", exc_info=True)
                # Back off only on error; normal pacing comes from the
                # blocking dequeue timeout in _process_next_job.
                await asyncio.sleep(settings.worker_poll_interval)

    async def _process_next_job(self) -> None:
        """Dequeue and process a single job, managing lock and concurrency state."""
        if not self._job_manager:
            logger.error("JobManager 未初始化")
            await asyncio.sleep(settings.worker_poll_interval)
            return

        # Pop a job from the queue (move-style dequeue into a processing queue,
        # per the comments below; blocks up to the poll interval).
        job_id = await self._job_manager.dequeue_job(timeout=int(settings.worker_poll_interval))

        if not job_id:
            return

        # Fetch job data to propagate the originating request_id for tracing;
        # fall back to the job ID itself when the job record is missing.
        job_data = await self._job_manager.get_job(job_id)
        if job_data:
            request_id = job_data.get("request_id") or job_id
            set_request_id(request_id)
        else:
            set_request_id(job_id)

        logger.info(f"从队列获取任务: {job_id}")

        # Try to acquire the distributed lock (returns a token on success).
        lock_token = await self._job_manager.acquire_job_lock(job_id)
        if not lock_token:
            logger.warning(f"无法获取任务锁,任务可能正在被其他 Worker 执行: {job_id}")
            # The job stays in the processing queue for the sweeper to recover.
            return

        self._current_lock_token = lock_token

        # Start the lock-renewal coroutine so the lock does not expire
        # while a long job is executing.
        self._lock_renewal_task = asyncio.create_task(self._lock_renewal_loop(job_id, lock_token))

        try:
            # Respect the global concurrency limit.
            if not await self._job_manager.can_execute():
                logger.info(f"达到并发限制,任务 NACK 重新入队: {job_id}")
                await self._job_manager.nack_job(job_id, requeue=True)
                return

            # Reserve a concurrency slot.
            await self._job_manager.increment_concurrency()
            self._current_job_id = job_id

            try:
                # Execute the job (with timeout + failure handling inside).
                success = await self._execute_with_retry(job_id)
                if success:
                    await self._job_manager.ack_job(job_id)
                else:
                    # NOTE(review): on failure _execute_with_retry has already
                    # called _handle_job_failure, which increments the retry
                    # counter and may re-enqueue; incrementing and requeueing
                    # again here looks like double accounting — confirm
                    # against JobManager's nack/enqueue semantics.
                    await self._job_manager.increment_job_retry(job_id)
                    await self._job_manager.nack_job(job_id, requeue=True)
            finally:
                # Always release the concurrency slot.
                await self._job_manager.decrement_concurrency()
                self._current_job_id = None

        finally:
            # Stop lock renewal before releasing the lock.
            if self._lock_renewal_task and not self._lock_renewal_task.done():
                self._lock_renewal_task.cancel()
                try:
                    await self._lock_renewal_task
                except asyncio.CancelledError:
                    pass
            self._lock_renewal_task = None

            # Release the distributed lock (token-checked release).
            await self._job_manager.release_job_lock(job_id, lock_token)
            self._current_lock_token = None

    async def _execute_with_retry(self, job_id: str) -> bool:
        """Execute a job with a timeout; route failures to the retry handler.

        Args:
            job_id: ID of the job to execute.

        Returns:
            bool: True if the job executed successfully, False otherwise.
        """
        if not self._job_manager:
            return False

        try:
            # Execute the job, bounded by the configured execution timeout.
            await asyncio.wait_for(
                self._job_manager.execute_job(job_id),
                timeout=settings.job_execution_timeout,
            )
            return True
        except asyncio.TimeoutError:
            logger.error(f"任务执行超时: {job_id}")
            await self._handle_job_failure(job_id, "任务执行超时")
            return False
        except Exception as e:
            logger.error(f"任务执行异常: {job_id}, error={e}", exc_info=True)
            await self._handle_job_failure(job_id, str(e))
            return False

    async def _handle_job_failure(self, job_id: str, error: str) -> None:
        """Handle a failed job: re-enqueue while retries remain, else mark failed.

        Args:
            job_id: ID of the failed job.
            error: Human-readable description of the failure.
        """
        if not self._job_manager:
            return

        retry_count = await self._job_manager.increment_job_retry(job_id)

        if retry_count < settings.job_max_retries:
            logger.info(f"任务将重试 ({retry_count}/{settings.job_max_retries}): {job_id}")
            # Put the job back on the queue for another attempt.
            await self._job_manager.enqueue_job(job_id)
        else:
            logger.error(f"任务达到最大重试次数,标记为失败: {job_id}")
            # Mark the job record as failed directly in Redis.
            # NOTE(review): reaches into JobManager's private _redis_client;
            # consider exposing a public "mark failed" API on JobManager.
            if self._job_manager._redis_client:
                key = f"job:{job_id}"
                await self._job_manager._redis_client.hset(
                    key,
                    mapping={
                        "status": "failed",
                        "error": f"达到最大重试次数 ({settings.job_max_retries}): {error}",
                    },
                )

    async def _lock_renewal_loop(self, job_id: str, lock_token: str) -> None:
        """Lock-renewal coroutine.

        Periodically renews the job lock so it does not expire while a
        long-running job executes. Exits on cancellation, renewal failure,
        or any unexpected error.

        Args:
            job_id: Job ID whose lock is being renewed.
            lock_token: Lock token proving ownership.
        """
        # Renewal interval is half the lock TTL (execution timeout + buffer).
        interval = (settings.job_execution_timeout + settings.job_lock_buffer) / 2
        while True:
            try:
                await asyncio.sleep(interval)
                if not self._job_manager:
                    break
                if not await self._job_manager.renew_job_lock(job_id, lock_token):
                    logger.error(f"锁续租失败,可能已被其他进程获取: {job_id}")
                    break
                logger.debug(f"锁续租成功: {job_id}")
            except asyncio.CancelledError:
                logger.debug(f"锁续租协程已取消: {job_id}")
                break
            except Exception as e:
                logger.error(f"锁续租异常: {job_id}, error={e}")
                break

    async def _sweeper_loop(self) -> None:
        """Timed-out-job sweeper coroutine.

        Periodically scans the processing queue to recover timed-out jobs
        and collects queue monitoring metrics. Runs until the worker stops
        or the task is cancelled; unexpected errors are logged and the
        loop continues.
        """
        while self._running:
            try:
                await asyncio.sleep(settings.job_sweeper_interval)
                if not self._job_manager:
                    continue

                # Recover jobs whose processing deadline has passed.
                recovered = await self._job_manager.recover_stale_jobs()
                if recovered > 0:
                    logger.info(f"回收超时任务: {recovered} 个")
                    # Record the recovery metric (local import avoids a
                    # module-level import cycle — presumably; confirm).
                    from .core.metrics_unified import incr

                    await incr("job_recovered_total", None, recovered)

                # Collect queue monitoring metrics.
                await self._job_manager.collect_queue_metrics()

            except asyncio.CancelledError:
                logger.debug("超时任务回收协程已取消")
                break
            except Exception as e:
                logger.error(f"超时任务回收异常: {e}")
||
|
||
|
||
def setup_signal_handlers(worker: JobWorker, loop: asyncio.AbstractEventLoop) -> None:
    """Install SIGTERM/SIGINT handlers that trigger a graceful shutdown.

    Args:
        worker: The worker whose ``shutdown()`` coroutine is scheduled on signal.
        loop: The event loop on which the handlers are registered.
    """
    # Keep strong references to in-flight shutdown tasks: asyncio only holds
    # a weak reference to tasks, so a fire-and-forget create_task() result
    # may be garbage-collected before it completes.
    _shutdown_tasks: set = set()

    def signal_handler(sig: signal.Signals) -> None:
        logger.info(f"收到信号 {sig.name},准备关闭...")
        task = loop.create_task(worker.shutdown())
        _shutdown_tasks.add(task)
        task.add_done_callback(_shutdown_tasks.discard)

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler, sig)
|
||
|
||
|
||
async def main() -> None:
    """Entry point for the worker process.

    Configures logging, installs signal handlers, then initializes and
    runs the worker; the worker is always shut down on exit.
    """
    # Configure logging before anything else emits records.
    setup_logging(level=settings.log_level, format_type=settings.log_format)

    worker = JobWorker()

    # Register graceful-shutdown signal handlers on the running loop.
    setup_signal_handlers(worker, asyncio.get_running_loop())

    try:
        await worker.initialize()
        await worker.run()
    except Exception as e:
        logger.error(f"Worker 异常退出: {e}", exc_info=True)
        sys.exit(1)
    finally:
        # Always attempt an orderly shutdown, even on SystemExit.
        await worker.shutdown()
|
||
|
||
|
||
if __name__ == "__main__":
|
||
asyncio.run(main())
|