"""Worker 进程模块 基于 Redis 队列的任务 Worker,支持分布式锁和全局并发控制。 """ import asyncio import logging import signal import sys from typing import Optional from .config import settings from .core.job_manager import JobManager from .core.logging import setup_logging from .core.tracing import set_request_id logger = logging.getLogger(__name__) class JobWorker: """任务 Worker 从 Redis 队列获取任务并执行,支持: - 分布式锁防止重复执行 - 全局并发控制 - 任务重试机制 - 锁续租机制 - 超时任务回收 - 优雅关闭 """ def __init__(self): self._job_manager: Optional[JobManager] = None self._running: bool = False self._current_job_id: Optional[str] = None self._current_lock_token: Optional[str] = None self._lock_renewal_task: Optional[asyncio.Task] = None self._sweeper_task: Optional[asyncio.Task] = None async def initialize(self) -> None: """初始化 Worker""" self._job_manager = JobManager() await self._job_manager.initialize() logger.info("Worker 初始化完成") async def shutdown(self) -> None: """关闭 Worker""" logger.info("Worker 正在关闭...") self._running = False # 取消回收器任务 if self._sweeper_task and not self._sweeper_task.done(): self._sweeper_task.cancel() try: await self._sweeper_task except asyncio.CancelledError: pass # 取消锁续租任务 if self._lock_renewal_task and not self._lock_renewal_task.done(): self._lock_renewal_task.cancel() try: await self._lock_renewal_task except asyncio.CancelledError: pass # 等待当前任务完成 if self._current_job_id: logger.info(f"等待当前任务完成: {self._current_job_id}") if self._job_manager: await self._job_manager.shutdown() logger.info("Worker 已关闭") async def run(self) -> None: """运行 Worker 主循环""" self._running = True logger.info( f"Worker 启动,轮询间隔: {settings.worker_poll_interval}s," f"最大并发: {settings.max_concurrent_jobs}" ) # 启动超时任务回收器 if settings.job_sweeper_enabled: self._sweeper_task = asyncio.create_task(self._sweeper_loop()) logger.info(f"超时任务回收器已启动,扫描间隔: {settings.job_sweeper_interval}s") while self._running: try: await self._process_next_job() except Exception as e: logger.error(f"Worker 循环异常: {e}", exc_info=True) await asyncio.sleep(settings.worker_poll_interval) async def _process_next_job(self) -> None: """处理下一个任务""" if not self._job_manager: logger.error("JobManager 未初始化") await asyncio.sleep(settings.worker_poll_interval) return # 从队列获取任务(转移式出队) job_id = await self._job_manager.dequeue_job(timeout=int(settings.worker_poll_interval)) if not job_id: return # 获取任务信息以提取 request_id job_data = await self._job_manager.get_job(job_id) if job_data: request_id = job_data.get("request_id") or job_id set_request_id(request_id) else: set_request_id(job_id) logger.info(f"从队列获取任务: {job_id}") # 尝试获取分布式锁(返回 token) lock_token = await self._job_manager.acquire_job_lock(job_id) if not lock_token: logger.warning(f"无法获取任务锁,任务可能正在被其他 Worker 执行: {job_id}") # 任务留在 processing 队列,等待回收器处理 return self._current_lock_token = lock_token # 启动锁续租协程 self._lock_renewal_task = asyncio.create_task(self._lock_renewal_loop(job_id, lock_token)) try: # 检查全局并发限制 if not await self._job_manager.can_execute(): logger.info(f"达到并发限制,任务 NACK 重新入队: {job_id}") await self._job_manager.nack_job(job_id, requeue=True) return # 增加并发计数 await self._job_manager.increment_concurrency() self._current_job_id = job_id try: # 执行任务 success = await self._execute_with_retry(job_id) if success: await self._job_manager.ack_job(job_id) else: await self._job_manager.increment_job_retry(job_id) await self._job_manager.nack_job(job_id, requeue=True) finally: # 减少并发计数 await self._job_manager.decrement_concurrency() self._current_job_id = None finally: # 停止锁续租 if self._lock_renewal_task and not self._lock_renewal_task.done(): self._lock_renewal_task.cancel() try: await self._lock_renewal_task except asyncio.CancelledError: pass self._lock_renewal_task = None # 释放分布式锁 await self._job_manager.release_job_lock(job_id, lock_token) self._current_lock_token = None async def _execute_with_retry(self, job_id: str) -> bool: """执行任务(带重试机制) Returns: bool: 任务是否成功执行 """ if not self._job_manager: return False try: # 执行任务 await asyncio.wait_for( self._job_manager.execute_job(job_id), timeout=settings.job_execution_timeout, ) return True except asyncio.TimeoutError: logger.error(f"任务执行超时: {job_id}") await self._handle_job_failure(job_id, "任务执行超时") return False except Exception as e: logger.error(f"任务执行异常: {job_id}, error={e}", exc_info=True) await self._handle_job_failure(job_id, str(e)) return False async def _handle_job_failure(self, job_id: str, error: str) -> None: """处理任务失败""" if not self._job_manager: return retry_count = await self._job_manager.increment_job_retry(job_id) if retry_count < settings.job_max_retries: logger.info(f"任务将重试 ({retry_count}/{settings.job_max_retries}): {job_id}") # 重新入队 await self._job_manager.enqueue_job(job_id) else: logger.error(f"任务达到最大重试次数,标记为失败: {job_id}") # 更新任务状态为失败 if self._job_manager._redis_client: key = f"job:{job_id}" await self._job_manager._redis_client.hset( key, mapping={ "status": "failed", "error": f"达到最大重试次数 ({settings.job_max_retries}): {error}", }, ) async def _lock_renewal_loop(self, job_id: str, lock_token: str) -> None: """锁续租协程 定期续租任务锁,防止长任务执行时锁过期。 Args: job_id: 任务 ID lock_token: 锁 token """ # 续租间隔为锁 TTL 的一半 interval = (settings.job_execution_timeout + settings.job_lock_buffer) / 2 while True: try: await asyncio.sleep(interval) if not self._job_manager: break if not await self._job_manager.renew_job_lock(job_id, lock_token): logger.error(f"锁续租失败,可能已被其他进程获取: {job_id}") break logger.debug(f"锁续租成功: {job_id}") except asyncio.CancelledError: logger.debug(f"锁续租协程已取消: {job_id}") break except Exception as e: logger.error(f"锁续租异常: {job_id}, error={e}") break async def _sweeper_loop(self) -> None: """超时任务回收协程 定期扫描处理中队列,回收超时任务,并收集队列监控指标。 """ while self._running: try: await asyncio.sleep(settings.job_sweeper_interval) if not self._job_manager: continue # 回收超时任务 recovered = await self._job_manager.recover_stale_jobs() if recovered > 0: logger.info(f"回收超时任务: {recovered} 个") # 记录回收指标 from .core.metrics_unified import incr await incr("job_recovered_total", None, recovered) # 收集队列监控指标 await self._job_manager.collect_queue_metrics() except asyncio.CancelledError: logger.debug("超时任务回收协程已取消") break except Exception as e: logger.error(f"超时任务回收异常: {e}") def setup_signal_handlers(worker: JobWorker, loop: asyncio.AbstractEventLoop) -> None: """设置信号处理器""" def signal_handler(sig: signal.Signals) -> None: logger.info(f"收到信号 {sig.name},准备关闭...") loop.create_task(worker.shutdown()) for sig in (signal.SIGTERM, signal.SIGINT): loop.add_signal_handler(sig, signal_handler, sig) async def main() -> None: """Worker 入口函数""" # 设置日志 setup_logging(level=settings.log_level, format_type=settings.log_format) worker = JobWorker() # 设置信号处理 loop = asyncio.get_running_loop() setup_signal_handlers(worker, loop) try: await worker.initialize() await worker.run() except Exception as e: logger.error(f"Worker 异常退出: {e}", exc_info=True) sys.exit(1) finally: await worker.shutdown() if __name__ == "__main__": asyncio.run(main())