Files
FunctionalScaffold/src/functional_scaffold/worker.py
Roog (顾新培) a4d2ad1e93 main:采用异步 Redis 客户端优化指标管理模块
变更内容:
- 将 `redis` 客户端替换为 `redis.asyncio` 实现。
- 系统中同步方法调整为异步方法,提升事件循环效率。
- 在 `MetricsManager` 中添加异步初始化及关闭逻辑,避免阻塞问题。
- 更新便捷函数以支持异步上下文,并添加同步模式的兼容方法。
- 调整 Worker、JobManager、API 路由等模块,适配异步指标操作。
- 扩展单元测试,覆盖新增的异步方法及 Redis 操作逻辑。
- 简化 Dockerfile,取消开发依赖安装命令。
2026-02-03 19:54:22 +08:00

309 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Worker 进程模块
基于 Redis 队列的任务 Worker,支持分布式锁和全局并发控制。
"""
import asyncio
import logging
import signal
import sys
from typing import Optional
from .config import settings
from .core.job_manager import JobManager
from .core.logging import setup_logging
from .core.tracing import set_request_id
# Module-level logger, named after this module so log records can be traced
# back to the worker process code.
logger = logging.getLogger(__name__)
class JobWorker:
    """Queue-driven job worker.

    Pulls job IDs from the queue exposed by ``JobManager`` and executes them
    one at a time. Around each execution it:

    - takes a per-job distributed lock and renews it in the background,
    - checks and updates the global concurrency counter,
    - records failures and requeues the job until the retry budget is spent,
    - optionally runs a background sweeper that recovers stale jobs,
    - supports a graceful shutdown that waits for the in-flight job.
    """

    def __init__(self):
        self._job_manager: Optional[JobManager] = None
        self._running: bool = False
        self._current_job_id: Optional[str] = None
        self._current_lock_token: Optional[str] = None
        self._lock_renewal_task: Optional[asyncio.Task] = None
        self._sweeper_task: Optional[asyncio.Task] = None
        # Serialises concurrent shutdown() calls (signal handler + main's
        # ``finally``). Created lazily so it binds to the running loop.
        self._shutdown_lock: Optional[asyncio.Lock] = None
        self._shutdown_complete: bool = False

    async def initialize(self) -> None:
        """Create and initialise the ``JobManager`` used by this worker."""
        self._job_manager = JobManager()
        await self._job_manager.initialize()
        # Allow a worker that was shut down to be initialised and closed again.
        self._shutdown_complete = False
        logger.info("Worker 初始化完成")

    async def shutdown(self) -> None:
        """Stop the worker gracefully.

        Safe to call more than once and from concurrent tasks: calls are
        serialised and only the first one performs the teardown. The in-flight
        job (if any) is given time to finish before the ``JobManager`` is
        closed, so the job does not lose its lock renewal or its backend
        connection while it is still running.
        """
        if self._shutdown_lock is None:
            self._shutdown_lock = asyncio.Lock()

        async with self._shutdown_lock:
            if self._shutdown_complete:
                return

            logger.info("Worker 正在关闭...")
            # Makes run() and the sweeper leave their loops after the current
            # iteration.
            self._running = False

            await self._cancel_task(self._sweeper_task)

            if self._current_job_id:
                logger.info(f"等待当前任务完成: {self._current_job_id}")
            await self._wait_for_in_flight_job()

            # Normally already stopped by _process_next_job; this only matters
            # when the wait above timed out.
            await self._cancel_task(self._lock_renewal_task)

            if self._job_manager:
                await self._job_manager.shutdown()

            self._shutdown_complete = True
            logger.info("Worker 已关闭")

    @staticmethod
    async def _cancel_task(task: Optional[asyncio.Task]) -> None:
        """Cancel ``task`` (if it is still running) and wait for it to end."""
        if task is None or task.done():
            return
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

    async def _wait_for_in_flight_job(self) -> None:
        """Wait until the job currently being processed has released its lock.

        ``_current_lock_token`` is set right after the lock is acquired and
        cleared only after it has been released, so it covers the whole
        critical section of ``_process_next_job``. The wait is bounded by the
        execution timeout plus the lock buffer so shutdown cannot hang forever.
        """
        if self._current_lock_token is None:
            return

        loop = asyncio.get_running_loop()
        grace = settings.job_execution_timeout + settings.job_lock_buffer
        deadline = loop.time() + grace
        while self._current_lock_token is not None:
            if loop.time() >= deadline:
                logger.warning(f"等待当前任务超时,继续关闭: {self._current_job_id}")
                return
            await asyncio.sleep(0.1)

    async def run(self) -> None:
        """Run the main loop until ``shutdown()`` clears the running flag."""
        self._running = True
        logger.info(
            f"Worker 启动,轮询间隔: {settings.worker_poll_interval}s, "
            f"最大并发: {settings.max_concurrent_jobs}"
        )

        # Start the stale-job sweeper when enabled in settings.
        if settings.job_sweeper_enabled:
            self._sweeper_task = asyncio.create_task(self._sweeper_loop())
            logger.info(f"超时任务回收器已启动,扫描间隔: {settings.job_sweeper_interval}s")

        while self._running:
            try:
                await self._process_next_job()
            except Exception as exc:
                logger.error(f"Worker 循环异常: {exc}", exc_info=True)
                # Back off after an unexpected error instead of retrying
                # immediately.
                await asyncio.sleep(settings.worker_poll_interval)

    async def _process_next_job(self) -> None:
        """Dequeue one job and run it under the distributed lock.

        On success the job is ACKed. On failure the queue transition is
        performed by ``_handle_job_failure`` (called from
        ``_execute_with_retry``), so nothing more is done here. This avoids
        counting the retry twice and requeuing the job twice.
        """
        manager = self._job_manager
        if not manager:
            logger.error("JobManager 未初始化")
            await asyncio.sleep(settings.worker_poll_interval)
            return

        # Transfer-style dequeue from the pending queue.
        # NOTE(review): int() truncates a sub-second poll interval to 0; check
        # what a zero timeout means for dequeue_job.
        job_id = await manager.dequeue_job(timeout=int(settings.worker_poll_interval))
        if not job_id:
            return

        # Tag subsequent log records with the job's request id, falling back
        # to the job id when the job has none.
        job_data = await manager.get_job(job_id)
        request_id = (job_data.get("request_id") if job_data else None) or job_id
        set_request_id(request_id)
        logger.info(f"从队列获取任务: {job_id}")

        # The lock call returns a token on success and a falsy value otherwise.
        lock_token = await manager.acquire_job_lock(job_id)
        if not lock_token:
            logger.warning(f"无法获取任务锁,任务可能正在被其他 Worker 执行: {job_id}")
            # The job is left where the dequeue put it for the sweeper.
            return

        self._current_lock_token = lock_token
        # Keep the lock alive while the job runs.
        self._lock_renewal_task = asyncio.create_task(
            self._lock_renewal_loop(job_id, lock_token)
        )
        try:
            # Respect the global concurrency limit.
            # NOTE(review): the requeued job may be dequeued again at once
            # while the limit is still reached; consider a short back-off.
            if not await manager.can_execute():
                logger.info(f"达到并发限制,任务 NACK 重新入队: {job_id}")
                await manager.nack_job(job_id, requeue=True)
                return

            await manager.increment_concurrency()
            self._current_job_id = job_id
            try:
                if await self._execute_with_retry(job_id):
                    await manager.ack_job(job_id)
            finally:
                # Clear the marker first so it is reset even if the counter
                # update raises.
                self._current_job_id = None
                await manager.decrement_concurrency()
        finally:
            try:
                # Stop renewing, then release the lock.
                await self._cancel_task(self._lock_renewal_task)
                self._lock_renewal_task = None
                await manager.release_job_lock(job_id, lock_token)
            finally:
                # Always cleared, so shutdown() never waits on a stale token.
                self._current_lock_token = None

    async def _execute_with_retry(self, job_id: str) -> bool:
        """Run the job once, bounded by ``settings.job_execution_timeout``.

        Despite the name this makes a single attempt; retries happen by
        requeuing the job in ``_handle_job_failure``.

        Args:
            job_id: ID of the job to execute.

        Returns:
            bool: True if the job finished without raising; False if it timed
            out, raised, or no JobManager is available. On timeout or error
            the failure has already been recorded and the job requeued or
            marked failed.
        """
        if not self._job_manager:
            return False

        try:
            await asyncio.wait_for(
                self._job_manager.execute_job(job_id),
                timeout=settings.job_execution_timeout,
            )
        except asyncio.TimeoutError:
            logger.error(f"任务执行超时: {job_id}")
            await self._handle_job_failure(job_id, "任务执行超时")
            return False
        except Exception as exc:
            logger.error(f"任务执行异常: {job_id}, error={exc}", exc_info=True)
            await self._handle_job_failure(job_id, str(exc))
            return False
        return True

    async def _handle_job_failure(self, job_id: str, error: str) -> None:
        """Record a failed attempt and requeue the job or mark it failed.

        The retry counter is incremented exactly once per failed attempt.
        While the counter is below ``settings.job_max_retries`` the job is
        NACKed back onto the queue; otherwise its hash is updated with
        ``status=failed`` and it is NACKed without requeue.

        Args:
            job_id: ID of the failed job.
            error: Human-readable failure reason stored on the job.
        """
        manager = self._job_manager
        if not manager:
            return

        retry_count = await manager.increment_job_retry(job_id)

        if retry_count < settings.job_max_retries:
            logger.info(f"任务将重试 ({retry_count}/{settings.job_max_retries}): {job_id}")
            # A single requeue via NACK; the caller performs no further queue
            # transition for failed jobs.
            await manager.nack_job(job_id, requeue=True)
            return

        logger.error(f"任务达到最大重试次数,标记为失败: {job_id}")
        # NOTE(review): this reaches into JobManager's private Redis client; a
        # public "mark failed" method on JobManager would be cleaner.
        if manager._redis_client:
            await manager._redis_client.hset(
                f"job:{job_id}",
                mapping={
                    "status": "failed",
                    "error": f"达到最大重试次数 ({settings.job_max_retries}): {error}",
                },
            )
        # NOTE(review): assumes nack_job(requeue=False) takes the job out of
        # the processing queue without re-queuing it; confirm in JobManager.
        await manager.nack_job(job_id, requeue=False)

    async def _lock_renewal_loop(self, job_id: str, lock_token: str) -> None:
        """Periodically renew the job lock so long-running jobs keep it.

        The loop stops when renewal fails, when an error occurs, or when the
        task is cancelled.

        Args:
            job_id: ID of the job whose lock is renewed.
            lock_token: Token returned when the lock was acquired.
        """
        # Half of (execution timeout + lock buffer); presumably that sum is
        # the lock TTL. Verify against JobManager.
        interval = (settings.job_execution_timeout + settings.job_lock_buffer) / 2

        while True:
            try:
                await asyncio.sleep(interval)
                if not self._job_manager:
                    break
                renewed = await self._job_manager.renew_job_lock(job_id, lock_token)
                if not renewed:
                    logger.error(f"锁续租失败,可能已被其他进程获取: {job_id}")
                    break
                logger.debug(f"锁续租成功: {job_id}")
            except asyncio.CancelledError:
                logger.debug(f"锁续租协程已取消: {job_id}")
                break
            except Exception as exc:
                logger.error(f"锁续租异常: {job_id}, error={exc}")
                break

    async def _sweeper_loop(self) -> None:
        """Periodically recover stale jobs and collect queue metrics.

        Runs every ``settings.job_sweeper_interval`` seconds while the worker
        is running. Errors are logged and the loop continues; cancellation
        ends it.
        """
        while self._running:
            try:
                await asyncio.sleep(settings.job_sweeper_interval)
                if not self._job_manager:
                    continue

                recovered = await self._job_manager.recover_stale_jobs()
                if recovered > 0:
                    logger.info(f"回收超时任务: {recovered}")
                    # Imported here, as in the original code, rather than at
                    # module level.
                    from .core.metrics_unified import incr

                    await incr("job_recovered_total", None, recovered)

                await self._job_manager.collect_queue_metrics()
            except asyncio.CancelledError:
                logger.debug("超时任务回收协程已取消")
                break
            except Exception as exc:
                logger.error(f"超时任务回收异常: {exc}")
def setup_signal_handlers(worker: JobWorker, loop: asyncio.AbstractEventLoop) -> None:
    """Register SIGTERM/SIGINT handlers that trigger a graceful shutdown.

    Args:
        worker: Worker whose ``shutdown()`` coroutine is scheduled on a signal.
        loop: Event loop the handlers are installed on and the shutdown task
            runs in.

    Note:
        ``loop.add_signal_handler`` is only available on Unix event loops.
    """
    # The event loop keeps only weak references to tasks, so hold a strong
    # reference until each shutdown task has finished.
    pending_shutdowns = set()

    def _on_shutdown_done(task: asyncio.Task) -> None:
        pending_shutdowns.discard(task)
        if task.cancelled():
            return
        error = task.exception()
        if error is not None:
            # Retrieve the exception so it is reported instead of surfacing
            # as "Task exception was never retrieved".
            logger.error(f"Worker 关闭异常: {error}", exc_info=error)

    def signal_handler(sig: signal.Signals) -> None:
        logger.info(f"收到信号 {sig.name},准备关闭...")
        shutdown_task = loop.create_task(worker.shutdown())
        pending_shutdowns.add(shutdown_task)
        shutdown_task.add_done_callback(_on_shutdown_done)

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler, sig)
async def main() -> None:
    """Entry coroutine of the worker process.

    Configures logging, builds the worker, installs signal handlers on the
    running loop, then initialises and runs the worker. The worker is always
    shut down on the way out; an unexpected exception is logged and the
    process exits with status 1.
    """
    setup_logging(level=settings.log_level, format_type=settings.log_format)

    job_worker = JobWorker()
    setup_signal_handlers(job_worker, asyncio.get_running_loop())

    crashed = False
    try:
        await job_worker.initialize()
        await job_worker.run()
    except Exception as exc:
        logger.error(f"Worker 异常退出: {exc}", exc_info=True)
        crashed = True
    finally:
        # Runs on both the normal and the error path.
        await job_worker.shutdown()

    if crashed:
        sys.exit(1)
# Script entry point: start the worker on a fresh event loop when this module
# is executed directly.
if __name__ == "__main__":
    asyncio.run(main())