main:新增 Worker 支持及任务管理优化

变更内容:
- 添加 Worker 进程模块,支持基于 Redis 的任务管理及分布式锁。
- 增加 `entrypoint.sh` 启动脚本,支持根据 `RUN_MODE` 自动运行 API 或 Worker。
- 优化 `docker-compose.yml` 配置,添加镜像及平台支持。
- 在 JobManager 中集成 `request_id` 上下文传递,改进日志追踪功能。
- 扩展单元测试,提升测试覆盖率。
This commit is contained in:
2026-02-03 15:13:11 +08:00
parent 265e8d1e3d
commit 1ea0623a79
4 changed files with 223 additions and 6 deletions

12
deployment/entrypoint.sh Normal file
View File

@@ -0,0 +1,12 @@
#!/bin/bash
# Entrypoint: choose the process to launch from the RUN_MODE environment
# variable ("worker" starts the queue worker; anything else starts the
# HTTP API). `set -e` aborts on the first failing command.
set -e

case "$RUN_MODE" in
    worker)
        echo "启动 Worker 模式..."
        exec python -m functional_scaffold.worker
        ;;
    *)
        echo "启动 API 模式..."
        exec uvicorn functional_scaffold.main:app --host 0.0.0.0 --port 8000
        ;;
esac

View File

@@ -16,6 +16,7 @@ import redis.asyncio as aioredis
from ..algorithms.base import BaseAlgorithm from ..algorithms.base import BaseAlgorithm
from ..config import settings from ..config import settings
from ..core.metrics_unified import incr, observe from ..core.metrics_unified import incr, observe
from ..core.tracing import set_request_id
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -176,6 +177,7 @@ class JobManager:
"job_id": job_id, "job_id": job_id,
"status": job_data.get("status", ""), "status": job_data.get("status", ""),
"algorithm": job_data.get("algorithm", ""), "algorithm": job_data.get("algorithm", ""),
"request_id": job_data.get("request_id") or None,
"created_at": job_data.get("created_at", ""), "created_at": job_data.get("created_at", ""),
"started_at": job_data.get("started_at") or None, "started_at": job_data.get("started_at") or None,
"completed_at": job_data.get("completed_at") or None, "completed_at": job_data.get("completed_at") or None,
@@ -223,6 +225,11 @@ class JobManager:
algorithm_name = job_data.get("algorithm", "") algorithm_name = job_data.get("algorithm", "")
webhook_url = job_data.get("webhook", "") webhook_url = job_data.get("webhook", "")
request_id = job_data.get("request_id", "")
# 设置 request_id 上下文,确保日志中包含 request_id
if request_id:
set_request_id(request_id)
# 解析参数 # 解析参数
try: try:
@@ -234,7 +241,9 @@ class JobManager:
async with self._semaphore: async with self._semaphore:
# 更新状态为 running # 更新状态为 running
started_at = self._get_timestamp() started_at = self._get_timestamp()
await self._redis_client.hset(key, mapping={"status": "running", "started_at": started_at}) await self._redis_client.hset(
key, mapping={"status": "running", "started_at": started_at}
)
logger.info(f"开始执行任务: job_id={job_id}, algorithm={algorithm_name}") logger.info(f"开始执行任务: job_id={job_id}, algorithm={algorithm_name}")
@@ -295,7 +304,9 @@ class JobManager:
incr("jobs_completed_total", {"algorithm": algorithm_name, "status": status}) incr("jobs_completed_total", {"algorithm": algorithm_name, "status": status})
observe("job_execution_duration_seconds", {"algorithm": algorithm_name}, elapsed_time) observe("job_execution_duration_seconds", {"algorithm": algorithm_name}, elapsed_time)
logger.info(f"任务执行完成: job_id={job_id}, status={status}, elapsed={elapsed_time:.3f}s") logger.info(
f"任务执行完成: job_id={job_id}, status={status}, elapsed={elapsed_time:.3f}s"
)
# 发送 Webhook 回调 # 发送 Webhook 回调
if webhook_url: if webhook_url:

View File

@@ -0,0 +1,197 @@
"""Worker 进程模块
基于 Redis 队列的任务 Worker,支持分布式锁和全局并发控制。
"""
import asyncio
import logging
import signal
import sys
from typing import Optional
from .config import settings
from .core.job_manager import JobManager
from .core.logging import setup_logging
from .core.tracing import set_request_id
logger = logging.getLogger(__name__)
class JobWorker:
    """Task worker.

    Pulls job ids from the Redis-backed queue (via ``JobManager``) and
    executes them. Responsibilities visible in this module:

    - distributed per-job lock so two workers never run the same job
    - global concurrency gate checked before each execution
    - bounded retry on failure (``settings.job_max_retries``)
    - graceful shutdown driven by the ``_running`` flag
    """

    def __init__(self):
        # Created in initialize(); None until then.
        self._job_manager: Optional[JobManager] = None
        # Main-loop flag; cleared by shutdown() to stop run().
        self._running: bool = False
        # Id of the job currently executing, if any.
        self._current_job_id: Optional[str] = None

    async def initialize(self) -> None:
        """Create and initialize the underlying JobManager."""
        self._job_manager = JobManager()
        await self._job_manager.initialize()
        logger.info("Worker 初始化完成")

    async def shutdown(self) -> None:
        """Stop the main loop and shut down the JobManager.

        NOTE(review): this logs that it is waiting for the current job,
        but it does not actually await its completion before shutting the
        manager down — confirm whether in-flight jobs survive shutdown.
        """
        logger.info("Worker 正在关闭...")
        self._running = False
        if self._current_job_id:
            logger.info(f"等待当前任务完成: {self._current_job_id}")
        if self._job_manager:
            await self._job_manager.shutdown()
        logger.info("Worker 已关闭")

    async def run(self) -> None:
        """Run the main loop until shutdown() clears ``_running``."""
        self._running = True
        # Fixed: the two f-string fragments were concatenated without a
        # separator, so the log read "...{interval}s最大并发: ...".
        logger.info(
            f"Worker 启动,轮询间隔: {settings.worker_poll_interval}s,"
            f"最大并发: {settings.max_concurrent_jobs}"
        )
        while self._running:
            try:
                await self._process_next_job()
            except Exception as e:
                logger.error(f"Worker 循环异常: {e}", exc_info=True)
                # Back off after an unexpected error so a persistent
                # failure cannot spin the loop at full speed.
                await asyncio.sleep(settings.worker_poll_interval)

    async def _process_next_job(self) -> None:
        """Dequeue one job and execute it under lock and concurrency limits."""
        if not self._job_manager:
            logger.error("JobManager 未初始化")
            await asyncio.sleep(settings.worker_poll_interval)
            return

        # Blocking pop; returns a falsy value when the timeout elapses.
        job_id = await self._job_manager.dequeue_job(
            timeout=int(settings.worker_poll_interval)
        )
        if not job_id:
            return

        # Bind request_id (falling back to the job id) into the logging
        # context before any further log lines for this job.
        job_data = await self._job_manager.get_job(job_id)
        if job_data:
            request_id = job_data.get("request_id") or job_id
            set_request_id(request_id)
        else:
            set_request_id(job_id)
        logger.info(f"从队列获取任务: {job_id}")

        # Distributed lock: skip the job if another worker already holds it.
        if not await self._job_manager.acquire_job_lock(job_id):
            logger.warning(f"无法获取任务锁,任务可能正在被其他 Worker 执行: {job_id}")
            return
        try:
            # Global concurrency gate: requeue instead of running when full.
            if not await self._job_manager.can_execute():
                logger.info(f"达到并发限制,任务重新入队: {job_id}")
                await self._job_manager.enqueue_job(job_id)
                return

            await self._job_manager.increment_concurrency()
            self._current_job_id = job_id
            try:
                await self._execute_with_retry(job_id)
            finally:
                # Always give back the concurrency slot.
                await self._job_manager.decrement_concurrency()
                self._current_job_id = None
        finally:
            # Always release the distributed lock.
            await self._job_manager.release_job_lock(job_id)

    async def _execute_with_retry(self, job_id: str) -> None:
        """Execute one job under a timeout; route failures to retry handling."""
        if not self._job_manager:
            return
        try:
            await asyncio.wait_for(
                self._job_manager.execute_job(job_id),
                timeout=settings.job_execution_timeout,
            )
        except asyncio.TimeoutError:
            logger.error(f"任务执行超时: {job_id}")
            await self._handle_job_failure(job_id, "任务执行超时")
        except Exception as e:
            logger.error(f"任务执行异常: {job_id}, error={e}", exc_info=True)
            await self._handle_job_failure(job_id, str(e))

    async def _handle_job_failure(self, job_id: str, error: str) -> None:
        """Requeue a failed job, or mark it failed once retries are exhausted.

        :param job_id: id of the job that failed.
        :param error: human-readable failure reason stored on the job record.
        """
        if not self._job_manager:
            return
        retry_count = await self._job_manager.increment_job_retry(job_id)
        if retry_count < settings.job_max_retries:
            logger.info(f"任务将重试 ({retry_count}/{settings.job_max_retries}): {job_id}")
            await self._job_manager.enqueue_job(job_id)
        else:
            logger.error(f"任务达到最大重试次数,标记为失败: {job_id}")
            # NOTE(review): reaches into JobManager's private _redis_client;
            # a public "mark job failed" API on JobManager would be cleaner.
            if self._job_manager._redis_client:
                key = f"job:{job_id}"
                await self._job_manager._redis_client.hset(
                    key,
                    mapping={
                        "status": "failed",
                        "error": f"达到最大重试次数 ({settings.job_max_retries}): {error}",
                    },
                )
def setup_signal_handlers(worker: JobWorker, loop: asyncio.AbstractEventLoop) -> None:
    """Register SIGINT and SIGTERM handlers that trigger a graceful shutdown."""

    def _handle(sig: signal.Signals) -> None:
        # Signal callbacks must not block: schedule shutdown as a task instead.
        logger.info(f"收到信号 {sig.name},准备关闭...")
        loop.create_task(worker.shutdown())

    for sig in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(sig, _handle, sig)
async def main() -> None:
    """Worker entry point: configure logging, install signal handlers,
    then run the worker until it stops or fails."""
    # Configure logging before anything else emits records.
    setup_logging(level=settings.log_level, format_type=settings.log_format)

    worker = JobWorker()
    setup_signal_handlers(worker, asyncio.get_running_loop())

    try:
        await worker.initialize()
        await worker.run()
    except Exception as e:
        logger.error(f"Worker 异常退出: {e}", exc_info=True)
        sys.exit(1)
    finally:
        # Runs on normal exit, on error (after sys.exit raises SystemExit),
        # and on cancellation alike.
        await worker.shutdown()
if __name__ == "__main__":
    # Allows running the worker directly: python -m functional_scaffold.worker
    asyncio.run(main())

View File

@@ -1,17 +1,13 @@
"""异步任务管理器测试""" """异步任务管理器测试"""
import asyncio import asyncio
import json
import pytest import pytest
from unittest.mock import AsyncMock, MagicMock, patch from unittest.mock import AsyncMock, MagicMock, patch
from fastapi import status from fastapi import status
from functional_scaffold.core.job_manager import ( from functional_scaffold.core.job_manager import (
JobManager, JobManager,
get_job_manager,
shutdown_job_manager,
) )
from functional_scaffold.api.models import JobStatus
class TestJobManager: class TestJobManager:
@@ -188,6 +184,7 @@ class TestJobManagerWithMocks:
# 初始化 semaphore # 初始化 semaphore
import asyncio import asyncio
manager._semaphore = asyncio.Semaphore(10) manager._semaphore = asyncio.Semaphore(10)
await manager.execute_job("test-job-id") await manager.execute_job("test-job-id")