main:新增健康检查支持和服务优化

- 在 Worker 中引入轻量级 HTTP 服务器,支持健康检查和就绪检查端点。
- 在 Kubernetes 和 Docker 配置中新增健康检查探针,提升服务稳定性。
- 更新依赖,引入 `aiohttp` 用于健康检查服务。
- 优化部署配置,调整 Redis 主机配置和镜像地址以适配新环境。
This commit is contained in:
2026-02-04 11:58:56 +08:00
parent e0138d5531
commit 55419443cd
6 changed files with 107 additions and 14 deletions

View File

@@ -45,6 +45,9 @@ services:
build:
context: ..
dockerfile: deployment/Dockerfile
platform: linux/amd64
ports:
- "8112:8000"
environment:
- APP_ENV=development
- LOG_LEVEL=INFO
@@ -69,6 +72,12 @@ services:
depends_on:
redis:
condition: service_healthy
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/healthz')"]
interval: 30s
timeout: 3s
retries: 3
start_period: 10s
deploy:
replicas: 2

View File

@@ -127,16 +127,25 @@ spec:
limits:
memory: "512Mi"
cpu: "500m"
# Worker 有 HTTP 端口,使用命令探针
# Worker 现在有 HTTP 健康检查端点
ports:
- containerPort: 8000
name: http
livenessProbe:
exec:
command:
- python
- -c
- "import redis; r = redis.Redis(host='functional-scaffold-redis'); r.ping()"
httpGet:
path: /healthz
port: 8000
initialDelaySeconds: 10
periodSeconds: 30
timeoutSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /readyz
port: 8000
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 3
---

View File

@@ -5,11 +5,11 @@ name: functional-scaffold
access: default
vars:
region: cn-hangzhou
image: registry.cn-hangzhou.aliyuncs.com/your-namespace/functional-scaffold:latest
redis_host: r-xxxxx.redis.rds.aliyuncs.com
region: cn-beijing
image: crpi-om2xd9y8cmaizszf-vpc.cn-beijing.personal.cr.aliyuncs.com/test-namespace-gu/fc-test:test-v1
redis_host: 172.17.133.51
redis_port: "6379"
redis_password: "your-password"
redis_password: "roog-pass-redis"
resources:
# API 服务函数
@@ -78,6 +78,13 @@ resources:
port: 8000
command:
- /app/entrypoint.sh
healthCheckConfig:
httpGetUrl: /healthz
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 3
successThreshold: 1
environmentVariables:
APP_ENV: production
LOG_LEVEL: INFO

View File

@@ -25,6 +25,8 @@ dependencies = [
"pyyaml>=6.0.0",
# HTTP 客户端(Webhook 回调)
"httpx>=0.27.0",
# 轻量级 HTTP 服务器(Worker 健康检查)
"aiohttp>=3.9.0",
]
[project.optional-dependencies]

View File

@@ -5,6 +5,7 @@ pydantic>=2.5.0
pydantic-settings>=2.0.0
prometheus-client>=0.19.0
python-json-logger>=2.0.7
aiohttp>=3.9.0
# Redis - 任务队列和指标存储
redis>=5.0.0

View File

@@ -9,6 +9,8 @@ import signal
import sys
from typing import Optional
from aiohttp import web
from .config import settings
from .core.job_manager import JobManager
from .core.logging import setup_logging
@@ -17,6 +19,53 @@ from .core.tracing import set_request_id
logger = logging.getLogger(__name__)
class HealthCheckServer:
    """Lightweight HTTP server exposing health-check endpoints.

    Provides ``GET /healthz`` (liveness) and ``GET /readyz`` (readiness) for
    Worker mode so that container health checks (e.g. FC 3.0) have an HTTP
    endpoint to probe.
    """

    def __init__(self, host: str = "0.0.0.0", port: int = 8000):
        """Store the bind address; nothing is opened until ``start()``.

        Args:
            host: Interface to bind the listening socket to.
            port: TCP port to listen on.
        """
        self._host = host
        self._port = port
        self._app: Optional[web.Application] = None
        self._runner: Optional[web.AppRunner] = None
        self._site: Optional[web.TCPSite] = None
        # Drives the /readyz answer only; /healthz does not consult it.
        self._healthy = True

    async def start(self) -> None:
        """Build the aiohttp application and start listening."""
        self._app = web.Application()
        self._app.router.add_get("/healthz", self._healthz_handler)
        self._app.router.add_get("/readyz", self._readyz_handler)

        self._runner = web.AppRunner(self._app)
        await self._runner.setup()

        self._site = web.TCPSite(self._runner, self._host, self._port)
        await self._site.start()

        logger.info(f"健康检查服务器已启动: http://{self._host}:{self._port}")

    async def stop(self) -> None:
        """Stop the server and release its resources.

        Safe to call more than once, including concurrently: the runner is
        detached from the instance *before* the first ``await``, so a second
        caller (e.g. a signal-triggered shutdown racing with a ``finally``
        block) sees no runner and returns without cleaning up again.
        """
        runner, self._runner = self._runner, None
        # Drop references that would be stale once the runner is cleaned up.
        self._site = None
        self._app = None
        if runner is None:
            return

        await runner.cleanup()
        logger.info("健康检查服务器已停止")

    def set_healthy(self, healthy: bool) -> None:
        """Set the flag reported by the readiness endpoint.

        Args:
            healthy: ``True`` makes ``/readyz`` report ready; ``False`` makes
                it answer with HTTP 503.
        """
        self._healthy = healthy

    async def _healthz_handler(self, request: web.Request) -> web.Response:
        """Liveness endpoint: always reports healthy while the server runs."""
        return web.json_response({"status": "healthy", "mode": "worker"})

    async def _readyz_handler(self, request: web.Request) -> web.Response:
        """Readiness endpoint: ready when the health flag is set, else 503."""
        if self._healthy:
            return web.json_response({"status": "ready", "mode": "worker"})
        return web.json_response({"status": "not ready"}, status=503)
class JobWorker:
"""任务 Worker
@@ -272,12 +321,21 @@ class JobWorker:
logger.error(f"超时任务回收异常: {e}")
def setup_signal_handlers(
    worker: JobWorker,
    health_server: HealthCheckServer,
    loop: asyncio.AbstractEventLoop,
) -> None:
    """Register SIGTERM/SIGINT handlers that shut everything down.

    On either signal, a task is scheduled on ``loop`` that shuts down the
    worker and then stops the health-check server.

    Args:
        worker: Worker whose ``shutdown()`` coroutine is awaited first.
        health_server: Health-check server stopped after the worker.
        loop: Event loop on which the handlers and the shutdown task run.
    """
    # Strong reference to the in-flight shutdown task. The event loop keeps
    # only weak references to tasks, so an unreferenced task could be
    # garbage-collected before it finishes.
    shutdown_task: Optional[asyncio.Task] = None

    async def shutdown_all() -> None:
        """Shut down the worker, then stop the health-check server."""
        try:
            await worker.shutdown()
        finally:
            # Stop the HTTP server even if the worker shutdown raised.
            await health_server.stop()

    def signal_handler(sig: signal.Signals) -> None:
        nonlocal shutdown_task
        logger.info(f"收到信号 {sig.name},准备关闭...")
        # A repeated signal while shutdown is already running must not start
        # a second, overlapping shutdown.
        if shutdown_task is not None and not shutdown_task.done():
            return
        shutdown_task = loop.create_task(shutdown_all())

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler, sig)
@@ -288,13 +346,19 @@ async def main() -> None:
# 设置日志
setup_logging(level=settings.log_level, format_type=settings.log_format)
# 创建健康检查服务器和 Worker
health_server = HealthCheckServer(port=8000)
worker = JobWorker()
# 设置信号处理
loop = asyncio.get_running_loop()
setup_signal_handlers(worker, loop)
setup_signal_handlers(worker, health_server, loop)
try:
# 先启动健康检查服务器,确保 FC 健康检查能通过
await health_server.start()
# 初始化并运行 Worker
await worker.initialize()
await worker.run()
except Exception as e:
@@ -302,6 +366,7 @@ async def main() -> None:
sys.exit(1)
finally:
await worker.shutdown()
await health_server.stop()
if __name__ == "__main__":