main:新增健康检查支持和服务优化
- 在 Worker 中引入轻量级 HTTP 服务器,支持健康检查和就绪检查端点。
- 在 Kubernetes 和 Docker 配置中新增健康检查探针,提升服务稳定性。
- 更新依赖,引入 `aiohttp` 用于健康检查服务。
- 优化部署配置,调整 Redis 主机配置和镜像地址以适配新环境。
This commit is contained in:
@@ -45,6 +45,9 @@ services:
|
|||||||
build:
|
build:
|
||||||
context: ..
|
context: ..
|
||||||
dockerfile: deployment/Dockerfile
|
dockerfile: deployment/Dockerfile
|
||||||
|
platform: linux/amd64
|
||||||
|
ports:
|
||||||
|
- "8112:8000"
|
||||||
environment:
|
environment:
|
||||||
- APP_ENV=development
|
- APP_ENV=development
|
||||||
- LOG_LEVEL=INFO
|
- LOG_LEVEL=INFO
|
||||||
@@ -69,6 +72,12 @@ services:
|
|||||||
depends_on:
|
depends_on:
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/healthz')"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 3s
|
||||||
|
retries: 3
|
||||||
|
start_period: 10s
|
||||||
deploy:
|
deploy:
|
||||||
replicas: 2
|
replicas: 2
|
||||||
|
|
||||||
|
|||||||
@@ -127,16 +127,25 @@ spec:
|
|||||||
limits:
|
limits:
|
||||||
memory: "512Mi"
|
memory: "512Mi"
|
||||||
cpu: "500m"
|
cpu: "500m"
|
||||||
# Worker 没有 HTTP 端口,使用命令探针
|
# Worker 现在有 HTTP 健康检查端点
|
||||||
|
ports:
|
||||||
|
- containerPort: 8000
|
||||||
|
name: http
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
exec:
|
httpGet:
|
||||||
command:
|
path: /healthz
|
||||||
- python
|
port: 8000
|
||||||
- -c
|
|
||||||
- "import redis; r = redis.Redis(host='functional-scaffold-redis'); r.ping()"
|
|
||||||
initialDelaySeconds: 10
|
initialDelaySeconds: 10
|
||||||
periodSeconds: 30
|
periodSeconds: 30
|
||||||
timeoutSeconds: 5
|
timeoutSeconds: 3
|
||||||
|
failureThreshold: 3
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /readyz
|
||||||
|
port: 8000
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 10
|
||||||
|
timeoutSeconds: 3
|
||||||
failureThreshold: 3
|
failureThreshold: 3
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|||||||
@@ -5,11 +5,11 @@ name: functional-scaffold
|
|||||||
access: default
|
access: default
|
||||||
|
|
||||||
vars:
|
vars:
|
||||||
region: cn-hangzhou
|
region: cn-beijing
|
||||||
image: registry.cn-hangzhou.aliyuncs.com/your-namespace/functional-scaffold:latest
|
image: crpi-om2xd9y8cmaizszf-vpc.cn-beijing.personal.cr.aliyuncs.com/test-namespace-gu/fc-test:test-v1
|
||||||
redis_host: r-xxxxx.redis.rds.aliyuncs.com
|
redis_host: 172.17.133.51
|
||||||
redis_port: "6379"
|
redis_port: "6379"
|
||||||
redis_password: "your-password"
|
redis_password: "roog-pass-redis"
|
||||||
|
|
||||||
resources:
|
resources:
|
||||||
# API 服务函数
|
# API 服务函数
|
||||||
@@ -78,6 +78,13 @@ resources:
|
|||||||
port: 8000
|
port: 8000
|
||||||
command:
|
command:
|
||||||
- /app/entrypoint.sh
|
- /app/entrypoint.sh
|
||||||
|
healthCheckConfig:
|
||||||
|
httpGetUrl: /healthz
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 10
|
||||||
|
timeoutSeconds: 3
|
||||||
|
failureThreshold: 3
|
||||||
|
successThreshold: 1
|
||||||
environmentVariables:
|
environmentVariables:
|
||||||
APP_ENV: production
|
APP_ENV: production
|
||||||
LOG_LEVEL: INFO
|
LOG_LEVEL: INFO
|
||||||
|
|||||||
@@ -25,6 +25,8 @@ dependencies = [
|
|||||||
"pyyaml>=6.0.0",
|
"pyyaml>=6.0.0",
|
||||||
# HTTP 客户端(Webhook 回调)
|
# HTTP 客户端(Webhook 回调)
|
||||||
"httpx>=0.27.0",
|
"httpx>=0.27.0",
|
||||||
|
# 轻量级 HTTP 服务器(Worker 健康检查)
|
||||||
|
"aiohttp>=3.9.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ pydantic>=2.5.0
|
|||||||
pydantic-settings>=2.0.0
|
pydantic-settings>=2.0.0
|
||||||
prometheus-client>=0.19.0
|
prometheus-client>=0.19.0
|
||||||
python-json-logger>=2.0.7
|
python-json-logger>=2.0.7
|
||||||
|
aiohttp>=3.9.0
|
||||||
|
|
||||||
# Redis - 任务队列和指标存储
|
# Redis - 任务队列和指标存储
|
||||||
redis>=5.0.0
|
redis>=5.0.0
|
||||||
|
|||||||
@@ -9,6 +9,8 @@ import signal
|
|||||||
import sys
|
import sys
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
from aiohttp import web
|
||||||
|
|
||||||
from .config import settings
|
from .config import settings
|
||||||
from .core.job_manager import JobManager
|
from .core.job_manager import JobManager
|
||||||
from .core.logging import setup_logging
|
from .core.logging import setup_logging
|
||||||
@@ -17,6 +19,53 @@ from .core.tracing import set_request_id
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class HealthCheckServer:
    """Lightweight HTTP server exposing health-check endpoints.

    Serves ``GET /healthz`` (liveness) and ``GET /readyz`` (readiness) so the
    worker process can be probed over HTTP, e.g. to satisfy FC 3.0 container
    health-check requirements.
    """

    def __init__(self, host: str = "0.0.0.0", port: int = 8000):
        """Store the bind address; no socket is opened until ``start()``.

        Args:
            host: Interface to bind to.
            port: TCP port to listen on.
        """
        self._host = host
        self._port = port
        self._app: Optional[web.Application] = None
        self._runner: Optional[web.AppRunner] = None
        self._site: Optional[web.TCPSite] = None
        # Readiness flag reported by /readyz; toggled through set_healthy().
        self._healthy = True

    async def start(self) -> None:
        """Build the aiohttp application and start listening."""
        self._app = web.Application()
        self._app.router.add_get("/healthz", self._healthz_handler)
        self._app.router.add_get("/readyz", self._readyz_handler)

        self._runner = web.AppRunner(self._app)
        await self._runner.setup()
        self._site = web.TCPSite(self._runner, self._host, self._port)
        await self._site.start()
        logger.info(f"健康检查服务器已启动: http://{self._host}:{self._port}")

    async def stop(self) -> None:
        """Stop the server and release its resources.

        Safe to call more than once: after a successful stop the runner
        reference is cleared, so later calls return immediately instead of
        running cleanup (and logging the stop message) again.
        """
        runner = self._runner
        if runner is None:
            return

        await runner.cleanup()

        # Another stop() call may have finished while this one was awaiting
        # cleanup; only the call that still owns the runner resets state and
        # logs, so the message is emitted once per started server.
        if self._runner is runner:
            self._runner = None
            self._site = None
            self._app = None
            logger.info("健康检查服务器已停止")

    def set_healthy(self, healthy: bool) -> None:
        """Set the flag that ``/readyz`` reports.

        Args:
            healthy: ``True`` to report ready, ``False`` to answer 503.
        """
        self._healthy = healthy

    async def _healthz_handler(self, request: web.Request) -> web.Response:
        """Liveness endpoint: always answers 200 while the server is running."""
        return web.json_response({"status": "healthy", "mode": "worker"})

    async def _readyz_handler(self, request: web.Request) -> web.Response:
        """Readiness endpoint: 200 when the health flag is set, otherwise 503."""
        if self._healthy:
            return web.json_response({"status": "ready", "mode": "worker"})
        return web.json_response({"status": "not ready"}, status=503)
|
||||||
|
|
||||||
|
|
||||||
class JobWorker:
|
class JobWorker:
|
||||||
"""任务 Worker
|
"""任务 Worker
|
||||||
|
|
||||||
@@ -272,12 +321,21 @@ class JobWorker:
|
|||||||
logger.error(f"超时任务回收异常: {e}")
|
logger.error(f"超时任务回收异常: {e}")
|
||||||
|
|
||||||
|
|
||||||
def setup_signal_handlers(worker: JobWorker, loop: asyncio.AbstractEventLoop) -> None:
|
def setup_signal_handlers(
|
||||||
|
worker: JobWorker,
|
||||||
|
health_server: HealthCheckServer,
|
||||||
|
loop: asyncio.AbstractEventLoop,
|
||||||
|
) -> None:
|
||||||
"""设置信号处理器"""
|
"""设置信号处理器"""
|
||||||
|
|
||||||
|
async def shutdown_all() -> None:
|
||||||
|
"""关闭所有服务"""
|
||||||
|
await worker.shutdown()
|
||||||
|
await health_server.stop()
|
||||||
|
|
||||||
def signal_handler(sig: signal.Signals) -> None:
|
def signal_handler(sig: signal.Signals) -> None:
|
||||||
logger.info(f"收到信号 {sig.name},准备关闭...")
|
logger.info(f"收到信号 {sig.name},准备关闭...")
|
||||||
loop.create_task(worker.shutdown())
|
loop.create_task(shutdown_all())
|
||||||
|
|
||||||
for sig in (signal.SIGTERM, signal.SIGINT):
|
for sig in (signal.SIGTERM, signal.SIGINT):
|
||||||
loop.add_signal_handler(sig, signal_handler, sig)
|
loop.add_signal_handler(sig, signal_handler, sig)
|
||||||
@@ -288,13 +346,19 @@ async def main() -> None:
|
|||||||
# 设置日志
|
# 设置日志
|
||||||
setup_logging(level=settings.log_level, format_type=settings.log_format)
|
setup_logging(level=settings.log_level, format_type=settings.log_format)
|
||||||
|
|
||||||
|
# 创建健康检查服务器和 Worker
|
||||||
|
health_server = HealthCheckServer(port=8000)
|
||||||
worker = JobWorker()
|
worker = JobWorker()
|
||||||
|
|
||||||
# 设置信号处理
|
# 设置信号处理
|
||||||
loop = asyncio.get_running_loop()
|
loop = asyncio.get_running_loop()
|
||||||
setup_signal_handlers(worker, loop)
|
setup_signal_handlers(worker, health_server, loop)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# 先启动健康检查服务器,确保 FC 健康检查能通过
|
||||||
|
await health_server.start()
|
||||||
|
|
||||||
|
# 初始化并运行 Worker
|
||||||
await worker.initialize()
|
await worker.initialize()
|
||||||
await worker.run()
|
await worker.run()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -302,6 +366,7 @@ async def main() -> None:
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
finally:
|
finally:
|
||||||
await worker.shutdown()
|
await worker.shutdown()
|
||||||
|
await health_server.stop()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user