Compare commits
16 Commits
8ca2f64f7e
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| b47be9dda4 | |||
| 55419443cd | |||
| e0138d5531 | |||
| c92cac6ebb | |||
| c76ece8f48 | |||
| d211074576 | |||
| a4d2ad1e93 | |||
| b5ca0e0593 | |||
| 7b627090f3 | |||
| 73bd66813c | |||
| 6341cdf8ea | |||
| bf933b20f1 | |||
| c3e16dcad3 | |||
| c0bd4760b1 | |||
| f2a164b82c | |||
| bad3a34a82 |
73
.env.example
73
.env.example
@@ -1,32 +1,87 @@
|
||||
# Environment Configuration
|
||||
# Copy this file to .env and fill in your values
|
||||
|
||||
# Application
|
||||
# =============================================================================
|
||||
# 应用信息
|
||||
# =============================================================================
|
||||
APP_NAME=FunctionalScaffold
|
||||
APP_VERSION=1.0.0
|
||||
APP_ENV=development
|
||||
|
||||
# Server
|
||||
# =============================================================================
|
||||
# 服务器配置
|
||||
# =============================================================================
|
||||
HOST=0.0.0.0
|
||||
PORT=8000
|
||||
WORKERS=4
|
||||
|
||||
# Logging
|
||||
# =============================================================================
|
||||
# 日志配置
|
||||
# =============================================================================
|
||||
LOG_LEVEL=INFO
|
||||
LOG_FORMAT=json
|
||||
# 日志文件配置(可选,默认禁用)
|
||||
LOG_FILE_ENABLED=false
|
||||
LOG_FILE_PATH=/var/log/app/app.log
|
||||
|
||||
# Metrics
|
||||
# =============================================================================
|
||||
# 指标配置
|
||||
# =============================================================================
|
||||
METRICS_ENABLED=true
|
||||
METRICS_CONFIG_PATH=config/metrics.yaml
|
||||
# 指标实例 ID(可选,默认使用 hostname)
|
||||
# METRICS_INSTANCE_ID=my-instance
|
||||
|
||||
# Tracing
|
||||
# =============================================================================
|
||||
# 追踪配置
|
||||
# =============================================================================
|
||||
TRACING_ENABLED=false
|
||||
JAEGER_ENDPOINT=http://localhost:14268/api/traces
|
||||
# JAEGER_ENDPOINT=http://localhost:14268/api/traces
|
||||
|
||||
# External Services (examples)
|
||||
# =============================================================================
|
||||
# Redis 配置
|
||||
# =============================================================================
|
||||
REDIS_HOST=localhost
|
||||
REDIS_PORT=6379
|
||||
REDIS_DB=0
|
||||
REDIS_PASSWORD=your_redis_password
|
||||
|
||||
# =============================================================================
|
||||
# 异步任务配置
|
||||
# =============================================================================
|
||||
# 任务结果缓存时间(秒),默认 30 分钟
|
||||
JOB_RESULT_TTL=1800
|
||||
# Webhook 最大重试次数
|
||||
WEBHOOK_MAX_RETRIES=3
|
||||
# Webhook 超时时间(秒)
|
||||
WEBHOOK_TIMEOUT=10
|
||||
# 最大并发任务数
|
||||
MAX_CONCURRENT_JOBS=10
|
||||
|
||||
# =============================================================================
|
||||
# Worker 配置
|
||||
# =============================================================================
|
||||
# Worker 轮询间隔(秒)
|
||||
WORKER_POLL_INTERVAL=1.0
|
||||
# 任务队列 Redis Key
|
||||
JOB_QUEUE_KEY=job:queue
|
||||
# 全局并发计数器 Redis Key
|
||||
JOB_CONCURRENCY_KEY=job:concurrency
|
||||
# 任务锁 TTL(秒)
|
||||
JOB_LOCK_TTL=300
|
||||
# 任务最大重试次数
|
||||
JOB_MAX_RETRIES=3
|
||||
# 任务执行超时(秒)
|
||||
JOB_EXECUTION_TIMEOUT=300
|
||||
|
||||
# =============================================================================
|
||||
# 外部服务配置(示例)
|
||||
# =============================================================================
|
||||
# OSS 配置
|
||||
# OSS_ENDPOINT=https://oss-cn-hangzhou.aliyuncs.com
|
||||
# OSS_ACCESS_KEY_ID=your_access_key
|
||||
# OSS_ACCESS_KEY_SECRET=your_secret_key
|
||||
# OSS_BUCKET_NAME=your_bucket
|
||||
|
||||
# Database (if needed)
|
||||
# DATABASE_URL=mysql://user:password@localhost:5432/dbname
|
||||
# 数据库配置
|
||||
# DATABASE_URL=mysql://user:password@localhost:3306/dbname
|
||||
|
||||
17
CLAUDE.md
17
CLAUDE.md
@@ -372,10 +372,23 @@ kubectl apply -f deployment/kubernetes/service.yaml
|
||||
- 资源限制:256Mi-512Mi 内存,250m-500m CPU
|
||||
- 健康检查:存活探针 (/healthz),就绪探针 (/readyz)
|
||||
|
||||
### 阿里云函数计算
|
||||
### 阿里云函数计算(FC 3.0)
|
||||
|
||||
```bash
|
||||
fun deploy -t deployment/serverless/aliyun-fc.yaml
|
||||
# 安装 Serverless Devs(如未安装)
|
||||
npm install -g @serverless-devs/s
|
||||
|
||||
# 配置阿里云凭证(首次使用)
|
||||
s config add
|
||||
|
||||
# 部署到阿里云函数计算
|
||||
cd deployment/serverless && s deploy
|
||||
|
||||
# 验证配置语法
|
||||
cd deployment/serverless && s plan
|
||||
|
||||
# 查看函数日志
|
||||
cd deployment/serverless && s logs --tail
|
||||
```
|
||||
|
||||
### AWS Lambda
|
||||
|
||||
22
LICENSE
Normal file
22
LICENSE
Normal file
@@ -0,0 +1,22 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2026 Roog
|
||||
Copyright (c) 2026 Guxinpei
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
24
README.md
24
README.md
@@ -18,14 +18,16 @@
|
||||
|
||||
## 文档
|
||||
|
||||
| 文档 | 描述 |
|
||||
|-----------------------------------------|--------------|
|
||||
| [快速入门](docs/getting-started.md) | 10 分钟上手指南 |
|
||||
| [算法开发指南](docs/algorithm-development.md) | 详细的算法开发教程 |
|
||||
| [API 参考](docs/api-reference.md) | 完整的 API 文档 |
|
||||
| [监控指南](docs/monitoring.md) | 监控和告警配置 |
|
||||
| [API 规范](docs/api/README.md) | OpenAPI 规范说明 |
|
||||
| [日志集成(Loki)](docs/loki-quick-reference.md) | 日志收集部署说明 |
|
||||
| 文档 | 描述 |
|
||||
|------------------------------------------------|--------------|
|
||||
| [快速入门](docs/getting-started.md) | 10 分钟上手指南 |
|
||||
| [算法开发指南](docs/algorithm-development.md) | 详细的算法开发教程 |
|
||||
| [API 参考](docs/api-reference.md) | 完整的 API 文档 |
|
||||
| [监控指南](docs/monitoring.md) | 监控和告警配置 |
|
||||
| [API 规范](docs/api/README.md) | OpenAPI 规范说明 |
|
||||
| [Kubernetes 部署](docs/kubernetes-deployment.md) | K8s 集群部署指南 |
|
||||
| [日志集成(Loki)](docs/loki-quick-reference.md) | 日志收集部署说明 |
|
||||
| [阿里云函数计算 FC 部署入门](docs/fc-deploy.md) | 阿里云 FC 部署入门 |
|
||||
|
||||
## 快速开始
|
||||
|
||||
@@ -80,6 +82,12 @@ docker run -p 8000:8000 functional-scaffold:latest
|
||||
# 或使用 docker-compose
|
||||
cd deployment
|
||||
docker-compose up
|
||||
|
||||
# 如果阿里云 FC 无法识别镜像(显示 Platform: unknown/unknown),请按以下步骤打包并推送:
|
||||
export DOCKER_DEFAULT_PLATFORM=linux/amd64
|
||||
export BUILDX_NO_DEFAULT_ATTESTATIONS=1
|
||||
docker compose build
|
||||
docker compose push
|
||||
```
|
||||
|
||||
## API 端点
|
||||
|
||||
@@ -94,6 +94,26 @@ custom_metrics:
|
||||
type: counter
|
||||
description: "Webhook 回调发送总数"
|
||||
labels: [status]
|
||||
|
||||
# 队列监控指标
|
||||
job_queue_length:
|
||||
name: "job_queue_length"
|
||||
type: gauge
|
||||
description: "待处理任务队列长度"
|
||||
labels: [queue]
|
||||
|
||||
job_oldest_waiting_seconds:
|
||||
name: "job_oldest_waiting_seconds"
|
||||
type: gauge
|
||||
description: "最长任务等待时间(秒)"
|
||||
labels: []
|
||||
|
||||
job_recovered_total:
|
||||
name: "job_recovered_total"
|
||||
type: counter
|
||||
description: "回收的超时任务总数"
|
||||
labels: []
|
||||
|
||||
prime_check_total:
|
||||
name: "prime_check"
|
||||
type: counter
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM python:3.11-slim
|
||||
FROM --platform=linux/amd64 python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -9,11 +9,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
|
||||
# 复制依赖文件
|
||||
COPY requirements.txt .
|
||||
COPY requirements-dev.txt .
|
||||
|
||||
# 安装 Python 依赖
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
RUN pip install --no-cache-dir -r requirements-dev.txt
|
||||
|
||||
# 安装dev依赖
|
||||
#COPY requirements-dev.txt .
|
||||
#RUN pip install --no-cache-dir -r requirements-dev.txt
|
||||
|
||||
# 复制应用代码和配置
|
||||
COPY src/ ./src/
|
||||
@@ -30,9 +32,15 @@ USER appuser
|
||||
# 暴露端口
|
||||
EXPOSE 8000
|
||||
|
||||
# 健康检查
|
||||
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
||||
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/healthz')"
|
||||
# 运行模式:api(默认)或 worker
|
||||
ENV RUN_MODE=api
|
||||
|
||||
# 启动命令
|
||||
CMD ["uvicorn", "functional_scaffold.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
# 健康检查(仅对 API 模式有效)
|
||||
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
||||
CMD if [ "$RUN_MODE" = "api" ]; then python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/healthz')"; else exit 0; fi
|
||||
|
||||
# 启动脚本
|
||||
COPY --chown=appuser:appuser deployment/entrypoint.sh /app/entrypoint.sh
|
||||
RUN chmod +x /app/entrypoint.sh
|
||||
|
||||
CMD ["/app/entrypoint.sh"]
|
||||
@@ -5,12 +5,14 @@ services:
|
||||
build:
|
||||
context: ..
|
||||
dockerfile: deployment/Dockerfile
|
||||
platform: linux/amd64
|
||||
ports:
|
||||
- "8111:8000"
|
||||
environment:
|
||||
- APP_ENV=development
|
||||
- LOG_LEVEL=INFO
|
||||
- METRICS_ENABLED=true
|
||||
- RUN_MODE=api
|
||||
# Redis 指标存储配置
|
||||
- REDIS_HOST=redis
|
||||
- REDIS_PORT=6379
|
||||
@@ -38,6 +40,47 @@ services:
|
||||
retries: 3
|
||||
start_period: 5s
|
||||
|
||||
# Worker 服务 - 处理异步任务
|
||||
worker:
|
||||
build:
|
||||
context: ..
|
||||
dockerfile: deployment/Dockerfile
|
||||
platform: linux/amd64
|
||||
ports:
|
||||
- "8112:8000"
|
||||
environment:
|
||||
- APP_ENV=development
|
||||
- LOG_LEVEL=INFO
|
||||
- METRICS_ENABLED=true
|
||||
- RUN_MODE=worker
|
||||
# Redis 配置
|
||||
- REDIS_HOST=redis
|
||||
- REDIS_PORT=6379
|
||||
- REDIS_DB=0
|
||||
# Worker 配置
|
||||
- WORKER_POLL_INTERVAL=1.0
|
||||
- MAX_CONCURRENT_JOBS=10
|
||||
- JOB_MAX_RETRIES=3
|
||||
- JOB_EXECUTION_TIMEOUT=300
|
||||
volumes:
|
||||
- ../src:/app/src
|
||||
- ../config:/app/config
|
||||
labels:
|
||||
logging: "promtail"
|
||||
logging_jobname: "functional-scaffold-worker"
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/healthz')"]
|
||||
interval: 30s
|
||||
timeout: 3s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
deploy:
|
||||
replicas: 2
|
||||
|
||||
# Redis - 用于集中式指标存储
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
|
||||
12
deployment/entrypoint.sh
Normal file
12
deployment/entrypoint.sh
Normal file
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
|
||||
# 启动脚本:根据 RUN_MODE 环境变量选择启动 API 或 Worker
|
||||
|
||||
set -e
|
||||
|
||||
if [ "$RUN_MODE" = "worker" ]; then
|
||||
echo "启动 Worker 模式..."
|
||||
exec python -m functional_scaffold.worker
|
||||
else
|
||||
echo "启动 API 模式..."
|
||||
exec uvicorn functional_scaffold.main:app --host 0.0.0.0 --port 8000
|
||||
fi
|
||||
@@ -1,33 +1,70 @@
|
||||
# Kubernetes 部署配置
|
||||
# 包含:ConfigMap、API Deployment、Worker Deployment、Redis Deployment
|
||||
|
||||
---
|
||||
# ConfigMap - 共享配置
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: functional-scaffold-config
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
data:
|
||||
APP_ENV: "production"
|
||||
LOG_LEVEL: "INFO"
|
||||
LOG_FORMAT: "json"
|
||||
METRICS_ENABLED: "true"
|
||||
# Redis 配置(指向集群内 Redis 服务)
|
||||
REDIS_HOST: "functional-scaffold-redis"
|
||||
REDIS_PORT: "6379"
|
||||
REDIS_DB: "0"
|
||||
# 异步任务配置
|
||||
MAX_CONCURRENT_JOBS: "10"
|
||||
JOB_RESULT_TTL: "1800"
|
||||
WEBHOOK_MAX_RETRIES: "3"
|
||||
WEBHOOK_TIMEOUT: "10"
|
||||
# Worker 配置
|
||||
WORKER_POLL_INTERVAL: "1.0"
|
||||
JOB_QUEUE_KEY: "job:queue"
|
||||
JOB_CONCURRENCY_KEY: "job:concurrency"
|
||||
JOB_LOCK_TTL: "300"
|
||||
JOB_MAX_RETRIES: "3"
|
||||
JOB_EXECUTION_TIMEOUT: "300"
|
||||
|
||||
---
|
||||
# API Deployment - HTTP 服务
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: functional-scaffold
|
||||
name: functional-scaffold-api
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
spec:
|
||||
replicas: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
spec:
|
||||
containers:
|
||||
- name: functional-scaffold
|
||||
- name: api
|
||||
image: functional-scaffold:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
- name: APP_ENV
|
||||
value: "production"
|
||||
- name: LOG_LEVEL
|
||||
value: "INFO"
|
||||
- name: METRICS_ENABLED
|
||||
value: "true"
|
||||
- name: RUN_MODE
|
||||
value: "api"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: functional-scaffold-config
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
@@ -51,3 +88,125 @@ spec:
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
|
||||
---
|
||||
# Worker Deployment - 异步任务处理
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: functional-scaffold-worker
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: worker
|
||||
spec:
|
||||
replicas: 2
|
||||
selector:
|
||||
matchLabels:
|
||||
app: functional-scaffold
|
||||
component: worker
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: worker
|
||||
spec:
|
||||
containers:
|
||||
- name: worker
|
||||
image: functional-scaffold:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
env:
|
||||
- name: RUN_MODE
|
||||
value: "worker"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: functional-scaffold-config
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
# Worker 现在有 HTTP 健康检查端点
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: 8000
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 30
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /readyz
|
||||
port: 8000
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
|
||||
---
|
||||
# Redis Deployment - 任务队列和状态存储
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: functional-scaffold-redis
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: redis
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: functional-scaffold
|
||||
component: redis
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: redis
|
||||
spec:
|
||||
containers:
|
||||
- name: redis
|
||||
image: redis:7-alpine
|
||||
ports:
|
||||
- containerPort: 6379
|
||||
name: redis
|
||||
command:
|
||||
- redis-server
|
||||
- --appendonly
|
||||
- "yes"
|
||||
resources:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "256Mi"
|
||||
cpu: "200m"
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- redis-cli
|
||||
- ping
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
readinessProbe:
|
||||
exec:
|
||||
command:
|
||||
- redis-cli
|
||||
- ping
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
volumeMounts:
|
||||
- name: redis-data
|
||||
mountPath: /data
|
||||
volumes:
|
||||
- name: redis-data
|
||||
emptyDir: {}
|
||||
@@ -1,9 +1,15 @@
|
||||
# Kubernetes Service 配置
|
||||
# 包含:API Service、Metrics Service、Redis Service
|
||||
|
||||
---
|
||||
# API Service - 对外暴露 HTTP 服务
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: functional-scaffold
|
||||
name: functional-scaffold-api
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
@@ -13,13 +19,21 @@ spec:
|
||||
name: http
|
||||
selector:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
|
||||
---
|
||||
# Metrics Service - Prometheus 抓取指标
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: functional-scaffold-metrics
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8000"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
@@ -29,3 +43,24 @@ spec:
|
||||
name: metrics
|
||||
selector:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
|
||||
---
|
||||
# Redis Service - 内部 Redis 服务
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: functional-scaffold-redis
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: redis
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- port: 6379
|
||||
targetPort: 6379
|
||||
protocol: TCP
|
||||
name: redis
|
||||
selector:
|
||||
app: functional-scaffold
|
||||
component: redis
|
||||
@@ -1,40 +0,0 @@
|
||||
# 阿里云函数计算配置
|
||||
ROSTemplateFormatVersion: '2015-09-01'
|
||||
Transform: 'Aliyun::Serverless-2018-04-03'
|
||||
Resources:
|
||||
functional-scaffold:
|
||||
Type: 'Aliyun::Serverless::Service'
|
||||
Properties:
|
||||
Description: '算法工程化 Serverless 脚手架'
|
||||
LogConfig:
|
||||
Project: functional-scaffold-logs
|
||||
Logstore: function-logs
|
||||
VpcConfig:
|
||||
VpcId: 'vpc-xxxxx'
|
||||
VSwitchIds:
|
||||
- 'vsw-xxxxx'
|
||||
SecurityGroupId: 'sg-xxxxx'
|
||||
prime-checker:
|
||||
Type: 'Aliyun::Serverless::Function'
|
||||
Properties:
|
||||
Description: '质数判断算法服务'
|
||||
Runtime: custom-container
|
||||
MemorySize: 512
|
||||
Timeout: 60
|
||||
InstanceConcurrency: 10
|
||||
CAPort: 8000
|
||||
CustomContainerConfig:
|
||||
Image: 'registry.cn-hangzhou.aliyuncs.com/your-namespace/functional-scaffold:latest'
|
||||
Command: '["uvicorn", "functional_scaffold.main:app", "--host", "0.0.0.0", "--port", "8000"]'
|
||||
EnvironmentVariables:
|
||||
APP_ENV: production
|
||||
LOG_LEVEL: INFO
|
||||
METRICS_ENABLED: 'true'
|
||||
Events:
|
||||
httpTrigger:
|
||||
Type: HTTP
|
||||
Properties:
|
||||
AuthType: ANONYMOUS
|
||||
Methods:
|
||||
- GET
|
||||
- POST
|
||||
108
deployment/serverless/s.yaml
Normal file
108
deployment/serverless/s.yaml
Normal file
@@ -0,0 +1,108 @@
|
||||
# 阿里云函数计算 FC 3.0 配置
|
||||
# 使用 Serverless Devs 部署: cd deployment/serverless && s deploy
|
||||
edition: 3.0.0
|
||||
name: functional-scaffold
|
||||
access: default
|
||||
|
||||
vars:
|
||||
region: cn-beijing
|
||||
image: crpi-om2xd9y8cmaizszf-vpc.cn-beijing.personal.cr.aliyuncs.com/your-namespace/fc-test:test-v1
|
||||
redis_host: 127.31.1.1
|
||||
redis_port: "6379"
|
||||
redis_password: "your-password"
|
||||
|
||||
resources:
|
||||
# API 服务函数
|
||||
prime-checker-api:
|
||||
component: fc3
|
||||
props:
|
||||
region: ${vars.region}
|
||||
functionName: prime-checker-api
|
||||
description: 质数判断算法服务(API)
|
||||
runtime: custom-container
|
||||
cpu: 0.35
|
||||
memorySize: 512
|
||||
diskSize: 512
|
||||
timeout: 60
|
||||
instanceConcurrency: 10
|
||||
handler: not-used
|
||||
customContainerConfig:
|
||||
image: ${vars.image}
|
||||
port: 8000
|
||||
command:
|
||||
- /app/entrypoint.sh
|
||||
healthCheckConfig:
|
||||
httpGetUrl: /healthz
|
||||
initialDelaySeconds: 3
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
successThreshold: 1
|
||||
environmentVariables:
|
||||
APP_ENV: production
|
||||
LOG_LEVEL: INFO
|
||||
METRICS_ENABLED: "true"
|
||||
RUN_MODE: api
|
||||
REDIS_HOST: ${vars.redis_host}
|
||||
REDIS_PORT: ${vars.redis_port}
|
||||
REDIS_PASSWORD: ${vars.redis_password}
|
||||
vpcConfig: auto
|
||||
logConfig: auto
|
||||
triggers:
|
||||
- triggerName: http-trigger
|
||||
triggerType: http
|
||||
triggerConfig:
|
||||
authType: anonymous
|
||||
methods:
|
||||
- GET
|
||||
- POST
|
||||
- PUT
|
||||
- DELETE
|
||||
|
||||
# 异步任务 Worker 函数
|
||||
job-worker:
|
||||
component: fc3
|
||||
props:
|
||||
region: ${vars.region}
|
||||
functionName: job-worker
|
||||
description: 异步任务 Worker
|
||||
runtime: custom-container
|
||||
cpu: 0.35
|
||||
memorySize: 512
|
||||
diskSize: 512
|
||||
timeout: 900
|
||||
instanceConcurrency: 1
|
||||
handler: not-used
|
||||
customContainerConfig:
|
||||
image: ${vars.image}
|
||||
port: 8000
|
||||
command:
|
||||
- /app/entrypoint.sh
|
||||
healthCheckConfig:
|
||||
httpGetUrl: /healthz
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
successThreshold: 1
|
||||
environmentVariables:
|
||||
APP_ENV: production
|
||||
LOG_LEVEL: INFO
|
||||
METRICS_ENABLED: "true"
|
||||
RUN_MODE: worker
|
||||
REDIS_HOST: ${vars.redis_host}
|
||||
REDIS_PORT: ${vars.redis_port}
|
||||
REDIS_PASSWORD: ${vars.redis_password}
|
||||
WORKER_POLL_INTERVAL: "1.0"
|
||||
MAX_CONCURRENT_JOBS: "5"
|
||||
JOB_MAX_RETRIES: "3"
|
||||
JOB_EXECUTION_TIMEOUT: "300"
|
||||
vpcConfig: auto
|
||||
logConfig: auto
|
||||
triggers:
|
||||
- triggerName: timer-trigger
|
||||
triggerType: timer
|
||||
triggerConfig:
|
||||
cronExpression: "0 */1 * * * *"
|
||||
enable: true
|
||||
payload: "{}"
|
||||
58
docs/fc-deploy.md
Normal file
58
docs/fc-deploy.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# 阿里云函数计算 FC 部署入门
|
||||
|
||||
本指南帮助你快速上手,在 10 分钟内使用 Serverless Devs 将 FunctionalScaffold 算法服务部署到阿里云函数计算(FC)。
|
||||
|
||||
## 环境准备
|
||||
|
||||
- 安装 [Serverless Devs CLI](https://serverless-devs.com/docs/overview)
|
||||
|
||||
1. 首先安装 Node.js 环境,可在 Node.js 官网下载
|
||||
- [Node.js 下载地址](https://nodejs.org/en/download/)
|
||||
2. 安装 Serverless Devs CLI
|
||||
|
||||
```bash
|
||||
npm install @serverless-devs/s -g
|
||||
```
|
||||
|
||||
## 初始化 Serverless Devs CLI 配置
|
||||
|
||||
执行以下命令初始化 Serverless Devs CLI 配置:
|
||||
|
||||
```bash
|
||||
s config add
|
||||
```
|
||||
|
||||
根据引导进行操作,填入你的 AccessKey ID 和 AccessKey Secret。
|
||||
|
||||
## 部署算法服务
|
||||
|
||||
部署算法服务前,请确保已经完成环境准备和配置。
|
||||
|
||||
修改 `deployment/serverless/s.yaml` 文件中的 `vars` 部分:
|
||||
|
||||
```yaml
|
||||
# 阿里云函数计算 FC 3.0 配置
|
||||
# 使用 Serverless Devs 部署: cd deployment/serverless && s deploy
|
||||
edition: 3.0.0
|
||||
name: functional-scaffold
|
||||
access: default
|
||||
|
||||
vars:
|
||||
region: cn-hangzhou # 换成你的区域
|
||||
image: registry.cn-hangzhou.aliyuncs.com/your-namespace/functional-scaffold:latest # 换成你的docker 镜像
|
||||
redis_host: r-xxxxx.redis.rds.aliyuncs.com # 换成你的redis连接
|
||||
redis_port: "6379" # redis 端口号
|
||||
redis_password: "your-password" #redis 密码,如果没有可留空
|
||||
```
|
||||
|
||||
```bash
|
||||
cd deployment/serverless && s deploy
|
||||
```
|
||||
|
||||
部署完成后,可以在控制台查看服务的运行状态和日志。
|
||||
|
||||
## 删除算法服务
|
||||
|
||||
```bash
|
||||
cd deployment/serverless && s remove
|
||||
```
|
||||
307
docs/kubernetes-deployment.md
Normal file
307
docs/kubernetes-deployment.md
Normal file
@@ -0,0 +1,307 @@
|
||||
# Kubernetes 部署指南
|
||||
|
||||
本文档介绍如何在 Kubernetes 集群中部署 FunctionalScaffold 服务。
|
||||
|
||||
## 架构概览
|
||||
|
||||
```
|
||||
┌─────────────────┐
|
||||
│ Ingress/LB │
|
||||
└────────┬────────┘
|
||||
│
|
||||
┌────────▼────────┐
|
||||
│ API Service │
|
||||
│ (ClusterIP) │
|
||||
└────────┬────────┘
|
||||
│
|
||||
┌──────────────┼──────────────┐
|
||||
│ │ │
|
||||
┌──────▼──────┐ ┌─────▼─────┐ ┌─────▼─────┐
|
||||
│ API Pod 1 │ │ API Pod 2 │ │ API Pod 3 │
|
||||
└─────────────┘ └───────────┘ └───────────┘
|
||||
│
|
||||
┌────────▼────────┐
|
||||
│ Redis Service │
|
||||
└────────┬────────┘
|
||||
│
|
||||
┌──────────────┼──────────────┐
|
||||
│ │ │
|
||||
┌──────▼──────┐ ┌─────▼─────┐ │
|
||||
│ Worker Pod 1│ │Worker Pod2│ │
|
||||
└─────────────┘ └───────────┘ │
|
||||
┌──────▼──────┐
|
||||
│ Redis Pod │
|
||||
└─────────────┘
|
||||
```
|
||||
|
||||
## 组件说明
|
||||
|
||||
| 组件 | 副本数 | 说明 |
|
||||
|------|--------|------|
|
||||
| **API Deployment** | 3 | HTTP 服务,处理同步请求和任务创建 |
|
||||
| **Worker Deployment** | 2 | 异步任务处理,从 Redis 队列消费任务 |
|
||||
| **Redis Deployment** | 1 | 任务队列和状态存储 |
|
||||
| **ConfigMap** | - | 共享配置管理 |
|
||||
|
||||
## 快速部署
|
||||
|
||||
```bash
|
||||
# 部署所有资源
|
||||
kubectl apply -f deployment/kubernetes/deployment.yaml
|
||||
kubectl apply -f deployment/kubernetes/service.yaml
|
||||
|
||||
# 查看部署状态
|
||||
kubectl get pods -l app=functional-scaffold
|
||||
kubectl get svc -l app=functional-scaffold
|
||||
```
|
||||
|
||||
## 配置文件说明
|
||||
|
||||
### deployment.yaml
|
||||
|
||||
包含以下资源:
|
||||
|
||||
#### ConfigMap
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: functional-scaffold-config
|
||||
data:
|
||||
APP_ENV: "production"
|
||||
LOG_LEVEL: "INFO"
|
||||
REDIS_HOST: "functional-scaffold-redis"
|
||||
# ... 更多配置
|
||||
```
|
||||
|
||||
主要配置项:
|
||||
|
||||
| 配置项 | 默认值 | 说明 |
|
||||
|--------|--------|------|
|
||||
| `APP_ENV` | production | 运行环境 |
|
||||
| `LOG_LEVEL` | INFO | 日志级别 |
|
||||
| `REDIS_HOST` | functional-scaffold-redis | Redis 服务地址 |
|
||||
| `MAX_CONCURRENT_JOBS` | 10 | 最大并发任务数 |
|
||||
| `JOB_EXECUTION_TIMEOUT` | 300 | 任务执行超时(秒) |
|
||||
|
||||
#### API Deployment
|
||||
|
||||
- **副本数**: 3
|
||||
- **资源限制**: 256Mi-512Mi 内存,250m-500m CPU
|
||||
- **健康检查**: `/healthz`(存活)、`/readyz`(就绪)
|
||||
- **环境变量**: `RUN_MODE=api`
|
||||
|
||||
#### Worker Deployment
|
||||
|
||||
- **副本数**: 2
|
||||
- **资源限制**: 256Mi-512Mi 内存,250m-500m CPU
|
||||
- **健康检查**: HTTP 探针 `/healthz`(存活)、`/readyz`(就绪),端口 8000
|
||||
- **环境变量**: `RUN_MODE=worker`
|
||||
|
||||
#### Redis Deployment
|
||||
|
||||
- **副本数**: 1
|
||||
- **资源限制**: 128Mi-256Mi 内存,100m-200m CPU
|
||||
- **持久化**: AOF 模式(appendonly yes)
|
||||
- **存储**: emptyDir(开发环境)
|
||||
|
||||
### service.yaml
|
||||
|
||||
| Service | 类型 | 端口 | 说明 |
|
||||
|---------|------|------|------|
|
||||
| `functional-scaffold-api` | ClusterIP | 80 → 8000 | API 服务 |
|
||||
| `functional-scaffold-metrics` | ClusterIP | 8000 | Prometheus 指标 |
|
||||
| `functional-scaffold-redis` | ClusterIP | 6379 | Redis 服务 |
|
||||
|
||||
## 生产环境建议
|
||||
|
||||
### 1. 使用外部 Redis
|
||||
|
||||
生产环境建议使用托管 Redis 服务(如阿里云 Redis、AWS ElastiCache):
|
||||
|
||||
```yaml
|
||||
# 修改 ConfigMap
|
||||
data:
|
||||
REDIS_HOST: "r-xxxxx.redis.rds.aliyuncs.com"
|
||||
REDIS_PORT: "6379"
|
||||
REDIS_PASSWORD: "" # 使用 Secret 管理
|
||||
```
|
||||
|
||||
### 2. 使用 Secret 管理敏感信息
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: functional-scaffold-secrets
|
||||
type: Opaque
|
||||
stringData:
|
||||
REDIS_PASSWORD: "your-password"
|
||||
DATABASE_URL: "postgresql://..."
|
||||
```
|
||||
|
||||
在 Deployment 中引用:
|
||||
|
||||
```yaml
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: functional-scaffold-config
|
||||
- secretRef:
|
||||
name: functional-scaffold-secrets
|
||||
```
|
||||
|
||||
### 3. 配置 HPA 自动扩缩容
|
||||
|
||||
```yaml
|
||||
apiVersion: autoscaling/v2
|
||||
kind: HorizontalPodAutoscaler
|
||||
metadata:
|
||||
name: functional-scaffold-api-hpa
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: functional-scaffold-api
|
||||
minReplicas: 2
|
||||
maxReplicas: 10
|
||||
metrics:
|
||||
- type: Resource
|
||||
resource:
|
||||
name: cpu
|
||||
target:
|
||||
type: Utilization
|
||||
averageUtilization: 70
|
||||
```
|
||||
|
||||
### 4. 配置 PDB 保证可用性
|
||||
|
||||
```yaml
|
||||
apiVersion: policy/v1
|
||||
kind: PodDisruptionBudget
|
||||
metadata:
|
||||
name: functional-scaffold-api-pdb
|
||||
spec:
|
||||
minAvailable: 2
|
||||
selector:
|
||||
matchLabels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
```
|
||||
|
||||
### 5. 使用 PVC 持久化 Redis 数据
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: redis-data-pvc
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
||||
```
|
||||
|
||||
## 监控集成
|
||||
|
||||
### Prometheus 抓取配置
|
||||
|
||||
`functional-scaffold-metrics` Service 已添加 Prometheus 注解:
|
||||
|
||||
```yaml
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8000"
|
||||
prometheus.io/path: "/metrics"
|
||||
```
|
||||
|
||||
### ServiceMonitor(如使用 Prometheus Operator)
|
||||
|
||||
```yaml
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: functional-scaffold
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
endpoints:
|
||||
- port: metrics
|
||||
path: /metrics
|
||||
interval: 30s
|
||||
```
|
||||
|
||||
## 常用命令
|
||||
|
||||
```bash
|
||||
# 查看所有资源
|
||||
kubectl get all -l app=functional-scaffold
|
||||
|
||||
# 查看 Pod 日志
|
||||
kubectl logs -l app=functional-scaffold,component=api -f
|
||||
kubectl logs -l app=functional-scaffold,component=worker -f
|
||||
|
||||
# 扩缩容
|
||||
kubectl scale deployment functional-scaffold-api --replicas=5
|
||||
kubectl scale deployment functional-scaffold-worker --replicas=3
|
||||
|
||||
# 滚动更新
|
||||
kubectl set image deployment/functional-scaffold-api \
|
||||
api=functional-scaffold:v2.0.0
|
||||
|
||||
# 回滚
|
||||
kubectl rollout undo deployment/functional-scaffold-api
|
||||
|
||||
# 查看部署历史
|
||||
kubectl rollout history deployment/functional-scaffold-api
|
||||
|
||||
# 进入 Pod 调试
|
||||
kubectl exec -it <pod-name> -- /bin/sh
|
||||
|
||||
# 端口转发(本地调试)
|
||||
kubectl port-forward svc/functional-scaffold-api 8000:80
|
||||
```
|
||||
|
||||
## 故障排查
|
||||
|
||||
### Pod 启动失败
|
||||
|
||||
```bash
|
||||
# 查看 Pod 事件
|
||||
kubectl describe pod <pod-name>
|
||||
|
||||
# 查看 Pod 日志
|
||||
kubectl logs <pod-name> --previous
|
||||
```
|
||||
|
||||
### Redis 连接失败
|
||||
|
||||
```bash
|
||||
# 检查 Redis Service
|
||||
kubectl get svc functional-scaffold-redis
|
||||
|
||||
# 测试 Redis 连接
|
||||
kubectl run redis-test --rm -it --image=redis:7-alpine -- \
|
||||
redis-cli -h functional-scaffold-redis ping
|
||||
```
|
||||
|
||||
### Worker 不消费任务
|
||||
|
||||
```bash
|
||||
# 检查 Worker 日志
|
||||
kubectl logs -l component=worker -f
|
||||
|
||||
# 检查 Redis 队列
|
||||
kubectl exec -it <redis-pod> -- redis-cli LLEN job:queue
|
||||
```
|
||||
|
||||
## 相关文档
|
||||
|
||||
- [快速入门](getting-started.md)
|
||||
- [监控指南](monitoring.md)
|
||||
- [并发控制](concurrency-control.md)
|
||||
- [日志集成](loki-quick-reference.md)
|
||||
@@ -1,238 +0,0 @@
|
||||
# Loki 日志收集系统集成 - 实施总结
|
||||
|
||||
## 实施完成
|
||||
|
||||
已成功集成 Grafana Loki 日志收集系统到 FunctionalScaffold 项目。
|
||||
|
||||
## 新增文件
|
||||
|
||||
### 1. 监控配置文件
|
||||
|
||||
| 文件 | 说明 |
|
||||
|------|------|
|
||||
| `monitoring/loki.yaml` | Loki 服务配置(7天保留期,10MB/s速率限制)|
|
||||
| `monitoring/promtail.yaml` | Promtail 日志采集配置(支持 Docker stdio 和文件两种模式)|
|
||||
|
||||
### 2. Grafana Provisioning
|
||||
|
||||
| 文件 | 说明 |
|
||||
|------|------|
|
||||
| `monitoring/grafana/datasources/prometheus.yaml` | Prometheus 数据源自动配置 |
|
||||
| `monitoring/grafana/datasources/loki.yaml` | Loki 数据源自动配置 |
|
||||
| `monitoring/grafana/dashboards/provider.yaml` | Dashboard 自动加载配置 |
|
||||
| `monitoring/grafana/dashboards/logs-dashboard.json` | 日志监控仪表板 |
|
||||
| `monitoring/grafana/dashboards/dashboard.json` | 原有监控仪表板(已移动)|
|
||||
|
||||
### 3. 文档和脚本
|
||||
|
||||
| 文件 | 说明 |
|
||||
|------|------|
|
||||
| `docs/loki-integration.md` | Loki 使用完整文档(包含查询示例、故障排查等)|
|
||||
| `scripts/verify_loki.sh` | Loki 集成验证脚本 |
|
||||
|
||||
## 修改文件
|
||||
|
||||
### 1. Docker Compose 配置
|
||||
|
||||
**文件**: `deployment/docker-compose.yml`
|
||||
|
||||
**变更**:
|
||||
- 添加 `loki` 服务(端口 3100)
|
||||
- 添加 `promtail` 服务(端口 9080)
|
||||
- 更新 `app` 服务:
|
||||
- 添加日志文件配置环境变量
|
||||
- 添加 `app_logs` 卷挂载
|
||||
- 添加 Promtail 标签
|
||||
- 更新 `grafana` 服务:
|
||||
- 修改 provisioning 卷挂载结构
|
||||
- 添加对 Loki 的依赖
|
||||
- 添加 `loki_data` 和 `app_logs` 卷
|
||||
|
||||
### 2. 应用代码
|
||||
|
||||
**文件**: `src/functional_scaffold/core/logging.py`
|
||||
|
||||
**变更**:
|
||||
- 添加 `file_path` 参数支持
|
||||
- 实现 `RotatingFileHandler`(100MB,5个备份)
|
||||
- 支持同时输出到控制台和文件
|
||||
|
||||
**文件**: `src/functional_scaffold/config.py`
|
||||
|
||||
**变更**:
|
||||
- 添加 `log_file_enabled` 配置(默认 False)
|
||||
- 添加 `log_file_path` 配置(默认 `/var/log/app/app.log`)
|
||||
|
||||
**文件**: `src/functional_scaffold/main.py`
|
||||
|
||||
**变更**:
|
||||
- 更新 `setup_logging()` 调用,传入文件路径参数
|
||||
|
||||
## 架构特点
|
||||
|
||||
### 1. 双模式日志收集
|
||||
|
||||
**模式 1: Docker stdio 收集(默认)**
|
||||
- ✅ 无需修改应用代码
|
||||
- ✅ 自动收集容器标准输出
|
||||
- ✅ 性能影响极小
|
||||
- ✅ 推荐用于生产环境
|
||||
|
||||
**模式 2: 文件收集(备用)**
|
||||
- ✅ 日志持久化到文件
|
||||
- ✅ 支持日志轮转
|
||||
- ✅ 适合需要本地日志的场景
|
||||
- ⚙️ 需要设置 `LOG_FILE_ENABLED=true`
|
||||
|
||||
### 2. 自动化配置
|
||||
|
||||
- ✅ Grafana 数据源自动加载
|
||||
- ✅ Dashboard 自动加载
|
||||
- ✅ 无需手动配置
|
||||
|
||||
### 3. 结构化日志
|
||||
|
||||
- ✅ JSON 格式日志
|
||||
- ✅ 自动提取字段(level, logger, request_id 等)
|
||||
- ✅ 支持 LogQL 查询
|
||||
|
||||
## 使用方式
|
||||
|
||||
### 快速启动
|
||||
|
||||
```bash
|
||||
cd deployment
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
### 访问服务
|
||||
|
||||
- **Grafana**: http://localhost:3000 (admin/admin)
|
||||
- **Loki API**: http://localhost:3100
|
||||
- **Promtail**: http://localhost:9080
|
||||
|
||||
### 查看日志
|
||||
|
||||
**方式 1: Grafana 日志仪表板**
|
||||
1. 访问 http://localhost:3000
|
||||
2. 进入 "日志监控" 仪表板
|
||||
|
||||
**方式 2: Grafana Explore**
|
||||
1. 访问 http://localhost:3000/explore
|
||||
2. 选择 Loki 数据源
|
||||
3. 输入查询: `{job="functional-scaffold-app"}`
|
||||
|
||||
### 验证集成
|
||||
|
||||
```bash
|
||||
./scripts/verify_loki.sh
|
||||
```
|
||||
|
||||
## LogQL 查询示例
|
||||
|
||||
```logql
|
||||
# 查询所有日志
|
||||
{job="functional-scaffold-app"}
|
||||
|
||||
# 查询错误日志
|
||||
{job="functional-scaffold-app", level="ERROR"}
|
||||
|
||||
# 按 request_id 过滤
|
||||
{job="functional-scaffold-app"} | json | request_id = "abc123"
|
||||
|
||||
# 统计日志量
|
||||
sum by (level) (count_over_time({job="functional-scaffold-app"}[5m]))
|
||||
```
|
||||
|
||||
## 配置说明
|
||||
|
||||
### 日志保留期
|
||||
|
||||
默认 7 天,可在 `monitoring/loki.yaml` 中修改:
|
||||
|
||||
```yaml
|
||||
limits_config:
|
||||
retention_period: 168h # 7 天
|
||||
```
|
||||
|
||||
### 日志文件模式
|
||||
|
||||
在 `deployment/docker-compose.yml` 中启用:
|
||||
|
||||
```yaml
|
||||
environment:
|
||||
- LOG_FILE_ENABLED=true
|
||||
- LOG_FILE_PATH=/var/log/app/app.log
|
||||
```
|
||||
|
||||
### 日志级别
|
||||
|
||||
在 `deployment/docker-compose.yml` 中调整:
|
||||
|
||||
```yaml
|
||||
environment:
|
||||
- LOG_LEVEL=INFO # DEBUG, INFO, WARNING, ERROR, CRITICAL
|
||||
```
|
||||
|
||||
## 监控指标
|
||||
|
||||
Loki 集成后,可以在 Grafana 中查看:
|
||||
|
||||
- **日志流**: 实时日志流
|
||||
- **日志量趋势**: 按时间和级别统计
|
||||
- **日志级别分布**: INFO/WARNING/ERROR 分布
|
||||
- **错误日志**: 只显示 ERROR 级别
|
||||
|
||||
## 故障排查
|
||||
|
||||
### 看不到日志
|
||||
|
||||
1. 检查服务状态: `docker-compose ps`
|
||||
2. 查看 Promtail 日志: `docker-compose logs promtail`
|
||||
3. 验证容器标签: `docker inspect <container> | grep Labels`
|
||||
4. 查询 Loki API: `curl http://localhost:3100/loki/api/v1/label/job/values`
|
||||
|
||||
### Docker socket 权限问题
|
||||
|
||||
```bash
|
||||
# 注意:chmod 666 会让本机所有用户都能访问 Docker(等同于 root 权限),仅建议在本地开发环境临时使用
sudo chmod 666 /var/run/docker.sock
|
||||
```
|
||||
|
||||
### 日志量过大
|
||||
|
||||
1. 调整保留期为 3 天
|
||||
2. 降低摄入速率限制
|
||||
3. 添加日志过滤规则
|
||||
|
||||
详细故障排查请参考 `docs/loki-integration.md`。
|
||||
|
||||
## 性能影响
|
||||
|
||||
- **CPU**: < 5% 额外开销
|
||||
- **内存**: Loki ~200MB, Promtail ~50MB
|
||||
- **磁盘**: 取决于日志量,7天约 1-5GB
|
||||
- **网络**: 本地通信,影响极小
|
||||
|
||||
## 下一步
|
||||
|
||||
可选的增强功能:
|
||||
|
||||
1. **告警规则**: 配置基于日志的告警
|
||||
2. **日志导出**: 定期导出日志到对象存储
|
||||
3. **多租户**: 配置 Loki 多租户模式
|
||||
4. **长期存储**: 配置 S3/OSS 作为后端存储
|
||||
|
||||
## 参考文档
|
||||
|
||||
- 完整使用文档: `docs/loki-integration.md`
|
||||
- Loki 官方文档: https://grafana.com/docs/loki/latest/
|
||||
- LogQL 查询语言: https://grafana.com/docs/loki/latest/logql/
|
||||
|
||||
## 总结
|
||||
|
||||
✅ **完成**: Loki 日志收集系统已成功集成
|
||||
✅ **测试**: 可通过 `./scripts/verify_loki.sh` 验证
|
||||
✅ **文档**: 提供完整的使用和故障排查文档
|
||||
✅ **生产就绪**: 支持双模式收集,配置灵活
|
||||
|
||||
集成已完成,可以开始使用 Loki 进行日志收集和分析!
|
||||
@@ -1395,6 +1395,504 @@
|
||||
],
|
||||
"title": "Webhook 发送状态",
|
||||
"type": "piechart"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 53
|
||||
},
|
||||
"id": 200,
|
||||
"panels": [],
|
||||
"title": "队列监控",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "任务数",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 20,
|
||||
"gradientMode": "opacity",
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": true,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 50
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 100
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "pending"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "blue",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "processing"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "orange",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "dlq"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "red",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 54
|
||||
},
|
||||
"id": 19,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": ["mean", "last", "max"],
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"expr": "job_queue_length",
|
||||
"legendFormat": "{{queue}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "队列长度趋势",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "秒",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 20,
|
||||
"gradientMode": "opacity",
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": true,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "line"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 60
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 300
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 54
|
||||
},
|
||||
"id": 20,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": ["mean", "last", "max"],
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"expr": "job_oldest_waiting_seconds",
|
||||
"legendFormat": "最长等待时间",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "最长任务等待时间",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 10
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 50
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 62
|
||||
},
|
||||
"id": 21,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"values": false,
|
||||
"calcs": ["last"],
|
||||
"fields": ""
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"expr": "job_queue_length{queue=\"pending\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "待处理队列",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 5
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 10
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 62
|
||||
},
|
||||
"id": 22,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"values": false,
|
||||
"calcs": ["last"],
|
||||
"fields": ""
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"expr": "job_queue_length{queue=\"processing\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "处理中队列",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 62
|
||||
},
|
||||
"id": 23,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"values": false,
|
||||
"calcs": ["last"],
|
||||
"fields": ""
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"expr": "job_queue_length{queue=\"dlq\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "死信队列",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 62
|
||||
},
|
||||
"id": 24,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"values": false,
|
||||
"calcs": ["last"],
|
||||
"fields": ""
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"expr": "sum(job_recovered_total) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "回收任务总数",
|
||||
"type": "stat"
|
||||
}
|
||||
],
|
||||
"refresh": "5s",
|
||||
|
||||
@@ -19,6 +19,14 @@ dependencies = [
|
||||
"pydantic-settings>=2.0.0",
|
||||
"prometheus-client>=0.19.0",
|
||||
"python-json-logger>=2.0.7",
|
||||
# Redis - 任务队列和指标存储
|
||||
"redis>=5.0.0",
|
||||
# YAML 配置解析
|
||||
"pyyaml>=6.0.0",
|
||||
# HTTP 客户端(Webhook 回调)
|
||||
"httpx>=0.27.0",
|
||||
# 轻量级 HTTP 服务器(Worker 健康检查)
|
||||
"aiohttp>=3.9.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
@@ -26,7 +34,6 @@ dev = [
|
||||
"pytest>=7.4.0",
|
||||
"pytest-asyncio>=0.21.0",
|
||||
"pytest-cov>=4.1.0",
|
||||
"httpx>=0.26.0",
|
||||
"black>=23.12.0",
|
||||
"ruff>=0.1.0",
|
||||
]
|
||||
|
||||
@@ -1,16 +1,17 @@
|
||||
# 核心依赖 - 与 pyproject.toml 保持同步
|
||||
fastapi>=0.109.0
|
||||
uvicorn[standard]>=0.27.0
|
||||
pydantic>=2.5.0
|
||||
pydantic-settings>=2.0.0
|
||||
prometheus-client>=0.19.0
|
||||
python-json-logger>=2.0.7
|
||||
aiohttp>=3.9.0
|
||||
|
||||
# 指标存储方案(可选,根据选择的方案安装)
|
||||
# 方案2:Redis 方案需要
|
||||
# Redis - 任务队列和指标存储
|
||||
redis>=5.0.0
|
||||
|
||||
# YAML 配置解析
|
||||
pyyaml>=6.0.0
|
||||
|
||||
# HTTP 客户端(用于 Webhook 回调)
|
||||
# HTTP 客户端(Webhook 回调)
|
||||
httpx>=0.27.0
|
||||
|
||||
@@ -32,7 +32,7 @@ class BaseAlgorithm(ABC):
|
||||
Returns:
|
||||
Dict[str, Any]: 包含结果和元数据的字典
|
||||
"""
|
||||
from ..core.metrics_unified import incr, observe
|
||||
from ..core.metrics_unified import incr_sync, observe_sync
|
||||
|
||||
start_time = time.time()
|
||||
status = "success"
|
||||
@@ -71,5 +71,7 @@ class BaseAlgorithm(ABC):
|
||||
finally:
|
||||
# 记录算法执行指标
|
||||
elapsed_time = time.time() - start_time
|
||||
incr("algorithm_executions_total", {"algorithm": self.name, "status": status})
|
||||
observe("algorithm_execution_duration_seconds", {"algorithm": self.name}, elapsed_time)
|
||||
incr_sync("algorithm_executions_total", {"algorithm": self.name, "status": status})
|
||||
observe_sync(
|
||||
"algorithm_execution_duration_seconds", {"algorithm": self.name}, elapsed_time
|
||||
)
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Dict, Any, List
|
||||
from .base import BaseAlgorithm
|
||||
from ..core.metrics_unified import incr
|
||||
from ..core.metrics_unified import incr_sync
|
||||
|
||||
|
||||
class PrimeChecker(BaseAlgorithm):
|
||||
@@ -31,12 +31,12 @@ class PrimeChecker(BaseAlgorithm):
|
||||
ValueError: 如果输入不是整数
|
||||
"""
|
||||
if not isinstance(number, int):
|
||||
incr('prime_check',{"status":"invalid_input"})
|
||||
incr_sync('prime_check', {"status": "invalid_input"})
|
||||
raise ValueError(f"Input must be an integer, got {type(number).__name__}")
|
||||
|
||||
# 小于2的数不是质数
|
||||
if number < 2:
|
||||
incr('prime_check', {"status": "number_little_two"})
|
||||
incr_sync('prime_check', {"status": "number_little_two"})
|
||||
return {
|
||||
"number": number,
|
||||
"is_prime": False,
|
||||
@@ -50,7 +50,7 @@ class PrimeChecker(BaseAlgorithm):
|
||||
|
||||
# 如果不是质数,计算因数
|
||||
factors = [] if is_prime else self._get_factors(number)
|
||||
incr('prime_check', {"status": "success"})
|
||||
incr_sync('prime_check', {"status": "success"})
|
||||
return {
|
||||
"number": number,
|
||||
"is_prime": is_prime,
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
"""API 路由"""
|
||||
|
||||
import asyncio
|
||||
from fastapi import APIRouter, HTTPException, Depends, status
|
||||
import time
|
||||
import logging
|
||||
@@ -200,10 +199,10 @@ async def create_job(
|
||||
# 获取任务信息
|
||||
job_data = await job_manager.get_job(job_id)
|
||||
|
||||
# 后台执行任务
|
||||
asyncio.create_task(job_manager.execute_job(job_id))
|
||||
# 任务入队,由 Worker 执行
|
||||
await job_manager.enqueue_job(job_id)
|
||||
|
||||
logger.info(f"异步任务已创建: job_id={job_id}, request_id={request_id}")
|
||||
logger.info(f"异步任务已创建并入队: job_id={job_id}, request_id={request_id}")
|
||||
|
||||
return JobCreateResponse(
|
||||
job_id=job_id,
|
||||
|
||||
@@ -57,6 +57,26 @@ class Settings(BaseSettings):
|
||||
webhook_timeout: int = 10 # Webhook 超时时间(秒)
|
||||
max_concurrent_jobs: int = 10 # 最大并发任务数
|
||||
|
||||
# Worker 配置
|
||||
worker_poll_interval: float = 0.1 # Worker 轮询间隔(秒)
|
||||
job_queue_key: str = "job:queue" # 任务队列 Redis Key
|
||||
job_concurrency_key: str = "job:concurrency" # 全局并发计数器 Redis Key
|
||||
job_lock_ttl: int = 300 # 任务锁 TTL(秒)
|
||||
job_max_retries: int = 3 # 任务最大重试次数
|
||||
job_execution_timeout: int = 300 # 任务执行超时(秒)
|
||||
|
||||
# 处理队列配置
|
||||
job_processing_key: str = "job:processing" # 处理中队列
|
||||
job_processing_ts_key: str = "job:processing:ts" # 处理时间戳 ZSET
|
||||
job_dlq_key: str = "job:dlq" # 死信队列
|
||||
|
||||
# 锁配置扩展
|
||||
job_lock_buffer: int = 60 # 锁 TTL 缓冲时间(秒)
|
||||
|
||||
# 回收器配置
|
||||
job_sweeper_enabled: bool = True # 启用回收器
|
||||
job_sweeper_interval: int = 60 # 回收扫描间隔(秒)
|
||||
|
||||
|
||||
# 全局配置实例
|
||||
settings = Settings()
|
||||
|
||||
@@ -7,6 +7,7 @@ import asyncio
|
||||
import json
|
||||
import logging
|
||||
import secrets
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, List, Optional, Type
|
||||
|
||||
@@ -16,6 +17,7 @@ import redis.asyncio as aioredis
|
||||
from ..algorithms.base import BaseAlgorithm
|
||||
from ..config import settings
|
||||
from ..core.metrics_unified import incr, observe
|
||||
from ..core.tracing import set_request_id
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -23,6 +25,24 @@ logger = logging.getLogger(__name__)
|
||||
class JobManager:
|
||||
"""异步任务管理器"""
|
||||
|
||||
# Lua 脚本:安全释放锁(验证 token)
|
||||
RELEASE_LOCK_SCRIPT = """
|
||||
local current = redis.call('GET', KEYS[1])
|
||||
if current == ARGV[1] then
|
||||
return redis.call('DEL', KEYS[1])
|
||||
end
|
||||
return 0
|
||||
"""
|
||||
|
||||
# Lua 脚本:锁续租(验证 token 后延长 TTL)
|
||||
RENEW_LOCK_SCRIPT = """
|
||||
local current = redis.call('GET', KEYS[1])
|
||||
if current == ARGV[1] then
|
||||
return redis.call('EXPIRE', KEYS[1], ARGV[2])
|
||||
end
|
||||
return 0
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._redis_client: Optional[aioredis.Redis] = None
|
||||
self._algorithm_registry: Dict[str, Type[BaseAlgorithm]] = {}
|
||||
@@ -148,7 +168,7 @@ class JobManager:
|
||||
await self._redis_client.hset(key, mapping=job_data)
|
||||
|
||||
# 记录指标
|
||||
incr("jobs_created_total", {"algorithm": algorithm})
|
||||
await incr("jobs_created_total", {"algorithm": algorithm})
|
||||
|
||||
logger.info(f"任务已创建: job_id={job_id}, algorithm={algorithm}")
|
||||
return job_id
|
||||
@@ -176,6 +196,7 @@ class JobManager:
|
||||
"job_id": job_id,
|
||||
"status": job_data.get("status", ""),
|
||||
"algorithm": job_data.get("algorithm", ""),
|
||||
"request_id": job_data.get("request_id") or None,
|
||||
"created_at": job_data.get("created_at", ""),
|
||||
"started_at": job_data.get("started_at") or None,
|
||||
"completed_at": job_data.get("completed_at") or None,
|
||||
@@ -223,6 +244,11 @@ class JobManager:
|
||||
|
||||
algorithm_name = job_data.get("algorithm", "")
|
||||
webhook_url = job_data.get("webhook", "")
|
||||
request_id = job_data.get("request_id", "")
|
||||
|
||||
# 设置 request_id 上下文,确保日志中包含 request_id
|
||||
if request_id:
|
||||
set_request_id(request_id)
|
||||
|
||||
# 解析参数
|
||||
try:
|
||||
@@ -234,7 +260,9 @@ class JobManager:
|
||||
async with self._semaphore:
|
||||
# 更新状态为 running
|
||||
started_at = self._get_timestamp()
|
||||
await self._redis_client.hset(key, mapping={"status": "running", "started_at": started_at})
|
||||
await self._redis_client.hset(
|
||||
key, mapping={"status": "running", "started_at": started_at}
|
||||
)
|
||||
|
||||
logger.info(f"开始执行任务: job_id={job_id}, algorithm={algorithm_name}")
|
||||
|
||||
@@ -292,10 +320,14 @@ class JobManager:
|
||||
await self._redis_client.expire(key, settings.job_result_ttl)
|
||||
|
||||
# 记录指标
|
||||
incr("jobs_completed_total", {"algorithm": algorithm_name, "status": status})
|
||||
observe("job_execution_duration_seconds", {"algorithm": algorithm_name}, elapsed_time)
|
||||
await incr("jobs_completed_total", {"algorithm": algorithm_name, "status": status})
|
||||
await observe(
|
||||
"job_execution_duration_seconds", {"algorithm": algorithm_name}, elapsed_time
|
||||
)
|
||||
|
||||
logger.info(f"任务执行完成: job_id={job_id}, status={status}, elapsed={elapsed_time:.3f}s")
|
||||
logger.info(
|
||||
f"任务执行完成: job_id={job_id}, status={status}, elapsed={elapsed_time:.3f}s"
|
||||
)
|
||||
|
||||
# 发送 Webhook 回调
|
||||
if webhook_url:
|
||||
@@ -342,7 +374,7 @@ class JobManager:
|
||||
)
|
||||
|
||||
if response.status_code < 400:
|
||||
incr("webhook_deliveries_total", {"status": "success"})
|
||||
await incr("webhook_deliveries_total", {"status": "success"})
|
||||
logger.info(
|
||||
f"Webhook 发送成功: job_id={job_id}, url={webhook_url}, "
|
||||
f"status_code={response.status_code}"
|
||||
@@ -365,13 +397,358 @@ class JobManager:
|
||||
await asyncio.sleep(delay)
|
||||
|
||||
# 所有重试都失败
|
||||
incr("webhook_deliveries_total", {"status": "failed"})
|
||||
await incr("webhook_deliveries_total", {"status": "failed"})
|
||||
logger.error(f"Webhook 发送最终失败: job_id={job_id}, url={webhook_url}")
|
||||
|
||||
def is_available(self) -> bool:
    """Report whether the job manager currently holds a Redis client.

    Returns:
        bool: True when ``self._redis_client`` is not None, False otherwise.
    """
    client = self._redis_client
    return client is not None
|
||||
|
||||
async def enqueue_job(self, job_id: str) -> bool:
    """Push a job ID onto the pending job queue.

    Args:
        job_id: ID of the job to enqueue.

    Returns:
        bool: True if the ID was pushed onto the queue; False if no Redis
        client is set or the push raised.
    """
    redis_client = self._redis_client
    if not redis_client:
        logger.error(f"Redis 不可用,无法入队任务: {job_id}")
        return False

    queued = False
    try:
        await redis_client.lpush(settings.job_queue_key, job_id)
        logger.info(f"任务已入队: job_id={job_id}")
        queued = True
    except Exception as exc:
        logger.error(f"任务入队失败: job_id={job_id}, error={exc}")
    return queued
|
||||
|
||||
async def dequeue_job(self, timeout: int = 5) -> Optional[str]:
    """Fetch the next job from the queue (blocking, transfer-style dequeue).

    BLMOVE atomically moves a job ID from the pending queue to the
    processing list, so the ID stays recorded in Redis even if the worker
    dies right after taking it. The dequeue time is then stored in the
    processing-timestamp ZSET.

    Args:
        timeout: Maximum time to block waiting for a job, in seconds.

    Returns:
        Optional[str]: The job ID, or None on timeout, when no Redis client
        is set, or when the BLMOVE call itself fails.
    """
    if not self._redis_client:
        return None

    try:
        # Pop from the RIGHT end of the pending queue and push onto the LEFT
        # end of the processing list.
        job_id = await self._redis_client.blmove(
            settings.job_queue_key,
            settings.job_processing_key,
            timeout,
            "RIGHT",
            "LEFT",
        )
    except Exception as e:
        logger.error(f"任务出队失败: error={e}")
        return None

    if job_id:
        try:
            # Record the dequeue timestamp in the ZSET.
            await self._redis_client.zadd(settings.job_processing_ts_key, {job_id: time.time()})
            logger.debug(f"任务已转移到处理队列: {job_id}")
        except Exception as e:
            # The job has already left the pending queue. Returning None here
            # would strand it in the processing list with no timestamp and no
            # executor, so still hand it to the caller.
            logger.error(f"记录任务出队时间戳失败: job_id={job_id}, error={e}")
    return job_id
|
||||
|
||||
async def acquire_job_lock(self, job_id: str) -> Optional[str]:
    """Try to take the distributed execution lock for a job.

    The lock is the Redis key ``job:lock:<job_id>``, written only if absent
    (NX) with an expiry of the execution timeout plus the lock buffer. Its
    value is a fresh random token.

    Args:
        job_id: ID of the job to lock.

    Returns:
        Optional[str]: The lock token on success; None if the lock was not
        acquired, no Redis client is set, or the call raised.
    """
    client = self._redis_client
    if not client:
        return None

    key = f"job:lock:{job_id}"
    token = secrets.token_hex(16)
    ttl_seconds = settings.job_execution_timeout + settings.job_lock_buffer
    try:
        if await client.set(key, token, nx=True, ex=ttl_seconds):
            logger.debug(f"获取任务锁成功: job_id={job_id}")
            return token
    except Exception as exc:
        logger.error(f"获取任务锁失败: job_id={job_id}, error={exc}")
    return None
|
||||
|
||||
async def release_job_lock(self, job_id: str, lock_token: Optional[str] = None) -> bool:
    """Release the execution lock of a job.

    With a token, the release goes through ``RELEASE_LOCK_SCRIPT`` and only
    counts as successful when the script returns 1. Without a token, the
    lock key is deleted unconditionally.

    Args:
        job_id: ID of the job whose lock is released.
        lock_token: Token obtained when the lock was acquired, used to
            verify ownership.

    Returns:
        bool: True if the lock was released; False if the script did not
        return 1, no Redis client is set, or the call raised.
    """
    client = self._redis_client
    if not client:
        return False

    key = f"job:lock:{job_id}"
    try:
        if not lock_token:
            # Backward-compatible path: no token means a plain delete.
            await client.delete(key)
            logger.debug(f"释放任务锁成功(无 token 验证): job_id={job_id}")
            return True

        outcome = await client.eval(self.RELEASE_LOCK_SCRIPT, 1, key, lock_token)
        if outcome != 1:
            logger.warning(f"释放任务锁失败(token 不匹配): job_id={job_id}")
            return False
        logger.debug(f"释放任务锁成功: job_id={job_id}")
        return True
    except Exception as exc:
        logger.error(f"释放任务锁失败: job_id={job_id}, error={exc}")
        return False
|
||||
|
||||
async def increment_concurrency(self) -> int:
    """Increment the global concurrency counter.

    Returns:
        int: The counter value after the increment, or 0 when no Redis
        client is set or the call raised.
    """
    client = self._redis_client
    if not client:
        return 0

    try:
        return await client.incr(settings.job_concurrency_key)
    except Exception as exc:
        logger.error(f"增加并发计数失败: error={exc}")
        return 0
|
||||
|
||||
async def decrement_concurrency(self) -> int:
    """Decrement the global concurrency counter, never going below zero.

    Returns:
        int: The counter value after the decrement (clamped at 0), or 0
        when no Redis client is set or the call raised.
    """
    if not self._redis_client:
        return 0

    # DECR and the clamp-to-zero run inside one Lua script. Issuing DECR and
    # a separate SET 0 from the client leaves a window in which another
    # worker's INCR can be overwritten by the SET.
    decr_floor_script = """
    local value = redis.call('DECR', KEYS[1])
    if value < 0 then
        redis.call('SET', KEYS[1], 0)
        return 0
    end
    return value
    """
    try:
        count = await self._redis_client.eval(
            decr_floor_script, 1, settings.job_concurrency_key
        )
        return int(count)
    except Exception as e:
        logger.error(f"减少并发计数失败: error={e}")
        return 0
|
||||
|
||||
async def get_global_concurrency(self) -> int:
    """Read the current value of the global concurrency counter.

    Returns:
        int: The stored counter as an integer, or 0 when the key is empty or
        missing, no Redis client is set, or the call raised.
    """
    client = self._redis_client
    if not client:
        return 0

    try:
        raw = await client.get(settings.job_concurrency_key)
        if not raw:
            return 0
        return int(raw)
    except Exception as exc:
        logger.error(f"获取并发计数失败: error={exc}")
        return 0
|
||||
|
||||
async def can_execute(self) -> bool:
    """Check whether a new job may start under the global concurrency limit.

    Returns:
        bool: True when the current global concurrency is strictly below
        ``settings.max_concurrent_jobs``.
    """
    running = await self.get_global_concurrency()
    limit = settings.max_concurrent_jobs
    return running < limit
|
||||
|
||||
async def get_job_retry_count(self, job_id: str) -> int:
    """Read how many times a job has been retried.

    Args:
        job_id: ID of the job.

    Returns:
        int: The ``retry_count`` field of the job hash, or 0 when the field
        is empty or missing, no Redis client is set, or the read raised.
    """
    if not self._redis_client:
        return 0

    key = f"job:{job_id}"
    try:
        retry_count = await self._redis_client.hget(key, "retry_count")
        return int(retry_count) if retry_count else 0
    except Exception as e:
        # Still fall back to 0, but leave a trace like the sibling methods do
        # instead of hiding the failure.
        logger.error(f"获取重试次数失败: job_id={job_id}, error={e}")
        return 0
|
||||
|
||||
async def increment_job_retry(self, job_id: str) -> int:
    """Increment the retry counter of a job.

    Args:
        job_id: ID of the job.

    Returns:
        int: The retry count after the increment, or 0 when no Redis client
        is set or the call raised.
    """
    if not self._redis_client:
        return 0

    key = f"job:{job_id}"
    try:
        # HINCRBY returns the post-increment value. Using it directly avoids
        # a second round trip and cannot observe another writer's update.
        return int(await self._redis_client.hincrby(key, "retry_count", 1))
    except Exception as e:
        logger.error(f"增加重试次数失败: job_id={job_id}, error={e}")
        return 0
|
||||
|
||||
async def ack_job(self, job_id: str) -> bool:
    """Acknowledge a finished job by removing it from the processing records.

    Removes one occurrence of the job ID from the processing list and its
    entry from the processing-timestamp ZSET, in a single transaction.

    Args:
        job_id: ID of the job to acknowledge.

    Returns:
        bool: True when the transaction executed; False if no Redis client
        is set or the call raised.
    """
    client = self._redis_client
    if not client:
        return False

    acknowledged = False
    try:
        async with client.pipeline(transaction=True) as txn:
            txn.lrem(settings.job_processing_key, 1, job_id)
            txn.zrem(settings.job_processing_ts_key, job_id)
            await txn.execute()
        logger.debug(f"任务已确认完成: job_id={job_id}")
        acknowledged = True
    except Exception as exc:
        logger.error(f"确认任务失败: job_id={job_id}, error={exc}")
    return acknowledged
|
||||
|
||||
async def nack_job(self, job_id: str, requeue: bool = True) -> bool:
    """Reject a job: remove it from processing, then re-queue or dead-letter it.

    The job is pushed back onto the pending queue when ``requeue`` is true
    and its retry count is below ``settings.job_max_retries``. Otherwise it
    is pushed onto the dead-letter queue. This method reads the retry count
    but does not change it.

    Args:
        job_id: ID of the job to reject.
        requeue: Whether re-queueing should be attempted.

    Returns:
        bool: True when the transaction executed; False if no Redis client
        is set or the call raised.
    """
    if not self._redis_client:
        return False

    try:
        retry_count = await self.get_job_retry_count(job_id)
        should_requeue = requeue and retry_count < settings.job_max_retries
        target_key = settings.job_queue_key if should_requeue else settings.job_dlq_key

        async with self._redis_client.pipeline(transaction=True) as pipe:
            pipe.lrem(settings.job_processing_key, 1, job_id)
            pipe.zrem(settings.job_processing_ts_key, job_id)
            pipe.lpush(target_key, job_id)
            await pipe.execute()

        # Log only after execute() has returned. Before that the commands are
        # merely queued, and a failed transaction would leave a misleading
        # "re-queued" or "dead-lettered" line in the logs.
        if should_requeue:
            logger.info(f"任务重新入队: job_id={job_id}, retry_count={retry_count}")
        else:
            logger.warning(f"任务进入死信队列: job_id={job_id}, retry_count={retry_count}")
        return True
    except Exception as e:
        logger.error(f"拒绝任务失败: job_id={job_id}, error={e}")
        return False
|
||||
|
||||
async def renew_job_lock(self, job_id: str, lock_token: str) -> bool:
    """Renew the lock held on a job.

    Evaluates ``self.RENEW_LOCK_SCRIPT`` against ``job:lock:{job_id}`` with
    the caller's token and a TTL of
    ``settings.job_execution_timeout + settings.job_lock_buffer``. The script
    is defined elsewhere on the class; presumably it extends the TTL only
    when the stored token matches (confirm against the script).

    Args:
        job_id: Job ID.
        lock_token: Token returned when the lock was acquired.

    Returns:
        bool: True when the script returned 1; False otherwise, including
        when no Redis client is configured or the call raised.
    """
    if not self._redis_client:
        return False

    redis_key = f"job:lock:{job_id}"
    ttl_seconds = settings.job_execution_timeout + settings.job_lock_buffer
    try:
        outcome = await self._redis_client.eval(
            self.RENEW_LOCK_SCRIPT, 1, redis_key, lock_token, ttl_seconds
        )
        if outcome != 1:
            logger.warning(f"锁续租失败(token 不匹配或锁已过期): job_id={job_id}")
            return False
        logger.debug(f"锁续租成功: job_id={job_id}")
        return True
    except Exception as exc:
        logger.error(f"锁续租失败: job_id={job_id}, error={exc}")
        return False
|
||||
|
||||
async def recover_stale_jobs(self) -> int:
    """Reclaim jobs that have sat in the processing set for too long.

    Scans the sorted set at ``settings.job_processing_ts_key`` for entries
    whose score is older than
    ``settings.job_execution_timeout + settings.job_lock_buffer`` seconds.
    Each one has its retry counter bumped and is moved back to the queue,
    or to the dead-letter queue once ``settings.job_max_retries`` is reached.

    Returns:
        int: Number of jobs actually moved. 0 when no Redis client is
        configured or the scan itself fails.
    """
    if not self._redis_client:
        return 0

    timeout = settings.job_execution_timeout + settings.job_lock_buffer
    cutoff = time.time() - timeout

    try:
        stale_jobs = await self._redis_client.zrangebyscore(
            settings.job_processing_ts_key, "-inf", cutoff
        )
    except Exception as e:
        logger.error(f"回收超时任务失败: error={e}")
        return 0

    recovered = 0
    for job_id in stale_jobs:
        # Each job is isolated: one failure must neither abort the sweep nor
        # hide the jobs that were already recovered.
        try:
            # increment_job_retry returns the post-increment count, so a
            # separate read is unnecessary.
            retry_count = await self.increment_job_retry(job_id)
            should_requeue = retry_count < settings.job_max_retries
            target_key = settings.job_queue_key if should_requeue else settings.job_dlq_key

            async with self._redis_client.pipeline(transaction=True) as pipe:
                pipe.lrem(settings.job_processing_key, 1, job_id)
                pipe.zrem(settings.job_processing_ts_key, job_id)
                pipe.lpush(target_key, job_id)
                await pipe.execute()
        except Exception as e:
            logger.error(f"回收超时任务失败: job_id={job_id}, error={e}")
            continue

        # Log after the transaction so the message reflects what happened.
        if should_requeue:
            logger.info(f"超时任务重新入队: job_id={job_id}, retry_count={retry_count}")
        else:
            logger.warning(
                f"超时任务进入死信队列: job_id={job_id}, retry_count={retry_count}"
            )
        recovered += 1

    if recovered > 0:
        logger.info(f"回收超时任务完成: 共 {recovered} 个")
    return recovered
|
||||
|
||||
def get_concurrency_status(self) -> Dict[str, int]:
|
||||
"""获取并发状态
|
||||
|
||||
@@ -398,6 +775,67 @@ class JobManager:
|
||||
"running_jobs": running_jobs,
|
||||
}
|
||||
|
||||
async def collect_queue_metrics(self) -> Dict[str, Any]:
    """Collect queue monitoring figures and publish them as gauges.

    Returns:
        Dict[str, Any]: A dictionary with these keys:
            - queue_length: length of the pending queue
            - processing_length: length of the processing list
            - dlq_length: length of the dead-letter queue
            - oldest_waiting_seconds: age in seconds of the lowest-scored
              entry in the processing-timestamp sorted set (0 when empty)
        All values are 0 when no Redis client is configured or the queue
        lengths cannot be read.
    """
    metrics: Dict[str, Any] = {
        "queue_length": 0,
        "processing_length": 0,
        "dlq_length": 0,
        "oldest_waiting_seconds": 0,
    }
    if not self._redis_client:
        return metrics

    try:
        # One non-transactional pipeline fetches all four figures in a
        # single round trip.
        async with self._redis_client.pipeline(transaction=False) as pipe:
            pipe.llen(settings.job_queue_key)
            pipe.llen(settings.job_processing_key)
            pipe.llen(settings.job_dlq_key)
            pipe.zrange(settings.job_processing_ts_key, 0, 0, withscores=True)
            results = await pipe.execute()

        queue_length = results[0] or 0
        processing_length = results[1] or 0
        dlq_length = results[2] or 0

        oldest_waiting_seconds = 0
        if results[3]:
            # results[3] has the form [(job_id, timestamp), ...]
            oldest_ts = results[3][0][1]
            # Clamp at 0: the score may come from a host whose clock is
            # ahead of ours, and a negative age is meaningless.
            oldest_waiting_seconds = max(0, time.time() - oldest_ts)
    except Exception as e:
        logger.error(f"收集队列指标失败: error={e}")
        return metrics

    metrics.update(
        queue_length=queue_length,
        processing_length=processing_length,
        dlq_length=dlq_length,
        oldest_waiting_seconds=oldest_waiting_seconds,
    )

    # Publishing is best-effort: a metrics-backend failure must not discard
    # the figures that were just read successfully.
    try:
        from .metrics_unified import set as metrics_set

        await metrics_set("job_queue_length", {"queue": "pending"}, queue_length)
        await metrics_set("job_queue_length", {"queue": "processing"}, processing_length)
        await metrics_set("job_queue_length", {"queue": "dlq"}, dlq_length)
        await metrics_set("job_oldest_waiting_seconds", None, oldest_waiting_seconds)
    except Exception as e:
        logger.error(f"更新队列指标失败: error={e}")

    return metrics
|
||||
|
||||
|
||||
# 全局单例
|
||||
_job_manager: Optional[JobManager] = None
|
||||
|
||||
@@ -1,19 +1,21 @@
|
||||
"""统一指标管理模块
|
||||
|
||||
基于 Redis 的指标收集方案,支持多实例部署和 YAML 配置。
|
||||
使用异步 Redis 客户端,避免在异步请求路径中阻塞事件循环。
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import socket
|
||||
import logging
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
from functools import wraps
|
||||
import time
|
||||
|
||||
import yaml
|
||||
import redis
|
||||
import redis.asyncio as aioredis
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -22,7 +24,7 @@ class MetricsManager:
|
||||
"""统一指标管理器
|
||||
|
||||
支持从 YAML 配置文件加载指标定义,使用 Redis 存储指标数据,
|
||||
并导出 Prometheus 格式的指标。
|
||||
并导出 Prometheus 格式的指标。使用异步 Redis 客户端。
|
||||
"""
|
||||
|
||||
def __init__(self, config_path: Optional[str] = None):
|
||||
@@ -37,16 +39,22 @@ class MetricsManager:
|
||||
self.instance_id = settings.metrics_instance_id or socket.gethostname()
|
||||
self.config: Dict[str, Any] = {}
|
||||
self.metrics_definitions: Dict[str, Dict[str, Any]] = {}
|
||||
self._redis_client: Optional[redis.Redis] = None
|
||||
self._redis_client: Optional[aioredis.Redis] = None
|
||||
self._redis_available = False
|
||||
self._initialized = False
|
||||
|
||||
# 加载配置
|
||||
# 加载配置(同步操作)
|
||||
self._load_config()
|
||||
# 初始化 Redis 连接
|
||||
self._init_redis()
|
||||
# 注册指标定义
|
||||
# 注册指标定义(同步操作)
|
||||
self._register_metrics()
|
||||
|
||||
async def initialize(self) -> None:
    """Run the asynchronous Redis setup; a no-op after the first completed call."""
    if not self._initialized:
        await self._init_redis()
        self._initialized = True
|
||||
|
||||
def _load_config(self) -> None:
|
||||
"""加载 YAML 配置文件"""
|
||||
# 尝试多个路径
|
||||
@@ -138,8 +146,8 @@ class MetricsManager:
|
||||
"custom_metrics": {},
|
||||
}
|
||||
|
||||
def _init_redis(self) -> None:
|
||||
"""初始化 Redis 连接"""
|
||||
async def _init_redis(self) -> None:
|
||||
"""异步初始化 Redis 连接"""
|
||||
from ..config import settings
|
||||
|
||||
redis_config = self.config.get("redis", {})
|
||||
@@ -149,7 +157,7 @@ class MetricsManager:
|
||||
password = redis_config.get("password") or settings.redis_password
|
||||
|
||||
try:
|
||||
self._redis_client = redis.Redis(
|
||||
self._redis_client = aioredis.Redis(
|
||||
host=host,
|
||||
port=port,
|
||||
db=db,
|
||||
@@ -159,10 +167,10 @@ class MetricsManager:
|
||||
socket_timeout=5,
|
||||
)
|
||||
# 测试连接
|
||||
self._redis_client.ping()
|
||||
await self._redis_client.ping()
|
||||
self._redis_available = True
|
||||
logger.info(f"Redis 连接成功: {host}:{port}/{db}")
|
||||
except redis.ConnectionError as e:
|
||||
except aioredis.ConnectionError as e:
|
||||
logger.warning(f"Redis 连接失败: {e},指标将不会被收集")
|
||||
self._redis_available = False
|
||||
except Exception as e:
|
||||
@@ -235,7 +243,9 @@ class MetricsManager:
|
||||
|
||||
# === 简单 API(业务代码使用)===
|
||||
|
||||
def incr(self, name: str, labels: Optional[Dict[str, str]] = None, value: int = 1) -> None:
|
||||
async def incr(
|
||||
self, name: str, labels: Optional[Dict[str, str]] = None, value: int = 1
|
||||
) -> None:
|
||||
"""增加计数器
|
||||
|
||||
Args:
|
||||
@@ -252,11 +262,13 @@ class MetricsManager:
|
||||
try:
|
||||
key = f"metrics:counter:{name}"
|
||||
field = self._labels_to_key(labels) or "_default_"
|
||||
self._redis_client.hincrbyfloat(key, field, value)
|
||||
await self._redis_client.hincrbyfloat(key, field, value)
|
||||
except Exception as e:
|
||||
logger.error(f"增加计数器失败: {e}")
|
||||
|
||||
def set(self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
|
||||
async def set(
|
||||
self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 0
|
||||
) -> None:
|
||||
"""设置仪表盘值
|
||||
|
||||
Args:
|
||||
@@ -273,11 +285,11 @@ class MetricsManager:
|
||||
try:
|
||||
key = f"metrics:gauge:{name}"
|
||||
field = self._labels_to_key(labels) or "_default_"
|
||||
self._redis_client.hset(key, field, value)
|
||||
await self._redis_client.hset(key, field, value)
|
||||
except Exception as e:
|
||||
logger.error(f"设置仪表盘失败: {e}")
|
||||
|
||||
def gauge_incr(
|
||||
async def gauge_incr(
|
||||
self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 1
|
||||
) -> None:
|
||||
"""增加仪表盘值
|
||||
@@ -296,11 +308,11 @@ class MetricsManager:
|
||||
try:
|
||||
key = f"metrics:gauge:{name}"
|
||||
field = self._labels_to_key(labels) or "_default_"
|
||||
self._redis_client.hincrbyfloat(key, field, value)
|
||||
await self._redis_client.hincrbyfloat(key, field, value)
|
||||
except Exception as e:
|
||||
logger.error(f"增加仪表盘失败: {e}")
|
||||
|
||||
def gauge_decr(
|
||||
async def gauge_decr(
|
||||
self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 1
|
||||
) -> None:
|
||||
"""减少仪表盘值
|
||||
@@ -310,9 +322,11 @@ class MetricsManager:
|
||||
labels: 标签字典
|
||||
value: 减少的值
|
||||
"""
|
||||
self.gauge_incr(name, labels, -value)
|
||||
await self.gauge_incr(name, labels, -value)
|
||||
|
||||
def observe(self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
|
||||
async def observe(
|
||||
self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 0
|
||||
) -> None:
|
||||
"""记录直方图观测值
|
||||
|
||||
Args:
|
||||
@@ -348,13 +362,13 @@ class MetricsManager:
|
||||
# +Inf 桶总是增加
|
||||
pipe.hincrbyfloat(f"metrics:histogram:{name}:bucket:+Inf", label_key, 1)
|
||||
|
||||
pipe.execute()
|
||||
await pipe.execute()
|
||||
except Exception as e:
|
||||
logger.error(f"记录直方图失败: {e}")
|
||||
|
||||
# === 导出方法 ===
|
||||
|
||||
def export(self) -> str:
|
||||
async def export(self) -> str:
|
||||
"""导出 Prometheus 格式指标
|
||||
|
||||
Returns:
|
||||
@@ -375,11 +389,11 @@ class MetricsManager:
|
||||
lines.append(f"# TYPE {name} {metric_type}")
|
||||
|
||||
if metric_type == "counter":
|
||||
lines.extend(self._export_counter(name))
|
||||
lines.extend(await self._export_counter(name))
|
||||
elif metric_type == "gauge":
|
||||
lines.extend(self._export_gauge(name))
|
||||
lines.extend(await self._export_gauge(name))
|
||||
elif metric_type == "histogram":
|
||||
lines.extend(self._export_histogram(name, definition))
|
||||
lines.extend(await self._export_histogram(name, definition))
|
||||
|
||||
lines.append("") # 空行分隔
|
||||
|
||||
@@ -389,12 +403,12 @@ class MetricsManager:
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
def _export_counter(self, name: str) -> List[str]:
|
||||
async def _export_counter(self, name: str) -> List[str]:
|
||||
"""导出计数器指标"""
|
||||
lines = []
|
||||
key = f"metrics:counter:{name}"
|
||||
|
||||
data = self._redis_client.hgetall(key)
|
||||
data = await self._redis_client.hgetall(key)
|
||||
for field, value in data.items():
|
||||
if field == "_default_":
|
||||
lines.append(f"{name} {value}")
|
||||
@@ -404,12 +418,12 @@ class MetricsManager:
|
||||
|
||||
return lines
|
||||
|
||||
def _export_gauge(self, name: str) -> List[str]:
|
||||
async def _export_gauge(self, name: str) -> List[str]:
|
||||
"""导出仪表盘指标"""
|
||||
lines = []
|
||||
key = f"metrics:gauge:{name}"
|
||||
|
||||
data = self._redis_client.hgetall(key)
|
||||
data = await self._redis_client.hgetall(key)
|
||||
for field, value in data.items():
|
||||
if field == "_default_":
|
||||
lines.append(f"{name} {value}")
|
||||
@@ -419,14 +433,14 @@ class MetricsManager:
|
||||
|
||||
return lines
|
||||
|
||||
def _export_histogram(self, name: str, definition: Dict[str, Any]) -> List[str]:
|
||||
async def _export_histogram(self, name: str, definition: Dict[str, Any]) -> List[str]:
|
||||
"""导出直方图指标"""
|
||||
lines = []
|
||||
buckets = definition.get("buckets", [])
|
||||
|
||||
# 获取所有标签组合
|
||||
count_data = self._redis_client.hgetall(f"metrics:histogram:{name}:count")
|
||||
sum_data = self._redis_client.hgetall(f"metrics:histogram:{name}:sum")
|
||||
count_data = await self._redis_client.hgetall(f"metrics:histogram:{name}:count")
|
||||
sum_data = await self._redis_client.hgetall(f"metrics:histogram:{name}:sum")
|
||||
|
||||
for label_key in count_data.keys():
|
||||
prom_labels = self._key_to_prometheus_labels(label_key)
|
||||
@@ -434,7 +448,7 @@ class MetricsManager:
|
||||
# 导出各个桶
|
||||
for bucket in buckets:
|
||||
bucket_key = f"metrics:histogram:{name}:bucket:{bucket}"
|
||||
bucket_value = self._redis_client.hget(bucket_key, label_key) or "0"
|
||||
bucket_value = await self._redis_client.hget(bucket_key, label_key) or "0"
|
||||
if label_key == "_default_":
|
||||
lines.append(f'{name}_bucket{{le="{bucket}"}} {bucket_value}')
|
||||
else:
|
||||
@@ -442,7 +456,7 @@ class MetricsManager:
|
||||
|
||||
# +Inf 桶
|
||||
inf_key = f"metrics:histogram:{name}:bucket:+Inf"
|
||||
inf_value = self._redis_client.hget(inf_key, label_key) or "0"
|
||||
inf_value = await self._redis_client.hget(inf_key, label_key) or "0"
|
||||
if label_key == "_default_":
|
||||
lines.append(f'{name}_bucket{{le="+Inf"}} {inf_value}')
|
||||
else:
|
||||
@@ -464,43 +478,79 @@ class MetricsManager:
|
||||
"""检查 Redis 是否可用"""
|
||||
return self._redis_available
|
||||
|
||||
def reset(self) -> None:
|
||||
async def reset(self) -> None:
|
||||
"""重置所有指标(主要用于测试)"""
|
||||
if not self._redis_available:
|
||||
return
|
||||
|
||||
try:
|
||||
# 删除所有指标相关的 key
|
||||
keys = self._redis_client.keys("metrics:*")
|
||||
keys = await self._redis_client.keys("metrics:*")
|
||||
if keys:
|
||||
self._redis_client.delete(*keys)
|
||||
await self._redis_client.delete(*keys)
|
||||
logger.info("已重置所有指标")
|
||||
except Exception as e:
|
||||
logger.error(f"重置指标失败: {e}")
|
||||
|
||||
async def close(self) -> None:
    """Close the Redis client and mark the manager as uninitialized.

    Does nothing when no client is set. An error raised by the client's
    ``close()`` still propagates, but the manager state is reset first so a
    later ``initialize()`` starts from a clean slate instead of reusing a
    half-closed client.
    """
    client = self._redis_client
    if not client:
        return
    try:
        await client.close()
    finally:
        self._redis_client = None
        self._redis_available = False
        self._initialized = False
|
||||
|
||||
|
||||
# 全局单例
|
||||
_manager: Optional[MetricsManager] = None
|
||||
_manager_lock = asyncio.Lock()
|
||||
|
||||
|
||||
def get_metrics_manager() -> MetricsManager:
|
||||
"""获取指标管理器单例"""
|
||||
async def get_metrics_manager() -> MetricsManager:
    """Return the metrics manager singleton, creating and initializing it on demand.

    Creation and initialization both happen under ``_manager_lock``. The
    singleton becomes visible to other coroutines as soon as it is assigned,
    and it may also have been created uninitialized by
    ``get_metrics_manager_sync()``. Initializing outside the lock would let
    concurrent first callers run ``initialize()`` twice.
    """
    global _manager
    # Fast path: already created and initialized, no lock needed.
    if _manager is not None and _manager._initialized:
        return _manager

    async with _manager_lock:
        if _manager is None:
            _manager = MetricsManager()
        if not _manager._initialized:
            await _manager.initialize()
    return _manager
|
||||
|
||||
|
||||
def get_metrics_manager_sync() -> MetricsManager:
    """Return the metrics manager singleton from synchronous code.

    Note: this only constructs the manager. It does not open the Redis
    connection; ``initialize()`` must still be awaited from an async context.
    """
    global _manager
    if _manager is not None:
        return _manager
    _manager = MetricsManager()
    return _manager
|
||||
|
||||
|
||||
def reset_metrics_manager() -> None:
|
||||
async def reset_metrics_manager() -> None:
    """Close and discard the metrics manager singleton (mainly for tests).

    The singleton reference is cleared even when closing raises, so the next
    ``get_metrics_manager()`` call builds a fresh manager rather than reusing
    one whose connection failed to shut down. The error still propagates.
    """
    global _manager
    if _manager is None:
        return
    try:
        await _manager.close()
    finally:
        _manager = None
|
||||
|
||||
|
||||
def reset_metrics_manager_sync() -> None:
    """Drop the metrics manager singleton from synchronous code (mainly for tests).

    Note: only the module-level reference is cleared; the manager's Redis
    connection is not closed here.
    """
    global _manager
    _manager = None
|
||||
|
||||
|
||||
# === 便捷函数(业务代码直接调用)===
|
||||
|
||||
|
||||
def incr(name: str, labels: Optional[Dict[str, str]] = None, value: int = 1) -> None:
|
||||
async def incr(name: str, labels: Optional[Dict[str, str]] = None, value: int = 1) -> None:
|
||||
"""增加计数器 - 便捷函数
|
||||
|
||||
Args:
|
||||
@@ -508,10 +558,11 @@ def incr(name: str, labels: Optional[Dict[str, str]] = None, value: int = 1) ->
|
||||
labels: 标签字典
|
||||
value: 增加的值,默认为 1
|
||||
"""
|
||||
get_metrics_manager().incr(name, labels, value)
|
||||
manager = await get_metrics_manager()
|
||||
await manager.incr(name, labels, value)
|
||||
|
||||
|
||||
def set(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
|
||||
async def set(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
|
||||
"""设置仪表盘 - 便捷函数
|
||||
|
||||
Args:
|
||||
@@ -519,10 +570,13 @@ def set(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) ->
|
||||
labels: 标签字典
|
||||
value: 设置的值
|
||||
"""
|
||||
get_metrics_manager().set(name, labels, value)
|
||||
manager = await get_metrics_manager()
|
||||
await manager.set(name, labels, value)
|
||||
|
||||
|
||||
def gauge_incr(name: str, labels: Optional[Dict[str, str]] = None, value: float = 1) -> None:
|
||||
async def gauge_incr(
|
||||
name: str, labels: Optional[Dict[str, str]] = None, value: float = 1
|
||||
) -> None:
|
||||
"""增加仪表盘 - 便捷函数
|
||||
|
||||
Args:
|
||||
@@ -530,10 +584,13 @@ def gauge_incr(name: str, labels: Optional[Dict[str, str]] = None, value: float
|
||||
labels: 标签字典
|
||||
value: 增加的值
|
||||
"""
|
||||
get_metrics_manager().gauge_incr(name, labels, value)
|
||||
manager = await get_metrics_manager()
|
||||
await manager.gauge_incr(name, labels, value)
|
||||
|
||||
|
||||
def gauge_decr(name: str, labels: Optional[Dict[str, str]] = None, value: float = 1) -> None:
|
||||
async def gauge_decr(
|
||||
name: str, labels: Optional[Dict[str, str]] = None, value: float = 1
|
||||
) -> None:
|
||||
"""减少仪表盘 - 便捷函数
|
||||
|
||||
Args:
|
||||
@@ -541,10 +598,13 @@ def gauge_decr(name: str, labels: Optional[Dict[str, str]] = None, value: float
|
||||
labels: 标签字典
|
||||
value: 减少的值
|
||||
"""
|
||||
get_metrics_manager().gauge_decr(name, labels, value)
|
||||
manager = await get_metrics_manager()
|
||||
await manager.gauge_decr(name, labels, value)
|
||||
|
||||
|
||||
def observe(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
|
||||
async def observe(
|
||||
name: str, labels: Optional[Dict[str, str]] = None, value: float = 0
|
||||
) -> None:
|
||||
"""记录直方图 - 便捷函数
|
||||
|
||||
Args:
|
||||
@@ -552,21 +612,105 @@ def observe(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0
|
||||
labels: 标签字典
|
||||
value: 观测值
|
||||
"""
|
||||
get_metrics_manager().observe(name, labels, value)
|
||||
manager = await get_metrics_manager()
|
||||
await manager.observe(name, labels, value)
|
||||
|
||||
|
||||
def export() -> str:
|
||||
async def export() -> str:
|
||||
"""导出指标 - 便捷函数
|
||||
|
||||
Returns:
|
||||
Prometheus 文本格式的指标字符串
|
||||
"""
|
||||
return get_metrics_manager().export()
|
||||
manager = await get_metrics_manager()
|
||||
return await manager.export()
|
||||
|
||||
|
||||
def is_available() -> bool:
|
||||
async def is_available() -> bool:
|
||||
"""检查 Redis 是否可用 - 便捷函数"""
|
||||
return get_metrics_manager().is_available()
|
||||
manager = await get_metrics_manager()
|
||||
return manager.is_available()
|
||||
|
||||
|
||||
# === 同步便捷函数(用于同步代码中的 fire-and-forget 模式)===
|
||||
|
||||
|
||||
# Strong references to in-flight fire-and-forget tasks. The event loop keeps
# only weak references to tasks, so without this set a task could be
# garbage-collected before it finishes.
_background_tasks = set()


def _schedule_async(coro) -> None:
    """Schedule a coroutine on the running event loop without awaiting it.

    When no event loop is running in the current thread, the coroutine is
    closed and dropped silently. Closing it avoids the
    "coroutine ... was never awaited" RuntimeWarning.

    Args:
        coro: An already-created coroutine object.
    """
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No running event loop: ignore silently, but release the coroutine.
        coro.close()
        return

    task = loop.create_task(coro)
    _background_tasks.add(task)
    # Drop the reference once the task is done so the set does not grow.
    task.add_done_callback(_background_tasks.discard)
|
||||
|
||||
|
||||
def incr_sync(name: str, labels: Optional[Dict[str, str]] = None, value: int = 1) -> None:
    """Increment a counter from synchronous code (fire-and-forget).

    Args:
        name: Metric name.
        labels: Label dictionary.
        value: Amount to add; defaults to 1.
    """
    pending = incr(name, labels, value)
    _schedule_async(pending)
|
||||
|
||||
|
||||
def set_sync(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
    """Set a gauge from synchronous code (fire-and-forget).

    Args:
        name: Metric name.
        labels: Label dictionary.
        value: Value to store.
    """
    pending = set(name, labels, value)
    _schedule_async(pending)
|
||||
|
||||
|
||||
def gauge_incr_sync(name: str, labels: Optional[Dict[str, str]] = None, value: float = 1) -> None:
    """Increase a gauge from synchronous code (fire-and-forget).

    Args:
        name: Metric name.
        labels: Label dictionary.
        value: Amount to add.
    """
    pending = gauge_incr(name, labels, value)
    _schedule_async(pending)
|
||||
|
||||
|
||||
def gauge_decr_sync(name: str, labels: Optional[Dict[str, str]] = None, value: float = 1) -> None:
    """Decrease a gauge from synchronous code (fire-and-forget).

    Args:
        name: Metric name.
        labels: Label dictionary.
        value: Amount to subtract.
    """
    pending = gauge_decr(name, labels, value)
    _schedule_async(pending)
|
||||
|
||||
|
||||
def observe_sync(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
    """Record a histogram observation from synchronous code (fire-and-forget).

    Args:
        name: Metric name.
        labels: Label dictionary.
        value: Observed value.
    """
    pending = observe(name, labels, value)
    _schedule_async(pending)
|
||||
|
||||
|
||||
# === 装饰器(兼容旧 API)===
|
||||
@@ -593,8 +737,11 @@ def track_algorithm_execution(algorithm_name: str):
|
||||
raise e
|
||||
finally:
|
||||
elapsed = time.time() - start_time
|
||||
incr("algorithm_executions_total", {"algorithm": algorithm_name, "status": status})
|
||||
observe(
|
||||
incr_sync(
|
||||
"algorithm_executions_total",
|
||||
{"algorithm": algorithm_name, "status": status},
|
||||
)
|
||||
observe_sync(
|
||||
"algorithm_execution_duration_seconds",
|
||||
{"algorithm": algorithm_name},
|
||||
elapsed,
|
||||
|
||||
@@ -95,7 +95,7 @@ async def track_metrics(request: Request, call_next):
|
||||
if request.url.path in skip_paths:
|
||||
return await call_next(request)
|
||||
|
||||
gauge_incr("http_requests_in_progress")
|
||||
await gauge_incr("http_requests_in_progress")
|
||||
start_time = time.time()
|
||||
status = "success"
|
||||
|
||||
@@ -112,16 +112,16 @@ async def track_metrics(request: Request, call_next):
|
||||
elapsed = time.time() - start_time
|
||||
# 使用规范化后的路径记录指标
|
||||
normalized_path = normalize_path(request.url.path)
|
||||
incr(
|
||||
await incr(
|
||||
"http_requests_total",
|
||||
{"method": request.method, "endpoint": normalized_path, "status": status},
|
||||
)
|
||||
observe(
|
||||
await observe(
|
||||
"http_request_duration_seconds",
|
||||
{"method": request.method, "endpoint": normalized_path},
|
||||
elapsed,
|
||||
)
|
||||
gauge_decr("http_requests_in_progress")
|
||||
await gauge_decr("http_requests_in_progress")
|
||||
|
||||
|
||||
# 注册路由
|
||||
@@ -145,7 +145,7 @@ async def metrics():
|
||||
return Response(content="Metrics disabled", status_code=404)
|
||||
|
||||
return Response(
|
||||
content=export(),
|
||||
content=await export(),
|
||||
media_type="text/plain; version=0.0.4; charset=utf-8",
|
||||
)
|
||||
|
||||
@@ -160,7 +160,7 @@ async def startup_event():
|
||||
|
||||
# 初始化指标管理器
|
||||
if settings.metrics_enabled:
|
||||
manager = get_metrics_manager()
|
||||
manager = await get_metrics_manager()
|
||||
if manager.is_available():
|
||||
logger.info("Redis 指标收集已启用")
|
||||
else:
|
||||
|
||||
373
src/functional_scaffold/worker.py
Normal file
373
src/functional_scaffold/worker.py
Normal file
@@ -0,0 +1,373 @@
|
||||
"""Worker 进程模块
|
||||
|
||||
基于 Redis 队列的任务 Worker,支持分布式锁和全局并发控制。
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from aiohttp import web
|
||||
|
||||
from .config import settings
|
||||
from .core.job_manager import JobManager
|
||||
from .core.logging import setup_logging
|
||||
from .core.tracing import set_request_id
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HealthCheckServer:
    """Lightweight HTTP server exposing health-check endpoints.

    Gives worker mode ``/healthz`` and ``/readyz`` endpoints, to satisfy the
    FC 3.0 container health-check requirement.
    """

    def __init__(self, host: str = "0.0.0.0", port: int = 8000):
        self._host = host
        self._port = port
        # aiohttp objects are created in start().
        self._app: Optional[web.Application] = None
        self._runner: Optional[web.AppRunner] = None
        self._site: Optional[web.TCPSite] = None
        # Readiness flag reported by /readyz; toggled through set_healthy().
        self._healthy = True

    async def start(self) -> None:
        """Register the routes and start listening on the configured host/port."""
        self._app = web.Application()
        routes = (
            ("/healthz", self._healthz_handler),
            ("/readyz", self._readyz_handler),
        )
        for path, handler in routes:
            self._app.router.add_get(path, handler)

        self._runner = web.AppRunner(self._app)
        await self._runner.setup()
        self._site = web.TCPSite(self._runner, self._host, self._port)
        await self._site.start()
        logger.info(f"健康检查服务器已启动: http://{self._host}:{self._port}")

    async def stop(self) -> None:
        """Clean up the runner if the server was started."""
        if not self._runner:
            return
        await self._runner.cleanup()
        logger.info("健康检查服务器已停止")

    def set_healthy(self, healthy: bool) -> None:
        """Set the readiness state reported by ``/readyz``."""
        self._healthy = healthy

    async def _healthz_handler(self, request: web.Request) -> web.Response:
        """Liveness endpoint: always answers with a healthy payload."""
        payload = {"status": "healthy", "mode": "worker"}
        return web.json_response(payload)

    async def _readyz_handler(self, request: web.Request) -> web.Response:
        """Readiness endpoint: 200 while healthy, 503 otherwise."""
        if not self._healthy:
            return web.json_response({"status": "not ready"}, status=503)
        return web.json_response({"status": "ready", "mode": "worker"})
|
||||
|
||||
|
||||
class JobWorker:
|
||||
"""任务 Worker
|
||||
|
||||
从 Redis 队列获取任务并执行,支持:
|
||||
- 分布式锁防止重复执行
|
||||
- 全局并发控制
|
||||
- 任务重试机制
|
||||
- 锁续租机制
|
||||
- 超时任务回收
|
||||
- 优雅关闭
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._job_manager: Optional[JobManager] = None
|
||||
self._running: bool = False
|
||||
self._current_job_id: Optional[str] = None
|
||||
self._current_lock_token: Optional[str] = None
|
||||
self._lock_renewal_task: Optional[asyncio.Task] = None
|
||||
self._sweeper_task: Optional[asyncio.Task] = None
|
||||
|
||||
async def initialize(self) -> None:
|
||||
"""初始化 Worker"""
|
||||
self._job_manager = JobManager()
|
||||
await self._job_manager.initialize()
|
||||
logger.info("Worker 初始化完成")
|
||||
|
||||
async def shutdown(self) -> None:
|
||||
"""关闭 Worker"""
|
||||
logger.info("Worker 正在关闭...")
|
||||
self._running = False
|
||||
|
||||
# 取消回收器任务
|
||||
if self._sweeper_task and not self._sweeper_task.done():
|
||||
self._sweeper_task.cancel()
|
||||
try:
|
||||
await self._sweeper_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
# 取消锁续租任务
|
||||
if self._lock_renewal_task and not self._lock_renewal_task.done():
|
||||
self._lock_renewal_task.cancel()
|
||||
try:
|
||||
await self._lock_renewal_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
# 等待当前任务完成
|
||||
if self._current_job_id:
|
||||
logger.info(f"等待当前任务完成: {self._current_job_id}")
|
||||
|
||||
if self._job_manager:
|
||||
await self._job_manager.shutdown()
|
||||
|
||||
logger.info("Worker 已关闭")
|
||||
|
||||
async def run(self) -> None:
|
||||
"""运行 Worker 主循环"""
|
||||
self._running = True
|
||||
logger.info(
|
||||
f"Worker 启动,轮询间隔: {settings.worker_poll_interval}s,"
|
||||
f"最大并发: {settings.max_concurrent_jobs}"
|
||||
)
|
||||
|
||||
# 启动超时任务回收器
|
||||
if settings.job_sweeper_enabled:
|
||||
self._sweeper_task = asyncio.create_task(self._sweeper_loop())
|
||||
logger.info(f"超时任务回收器已启动,扫描间隔: {settings.job_sweeper_interval}s")
|
||||
|
||||
while self._running:
|
||||
try:
|
||||
await self._process_next_job()
|
||||
except Exception as e:
|
||||
logger.error(f"Worker 循环异常: {e}", exc_info=True)
|
||||
await asyncio.sleep(settings.worker_poll_interval)
|
||||
|
||||
async def _process_next_job(self) -> None:
|
||||
"""处理下一个任务"""
|
||||
if not self._job_manager:
|
||||
logger.error("JobManager 未初始化")
|
||||
await asyncio.sleep(settings.worker_poll_interval)
|
||||
return
|
||||
|
||||
# 从队列获取任务(转移式出队)
|
||||
job_id = await self._job_manager.dequeue_job(timeout=int(settings.worker_poll_interval))
|
||||
|
||||
if not job_id:
|
||||
return
|
||||
|
||||
# 获取任务信息以提取 request_id
|
||||
job_data = await self._job_manager.get_job(job_id)
|
||||
if job_data:
|
||||
request_id = job_data.get("request_id") or job_id
|
||||
set_request_id(request_id)
|
||||
else:
|
||||
set_request_id(job_id)
|
||||
|
||||
logger.info(f"从队列获取任务: {job_id}")
|
||||
|
||||
# 尝试获取分布式锁(返回 token)
|
||||
lock_token = await self._job_manager.acquire_job_lock(job_id)
|
||||
if not lock_token:
|
||||
logger.warning(f"无法获取任务锁,任务可能正在被其他 Worker 执行: {job_id}")
|
||||
# 任务留在 processing 队列,等待回收器处理
|
||||
return
|
||||
|
||||
self._current_lock_token = lock_token
|
||||
|
||||
# 启动锁续租协程
|
||||
self._lock_renewal_task = asyncio.create_task(self._lock_renewal_loop(job_id, lock_token))
|
||||
|
||||
try:
|
||||
# 检查全局并发限制
|
||||
if not await self._job_manager.can_execute():
|
||||
logger.info(f"达到并发限制,任务 NACK 重新入队: {job_id}")
|
||||
await self._job_manager.nack_job(job_id, requeue=True)
|
||||
return
|
||||
|
||||
# 增加并发计数
|
||||
await self._job_manager.increment_concurrency()
|
||||
self._current_job_id = job_id
|
||||
|
||||
try:
|
||||
# 执行任务
|
||||
success = await self._execute_with_retry(job_id)
|
||||
if success:
|
||||
await self._job_manager.ack_job(job_id)
|
||||
else:
|
||||
await self._job_manager.increment_job_retry(job_id)
|
||||
await self._job_manager.nack_job(job_id, requeue=True)
|
||||
finally:
|
||||
# 减少并发计数
|
||||
await self._job_manager.decrement_concurrency()
|
||||
self._current_job_id = None
|
||||
|
||||
finally:
|
||||
# 停止锁续租
|
||||
if self._lock_renewal_task and not self._lock_renewal_task.done():
|
||||
self._lock_renewal_task.cancel()
|
||||
try:
|
||||
await self._lock_renewal_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
self._lock_renewal_task = None
|
||||
|
||||
# 释放分布式锁
|
||||
await self._job_manager.release_job_lock(job_id, lock_token)
|
||||
self._current_lock_token = None
|
||||
|
||||
async def _execute_with_retry(self, job_id: str) -> bool:
|
||||
"""执行任务(带重试机制)
|
||||
|
||||
Returns:
|
||||
bool: 任务是否成功执行
|
||||
"""
|
||||
if not self._job_manager:
|
||||
return False
|
||||
|
||||
try:
|
||||
# 执行任务
|
||||
await asyncio.wait_for(
|
||||
self._job_manager.execute_job(job_id),
|
||||
timeout=settings.job_execution_timeout,
|
||||
)
|
||||
return True
|
||||
except asyncio.TimeoutError:
|
||||
logger.error(f"任务执行超时: {job_id}")
|
||||
await self._handle_job_failure(job_id, "任务执行超时")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error(f"任务执行异常: {job_id}, error={e}", exc_info=True)
|
||||
await self._handle_job_failure(job_id, str(e))
|
||||
return False
|
||||
|
||||
async def _handle_job_failure(self, job_id: str, error: str) -> None:
|
||||
"""处理任务失败"""
|
||||
if not self._job_manager:
|
||||
return
|
||||
|
||||
retry_count = await self._job_manager.increment_job_retry(job_id)
|
||||
|
||||
if retry_count < settings.job_max_retries:
|
||||
logger.info(f"任务将重试 ({retry_count}/{settings.job_max_retries}): {job_id}")
|
||||
# 重新入队
|
||||
await self._job_manager.enqueue_job(job_id)
|
||||
else:
|
||||
logger.error(f"任务达到最大重试次数,标记为失败: {job_id}")
|
||||
# 更新任务状态为失败
|
||||
if self._job_manager._redis_client:
|
||||
key = f"job:{job_id}"
|
||||
await self._job_manager._redis_client.hset(
|
||||
key,
|
||||
mapping={
|
||||
"status": "failed",
|
||||
"error": f"达到最大重试次数 ({settings.job_max_retries}): {error}",
|
||||
},
|
||||
)
|
||||
|
||||
async def _lock_renewal_loop(self, job_id: str, lock_token: str) -> None:
    """Periodically renew the job lock so it does not expire during a long job.

    The loop ends when the job manager is gone, when a renewal is
    rejected, when the task is cancelled, or when any exception occurs.

    Args:
        job_id: Identifier of the job whose lock is renewed.
        lock_token: Token passed to ``renew_job_lock`` to identify the lock.
    """
    # Renew at half of (execution timeout + lock buffer); presumably that
    # sum is the lock TTL -- confirm against acquire_job_lock.
    interval = (settings.job_execution_timeout + settings.job_lock_buffer) / 2
    try:
        while True:
            await asyncio.sleep(interval)
            if not self._job_manager:
                return
            renewed = await self._job_manager.renew_job_lock(job_id, lock_token)
            if not renewed:
                logger.error(f"锁续租失败，可能已被其他进程获取: {job_id}")
                return
            logger.debug(f"锁续租成功: {job_id}")
    except asyncio.CancelledError:
        logger.debug(f"锁续租协程已取消: {job_id}")
    except Exception as e:
        logger.error(f"锁续租异常: {job_id}, error={e}")
|
||||
|
||||
async def _sweeper_loop(self) -> None:
    """Periodically recover stale jobs and collect queue metrics.

    Runs while ``self._running`` is true, sleeping
    ``settings.job_sweeper_interval`` between passes. Cancellation ends
    the loop; any other exception is logged and the loop continues.
    """
    while self._running:
        try:
            await asyncio.sleep(settings.job_sweeper_interval)
            if self._job_manager:
                # Recover jobs that have been processing for too long.
                recovered = await self._job_manager.recover_stale_jobs()
                if recovered > 0:
                    logger.info(f"回收超时任务: {recovered} 个")
                    # Record how many jobs were recovered.
                    from .core.metrics_unified import incr

                    await incr("job_recovered_total", None, recovered)

                # Collect queue monitoring metrics on every pass.
                await self._job_manager.collect_queue_metrics()
        except asyncio.CancelledError:
            logger.debug("超时任务回收协程已取消")
            break
        except Exception as e:
            logger.error(f"超时任务回收异常: {e}")
|
||||
|
||||
|
||||
def setup_signal_handlers(
    worker: JobWorker,
    health_server: HealthCheckServer,
    loop: asyncio.AbstractEventLoop,
) -> None:
    """Install SIGTERM/SIGINT handlers that shut down the worker and health server.

    Each received signal schedules a task on ``loop`` that awaits
    ``worker.shutdown()`` followed by ``health_server.stop()``.

    Args:
        worker: Worker whose ``shutdown()`` coroutine is awaited first.
        health_server: Health check server whose ``stop()`` coroutine is
            awaited afterwards.
        loop: Event loop on which the handlers are registered and the
            shutdown task is scheduled.
    """
    # The event loop keeps only weak references to tasks, so an
    # unreferenced shutdown task may be garbage-collected before it
    # finishes. Hold a strong reference until the task completes.
    shutdown_tasks = set()

    async def shutdown_all() -> None:
        """Shut down the worker, then stop the health check server."""
        await worker.shutdown()
        await health_server.stop()

    def signal_handler(sig: signal.Signals) -> None:
        logger.info(f"收到信号 {sig.name}，准备关闭...")
        task = loop.create_task(shutdown_all())
        shutdown_tasks.add(task)
        task.add_done_callback(shutdown_tasks.discard)

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler, sig)
|
||||
|
||||
|
||||
async def main() -> None:
    """Worker entry point: start the health server, then initialize and run the worker.

    Any exception raised during startup or while running is logged and
    the process exits with status 1. The worker and the health check
    server are always shut down on the way out.
    """
    # Configure logging.
    setup_logging(level=settings.log_level, format_type=settings.log_format)

    # Create the health check server and the worker.
    health_server = HealthCheckServer(port=8000)
    worker = JobWorker()

    # Install signal handlers on the running loop.
    loop = asyncio.get_running_loop()
    setup_signal_handlers(worker, health_server, loop)

    try:
        # Start the health check server first so FC health checks can pass.
        await health_server.start()

        # Initialize and run the worker.
        await worker.initialize()
        await worker.run()
    except Exception as e:
        logger.error(f"Worker 异常退出: {e}", exc_info=True)
        sys.exit(1)
    finally:
        # Stop the health server even if the worker shutdown itself fails.
        try:
            await worker.shutdown()
        finally:
            await health_server.stop()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,17 +1,13 @@
|
||||
"""异步任务管理器测试"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
from fastapi import status
|
||||
|
||||
from functional_scaffold.core.job_manager import (
|
||||
JobManager,
|
||||
get_job_manager,
|
||||
shutdown_job_manager,
|
||||
)
|
||||
from functional_scaffold.api.models import JobStatus
|
||||
|
||||
|
||||
class TestJobManager:
|
||||
@@ -188,6 +184,7 @@ class TestJobManagerWithMocks:
|
||||
|
||||
# 初始化 semaphore
|
||||
import asyncio
|
||||
|
||||
manager._semaphore = asyncio.Semaphore(10)
|
||||
|
||||
await manager.execute_job("test-job-id")
|
||||
@@ -217,7 +214,7 @@ class TestJobsAPI:
|
||||
"created_at": "2026-02-02T10:00:00+00:00",
|
||||
}
|
||||
)
|
||||
mock_manager.execute_job = AsyncMock()
|
||||
mock_manager.enqueue_job = AsyncMock(return_value=True)
|
||||
mock_get_manager.return_value = mock_manager
|
||||
|
||||
response = client.post(
|
||||
@@ -486,14 +483,688 @@ class TestConcurrencyControl:
|
||||
|
||||
def test_concurrency_status_api(self, client):
    """Check the concurrency status endpoint against a mocked job manager.

    ``get_job_manager`` in the routes module is patched so the endpoint
    sees a manager reporting fixed concurrency numbers. The request is
    issued once, inside the patch, and the response is asserted once.
    """
    with patch(
        "functional_scaffold.api.routes.get_job_manager", new_callable=AsyncMock
    ) as mock_get_manager:
        mock_manager = MagicMock()
        mock_manager.is_available.return_value = True
        mock_manager.get_concurrency_status.return_value = {
            "max_concurrent": 10,
            "available_slots": 8,
            "running_jobs": 2,
        }
        mock_get_manager.return_value = mock_manager

        response = client.get("/jobs/concurrency/status")

        assert response.status_code == status.HTTP_200_OK
        data = response.json()

        assert "max_concurrent" in data
        assert "available_slots" in data
        assert "running_jobs" in data
        assert isinstance(data["max_concurrent"], int)
        assert isinstance(data["available_slots"], int)
        assert isinstance(data["running_jobs"], int)
|
||||
|
||||
|
||||
class TestJobQueue:
    """Tests for enqueueing and dequeueing jobs."""

    @staticmethod
    def _manager_with_redis(redis_stub):
        """Return a JobManager whose Redis client is ``redis_stub``."""
        job_manager = JobManager()
        job_manager._redis_client = redis_stub
        return job_manager

    @pytest.mark.asyncio
    async def test_enqueue_job(self):
        """Enqueueing with a Redis client returns True and pushes once."""
        redis_stub = AsyncMock()
        redis_stub.lpush.return_value = 1
        job_manager = self._manager_with_redis(redis_stub)

        enqueued = await job_manager.enqueue_job("test-job-id")

        assert enqueued is True
        redis_stub.lpush.assert_called_once()

    @pytest.mark.asyncio
    async def test_enqueue_job_without_redis(self):
        """Enqueueing on a manager with no injected Redis client returns False."""
        job_manager = JobManager()

        enqueued = await job_manager.enqueue_job("test-job-id")

        assert enqueued is False

    @pytest.mark.asyncio
    async def test_dequeue_job(self):
        """Dequeueing (via BLMOVE) returns the job id and records one ZADD."""
        redis_stub = AsyncMock()
        redis_stub.blmove.return_value = "test-job-id"
        job_manager = self._manager_with_redis(redis_stub)

        dequeued = await job_manager.dequeue_job(timeout=5)

        assert dequeued == "test-job-id"
        redis_stub.blmove.assert_called_once()
        redis_stub.zadd.assert_called_once()

    @pytest.mark.asyncio
    async def test_dequeue_job_timeout(self):
        """Dequeueing returns None when BLMOVE yields nothing."""
        redis_stub = AsyncMock()
        redis_stub.blmove.return_value = None
        job_manager = self._manager_with_redis(redis_stub)

        dequeued = await job_manager.dequeue_job(timeout=1)

        assert dequeued is None

    @pytest.mark.asyncio
    async def test_dequeue_job_without_redis(self):
        """Dequeueing on a manager with no injected Redis client returns None."""
        job_manager = JobManager()

        dequeued = await job_manager.dequeue_job(timeout=1)

        assert dequeued is None
|
||||
|
||||
|
||||
class TestDistributedLock:
    """Tests for acquiring and releasing the per-job distributed lock."""

    @staticmethod
    def _manager_with_redis(redis_stub):
        """Return a JobManager whose Redis client is ``redis_stub``."""
        job_manager = JobManager()
        job_manager._redis_client = redis_stub
        return job_manager

    @pytest.mark.asyncio
    async def test_acquire_job_lock(self):
        """Acquiring a free lock returns a 32-character token and uses SET NX EX."""
        redis_stub = AsyncMock()
        redis_stub.set.return_value = True
        job_manager = self._manager_with_redis(redis_stub)

        token = await job_manager.acquire_job_lock("test-job-id")

        # A token is returned; 32 characters is 16 bytes in hex.
        assert token is not None
        assert len(token) == 32
        redis_stub.set.assert_called_once()
        set_call = redis_stub.set.call_args
        assert set_call.args[0] == "job:lock:test-job-id"
        assert set_call.kwargs["nx"] is True
        assert "ex" in set_call.kwargs

    @pytest.mark.asyncio
    async def test_acquire_job_lock_already_locked(self):
        """Acquiring returns None when SET reports the key already exists."""
        redis_stub = AsyncMock()
        redis_stub.set.return_value = None  # lock already held
        job_manager = self._manager_with_redis(redis_stub)

        token = await job_manager.acquire_job_lock("test-job-id")

        assert token is None

    @pytest.mark.asyncio
    async def test_release_job_lock(self):
        """Releasing with a token returns True and issues one EVAL."""
        redis_stub = AsyncMock()
        redis_stub.eval.return_value = 1
        job_manager = self._manager_with_redis(redis_stub)

        released = await job_manager.release_job_lock("test-job-id", "valid-token")

        assert released is True
        redis_stub.eval.assert_called_once()

    @pytest.mark.asyncio
    async def test_release_job_lock_without_redis(self):
        """Releasing on a manager with no injected Redis client returns False."""
        job_manager = JobManager()

        released = await job_manager.release_job_lock("test-job-id", "token")

        assert released is False
|
||||
|
||||
|
||||
class TestGlobalConcurrency:
    """Tests for the global concurrency counter."""

    @staticmethod
    def _manager_with_redis(redis_stub):
        """Return a JobManager whose Redis client is ``redis_stub``."""
        job_manager = JobManager()
        job_manager._redis_client = redis_stub
        return job_manager

    async def _can_execute(self, stored_count, limit):
        """Call ``can_execute`` with a stored counter value and a patched limit."""
        redis_stub = AsyncMock()
        redis_stub.get.return_value = stored_count
        job_manager = self._manager_with_redis(redis_stub)

        with patch("functional_scaffold.core.job_manager.settings") as patched_settings:
            patched_settings.max_concurrent_jobs = limit
            return await job_manager.can_execute()

    @pytest.mark.asyncio
    async def test_increment_concurrency(self):
        """Incrementing returns the value reported by INCR."""
        redis_stub = AsyncMock()
        redis_stub.incr.return_value = 5
        job_manager = self._manager_with_redis(redis_stub)

        count = await job_manager.increment_concurrency()

        assert count == 5
        redis_stub.incr.assert_called_once()

    @pytest.mark.asyncio
    async def test_decrement_concurrency(self):
        """Decrementing returns the value reported by DECR."""
        redis_stub = AsyncMock()
        redis_stub.decr.return_value = 4
        job_manager = self._manager_with_redis(redis_stub)

        count = await job_manager.decrement_concurrency()

        assert count == 4
        redis_stub.decr.assert_called_once()

    @pytest.mark.asyncio
    async def test_decrement_concurrency_prevent_negative(self):
        """A negative DECR result is reported as 0 and triggers one SET."""
        redis_stub = AsyncMock()
        redis_stub.decr.return_value = -1
        job_manager = self._manager_with_redis(redis_stub)

        count = await job_manager.decrement_concurrency()

        assert count == 0
        redis_stub.set.assert_called_once()

    @pytest.mark.asyncio
    async def test_get_global_concurrency(self):
        """The stored string counter is returned as an integer."""
        redis_stub = AsyncMock()
        redis_stub.get.return_value = "7"
        job_manager = self._manager_with_redis(redis_stub)

        count = await job_manager.get_global_concurrency()

        assert count == 7

    @pytest.mark.asyncio
    async def test_get_global_concurrency_empty(self):
        """A missing counter is reported as 0."""
        redis_stub = AsyncMock()
        redis_stub.get.return_value = None
        job_manager = self._manager_with_redis(redis_stub)

        count = await job_manager.get_global_concurrency()

        assert count == 0

    @pytest.mark.asyncio
    async def test_can_execute(self):
        """Execution is allowed while the counter is below the limit."""
        allowed = await self._can_execute("5", 10)

        assert allowed is True

    @pytest.mark.asyncio
    async def test_can_execute_at_limit(self):
        """Execution is refused once the counter reaches the limit."""
        allowed = await self._can_execute("10", 10)

        assert allowed is False
|
||||
|
||||
|
||||
class TestJobRetry:
    """Tests for reading and incrementing a job's retry counter."""

    @staticmethod
    def _manager_with_hget(stored_value):
        """Return (manager, redis stub) where HGET yields ``stored_value``."""
        redis_stub = AsyncMock()
        redis_stub.hget.return_value = stored_value
        job_manager = JobManager()
        job_manager._redis_client = redis_stub
        return job_manager, redis_stub

    @pytest.mark.asyncio
    async def test_get_job_retry_count(self):
        """The stored retry count is read from the job hash and returned as an int."""
        job_manager, redis_stub = self._manager_with_hget("2")

        retries = await job_manager.get_job_retry_count("test-job-id")

        assert retries == 2
        redis_stub.hget.assert_called_once_with("job:test-job-id", "retry_count")

    @pytest.mark.asyncio
    async def test_get_job_retry_count_empty(self):
        """A missing retry count is reported as 0."""
        job_manager, _ = self._manager_with_hget(None)

        retries = await job_manager.get_job_retry_count("test-job-id")

        assert retries == 0

    @pytest.mark.asyncio
    async def test_increment_job_retry(self):
        """Incrementing issues HINCRBY by 1 and returns the stored count."""
        job_manager, redis_stub = self._manager_with_hget("3")

        retries = await job_manager.increment_job_retry("test-job-id")

        assert retries == 3
        redis_stub.hincrby.assert_called_once_with("job:test-job-id", "retry_count", 1)
|
||||
|
||||
|
||||
class TestTransferDequeue:
    """Tests for the transfer-style (BLMOVE) dequeue."""

    @staticmethod
    def _manager_with_blmove(moved_value):
        """Return (manager, redis stub) where BLMOVE yields ``moved_value``."""
        redis_stub = AsyncMock()
        redis_stub.blmove.return_value = moved_value
        job_manager = JobManager()
        job_manager._redis_client = redis_stub
        return job_manager, redis_stub

    @pytest.mark.asyncio
    async def test_dequeue_job_with_blmove(self):
        """A moved job id is returned and both BLMOVE and ZADD are called once."""
        job_manager, redis_stub = self._manager_with_blmove("test-job-id")

        dequeued = await job_manager.dequeue_job(timeout=5)

        assert dequeued == "test-job-id"
        redis_stub.blmove.assert_called_once()
        redis_stub.zadd.assert_called_once()

    @pytest.mark.asyncio
    async def test_dequeue_job_timeout(self):
        """When BLMOVE yields nothing, None is returned and ZADD is not called."""
        job_manager, redis_stub = self._manager_with_blmove(None)

        dequeued = await job_manager.dequeue_job(timeout=1)

        assert dequeued is None
        redis_stub.zadd.assert_not_called()
|
||||
|
||||
|
||||
class TestTokenBasedLock:
    """Tests for the token-protected job lock."""

    @staticmethod
    def _manager_with_redis(redis_stub):
        """Return a JobManager whose Redis client is ``redis_stub``."""
        job_manager = JobManager()
        job_manager._redis_client = redis_stub
        return job_manager

    @pytest.mark.asyncio
    async def test_acquire_job_lock_returns_token(self):
        """Acquiring a free lock returns a 32-character token and uses SET NX."""
        redis_stub = AsyncMock()
        redis_stub.set.return_value = True
        job_manager = self._manager_with_redis(redis_stub)

        token = await job_manager.acquire_job_lock("test-job-id")

        assert token is not None
        assert len(token) == 32  # 16 bytes rendered as hex
        redis_stub.set.assert_called_once()
        set_call = redis_stub.set.call_args
        assert set_call.args[0] == "job:lock:test-job-id"
        assert set_call.kwargs["nx"] is True

    @pytest.mark.asyncio
    async def test_acquire_job_lock_already_locked(self):
        """Acquiring returns None when SET yields None."""
        redis_stub = AsyncMock()
        redis_stub.set.return_value = None
        job_manager = self._manager_with_redis(redis_stub)

        token = await job_manager.acquire_job_lock("test-job-id")

        assert token is None

    @pytest.mark.asyncio
    async def test_release_job_lock_with_token(self):
        """Releasing with a token returns True when EVAL yields 1."""
        redis_stub = AsyncMock()
        redis_stub.eval.return_value = 1
        job_manager = self._manager_with_redis(redis_stub)

        released = await job_manager.release_job_lock("test-job-id", "valid-token")

        assert released is True
        redis_stub.eval.assert_called_once()

    @pytest.mark.asyncio
    async def test_release_job_lock_invalid_token(self):
        """Releasing returns False when EVAL yields 0."""
        redis_stub = AsyncMock()
        redis_stub.eval.return_value = 0
        job_manager = self._manager_with_redis(redis_stub)

        released = await job_manager.release_job_lock("test-job-id", "invalid-token")

        assert released is False

    @pytest.mark.asyncio
    async def test_release_job_lock_without_token(self):
        """Releasing without a token (backward compatible) deletes the lock key."""
        redis_stub = AsyncMock()
        job_manager = self._manager_with_redis(redis_stub)

        released = await job_manager.release_job_lock("test-job-id")

        assert released is True
        redis_stub.delete.assert_called_once_with("job:lock:test-job-id")
|
||||
|
||||
|
||||
class TestAckNack:
    """Tests for the ACK/NACK mechanism."""

    @staticmethod
    def _make_pipeline(*queued_commands):
        """Build a pipeline stub whose queued commands are synchronous mocks.

        ``execute`` and the async context manager hooks are async mocks;
        entering the context yields the pipeline stub itself.
        """
        pipe = AsyncMock()
        for command in queued_commands:
            setattr(pipe, command, MagicMock())
        pipe.execute = AsyncMock()
        pipe.__aenter__ = AsyncMock(return_value=pipe)
        pipe.__aexit__ = AsyncMock()
        return pipe

    @staticmethod
    def _manager_with_pipeline(pipe, retry_count=None):
        """Return a JobManager whose Redis stub hands out ``pipe``.

        When ``retry_count`` is given, HGET on the stub yields it.
        """
        job_manager = JobManager()
        redis_stub = AsyncMock()
        redis_stub.pipeline = MagicMock(return_value=pipe)
        if retry_count is not None:
            redis_stub.hget = AsyncMock(return_value=retry_count)
        job_manager._redis_client = redis_stub
        return job_manager

    @pytest.mark.asyncio
    async def test_ack_job(self):
        """Acknowledging a job returns True and queues one LREM and one ZREM."""
        pipe = self._make_pipeline("lrem", "zrem")
        job_manager = self._manager_with_pipeline(pipe)

        acked = await job_manager.ack_job("test-job-id")

        assert acked is True
        pipe.lrem.assert_called_once()
        pipe.zrem.assert_called_once()

    @pytest.mark.asyncio
    async def test_nack_job_requeue(self):
        """Rejecting with requeue and a retry count of 0 queues exactly one LPUSH."""
        pipe = self._make_pipeline("lrem", "zrem", "lpush")
        job_manager = self._manager_with_pipeline(pipe, retry_count="0")

        nacked = await job_manager.nack_job("test-job-id", requeue=True)

        assert nacked is True
        assert pipe.lpush.call_count == 1

    @pytest.mark.asyncio
    async def test_nack_job_to_dlq(self):
        """Rejecting a job whose retry count exceeds the maximum still returns True."""
        pipe = self._make_pipeline("lrem", "zrem", "lpush")
        # The stored retry count (5) is above the patched maximum (3).
        job_manager = self._manager_with_pipeline(pipe, retry_count="5")

        with patch("functional_scaffold.core.job_manager.settings") as patched_settings:
            patched_settings.configure_mock(
                job_max_retries=3,
                job_processing_key="job:processing",
                job_processing_ts_key="job:processing:ts",
                job_dlq_key="job:dlq",
                job_queue_key="job:queue",
            )

            nacked = await job_manager.nack_job("test-job-id", requeue=True)

        assert nacked is True
|
||||
|
||||
|
||||
class TestLockRenewal:
    """Tests for renewing the job lock."""

    @staticmethod
    def _manager_with_eval(eval_result):
        """Return (manager, redis stub) where EVAL yields ``eval_result``."""
        redis_stub = AsyncMock()
        redis_stub.eval.return_value = eval_result
        job_manager = JobManager()
        job_manager._redis_client = redis_stub
        return job_manager, redis_stub

    @pytest.mark.asyncio
    async def test_renew_job_lock_success(self):
        """Renewal returns True when EVAL yields 1."""
        job_manager, redis_stub = self._manager_with_eval(1)

        renewed = await job_manager.renew_job_lock("test-job-id", "valid-token")

        assert renewed is True
        redis_stub.eval.assert_called_once()

    @pytest.mark.asyncio
    async def test_renew_job_lock_invalid_token(self):
        """Renewal returns False when EVAL yields 0 (token mismatch)."""
        job_manager, _ = self._manager_with_eval(0)

        renewed = await job_manager.renew_job_lock("test-job-id", "invalid-token")

        assert renewed is False

    @pytest.mark.asyncio
    async def test_renew_job_lock_without_redis(self):
        """Renewal on a manager with no injected Redis client returns False."""
        job_manager = JobManager()

        renewed = await job_manager.renew_job_lock("test-job-id", "token")

        assert renewed is False
|
||||
|
||||
|
||||
class TestStaleJobRecovery:
    """Tests for recovering stale (timed-out) jobs."""

    # Settings values applied to the patched settings object in the
    # recovery tests.
    _RECOVERY_SETTINGS = {
        "job_execution_timeout": 300,
        "job_lock_buffer": 60,
        "job_max_retries": 3,
        "job_processing_key": "job:processing",
        "job_processing_ts_key": "job:processing:ts",
        "job_dlq_key": "job:dlq",
        "job_queue_key": "job:queue",
    }

    async def _recover(self, stale_ids, retry_count):
        """Run ``recover_stale_jobs`` against stubs and return its result.

        ``stale_ids`` is what ZRANGEBYSCORE yields and ``retry_count`` is
        what HGET yields; settings are patched with ``_RECOVERY_SETTINGS``.
        """
        job_manager = JobManager()

        pipe = AsyncMock()
        pipe.lrem = MagicMock()
        pipe.zrem = MagicMock()
        pipe.lpush = MagicMock()
        pipe.execute = AsyncMock()
        pipe.__aenter__ = AsyncMock(return_value=pipe)
        pipe.__aexit__ = AsyncMock()

        redis_stub = AsyncMock()
        redis_stub.zrangebyscore = AsyncMock(return_value=stale_ids)
        redis_stub.hincrby = AsyncMock()
        redis_stub.hget = AsyncMock(return_value=retry_count)
        redis_stub.pipeline = MagicMock(return_value=pipe)
        job_manager._redis_client = redis_stub

        with patch("functional_scaffold.core.job_manager.settings") as patched_settings:
            patched_settings.configure_mock(**self._RECOVERY_SETTINGS)
            return await job_manager.recover_stale_jobs()

    @pytest.mark.asyncio
    async def test_recover_stale_jobs_empty(self):
        """Nothing is recovered when no stale jobs are found."""
        redis_stub = AsyncMock()
        redis_stub.zrangebyscore.return_value = []
        job_manager = JobManager()
        job_manager._redis_client = redis_stub

        recovered = await job_manager.recover_stale_jobs()

        assert recovered == 0

    @pytest.mark.asyncio
    async def test_recover_stale_jobs_requeue(self):
        """Two stale jobs with a retry count of 1 are both counted as recovered."""
        recovered = await self._recover(["stale-job-1", "stale-job-2"], "1")

        assert recovered == 2

    @pytest.mark.asyncio
    async def test_recover_stale_jobs_to_dlq(self):
        """A stale job whose retry count exceeds the maximum is still counted."""
        # The stored retry count (5) is above the patched maximum (3).
        recovered = await self._recover(["stale-job-1"], "5")

        assert recovered == 1

    @pytest.mark.asyncio
    async def test_recover_stale_jobs_without_redis(self):
        """Recovery on a manager with no injected Redis client returns 0."""
        job_manager = JobManager()

        recovered = await job_manager.recover_stale_jobs()

        assert recovered == 0
|
||||
|
||||
|
||||
class TestQueueMetrics:
    """Tests for collecting queue monitoring metrics."""

    @staticmethod
    def _manager_with_pipeline_results(results):
        """Return a JobManager whose pipeline ``execute`` yields ``results``."""
        pipe = AsyncMock()
        pipe.llen = MagicMock()
        pipe.zrange = MagicMock()
        pipe.execute = AsyncMock(return_value=results)
        pipe.__aenter__ = AsyncMock(return_value=pipe)
        pipe.__aexit__ = AsyncMock()

        redis_stub = AsyncMock()
        redis_stub.pipeline = MagicMock(return_value=pipe)

        job_manager = JobManager()
        job_manager._redis_client = redis_stub
        return job_manager

    @staticmethod
    def _assert_all_zero(metrics):
        """Assert that every reported queue metric is zero."""
        assert metrics["queue_length"] == 0
        assert metrics["processing_length"] == 0
        assert metrics["dlq_length"] == 0
        assert metrics["oldest_waiting_seconds"] == 0

    @pytest.mark.asyncio
    async def test_collect_queue_metrics(self):
        """Lengths come from the pipeline; the oldest wait is now minus its score."""
        job_manager = self._manager_with_pipeline_results(
            [5, 2, 1, [("job-1", 1000.0)]]
        )

        with patch("functional_scaffold.core.job_manager.time") as mock_time, patch(
            "functional_scaffold.core.job_manager.set"
        ):
            mock_time.time.return_value = 1060.0  # 60 seconds after the job's score
            metrics = await job_manager.collect_queue_metrics()

        assert metrics["queue_length"] == 5
        assert metrics["processing_length"] == 2
        assert metrics["dlq_length"] == 1
        assert metrics["oldest_waiting_seconds"] == 60.0

    @pytest.mark.asyncio
    async def test_collect_queue_metrics_empty(self):
        """Empty queues report zero for every metric."""
        job_manager = self._manager_with_pipeline_results([0, 0, 0, []])

        with patch("functional_scaffold.core.job_manager.set"):
            metrics = await job_manager.collect_queue_metrics()

        self._assert_all_zero(metrics)

    @pytest.mark.asyncio
    async def test_collect_queue_metrics_without_redis(self):
        """A manager with no injected Redis client reports zero for every metric."""
        job_manager = JobManager()

        metrics = await job_manager.collect_queue_metrics()

        self._assert_all_zero(metrics)
|
||||
|
||||
@@ -1,158 +1,239 @@
|
||||
"""metrics_unified 模块单元测试"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def reset_manager():
    """Reset the metrics manager before and after every test."""
    from functional_scaffold.core.metrics_unified import (
        reset_metrics_manager_sync as reset_sync,
    )

    reset_sync()  # clean state before the test
    yield
    reset_sync()  # clean state after the test
|
||||
|
||||
|
||||
class TestMetricsManager:
|
||||
"""MetricsManager 类测试"""
|
||||
|
||||
@pytest.fixture
|
||||
def mock_redis(self):
|
||||
"""模拟 Redis 客户端"""
|
||||
with patch("redis.Redis") as mock:
|
||||
mock_instance = MagicMock()
|
||||
mock_instance.ping.return_value = True
|
||||
mock_instance.hincrbyfloat.return_value = 1.0
|
||||
mock_instance.hset.return_value = True
|
||||
mock_instance.hgetall.return_value = {}
|
||||
mock_instance.hget.return_value = "0"
|
||||
mock_instance.keys.return_value = []
|
||||
mock_instance.pipeline.return_value = MagicMock()
|
||||
mock.return_value = mock_instance
|
||||
yield mock_instance
|
||||
|
||||
@pytest.fixture
|
||||
def manager(self, mock_redis):
|
||||
"""创建测试用的 MetricsManager"""
|
||||
from functional_scaffold.core.metrics_unified import (
|
||||
MetricsManager,
|
||||
reset_metrics_manager,
|
||||
)
|
||||
|
||||
reset_metrics_manager()
|
||||
manager = MetricsManager()
|
||||
return manager
|
||||
|
||||
def test_init_loads_default_config(self, manager):
|
||||
def test_init_loads_default_config(self):
|
||||
"""测试初始化加载默认配置"""
|
||||
from functional_scaffold.core.metrics_unified import MetricsManager
|
||||
|
||||
manager = MetricsManager()
|
||||
assert manager.config is not None
|
||||
assert "builtin_metrics" in manager.config or len(manager.metrics_definitions) > 0
|
||||
|
||||
def test_metrics_definitions_registered(self, manager):
|
||||
def test_metrics_definitions_registered(self):
|
||||
"""测试指标定义已注册"""
|
||||
from functional_scaffold.core.metrics_unified import MetricsManager
|
||||
|
||||
manager = MetricsManager()
|
||||
assert "http_requests_total" in manager.metrics_definitions
|
||||
assert "http_request_duration_seconds" in manager.metrics_definitions
|
||||
assert "algorithm_executions_total" in manager.metrics_definitions
|
||||
|
||||
def test_incr_counter(self, manager, mock_redis):
|
||||
@pytest.mark.asyncio
|
||||
@patch("redis.asyncio.Redis")
|
||||
async def test_incr_counter(self, mock_redis_class):
|
||||
"""测试计数器增加"""
|
||||
manager.incr("http_requests_total", {"method": "GET", "endpoint": "/", "status": "success"})
|
||||
mock_redis.hincrbyfloat.assert_called()
|
||||
mock_instance = AsyncMock()
|
||||
mock_instance.ping = AsyncMock(return_value=True)
|
||||
mock_instance.hincrbyfloat = AsyncMock(return_value=1.0)
|
||||
mock_instance.close = AsyncMock()
|
||||
mock_redis_class.return_value = mock_instance
|
||||
|
||||
def test_incr_with_invalid_metric_type(self, manager, mock_redis):
|
||||
from functional_scaffold.core.metrics_unified import MetricsManager
|
||||
|
||||
manager = MetricsManager()
|
||||
await manager.initialize()
|
||||
|
||||
await manager.incr(
|
||||
"http_requests_total", {"method": "GET", "endpoint": "/", "status": "success"}
|
||||
)
|
||||
mock_instance.hincrbyfloat.assert_called()
|
||||
|
||||
def test_incr_with_invalid_metric_type(self):
|
||||
"""测试对非计数器类型调用 incr"""
|
||||
from functional_scaffold.core.metrics_unified import MetricsManager
|
||||
|
||||
manager = MetricsManager()
|
||||
# http_request_duration_seconds 是 histogram 类型
|
||||
manager.incr("http_request_duration_seconds", {})
|
||||
# 不应该调用 Redis(因为类型不匹配)
|
||||
# 验证没有调用 hincrbyfloat(或者调用次数没有增加)
|
||||
# 验证不会抛出异常(因为 Redis 不可用)
|
||||
|
||||
def test_set_gauge(self, manager, mock_redis):
|
||||
@pytest.mark.asyncio
|
||||
@patch("redis.asyncio.Redis")
|
||||
async def test_set_gauge(self, mock_redis_class):
|
||||
"""测试设置仪表盘"""
|
||||
manager.set("http_requests_in_progress", {}, 5)
|
||||
mock_redis.hset.assert_called()
|
||||
mock_instance = AsyncMock()
|
||||
mock_instance.ping = AsyncMock(return_value=True)
|
||||
mock_instance.hset = AsyncMock(return_value=True)
|
||||
mock_instance.close = AsyncMock()
|
||||
mock_redis_class.return_value = mock_instance
|
||||
|
||||
def test_gauge_incr(self, manager, mock_redis):
|
||||
from functional_scaffold.core.metrics_unified import MetricsManager
|
||||
|
||||
manager = MetricsManager()
|
||||
await manager.initialize()
|
||||
|
||||
await manager.set("http_requests_in_progress", {}, 5)
|
||||
mock_instance.hset.assert_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch("redis.asyncio.Redis")
|
||||
async def test_gauge_incr(self, mock_redis_class):
|
||||
"""测试增加仪表盘"""
|
||||
manager.gauge_incr("http_requests_in_progress", {}, 1)
|
||||
mock_redis.hincrbyfloat.assert_called()
|
||||
mock_instance = AsyncMock()
|
||||
mock_instance.ping = AsyncMock(return_value=True)
|
||||
mock_instance.hincrbyfloat = AsyncMock(return_value=1.0)
|
||||
mock_instance.close = AsyncMock()
|
||||
mock_redis_class.return_value = mock_instance
|
||||
|
||||
def test_gauge_decr(self, manager, mock_redis):
|
||||
from functional_scaffold.core.metrics_unified import MetricsManager
|
||||
|
||||
manager = MetricsManager()
|
||||
await manager.initialize()
|
||||
|
||||
await manager.gauge_incr("http_requests_in_progress", {}, 1)
|
||||
mock_instance.hincrbyfloat.assert_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch("redis.asyncio.Redis")
|
||||
async def test_gauge_decr(self, mock_redis_class):
|
||||
"""测试减少仪表盘"""
|
||||
manager.gauge_decr("http_requests_in_progress", {}, 1)
|
||||
mock_redis.hincrbyfloat.assert_called()
|
||||
mock_instance = AsyncMock()
|
||||
mock_instance.ping = AsyncMock(return_value=True)
|
||||
mock_instance.hincrbyfloat = AsyncMock(return_value=1.0)
|
||||
mock_instance.close = AsyncMock()
|
||||
mock_redis_class.return_value = mock_instance
|
||||
|
||||
def test_observe_histogram(self, manager, mock_redis):
|
||||
from functional_scaffold.core.metrics_unified import MetricsManager
|
||||
|
||||
manager = MetricsManager()
|
||||
await manager.initialize()
|
||||
|
||||
await manager.gauge_decr("http_requests_in_progress", {}, 1)
|
||||
mock_instance.hincrbyfloat.assert_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch("redis.asyncio.Redis")
|
||||
async def test_observe_histogram(self, mock_redis_class):
|
||||
"""测试直方图观测"""
|
||||
mock_pipeline = MagicMock()
|
||||
mock_redis.pipeline.return_value = mock_pipeline
|
||||
mock_instance = AsyncMock()
|
||||
mock_instance.ping = AsyncMock(return_value=True)
|
||||
mock_instance.close = AsyncMock()
|
||||
|
||||
manager.observe("http_request_duration_seconds", {"method": "GET", "endpoint": "/"}, 0.05)
|
||||
mock_pipeline = AsyncMock()
|
||||
mock_pipeline.hincrbyfloat = MagicMock()
|
||||
mock_pipeline.execute = AsyncMock(return_value=[])
|
||||
mock_instance.pipeline = MagicMock(return_value=mock_pipeline)
|
||||
|
||||
mock_redis.pipeline.assert_called()
|
||||
mock_pipeline.execute.assert_called()
|
||||
mock_redis_class.return_value = mock_instance
|
||||
|
||||
def test_labels_to_key(self, manager):
|
||||
from functional_scaffold.core.metrics_unified import MetricsManager
|
||||
|
||||
manager = MetricsManager()
|
||||
await manager.initialize()
|
||||
|
||||
await manager.observe(
|
||||
"http_request_duration_seconds", {"method": "GET", "endpoint": "/"}, 0.05
|
||||
)
|
||||
mock_instance.pipeline.assert_called()
|
||||
|
||||
def test_labels_to_key(self):
|
||||
"""测试标签转换为 key"""
|
||||
from functional_scaffold.core.metrics_unified import MetricsManager
|
||||
|
||||
manager = MetricsManager()
|
||||
labels = {"method": "GET", "endpoint": "/api"}
|
||||
key = manager._labels_to_key(labels)
|
||||
assert "method=GET" in key
|
||||
assert "endpoint=/api" in key
|
||||
|
||||
def test_labels_to_key_empty(self, manager):
|
||||
def test_labels_to_key_empty(self):
|
||||
"""测试空标签转换"""
|
||||
from functional_scaffold.core.metrics_unified import MetricsManager
|
||||
|
||||
manager = MetricsManager()
|
||||
key = manager._labels_to_key(None)
|
||||
assert key == ""
|
||||
|
||||
key = manager._labels_to_key({})
|
||||
assert key == ""
|
||||
|
||||
def test_is_available(self, manager):
|
||||
@pytest.mark.asyncio
|
||||
@patch("redis.asyncio.Redis")
|
||||
async def test_is_available(self, mock_redis_class):
|
||||
"""测试 Redis 可用性检查"""
|
||||
mock_instance = AsyncMock()
|
||||
mock_instance.ping = AsyncMock(return_value=True)
|
||||
mock_instance.close = AsyncMock()
|
||||
mock_redis_class.return_value = mock_instance
|
||||
|
||||
from functional_scaffold.core.metrics_unified import MetricsManager
|
||||
|
||||
manager = MetricsManager()
|
||||
await manager.initialize()
|
||||
|
||||
assert manager.is_available() is True
|
||||
|
||||
|
||||
class TestConvenienceFunctions:
|
||||
"""便捷函数测试"""
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def setup(self):
|
||||
"""每个测试前重置管理器"""
|
||||
from functional_scaffold.core.metrics_unified import reset_metrics_manager
|
||||
|
||||
reset_metrics_manager()
|
||||
|
||||
@patch("redis.Redis")
|
||||
def test_incr_function(self, mock_redis_class):
|
||||
@pytest.mark.asyncio
|
||||
@patch("redis.asyncio.Redis")
|
||||
async def test_incr_function(self, mock_redis_class):
|
||||
"""测试 incr 便捷函数"""
|
||||
mock_instance = MagicMock()
|
||||
mock_instance.ping.return_value = True
|
||||
mock_instance = AsyncMock()
|
||||
mock_instance.ping = AsyncMock(return_value=True)
|
||||
mock_instance.hincrbyfloat = AsyncMock(return_value=1.0)
|
||||
mock_instance.close = AsyncMock()
|
||||
mock_redis_class.return_value = mock_instance
|
||||
|
||||
from functional_scaffold.core.metrics_unified import incr, reset_metrics_manager
|
||||
from functional_scaffold.core.metrics_unified import incr
|
||||
|
||||
reset_metrics_manager()
|
||||
incr("http_requests_total", {"method": "GET", "endpoint": "/", "status": "success"})
|
||||
await incr(
|
||||
"http_requests_total", {"method": "GET", "endpoint": "/", "status": "success"}
|
||||
)
|
||||
|
||||
mock_instance.hincrbyfloat.assert_called()
|
||||
|
||||
@patch("redis.Redis")
|
||||
def test_set_function(self, mock_redis_class):
|
||||
@pytest.mark.asyncio
|
||||
@patch("redis.asyncio.Redis")
|
||||
async def test_set_function(self, mock_redis_class):
|
||||
"""测试 set 便捷函数"""
|
||||
mock_instance = MagicMock()
|
||||
mock_instance.ping.return_value = True
|
||||
mock_instance = AsyncMock()
|
||||
mock_instance.ping = AsyncMock(return_value=True)
|
||||
mock_instance.hset = AsyncMock(return_value=True)
|
||||
mock_instance.close = AsyncMock()
|
||||
mock_redis_class.return_value = mock_instance
|
||||
|
||||
from functional_scaffold.core.metrics_unified import reset_metrics_manager, set
|
||||
from functional_scaffold.core.metrics_unified import set
|
||||
|
||||
reset_metrics_manager()
|
||||
set("http_requests_in_progress", {}, 10)
|
||||
await set("http_requests_in_progress", {}, 10)
|
||||
|
||||
mock_instance.hset.assert_called()
|
||||
|
||||
@patch("redis.Redis")
|
||||
def test_observe_function(self, mock_redis_class):
|
||||
@pytest.mark.asyncio
|
||||
@patch("redis.asyncio.Redis")
|
||||
async def test_observe_function(self, mock_redis_class):
|
||||
"""测试 observe 便捷函数"""
|
||||
mock_instance = MagicMock()
|
||||
mock_instance.ping.return_value = True
|
||||
mock_pipeline = MagicMock()
|
||||
mock_instance.pipeline.return_value = mock_pipeline
|
||||
mock_instance = AsyncMock()
|
||||
mock_instance.ping = AsyncMock(return_value=True)
|
||||
mock_instance.close = AsyncMock()
|
||||
|
||||
mock_pipeline = AsyncMock()
|
||||
mock_pipeline.hincrbyfloat = MagicMock()
|
||||
mock_pipeline.execute = AsyncMock(return_value=[])
|
||||
mock_instance.pipeline = MagicMock(return_value=mock_pipeline)
|
||||
|
||||
mock_redis_class.return_value = mock_instance
|
||||
|
||||
from functional_scaffold.core.metrics_unified import observe, reset_metrics_manager
|
||||
from functional_scaffold.core.metrics_unified import observe
|
||||
|
||||
reset_metrics_manager()
|
||||
observe("http_request_duration_seconds", {"method": "GET", "endpoint": "/"}, 0.1)
|
||||
await observe("http_request_duration_seconds", {"method": "GET", "endpoint": "/"}, 0.1)
|
||||
|
||||
mock_instance.pipeline.assert_called()
|
||||
|
||||
@@ -160,42 +241,49 @@ class TestConvenienceFunctions:
|
||||
class TestExport:
|
||||
"""导出功能测试"""
|
||||
|
||||
@patch("redis.Redis")
|
||||
def test_export_counter(self, mock_redis_class):
|
||||
@pytest.mark.asyncio
|
||||
@patch("redis.asyncio.Redis")
|
||||
async def test_export_counter(self, mock_redis_class):
|
||||
"""测试导出计数器"""
|
||||
mock_instance = MagicMock()
|
||||
mock_instance.ping.return_value = True
|
||||
mock_instance.hgetall.return_value = {"method=GET,endpoint=/,status=success": "10"}
|
||||
mock_instance = AsyncMock()
|
||||
mock_instance.ping = AsyncMock(return_value=True)
|
||||
mock_instance.hgetall = AsyncMock(
|
||||
return_value={"method=GET,endpoint=/,status=success": "10"}
|
||||
)
|
||||
mock_instance.hget = AsyncMock(return_value="0")
|
||||
mock_instance.close = AsyncMock()
|
||||
mock_redis_class.return_value = mock_instance
|
||||
|
||||
from functional_scaffold.core.metrics_unified import export, reset_metrics_manager
|
||||
from functional_scaffold.core.metrics_unified import export
|
||||
|
||||
reset_metrics_manager()
|
||||
output = export()
|
||||
output = await export()
|
||||
|
||||
assert "http_requests_total" in output
|
||||
assert "HELP" in output
|
||||
assert "TYPE" in output
|
||||
|
||||
@patch("redis.Redis")
|
||||
def test_export_histogram(self, mock_redis_class):
|
||||
@pytest.mark.asyncio
|
||||
@patch("redis.asyncio.Redis")
|
||||
async def test_export_histogram(self, mock_redis_class):
|
||||
"""测试导出直方图"""
|
||||
mock_instance = MagicMock()
|
||||
mock_instance.ping.return_value = True
|
||||
mock_instance.hgetall.side_effect = lambda key: (
|
||||
{"method=GET,endpoint=/": "5"}
|
||||
if "count" in key
|
||||
else {"method=GET,endpoint=/": "0.5"}
|
||||
if "sum" in key
|
||||
else {}
|
||||
)
|
||||
mock_instance.hget.return_value = "3"
|
||||
mock_instance = AsyncMock()
|
||||
mock_instance.ping = AsyncMock(return_value=True)
|
||||
|
||||
async def mock_hgetall(key):
|
||||
if "count" in key:
|
||||
return {"method=GET,endpoint=/": "5"}
|
||||
elif "sum" in key:
|
||||
return {"method=GET,endpoint=/": "0.5"}
|
||||
return {}
|
||||
|
||||
mock_instance.hgetall = mock_hgetall
|
||||
mock_instance.hget = AsyncMock(return_value="3")
|
||||
mock_instance.close = AsyncMock()
|
||||
mock_redis_class.return_value = mock_instance
|
||||
|
||||
from functional_scaffold.core.metrics_unified import export, reset_metrics_manager
|
||||
from functional_scaffold.core.metrics_unified import export
|
||||
|
||||
reset_metrics_manager()
|
||||
output = export()
|
||||
output = await export()
|
||||
|
||||
assert "http_request_duration_seconds" in output
|
||||
|
||||
@@ -226,21 +314,9 @@ class TestEnvVarSubstitution:
|
||||
class TestTrackAlgorithmExecution:
|
||||
"""track_algorithm_execution 装饰器测试"""
|
||||
|
||||
@patch("redis.Redis")
|
||||
def test_decorator_success(self, mock_redis_class):
|
||||
def test_decorator_success(self):
|
||||
"""测试装饰器成功执行"""
|
||||
mock_instance = MagicMock()
|
||||
mock_instance.ping.return_value = True
|
||||
mock_pipeline = MagicMock()
|
||||
mock_instance.pipeline.return_value = mock_pipeline
|
||||
mock_redis_class.return_value = mock_instance
|
||||
|
||||
from functional_scaffold.core.metrics_unified import (
|
||||
reset_metrics_manager,
|
||||
track_algorithm_execution,
|
||||
)
|
||||
|
||||
reset_metrics_manager()
|
||||
from functional_scaffold.core.metrics_unified import track_algorithm_execution
|
||||
|
||||
@track_algorithm_execution("test_algo")
|
||||
def test_func():
|
||||
@@ -249,21 +325,9 @@ class TestTrackAlgorithmExecution:
|
||||
result = test_func()
|
||||
assert result == "result"
|
||||
|
||||
@patch("redis.Redis")
|
||||
def test_decorator_error(self, mock_redis_class):
|
||||
def test_decorator_error(self):
|
||||
"""测试装饰器错误处理"""
|
||||
mock_instance = MagicMock()
|
||||
mock_instance.ping.return_value = True
|
||||
mock_pipeline = MagicMock()
|
||||
mock_instance.pipeline.return_value = mock_pipeline
|
||||
mock_redis_class.return_value = mock_instance
|
||||
|
||||
from functional_scaffold.core.metrics_unified import (
|
||||
reset_metrics_manager,
|
||||
track_algorithm_execution,
|
||||
)
|
||||
|
||||
reset_metrics_manager()
|
||||
from functional_scaffold.core.metrics_unified import track_algorithm_execution
|
||||
|
||||
@track_algorithm_execution("test_algo")
|
||||
def test_func():
|
||||
|
||||
Reference in New Issue
Block a user