Compare commits
23 Commits
3c3659d314
...
7c8b96927d
| Author | SHA1 | Date | |
|---|---|---|---|
| 7c8b96927d | |||
| c7626723a3 | |||
| b77e736790 | |||
| eadea2c1a4 | |||
| 6ef86da639 | |||
| 2658d1036d | |||
| 1ea0623a79 | |||
| 265e8d1e3d | |||
| 88cfe91c56 | |||
| 9f32ac725a | |||
| 22ae858458 | |||
| dc240f39f0 | |||
| f9e7c03157 | |||
| d41055109d | |||
| d709540bf3 | |||
| 683bf8a6ca | |||
| 3e1d850954 | |||
| dff5aed33c | |||
| 2df59f0f1f | |||
| c6f8714c4d | |||
| 5feb795d12 | |||
| 3254fdc3f0 | |||
| 5921f71756 |
29
.dockerignore
Normal file
29
.dockerignore
Normal file
@@ -0,0 +1,29 @@
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
.pytest_cache/
|
||||
.coverage
|
||||
htmlcov/
|
||||
.env
|
||||
.venv
|
||||
venv/
|
||||
ENV/
|
||||
*.log
|
||||
.DS_Store
|
||||
87
.env.example
Normal file
87
.env.example
Normal file
@@ -0,0 +1,87 @@
|
||||
# Environment Configuration
|
||||
# Copy this file to .env and fill in your values
|
||||
|
||||
# =============================================================================
|
||||
# 应用信息
|
||||
# =============================================================================
|
||||
APP_NAME=FunctionalScaffold
|
||||
APP_VERSION=1.0.0
|
||||
APP_ENV=development
|
||||
|
||||
# =============================================================================
|
||||
# 服务器配置
|
||||
# =============================================================================
|
||||
HOST=0.0.0.0
|
||||
PORT=8000
|
||||
WORKERS=4
|
||||
|
||||
# =============================================================================
|
||||
# 日志配置
|
||||
# =============================================================================
|
||||
LOG_LEVEL=INFO
|
||||
LOG_FORMAT=json
|
||||
# 日志文件配置(可选,默认禁用)
|
||||
LOG_FILE_ENABLED=false
|
||||
LOG_FILE_PATH=/var/log/app/app.log
|
||||
|
||||
# =============================================================================
|
||||
# 指标配置
|
||||
# =============================================================================
|
||||
METRICS_ENABLED=true
|
||||
METRICS_CONFIG_PATH=config/metrics.yaml
|
||||
# 指标实例 ID(可选,默认使用 hostname)
|
||||
# METRICS_INSTANCE_ID=my-instance
|
||||
|
||||
# =============================================================================
|
||||
# 追踪配置
|
||||
# =============================================================================
|
||||
TRACING_ENABLED=false
|
||||
# JAEGER_ENDPOINT=http://localhost:14268/api/traces
|
||||
|
||||
# =============================================================================
|
||||
# Redis 配置
|
||||
# =============================================================================
|
||||
REDIS_HOST=localhost
|
||||
REDIS_PORT=6379
|
||||
REDIS_DB=0
|
||||
REDIS_PASSWORD=your_redis_password
|
||||
|
||||
# =============================================================================
|
||||
# 异步任务配置
|
||||
# =============================================================================
|
||||
# 任务结果缓存时间(秒),默认 30 分钟
|
||||
JOB_RESULT_TTL=1800
|
||||
# Webhook 最大重试次数
|
||||
WEBHOOK_MAX_RETRIES=3
|
||||
# Webhook 超时时间(秒)
|
||||
WEBHOOK_TIMEOUT=10
|
||||
# 最大并发任务数
|
||||
MAX_CONCURRENT_JOBS=10
|
||||
|
||||
# =============================================================================
|
||||
# Worker 配置
|
||||
# =============================================================================
|
||||
# Worker 轮询间隔(秒)
|
||||
WORKER_POLL_INTERVAL=1.0
|
||||
# 任务队列 Redis Key
|
||||
JOB_QUEUE_KEY=job:queue
|
||||
# 全局并发计数器 Redis Key
|
||||
JOB_CONCURRENCY_KEY=job:concurrency
|
||||
# 任务锁 TTL(秒)
|
||||
JOB_LOCK_TTL=300
|
||||
# 任务最大重试次数
|
||||
JOB_MAX_RETRIES=3
|
||||
# 任务执行超时(秒)
|
||||
JOB_EXECUTION_TIMEOUT=300
|
||||
|
||||
# =============================================================================
|
||||
# 外部服务配置(示例)
|
||||
# =============================================================================
|
||||
# OSS 配置
|
||||
# OSS_ENDPOINT=https://oss-cn-hangzhou.aliyuncs.com
|
||||
# OSS_ACCESS_KEY_ID=your_access_key
|
||||
# OSS_ACCESS_KEY_SECRET=your_secret_key
|
||||
# OSS_BUCKET_NAME=your_bucket
|
||||
|
||||
# 数据库配置
|
||||
# DATABASE_URL=mysql://user:password@localhost:3306/dbname
|
||||
70
.gitignore
vendored
Normal file
70
.gitignore
vendored
Normal file
@@ -0,0 +1,70 @@
|
||||
.claude
|
||||
docs/prompt
|
||||
.idea
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
pip-wheel-metadata/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Virtual environments
|
||||
venv/
|
||||
env/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# IDEs
|
||||
.vscode/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# Environment variables
|
||||
.env
|
||||
.env.local
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
102
AGENTS.md
Normal file
102
AGENTS.md
Normal file
@@ -0,0 +1,102 @@
|
||||
# Agent.md
|
||||
|
||||
本文件为本仓库内各类智能体/助手提供工作指导,内容参考 `CLAUDE.md`,并针对日常开发与协作做了简化归纳。
|
||||
|
||||
## 项目概述
|
||||
|
||||
**FunctionalScaffold(函数式脚手架)** 是一个算法工程化 Serverless 解决方案的脚手架生成器。
|
||||
|
||||
- 为了方便团队交流,项目自然语言使用中文,包括代码注释和文档
|
||||
- 核心目标:解决算力弹性、算法工程化门槛与后端集成复杂度问题
|
||||
|
||||
## 技术与架构
|
||||
|
||||
采用 **Docker 封装的 Serverless API 服务**方案:
|
||||
|
||||
- 算法代码 + 运行环境打包为 Docker 镜像
|
||||
- 部署到云厂商 Serverless 平台实现自动扩缩容
|
||||
- FastAPI 作为 HTTP 接口层
|
||||
- 算法逻辑保持独立和专注
|
||||
|
||||
架构流程概览:
|
||||
|
||||
```
|
||||
用户请求 → API网关 → 容器实例(冷/热启动)→ FastAPI → 算法程序 → 返回结果
|
||||
↓
|
||||
外部服务(OSS/数据库)
|
||||
```
|
||||
|
||||
## 代码结构(src layout)
|
||||
|
||||
```
|
||||
src/functional_scaffold/
|
||||
├── algorithms/ # 算法层 - 所有算法必须继承 BaseAlgorithm
|
||||
│ ├── base.py # execute() 包装器(埋点、错误处理)
|
||||
│ └── prime_checker.py # 示例:质数判断算法
|
||||
├── api/ # API 层 - FastAPI 路由和模型
|
||||
│ ├── models.py # Pydantic 数据模型(ConfigDict)
|
||||
│ ├── routes.py # 路由定义(/invoke, /healthz, /readyz, /jobs)
|
||||
│ └── dependencies.py # 依赖注入(request_id 生成)
|
||||
├── core/ # 核心功能 - 横切关注点
|
||||
│ ├── errors.py # 异常类层次结构
|
||||
│ ├── logging.py # 结构化日志(JSON)
|
||||
│ ├── metrics.py # Prometheus 指标和装饰器
|
||||
│ └── tracing.py # 分布式追踪(ContextVar)
|
||||
├── utils/ # 工具函数
|
||||
│ └── validators.py # 输入验证
|
||||
├── config.py # 配置管理(pydantic-settings)
|
||||
└── main.py # FastAPI 应用入口
|
||||
```
|
||||
|
||||
## 关键设计约定
|
||||
|
||||
1. **算法抽象层**:所有算法继承 `BaseAlgorithm`,只实现 `process()`;`execute()` 负责埋点、日志和错误包装。
|
||||
2. **依赖注入**:FastAPI `Depends()` 注入 `request_id`,通过 `ContextVar` 透传。
|
||||
3. **配置管理**:`pydantic-settings` 读取环境变量或 `.env`,支持类型校验。
|
||||
4. **可观测性**:JSON 结构化日志、Prometheus 指标、Request ID 追踪。
|
||||
5. **Pydantic V2**:使用 `ConfigDict` 和 `model_config`,不使用 `class Config`。
|
||||
|
||||
## 常用命令
|
||||
|
||||
环境设置:
|
||||
|
||||
```bash
|
||||
python -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -e ".[dev]"
|
||||
```
|
||||
|
||||
运行服务:
|
||||
|
||||
```bash
|
||||
./scripts/run_dev.sh
|
||||
uvicorn functional_scaffold.main:app --reload --port 8000
|
||||
```
|
||||
|
||||
测试与质量:
|
||||
|
||||
```bash
|
||||
pytest tests/ -v
|
||||
black src/ tests/
|
||||
ruff check src/ tests/
|
||||
```
|
||||
|
||||
## 添加新算法(简版步骤)
|
||||
|
||||
1. 在 `src/functional_scaffold/algorithms/` 新建算法类,继承 `BaseAlgorithm` 并实现 `process()`。
|
||||
2. 在 `algorithms/__init__.py` 导出新算法类。
|
||||
3. 在 `api/routes.py` 添加端点,在 `api/models.py` 添加请求/响应模型。
|
||||
4. 在 `tests/` 编写对应测试。
|
||||
|
||||
## 交付标准
|
||||
|
||||
必须包含以下组件与规范:
|
||||
|
||||
- `/invoke`, `/jobs`, `/healthz`, `/readyz`, `/metrics` 端点
|
||||
- 统一的请求/响应 Schema 与错误格式
|
||||
- 可观测性支持(日志、指标、追踪)
|
||||
|
||||
## 开发理念
|
||||
|
||||
算法同学只需关注 `process()` 的核心逻辑,其余基础设施能力由脚手架提供。
|
||||
|
||||
629
CLAUDE.md
Normal file
629
CLAUDE.md
Normal file
@@ -0,0 +1,629 @@
|
||||
# CLAUDE.md
|
||||
|
||||
本文件为 Claude Code (claude.ai/code) 在此代码仓库中工作时提供指导。
|
||||
|
||||
为了方便团队交流,项目的自然语言使用中文,包括代码注释和文档等
|
||||
|
||||
## 项目概述
|
||||
|
||||
**FunctionalScaffold(函数式脚手架)** 是一个算法工程化 Serverless 解决方案的脚手架生成器。
|
||||
|
||||
### 核心目标
|
||||
|
||||
解决三大痛点:
|
||||
1. **不确定的算力需求** - 需要动态扩缩容能力
|
||||
2. **算法同学工程化能力不足** - 降低工程化门槛
|
||||
3. **后端同学集成难度过高** - 标准化接口规范
|
||||
|
||||
## 技术架构
|
||||
|
||||
采用 **Docker 封装的 Serverless API 服务**方案:
|
||||
|
||||
- 算法代码 + 运行环境打包为 Docker 镜像
|
||||
- 部署到云厂商 Serverless 平台实现自动扩缩容
|
||||
- FastAPI 作为 HTTP 接口层
|
||||
- 算法逻辑保持独立和专注
|
||||
|
||||
### 架构流程
|
||||
|
||||
```
|
||||
用户请求 → API网关 → 容器实例(冷/热启动)→ FastAPI → 算法程序 → 返回结果
|
||||
↓
|
||||
外部服务(OSS/数据库)
|
||||
```
|
||||
|
||||
### 代码架构
|
||||
|
||||
项目采用 **src layout** 结构(Python 最佳实践):
|
||||
|
||||
```
|
||||
src/functional_scaffold/
|
||||
├── algorithms/ # 算法层 - 所有算法必须继承 BaseAlgorithm
|
||||
│ ├── base.py # 提供 execute() 包装器(埋点、错误处理)
|
||||
│ └── prime_checker.py # 示例:质数判断算法
|
||||
├── api/ # API 层 - FastAPI 路由和模型
|
||||
│ ├── models.py # Pydantic 数据模型(使用 ConfigDict)
|
||||
│ ├── routes.py # 路由定义(/invoke, /healthz, /readyz, /jobs)
|
||||
│ └── dependencies.py # 依赖注入(request_id 生成)
|
||||
├── core/ # 核心功能 - 横切关注点
|
||||
│ ├── errors.py # 异常类层次结构
|
||||
│ ├── logging.py # 结构化日志(JSON 格式)
|
||||
│ ├── metrics.py # Prometheus 指标和装饰器
|
||||
│ └── tracing.py # 分布式追踪(ContextVar)
|
||||
├── utils/ # 工具函数
|
||||
│ └── validators.py # 输入验证
|
||||
├── config.py # 配置管理(pydantic-settings)
|
||||
└── main.py # FastAPI 应用入口
|
||||
```
|
||||
|
||||
**关键设计模式:**
|
||||
|
||||
1. **算法抽象层**:所有算法继承 `BaseAlgorithm`,只需实现 `process()` 方法。`execute()` 方法自动处理埋点、日志和错误包装。
|
||||
|
||||
2. **依赖注入**:使用 FastAPI 的 `Depends()` 机制注入 request_id,通过 `ContextVar` 在异步上下文中传递。
|
||||
|
||||
3. **配置管理**:使用 `pydantic-settings` 从环境变量或 `.env` 文件加载配置,支持类型验证。
|
||||
|
||||
4. **可观测性**:
|
||||
- 日志:结构化 JSON 日志(pythonjsonlogger),自动包含 request_id
|
||||
- 指标:Prometheus 格式(request_counter, request_latency, algorithm_counter)
|
||||
- 追踪:request_id 关联所有日志和指标
|
||||
- 日志收集:Loki + Promtail 自动收集和查询日志
|
||||
|
||||
## 开发命令
|
||||
|
||||
### 环境设置
|
||||
|
||||
```bash
|
||||
# 创建虚拟环境并安装依赖(开发模式)
|
||||
python -m venv venv
|
||||
source venv/bin/activate # Windows: venv\Scripts\activate
|
||||
pip install -e ".[dev]"
|
||||
```
|
||||
|
||||
### 运行服务
|
||||
|
||||
```bash
|
||||
# 方式1:使用辅助脚本(推荐)
|
||||
./scripts/run_dev.sh
|
||||
|
||||
# 方式2:直接运行(开发模式,自动重载)
|
||||
uvicorn functional_scaffold.main:app --reload --port 8000
|
||||
|
||||
# 方式3:生产模式
|
||||
uvicorn functional_scaffold.main:app --host 0.0.0.0 --port 8000 --workers 4
|
||||
```
|
||||
|
||||
访问地址:
|
||||
- Swagger UI: http://localhost:8000/docs
|
||||
- ReDoc: http://localhost:8000/redoc
|
||||
- Metrics: http://localhost:8000/metrics
|
||||
|
||||
### 测试
|
||||
|
||||
```bash
|
||||
# 运行所有测试
|
||||
pytest tests/ -v
|
||||
|
||||
# 运行单个测试文件
|
||||
pytest tests/test_algorithms.py -v
|
||||
|
||||
# 运行单个测试类
|
||||
pytest tests/test_algorithms.py::TestPrimeChecker -v
|
||||
|
||||
# 运行单个测试方法
|
||||
pytest tests/test_algorithms.py::TestPrimeChecker::test_prime_numbers -v
|
||||
|
||||
# 生成覆盖率报告
|
||||
pytest tests/ --cov=src/functional_scaffold --cov-report=html
|
||||
# 查看报告:open htmlcov/index.html
|
||||
|
||||
# 使用辅助脚本(包含代码检查)
|
||||
./scripts/run_tests.sh
|
||||
```
|
||||
|
||||
### 代码质量
|
||||
|
||||
```bash
|
||||
# 代码格式化(自动修复)
|
||||
black src/ tests/
|
||||
|
||||
# 代码检查(不修改文件)
|
||||
black --check src/ tests/
|
||||
|
||||
# 代码检查
|
||||
ruff check src/ tests/
|
||||
|
||||
# 自动修复可修复的问题
|
||||
ruff check --fix src/ tests/
|
||||
```
|
||||
|
||||
配置说明:
|
||||
- Black: 行长度 100,目标 Python 3.9+
|
||||
- Ruff: 行长度 100,目标 Python 3.9+
|
||||
|
||||
### Docker
|
||||
|
||||
```bash
|
||||
# 构建镜像
|
||||
docker build -f deployment/Dockerfile -t functional-scaffold:latest .
|
||||
|
||||
# 运行容器
|
||||
docker run -p 8000:8000 functional-scaffold:latest
|
||||
|
||||
# 使用 docker-compose(包含 Prometheus + Grafana + Loki)
|
||||
cd deployment
|
||||
docker-compose up
|
||||
# Grafana: http://localhost:3000 (admin/admin)
|
||||
# Prometheus: http://localhost:9090
|
||||
# Loki: http://localhost:3100
|
||||
```
|
||||
|
||||
### 文档
|
||||
|
||||
```bash
|
||||
# 导出 OpenAPI 规范到 docs/swagger/openapi.json
|
||||
python scripts/export_openapi.py
|
||||
```
|
||||
|
||||
## 添加新算法
|
||||
|
||||
### 1. 创建算法类(继承 BaseAlgorithm)
|
||||
|
||||
```python
|
||||
# src/functional_scaffold/algorithms/my_algorithm.py
|
||||
from typing import Dict, Any
|
||||
from .base import BaseAlgorithm
|
||||
|
||||
class MyAlgorithm(BaseAlgorithm):
|
||||
"""我的算法类"""
|
||||
|
||||
def process(self, input_data: Any) -> Dict[str, Any]:
|
||||
"""
|
||||
算法处理逻辑
|
||||
|
||||
Args:
|
||||
input_data: 输入数据
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: 处理结果
|
||||
"""
|
||||
# 实现算法逻辑
|
||||
result = do_something(input_data)
|
||||
return {"result": result}
|
||||
```
|
||||
|
||||
### 2. 注册到 `__init__.py`
|
||||
|
||||
```python
|
||||
# src/functional_scaffold/algorithms/__init__.py
|
||||
from .my_algorithm import MyAlgorithm
|
||||
__all__ = [..., "MyAlgorithm"]
|
||||
```
|
||||
|
||||
### 3. 添加 API 端点(在 `api/routes.py`)
|
||||
|
||||
```python
|
||||
@router.post("/my-endpoint")
|
||||
async def my_endpoint(
|
||||
request: MyRequest,
|
||||
request_id: str = Depends(get_request_id)
|
||||
):
|
||||
"""我的算法端点"""
|
||||
algorithm = MyAlgorithm()
|
||||
result = algorithm.execute(request.data)
|
||||
return MyResponse(request_id=request_id, **result)
|
||||
```
|
||||
|
||||
### 4. 定义数据模型(在 `api/models.py`)
|
||||
|
||||
```python
|
||||
class MyRequest(BaseModel):
|
||||
"""我的请求模型"""
|
||||
|
||||
model_config = ConfigDict(
|
||||
json_schema_extra={
|
||||
"example": {"data": "示例数据"}
|
||||
}
|
||||
)
|
||||
|
||||
data: str = Field(..., description="输入数据")
|
||||
```
|
||||
|
||||
### 5. 编写测试
|
||||
|
||||
```python
|
||||
# tests/test_my_algorithm.py
|
||||
def test_my_algorithm():
|
||||
"""测试我的算法"""
|
||||
algo = MyAlgorithm()
|
||||
result = algo.process("测试数据")
|
||||
assert result["result"] == expected
|
||||
```
|
||||
|
||||
## 配置管理
|
||||
|
||||
配置通过 `src/functional_scaffold/config.py` 的 `Settings` 类管理:
|
||||
|
||||
- 从环境变量读取(不区分大小写)
|
||||
- 支持 `.env` 文件
|
||||
- 使用 `pydantic-settings` 进行类型验证
|
||||
|
||||
配置示例:
|
||||
```bash
|
||||
# .env 文件
|
||||
APP_ENV=production
|
||||
LOG_LEVEL=INFO
|
||||
METRICS_ENABLED=true
|
||||
```
|
||||
|
||||
访问配置:
|
||||
```python
|
||||
from functional_scaffold.config import settings
|
||||
print(settings.app_env) # "production"
|
||||
```
|
||||
|
||||
## 可观测性
|
||||
|
||||
### 日志
|
||||
|
||||
使用 `core/logging.py` 的 `setup_logging()`:
|
||||
|
||||
```python
|
||||
from functional_scaffold.core.logging import setup_logging
|
||||
|
||||
# 设置日志
|
||||
logger = setup_logging(level="INFO", format_type="json")
|
||||
|
||||
# 记录日志(自动包含 request_id)
|
||||
logger.info("处理请求", extra={"user_id": "123"})
|
||||
```
|
||||
|
||||
**日志特性:**
|
||||
- 结构化 JSON 格式
|
||||
- 自动包含 request_id(从 ContextVar 中提取)
|
||||
- 支持文件日志(可选,通过环境变量启用)
|
||||
- 日志轮转(100MB,保留 5 个备份)
|
||||
|
||||
### 日志收集(Loki)
|
||||
|
||||
项目集成了 Grafana Loki 日志收集系统,支持两种收集模式:
|
||||
|
||||
**模式 1: Docker stdio 收集(默认,推荐)**
|
||||
- 自动收集容器标准输出/错误
|
||||
- 无需修改应用代码
|
||||
- 性能影响极小
|
||||
|
||||
**模式 2: 文件收集(备用)**
|
||||
- 日志持久化到文件
|
||||
- 支持日志轮转
|
||||
- 需要设置 `LOG_FILE_ENABLED=true`
|
||||
|
||||
**查询日志:**
|
||||
|
||||
```bash
|
||||
# 使用 Loki API
|
||||
curl -G -s "http://localhost:3100/loki/api/v1/query_range" \
|
||||
--data-urlencode 'query={job="functional-scaffold-app"}'
|
||||
|
||||
# 按 request_id 过滤
|
||||
curl -G -s "http://localhost:3100/loki/api/v1/query_range" \
|
||||
--data-urlencode 'query={job="functional-scaffold-app"} |= "request-id-here"'
|
||||
```
|
||||
|
||||
**Grafana 仪表板:**
|
||||
- 访问 http://localhost:3000
|
||||
- 进入 "日志监控" 仪表板
|
||||
- 使用 Request ID 输入框过滤特定请求的日志
|
||||
|
||||
**相关文档:**
|
||||
- 完整文档:`docs/loki-integration.md`
|
||||
- 使用说明:`docs/grafana-dashboard-usage.md`
|
||||
- 快速参考:`docs/loki-quick-reference.md`
|
||||
|
||||
### 指标
|
||||
|
||||
使用 `core/metrics.py` 的装饰器:
|
||||
|
||||
```python
|
||||
from functional_scaffold.core.metrics import track_algorithm_execution
|
||||
|
||||
@track_algorithm_execution("my_algorithm")
|
||||
def my_function():
|
||||
"""我的函数"""
|
||||
pass
|
||||
```
|
||||
|
||||
可用指标:
|
||||
- `http_requests_total{method, endpoint, status}` - HTTP 请求总数
|
||||
- `http_request_duration_seconds{method, endpoint}` - HTTP 请求延迟
|
||||
- `algorithm_executions_total{algorithm, status}` - 算法执行总数
|
||||
- `algorithm_execution_duration_seconds{algorithm}` - 算法执行延迟
|
||||
|
||||
### 追踪
|
||||
|
||||
Request ID 自动注入到所有请求和日志:
|
||||
|
||||
```python
|
||||
from functional_scaffold.core.tracing import get_request_id
|
||||
|
||||
# 在请求上下文中获取 request_id
|
||||
request_id = get_request_id()
|
||||
```
|
||||
|
||||
**Request ID 特性:**
|
||||
- 自动生成或从请求头 `X-Request-ID` 获取
|
||||
- 通过 ContextVar 在异步上下文中传递
|
||||
- 自动添加到所有日志记录中
|
||||
- 可用于追踪单个请求的完整生命周期
|
||||
- 在 Grafana 仪表板中可按 request_id 过滤日志
|
||||
|
||||
## 部署
|
||||
|
||||
### Kubernetes
|
||||
|
||||
```bash
|
||||
kubectl apply -f deployment/kubernetes/deployment.yaml
|
||||
kubectl apply -f deployment/kubernetes/service.yaml
|
||||
```
|
||||
|
||||
配置说明:
|
||||
- 3 个副本
|
||||
- 资源限制:256Mi-512Mi 内存,250m-500m CPU
|
||||
- 健康检查:存活探针 (/healthz),就绪探针 (/readyz)
|
||||
|
||||
### 阿里云函数计算
|
||||
|
||||
```bash
|
||||
fun deploy -t deployment/serverless/aliyun-fc.yaml
|
||||
```
|
||||
|
||||
### AWS Lambda
|
||||
|
||||
```bash
|
||||
sam deploy --template-file deployment/serverless/aws-lambda.yaml
|
||||
```
|
||||
|
||||
## 必须交付的三大组件
|
||||
|
||||
### 1. 接入规范
|
||||
|
||||
**API 端点标准:**
|
||||
- `/invoke` - 同步调用接口
|
||||
- `/jobs` - 异步任务接口(当前返回 501)
|
||||
- `/healthz` - 存活检查
|
||||
- `/readyz` - 就绪检查
|
||||
- `/metrics` - Prometheus 指标
|
||||
|
||||
**Schema 规范:**
|
||||
- 请求/响应 Schema(Pydantic 验证)
|
||||
- 错误响应格式(统一的 ErrorResponse)
|
||||
- 元数据和版本信息(每个响应包含 metadata)
|
||||
|
||||
### 2. Python SDK 运行时
|
||||
|
||||
**已实现的能力:**
|
||||
- ✅ 参数校验(Pydantic + utils/validators.py)
|
||||
- ✅ 错误包装和标准化(core/errors.py)
|
||||
- ✅ 埋点(core/metrics.py - 延迟、失败率)
|
||||
- ✅ 分布式追踪的关联 ID(core/tracing.py + RequestIdFilter)
|
||||
- ✅ 日志收集和查询(Loki + Promtail)
|
||||
- ⏳ Worker 运行时(重试、超时、DLQ - 待实现)
|
||||
|
||||
### 3. 脚手架生成器
|
||||
|
||||
**已包含的模板:**
|
||||
- ✅ 示例算法函数(algorithms/prime_checker.py)
|
||||
- ✅ Dockerfile(deployment/Dockerfile)
|
||||
- ✅ CI/CD 流水线配置(.github/workflows/)
|
||||
- ✅ Serverless 平台部署 YAML(deployment/serverless/)
|
||||
- ✅ Grafana 仪表板模板(monitoring/grafana/dashboards/)
|
||||
- ✅ 告警规则配置(monitoring/alerts/rules.yaml)
|
||||
- ✅ Loki 日志收集配置(monitoring/loki.yaml, monitoring/promtail.yaml)
|
||||
|
||||
## 开发理念
|
||||
|
||||
**算法同学只需修改核心算法函数。** 所有基础设施、可观测性、部署相关的工作都由脚手架处理。
|
||||
|
||||
算法开发者只需:
|
||||
1. 继承 `BaseAlgorithm`
|
||||
2. 实现 `process()` 方法
|
||||
3. 返回字典格式的结果
|
||||
|
||||
框架自动提供:
|
||||
- HTTP 接口封装
|
||||
- 参数验证
|
||||
- 错误处理
|
||||
- 日志记录
|
||||
- 性能指标
|
||||
- 健康检查
|
||||
- 容器化部署
|
||||
|
||||
## 注意事项
|
||||
|
||||
1. **Pydantic V2**:使用 `ConfigDict` 而非 `class Config`,使用 `model_config` 而非 `Config`。
|
||||
|
||||
2. **异步上下文**:request_id 使用 `ContextVar` 存储,在异步函数中自动传递。
|
||||
|
||||
3. **测试隔离**:每个测试使用 `TestClient`,不需要启动真实服务器。
|
||||
|
||||
4. **Docker 构建**:Dockerfile 使用非 root 用户(appuser),包含健康检查。
|
||||
|
||||
5. **配置优先级**:环境变量 > .env 文件 > 默认值。
|
||||
|
||||
6. **Promtail 版本**:使用 Promtail 3.0.0 或更高版本,以支持较新的 Docker API(1.44+)。如果遇到 "client version too old" 错误,需要升级 Promtail 版本。
|
||||
|
||||
## 日志收集系统(Loki)
|
||||
|
||||
项目集成了 Grafana Loki 日志收集系统,提供强大的日志查询和分析能力。
|
||||
|
||||
### 架构
|
||||
|
||||
```
|
||||
应用容器 (stdout/stderr)
|
||||
↓
|
||||
Docker Engine
|
||||
↓
|
||||
Promtail (日志采集器)
|
||||
↓
|
||||
Loki (日志存储)
|
||||
↓
|
||||
Grafana (可视化)
|
||||
```
|
||||
|
||||
### 服务组件
|
||||
|
||||
**docker-compose 包含以下服务:**
|
||||
- **app**: 应用服务(端口 8111)
|
||||
- **loki**: 日志存储服务(端口 3100)
|
||||
- **promtail**: 日志采集服务(端口 9080)
|
||||
- **grafana**: 可视化服务(端口 3000)
|
||||
- **prometheus**: 指标收集服务(端口 9090)
|
||||
- **redis**: 缓存服务(端口 6380)
|
||||
|
||||
### 日志收集模式
|
||||
|
||||
#### 模式 1: Docker stdio 收集(默认)
|
||||
|
||||
**特点:**
|
||||
- ✅ 无需修改应用代码
|
||||
- ✅ 自动收集容器标准输出/错误
|
||||
- ✅ 性能影响极小
|
||||
- ✅ 推荐用于生产环境
|
||||
|
||||
**配置:**
|
||||
应用容器需要添加标签(已配置):
|
||||
```yaml
|
||||
labels:
|
||||
logging: "promtail"
|
||||
logging_jobname: "functional-scaffold-app"
|
||||
```
|
||||
|
||||
#### 模式 2: 文件收集(备用)
|
||||
|
||||
**特点:**
|
||||
- ✅ 日志持久化到文件
|
||||
- ✅ 支持日志轮转(100MB,5个备份)
|
||||
- ✅ 适合需要本地日志文件的场景
|
||||
|
||||
**启用方式:**
|
||||
```yaml
|
||||
# docker-compose.yml
|
||||
environment:
|
||||
- LOG_FILE_ENABLED=true
|
||||
- LOG_FILE_PATH=/var/log/app/app.log
|
||||
```
|
||||
|
||||
### 日志格式
|
||||
|
||||
所有日志使用 JSON 格式,自动包含以下字段:
|
||||
- `asctime`: 时间戳
|
||||
- `name`: 日志器名称
|
||||
- `levelname`: 日志级别(INFO, WARNING, ERROR)
|
||||
- `message`: 日志消息
|
||||
- `request_id`: 请求 ID(自动添加)
|
||||
- `timestamp`: ISO 格式时间戳
|
||||
|
||||
### 查询日志
|
||||
|
||||
#### 使用 Loki API
|
||||
|
||||
```bash
|
||||
# 查询所有日志
|
||||
curl -G -s "http://localhost:3100/loki/api/v1/query_range" \
|
||||
--data-urlencode 'query={job="functional-scaffold-app"}'
|
||||
|
||||
# 按 request_id 过滤
|
||||
curl -G -s "http://localhost:3100/loki/api/v1/query_range" \
|
||||
--data-urlencode 'query={job="functional-scaffold-app"} |= "request-id-here"'
|
||||
|
||||
# 查询错误日志
|
||||
curl -G -s "http://localhost:3100/loki/api/v1/query_range" \
|
||||
--data-urlencode 'query={job="functional-scaffold-app", level="ERROR"}'
|
||||
```
|
||||
|
||||
#### 使用 Grafana 仪表板
|
||||
|
||||
1. 访问 http://localhost:3000(admin/admin)
|
||||
2. 进入 "日志监控" 仪表板
|
||||
3. 使用 Request ID 输入框过滤特定请求的日志
|
||||
|
||||
**仪表板面板:**
|
||||
- **日志流(实时)**: 实时日志流
|
||||
- **日志量趋势**: 按时间和级别统计
|
||||
- **日志级别分布**: INFO/WARNING/ERROR 分布
|
||||
- **错误日志**: 只显示 ERROR 级别
|
||||
|
||||
#### 使用 Grafana Explore
|
||||
|
||||
1. 访问 http://localhost:3000/explore
|
||||
2. 选择 Loki 数据源
|
||||
3. 使用 LogQL 查询语言
|
||||
|
||||
**常用查询:**
|
||||
```logql
|
||||
# 查询所有日志
|
||||
{job="functional-scaffold-app"}
|
||||
|
||||
# 查询错误日志
|
||||
{job="functional-scaffold-app", level="ERROR"}
|
||||
|
||||
# 按 request_id 过滤
|
||||
{job="functional-scaffold-app"} |= "request-id-here"
|
||||
|
||||
# 使用 JSON 解析
|
||||
{job="functional-scaffold-app"} | json | request_id="request-id-here"
|
||||
|
||||
# 统计日志量
|
||||
sum by (level) (count_over_time({job="functional-scaffold-app"}[5m]))
|
||||
```
|
||||
|
||||
### 验证和测试
|
||||
|
||||
```bash
|
||||
# 验证 Loki 集成
|
||||
./scripts/verify_loki.sh
|
||||
|
||||
# 测试 Request ID 过滤
|
||||
./scripts/test_request_id_filter.sh
|
||||
```
|
||||
|
||||
### 配置文件
|
||||
|
||||
- **Loki 配置**: `monitoring/loki.yaml`
|
||||
- 日志保留期: 7 天
|
||||
- 摄入速率限制: 10MB/s
|
||||
- 自动压缩和清理
|
||||
|
||||
- **Promtail 配置**: `monitoring/promtail.yaml`
|
||||
- Docker stdio 收集配置
|
||||
- 文件收集配置
|
||||
- JSON 日志解析规则
|
||||
|
||||
- **Grafana Provisioning**: `monitoring/grafana/`
|
||||
- 数据源自动配置(datasources/)
|
||||
- 仪表板自动加载(dashboards/)
|
||||
|
||||
### 故障排查
|
||||
|
||||
**看不到日志:**
|
||||
1. 检查服务状态: `docker-compose ps`
|
||||
2. 查看 Promtail 日志: `docker-compose logs promtail`
|
||||
3. 验证容器标签: `docker inspect <container> | grep Labels`
|
||||
|
||||
**Docker socket 权限问题:**
|
||||
```bash
|
||||
sudo chmod 666 /var/run/docker.sock
|
||||
```
|
||||
|
||||
**日志延迟:**
|
||||
- Promtail 每 5 秒刷新一次
|
||||
- 建议等待 5-10 秒后再查询
|
||||
|
||||
### 相关文档
|
||||
|
||||
- **完整文档**: `docs/loki-integration.md` - 包含查询示例、故障排查、性能优化
|
||||
- **快速参考**: `docs/loki-quick-reference.md` - 常用命令和 LogQL 查询
|
||||
- **仪表板使用**: `docs/grafana-dashboard-usage.md` - Grafana 仪表板使用说明
|
||||
- **实施总结**: `docs/loki-implementation-summary.md` - 架构和实施细节
|
||||
- **监控目录**: `monitoring/README.md` - 配置文件说明
|
||||
1
LICENSE
1
LICENSE
@@ -1,6 +1,7 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2026 Roog
|
||||
Copyright (c) 2026 Guxinpei
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
296
README.md
Normal file
296
README.md
Normal file
@@ -0,0 +1,296 @@
|
||||
# FunctionalScaffold
|
||||
|
||||
**算法工程化 Serverless 解决方案脚手架**
|
||||
|
||||
一个基于 FastAPI 和 Docker 的 Serverless 算法服务脚手架,帮助算法工程师快速构建生产级的算法服务。
|
||||
|
||||
## 特性
|
||||
|
||||
- ✅ **标准化 API 接口** - 符合 RESTful 规范的 HTTP 接口
|
||||
- ✅ **同步/异步调用** - 支持同步调用和异步任务
|
||||
- ✅ **开箱即用** - 完整的项目结构和配置
|
||||
- ✅ **自动文档** - Swagger/OpenAPI 自动生成
|
||||
- ✅ **监控指标** - Prometheus 指标和 Grafana 仪表板
|
||||
- ✅ **健康检查** - 存活和就绪探针
|
||||
- ✅ **容器化部署** - Docker 和 Kubernetes 支持
|
||||
- ✅ **Serverless 就绪** - 支持阿里云函数计算和 AWS Lambda
|
||||
- ✅ **完整测试** - 单元测试和集成测试
|
||||
|
||||
## 文档
|
||||
|
||||
| 文档 | 描述 |
|
||||
|-----------------------------------------|--------------|
|
||||
| [快速入门](docs/getting-started.md) | 10 分钟上手指南 |
|
||||
| [算法开发指南](docs/algorithm-development.md) | 详细的算法开发教程 |
|
||||
| [API 参考](docs/api-reference.md) | 完整的 API 文档 |
|
||||
| [监控指南](docs/monitoring.md) | 监控和告警配置 |
|
||||
| [API 规范](docs/api/README.md) | OpenAPI 规范说明 |
|
||||
| [Kubernetes 部署](docs/kubernetes-deployment.md) | K8s 集群部署指南 |
|
||||
| [日志集成(Loki)](docs/loki-quick-reference.md) | 日志收集部署说明 |
|
||||
|
||||
## 快速开始
|
||||
|
||||
### 前置要求
|
||||
|
||||
- Python 3.9+
|
||||
- Docker (可选)
|
||||
|
||||
### 本地开发
|
||||
|
||||
1. 克隆仓库
|
||||
|
||||
```bash
|
||||
git clone <repository-url>
|
||||
cd FunctionalScaffold
|
||||
```
|
||||
|
||||
2. 创建虚拟环境并安装依赖
|
||||
|
||||
```bash
|
||||
python -m venv venv
|
||||
source venv/bin/activate # Windows: venv\Scripts\activate
|
||||
pip install -e ".[dev]"
|
||||
```
|
||||
|
||||
3. 启动开发服务器
|
||||
|
||||
```bash
|
||||
# 方式1:使用脚本
|
||||
./scripts/run_dev.sh
|
||||
|
||||
# 方式2:直接运行
|
||||
uvicorn functional_scaffold.main:app --reload --port 8000
|
||||
```
|
||||
|
||||
4. 访问 API 文档
|
||||
|
||||
打开浏览器访问:
|
||||
- Swagger UI: http://localhost:8000/docs
|
||||
- ReDoc: http://localhost:8000/redoc
|
||||
- OpenAPI JSON: http://localhost:8000/openapi.json
|
||||
|
||||
### 使用 Docker
|
||||
|
||||
```bash
|
||||
# 构建镜像
|
||||
docker build -f deployment/Dockerfile -t functional-scaffold:latest .
|
||||
|
||||
# 运行容器
|
||||
docker run -p 8000:8000 functional-scaffold:latest
|
||||
|
||||
# 或使用 docker-compose
|
||||
cd deployment
|
||||
docker-compose up
|
||||
|
||||
# 如果阿里FC无法识别 Platform:unknown/unknown 的情况时,请按下列执行打包:
|
||||
export DOCKER_DEFAULT_PLATFORM=linux/amd64
|
||||
export BUILDX_NO_DEFAULT_ATTESTATIONS=1
|
||||
docker compose build
|
||||
docker compose push
|
||||
```
|
||||
|
||||
## API 端点
|
||||
|
||||
### 核心接口
|
||||
|
||||
- `POST /invoke` - 同步调用算法
|
||||
- `POST /jobs` - 创建异步任务
|
||||
- `GET /jobs/{job_id}` - 查询任务状态
|
||||
|
||||
### 健康检查
|
||||
|
||||
- `GET /healthz` - 存活检查
|
||||
- `GET /readyz` - 就绪检查
|
||||
|
||||
### 监控
|
||||
|
||||
- `GET /metrics` - Prometheus 指标
|
||||
|
||||
## 示例请求
|
||||
|
||||
### 同步调用 - 质数判断
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/invoke \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"number": 17}'
|
||||
```
|
||||
|
||||
响应:
|
||||
|
||||
```json
|
||||
{
|
||||
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
||||
"status": "success",
|
||||
"result": {
|
||||
"number": 17,
|
||||
"is_prime": true,
|
||||
"factors": [],
|
||||
"algorithm": "trial_division"
|
||||
},
|
||||
"metadata": {
|
||||
"algorithm": "PrimeChecker",
|
||||
"version": "1.0.0",
|
||||
"elapsed_time": 0.001
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 异步任务
|
||||
|
||||
```bash
|
||||
# 创建任务
|
||||
curl -X POST http://localhost:8000/jobs \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"algorithm": "PrimeChecker", "params": {"number": 17}}'
|
||||
|
||||
# 查询状态
|
||||
curl http://localhost:8000/jobs/{job_id}
|
||||
```
|
||||
|
||||
## 项目结构
|
||||
|
||||
```
|
||||
FunctionalScaffold/
|
||||
├── src/functional_scaffold/ # 核心代码
|
||||
│ ├── algorithms/ # 算法实现
|
||||
│ ├── api/ # API 层
|
||||
│ ├── core/ # 核心功能
|
||||
│ ├── utils/ # 工具函数
|
||||
│ ├── config.py # 配置管理
|
||||
│ └── main.py # 应用入口
|
||||
├── tests/ # 测试
|
||||
├── deployment/ # 部署配置
|
||||
│ ├── Dockerfile
|
||||
│ ├── docker-compose.yml
|
||||
│ ├── kubernetes/
|
||||
│ └── serverless/
|
||||
├── monitoring/ # 监控配置
|
||||
├── scripts/ # 辅助脚本
|
||||
└── docs/ # 文档
|
||||
```
|
||||
|
||||
## 开发指南
|
||||
|
||||
详细的开发指南请参考 [算法开发指南](docs/algorithm-development.md)。
|
||||
|
||||
### 添加新算法
|
||||
|
||||
1. 在 `src/functional_scaffold/algorithms/` 创建新算法文件
|
||||
2. 继承 `BaseAlgorithm` 类并实现 `process` 方法
|
||||
3. 在 API 路由中注册新端点
|
||||
|
||||
示例:
|
||||
|
||||
```python
|
||||
from .base import BaseAlgorithm
|
||||
|
||||
class MyAlgorithm(BaseAlgorithm):
|
||||
def process(self, input_data):
|
||||
# 实现算法逻辑
|
||||
result = do_something(input_data)
|
||||
return {"result": result}
|
||||
```
|
||||
|
||||
### 运行测试
|
||||
|
||||
```bash
|
||||
# 运行所有测试
|
||||
pytest tests/ -v
|
||||
|
||||
# 运行测试并生成覆盖率报告
|
||||
pytest tests/ --cov=src/functional_scaffold --cov-report=html
|
||||
|
||||
# 使用脚本
|
||||
./scripts/run_tests.sh
|
||||
```
|
||||
|
||||
### 代码质量
|
||||
|
||||
```bash
|
||||
# 代码格式化
|
||||
black src/ tests/
|
||||
|
||||
# 代码检查
|
||||
ruff check src/ tests/
|
||||
```
|
||||
|
||||
### 导出 OpenAPI 规范
|
||||
|
||||
```bash
|
||||
python scripts/export_openapi.py
|
||||
```
|
||||
|
||||
生成的文件位于 `docs/swagger/openapi.json`
|
||||
|
||||
## 部署
|
||||
|
||||
### Kubernetes
|
||||
|
||||
```bash
|
||||
kubectl apply -f deployment/kubernetes/
|
||||
```
|
||||
|
||||
### 阿里云函数计算
|
||||
|
||||
```bash
|
||||
fun deploy -t deployment/serverless/aliyun-fc.yaml
|
||||
```
|
||||
|
||||
### AWS Lambda
|
||||
|
||||
```bash
|
||||
sam deploy --template-file deployment/serverless/aws-lambda.yaml
|
||||
```
|
||||
|
||||
## 监控
|
||||
|
||||
详细的监控配置请参考 [监控指南](docs/monitoring.md)。
|
||||
|
||||
### Prometheus 指标
|
||||
|
||||
访问 `/metrics` 端点查看可用指标:
|
||||
|
||||
- `http_requests_total` - HTTP 请求总数
|
||||
- `http_request_duration_seconds` - HTTP 请求延迟
|
||||
- `algorithm_executions_total` - 算法执行总数
|
||||
- `algorithm_execution_duration_seconds` - 算法执行延迟
|
||||
- `jobs_created_total` - 异步任务创建总数
|
||||
- `jobs_completed_total` - 异步任务完成总数
|
||||
|
||||
### Grafana 仪表板
|
||||
|
||||
导入 `monitoring/grafana/dashboard.json` 到 Grafana
|
||||
|
||||
## 配置
|
||||
|
||||
通过环境变量或 `.env` 文件配置:
|
||||
|
||||
```bash
|
||||
# 应用配置
|
||||
APP_NAME=FunctionalScaffold
|
||||
APP_VERSION=1.0.0
|
||||
APP_ENV=development
|
||||
|
||||
# 服务器配置
|
||||
HOST=0.0.0.0
|
||||
PORT=8000
|
||||
WORKERS=4
|
||||
|
||||
# 日志配置
|
||||
LOG_LEVEL=INFO
|
||||
LOG_FORMAT=json
|
||||
|
||||
# 指标配置
|
||||
METRICS_ENABLED=true
|
||||
```
|
||||
|
||||
参考 `.env.example` 查看完整配置选项。
|
||||
|
||||
## 许可证
|
||||
|
||||
MIT License
|
||||
|
||||
## 贡献
|
||||
|
||||
欢迎提交 Issue 和 Pull Request!
|
||||
121
config/metrics.yaml
Normal file
121
config/metrics.yaml
Normal file
@@ -0,0 +1,121 @@
|
||||
# 指标配置文件
|
||||
# 算法成员可以在此添加自定义指标
|
||||
|
||||
# Redis 连接配置(也可通过环境变量覆盖)
|
||||
redis:
|
||||
host: ${REDIS_HOST:localhost}
|
||||
port: ${REDIS_PORT:6379}
|
||||
db: ${REDIS_METRICS_DB:0}
|
||||
password: ${REDIS_PASSWORD:}
|
||||
|
||||
# 全局配置
|
||||
global:
|
||||
prefix: "functional_scaffold" # 指标名称前缀
|
||||
instance_label: true # 是否添加实例标签
|
||||
|
||||
# 内置指标(框架自动收集)
|
||||
builtin_metrics:
|
||||
http_requests:
|
||||
enabled: true
|
||||
name: "http_requests_total"
|
||||
type: counter
|
||||
description: "HTTP 请求总数"
|
||||
labels: [method, endpoint, status]
|
||||
|
||||
http_latency:
|
||||
enabled: true
|
||||
name: "http_request_duration_seconds"
|
||||
type: histogram
|
||||
description: "HTTP 请求延迟"
|
||||
labels: [method, endpoint]
|
||||
buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]
|
||||
|
||||
http_in_progress:
|
||||
enabled: true
|
||||
name: "http_requests_in_progress"
|
||||
type: gauge
|
||||
description: "当前进行中的 HTTP 请求数"
|
||||
labels: []
|
||||
|
||||
algorithm_executions:
|
||||
enabled: true
|
||||
name: "algorithm_executions_total"
|
||||
type: counter
|
||||
description: "算法执行总数"
|
||||
labels: [algorithm, status]
|
||||
|
||||
algorithm_latency:
|
||||
enabled: true
|
||||
name: "algorithm_execution_duration_seconds"
|
||||
type: histogram
|
||||
description: "算法执行延迟"
|
||||
labels: [algorithm]
|
||||
buckets: [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 60]
|
||||
|
||||
# 自定义指标(算法成员在此添加)
|
||||
custom_metrics:
|
||||
# 示例:质数判断结果统计
|
||||
prime_check_results:
|
||||
name: "prime_check_results_total"
|
||||
type: counter
|
||||
description: "质数判断结果统计"
|
||||
labels: [is_prime]
|
||||
|
||||
# 示例:输入数字大小分布
|
||||
input_number_size:
|
||||
name: "input_number_size"
|
||||
type: histogram
|
||||
description: "输入数字大小分布"
|
||||
labels: []
|
||||
buckets: [10, 100, 1000, 10000, 100000, 1000000]
|
||||
|
||||
# 异步任务指标
|
||||
jobs_created:
|
||||
name: "jobs_created_total"
|
||||
type: counter
|
||||
description: "创建的异步任务总数"
|
||||
labels: [algorithm]
|
||||
|
||||
jobs_completed:
|
||||
name: "jobs_completed_total"
|
||||
type: counter
|
||||
description: "完成的异步任务总数"
|
||||
labels: [algorithm, status]
|
||||
|
||||
job_execution_duration:
|
||||
name: "job_execution_duration_seconds"
|
||||
type: histogram
|
||||
description: "异步任务执行时间"
|
||||
labels: [algorithm]
|
||||
buckets: [0.1, 0.5, 1, 5, 10, 30, 60, 120, 300]
|
||||
|
||||
webhook_deliveries:
|
||||
name: "webhook_deliveries_total"
|
||||
type: counter
|
||||
description: "Webhook 回调发送总数"
|
||||
labels: [status]
|
||||
|
||||
# 队列监控指标
|
||||
job_queue_length:
|
||||
name: "job_queue_length"
|
||||
type: gauge
|
||||
description: "待处理任务队列长度"
|
||||
labels: [queue]
|
||||
|
||||
job_oldest_waiting_seconds:
|
||||
name: "job_oldest_waiting_seconds"
|
||||
type: gauge
|
||||
description: "最长任务等待时间(秒)"
|
||||
labels: []
|
||||
|
||||
job_recovered_total:
|
||||
name: "job_recovered_total"
|
||||
type: counter
|
||||
description: "回收的超时任务总数"
|
||||
labels: []
|
||||
|
||||
prime_check_total:
|
||||
name: "prime_check"
|
||||
type: counter
|
||||
description: "出现问题的次数"
|
||||
labels: [status]
|
||||
44
deployment/Dockerfile
Normal file
44
deployment/Dockerfile
Normal file
@@ -0,0 +1,44 @@
|
||||
FROM --platform=linux/amd64 python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# 安装系统依赖
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
gcc \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# 复制依赖文件
|
||||
COPY requirements.txt .
|
||||
COPY requirements-dev.txt .
|
||||
|
||||
# 安装 Python 依赖
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
RUN pip install --no-cache-dir -r requirements-dev.txt
|
||||
|
||||
# 复制应用代码和配置
|
||||
COPY src/ ./src/
|
||||
COPY config/ ./config/
|
||||
COPY pyproject.toml .
|
||||
|
||||
# 安装包(使用 editable 模式)
|
||||
RUN pip install --no-cache-dir -e .
|
||||
|
||||
# 创建非 root 用户
|
||||
RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
|
||||
USER appuser
|
||||
|
||||
# 暴露端口
|
||||
EXPOSE 8000
|
||||
|
||||
# 运行模式:api(默认)或 worker
|
||||
ENV RUN_MODE=api
|
||||
|
||||
# 健康检查(仅对 API 模式有效)
|
||||
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
||||
CMD if [ "$RUN_MODE" = "api" ]; then python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/healthz')"; else exit 0; fi
|
||||
|
||||
# 启动脚本
|
||||
COPY --chown=appuser:appuser deployment/entrypoint.sh /app/entrypoint.sh
|
||||
RUN chmod +x /app/entrypoint.sh
|
||||
|
||||
CMD ["/app/entrypoint.sh"]
|
||||
33
deployment/Dockerfile.redis-exporter
Normal file
33
deployment/Dockerfile.redis-exporter
Normal file
@@ -0,0 +1,33 @@
|
||||
# Redis Exporter Dockerfile
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# 安装依赖
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir redis prometheus-client
|
||||
|
||||
# 复制 exporter 代码
|
||||
COPY src/functional_scaffold/core/metrics_redis_exporter.py .
|
||||
|
||||
# 暴露端口
|
||||
EXPOSE 8001
|
||||
|
||||
# 启动 HTTP 服务器提供指标
|
||||
CMD ["python", "-c", "\
|
||||
from http.server import HTTPServer, BaseHTTPRequestHandler; \
|
||||
from metrics_redis_exporter import get_metrics; \
|
||||
class MetricsHandler(BaseHTTPRequestHandler): \
|
||||
def do_GET(self): \
|
||||
if self.path == '/metrics': \
|
||||
self.send_response(200); \
|
||||
self.send_header('Content-Type', 'text/plain; version=0.0.4'); \
|
||||
self.end_headers(); \
|
||||
self.wfile.write(get_metrics()); \
|
||||
else: \
|
||||
self.send_response(404); \
|
||||
self.end_headers(); \
|
||||
def log_message(self, format, *args): pass; \
|
||||
server = HTTPServer(('0.0.0.0', 8001), MetricsHandler); \
|
||||
print('Redis Exporter 启动在端口 8001'); \
|
||||
server.serve_forever()"]
|
||||
157
deployment/docker-compose.yml
Normal file
157
deployment/docker-compose.yml
Normal file
@@ -0,0 +1,157 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
app:
|
||||
build:
|
||||
context: ..
|
||||
dockerfile: deployment/Dockerfile
|
||||
platform: linux/amd64
|
||||
ports:
|
||||
- "8111:8000"
|
||||
environment:
|
||||
- APP_ENV=development
|
||||
- LOG_LEVEL=INFO
|
||||
- METRICS_ENABLED=true
|
||||
- RUN_MODE=api
|
||||
# Redis 指标存储配置
|
||||
- REDIS_HOST=redis
|
||||
- REDIS_PORT=6379
|
||||
- REDIS_DB=0
|
||||
# 指标配置文件路径
|
||||
- METRICS_CONFIG_PATH=config/metrics.yaml
|
||||
# 日志文件配置
|
||||
- LOG_FILE_ENABLED=false
|
||||
- LOG_FILE_PATH=/var/log/app/app.log
|
||||
volumes:
|
||||
- ../src:/app/src
|
||||
- ../config:/app/config
|
||||
- app_logs:/var/log/app
|
||||
labels:
|
||||
logging: "promtail"
|
||||
logging_jobname: "functional-scaffold-app"
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/healthz')"]
|
||||
interval: 30s
|
||||
timeout: 3s
|
||||
retries: 3
|
||||
start_period: 5s
|
||||
|
||||
# Worker 服务 - 处理异步任务
|
||||
worker:
|
||||
build:
|
||||
context: ..
|
||||
dockerfile: deployment/Dockerfile
|
||||
image: crpi-om2xd9y8cmaizszf.cn-beijing.personal.cr.aliyuncs.com/test-namespace-gu/fc-test:latest
|
||||
environment:
|
||||
- APP_ENV=development
|
||||
- LOG_LEVEL=INFO
|
||||
- METRICS_ENABLED=true
|
||||
- RUN_MODE=worker
|
||||
# Redis 配置
|
||||
- REDIS_HOST=redis
|
||||
- REDIS_PORT=6379
|
||||
- REDIS_DB=0
|
||||
# Worker 配置
|
||||
- WORKER_POLL_INTERVAL=1.0
|
||||
- MAX_CONCURRENT_JOBS=10
|
||||
- JOB_MAX_RETRIES=3
|
||||
- JOB_EXECUTION_TIMEOUT=300
|
||||
volumes:
|
||||
- ../src:/app/src
|
||||
- ../config:/app/config
|
||||
labels:
|
||||
logging: "promtail"
|
||||
logging_jobname: "functional-scaffold-worker"
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
deploy:
|
||||
replicas: 2
|
||||
|
||||
# Redis - 用于集中式指标存储
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
ports:
|
||||
- "6380:6379"
|
||||
volumes:
|
||||
- redis_data:/data
|
||||
command: redis-server --appendonly yes
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 10s
|
||||
timeout: 3s
|
||||
retries: 3
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
ports:
|
||||
- "9090:9090"
|
||||
volumes:
|
||||
- ../monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- ../monitoring/alerts:/etc/prometheus/rules
|
||||
- prometheus_data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
- app
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
ports:
|
||||
- "3000:3000"
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_PASSWORD=admin
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- ../monitoring/grafana/datasources:/etc/grafana/provisioning/datasources
|
||||
- ../monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
- prometheus
|
||||
- loki
|
||||
|
||||
loki:
|
||||
image: grafana/loki:2.9.3
|
||||
ports:
|
||||
- "3100:3100"
|
||||
volumes:
|
||||
- ../monitoring/loki.yaml:/etc/loki/local-config.yaml
|
||||
- loki_data:/loki
|
||||
command: -config.file=/etc/loki/local-config.yaml
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
|
||||
interval: 10s
|
||||
timeout: 3s
|
||||
retries: 3
|
||||
|
||||
promtail:
|
||||
ports:
|
||||
- "9080:9080"
|
||||
image: grafana/promtail:3.0.0
|
||||
volumes:
|
||||
- ../monitoring/promtail.yaml:/etc/promtail/config.yml
|
||||
# Docker stdio 收集
|
||||
- /var/lib/docker/containers:/var/lib/docker/containers:ro
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
# Log 文件收集(备用)
|
||||
- app_logs:/var/log/app:ro
|
||||
command: -config.file=/etc/promtail/config.yml
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
- loki
|
||||
|
||||
volumes:
|
||||
prometheus_data:
|
||||
grafana_data:
|
||||
redis_data:
|
||||
loki_data:
|
||||
app_logs:
|
||||
12
deployment/entrypoint.sh
Normal file
12
deployment/entrypoint.sh
Normal file
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
|
||||
# 启动脚本:根据 RUN_MODE 环境变量选择启动 API 或 Worker
|
||||
|
||||
set -e
|
||||
|
||||
if [ "$RUN_MODE" = "worker" ]; then
|
||||
echo "启动 Worker 模式..."
|
||||
exec python -m functional_scaffold.worker
|
||||
else
|
||||
echo "启动 API 模式..."
|
||||
exec uvicorn functional_scaffold.main:app --host 0.0.0.0 --port 8000
|
||||
fi
|
||||
203
deployment/kubernetes/deployment.yaml
Normal file
203
deployment/kubernetes/deployment.yaml
Normal file
@@ -0,0 +1,203 @@
|
||||
# Kubernetes 部署配置
|
||||
# 包含:ConfigMap、API Deployment、Worker Deployment、Redis Deployment
|
||||
|
||||
---
|
||||
# ConfigMap - 共享配置
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: functional-scaffold-config
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
data:
|
||||
APP_ENV: "production"
|
||||
LOG_LEVEL: "INFO"
|
||||
LOG_FORMAT: "json"
|
||||
METRICS_ENABLED: "true"
|
||||
# Redis 配置(指向集群内 Redis 服务)
|
||||
REDIS_HOST: "functional-scaffold-redis"
|
||||
REDIS_PORT: "6379"
|
||||
REDIS_DB: "0"
|
||||
# 异步任务配置
|
||||
MAX_CONCURRENT_JOBS: "10"
|
||||
JOB_RESULT_TTL: "1800"
|
||||
WEBHOOK_MAX_RETRIES: "3"
|
||||
WEBHOOK_TIMEOUT: "10"
|
||||
# Worker 配置
|
||||
WORKER_POLL_INTERVAL: "1.0"
|
||||
JOB_QUEUE_KEY: "job:queue"
|
||||
JOB_CONCURRENCY_KEY: "job:concurrency"
|
||||
JOB_LOCK_TTL: "300"
|
||||
JOB_MAX_RETRIES: "3"
|
||||
JOB_EXECUTION_TIMEOUT: "300"
|
||||
|
||||
---
|
||||
# API Deployment - HTTP 服务
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: functional-scaffold-api
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
spec:
|
||||
replicas: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
spec:
|
||||
containers:
|
||||
- name: api
|
||||
image: functional-scaffold:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
- name: RUN_MODE
|
||||
value: "api"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: functional-scaffold-config
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: 8000
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 30
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /readyz
|
||||
port: 8000
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
|
||||
---
|
||||
# Worker Deployment - 异步任务处理
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: functional-scaffold-worker
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: worker
|
||||
spec:
|
||||
replicas: 2
|
||||
selector:
|
||||
matchLabels:
|
||||
app: functional-scaffold
|
||||
component: worker
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: worker
|
||||
spec:
|
||||
containers:
|
||||
- name: worker
|
||||
image: functional-scaffold:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
env:
|
||||
- name: RUN_MODE
|
||||
value: "worker"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: functional-scaffold-config
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
# Worker 没有 HTTP 端口,使用命令探针
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- python
|
||||
- -c
|
||||
- "import redis; r = redis.Redis(host='functional-scaffold-redis'); r.ping()"
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 30
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 3
|
||||
|
||||
---
|
||||
# Redis Deployment - 任务队列和状态存储
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: functional-scaffold-redis
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: redis
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: functional-scaffold
|
||||
component: redis
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: redis
|
||||
spec:
|
||||
containers:
|
||||
- name: redis
|
||||
image: redis:7-alpine
|
||||
ports:
|
||||
- containerPort: 6379
|
||||
name: redis
|
||||
command:
|
||||
- redis-server
|
||||
- --appendonly
|
||||
- "yes"
|
||||
resources:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "256Mi"
|
||||
cpu: "200m"
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- redis-cli
|
||||
- ping
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
readinessProbe:
|
||||
exec:
|
||||
command:
|
||||
- redis-cli
|
||||
- ping
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
volumeMounts:
|
||||
- name: redis-data
|
||||
mountPath: /data
|
||||
volumes:
|
||||
- name: redis-data
|
||||
emptyDir: {}
|
||||
66
deployment/kubernetes/service.yaml
Normal file
66
deployment/kubernetes/service.yaml
Normal file
@@ -0,0 +1,66 @@
|
||||
# Kubernetes Service 配置
|
||||
# 包含:API Service、Metrics Service、Redis Service
|
||||
|
||||
---
|
||||
# API Service - 对外暴露 HTTP 服务
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: functional-scaffold-api
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- port: 80
|
||||
targetPort: 8000
|
||||
protocol: TCP
|
||||
name: http
|
||||
selector:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
|
||||
---
|
||||
# Metrics Service - Prometheus 抓取指标
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: functional-scaffold-metrics
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8000"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- port: 8000
|
||||
targetPort: 8000
|
||||
protocol: TCP
|
||||
name: metrics
|
||||
selector:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
|
||||
---
|
||||
# Redis Service - 内部 Redis 服务
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: functional-scaffold-redis
|
||||
labels:
|
||||
app: functional-scaffold
|
||||
component: redis
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- port: 6379
|
||||
targetPort: 6379
|
||||
protocol: TCP
|
||||
name: redis
|
||||
selector:
|
||||
app: functional-scaffold
|
||||
component: redis
|
||||
72
deployment/serverless/aliyun-fc.yaml
Normal file
72
deployment/serverless/aliyun-fc.yaml
Normal file
@@ -0,0 +1,72 @@
|
||||
# 阿里云函数计算配置
|
||||
ROSTemplateFormatVersion: '2015-09-01'
|
||||
Transform: 'Aliyun::Serverless-2018-04-03'
|
||||
Resources:
|
||||
functional-scaffold:
|
||||
Type: 'Aliyun::Serverless::Service'
|
||||
Properties:
|
||||
Description: '算法工程化 Serverless 脚手架'
|
||||
LogConfig:
|
||||
Project: functional-scaffold-logs
|
||||
Logstore: function-logs
|
||||
VpcConfig:
|
||||
VpcId: 'vpc-xxxxx'
|
||||
VSwitchIds:
|
||||
- 'vsw-xxxxx'
|
||||
SecurityGroupId: 'sg-xxxxx'
|
||||
prime-checker:
|
||||
Type: 'Aliyun::Serverless::Function'
|
||||
Properties:
|
||||
Description: '质数判断算法服务(API)'
|
||||
Runtime: custom-container
|
||||
MemorySize: 512
|
||||
Timeout: 60
|
||||
InstanceConcurrency: 10
|
||||
CAPort: 8000
|
||||
CustomContainerConfig:
|
||||
Image: 'registry.cn-hangzhou.aliyuncs.com/your-namespace/functional-scaffold:latest'
|
||||
Command: '["/app/entrypoint.sh"]'
|
||||
EnvironmentVariables:
|
||||
APP_ENV: production
|
||||
LOG_LEVEL: INFO
|
||||
METRICS_ENABLED: 'true'
|
||||
RUN_MODE: api
|
||||
REDIS_HOST: 'r-xxxxx.redis.rds.aliyuncs.com'
|
||||
REDIS_PORT: '6379'
|
||||
Events:
|
||||
httpTrigger:
|
||||
Type: HTTP
|
||||
Properties:
|
||||
AuthType: ANONYMOUS
|
||||
Methods:
|
||||
- GET
|
||||
- POST
|
||||
job-worker:
|
||||
Type: 'Aliyun::Serverless::Function'
|
||||
Properties:
|
||||
Description: '异步任务 Worker'
|
||||
Runtime: custom-container
|
||||
MemorySize: 512
|
||||
Timeout: 900
|
||||
InstanceConcurrency: 1
|
||||
CustomContainerConfig:
|
||||
Image: 'registry.cn-hangzhou.aliyuncs.com/your-namespace/functional-scaffold:latest'
|
||||
Command: '["/app/entrypoint.sh"]'
|
||||
EnvironmentVariables:
|
||||
APP_ENV: production
|
||||
LOG_LEVEL: INFO
|
||||
METRICS_ENABLED: 'true'
|
||||
RUN_MODE: worker
|
||||
REDIS_HOST: 'r-xxxxx.redis.rds.aliyuncs.com'
|
||||
REDIS_PORT: '6379'
|
||||
WORKER_POLL_INTERVAL: '1.0'
|
||||
MAX_CONCURRENT_JOBS: '5'
|
||||
JOB_MAX_RETRIES: '3'
|
||||
JOB_EXECUTION_TIMEOUT: '300'
|
||||
Events:
|
||||
timerTrigger:
|
||||
Type: Timer
|
||||
Properties:
|
||||
CronExpression: '0 */1 * * * *'
|
||||
Enable: true
|
||||
Payload: '{}'
|
||||
46
deployment/serverless/aws-lambda.yaml
Normal file
46
deployment/serverless/aws-lambda.yaml
Normal file
@@ -0,0 +1,46 @@
|
||||
# AWS Lambda 配置(使用 Lambda Container Image)
|
||||
AWSTemplateFormatVersion: '2010-09-09'
|
||||
Transform: AWS::Serverless-2016-10-31
|
||||
Description: FunctionalScaffold Serverless Application
|
||||
|
||||
Globals:
|
||||
Function:
|
||||
Timeout: 60
|
||||
MemorySize: 512
|
||||
Environment:
|
||||
Variables:
|
||||
APP_ENV: production
|
||||
LOG_LEVEL: INFO
|
||||
METRICS_ENABLED: 'true'
|
||||
|
||||
Resources:
|
||||
FunctionalScaffoldFunction:
|
||||
Type: AWS::Serverless::Function
|
||||
Properties:
|
||||
PackageType: Image
|
||||
ImageUri: !Sub '${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/functional-scaffold:latest'
|
||||
Events:
|
||||
ApiEvent:
|
||||
Type: Api
|
||||
Properties:
|
||||
Path: /{proxy+}
|
||||
Method: ANY
|
||||
Policies:
|
||||
- AWSLambdaBasicExecutionRole
|
||||
|
||||
FunctionalScaffoldApi:
|
||||
Type: AWS::Serverless::Api
|
||||
Properties:
|
||||
StageName: prod
|
||||
Cors:
|
||||
AllowMethods: "'*'"
|
||||
AllowHeaders: "'*'"
|
||||
AllowOrigin: "'*'"
|
||||
|
||||
Outputs:
|
||||
ApiUrl:
|
||||
Description: "API Gateway endpoint URL"
|
||||
Value: !Sub "https://${FunctionalScaffoldApi}.execute-api.${AWS::Region}.amazonaws.com/prod/"
|
||||
FunctionArn:
|
||||
Description: "Function ARN"
|
||||
Value: !GetAtt FunctionalScaffoldFunction.Arn
|
||||
556
docs/algorithm-development.md
Normal file
556
docs/algorithm-development.md
Normal file
@@ -0,0 +1,556 @@
|
||||
# 算法开发指南
|
||||
|
||||
本文档详细介绍如何在 FunctionalScaffold 框架中开发算法,包括最佳实践、高级特性和常见模式。
|
||||
|
||||
## 算法架构
|
||||
|
||||
### 核心概念
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ HTTP 请求 │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ FastAPI 路由层 │
|
||||
│ - 参数验证 (Pydantic) │
|
||||
│ - 请求 ID 生成 │
|
||||
│ - 错误处理 │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ BaseAlgorithm.execute() │
|
||||
│ - 自动计时 │
|
||||
│ - 指标记录 │
|
||||
│ - 异常捕获 │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ YourAlgorithm.process() │
|
||||
│ ★ 你只需要实现这个方法 ★ │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### BaseAlgorithm 基类
|
||||
|
||||
所有算法必须继承 `BaseAlgorithm` 类:
|
||||
|
||||
```python
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict
|
||||
|
||||
class BaseAlgorithm(ABC):
|
||||
"""算法基类"""
|
||||
|
||||
def __init__(self):
|
||||
self.name = self.__class__.__name__
|
||||
self.version = "1.0.0"
|
||||
|
||||
@abstractmethod
|
||||
def process(self, *args, **kwargs) -> Dict[str, Any]:
|
||||
"""算法处理逻辑 - 子类必须实现"""
|
||||
pass
|
||||
|
||||
def execute(self, *args, **kwargs) -> Dict[str, Any]:
|
||||
"""执行算法 - 自动处理埋点和错误"""
|
||||
# 框架自动处理:计时、日志、指标、异常
|
||||
pass
|
||||
```
|
||||
|
||||
## 开发算法
|
||||
|
||||
### 基础示例
|
||||
|
||||
```python
|
||||
# src/functional_scaffold/algorithms/text_processor.py
|
||||
from typing import Dict, Any
|
||||
from .base import BaseAlgorithm
|
||||
|
||||
class TextProcessor(BaseAlgorithm):
|
||||
"""文本处理算法"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.version = "1.0.0"
|
||||
|
||||
def process(self, text: str, operation: str = "upper") -> Dict[str, Any]:
|
||||
"""
|
||||
处理文本
|
||||
|
||||
Args:
|
||||
text: 输入文本
|
||||
operation: 操作类型 (upper/lower/reverse)
|
||||
|
||||
Returns:
|
||||
处理结果
|
||||
"""
|
||||
if operation == "upper":
|
||||
result = text.upper()
|
||||
elif operation == "lower":
|
||||
result = text.lower()
|
||||
elif operation == "reverse":
|
||||
result = text[::-1]
|
||||
else:
|
||||
raise ValueError(f"不支持的操作: {operation}")
|
||||
|
||||
return {
|
||||
"original": text,
|
||||
"processed": result,
|
||||
"operation": operation,
|
||||
"length": len(result)
|
||||
}
|
||||
```
|
||||
|
||||
### 带模型加载的算法
|
||||
|
||||
```python
|
||||
# src/functional_scaffold/algorithms/ml_predictor.py
|
||||
from typing import Dict, Any
|
||||
import logging
|
||||
from .base import BaseAlgorithm
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class MLPredictor(BaseAlgorithm):
|
||||
"""机器学习预测算法"""
|
||||
|
||||
def __init__(self, model_path: str = None):
|
||||
super().__init__()
|
||||
self.model = None
|
||||
self.model_path = model_path or "models/default.pkl"
|
||||
self._load_model()
|
||||
|
||||
def _load_model(self):
|
||||
"""加载模型(在初始化时执行一次)"""
|
||||
logger.info(f"加载模型: {self.model_path}")
|
||||
# import joblib
|
||||
# self.model = joblib.load(self.model_path)
|
||||
self.model = "mock_model" # 示例
|
||||
logger.info("模型加载完成")
|
||||
|
||||
def process(self, features: list) -> Dict[str, Any]:
|
||||
"""
|
||||
执行预测
|
||||
|
||||
Args:
|
||||
features: 特征向量
|
||||
|
||||
Returns:
|
||||
预测结果
|
||||
"""
|
||||
if self.model is None:
|
||||
raise RuntimeError("模型未加载")
|
||||
|
||||
# prediction = self.model.predict([features])[0]
|
||||
prediction = sum(features) / len(features) # 示例
|
||||
|
||||
return {
|
||||
"features": features,
|
||||
"prediction": prediction,
|
||||
"model_version": self.version
|
||||
}
|
||||
```
|
||||
|
||||
### 带外部服务调用的算法
|
||||
|
||||
```python
|
||||
# src/functional_scaffold/algorithms/data_fetcher.py
|
||||
from typing import Dict, Any
|
||||
import httpx
|
||||
from .base import BaseAlgorithm
|
||||
from ..config import settings
|
||||
|
||||
class DataFetcher(BaseAlgorithm):
|
||||
"""数据获取算法"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.api_base_url = settings.external_api_url or "https://api.example.com"
|
||||
|
||||
async def _fetch_data(self, endpoint: str) -> dict:
|
||||
"""异步获取数据"""
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.get(f"{self.api_base_url}/{endpoint}")
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def process(self, data_id: str) -> Dict[str, Any]:
|
||||
"""
|
||||
获取并处理数据
|
||||
|
||||
Args:
|
||||
data_id: 数据 ID
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
# 在同步方法中调用异步函数
|
||||
loop = asyncio.new_event_loop()
|
||||
try:
|
||||
data = loop.run_until_complete(self._fetch_data(f"data/{data_id}"))
|
||||
finally:
|
||||
loop.close()
|
||||
|
||||
# 处理数据
|
||||
processed = self._transform(data)
|
||||
|
||||
return {
|
||||
"data_id": data_id,
|
||||
"raw_data": data,
|
||||
"processed_data": processed
|
||||
}
|
||||
|
||||
def _transform(self, data: dict) -> dict:
|
||||
"""数据转换"""
|
||||
return {k: v for k, v in data.items() if v is not None}
|
||||
```
|
||||
|
||||
## 数据模型定义
|
||||
|
||||
### 请求模型
|
||||
|
||||
```python
|
||||
# src/functional_scaffold/api/models.py
|
||||
from pydantic import BaseModel, Field, ConfigDict, field_validator
|
||||
from typing import List, Optional
|
||||
|
||||
class TextProcessRequest(BaseModel):
|
||||
"""文本处理请求"""
|
||||
|
||||
model_config = ConfigDict(
|
||||
json_schema_extra={
|
||||
"example": {
|
||||
"text": "Hello World",
|
||||
"operation": "upper"
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
text: str = Field(..., min_length=1, max_length=10000, description="输入文本")
|
||||
operation: str = Field("upper", description="操作类型")
|
||||
|
||||
@field_validator("operation")
|
||||
@classmethod
|
||||
def validate_operation(cls, v):
|
||||
allowed = ["upper", "lower", "reverse"]
|
||||
if v not in allowed:
|
||||
raise ValueError(f"operation 必须是 {allowed} 之一")
|
||||
return v
|
||||
|
||||
|
||||
class MLPredictRequest(BaseModel):
|
||||
"""ML 预测请求"""
|
||||
|
||||
model_config = ConfigDict(
|
||||
json_schema_extra={
|
||||
"example": {
|
||||
"features": [1.0, 2.0, 3.0, 4.0]
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
features: List[float] = Field(..., min_length=1, description="特征向量")
|
||||
```
|
||||
|
||||
### 响应模型
|
||||
|
||||
```python
|
||||
class TextProcessResponse(BaseModel):
|
||||
"""文本处理响应"""
|
||||
|
||||
request_id: str = Field(..., description="请求 ID")
|
||||
status: str = Field(..., description="处理状态")
|
||||
result: Dict[str, Any] = Field(..., description="处理结果")
|
||||
metadata: Dict[str, Any] = Field(..., description="元数据")
|
||||
```
|
||||
|
||||
## 路由注册
|
||||
|
||||
### 同步接口
|
||||
|
||||
```python
|
||||
# src/functional_scaffold/api/routes.py
|
||||
from fastapi import APIRouter, HTTPException, Depends, status
|
||||
from .models import TextProcessRequest, TextProcessResponse
|
||||
from .dependencies import get_request_id
|
||||
from ..algorithms.text_processor import TextProcessor
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.post(
|
||||
"/text/process",
|
||||
response_model=TextProcessResponse,
|
||||
summary="文本处理",
|
||||
description="对输入文本执行指定操作",
|
||||
)
|
||||
async def process_text(
|
||||
request: TextProcessRequest,
|
||||
request_id: str = Depends(get_request_id),
|
||||
):
|
||||
"""文本处理端点"""
|
||||
try:
|
||||
processor = TextProcessor()
|
||||
result = processor.execute(request.text, request.operation)
|
||||
|
||||
if not result["success"]:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail={"error": "ALGORITHM_ERROR", "message": result["error"]}
|
||||
)
|
||||
|
||||
return TextProcessResponse(
|
||||
request_id=request_id,
|
||||
status="success",
|
||||
result=result["result"],
|
||||
metadata=result["metadata"]
|
||||
)
|
||||
|
||||
except ValueError as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail={"error": "VALIDATION_ERROR", "message": str(e)}
|
||||
)
|
||||
```
|
||||
|
||||
### 异步任务接口
|
||||
|
||||
算法注册后自动支持异步调用,无需额外代码:
|
||||
|
||||
```bash
|
||||
# 创建异步任务
|
||||
curl -X POST http://localhost:8000/jobs \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"algorithm": "TextProcessor",
|
||||
"params": {"text": "hello", "operation": "upper"}
|
||||
}'
|
||||
```
|
||||
|
||||
## 自定义指标
|
||||
|
||||
### 定义指标
|
||||
|
||||
在 `config/metrics.yaml` 中添加:
|
||||
|
||||
```yaml
|
||||
custom_metrics:
|
||||
# 文本处理统计
|
||||
text_process_total:
|
||||
name: "text_process_total"
|
||||
type: counter
|
||||
description: "文本处理总数"
|
||||
labels: [operation]
|
||||
|
||||
text_length_histogram:
|
||||
name: "text_length_histogram"
|
||||
type: histogram
|
||||
description: "处理文本长度分布"
|
||||
labels: []
|
||||
buckets: [10, 50, 100, 500, 1000, 5000, 10000]
|
||||
```
|
||||
|
||||
### 记录指标
|
||||
|
||||
```python
|
||||
from ..core.metrics_unified import incr, observe
|
||||
|
||||
class TextProcessor(BaseAlgorithm):
|
||||
def process(self, text: str, operation: str = "upper") -> Dict[str, Any]:
|
||||
# 记录操作计数
|
||||
incr("text_process_total", {"operation": operation})
|
||||
|
||||
# 记录文本长度
|
||||
observe("text_length_histogram", {}, len(text))
|
||||
|
||||
# ... 算法逻辑 ...
|
||||
```
|
||||
|
||||
## 错误处理
|
||||
|
||||
### 自定义异常
|
||||
|
||||
```python
|
||||
# src/functional_scaffold/core/errors.py
|
||||
class AlgorithmError(FunctionalScaffoldError):
|
||||
"""算法执行错误"""
|
||||
pass
|
||||
|
||||
class ModelNotLoadedError(AlgorithmError):
|
||||
"""模型未加载错误"""
|
||||
pass
|
||||
|
||||
class InvalidInputError(AlgorithmError):
|
||||
"""无效输入错误"""
|
||||
pass
|
||||
```
|
||||
|
||||
### 在算法中使用
|
||||
|
||||
```python
|
||||
from ..core.errors import InvalidInputError, ModelNotLoadedError
|
||||
|
||||
class MLPredictor(BaseAlgorithm):
|
||||
def process(self, features: list) -> Dict[str, Any]:
|
||||
if not features:
|
||||
raise InvalidInputError("特征向量不能为空")
|
||||
|
||||
if self.model is None:
|
||||
raise ModelNotLoadedError("模型未加载,请检查模型文件")
|
||||
|
||||
# ... 算法逻辑 ...
|
||||
```
|
||||
|
||||
## 测试
|
||||
|
||||
### 单元测试
|
||||
|
||||
```python
|
||||
# tests/test_text_processor.py
|
||||
import pytest
|
||||
from functional_scaffold.algorithms.text_processor import TextProcessor
|
||||
|
||||
class TestTextProcessor:
|
||||
"""文本处理算法测试"""
|
||||
|
||||
def test_upper_operation(self):
|
||||
"""测试大写转换"""
|
||||
processor = TextProcessor()
|
||||
result = processor.process("hello", "upper")
|
||||
|
||||
assert result["processed"] == "HELLO"
|
||||
assert result["operation"] == "upper"
|
||||
|
||||
def test_lower_operation(self):
|
||||
"""测试小写转换"""
|
||||
processor = TextProcessor()
|
||||
result = processor.process("HELLO", "lower")
|
||||
|
||||
assert result["processed"] == "hello"
|
||||
|
||||
def test_reverse_operation(self):
|
||||
"""测试反转"""
|
||||
processor = TextProcessor()
|
||||
result = processor.process("hello", "reverse")
|
||||
|
||||
assert result["processed"] == "olleh"
|
||||
|
||||
def test_invalid_operation(self):
|
||||
"""测试无效操作"""
|
||||
processor = TextProcessor()
|
||||
|
||||
with pytest.raises(ValueError, match="不支持的操作"):
|
||||
processor.process("hello", "invalid")
|
||||
|
||||
def test_execute_wrapper(self):
|
||||
"""测试 execute 包装器"""
|
||||
processor = TextProcessor()
|
||||
result = processor.execute("hello", "upper")
|
||||
|
||||
assert result["success"] is True
|
||||
assert result["result"]["processed"] == "HELLO"
|
||||
assert "elapsed_time" in result["metadata"]
|
||||
```
|
||||
|
||||
### API 集成测试
|
||||
|
||||
```python
|
||||
# tests/test_text_api.py
|
||||
import pytest
|
||||
from fastapi import status
|
||||
|
||||
class TestTextProcessAPI:
|
||||
"""文本处理 API 测试"""
|
||||
|
||||
def test_process_text_success(self, client):
|
||||
"""测试成功处理"""
|
||||
response = client.post(
|
||||
"/text/process",
|
||||
json={"text": "hello", "operation": "upper"}
|
||||
)
|
||||
|
||||
assert response.status_code == status.HTTP_200_OK
|
||||
data = response.json()
|
||||
assert data["status"] == "success"
|
||||
assert data["result"]["processed"] == "HELLO"
|
||||
|
||||
def test_process_text_invalid_operation(self, client):
|
||||
"""测试无效操作"""
|
||||
response = client.post(
|
||||
"/text/process",
|
||||
json={"text": "hello", "operation": "invalid"}
|
||||
)
|
||||
|
||||
assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY
|
||||
```
|
||||
|
||||
## 最佳实践
|
||||
|
||||
### 1. 保持 process() 方法简洁
|
||||
|
||||
```python
|
||||
# ✅ 好的做法
|
||||
def process(self, data):
|
||||
validated = self._validate(data)
|
||||
transformed = self._transform(validated)
|
||||
result = self._compute(transformed)
|
||||
return self._format_output(result)
|
||||
|
||||
# ❌ 避免
|
||||
def process(self, data):
|
||||
# 200 行代码全部写在这里...
|
||||
```
|
||||
|
||||
### 2. 使用类型注解
|
||||
|
||||
```python
|
||||
# ✅ 好的做法
|
||||
def process(self, text: str, max_length: int = 100) -> Dict[str, Any]:
|
||||
...
|
||||
|
||||
# ❌ 避免
|
||||
def process(self, text, max_length=100):
|
||||
...
|
||||
```
|
||||
|
||||
### 3. 合理使用日志
|
||||
|
||||
```python
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class MyAlgorithm(BaseAlgorithm):
|
||||
def process(self, data):
|
||||
logger.info(f"开始处理数据,大小: {len(data)}")
|
||||
# ... 处理逻辑 ...
|
||||
logger.info(f"处理完成,结果大小: {len(result)}")
|
||||
return result
|
||||
```
|
||||
|
||||
### 4. 资源管理
|
||||
|
||||
```python
|
||||
class ResourceIntensiveAlgorithm(BaseAlgorithm):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._resource = None
|
||||
|
||||
def _ensure_resource(self):
|
||||
"""延迟加载资源"""
|
||||
if self._resource is None:
|
||||
self._resource = self._load_heavy_resource()
|
||||
|
||||
def process(self, data):
|
||||
self._ensure_resource()
|
||||
return self._resource.process(data)
|
||||
```
|
||||
|
||||
## 参考
|
||||
|
||||
- [快速入门指南](./getting-started.md)
|
||||
- [API 参考文档](./api-reference.md)
|
||||
- [监控指南](./monitoring.md)
|
||||
441
docs/api-reference.md
Normal file
441
docs/api-reference.md
Normal file
@@ -0,0 +1,441 @@
|
||||
# API 参考文档
|
||||
|
||||
本文档详细描述 FunctionalScaffold 提供的所有 API 端点。
|
||||
|
||||
## 基础信息
|
||||
|
||||
- **Base URL**: `http://localhost:8000`
|
||||
- **Content-Type**: `application/json`
|
||||
- **认证**: 当前版本无需认证
|
||||
|
||||
## 端点概览
|
||||
|
||||
| 方法 | 端点 | 描述 |
|
||||
|------|------|------|
|
||||
| POST | `/invoke` | 同步调用算法 |
|
||||
| POST | `/jobs` | 创建异步任务 |
|
||||
| GET | `/jobs/{job_id}` | 查询任务状态 |
|
||||
| GET | `/healthz` | 存活检查 |
|
||||
| GET | `/readyz` | 就绪检查 |
|
||||
| GET | `/metrics` | Prometheus 指标 |
|
||||
|
||||
---
|
||||
|
||||
## 同步调用接口
|
||||
|
||||
### POST /invoke
|
||||
|
||||
同步调用质数判断算法,立即返回结果。
|
||||
|
||||
#### 请求
|
||||
|
||||
```http
|
||||
POST /invoke
|
||||
Content-Type: application/json
|
||||
```
|
||||
|
||||
**请求体**
|
||||
|
||||
| 字段 | 类型 | 必填 | 描述 |
|
||||
|------|------|------|------|
|
||||
| number | integer | 是 | 待判断的整数 |
|
||||
|
||||
**示例**
|
||||
|
||||
```json
|
||||
{
|
||||
"number": 17
|
||||
}
|
||||
```
|
||||
|
||||
#### 响应
|
||||
|
||||
**成功响应 (200 OK)**
|
||||
|
||||
```json
|
||||
{
|
||||
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
||||
"status": "success",
|
||||
"result": {
|
||||
"number": 17,
|
||||
"is_prime": true,
|
||||
"factors": [],
|
||||
"algorithm": "trial_division"
|
||||
},
|
||||
"metadata": {
|
||||
"algorithm": "PrimeChecker",
|
||||
"version": "1.0.0",
|
||||
"elapsed_time": 0.001
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**错误响应 (400 Bad Request)**
|
||||
|
||||
```json
|
||||
{
|
||||
"error": "VALIDATION_ERROR",
|
||||
"message": "number must be an integer",
|
||||
"details": {"field": "number", "value": "abc"},
|
||||
"request_id": "550e8400-e29b-41d4-a716-446655440000"
|
||||
}
|
||||
```
|
||||
|
||||
#### 示例调用
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/invoke \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"number": 17}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 异步任务接口
|
||||
|
||||
### POST /jobs
|
||||
|
||||
创建异步任务,立即返回任务 ID,任务在后台执行。
|
||||
|
||||
#### 请求
|
||||
|
||||
```http
|
||||
POST /jobs
|
||||
Content-Type: application/json
|
||||
```
|
||||
|
||||
**请求体**
|
||||
|
||||
| 字段 | 类型 | 必填 | 描述 |
|
||||
|------|------|------|------|
|
||||
| algorithm | string | 是 | 算法名称(如 PrimeChecker) |
|
||||
| params | object | 是 | 算法参数 |
|
||||
| webhook | string | 否 | 任务完成后的回调 URL |
|
||||
|
||||
**示例**
|
||||
|
||||
```json
|
||||
{
|
||||
"algorithm": "PrimeChecker",
|
||||
"params": {"number": 17},
|
||||
"webhook": "https://example.com/callback"
|
||||
}
|
||||
```
|
||||
|
||||
#### 响应
|
||||
|
||||
**成功响应 (202 Accepted)**
|
||||
|
||||
```json
|
||||
{
|
||||
"job_id": "a1b2c3d4e5f6",
|
||||
"status": "pending",
|
||||
"message": "任务已创建",
|
||||
"created_at": "2026-02-02T10:00:00+00:00"
|
||||
}
|
||||
```
|
||||
|
||||
**错误响应 (404 Not Found)**
|
||||
|
||||
```json
|
||||
{
|
||||
"detail": {
|
||||
"error": "ALGORITHM_NOT_FOUND",
|
||||
"message": "算法 'NonExistent' 不存在",
|
||||
"details": {"available_algorithms": ["PrimeChecker"]},
|
||||
"request_id": "xxx"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**错误响应 (503 Service Unavailable)**
|
||||
|
||||
```json
|
||||
{
|
||||
"detail": {
|
||||
"error": "SERVICE_UNAVAILABLE",
|
||||
"message": "任务服务暂不可用,请稍后重试",
|
||||
"request_id": "xxx"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### 示例调用
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/jobs \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"algorithm": "PrimeChecker",
|
||||
"params": {"number": 17},
|
||||
"webhook": "https://webhook.site/your-uuid"
|
||||
}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### GET /jobs/{job_id}
|
||||
|
||||
查询异步任务的执行状态和结果。
|
||||
|
||||
#### 请求
|
||||
|
||||
```http
|
||||
GET /jobs/{job_id}
|
||||
```
|
||||
|
||||
**路径参数**
|
||||
|
||||
| 参数 | 类型 | 描述 |
|
||||
|------|------|------|
|
||||
| job_id | string | 任务唯一标识(12位十六进制) |
|
||||
|
||||
#### 响应
|
||||
|
||||
**成功响应 (200 OK) - 任务进行中**
|
||||
|
||||
```json
|
||||
{
|
||||
"job_id": "a1b2c3d4e5f6",
|
||||
"status": "running",
|
||||
"algorithm": "PrimeChecker",
|
||||
"created_at": "2026-02-02T10:00:00+00:00",
|
||||
"started_at": "2026-02-02T10:00:01+00:00",
|
||||
"completed_at": null,
|
||||
"result": null,
|
||||
"error": null,
|
||||
"metadata": null
|
||||
}
|
||||
```
|
||||
|
||||
**成功响应 (200 OK) - 任务完成**
|
||||
|
||||
```json
|
||||
{
|
||||
"job_id": "a1b2c3d4e5f6",
|
||||
"status": "completed",
|
||||
"algorithm": "PrimeChecker",
|
||||
"created_at": "2026-02-02T10:00:00+00:00",
|
||||
"started_at": "2026-02-02T10:00:01+00:00",
|
||||
"completed_at": "2026-02-02T10:00:02+00:00",
|
||||
"result": {
|
||||
"number": 17,
|
||||
"is_prime": true,
|
||||
"factors": [],
|
||||
"algorithm": "trial_division"
|
||||
},
|
||||
"error": null,
|
||||
"metadata": {
|
||||
"algorithm": "PrimeChecker",
|
||||
"version": "1.0.0",
|
||||
"elapsed_time": 0.001
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**成功响应 (200 OK) - 任务失败**
|
||||
|
||||
```json
|
||||
{
|
||||
"job_id": "a1b2c3d4e5f6",
|
||||
"status": "failed",
|
||||
"algorithm": "PrimeChecker",
|
||||
"created_at": "2026-02-02T10:00:00+00:00",
|
||||
"started_at": "2026-02-02T10:00:01+00:00",
|
||||
"completed_at": "2026-02-02T10:00:02+00:00",
|
||||
"result": null,
|
||||
"error": "Invalid input: number must be positive",
|
||||
"metadata": null
|
||||
}
|
||||
```
|
||||
|
||||
**错误响应 (404 Not Found)**
|
||||
|
||||
```json
|
||||
{
|
||||
"detail": {
|
||||
"error": "JOB_NOT_FOUND",
|
||||
"message": "任务 'xxx' 不存在或已过期"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### 任务状态说明
|
||||
|
||||
| 状态 | 描述 |
|
||||
|------|------|
|
||||
| pending | 等待执行 |
|
||||
| running | 执行中 |
|
||||
| completed | 已完成 |
|
||||
| failed | 执行失败 |
|
||||
|
||||
#### 示例调用
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/jobs/a1b2c3d4e5f6
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Webhook 回调
|
||||
|
||||
当任务完成时,如果指定了 webhook URL,系统会发送 POST 请求到该 URL。
|
||||
|
||||
**回调请求**
|
||||
|
||||
```http
|
||||
POST {webhook_url}
|
||||
Content-Type: application/json
|
||||
```
|
||||
|
||||
**回调负载**
|
||||
|
||||
```json
|
||||
{
|
||||
"job_id": "a1b2c3d4e5f6",
|
||||
"status": "completed",
|
||||
"algorithm": "PrimeChecker",
|
||||
"result": {"number": 17, "is_prime": true},
|
||||
"error": null,
|
||||
"metadata": {"elapsed_time": 0.001},
|
||||
"completed_at": "2026-02-02T10:00:02+00:00"
|
||||
}
|
||||
```
|
||||
|
||||
**重试机制**
|
||||
|
||||
- 最大重试次数:3 次
|
||||
- 重试间隔:1s, 5s, 15s(递增退避)
|
||||
- 超时时间:10 秒
|
||||
|
||||
---
|
||||
|
||||
## 健康检查接口
|
||||
|
||||
### GET /healthz
|
||||
|
||||
存活检查端点,用于 Kubernetes 存活探针。
|
||||
|
||||
#### 响应
|
||||
|
||||
**成功响应 (200 OK)**
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "healthy",
|
||||
"timestamp": 1706868000.123
|
||||
}
|
||||
```
|
||||
|
||||
#### 示例调用
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/healthz
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### GET /readyz
|
||||
|
||||
就绪检查端点,用于 Kubernetes 就绪探针。
|
||||
|
||||
#### 响应
|
||||
|
||||
**成功响应 (200 OK)**
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "ready",
|
||||
"timestamp": 1706868000.123,
|
||||
"checks": {
|
||||
"algorithm": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### 示例调用
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/readyz
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 监控接口
|
||||
|
||||
### GET /metrics
|
||||
|
||||
返回 Prometheus 格式的监控指标。
|
||||
|
||||
#### 响应
|
||||
|
||||
**成功响应 (200 OK)**
|
||||
|
||||
```
|
||||
Content-Type: text/plain; version=0.0.4; charset=utf-8
|
||||
|
||||
# HELP http_requests_total HTTP 请求总数
|
||||
# TYPE http_requests_total counter
|
||||
http_requests_total{endpoint="/invoke",method="POST",status="success"} 42
|
||||
|
||||
# HELP http_request_duration_seconds HTTP 请求延迟
|
||||
# TYPE http_request_duration_seconds histogram
|
||||
http_request_duration_seconds_bucket{endpoint="/invoke",method="POST",le="0.01"} 35
|
||||
...
|
||||
|
||||
# HELP algorithm_executions_total 算法执行总数
|
||||
# TYPE algorithm_executions_total counter
|
||||
algorithm_executions_total{algorithm="PrimeChecker",status="success"} 42
|
||||
|
||||
# HELP jobs_created_total 创建的异步任务总数
|
||||
# TYPE jobs_created_total counter
|
||||
jobs_created_total{algorithm="PrimeChecker"} 10
|
||||
|
||||
# HELP jobs_completed_total 完成的异步任务总数
|
||||
# TYPE jobs_completed_total counter
|
||||
jobs_completed_total{algorithm="PrimeChecker",status="completed"} 8
|
||||
jobs_completed_total{algorithm="PrimeChecker",status="failed"} 2
|
||||
```
|
||||
|
||||
#### 可用指标
|
||||
|
||||
| 指标名称 | 类型 | 标签 | 描述 |
|
||||
|---------|------|------|------|
|
||||
| http_requests_total | counter | method, endpoint, status | HTTP 请求总数 |
|
||||
| http_request_duration_seconds | histogram | method, endpoint | HTTP 请求延迟 |
|
||||
| http_requests_in_progress | gauge | - | 当前进行中的请求数 |
|
||||
| algorithm_executions_total | counter | algorithm, status | 算法执行总数 |
|
||||
| algorithm_execution_duration_seconds | histogram | algorithm | 算法执行延迟 |
|
||||
| jobs_created_total | counter | algorithm | 创建的异步任务总数 |
|
||||
| jobs_completed_total | counter | algorithm, status | 完成的异步任务总数 |
|
||||
| job_execution_duration_seconds | histogram | algorithm | 异步任务执行时间 |
|
||||
| webhook_deliveries_total | counter | status | Webhook 回调发送总数 |
|
||||
|
||||
#### 示例调用
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/metrics
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 错误码说明
|
||||
|
||||
| 错误码 | HTTP 状态码 | 描述 |
|
||||
|--------|------------|------|
|
||||
| VALIDATION_ERROR | 400 | 请求参数验证失败 |
|
||||
| ALGORITHM_NOT_FOUND | 404 | 指定的算法不存在 |
|
||||
| JOB_NOT_FOUND | 404 | 任务不存在或已过期 |
|
||||
| ALGORITHM_ERROR | 500 | 算法执行错误 |
|
||||
| INTERNAL_ERROR | 500 | 服务器内部错误 |
|
||||
| SERVICE_UNAVAILABLE | 503 | 服务暂不可用 |
|
||||
|
||||
---
|
||||
|
||||
## 在线文档
|
||||
|
||||
启动服务后,可以访问交互式 API 文档:
|
||||
|
||||
- **Swagger UI**: http://localhost:8000/docs
|
||||
- **ReDoc**: http://localhost:8000/redoc
|
||||
- **OpenAPI JSON**: http://localhost:8000/openapi.json
|
||||
58
docs/api/README.md
Normal file
58
docs/api/README.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# API 文档
|
||||
|
||||
本目录包含 API 相关文档和自动生成的 OpenAPI 规范。
|
||||
|
||||
## 文件说明
|
||||
|
||||
- `openapi.json` - 自动生成的 OpenAPI 3.0 规范文件
|
||||
|
||||
## 生成文档
|
||||
|
||||
运行以下命令更新 OpenAPI 规范:
|
||||
|
||||
```bash
|
||||
python scripts/export_openapi.py
|
||||
```
|
||||
|
||||
## 在线文档
|
||||
|
||||
启动应用后,访问以下 URL 查看交互式文档:
|
||||
|
||||
| 文档类型 | 地址 |
|
||||
|---------|------|
|
||||
| Swagger UI | http://localhost:8000/docs |
|
||||
| ReDoc | http://localhost:8000/redoc |
|
||||
| OpenAPI JSON | http://localhost:8000/openapi.json |
|
||||
|
||||
## API 端点
|
||||
|
||||
### 核心接口
|
||||
|
||||
| 方法 | 端点 | 描述 |
|
||||
|------|------|------|
|
||||
| POST | `/invoke` | 同步调用算法 |
|
||||
| POST | `/jobs` | 创建异步任务 |
|
||||
| GET | `/jobs/{job_id}` | 查询任务状态 |
|
||||
|
||||
### 健康检查
|
||||
|
||||
| 方法 | 端点 | 描述 |
|
||||
|------|------|------|
|
||||
| GET | `/healthz` | 存活检查 |
|
||||
| GET | `/readyz` | 就绪检查 |
|
||||
|
||||
### 监控
|
||||
|
||||
| 方法 | 端点 | 描述 |
|
||||
|------|------|------|
|
||||
| GET | `/metrics` | Prometheus 指标 |
|
||||
|
||||
## 详细文档
|
||||
|
||||
完整的 API 参考文档请查看:[API 参考文档](../api-reference.md)
|
||||
|
||||
## 注意事项
|
||||
|
||||
- `openapi.json` 是自动生成的,请勿手动编辑
|
||||
- API 变更后需要重新运行导出脚本
|
||||
- 确保 Pydantic 模型包含完整的文档字符串和示例
|
||||
736
docs/api/openapi.json
Normal file
736
docs/api/openapi.json
Normal file
@@ -0,0 +1,736 @@
|
||||
{
|
||||
"openapi": "3.1.0",
|
||||
"info": {
|
||||
"title": "FunctionalScaffold",
|
||||
"description": "算法工程化 Serverless 脚手架 - 提供标准化的算法服务接口",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
"paths": {
|
||||
"/invoke": {
|
||||
"post": {
|
||||
"tags": [
|
||||
"Algorithm"
|
||||
],
|
||||
"summary": "同步调用算法",
|
||||
"description": "同步调用质数判断算法,立即返回结果",
|
||||
"operationId": "invoke_algorithm_invoke_post",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "x-request-id",
|
||||
"in": "header",
|
||||
"required": false,
|
||||
"schema": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"title": "X-Request-Id"
|
||||
}
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/InvokeRequest"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "成功",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/InvokeResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"description": "请求参数错误",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ErrorResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "服务器内部错误",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ErrorResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"422": {
|
||||
"description": "Validation Error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/HTTPValidationError"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/healthz": {
|
||||
"get": {
|
||||
"tags": [
|
||||
"Algorithm"
|
||||
],
|
||||
"summary": "健康检查",
|
||||
"description": "检查服务是否存活",
|
||||
"operationId": "health_check_healthz_get",
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful Response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/HealthResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/readyz": {
|
||||
"get": {
|
||||
"tags": [
|
||||
"Algorithm"
|
||||
],
|
||||
"summary": "就绪检查",
|
||||
"description": "检查服务是否就绪",
|
||||
"operationId": "readiness_check_readyz_get",
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful Response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ReadinessResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/jobs": {
|
||||
"post": {
|
||||
"tags": [
|
||||
"Algorithm"
|
||||
],
|
||||
"summary": "创建异步任务",
|
||||
"description": "创建异步任务,立即返回任务 ID,任务在后台执行",
|
||||
"operationId": "create_job_jobs_post",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "x-request-id",
|
||||
"in": "header",
|
||||
"required": false,
|
||||
"schema": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"title": "X-Request-Id"
|
||||
}
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/JobRequest"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"202": {
|
||||
"description": "任务已创建",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/JobCreateResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"description": "请求参数错误",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ErrorResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"404": {
|
||||
"description": "算法不存在",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ErrorResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"503": {
|
||||
"description": "服务不可用",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ErrorResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"422": {
|
||||
"description": "Validation Error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/HTTPValidationError"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/jobs/{job_id}": {
|
||||
"get": {
|
||||
"tags": [
|
||||
"Algorithm"
|
||||
],
|
||||
"summary": "查询任务状态",
|
||||
"description": "查询异步任务的执行状态和结果",
|
||||
"operationId": "get_job_status_jobs__job_id__get",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "job_id",
|
||||
"in": "path",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string",
|
||||
"title": "Job Id"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "成功",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/JobStatusResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"404": {
|
||||
"description": "任务不存在或已过期",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ErrorResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"503": {
|
||||
"description": "服务不可用",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ErrorResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"422": {
|
||||
"description": "Validation Error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/HTTPValidationError"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/metrics": {
|
||||
"get": {
|
||||
"tags": [
|
||||
"Monitoring"
|
||||
],
|
||||
"summary": "Prometheus 指标",
|
||||
"description": "导出 Prometheus 格式的监控指标",
|
||||
"operationId": "metrics_metrics_get",
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful Response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"components": {
|
||||
"schemas": {
|
||||
"ErrorResponse": {
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"title": "Error",
|
||||
"description": "错误代码"
|
||||
},
|
||||
"message": {
|
||||
"type": "string",
|
||||
"title": "Message",
|
||||
"description": "错误消息"
|
||||
},
|
||||
"details": {
|
||||
"anyOf": [
|
||||
{
|
||||
"additionalProperties": true,
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"title": "Details",
|
||||
"description": "错误详情"
|
||||
},
|
||||
"request_id": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"title": "Request Id",
|
||||
"description": "请求ID"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"required": [
|
||||
"error",
|
||||
"message"
|
||||
],
|
||||
"title": "ErrorResponse",
|
||||
"description": "错误响应",
|
||||
"example": {
|
||||
"details": {
|
||||
"field": "number",
|
||||
"value": "abc"
|
||||
},
|
||||
"error": "VALIDATION_ERROR",
|
||||
"message": "number must be an integer",
|
||||
"request_id": "550e8400-e29b-41d4-a716-446655440000"
|
||||
}
|
||||
},
|
||||
"HTTPValidationError": {
|
||||
"properties": {
|
||||
"detail": {
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/ValidationError"
|
||||
},
|
||||
"type": "array",
|
||||
"title": "Detail"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"title": "HTTPValidationError"
|
||||
},
|
||||
"HealthResponse": {
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"title": "Status",
|
||||
"description": "健康状态"
|
||||
},
|
||||
"timestamp": {
|
||||
"type": "number",
|
||||
"title": "Timestamp",
|
||||
"description": "时间戳"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"required": [
|
||||
"status",
|
||||
"timestamp"
|
||||
],
|
||||
"title": "HealthResponse",
|
||||
"description": "健康检查响应"
|
||||
},
|
||||
"InvokeRequest": {
|
||||
"properties": {
|
||||
"number": {
|
||||
"type": "integer",
|
||||
"title": "Number",
|
||||
"description": "待判断的整数"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"required": [
|
||||
"number"
|
||||
],
|
||||
"title": "InvokeRequest",
|
||||
"description": "同步调用请求",
|
||||
"example": {
|
||||
"number": 17
|
||||
}
|
||||
},
|
||||
"InvokeResponse": {
|
||||
"properties": {
|
||||
"request_id": {
|
||||
"type": "string",
|
||||
"title": "Request Id",
|
||||
"description": "请求唯一标识"
|
||||
},
|
||||
"status": {
|
||||
"type": "string",
|
||||
"title": "Status",
|
||||
"description": "处理状态"
|
||||
},
|
||||
"result": {
|
||||
"additionalProperties": true,
|
||||
"type": "object",
|
||||
"title": "Result",
|
||||
"description": "算法执行结果"
|
||||
},
|
||||
"metadata": {
|
||||
"additionalProperties": true,
|
||||
"type": "object",
|
||||
"title": "Metadata",
|
||||
"description": "元数据信息"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"required": [
|
||||
"request_id",
|
||||
"status",
|
||||
"result",
|
||||
"metadata"
|
||||
],
|
||||
"title": "InvokeResponse",
|
||||
"description": "同步调用响应",
|
||||
"example": {
|
||||
"metadata": {
|
||||
"algorithm": "PrimeChecker",
|
||||
"elapsed_time": 0.001,
|
||||
"version": "1.0.0"
|
||||
},
|
||||
"request_id": "550e8400-e29b-41d4-a716-446655440000",
|
||||
"result": {
|
||||
"algorithm": "trial_division",
|
||||
"factors": [],
|
||||
"is_prime": true,
|
||||
"number": 17
|
||||
},
|
||||
"status": "success"
|
||||
}
|
||||
},
|
||||
"JobCreateResponse": {
|
||||
"properties": {
|
||||
"job_id": {
|
||||
"type": "string",
|
||||
"title": "Job Id",
|
||||
"description": "任务唯一标识"
|
||||
},
|
||||
"status": {
|
||||
"$ref": "#/components/schemas/JobStatus",
|
||||
"description": "任务状态"
|
||||
},
|
||||
"message": {
|
||||
"type": "string",
|
||||
"title": "Message",
|
||||
"description": "状态消息"
|
||||
},
|
||||
"created_at": {
|
||||
"type": "string",
|
||||
"title": "Created At",
|
||||
"description": "创建时间(ISO 8601)"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"required": [
|
||||
"job_id",
|
||||
"status",
|
||||
"message",
|
||||
"created_at"
|
||||
],
|
||||
"title": "JobCreateResponse",
|
||||
"description": "任务创建响应",
|
||||
"example": {
|
||||
"created_at": "2026-02-02T10:00:00Z",
|
||||
"job_id": "a1b2c3d4e5f6",
|
||||
"message": "任务已创建",
|
||||
"status": "pending"
|
||||
}
|
||||
},
|
||||
"JobRequest": {
|
||||
"properties": {
|
||||
"algorithm": {
|
||||
"type": "string",
|
||||
"title": "Algorithm",
|
||||
"description": "算法名称"
|
||||
},
|
||||
"params": {
|
||||
"additionalProperties": true,
|
||||
"type": "object",
|
||||
"title": "Params",
|
||||
"description": "算法参数"
|
||||
},
|
||||
"webhook": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"title": "Webhook",
|
||||
"description": "回调 URL"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"required": [
|
||||
"algorithm",
|
||||
"params"
|
||||
],
|
||||
"title": "JobRequest",
|
||||
"description": "异步任务请求",
|
||||
"example": {
|
||||
"algorithm": "PrimeChecker",
|
||||
"params": {
|
||||
"number": 17
|
||||
},
|
||||
"webhook": "https://example.com/callback"
|
||||
}
|
||||
},
|
||||
"JobStatus": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"pending",
|
||||
"running",
|
||||
"completed",
|
||||
"failed"
|
||||
],
|
||||
"title": "JobStatus",
|
||||
"description": "任务状态枚举"
|
||||
},
|
||||
"JobStatusResponse": {
|
||||
"properties": {
|
||||
"job_id": {
|
||||
"type": "string",
|
||||
"title": "Job Id",
|
||||
"description": "任务唯一标识"
|
||||
},
|
||||
"status": {
|
||||
"$ref": "#/components/schemas/JobStatus",
|
||||
"description": "任务状态"
|
||||
},
|
||||
"algorithm": {
|
||||
"type": "string",
|
||||
"title": "Algorithm",
|
||||
"description": "算法名称"
|
||||
},
|
||||
"created_at": {
|
||||
"type": "string",
|
||||
"title": "Created At",
|
||||
"description": "创建时间(ISO 8601)"
|
||||
},
|
||||
"started_at": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"title": "Started At",
|
||||
"description": "开始执行时间(ISO 8601)"
|
||||
},
|
||||
"completed_at": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"title": "Completed At",
|
||||
"description": "完成时间(ISO 8601)"
|
||||
},
|
||||
"result": {
|
||||
"anyOf": [
|
||||
{
|
||||
"additionalProperties": true,
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"title": "Result",
|
||||
"description": "执行结果(仅完成时返回)"
|
||||
},
|
||||
"error": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"title": "Error",
|
||||
"description": "错误信息(仅失败时返回)"
|
||||
},
|
||||
"metadata": {
|
||||
"anyOf": [
|
||||
{
|
||||
"additionalProperties": true,
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"title": "Metadata",
|
||||
"description": "元数据信息"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"required": [
|
||||
"job_id",
|
||||
"status",
|
||||
"algorithm",
|
||||
"created_at"
|
||||
],
|
||||
"title": "JobStatusResponse",
|
||||
"description": "任务状态查询响应",
|
||||
"example": {
|
||||
"algorithm": "PrimeChecker",
|
||||
"completed_at": "2026-02-02T10:00:02Z",
|
||||
"created_at": "2026-02-02T10:00:00Z",
|
||||
"job_id": "a1b2c3d4e5f6",
|
||||
"metadata": {
|
||||
"elapsed_time": 0.001
|
||||
},
|
||||
"result": {
|
||||
"is_prime": true,
|
||||
"number": 17
|
||||
},
|
||||
"started_at": "2026-02-02T10:00:01Z",
|
||||
"status": "completed"
|
||||
}
|
||||
},
|
||||
"ReadinessResponse": {
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"title": "Status",
|
||||
"description": "就绪状态"
|
||||
},
|
||||
"timestamp": {
|
||||
"type": "number",
|
||||
"title": "Timestamp",
|
||||
"description": "时间戳"
|
||||
},
|
||||
"checks": {
|
||||
"anyOf": [
|
||||
{
|
||||
"additionalProperties": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"type": "object"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"title": "Checks",
|
||||
"description": "各项检查结果"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"required": [
|
||||
"status",
|
||||
"timestamp"
|
||||
],
|
||||
"title": "ReadinessResponse",
|
||||
"description": "就绪检查响应"
|
||||
},
|
||||
"ValidationError": {
|
||||
"properties": {
|
||||
"loc": {
|
||||
"items": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "integer"
|
||||
}
|
||||
]
|
||||
},
|
||||
"type": "array",
|
||||
"title": "Location"
|
||||
},
|
||||
"msg": {
|
||||
"type": "string",
|
||||
"title": "Message"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"title": "Error Type"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"required": [
|
||||
"loc",
|
||||
"msg",
|
||||
"type"
|
||||
],
|
||||
"title": "ValidationError"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
204
docs/concurrency-control-changelog.md
Normal file
204
docs/concurrency-control-changelog.md
Normal file
@@ -0,0 +1,204 @@
|
||||
# 异步任务并发控制实现总结
|
||||
|
||||
## 变更概述
|
||||
|
||||
为异步任务管理器添加了并发控制功能,使用 `asyncio.Semaphore` 限制同时运行的任务数量,防止系统资源耗尽。
|
||||
|
||||
## 修改的文件
|
||||
|
||||
### 1. `src/functional_scaffold/config.py`
|
||||
|
||||
**新增配置项:**
|
||||
```python
|
||||
max_concurrent_jobs: int = 10 # 最大并发任务数
|
||||
```
|
||||
|
||||
### 2. `src/functional_scaffold/core/job_manager.py`
|
||||
|
||||
**新增属性:**
|
||||
- `_semaphore: Optional[asyncio.Semaphore]` - 并发控制信号量
|
||||
- `_max_concurrent_jobs: int` - 最大并发数(存储配置值)
|
||||
|
||||
**修改方法:**
|
||||
- `__init__()` - 初始化 semaphore 和 max_concurrent_jobs 属性
|
||||
- `initialize()` - 创建 Semaphore 实例
|
||||
- `execute_job()` - 使用 `async with self._semaphore` 包裹执行逻辑
|
||||
|
||||
**新增方法:**
|
||||
- `get_concurrency_status()` - 返回并发状态(最大并发数、可用槽位、运行中任务数)
|
||||
|
||||
### 3. `src/functional_scaffold/api/models.py`
|
||||
|
||||
**新增模型:**
|
||||
```python
|
||||
class ConcurrencyStatusResponse(BaseModel):
|
||||
"""并发状态响应"""
|
||||
max_concurrent: int
|
||||
available_slots: int
|
||||
running_jobs: int
|
||||
```
|
||||
|
||||
### 4. `src/functional_scaffold/api/routes.py`
|
||||
|
||||
**新增端点:**
|
||||
```python
|
||||
GET /jobs/concurrency/status
|
||||
```
|
||||
|
||||
返回当前并发执行状态。
|
||||
|
||||
### 5. `tests/test_job_manager.py`
|
||||
|
||||
**新增测试类:**
|
||||
```python
|
||||
class TestConcurrencyControl:
|
||||
- test_get_concurrency_status()
|
||||
- test_get_concurrency_status_without_semaphore()
|
||||
- test_concurrency_limit()
|
||||
- test_concurrency_status_api()
|
||||
```
|
||||
|
||||
**修改测试:**
|
||||
- `test_execute_job()` - 添加 semaphore 初始化
|
||||
|
||||
## 工作原理
|
||||
|
||||
### 并发控制流程
|
||||
|
||||
```
|
||||
创建任务 (POST /jobs)
|
||||
│
|
||||
▼
|
||||
asyncio.create_task(execute_job)
|
||||
│
|
||||
▼
|
||||
检查 Redis 和 semaphore 可用性
|
||||
│
|
||||
▼
|
||||
async with self._semaphore: ← 获取槽位(阻塞直到有可用槽位)
|
||||
│
|
||||
├─ 更新状态为 running
|
||||
├─ 执行算法
|
||||
├─ 更新状态为 completed/failed
|
||||
└─ 发送 webhook
|
||||
│
|
||||
▼
|
||||
自动释放槽位
|
||||
```
|
||||
|
||||
### 关键设计决策
|
||||
|
||||
1. **使用 asyncio.Semaphore**
|
||||
- 简单、高效、无需外部依赖
|
||||
- 自动管理槽位获取和释放
|
||||
- 支持异步等待
|
||||
|
||||
2. **在 execute_job 内部使用 semaphore**
|
||||
- 快速失败的检查(Redis 可用性、任务存在性)在 semaphore 外部
|
||||
- 只有真正要执行的任务才占用槽位
|
||||
- 任务完成后自动释放(即使发生异常)
|
||||
|
||||
3. **存储 _max_concurrent_jobs**
|
||||
- Semaphore 不暴露最大值属性
|
||||
- 需要单独存储以便 `get_concurrency_status()` 使用
|
||||
|
||||
## 测试覆盖
|
||||
|
||||
- ✅ 获取并发状态
|
||||
- ✅ 未初始化时的并发状态
|
||||
- ✅ 并发限制生效(创建超过限制的任务,验证只有限定数量在运行)
|
||||
- ✅ API 端点测试
|
||||
- ✅ 所有现有测试继续通过(60/60)
|
||||
|
||||
## 使用示例
|
||||
|
||||
### 配置并发限制
|
||||
|
||||
```bash
|
||||
# 环境变量
|
||||
export MAX_CONCURRENT_JOBS=20
|
||||
|
||||
# 或在 .env 文件
|
||||
MAX_CONCURRENT_JOBS=20
|
||||
```
|
||||
|
||||
### 查询并发状态
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/jobs/concurrency/status
|
||||
```
|
||||
|
||||
响应:
|
||||
```json
|
||||
{
|
||||
"max_concurrent": 10,
|
||||
"available_slots": 7,
|
||||
"running_jobs": 3
|
||||
}
|
||||
```
|
||||
|
||||
### 测试并发控制
|
||||
|
||||
```bash
|
||||
# 运行测试脚本
|
||||
./scripts/test_concurrency.sh
|
||||
```
|
||||
|
||||
## 性能影响
|
||||
|
||||
### 优点
|
||||
|
||||
1. **防止资源耗尽**:限制同时运行的任务数
|
||||
2. **可预测的负载**:系统负载不会超过配置的限制
|
||||
3. **自动排队**:超过限制的任务自动等待
|
||||
4. **零开销**:未达到限制时,semaphore 几乎无性能开销
|
||||
|
||||
### 注意事项
|
||||
|
||||
1. **任务等待**:超过限制的任务会排队等待,任务开始执行的时间可能被推迟(创建任务的 HTTP 请求仍会立即返回)
|
||||
2. **内存占用**:等待中的任务仍占用内存(协程对象)
|
||||
3. **配置调优**:需要根据实际负载调整并发数
|
||||
|
||||
## 监控建议
|
||||
|
||||
### Prometheus 查询
|
||||
|
||||
```promql
|
||||
# 任务创建速率
|
||||
rate(jobs_created_total[5m])
|
||||
|
||||
# 任务完成速率
|
||||
rate(jobs_completed_total[5m])
|
||||
|
||||
# 任务积压(创建 - 完成)
|
||||
rate(jobs_created_total[5m]) - rate(jobs_completed_total[5m])
|
||||
```
|
||||
|
||||
### Grafana 面板
|
||||
|
||||
建议添加以下面板:
|
||||
1. 并发状态时间序列(max_concurrent, available_slots, running_jobs)
|
||||
2. 任务创建/完成速率
|
||||
3. 任务执行时间分布(P50, P95, P99)
|
||||
|
||||
## 未来改进
|
||||
|
||||
1. **任务超时机制**:为长时间运行的任务设置超时
|
||||
2. **优先级队列**:支持高优先级任务优先执行
|
||||
3. **动态调整**:根据系统负载动态调整并发数
|
||||
4. **任务取消**:支持取消等待中或运行中的任务
|
||||
5. **资源限制**:更细粒度的 CPU、内存限制
|
||||
|
||||
## 相关文档
|
||||
|
||||
- [并发控制详细文档](./concurrency-control.md)
|
||||
- [异步任务接口实现计划](../plans/giggly-hatching-kite.md)
|
||||
- [监控指南](./monitoring.md)
|
||||
|
||||
## 测试结果
|
||||
|
||||
```
|
||||
======================== 60 passed, 7 warnings in 1.53s ========================
|
||||
```
|
||||
|
||||
所有测试通过,包括 4 个新增的并发控制测试。
|
||||
102
docs/concurrency-control-quickref.md
Normal file
102
docs/concurrency-control-quickref.md
Normal file
@@ -0,0 +1,102 @@
|
||||
# 并发控制快速参考
|
||||
|
||||
## 配置
|
||||
|
||||
```bash
|
||||
# 设置最大并发数(默认 10)
|
||||
export MAX_CONCURRENT_JOBS=20
|
||||
```
|
||||
|
||||
## API
|
||||
|
||||
### 查询并发状态
|
||||
|
||||
```bash
|
||||
GET /jobs/concurrency/status
|
||||
```
|
||||
|
||||
**响应:**
|
||||
```json
|
||||
{
|
||||
"max_concurrent": 10, // 最大并发数
|
||||
"available_slots": 7, // 可用槽位
|
||||
"running_jobs": 3 // 运行中任务数
|
||||
}
|
||||
```
|
||||
|
||||
## 代码示例
|
||||
|
||||
### 在 JobManager 中使用
|
||||
|
||||
```python
|
||||
# 并发控制自动生效,无需额外代码
|
||||
job_manager = await get_job_manager()
|
||||
job_id = await job_manager.create_job(...)
|
||||
|
||||
# 任务会自动排队,等待可用槽位
|
||||
asyncio.create_task(job_manager.execute_job(job_id))
|
||||
```
|
||||
|
||||
### 查询并发状态
|
||||
|
||||
```python
|
||||
job_manager = await get_job_manager()
|
||||
status = job_manager.get_concurrency_status()
|
||||
|
||||
print(f"运行中: {status['running_jobs']}/{status['max_concurrent']}")
|
||||
print(f"可用槽位: {status['available_slots']}")
|
||||
```
|
||||
|
||||
## 监控
|
||||
|
||||
### 实时监控
|
||||
|
||||
```bash
|
||||
# 持续监控并发状态
|
||||
watch -n 1 'curl -s http://localhost:8000/jobs/concurrency/status | jq'
|
||||
```
|
||||
|
||||
### 测试脚本
|
||||
|
||||
```bash
|
||||
# 运行并发控制测试
|
||||
./scripts/test_concurrency.sh
|
||||
```
|
||||
|
||||
## 推荐配置
|
||||
|
||||
| 任务类型 | 推荐并发数 |
|
||||
|---------|-----------|
|
||||
| CPU 密集型 | 核心数 × 1.5 |
|
||||
| I/O 密集型 | 核心数 × 5-10 |
|
||||
| 混合型 | 核心数 × 2-3 |
|
||||
|
||||
## 故障排查
|
||||
|
||||
### 任务一直 pending
|
||||
|
||||
```bash
|
||||
# 检查并发状态
|
||||
curl http://localhost:8000/jobs/concurrency/status
|
||||
|
||||
# 如果 available_slots = 0,说明所有槽位被占用
|
||||
# 解决方案:
|
||||
# 1. 等待当前任务完成
|
||||
# 2. 增加并发限制
|
||||
# 3. 优化算法性能
|
||||
```
|
||||
|
||||
### 系统资源耗尽
|
||||
|
||||
```bash
|
||||
# 降低并发限制
|
||||
export MAX_CONCURRENT_JOBS=5
|
||||
|
||||
# 重启服务
|
||||
./scripts/run_dev.sh
|
||||
```
|
||||
|
||||
## 相关文档
|
||||
|
||||
- [详细文档](./concurrency-control.md)
|
||||
- [实现总结](./concurrency-control-changelog.md)
|
||||
204
docs/concurrency-control.md
Normal file
204
docs/concurrency-control.md
Normal file
@@ -0,0 +1,204 @@
|
||||
# 异步任务并发控制
|
||||
|
||||
## 概述
|
||||
|
||||
为了防止系统资源耗尽和控制负载,任务管理器实现了基于 `asyncio.Semaphore` 的并发控制机制。
|
||||
|
||||
## 配置
|
||||
|
||||
在 `config.py` 或环境变量中设置最大并发任务数:
|
||||
|
||||
```python
|
||||
# config.py
|
||||
max_concurrent_jobs: int = 10 # 默认值
|
||||
```
|
||||
|
||||
或通过环境变量:
|
||||
|
||||
```bash
|
||||
export MAX_CONCURRENT_JOBS=20
|
||||
```
|
||||
|
||||
## 工作原理
|
||||
|
||||
1. **信号量机制**:使用 `asyncio.Semaphore` 限制同时运行的任务数
|
||||
2. **自动管理**:任务开始时获取槽位,完成后自动释放
|
||||
3. **队列等待**:超过限制的任务会自动等待,直到有可用槽位
|
||||
|
||||
### 执行流程
|
||||
|
||||
```
|
||||
POST /jobs 创建任务
|
||||
│
|
||||
▼
|
||||
asyncio.create_task(execute_job)
|
||||
│
|
||||
▼
|
||||
等待获取 semaphore 槽位
|
||||
│
|
||||
▼
|
||||
async with semaphore: ← 获取槽位
|
||||
执行算法
|
||||
更新状态
|
||||
发送 webhook
|
||||
│
|
||||
▼
|
||||
自动释放槽位
|
||||
```
|
||||
|
||||
## API 端点
|
||||
|
||||
### 查询并发状态
|
||||
|
||||
```bash
|
||||
GET /jobs/concurrency/status
|
||||
```
|
||||
|
||||
**响应示例:**
|
||||
|
||||
```json
|
||||
{
|
||||
"max_concurrent": 10,
|
||||
"available_slots": 7,
|
||||
"running_jobs": 3
|
||||
}
|
||||
```
|
||||
|
||||
**字段说明:**
|
||||
|
||||
- `max_concurrent`: 最大并发任务数(配置值)
|
||||
- `available_slots`: 当前可用槽位数
|
||||
- `running_jobs`: 当前正在运行的任务数
|
||||
|
||||
## 使用示例
|
||||
|
||||
### 1. 创建多个任务
|
||||
|
||||
```bash
|
||||
# 创建 20 个任务
|
||||
for i in {1..20}; do
|
||||
curl -X POST http://localhost:8000/jobs \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"algorithm\": \"PrimeChecker\", \"params\": {\"number\": $i}}"
|
||||
done
|
||||
```
|
||||
|
||||
### 2. 监控并发状态
|
||||
|
||||
```bash
|
||||
# 持续监控并发状态
|
||||
watch -n 1 'curl -s http://localhost:8000/jobs/concurrency/status | jq'
|
||||
```
|
||||
|
||||
输出示例:
|
||||
|
||||
```json
|
||||
{
|
||||
"max_concurrent": 10,
|
||||
"available_slots": 0,
|
||||
"running_jobs": 10
|
||||
}
|
||||
```
|
||||
|
||||
### 3. 调整并发限制
|
||||
|
||||
```bash
|
||||
# 重启服务前设置环境变量
|
||||
export MAX_CONCURRENT_JOBS=20
|
||||
./scripts/run_dev.sh
|
||||
```
|
||||
|
||||
## 性能考虑
|
||||
|
||||
### 选择合适的并发数
|
||||
|
||||
并发数应根据以下因素确定:
|
||||
|
||||
1. **CPU 核心数**:CPU 密集型任务建议设置为核心数的 1-2 倍
|
||||
2. **内存限制**:每个任务的内存占用 × 并发数 < 可用内存
|
||||
3. **外部服务限制**:如果调用外部 API,考虑其速率限制
|
||||
4. **Redis 连接池**:确保 Redis 连接池大小 ≥ 并发数
|
||||
|
||||
### 推荐配置
|
||||
|
||||
| 场景 | 推荐并发数 | 说明 |
|
||||
|------|-----------|------|
|
||||
| CPU 密集型(如质数判断) | 核心数 × 1.5 | 充分利用 CPU |
|
||||
| I/O 密集型(如网络请求) | 核心数 × 5-10 | 等待 I/O 时可切换 |
|
||||
| 混合型 | 核心数 × 2-3 | 平衡 CPU 和 I/O |
|
||||
| 内存受限 | 根据内存计算 | 避免 OOM |
|
||||
|
||||
### 示例计算
|
||||
|
||||
假设:
|
||||
- 服务器:4 核 8GB 内存
|
||||
- 任务类型:I/O 密集型(网络请求)
|
||||
- 单任务内存:50MB
|
||||
|
||||
```
|
||||
最大并发数 = min(
|
||||
核心数 × 8 = 32,
|
||||
可用内存 / 单任务内存 = 8000MB / 50MB = 160
|
||||
) = 32
|
||||
```
|
||||
|
||||
## 监控指标
|
||||
|
||||
相关 Prometheus 指标:
|
||||
|
||||
```promql
|
||||
# 任务创建速率
|
||||
rate(jobs_created_total[5m])
|
||||
|
||||
# 任务完成速率
|
||||
rate(jobs_completed_total[5m])
|
||||
|
||||
# 任务执行时间分布
|
||||
histogram_quantile(0.95, job_execution_duration_seconds_bucket)
|
||||
```
|
||||
|
||||
## 故障排查
|
||||
|
||||
### 问题:任务一直处于 pending 状态
|
||||
|
||||
**可能原因:**
|
||||
1. 所有槽位都被占用
|
||||
2. 某些任务执行时间过长
|
||||
|
||||
**解决方案:**
|
||||
```bash
|
||||
# 1. 检查并发状态
|
||||
curl http://localhost:8000/jobs/concurrency/status
|
||||
|
||||
# 2. 如果 available_slots = 0,说明所有槽位被占用
|
||||
# 3. 检查是否有长时间运行的任务
|
||||
# 4. 考虑增加并发限制或优化算法性能
|
||||
```
|
||||
|
||||
### 问题:系统资源耗尽
|
||||
|
||||
**可能原因:**
|
||||
并发数设置过高
|
||||
|
||||
**解决方案:**
|
||||
```bash
|
||||
# 降低并发限制
|
||||
export MAX_CONCURRENT_JOBS=5
|
||||
# 重启服务
|
||||
```
|
||||
|
||||
## 最佳实践
|
||||
|
||||
1. **监控优先**:部署后持续监控并发状态和系统资源
|
||||
2. **逐步调整**:从保守值开始,逐步增加并发数
|
||||
3. **压力测试**:在生产环境前进行充分的压力测试
|
||||
4. **设置告警**:当 `available_slots = 0` 持续时间过长时告警
|
||||
5. **任务超时**:为长时间运行的任务设置超时机制(待实现)
|
||||
|
||||
## 未来改进
|
||||
|
||||
- [ ] 任务超时机制
|
||||
- [ ] 优先级队列
|
||||
- [ ] 动态调整并发数
|
||||
- [ ] 任务取消功能
|
||||
- [ ] 更细粒度的资源控制(CPU、内存限制)
|
||||
251
docs/getting-started.md
Normal file
251
docs/getting-started.md
Normal file
@@ -0,0 +1,251 @@
|
||||
# 快速入门指南
|
||||
|
||||
本指南帮助算法同学快速上手 FunctionalScaffold 脚手架,在 10 分钟内完成第一个算法服务的开发和部署。
|
||||
|
||||
## 核心理念
|
||||
|
||||
**算法同学只需关注核心算法逻辑**,框架自动处理:
|
||||
- HTTP 接口封装
|
||||
- 参数验证
|
||||
- 错误处理
|
||||
- 日志记录
|
||||
- 性能指标
|
||||
- 健康检查
|
||||
- 容器化部署
|
||||
|
||||
## 环境准备
|
||||
|
||||
### 1. 安装依赖
|
||||
|
||||
```bash
|
||||
# 克隆项目
|
||||
git clone <repository-url>
|
||||
cd FunctionalScaffold
|
||||
|
||||
# 创建虚拟环境
|
||||
python -m venv venv
|
||||
source venv/bin/activate # Windows: venv\Scripts\activate
|
||||
|
||||
# 安装依赖
|
||||
pip install -e ".[dev]"
|
||||
```
|
||||
|
||||
### 2. 启动服务
|
||||
|
||||
```bash
|
||||
# 开发模式(自动重载)
|
||||
uvicorn functional_scaffold.main:app --reload --port 8000
|
||||
# docker 开发者模式
|
||||
cd deployment && docker compose up -d
|
||||
```
|
||||
|
||||
### 3. 验证服务
|
||||
|
||||
```bash
|
||||
# 健康检查
|
||||
curl http://localhost:8000/healthz  # docker 模式下端口为 8111
|
||||
|
||||
# 调用示例算法
|
||||
curl -X POST http://localhost:8000/invoke \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"number": 17}'
|
||||
```
|
||||
|
||||
## 添加你的第一个算法
|
||||
|
||||
### 步骤 1:创建算法类
|
||||
|
||||
在 `src/functional_scaffold/algorithms/` 目录下创建新文件:
|
||||
|
||||
```python
|
||||
# src/functional_scaffold/algorithms/my_algorithm.py
|
||||
from typing import Dict, Any
|
||||
from .base import BaseAlgorithm
|
||||
|
||||
class MyAlgorithm(BaseAlgorithm):
|
||||
"""我的算法类"""
|
||||
|
||||
def process(self, input_data: Any) -> Dict[str, Any]:
|
||||
"""
|
||||
算法处理逻辑 - 只需实现这个方法!
|
||||
|
||||
Args:
|
||||
input_data: 输入数据
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: 处理结果
|
||||
"""
|
||||
# 在这里实现你的算法逻辑
|
||||
result = self._do_calculation(input_data)
|
||||
|
||||
return {
|
||||
"input": input_data,
|
||||
"output": result,
|
||||
"message": "处理成功"
|
||||
}
|
||||
|
||||
def _do_calculation(self, data):
|
||||
"""内部计算方法"""
|
||||
# 你的算法实现
|
||||
return data * 2
|
||||
```
|
||||
|
||||
### 步骤 2:注册算法
|
||||
|
||||
在 `src/functional_scaffold/algorithms/__init__.py` 中添加导出:
|
||||
|
||||
```python
|
||||
from .base import BaseAlgorithm
|
||||
from .prime_checker import PrimeChecker
|
||||
from .my_algorithm import MyAlgorithm # 添加这行
|
||||
|
||||
__all__ = ["BaseAlgorithm", "PrimeChecker", "MyAlgorithm"] # 添加到列表
|
||||
```
|
||||
|
||||
### 步骤 3:添加 API 端点
|
||||
|
||||
在 `src/functional_scaffold/api/routes.py` 中添加路由:
|
||||
|
||||
```python
|
||||
from ..algorithms.my_algorithm import MyAlgorithm
|
||||
|
||||
@router.post("/my-endpoint")
|
||||
async def my_endpoint(
|
||||
request: MyRequest, # 需要定义请求模型
|
||||
request_id: str = Depends(get_request_id)
|
||||
):
|
||||
"""我的算法端点"""
|
||||
algorithm = MyAlgorithm()
|
||||
result = algorithm.execute(request.data)
|
||||
|
||||
if not result["success"]:
|
||||
raise HTTPException(status_code=500, detail=result["error"])
|
||||
|
||||
return {
|
||||
"request_id": request_id,
|
||||
"status": "success",
|
||||
"result": result["result"],
|
||||
"metadata": result["metadata"]
|
||||
}
|
||||
```
|
||||
|
||||
### 步骤 4:定义请求模型
|
||||
|
||||
在 `src/functional_scaffold/api/models.py` 中添加:
|
||||
|
||||
```python
|
||||
class MyRequest(BaseModel):
|
||||
"""我的请求模型"""
|
||||
|
||||
model_config = ConfigDict(
|
||||
json_schema_extra={
|
||||
"example": {"data": 10}
|
||||
}
|
||||
)
|
||||
|
||||
data: int = Field(..., description="输入数据")
|
||||
```
|
||||
|
||||
### 步骤 5:测试
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/my-endpoint \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"data": 10}'
|
||||
```
|
||||
|
||||
## 使用异步任务
|
||||
|
||||
对于耗时较长的算法,使用异步任务接口:
|
||||
|
||||
### 创建异步任务
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/jobs \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"algorithm": "MyAlgorithm",
|
||||
"params": {"data": 10},
|
||||
"webhook": "https://your-callback-url.com/notify"
|
||||
}'
|
||||
```
|
||||
|
||||
响应:
|
||||
```json
|
||||
{
|
||||
"job_id": "a1b2c3d4e5f6",
|
||||
"status": "pending",
|
||||
"message": "任务已创建",
|
||||
"created_at": "2026-02-02T10:00:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
### 查询任务状态
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/jobs/a1b2c3d4e5f6
|
||||
```
|
||||
|
||||
响应:
|
||||
```json
|
||||
{
|
||||
"job_id": "a1b2c3d4e5f6",
|
||||
"status": "completed",
|
||||
"algorithm": "MyAlgorithm",
|
||||
"result": {"input": 10, "output": 20},
|
||||
"metadata": {"elapsed_time": 0.001}
|
||||
}
|
||||
```
|
||||
|
||||
## 本地开发技巧
|
||||
|
||||
### 查看 API 文档
|
||||
|
||||
启动服务后访问:
|
||||
- Swagger UI: http://localhost:8000/docs
|
||||
- ReDoc: http://localhost:8000/redoc
|
||||
|
||||
### 运行测试
|
||||
|
||||
```bash
|
||||
# 运行所有测试
|
||||
pytest tests/ -v
|
||||
|
||||
# 运行单个测试文件
|
||||
pytest tests/test_algorithms.py -v
|
||||
```
|
||||
|
||||
### 代码格式化
|
||||
|
||||
```bash
|
||||
# 格式化代码
|
||||
black src/ tests/
|
||||
|
||||
# 检查代码规范
|
||||
ruff check src/ tests/
|
||||
```
|
||||
|
||||
## 下一步
|
||||
|
||||
- [算法开发详细指南](./algorithm-development.md) - 深入了解算法开发
|
||||
- [API 参考文档](./api-reference.md) - 完整的 API 说明
|
||||
- [监控指南](./monitoring.md) - 了解监控和告警
|
||||
- [部署指南](./deployment.md) - 生产环境部署
|
||||
|
||||
## 常见问题
|
||||
|
||||
### Q: 如何处理算法中的异常?
|
||||
|
||||
A: 在 `process()` 方法中抛出异常即可,框架会自动捕获并返回标准错误响应。
|
||||
|
||||
### Q: 如何添加自定义指标?
|
||||
|
||||
A: 在 `config/metrics.yaml` 中定义指标,然后在代码中使用 `incr()` 或 `observe()` 记录。
|
||||
|
||||
### Q: 如何访问外部服务(数据库、OSS)?
|
||||
|
||||
A: 在 `config.py` 中添加配置项,通过环境变量注入连接信息。
|
||||
|
||||
### Q: 算法需要加载大模型文件怎么办?
|
||||
|
||||
A: 在算法类的 `__init__` 方法中加载模型,框架会在容器启动时初始化。
|
||||
182
docs/grafana-dashboard-usage.md
Normal file
182
docs/grafana-dashboard-usage.md
Normal file
@@ -0,0 +1,182 @@
|
||||
# Grafana 日志仪表板使用说明
|
||||
|
||||
## Request ID 过滤功能
|
||||
|
||||
日志监控仪表板现在支持按 request_id 过滤日志,可以追踪单个请求的完整生命周期。
|
||||
|
||||
### 如何使用
|
||||
|
||||
1. **访问仪表板**
|
||||
- 打开 Grafana: http://localhost:3000
|
||||
- 登录(admin/admin)
|
||||
- 进入 "日志监控" 仪表板
|
||||
|
||||
2. **使用 Request ID 过滤**
|
||||
- 在仪表板顶部找到 "Request ID" 输入框
|
||||
- 输入完整的 request_id(例如:`59017bdd-5963-40b1-a325-5088593382c0`)
|
||||
- 所有面板会自动更新,只显示该 request_id 的日志
|
||||
|
||||
3. **查看所有日志**
|
||||
- 清空 "Request ID" 输入框
|
||||
- 所有面板会显示所有日志
|
||||
|
||||
### 示例
|
||||
|
||||
#### 获取 Request ID
|
||||
|
||||
从 API 响应中获取:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8111/invoke \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"number": 17}' | jq -r '.request_id'
|
||||
```
|
||||
|
||||
输出示例:
|
||||
```
|
||||
59017bdd-5963-40b1-a325-5088593382c0
|
||||
```
|
||||
|
||||
#### 在仪表板中过滤
|
||||
|
||||
1. 复制上面的 request_id
|
||||
2. 在 Grafana 仪表板顶部的 "Request ID" 输入框中粘贴
|
||||
3. 按回车或点击刷新
|
||||
|
||||
#### 查看结果
|
||||
|
||||
过滤后,你会看到该请求的所有日志:
|
||||
|
||||
- **日志流面板**:显示该请求的所有日志条目
|
||||
- **日志量趋势**:显示该请求的日志分布
|
||||
- **日志级别分布**:显示该请求的日志级别统计
|
||||
- **错误日志**:如果该请求有错误,会显示在这里
|
||||
|
||||
### 典型的请求日志流
|
||||
|
||||
一个成功的请求通常包含以下日志:
|
||||
|
||||
```
|
||||
1. Request: POST /invoke
|
||||
2. Processing request {request_id} with number=17
|
||||
3. Starting algorithm: PrimeChecker
|
||||
4. Algorithm PrimeChecker completed successfully in 0.001s
|
||||
5. Response: 200
|
||||
```
|
||||
|
||||
所有这些日志都有相同的 request_id,可以通过过滤功能一起查看。
|
||||
|
||||
### 高级用法
|
||||
|
||||
#### 在 Explore 中使用
|
||||
|
||||
1. 进入 Grafana Explore: http://localhost:3000/explore
|
||||
2. 选择 Loki 数据源
|
||||
3. 使用以下查询:
|
||||
|
||||
```logql
|
||||
# 查询特定 request_id
|
||||
{job="functional-scaffold-app"} |= "59017bdd-5963-40b1-a325-5088593382c0"
|
||||
|
||||
# 使用 JSON 解析(更精确)
|
||||
{job="functional-scaffold-app"} | json | request_id="59017bdd-5963-40b1-a325-5088593382c0"
|
||||
|
||||
# 查询特定 request_id 的错误日志
|
||||
{job="functional-scaffold-app", level="ERROR"} |= "59017bdd-5963-40b1-a325-5088593382c0"
|
||||
```
|
||||
|
||||
#### 组合过滤
|
||||
|
||||
可以结合其他过滤条件:
|
||||
|
||||
```logql
|
||||
# 特定 request_id 的 ERROR 日志
|
||||
{job="functional-scaffold-app", level="ERROR"} |= "59017bdd-5963-40b1-a325-5088593382c0"
|
||||
|
||||
# 特定 request_id 的特定 logger
|
||||
{job="functional-scaffold-app", logger="functional_scaffold.algorithms.base"} |= "59017bdd-5963-40b1-a325-5088593382c0"
|
||||
```
|
||||
|
||||
### 故障排查
|
||||
|
||||
#### Request ID 过滤不生效
|
||||
|
||||
1. **检查 request_id 格式**
|
||||
- 确保输入的是完整的 UUID 格式
|
||||
- 不要包含额外的空格或引号
|
||||
|
||||
2. **检查时间范围**
|
||||
- 确保仪表板的时间范围包含该请求的时间
|
||||
- 可以调整为 "Last 15 minutes" 或更长
|
||||
|
||||
3. **刷新仪表板**
|
||||
- 点击右上角的刷新按钮
|
||||
- 或者按 Ctrl+R (Cmd+R on Mac)
|
||||
|
||||
4. **验证日志是否存在**
|
||||
- 在 Explore 中手动查询:
|
||||
```logql
|
||||
{job="functional-scaffold-app"} |= "your-request-id"
|
||||
```
|
||||
- 如果没有结果,说明日志还没有被收集
|
||||
|
||||
#### 日志延迟
|
||||
|
||||
- Promtail 每 5 秒刷新一次
|
||||
- Loki 可能有几秒的延迟
|
||||
- 建议等待 5-10 秒后再查询
|
||||
|
||||
### 最佳实践
|
||||
|
||||
1. **调试单个请求**
|
||||
- 发送请求并记录 request_id
|
||||
- 在仪表板中输入 request_id
|
||||
- 查看完整的请求处理流程
|
||||
|
||||
2. **追踪错误**
|
||||
- 当发现错误时,从错误日志中提取 request_id
|
||||
- 使用 request_id 过滤查看完整的请求上下文
|
||||
- 分析错误发生前后的日志
|
||||
|
||||
3. **性能分析**
|
||||
- 使用 request_id 过滤慢请求
|
||||
- 查看算法执行时间
|
||||
- 分析性能瓶颈
|
||||
|
||||
4. **用户问题排查**
|
||||
- 从用户报告中获取 request_id(如果有)
|
||||
- 使用 request_id 重现问题场景
|
||||
- 查看完整的请求处理过程
|
||||
|
||||
### 技术细节
|
||||
|
||||
#### 过滤实现
|
||||
|
||||
仪表板使用 LogQL 的文本匹配操作符 `|=`:
|
||||
|
||||
```logql
|
||||
{job="functional-scaffold-app"} |= "$request_id"
|
||||
```
|
||||
|
||||
- 当 `$request_id` 为空时,`|= ""` 匹配所有日志
|
||||
- 当 `$request_id` 有值时,只匹配包含该字符串的日志
|
||||
|
||||
#### 性能考虑
|
||||
|
||||
- 文本匹配 (`|=`) 比 JSON 解析更快
|
||||
- 适合实时查询和仪表板
|
||||
- 对于精确匹配,可以在 Explore 中使用 JSON 解析
|
||||
|
||||
#### 变量配置
|
||||
|
||||
Request ID 变量配置:
|
||||
- 类型:textbox(文本输入框)
|
||||
- 名称:request_id
|
||||
- 标签:Request ID
|
||||
- 默认值:空字符串
|
||||
|
||||
## 相关文档
|
||||
|
||||
- [Loki 集成文档](loki-integration.md)
|
||||
- [Loki 快速参考](loki-quick-reference.md)
|
||||
- [LogQL 查询语言](https://grafana.com/docs/loki/latest/logql/)
|
||||
307
docs/kubernetes-deployment.md
Normal file
307
docs/kubernetes-deployment.md
Normal file
@@ -0,0 +1,307 @@
|
||||
# Kubernetes 部署指南
|
||||
|
||||
本文档介绍如何在 Kubernetes 集群中部署 FunctionalScaffold 服务。
|
||||
|
||||
## 架构概览
|
||||
|
||||
```
|
||||
┌─────────────────┐
|
||||
│ Ingress/LB │
|
||||
└────────┬────────┘
|
||||
│
|
||||
┌────────▼────────┐
|
||||
│ API Service │
|
||||
│ (ClusterIP) │
|
||||
└────────┬────────┘
|
||||
│
|
||||
┌──────────────┼──────────────┐
|
||||
│ │ │
|
||||
┌──────▼──────┐ ┌─────▼─────┐ ┌─────▼─────┐
|
||||
│ API Pod 1 │ │ API Pod 2 │ │ API Pod 3 │
|
||||
└─────────────┘ └───────────┘ └───────────┘
|
||||
│
|
||||
┌────────▼────────┐
|
||||
│ Redis Service │
|
||||
└────────┬────────┘
|
||||
│
|
||||
┌──────────────┼──────────────┐
|
||||
│ │ │
|
||||
┌──────▼──────┐ ┌─────▼─────┐ │
|
||||
│ Worker Pod 1│ │Worker Pod2│ │
|
||||
└─────────────┘ └───────────┘ │
|
||||
┌──────▼──────┐
|
||||
│ Redis Pod │
|
||||
└─────────────┘
|
||||
```
|
||||
|
||||
## 组件说明
|
||||
|
||||
| 组件 | 副本数 | 说明 |
|
||||
|------|--------|------|
|
||||
| **API Deployment** | 3 | HTTP 服务,处理同步请求和任务创建 |
|
||||
| **Worker Deployment** | 2 | 异步任务处理,从 Redis 队列消费任务 |
|
||||
| **Redis Deployment** | 1 | 任务队列和状态存储 |
|
||||
| **ConfigMap** | - | 共享配置管理 |
|
||||
|
||||
## 快速部署
|
||||
|
||||
```bash
|
||||
# 部署所有资源
|
||||
kubectl apply -f deployment/kubernetes/deployment.yaml
|
||||
kubectl apply -f deployment/kubernetes/service.yaml
|
||||
|
||||
# 查看部署状态
|
||||
kubectl get pods -l app=functional-scaffold
|
||||
kubectl get svc -l app=functional-scaffold
|
||||
```
|
||||
|
||||
## 配置文件说明
|
||||
|
||||
### deployment.yaml
|
||||
|
||||
包含以下资源:
|
||||
|
||||
#### ConfigMap
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: functional-scaffold-config
|
||||
data:
|
||||
APP_ENV: "production"
|
||||
LOG_LEVEL: "INFO"
|
||||
REDIS_HOST: "functional-scaffold-redis"
|
||||
# ... 更多配置
|
||||
```
|
||||
|
||||
主要配置项:
|
||||
|
||||
| 配置项 | 默认值 | 说明 |
|
||||
|--------|--------|------|
|
||||
| `APP_ENV` | production | 运行环境 |
|
||||
| `LOG_LEVEL` | INFO | 日志级别 |
|
||||
| `REDIS_HOST` | functional-scaffold-redis | Redis 服务地址 |
|
||||
| `MAX_CONCURRENT_JOBS` | 10 | 最大并发任务数 |
|
||||
| `JOB_EXECUTION_TIMEOUT` | 300 | 任务执行超时(秒) |
|
||||
|
||||
#### API Deployment
|
||||
|
||||
- **副本数**: 3
|
||||
- **资源限制**: 256Mi-512Mi 内存,250m-500m CPU
|
||||
- **健康检查**: `/healthz`(存活)、`/readyz`(就绪)
|
||||
- **环境变量**: `RUN_MODE=api`
|
||||
|
||||
#### Worker Deployment
|
||||
|
||||
- **副本数**: 2
|
||||
- **资源限制**: 256Mi-512Mi 内存,250m-500m CPU
|
||||
- **健康检查**: exec 探针检查 Redis 连接
|
||||
- **环境变量**: `RUN_MODE=worker`
|
||||
|
||||
#### Redis Deployment
|
||||
|
||||
- **副本数**: 1
|
||||
- **资源限制**: 128Mi-256Mi 内存,100m-200m CPU
|
||||
- **持久化**: AOF 模式(appendonly yes)
|
||||
- **存储**: emptyDir(开发环境)
|
||||
|
||||
### service.yaml
|
||||
|
||||
| Service | 类型 | 端口 | 说明 |
|
||||
|---------|------|------|------|
|
||||
| `functional-scaffold-api` | ClusterIP | 80 → 8000 | API 服务 |
|
||||
| `functional-scaffold-metrics` | ClusterIP | 8000 | Prometheus 指标 |
|
||||
| `functional-scaffold-redis` | ClusterIP | 6379 | Redis 服务 |
|
||||
|
||||
## 生产环境建议
|
||||
|
||||
### 1. 使用外部 Redis
|
||||
|
||||
生产环境建议使用托管 Redis 服务(如阿里云 Redis、AWS ElastiCache):
|
||||
|
||||
```yaml
|
||||
# 修改 ConfigMap
|
||||
data:
|
||||
REDIS_HOST: "r-xxxxx.redis.rds.aliyuncs.com"
|
||||
REDIS_PORT: "6379"
|
||||
REDIS_PASSWORD: "" # 使用 Secret 管理
|
||||
```
|
||||
|
||||
### 2. 使用 Secret 管理敏感信息
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: functional-scaffold-secrets
|
||||
type: Opaque
|
||||
stringData:
|
||||
REDIS_PASSWORD: "your-password"
|
||||
DATABASE_URL: "postgresql://..."
|
||||
```
|
||||
|
||||
在 Deployment 中引用:
|
||||
|
||||
```yaml
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: functional-scaffold-config
|
||||
- secretRef:
|
||||
name: functional-scaffold-secrets
|
||||
```
|
||||
|
||||
### 3. 配置 HPA 自动扩缩容
|
||||
|
||||
```yaml
|
||||
apiVersion: autoscaling/v2
|
||||
kind: HorizontalPodAutoscaler
|
||||
metadata:
|
||||
name: functional-scaffold-api-hpa
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: functional-scaffold-api
|
||||
minReplicas: 2
|
||||
maxReplicas: 10
|
||||
metrics:
|
||||
- type: Resource
|
||||
resource:
|
||||
name: cpu
|
||||
target:
|
||||
type: Utilization
|
||||
averageUtilization: 70
|
||||
```
|
||||
|
||||
### 4. 配置 PDB 保证可用性
|
||||
|
||||
```yaml
|
||||
apiVersion: policy/v1
|
||||
kind: PodDisruptionBudget
|
||||
metadata:
|
||||
name: functional-scaffold-api-pdb
|
||||
spec:
|
||||
minAvailable: 2
|
||||
selector:
|
||||
matchLabels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
```
|
||||
|
||||
### 5. 使用 PVC 持久化 Redis 数据
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: redis-data-pvc
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
||||
```
|
||||
|
||||
## 监控集成
|
||||
|
||||
### Prometheus 抓取配置
|
||||
|
||||
`functional-scaffold-metrics` Service 已添加 Prometheus 注解:
|
||||
|
||||
```yaml
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8000"
|
||||
prometheus.io/path: "/metrics"
|
||||
```
|
||||
|
||||
### ServiceMonitor(如使用 Prometheus Operator)
|
||||
|
||||
```yaml
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: functional-scaffold
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: functional-scaffold
|
||||
component: api
|
||||
endpoints:
|
||||
- port: metrics
|
||||
path: /metrics
|
||||
interval: 30s
|
||||
```
|
||||
|
||||
## 常用命令
|
||||
|
||||
```bash
|
||||
# 查看所有资源
|
||||
kubectl get all -l app=functional-scaffold
|
||||
|
||||
# 查看 Pod 日志
|
||||
kubectl logs -l app=functional-scaffold,component=api -f
|
||||
kubectl logs -l app=functional-scaffold,component=worker -f
|
||||
|
||||
# 扩缩容
|
||||
kubectl scale deployment functional-scaffold-api --replicas=5
|
||||
kubectl scale deployment functional-scaffold-worker --replicas=3
|
||||
|
||||
# 滚动更新
|
||||
kubectl set image deployment/functional-scaffold-api \
|
||||
api=functional-scaffold:v2.0.0
|
||||
|
||||
# 回滚
|
||||
kubectl rollout undo deployment/functional-scaffold-api
|
||||
|
||||
# 查看部署历史
|
||||
kubectl rollout history deployment/functional-scaffold-api
|
||||
|
||||
# 进入 Pod 调试
|
||||
kubectl exec -it <pod-name> -- /bin/sh
|
||||
|
||||
# 端口转发(本地调试)
|
||||
kubectl port-forward svc/functional-scaffold-api 8000:80
|
||||
```
|
||||
|
||||
## 故障排查
|
||||
|
||||
### Pod 启动失败
|
||||
|
||||
```bash
|
||||
# 查看 Pod 事件
|
||||
kubectl describe pod <pod-name>
|
||||
|
||||
# 查看 Pod 日志
|
||||
kubectl logs <pod-name> --previous
|
||||
```
|
||||
|
||||
### Redis 连接失败
|
||||
|
||||
```bash
|
||||
# 检查 Redis Service
|
||||
kubectl get svc functional-scaffold-redis
|
||||
|
||||
# 测试 Redis 连接
|
||||
kubectl run redis-test --rm -it --image=redis:7-alpine -- \
|
||||
redis-cli -h functional-scaffold-redis ping
|
||||
```
|
||||
|
||||
### Worker 不消费任务
|
||||
|
||||
```bash
|
||||
# 检查 Worker 日志
|
||||
kubectl logs -l component=worker -f
|
||||
|
||||
# 检查 Redis 队列
|
||||
kubectl exec -it <redis-pod> -- redis-cli LLEN job:queue
|
||||
```
|
||||
|
||||
## 相关文档
|
||||
|
||||
- [快速入门](getting-started.md)
|
||||
- [监控指南](monitoring.md)
|
||||
- [并发控制](concurrency-control.md)
|
||||
- [日志集成](loki-quick-reference.md)
|
||||
564
docs/loki-integration.md
Normal file
564
docs/loki-integration.md
Normal file
@@ -0,0 +1,564 @@
|
||||
# Loki 日志收集系统集成文档
|
||||
|
||||
## 概述
|
||||
|
||||
本项目已集成 Grafana Loki 日志收集系统,支持两种日志收集模式:
|
||||
|
||||
1. **Docker stdio 收集**(推荐)- 从容器标准输出/错误收集日志
|
||||
2. **Log 文件收集**(备用)- 从日志文件收集日志
|
||||
|
||||
## 架构
|
||||
|
||||
```
|
||||
应用容器 (stdout/stderr)
|
||||
↓
|
||||
Docker Engine
|
||||
↓
|
||||
Promtail (日志采集器)
|
||||
↓
|
||||
Loki (日志存储)
|
||||
↓
|
||||
Grafana (可视化)
|
||||
```
|
||||
|
||||
## 快速开始
|
||||
|
||||
### 1. 启动服务
|
||||
|
||||
```bash
|
||||
cd deployment
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
这将启动以下服务:
|
||||
- **app**: 应用服务 (端口 8111)
|
||||
- **loki**: 日志存储服务 (端口 3100)
|
||||
- **promtail**: 日志采集服务 (端口 9080)
|
||||
- **grafana**: 可视化服务 (端口 3000)
|
||||
- **prometheus**: 指标收集服务 (端口 9090)
|
||||
- **redis**: 缓存服务 (端口 6380)
|
||||
|
||||
### 2. 访问 Grafana
|
||||
|
||||
1. 打开浏览器访问 http://localhost:3000
|
||||
2. 使用默认凭据登录:
|
||||
- 用户名: `admin`
|
||||
- 密码: `admin`
|
||||
3. 首次登录后建议修改密码
|
||||
|
||||
### 3. 查看日志
|
||||
|
||||
#### 方式 1: 使用预配置的日志仪表板
|
||||
|
||||
1. 在 Grafana 左侧菜单点击 **Dashboards**
|
||||
2. 选择 **日志监控** 仪表板
|
||||
3. 查看以下面板:
|
||||
- **日志流 (实时)**: 实时日志流
|
||||
- **日志量趋势(按级别)**: 时间序列图表
|
||||
- **日志级别分布**: 按级别统计
|
||||
- **错误日志**: 只显示 ERROR 级别日志
|
||||
|
||||
#### 方式 2: 使用 Explore 功能
|
||||
|
||||
1. 在 Grafana 左侧菜单点击 **Explore** (指南针图标)
|
||||
2. 选择 **Loki** 数据源
|
||||
3. 输入 LogQL 查询语句(见下文)
|
||||
|
||||
## LogQL 查询示例
|
||||
|
||||
### 基础查询
|
||||
|
||||
```logql
|
||||
# 查询所有应用日志
|
||||
{job="functional-scaffold-app"}
|
||||
|
||||
# 查询特定级别的日志
|
||||
{job="functional-scaffold-app", level="ERROR"}
|
||||
{job="functional-scaffold-app", level="INFO"}
|
||||
|
||||
# 查询特定容器的日志
|
||||
{container="functional-scaffold-app-1"}
|
||||
```
|
||||
|
||||
### 文本过滤
|
||||
|
||||
```logql
|
||||
# 包含特定文本
|
||||
{job="functional-scaffold-app"} |= "request_id"
|
||||
|
||||
# 不包含特定文本
|
||||
{job="functional-scaffold-app"} != "healthz"
|
||||
|
||||
# 正则表达式匹配
|
||||
{job="functional-scaffold-app"} |~ "error|exception"
|
||||
|
||||
# 正则表达式不匹配
|
||||
{job="functional-scaffold-app"} !~ "debug|trace"
|
||||
```
|
||||
|
||||
### JSON 字段提取
|
||||
|
||||
```logql
|
||||
# 提取 request_id 字段
|
||||
{job="functional-scaffold-app"} | json | request_id != ""
|
||||
|
||||
# 提取并过滤特定 request_id
|
||||
{job="functional-scaffold-app"} | json | request_id = "abc123"
|
||||
|
||||
# 提取 logger 字段
|
||||
{job="functional-scaffold-app"} | json | logger = "functional_scaffold.api.routes"
|
||||
```
|
||||
|
||||
### 聚合查询
|
||||
|
||||
```logql
|
||||
# 统计日志数量
|
||||
count_over_time({job="functional-scaffold-app"}[5m])
|
||||
|
||||
# 按级别统计
|
||||
sum by (level) (count_over_time({job="functional-scaffold-app"}[5m]))
|
||||
|
||||
# 计算错误率
|
||||
sum(rate({job="functional-scaffold-app", level="ERROR"}[5m]))
|
||||
/
|
||||
sum(rate({job="functional-scaffold-app"}[5m]))
|
||||
```
|
||||
|
||||
## 日志收集模式
|
||||
|
||||
### 模式 1: Docker stdio 收集(默认,推荐)
|
||||
|
||||
**特点:**
|
||||
- 无需修改应用代码
|
||||
- 自动收集容器标准输出/错误
|
||||
- 性能影响极小
|
||||
- 配置简单
|
||||
|
||||
**工作原理:**
|
||||
1. 应用将日志输出到 stdout/stderr
|
||||
2. Docker Engine 捕获日志
|
||||
3. Promtail 通过 Docker API 读取日志
|
||||
4. 日志发送到 Loki 存储
|
||||
|
||||
**配置:**
|
||||
- 应用容器需要添加标签:
|
||||
```yaml
|
||||
labels:
|
||||
logging: "promtail"
|
||||
logging_jobname: "functional-scaffold-app"
|
||||
```
|
||||
|
||||
### 模式 2: Log 文件收集(备用)
|
||||
|
||||
**特点:**
|
||||
- 日志持久化到文件
|
||||
- 支持日志轮转
|
||||
- 适合需要本地日志文件的场景
|
||||
|
||||
**启用方式:**
|
||||
|
||||
1. 修改 `deployment/docker-compose.yml`:
|
||||
```yaml
|
||||
environment:
|
||||
- LOG_FILE_ENABLED=true
|
||||
- LOG_FILE_PATH=/var/log/app/app.log
|
||||
```
|
||||
|
||||
2. 重启服务:
|
||||
```bash
|
||||
docker-compose up -d app
|
||||
```
|
||||
|
||||
**日志文件配置:**
|
||||
- 最大文件大小: 100MB
|
||||
- 保留备份数: 5 个
|
||||
- 总存储空间: 最多 500MB
|
||||
|
||||
## 配置说明
|
||||
|
||||
### Loki 配置 (monitoring/loki.yaml)
|
||||
|
||||
```yaml
|
||||
limits_config:
|
||||
retention_period: 168h # 日志保留 7 天
|
||||
ingestion_rate_mb: 10 # 摄入速率限制 10MB/s
|
||||
ingestion_burst_size_mb: 20 # 突发大小 20MB
|
||||
```
|
||||
|
||||
**可调整参数:**
|
||||
- `retention_period`: 日志保留时间(默认 7 天)
|
||||
- `ingestion_rate_mb`: 每秒摄入速率限制
|
||||
- `ingestion_burst_size_mb`: 突发流量大小
|
||||
|
||||
### Promtail 配置 (monitoring/promtail.yaml)
|
||||
|
||||
**Docker stdio 收集配置:**
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: docker
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
filters:
|
||||
- name: label
|
||||
values: ["logging=promtail"]
|
||||
```
|
||||
|
||||
**文件收集配置:**
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: app_files
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost
|
||||
labels:
|
||||
job: functional-scaffold-app-files
|
||||
__path__: /var/log/app/*.log
|
||||
```
|
||||
|
||||
## 验证和测试
|
||||
|
||||
### 1. 检查服务状态
|
||||
|
||||
```bash
|
||||
# 查看所有服务
|
||||
docker-compose ps
|
||||
|
||||
# 检查 Loki 健康状态
|
||||
curl http://localhost:3100/ready
|
||||
|
||||
# 检查 Promtail 健康状态
|
||||
curl http://localhost:9080/ready
|
||||
```
|
||||
|
||||
### 2. 生成测试日志
|
||||
|
||||
```bash
|
||||
# 发送测试请求
|
||||
curl -X POST http://localhost:8111/invoke \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"algorithm": "PrimeChecker", "params": {"number": 17}}'
|
||||
```
|
||||
|
||||
### 3. 查询日志
|
||||
|
||||
```bash
|
||||
# 使用 Loki API 查询
|
||||
curl -G -s "http://localhost:3100/loki/api/v1/query_range" \
|
||||
--data-urlencode 'query={job="functional-scaffold-app"}' \
|
||||
--data-urlencode 'limit=10' \
|
||||
| jq '.data.result'
|
||||
```
|
||||
|
||||
### 4. 在 Grafana 中验证
|
||||
|
||||
1. 访问 http://localhost:3000/explore
|
||||
2. 选择 Loki 数据源
|
||||
3. 输入查询: `{job="functional-scaffold-app"}`
|
||||
4. 应该能看到应用日志
|
||||
|
||||
## 故障排查
|
||||
|
||||
### 问题 1: 看不到日志
|
||||
|
||||
**检查步骤:**
|
||||
|
||||
1. 确认 Promtail 正在运行:
|
||||
```bash
|
||||
docker-compose ps promtail
|
||||
```
|
||||
|
||||
2. 检查 Promtail 日志:
|
||||
```bash
|
||||
docker-compose logs promtail
|
||||
```
|
||||
|
||||
3. 确认应用容器有正确的标签:
|
||||
```bash
|
||||
docker inspect functional-scaffold-app-1 | grep -A 5 Labels
|
||||
```
|
||||
|
||||
4. 检查 Loki 是否接收到日志:
|
||||
```bash
|
||||
curl -G -s "http://localhost:3100/loki/api/v1/label/job/values" | jq
|
||||
```
|
||||
|
||||
### 问题 2: Promtail 无法访问 Docker socket
|
||||
|
||||
**错误信息:**
|
||||
```
|
||||
permission denied while trying to connect to the Docker daemon socket
|
||||
```
|
||||
|
||||
**解决方案:**
|
||||
|
||||
在 macOS/Linux 上,确保 Docker socket 权限正确:
|
||||
```bash
|
||||
sudo chmod 666 /var/run/docker.sock
|
||||
```
|
||||
|
||||
或者将 Promtail 容器添加到 docker 组(Linux):
|
||||
```yaml
|
||||
promtail:
|
||||
user: root
|
||||
group_add:
|
||||
- docker
|
||||
```
|
||||
|
||||
### 问题 3: 日志量过大
|
||||
|
||||
**症状:**
|
||||
- Loki 响应缓慢
|
||||
- 磁盘空间不足
|
||||
|
||||
**解决方案:**
|
||||
|
||||
1. 调整日志保留期:
|
||||
```yaml
|
||||
# monitoring/loki.yaml
|
||||
limits_config:
|
||||
retention_period: 72h # 改为 3 天
|
||||
```
|
||||
|
||||
2. 降低摄入速率限制:
|
||||
```yaml
|
||||
limits_config:
|
||||
ingestion_rate_mb: 5 # 降低到 5MB/s
|
||||
```
|
||||
|
||||
3. 添加日志过滤:
|
||||
```yaml
|
||||
# monitoring/promtail.yaml
|
||||
pipeline_stages:
|
||||
- match:
|
||||
selector: '{job="functional-scaffold-app"}'
|
||||
stages:
|
||||
- drop:
|
||||
expression: ".*healthz.*" # 丢弃健康检查日志
|
||||
```
|
||||
|
||||
### 问题 4: 文件模式下看不到日志
|
||||
|
||||
**检查步骤:**
|
||||
|
||||
1. 确认文件日志已启用:
|
||||
```bash
|
||||
docker-compose exec app env | grep LOG_FILE
|
||||
```
|
||||
|
||||
2. 检查日志文件是否存在:
|
||||
```bash
|
||||
docker-compose exec app ls -lh /var/log/app/
|
||||
```
|
||||
|
||||
3. 检查 Promtail 是否能访问日志文件:
|
||||
```bash
|
||||
docker-compose exec promtail ls -lh /var/log/app/
|
||||
```
|
||||
|
||||
## 性能优化
|
||||
|
||||
### 1. 减少日志量
|
||||
|
||||
**在应用层面:**
|
||||
- 调整日志级别为 WARNING 或 ERROR
|
||||
- 过滤掉不必要的日志(如健康检查)
|
||||
|
||||
```yaml
|
||||
# docker-compose.yml
|
||||
environment:
|
||||
- LOG_LEVEL=WARNING
|
||||
```
|
||||
|
||||
**在 Promtail 层面:**
|
||||
```yaml
|
||||
# monitoring/promtail.yaml
|
||||
pipeline_stages:
|
||||
- drop:
|
||||
expression: ".*healthz.*"
|
||||
drop_counter_reason: "healthcheck"
|
||||
```
|
||||
|
||||
### 2. 优化查询性能
|
||||
|
||||
**使用标签过滤:**
|
||||
```logql
|
||||
# 好:使用标签过滤(快)
|
||||
{job="functional-scaffold-app", level="ERROR"}
|
||||
|
||||
# 差:使用文本过滤(慢)
|
||||
{job="functional-scaffold-app"} |= "ERROR"
|
||||
```
|
||||
|
||||
**限制时间范围:**
|
||||
```logql
|
||||
# 查询最近 5 分钟
|
||||
{job="functional-scaffold-app"}[5m]
|
||||
|
||||
# 避免查询过长时间范围
|
||||
{job="functional-scaffold-app"}[7d] # 慢
|
||||
```
|
||||
|
||||
### 3. 存储优化
|
||||
|
||||
**定期清理旧数据:**
|
||||
```bash
|
||||
# Loki 会自动根据 retention_period 清理
|
||||
# 也可以手动清理
|
||||
docker-compose exec loki rm -rf /loki/chunks/*
|
||||
```
|
||||
|
||||
**监控磁盘使用:**
|
||||
```bash
|
||||
docker-compose exec loki du -sh /loki/chunks
|
||||
```
|
||||
|
||||
## 高级功能
|
||||
|
||||
### 1. 告警规则
|
||||
|
||||
在 Loki 中配置告警规则(需要 Loki Ruler):
|
||||
|
||||
```yaml
|
||||
# monitoring/loki-rules.yaml
|
||||
groups:
|
||||
- name: error_alerts
|
||||
interval: 1m
|
||||
rules:
|
||||
- alert: HighErrorRate
|
||||
expr: |
|
||||
sum(rate({job="functional-scaffold-app", level="ERROR"}[5m]))
|
||||
/
|
||||
sum(rate({job="functional-scaffold-app"}[5m]))
|
||||
> 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "错误率过高"
|
||||
description: "应用错误率超过 5%"
|
||||
```
|
||||
|
||||
### 2. 日志导出
|
||||
|
||||
**导出为 JSON:**
|
||||
```bash
|
||||
curl -G -s "http://localhost:3100/loki/api/v1/query_range" \
|
||||
--data-urlencode 'query={job="functional-scaffold-app"}' \
|
||||
--data-urlencode 'start=2024-01-01T00:00:00Z' \
|
||||
--data-urlencode 'end=2024-01-02T00:00:00Z' \
|
||||
| jq '.data.result' > logs.json
|
||||
```
|
||||
|
||||
**导出为文本:**
|
||||
```bash
|
||||
curl -G -s "http://localhost:3100/loki/api/v1/query_range" \
|
||||
--data-urlencode 'query={job="functional-scaffold-app"}' \
|
||||
| jq -r '.data.result[].values[][1]' > logs.txt
|
||||
```
|
||||
|
||||
### 3. 与 Prometheus 集成
|
||||
|
||||
在 Grafana 仪表板中同时显示日志和指标:
|
||||
|
||||
```json
|
||||
{
|
||||
"panels": [
|
||||
{
|
||||
"title": "错误率和错误日志",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"expr": "rate(http_requests_total{status=\"error\"}[5m])"
|
||||
},
|
||||
{
|
||||
"datasource": "Loki",
|
||||
"expr": "{job=\"functional-scaffold-app\", level=\"ERROR\"}"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## 最佳实践
|
||||
|
||||
### 1. 日志格式
|
||||
|
||||
**使用结构化日志(JSON):**
|
||||
```python
|
||||
logger.info("处理请求", extra={
|
||||
"request_id": "abc123",
|
||||
"user_id": "user456",
|
||||
"duration": 0.123
|
||||
})
|
||||
```
|
||||
|
||||
**输出:**
|
||||
```json
|
||||
{
|
||||
"asctime": "2024-01-01 12:00:00,000",
|
||||
"name": "functional_scaffold.api.routes",
|
||||
"levelname": "INFO",
|
||||
"message": "处理请求",
|
||||
"request_id": "abc123",
|
||||
"user_id": "user456",
|
||||
"duration": 0.123
|
||||
}
|
||||
```
|
||||
|
||||
### 2. 标签策略
|
||||
|
||||
**好的标签:**
|
||||
- 低基数(值的种类少)
|
||||
- 用于过滤和分组
|
||||
- 例如:`level`, `logger`, `container`
|
||||
|
||||
**不好的标签:**
|
||||
- 高基数(值的种类多)
|
||||
- 例如:`request_id`, `user_id`, `timestamp`
|
||||
|
||||
**正确做法:**
|
||||
```logql
|
||||
# 使用标签过滤
|
||||
{job="functional-scaffold-app", level="ERROR"}
|
||||
|
||||
# 使用 JSON 提取高基数字段
|
||||
{job="functional-scaffold-app"} | json | request_id = "abc123"
|
||||
```
|
||||
|
||||
### 3. 查询优化
|
||||
|
||||
**使用时间范围:**
|
||||
```logql
|
||||
{job="functional-scaffold-app"}[5m] # 最近 5 分钟
|
||||
```
|
||||
|
||||
**限制返回行数:**
|
||||
```logql
|
||||
# 注意:LogQL 没有 limit 管道;返回行数通过 API 的 limit 参数或 Grafana 的 "Line limit" 设置控制
{job="functional-scaffold-app"}
|
||||
```
|
||||
|
||||
**使用聚合减少数据量:**
|
||||
```logql
|
||||
sum by (level) (count_over_time({job="functional-scaffold-app"}[5m]))
|
||||
```
|
||||
|
||||
## 参考资料
|
||||
|
||||
- [Loki 官方文档](https://grafana.com/docs/loki/latest/)
|
||||
- [LogQL 查询语言](https://grafana.com/docs/loki/latest/logql/)
|
||||
- [Promtail 配置](https://grafana.com/docs/loki/latest/clients/promtail/configuration/)
|
||||
- [Grafana Explore](https://grafana.com/docs/grafana/latest/explore/)
|
||||
|
||||
## 总结
|
||||
|
||||
本项目的 Loki 集成提供了:
|
||||
|
||||
✅ **开箱即用** - 无需额外配置即可收集日志
|
||||
✅ **双模式支持** - Docker stdio(默认)和文件收集
|
||||
✅ **自动化配置** - 数据源和仪表板自动加载
|
||||
✅ **结构化日志** - JSON 格式,支持字段提取
|
||||
✅ **高性能** - 低资源占用,快速查询
|
||||
✅ **易于扩展** - 支持自定义标签和过滤规则
|
||||
|
||||
如有问题,请参考故障排查章节或查阅官方文档。
|
||||
237
docs/loki-quick-reference.md
Normal file
237
docs/loki-quick-reference.md
Normal file
@@ -0,0 +1,237 @@
|
||||
# Loki 快速参考
|
||||
|
||||
## 常用命令
|
||||
|
||||
### 服务管理
|
||||
|
||||
```bash
|
||||
# 启动所有服务
|
||||
cd deployment && docker-compose up -d
|
||||
|
||||
# 查看服务状态
|
||||
docker-compose ps
|
||||
|
||||
# 查看日志
|
||||
docker-compose logs -f loki
|
||||
docker-compose logs -f promtail
|
||||
|
||||
# 重启服务
|
||||
docker-compose restart loki promtail
|
||||
|
||||
# 停止服务
|
||||
docker-compose down
|
||||
```
|
||||
|
||||
### 健康检查
|
||||
|
||||
```bash
|
||||
# Loki
|
||||
curl http://localhost:3100/ready
|
||||
|
||||
# Promtail
|
||||
curl http://localhost:9080/ready
|
||||
|
||||
# 验证脚本
|
||||
./scripts/verify_loki.sh
|
||||
```
|
||||
|
||||
## 常用 LogQL 查询
|
||||
|
||||
### 基础查询
|
||||
|
||||
```logql
|
||||
# 所有日志
|
||||
{job="functional-scaffold-app"}
|
||||
|
||||
# 错误日志
|
||||
{job="functional-scaffold-app", level="ERROR"}
|
||||
|
||||
# 特定时间范围
|
||||
{job="functional-scaffold-app"}[5m]
|
||||
```
|
||||
|
||||
### 文本过滤
|
||||
|
||||
```logql
|
||||
# 包含文本
|
||||
{job="functional-scaffold-app"} |= "error"
|
||||
|
||||
# 不包含文本
|
||||
{job="functional-scaffold-app"} != "healthz"
|
||||
|
||||
# 正则匹配
|
||||
{job="functional-scaffold-app"} |~ "error|exception"
|
||||
```
|
||||
|
||||
### JSON 提取
|
||||
|
||||
```logql
|
||||
# 提取 request_id
|
||||
{job="functional-scaffold-app"} | json | request_id != ""
|
||||
|
||||
# 按 request_id 过滤
|
||||
{job="functional-scaffold-app"} | json | request_id = "abc123"
|
||||
```
|
||||
|
||||
### 聚合统计
|
||||
|
||||
```logql
|
||||
# 日志数量
|
||||
count_over_time({job="functional-scaffold-app"}[5m])
|
||||
|
||||
# 按级别统计
|
||||
sum by (level) (count_over_time({job="functional-scaffold-app"}[5m]))
|
||||
|
||||
# 错误率
|
||||
sum(rate({job="functional-scaffold-app", level="ERROR"}[5m]))
|
||||
/
|
||||
sum(rate({job="functional-scaffold-app"}[5m]))
|
||||
```
|
||||
|
||||
## API 查询
|
||||
|
||||
### 查询日志
|
||||
|
||||
```bash
|
||||
# 查询最近的日志
|
||||
curl -G -s "http://localhost:3100/loki/api/v1/query_range" \
|
||||
--data-urlencode 'query={job="functional-scaffold-app"}' \
|
||||
--data-urlencode 'limit=10' \
|
||||
| jq '.data.result'
|
||||
|
||||
# 查询错误日志
|
||||
curl -G -s "http://localhost:3100/loki/api/v1/query_range" \
|
||||
--data-urlencode 'query={job="functional-scaffold-app", level="ERROR"}' \
|
||||
| jq '.data.result'
|
||||
```
|
||||
|
||||
### 查询标签
|
||||
|
||||
```bash
|
||||
# 查询所有 job 标签值
|
||||
curl -s "http://localhost:3100/loki/api/v1/label/job/values" | jq
|
||||
|
||||
# 查询所有 level 标签值
|
||||
curl -s "http://localhost:3100/loki/api/v1/label/level/values" | jq
|
||||
```
|
||||
|
||||
## 配置切换
|
||||
|
||||
### 启用文件日志
|
||||
|
||||
编辑 `deployment/docker-compose.yml`:
|
||||
|
||||
```yaml
|
||||
environment:
|
||||
- LOG_FILE_ENABLED=true
|
||||
```
|
||||
|
||||
重启服务:
|
||||
|
||||
```bash
|
||||
docker-compose up -d app
|
||||
```
|
||||
|
||||
### 调整日志级别
|
||||
|
||||
编辑 `deployment/docker-compose.yml`:
|
||||
|
||||
```yaml
|
||||
environment:
|
||||
- LOG_LEVEL=WARNING # DEBUG, INFO, WARNING, ERROR, CRITICAL
|
||||
```
|
||||
|
||||
### 修改保留期
|
||||
|
||||
编辑 `monitoring/loki.yaml`:
|
||||
|
||||
```yaml
|
||||
limits_config:
|
||||
retention_period: 72h # 改为 3 天
|
||||
```
|
||||
|
||||
重启 Loki:
|
||||
|
||||
```bash
|
||||
docker-compose restart loki
|
||||
```
|
||||
|
||||
## 访问地址
|
||||
|
||||
| 服务 | 地址 | 凭据 |
|
||||
|------|------|------|
|
||||
| Grafana | http://localhost:3000 | admin/admin |
|
||||
| Loki API | http://localhost:3100 | - |
|
||||
| Promtail | http://localhost:9080 | - |
|
||||
| Prometheus | http://localhost:9090 | - |
|
||||
| App | http://localhost:8111 | - |
|
||||
|
||||
## 故障排查
|
||||
|
||||
### 看不到日志
|
||||
|
||||
```bash
|
||||
# 1. 检查 Promtail 日志
|
||||
docker-compose logs promtail | tail -50
|
||||
|
||||
# 2. 检查容器标签
|
||||
docker inspect deployment-app-1 | grep -A 5 Labels
|
||||
|
||||
# 3. 查询 Loki
|
||||
curl -s "http://localhost:3100/loki/api/v1/label/job/values" | jq
|
||||
```
|
||||
|
||||
### Docker socket 权限
|
||||
|
||||
```bash
|
||||
sudo chmod 666 /var/run/docker.sock
|
||||
```
|
||||
|
||||
### 清理日志数据
|
||||
|
||||
```bash
|
||||
# 停止 Loki
|
||||
docker-compose stop loki
|
||||
|
||||
# 清理数据
|
||||
# 注意:Loki 已停止,exec 无法进入已停止的容器;用一次性容器清理数据卷
docker-compose run --rm --entrypoint sh loki -c 'rm -rf /loki/chunks/*'
|
||||
|
||||
# 重启 Loki
|
||||
docker-compose start loki
|
||||
```
|
||||
|
||||
## 性能优化
|
||||
|
||||
### 减少日志量
|
||||
|
||||
```yaml
|
||||
# docker-compose.yml
|
||||
environment:
|
||||
- LOG_LEVEL=WARNING # 只记录警告和错误
|
||||
```
|
||||
|
||||
### 过滤健康检查日志
|
||||
|
||||
编辑 `monitoring/promtail.yaml`:
|
||||
|
||||
```yaml
|
||||
pipeline_stages:
|
||||
- drop:
|
||||
expression: ".*healthz.*"
|
||||
```
|
||||
|
||||
### 限制查询范围
|
||||
|
||||
```logql
|
||||
# 好:限制时间范围
|
||||
{job="functional-scaffold-app"}[5m]
|
||||
|
||||
# 差:查询所有时间
|
||||
{job="functional-scaffold-app"}
|
||||
```
|
||||
|
||||
## 文档链接
|
||||
|
||||
- 完整文档: `docs/loki-integration.md`
|
||||
- 实施总结: `docs/loki-implementation-summary.md`
|
||||
- 验证脚本: `scripts/verify_loki.sh`
|
||||
126
docs/metrics-filtering-changelog.md
Normal file
126
docs/metrics-filtering-changelog.md
Normal file
@@ -0,0 +1,126 @@
|
||||
# 指标过滤和路径规范化
|
||||
|
||||
## 变更说明
|
||||
|
||||
本次修改优化了 HTTP 请求指标的记录逻辑,主要包括两个方面:
|
||||
|
||||
### 1. 跳过健康检查端点
|
||||
|
||||
以下端点不再记录到 Prometheus 指标中:
|
||||
- `/metrics` - 指标端点本身
|
||||
- `/healthz` - 存活检查
|
||||
- `/readyz` - 就绪检查
|
||||
|
||||
**原因**:这些端点通常被频繁调用(如 Kubernetes 健康检查、Prometheus 抓取),但对业务监控意义不大,会产生大量噪音数据。
|
||||
|
||||
### 2. 路径参数规范化
|
||||
|
||||
带有路径参数的端点会被规范化为模板形式:
|
||||
|
||||
| 原始路径 | 规范化后 |
|
||||
|---------|---------|
|
||||
| `GET /jobs/a1b2c3d4e5f6` | `GET /jobs/{job_id}` |
|
||||
| `GET /jobs/xyz123456789` | `GET /jobs/{job_id}` |
|
||||
|
||||
**原因**:避免因为不同的路径参数值产生过多的指标标签,导致指标基数爆炸(cardinality explosion),影响 Prometheus 性能。
|
||||
|
||||
## 实现细节
|
||||
|
||||
### 代码修改
|
||||
|
||||
**文件:`src/functional_scaffold/main.py`**
|
||||
|
||||
1. 添加 `normalize_path()` 函数:
|
||||
```python
|
||||
def normalize_path(path: str) -> str:
|
||||
"""规范化路径,将路径参数替换为模板形式"""
|
||||
if path.startswith("/jobs/") and len(path) > 6:
|
||||
return "/jobs/{job_id}"
|
||||
return path
|
||||
```
|
||||
|
||||
2. 修改 `track_metrics` 中间件:
|
||||
```python
|
||||
# 跳过不需要记录指标的端点
|
||||
skip_paths = {"/metrics", "/readyz", "/healthz"}
|
||||
if request.url.path in skip_paths:
|
||||
return await call_next(request)
|
||||
|
||||
# 使用规范化后的路径记录指标
|
||||
normalized_path = normalize_path(request.url.path)
|
||||
incr("http_requests_total",
|
||||
{"method": request.method, "endpoint": normalized_path, "status": status})
|
||||
```
|
||||
|
||||
### 测试覆盖
|
||||
|
||||
**文件:`tests/test_middleware.py`**
|
||||
|
||||
新增 6 个测试用例:
|
||||
- `test_normalize_jobs_path` - 测试任务路径规范化
|
||||
- `test_normalize_other_paths` - 测试其他路径保持不变
|
||||
- `test_normalize_jobs_root` - 测试 /jobs 根路径
|
||||
- `test_skip_health_endpoints` - 测试跳过健康检查端点
|
||||
- `test_record_normal_endpoints` - 测试记录普通端点
|
||||
- `test_normalize_job_path` - 测试规范化任务路径的集成测试
|
||||
|
||||
所有测试通过:✅ 56/56 passed
|
||||
|
||||
## 验证方法
|
||||
|
||||
### 手动测试
|
||||
|
||||
使用提供的测试脚本:
|
||||
```bash
|
||||
./scripts/test_metrics_filtering.sh
|
||||
```
|
||||
|
||||
### 预期结果
|
||||
|
||||
访问 `/metrics` 端点后,应该看到:
|
||||
|
||||
✅ **应该出现的指标:**
|
||||
```
|
||||
http_requests_total{method="POST",endpoint="/invoke",status="success"} 1
|
||||
http_requests_total{method="GET",endpoint="/jobs/{job_id}",status="error"} 2
|
||||
```
|
||||
|
||||
❌ **不应该出现的指标:**
|
||||
```
|
||||
http_requests_total{method="GET",endpoint="/healthz",...}
|
||||
http_requests_total{method="GET",endpoint="/readyz",...}
|
||||
http_requests_total{method="GET",endpoint="/metrics",...}
|
||||
http_requests_total{method="GET",endpoint="/jobs/a1b2c3d4e5f6",...}
|
||||
```
|
||||
|
||||
## 扩展性
|
||||
|
||||
如果需要添加更多路径规范化规则,只需修改 `normalize_path()` 函数:
|
||||
|
||||
```python
|
||||
def normalize_path(path: str) -> str:
|
||||
"""规范化路径,将路径参数替换为模板形式"""
|
||||
# 任务路径
|
||||
if path.startswith("/jobs/") and len(path) > 6:
|
||||
return "/jobs/{job_id}"
|
||||
|
||||
# 用户路径(示例)
|
||||
if path.startswith("/users/") and len(path) > 7:
|
||||
return "/users/{user_id}"
|
||||
|
||||
# 其他路径保持不变
|
||||
return path
|
||||
```
|
||||
|
||||
## 影响范围
|
||||
|
||||
- ✅ 不影响现有功能
|
||||
- ✅ 不影响 API 行为
|
||||
- ✅ 仅影响指标记录逻辑
|
||||
- ✅ 向后兼容
|
||||
- ✅ 所有测试通过
|
||||
|
||||
## 相关文档
|
||||
|
||||
- [监控指南](../docs/monitoring.md) - 已更新指标说明
|
||||
- [测试脚本](../scripts/test_metrics_filtering.sh) - 手动验证脚本
|
||||
350
docs/monitoring.md
Normal file
350
docs/monitoring.md
Normal file
@@ -0,0 +1,350 @@
|
||||
# 监控指南
|
||||
|
||||
本文档介绍 FunctionalScaffold 的监控体系,包括指标收集、可视化和告警配置。
|
||||
|
||||
## 监控架构
|
||||
|
||||
```
|
||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||
│ 应用实例 1 │ │ 应用实例 2 │ │ 应用实例 N │
|
||||
│ /metrics 端点 │ │ /metrics 端点 │ │ /metrics 端点 │
|
||||
└────────┬────────┘ └────────┬────────┘ └────────┬────────┘
|
||||
│ │ │
|
||||
│ 写入指标到 Redis │ │
|
||||
└───────────────────────┼───────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────┐
|
||||
│ Redis │
|
||||
│ (指标聚合存储) │
|
||||
└────────────┬────────────┘
|
||||
│
|
||||
│ 读取并导出
|
||||
▼
|
||||
┌─────────────────────────┐
|
||||
│ Prometheus │
|
||||
│ (抓取 /metrics) │
|
||||
└────────────┬────────────┘
|
||||
│
|
||||
│ 查询
|
||||
▼
|
||||
┌─────────────────────────┐
|
||||
│ Grafana │
|
||||
│ (可视化展示) │
|
||||
└─────────────────────────┘
|
||||
```
|
||||
|
||||
## 快速开始
|
||||
|
||||
### 启动监控服务
|
||||
|
||||
```bash
|
||||
cd deployment
|
||||
docker-compose up -d redis prometheus grafana
|
||||
```
|
||||
|
||||
### 访问地址
|
||||
|
||||
| 服务 | 地址 | 默认账号 |
|
||||
|------|------|---------|
|
||||
| 应用 Metrics | http://localhost:8000/metrics | - |
|
||||
| Prometheus | http://localhost:9090 | - |
|
||||
| Grafana | http://localhost:3000 | admin/admin |
|
||||
|
||||
## 指标说明
|
||||
|
||||
### HTTP 请求指标
|
||||
|
||||
| 指标 | 类型 | 标签 | 描述 |
|
||||
|------|------|------|------|
|
||||
| `http_requests_total` | Counter | method, endpoint, status | HTTP 请求总数 |
|
||||
| `http_request_duration_seconds` | Histogram | method, endpoint | HTTP 请求延迟分布 |
|
||||
| `http_requests_in_progress` | Gauge | - | 当前进行中的请求数 |
|
||||
|
||||
**注意事项:**
|
||||
|
||||
1. **跳过的端点**:以下端点不会被记录到指标中,以减少噪音:
|
||||
- `/metrics` - 指标端点本身
|
||||
- `/healthz` - 存活检查
|
||||
- `/readyz` - 就绪检查
|
||||
|
||||
2. **路径规范化**:带有路径参数的端点会被规范化为模板形式:
|
||||
- `GET /jobs/a1b2c3d4e5f6` → `GET /jobs/{job_id}`
|
||||
- `GET /jobs/xyz123456789` → `GET /jobs/{job_id}`
|
||||
|
||||
这样可以避免因为不同的路径参数值产生过多的指标标签,导致指标基数爆炸。
|
||||
|
||||
### 算法执行指标
|
||||
|
||||
| 指标 | 类型 | 标签 | 描述 |
|
||||
|------|------|------|------|
|
||||
| `algorithm_executions_total` | Counter | algorithm, status | 算法执行总数 |
|
||||
| `algorithm_execution_duration_seconds` | Histogram | algorithm | 算法执行延迟分布 |
|
||||
|
||||
### 异步任务指标
|
||||
|
||||
| 指标 | 类型 | 标签 | 描述 |
|
||||
|------|------|------|------|
|
||||
| `jobs_created_total` | Counter | algorithm | 创建的任务总数 |
|
||||
| `jobs_completed_total` | Counter | algorithm, status | 完成的任务总数 |
|
||||
| `job_execution_duration_seconds` | Histogram | algorithm | 任务执行时间分布 |
|
||||
| `webhook_deliveries_total` | Counter | status | Webhook 发送总数 |
|
||||
|
||||
## Prometheus 查询示例
|
||||
|
||||
### 基础查询
|
||||
|
||||
```promql
|
||||
# 每秒请求数 (QPS)
|
||||
rate(http_requests_total[5m])
|
||||
|
||||
# 按端点分组的 QPS
|
||||
sum(rate(http_requests_total[5m])) by (endpoint)
|
||||
|
||||
# 请求成功率
|
||||
sum(rate(http_requests_total{status="success"}[5m]))
|
||||
/ sum(rate(http_requests_total[5m]))
|
||||
|
||||
# 当前并发请求数
|
||||
http_requests_in_progress
|
||||
```
|
||||
|
||||
### 延迟分析
|
||||
|
||||
```promql
|
||||
# P50 延迟
|
||||
histogram_quantile(0.50, rate(http_request_duration_seconds_bucket[5m]))
|
||||
|
||||
# P95 延迟
|
||||
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
|
||||
|
||||
# P99 延迟
|
||||
histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))
|
||||
|
||||
# 平均延迟
|
||||
rate(http_request_duration_seconds_sum[5m])
|
||||
/ rate(http_request_duration_seconds_count[5m])
|
||||
```
|
||||
|
||||
### 算法分析
|
||||
|
||||
```promql
|
||||
# 算法执行速率
|
||||
sum(rate(algorithm_executions_total[5m])) by (algorithm)
|
||||
|
||||
# 算法失败率
|
||||
sum(rate(algorithm_executions_total{status="error"}[5m]))
|
||||
/ sum(rate(algorithm_executions_total[5m]))
|
||||
|
||||
# 算法 P95 延迟
|
||||
histogram_quantile(0.95,
|
||||
sum(rate(algorithm_execution_duration_seconds_bucket[5m])) by (le, algorithm)
|
||||
)
|
||||
```
|
||||
|
||||
### 异步任务分析
|
||||
|
||||
```promql
|
||||
# 任务创建速率
|
||||
sum(rate(jobs_created_total[5m])) by (algorithm)
|
||||
|
||||
# 任务成功率
|
||||
sum(rate(jobs_completed_total{status="completed"}[5m]))
|
||||
/ sum(rate(jobs_completed_total[5m]))
|
||||
|
||||
# 任务积压(创建速率 - 完成速率)
|
||||
sum(rate(jobs_created_total[5m])) - sum(rate(jobs_completed_total[5m]))
|
||||
|
||||
# Webhook 成功率
|
||||
sum(rate(webhook_deliveries_total{status="success"}[5m]))
|
||||
/ sum(rate(webhook_deliveries_total[5m]))
|
||||
```
|
||||
|
||||
## Grafana 仪表板
|
||||
|
||||
### 导入仪表板
|
||||
|
||||
1. 打开 Grafana: http://localhost:3000
|
||||
2. 登录(admin/admin)
|
||||
3. 进入 **Dashboards** → **Import**
|
||||
4. 上传文件:`monitoring/grafana/dashboard.json`
|
||||
5. 选择 Prometheus 数据源
|
||||
6. 点击 **Import**
|
||||
|
||||
### 仪表板面板
|
||||
|
||||
#### HTTP 监控区域
|
||||
- **HTTP 请求速率 (QPS)** - 每秒请求数趋势
|
||||
- **HTTP 请求延迟** - P50/P95/P99 延迟趋势
|
||||
- **请求成功率** - 成功率仪表盘
|
||||
- **当前并发请求数** - 实时并发数
|
||||
- **HTTP 请求总数** - 累计请求数
|
||||
- **请求分布** - 按端点/状态的饼图
|
||||
|
||||
#### 算法监控区域
|
||||
- **算法执行速率** - 每秒执行次数
|
||||
- **算法执行延迟** - P50/P95/P99 延迟
|
||||
- **算法执行总数** - 累计执行数
|
||||
|
||||
#### 异步任务监控区域
|
||||
- **任务创建总数** - 累计创建的任务数
|
||||
- **任务完成总数** - 累计完成的任务数
|
||||
- **任务失败总数** - 累计失败的任务数
|
||||
- **任务成功率** - 成功率仪表盘
|
||||
- **异步任务速率** - 创建和完成速率趋势
|
||||
- **异步任务执行延迟** - P50/P95/P99 延迟
|
||||
- **任务状态分布** - 按状态的饼图
|
||||
- **Webhook 发送状态** - 成功/失败分布
|
||||
|
||||
## 告警配置
|
||||
|
||||
### 告警规则
|
||||
|
||||
告警规则定义在 `monitoring/alerts/rules.yaml`:
|
||||
|
||||
```yaml
|
||||
groups:
|
||||
- name: functional_scaffold_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# 高错误率告警
|
||||
- alert: HighErrorRate
|
||||
expr: rate(http_requests_total{status="error"}[5m]) > 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "检测到高错误率"
|
||||
description: "端点 {{ $labels.endpoint }} 的错误率为 {{ $value }} 请求/秒"
|
||||
|
||||
# 高延迟告警
|
||||
- alert: HighLatency
|
||||
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "检测到高延迟"
|
||||
description: "端点 {{ $labels.endpoint }} 的 P95 延迟为 {{ $value }}s"
|
||||
|
||||
# 服务不可用告警
|
||||
- alert: ServiceDown
|
||||
expr: up{job="functional-scaffold"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "服务不可用"
|
||||
description: "FunctionalScaffold 服务已停止超过 1 分钟"
|
||||
|
||||
# 异步任务失败率告警
|
||||
- alert: HighJobFailureRate
|
||||
expr: rate(jobs_completed_total{status="failed"}[5m]) / rate(jobs_completed_total[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "异步任务失败率过高"
|
||||
description: "算法 {{ $labels.algorithm }} 的异步任务失败率超过 10%"
|
||||
|
||||
# 任务积压告警
|
||||
- alert: JobBacklog
|
||||
expr: sum(rate(jobs_created_total[5m])) - sum(rate(jobs_completed_total[5m])) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "异步任务积压"
|
||||
description: "任务创建速率超过完成速率,可能存在积压"
|
||||
```
|
||||
|
||||
### 告警级别
|
||||
|
||||
| 级别 | 描述 | 响应时间 |
|
||||
|------|------|---------|
|
||||
| critical | 严重告警,服务不可用 | 立即响应 |
|
||||
| warning | 警告,性能下降或异常 | 1 小时内响应 |
|
||||
| info | 信息,需要关注 | 工作时间内响应 |
|
||||
|
||||
## 自定义指标
|
||||
|
||||
### 添加新指标
|
||||
|
||||
1. 在 `config/metrics.yaml` 中定义:
|
||||
|
||||
```yaml
|
||||
custom_metrics:
|
||||
my_custom_counter:
|
||||
name: "my_custom_counter"
|
||||
type: counter
|
||||
description: "我的自定义计数器"
|
||||
labels: [label1, label2]
|
||||
|
||||
my_custom_histogram:
|
||||
name: "my_custom_histogram"
|
||||
type: histogram
|
||||
description: "我的自定义直方图"
|
||||
labels: [label1]
|
||||
buckets: [0.1, 0.5, 1, 5, 10]
|
||||
```
|
||||
|
||||
2. 在代码中使用:
|
||||
|
||||
```python
|
||||
from functional_scaffold.core.metrics_unified import incr, observe
|
||||
|
||||
# 增加计数器
|
||||
incr("my_custom_counter", {"label1": "value1", "label2": "value2"})
|
||||
|
||||
# 记录直方图
|
||||
observe("my_custom_histogram", {"label1": "value1"}, 0.5)
|
||||
```
|
||||
|
||||
## 故障排查
|
||||
|
||||
### 指标不显示
|
||||
|
||||
1. 检查应用 metrics 端点:
|
||||
```bash
|
||||
curl http://localhost:8000/metrics
|
||||
```
|
||||
|
||||
2. 检查 Redis 连接:
|
||||
```bash
|
||||
redis-cli ping
|
||||
```
|
||||
|
||||
3. 检查 Prometheus 抓取状态:
|
||||
- 访问 http://localhost:9090/targets
|
||||
- 确认 functional-scaffold 目标状态为 UP
|
||||
|
||||
### Grafana 无数据
|
||||
|
||||
1. 检查数据源配置:
|
||||
- URL 应为 `http://prometheus:9090`(容器内部)
|
||||
- 不是 `http://localhost:9090`
|
||||
|
||||
2. 检查时间范围:
|
||||
- 确保选择了正确的时间范围
|
||||
- 尝试 "Last 5 minutes"
|
||||
|
||||
3. 生成测试流量:
|
||||
```bash
|
||||
./scripts/generate_traffic.sh
|
||||
```
|
||||
|
||||
### 告警不触发
|
||||
|
||||
1. 检查 Prometheus 规则加载:
|
||||
- 访问 http://localhost:9090/rules
|
||||
- 确认规则已加载
|
||||
|
||||
2. 检查告警状态:
|
||||
- 访问 http://localhost:9090/alerts
|
||||
- 查看告警是否处于 pending 或 firing 状态
|
||||
|
||||
## 参考资料
|
||||
|
||||
- [Prometheus 文档](https://prometheus.io/docs/)
|
||||
- [Grafana 文档](https://grafana.com/docs/)
|
||||
- [PromQL 查询语言](https://prometheus.io/docs/prometheus/latest/querying/basics/)
|
||||
258
monitoring/README.md
Normal file
258
monitoring/README.md
Normal file
@@ -0,0 +1,258 @@
|
||||
# Monitoring 目录说明
|
||||
|
||||
本目录包含所有监控和日志收集相关的配置文件。
|
||||
|
||||
## 目录结构
|
||||
|
||||
```
|
||||
monitoring/
|
||||
├── alerts/ # Prometheus 告警规则
|
||||
│ └── rules.yaml # 告警规则配置
|
||||
├── grafana/ # Grafana 配置
|
||||
│ ├── datasources/ # 数据源自动配置
|
||||
│ │ ├── prometheus.yaml # Prometheus 数据源
|
||||
│ │ └── loki.yaml # Loki 数据源
|
||||
│ └── dashboards/ # 仪表板自动加载
|
||||
│ ├── provider.yaml # Dashboard provider 配置
|
||||
│ ├── dashboard.json # 指标监控仪表板
|
||||
│ └── logs-dashboard.json # 日志监控仪表板
|
||||
├── loki.yaml # Loki 日志存储配置
|
||||
├── promtail.yaml # Promtail 日志采集配置
|
||||
└── prometheus.yml # Prometheus 指标收集配置
|
||||
```
|
||||
|
||||
## 配置文件说明
|
||||
|
||||
### Prometheus 配置
|
||||
|
||||
**文件**: `prometheus.yml`
|
||||
|
||||
Prometheus 指标收集配置,包括:
|
||||
- 抓取间隔: 5 秒
|
||||
- 目标: app 服务的 `/metrics` 端点
|
||||
- 告警规则: 从 `alerts/` 目录加载
|
||||
|
||||
### Loki 配置
|
||||
|
||||
**文件**: `loki.yaml`
|
||||
|
||||
Loki 日志存储配置,包括:
|
||||
- 存储方式: 本地文件系统
|
||||
- 日志保留期: 7 天
|
||||
- 摄入速率限制: 10MB/s
|
||||
- 自动压缩和清理
|
||||
|
||||
**关键配置**:
|
||||
```yaml
|
||||
limits_config:
|
||||
retention_period: 168h # 7 天
|
||||
ingestion_rate_mb: 10 # 10MB/s
|
||||
```
|
||||
|
||||
### Promtail 配置
|
||||
|
||||
**文件**: `promtail.yaml`
|
||||
|
||||
Promtail 日志采集配置,支持两种模式:
|
||||
|
||||
**模式 1: Docker stdio 收集(默认)**
|
||||
- 通过 Docker API 自动发现容器
|
||||
- 过滤带有 `logging=promtail` 标签的容器
|
||||
- 自动解析 JSON 日志
|
||||
|
||||
**模式 2: 文件收集(备用)**
|
||||
- 从 `/var/log/app/*.log` 读取日志文件
|
||||
- 支持日志轮转
|
||||
- 需要设置 `LOG_FILE_ENABLED=true`
|
||||
|
||||
### Grafana Provisioning
|
||||
|
||||
**数据源** (`grafana/datasources/`)
|
||||
|
||||
自动配置 Grafana 数据源:
|
||||
- `prometheus.yaml`: Prometheus 数据源(默认)
|
||||
- `loki.yaml`: Loki 数据源
|
||||
|
||||
**仪表板** (`grafana/dashboards/`)
|
||||
|
||||
自动加载 Grafana 仪表板:
|
||||
- `provider.yaml`: Dashboard provider 配置
|
||||
- `dashboard.json`: 指标监控仪表板(HTTP 请求、算法执行等)
|
||||
- `logs-dashboard.json`: 日志监控仪表板(日志流、错误日志等)
|
||||
|
||||
### 告警规则
|
||||
|
||||
**文件**: `alerts/rules.yaml`
|
||||
|
||||
Prometheus 告警规则,包括:
|
||||
- 高错误率告警
|
||||
- 高延迟告警
|
||||
- 服务不可用告警
|
||||
|
||||
## 修改配置
|
||||
|
||||
### 调整日志保留期
|
||||
|
||||
编辑 `loki.yaml`:
|
||||
|
||||
```yaml
|
||||
limits_config:
|
||||
retention_period: 72h # 改为 3 天
|
||||
```
|
||||
|
||||
重启 Loki:
|
||||
|
||||
```bash
|
||||
cd deployment
|
||||
docker-compose restart loki
|
||||
```
|
||||
|
||||
### 调整指标抓取间隔
|
||||
|
||||
编辑 `prometheus.yml`:
|
||||
|
||||
```yaml
|
||||
global:
|
||||
scrape_interval: 10s # 改为 10 秒
|
||||
```
|
||||
|
||||
重启 Prometheus:
|
||||
|
||||
```bash
|
||||
cd deployment
|
||||
docker-compose restart prometheus
|
||||
```
|
||||
|
||||
### 添加新的告警规则
|
||||
|
||||
编辑 `alerts/rules.yaml`,添加新规则:
|
||||
|
||||
```yaml
|
||||
groups:
|
||||
- name: my_alerts
|
||||
rules:
|
||||
- alert: MyAlert
|
||||
expr: my_metric > 100
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "我的告警"
|
||||
```
|
||||
|
||||
重启 Prometheus:
|
||||
|
||||
```bash
|
||||
cd deployment
|
||||
docker-compose restart prometheus
|
||||
```
|
||||
|
||||
### 添加新的仪表板
|
||||
|
||||
1. 在 Grafana UI 中创建仪表板
|
||||
2. 导出为 JSON
|
||||
3. 保存到 `grafana/dashboards/my-dashboard.json`
|
||||
4. 重启 Grafana(或等待自动重载)
|
||||
|
||||
```bash
|
||||
cd deployment
|
||||
docker-compose restart grafana
|
||||
```
|
||||
|
||||
## 验证配置
|
||||
|
||||
### 检查 Prometheus 配置
|
||||
|
||||
```bash
|
||||
# 访问 Prometheus UI
|
||||
open http://localhost:9090
|
||||
|
||||
# 检查目标状态
|
||||
open http://localhost:9090/targets
|
||||
|
||||
# 检查告警规则
|
||||
open http://localhost:9090/alerts
|
||||
```
|
||||
|
||||
### 检查 Loki 配置
|
||||
|
||||
```bash
|
||||
# 检查 Loki 健康状态
|
||||
curl http://localhost:3100/ready
|
||||
|
||||
# 查询标签
|
||||
curl -s "http://localhost:3100/loki/api/v1/label/job/values" | jq
|
||||
```
|
||||
|
||||
### 检查 Grafana 配置
|
||||
|
||||
```bash
|
||||
# 访问 Grafana UI
|
||||
open http://localhost:3000
|
||||
|
||||
# 检查数据源
|
||||
curl -s -u admin:admin http://localhost:3000/api/datasources | jq
|
||||
|
||||
# 检查仪表板
|
||||
curl -s -u admin:admin http://localhost:3000/api/search | jq
|
||||
```
|
||||
|
||||
## 故障排查
|
||||
|
||||
### Prometheus 无法抓取指标
|
||||
|
||||
1. 检查 app 服务是否运行: `docker-compose ps app`
|
||||
2. 检查 metrics 端点: `curl http://localhost:8111/metrics`
|
||||
3. 查看 Prometheus 日志: `docker-compose logs prometheus`
|
||||
|
||||
### Loki 无法接收日志
|
||||
|
||||
1. 检查 Promtail 是否运行: `docker-compose ps promtail`
|
||||
2. 查看 Promtail 日志: `docker-compose logs promtail`
|
||||
3. 检查容器标签: `docker inspect <container> | grep Labels`
|
||||
|
||||
### Grafana 数据源未加载
|
||||
|
||||
1. 检查 provisioning 目录挂载: `docker-compose config | grep grafana -A 10`
|
||||
2. 查看 Grafana 日志: `docker-compose logs grafana`
|
||||
3. 手动重启 Grafana: `docker-compose restart grafana`
|
||||
|
||||
## 相关文档
|
||||
|
||||
- [Loki 集成文档](../docs/loki-integration.md) - 完整的 Loki 使用文档
|
||||
- [Loki 快速参考](../docs/loki-quick-reference.md) - 常用命令和查询
|
||||
- [Loki 实施总结](../docs/loki-implementation-summary.md) - 实施细节和架构说明
|
||||
- [Prometheus 官方文档](https://prometheus.io/docs/)
|
||||
- [Loki 官方文档](https://grafana.com/docs/loki/latest/)
|
||||
- [Grafana 官方文档](https://grafana.com/docs/grafana/latest/)
|
||||
|
||||
## 性能建议
|
||||
|
||||
### 日志量控制
|
||||
|
||||
- 调整日志级别为 WARNING 或 ERROR
|
||||
- 过滤掉不必要的日志(如健康检查)
|
||||
- 减少日志保留期
|
||||
|
||||
### 指标优化
|
||||
|
||||
- 增加抓取间隔(如 15s 或 30s)
|
||||
- 减少指标基数(避免高基数标签)
|
||||
- 定期清理旧数据
|
||||
|
||||
### 存储优化
|
||||
|
||||
- 监控磁盘使用: `docker-compose exec loki du -sh /loki`
|
||||
- 定期备份重要数据
|
||||
- 考虑使用对象存储(S3/OSS)作为后端
|
||||
|
||||
## 总结
|
||||
|
||||
本目录包含完整的监控和日志收集配置:
|
||||
|
||||
✅ **Prometheus** - 指标收集和告警
|
||||
✅ **Loki** - 日志存储和查询
|
||||
✅ **Promtail** - 日志采集
|
||||
✅ **Grafana** - 可视化和仪表板
|
||||
|
||||
所有配置都支持自动加载,无需手动配置。
|
||||
93
monitoring/alerts/rules.yaml
Normal file
93
monitoring/alerts/rules.yaml
Normal file
@@ -0,0 +1,93 @@
|
||||
groups:
|
||||
- name: functional_scaffold_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# 高错误率告警
|
||||
- alert: HighErrorRate
|
||||
expr: rate(http_requests_total{status="error"}[5m]) > 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "检测到高错误率"
|
||||
description: "端点 {{ $labels.endpoint }} 的错误率为 {{ $value }} 请求/秒"
|
||||
|
||||
# 高延迟告警
|
||||
- alert: HighLatency
|
||||
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "检测到高延迟"
|
||||
description: "端点 {{ $labels.endpoint }} 的 P95 延迟为 {{ $value }}s"
|
||||
|
||||
# 服务不可用告警
|
||||
- alert: ServiceDown
|
||||
expr: up{job="functional-scaffold"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "服务不可用"
|
||||
description: "FunctionalScaffold 服务已停止超过 1 分钟"
|
||||
|
||||
# 算法执行失败率告警
|
||||
- alert: HighAlgorithmFailureRate
|
||||
expr: rate(algorithm_executions_total{status="error"}[5m]) / rate(algorithm_executions_total[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "算法执行失败率过高"
|
||||
description: "算法 {{ $labels.algorithm }} 的失败率超过 10%"
|
||||
|
||||
# 算法执行延迟告警
|
||||
- alert: HighAlgorithmLatency
|
||||
expr: histogram_quantile(0.95, rate(algorithm_execution_duration_seconds_bucket[5m])) > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "算法执行延迟过高"
|
||||
description: "算法 {{ $labels.algorithm }} 的 P95 延迟为 {{ $value }}s"
|
||||
|
||||
# 异步任务失败率告警
|
||||
- alert: HighJobFailureRate
|
||||
expr: rate(jobs_completed_total{status="failed"}[5m]) / rate(jobs_completed_total[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "异步任务失败率过高"
|
||||
description: "算法 {{ $labels.algorithm }} 的异步任务失败率超过 10%"
|
||||
|
||||
# 异步任务执行延迟告警
|
||||
- alert: HighJobLatency
|
||||
expr: histogram_quantile(0.95, rate(job_execution_duration_seconds_bucket[5m])) > 60
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "异步任务执行延迟过高"
|
||||
description: "算法 {{ $labels.algorithm }} 的异步任务 P95 延迟为 {{ $value }}s"
|
||||
|
||||
# 异步任务积压告警
|
||||
- alert: JobBacklog
|
||||
expr: sum(rate(jobs_created_total[5m])) - sum(rate(jobs_completed_total[5m])) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "异步任务积压"
|
||||
description: "任务创建速率超过完成速率,可能存在积压"
|
||||
|
||||
# Webhook 发送失败率告警
|
||||
- alert: HighWebhookFailureRate
|
||||
expr: rate(webhook_deliveries_total{status="failed"}[5m]) / rate(webhook_deliveries_total[5m]) > 0.2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Webhook 发送失败率过高"
|
||||
description: "Webhook 发送失败率超过 20%"
|
||||
1936
monitoring/grafana/dashboards/dashboard.json
Normal file
1936
monitoring/grafana/dashboards/dashboard.json
Normal file
File diff suppressed because it is too large
Load Diff
292
monitoring/grafana/dashboards/logs-dashboard.json
Normal file
292
monitoring/grafana/dashboards/logs-dashboard.json
Normal file
@@ -0,0 +1,292 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "Loki"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"dedupStrategy": "none",
|
||||
"enableLogDetails": true,
|
||||
"prettifyLogMessage": false,
|
||||
"showCommonLabels": false,
|
||||
"showLabels": false,
|
||||
"showTime": true,
|
||||
"sortOrder": "Descending",
|
||||
"wrapLogMessage": false
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "Loki"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "{job=\"functional-scaffold-app\"} |= \"$request_id\"",
|
||||
"queryType": "range",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "日志流 (实时)",
|
||||
"type": "logs"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "Loki"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "Loki"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (level) (count_over_time({job=\"functional-scaffold-app\"} |= \"$request_id\" [1m]))",
|
||||
"queryType": "range",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "日志量趋势(按级别)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "Loki"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 10
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 50
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 10
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"values": false,
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": ""
|
||||
},
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"pluginVersion": "9.5.3",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "Loki"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (level) (count_over_time({job=\"functional-scaffold-app\"} |= \"$request_id\" [$__range]))",
|
||||
"queryType": "range",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "日志级别分布",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "Loki"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 18
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"dedupStrategy": "none",
|
||||
"enableLogDetails": true,
|
||||
"prettifyLogMessage": false,
|
||||
"showCommonLabels": false,
|
||||
"showLabels": false,
|
||||
"showTime": true,
|
||||
"sortOrder": "Descending",
|
||||
"wrapLogMessage": false
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "Loki"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "{job=\"functional-scaffold-app\", level=\"ERROR\"} |= \"$request_id\"",
|
||||
"queryType": "range",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "错误日志",
|
||||
"type": "logs"
|
||||
}
|
||||
],
|
||||
"refresh": "5s",
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": ["logs", "loki"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "",
|
||||
"value": ""
|
||||
},
|
||||
"hide": 0,
|
||||
"label": "Request ID",
|
||||
"name": "request_id",
|
||||
"options": [
|
||||
{
|
||||
"selected": true,
|
||||
"text": "",
|
||||
"value": ""
|
||||
}
|
||||
],
|
||||
"query": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "textbox"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-15m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "日志监控",
|
||||
"uid": "logs-dashboard",
|
||||
"version": 0,
|
||||
"weekStart": ""
|
||||
}
|
||||
13
monitoring/grafana/dashboards/provider.yaml
Normal file
13
monitoring/grafana/dashboards/provider.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'default'
|
||||
orgId: 1
|
||||
folder: ''
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /etc/grafana/provisioning/dashboards
|
||||
foldersFromFilesStructure: true
|
||||
11
monitoring/grafana/datasources/loki.yaml
Normal file
11
monitoring/grafana/datasources/loki.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Loki
|
||||
type: loki
|
||||
access: proxy
|
||||
url: http://loki:3100
|
||||
isDefault: false
|
||||
editable: false
|
||||
jsonData:
|
||||
maxLines: 1000
|
||||
11
monitoring/grafana/datasources/prometheus.yaml
Normal file
11
monitoring/grafana/datasources/prometheus.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: false
|
||||
jsonData:
|
||||
timeInterval: "5s"
|
||||
39
monitoring/loki.yaml
Normal file
39
monitoring/loki.yaml
Normal file
@@ -0,0 +1,39 @@
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
path_prefix: /loki
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /loki/chunks
|
||||
rules_directory: /loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
instance_addr: 127.0.0.1
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2020-10-24
|
||||
store: boltdb-shipper
|
||||
object_store: filesystem
|
||||
schema: v11
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
limits_config:
|
||||
retention_period: 168h # 7 天
|
||||
ingestion_rate_mb: 10
|
||||
ingestion_burst_size_mb: 20
|
||||
|
||||
compactor:
|
||||
working_directory: /loki/compactor
|
||||
shared_store: filesystem
|
||||
compaction_interval: 10m
|
||||
retention_enabled: true
|
||||
retention_delete_delay: 2h
|
||||
32
monitoring/prometheus.yml
Normal file
32
monitoring/prometheus.yml
Normal file
@@ -0,0 +1,32 @@
|
||||
# Prometheus 配置文件
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
external_labels:
|
||||
cluster: 'functional-scaffold'
|
||||
environment: 'development'
|
||||
|
||||
# 抓取配置
|
||||
scrape_configs:
|
||||
# 从应用实例抓取指标(Redis 统一指标方案)
|
||||
# 应用通过 /metrics 端点从 Redis 读取并导出 Prometheus 格式指标
|
||||
- job_name: 'functional-scaffold'
|
||||
static_configs:
|
||||
- targets: ['app:8000']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 10s
|
||||
|
||||
# Prometheus 自身监控
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
# 告警规则文件
|
||||
rule_files:
|
||||
- '/etc/prometheus/rules/*.yaml'
|
||||
|
||||
# Alertmanager 配置(可选)
|
||||
# alerting:
|
||||
# alertmanagers:
|
||||
# - static_configs:
|
||||
# - targets: ['alertmanager:9093']
|
||||
71
monitoring/promtail.yaml
Normal file
71
monitoring/promtail.yaml
Normal file
@@ -0,0 +1,71 @@
|
||||
server:
|
||||
http_listen_port: 9080
|
||||
grpc_listen_port: 0
|
||||
|
||||
positions:
|
||||
filename: /tmp/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://loki:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
|
||||
# 场景 1: Docker stdio 收集(主要方式)
|
||||
- job_name: docker
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
refresh_interval: 5s
|
||||
filters:
|
||||
- name: label
|
||||
values: ["logging=promtail"]
|
||||
relabel_configs:
|
||||
- source_labels: ['__meta_docker_container_name']
|
||||
regex: '/(.*)'
|
||||
target_label: 'container'
|
||||
- source_labels: ['__meta_docker_container_label_logging_jobname']
|
||||
target_label: 'job'
|
||||
- source_labels: ['__meta_docker_container_id']
|
||||
target_label: '__path__'
|
||||
replacement: '/var/lib/docker/containers/$1/*.log'
|
||||
pipeline_stages:
|
||||
- json:
|
||||
expressions:
|
||||
log: log
|
||||
stream: stream
|
||||
time: time
|
||||
- json:
|
||||
source: log
|
||||
expressions:
|
||||
level: levelname
|
||||
logger: name
|
||||
message: message
|
||||
request_id: request_id
|
||||
- labels:
|
||||
level:
|
||||
logger:
|
||||
- output:
|
||||
source: log
|
||||
|
||||
# 场景 2: Log 文件收集(备用)
|
||||
- job_name: app_files
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost
|
||||
labels:
|
||||
job: functional-scaffold-app-files
|
||||
__path__: /var/log/app/*.log
|
||||
pipeline_stages:
|
||||
- json:
|
||||
expressions:
|
||||
timestamp: asctime
|
||||
level: levelname
|
||||
logger: name
|
||||
message: message
|
||||
request_id: request_id
|
||||
- timestamp:
|
||||
source: timestamp
|
||||
format: "2006-01-02 15:04:05,000"
|
||||
- labels:
|
||||
level:
|
||||
logger:
|
||||
- output:
|
||||
source: message
|
||||
56
pyproject.toml
Normal file
56
pyproject.toml
Normal file
@@ -0,0 +1,56 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=65.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "functional-scaffold"
|
||||
version = "1.0.0"
|
||||
description = "算法工程化 Serverless 脚手架"
|
||||
requires-python = ">=3.9"
|
||||
authors = [
|
||||
{name = "FunctionalScaffold Team"}
|
||||
]
|
||||
readme = "README.md"
|
||||
|
||||
dependencies = [
|
||||
"fastapi>=0.109.0",
|
||||
"uvicorn[standard]>=0.27.0",
|
||||
"pydantic>=2.5.0",
|
||||
"pydantic-settings>=2.0.0",
|
||||
"prometheus-client>=0.19.0",
|
||||
"python-json-logger>=2.0.7",
|
||||
# Redis - 任务队列和指标存储
|
||||
"redis>=5.0.0",
|
||||
# YAML 配置解析
|
||||
"pyyaml>=6.0.0",
|
||||
# HTTP 客户端(Webhook 回调)
|
||||
"httpx>=0.27.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = [
|
||||
"pytest>=7.4.0",
|
||||
"pytest-asyncio>=0.21.0",
|
||||
"pytest-cov>=4.1.0",
|
||||
"black>=23.12.0",
|
||||
"ruff>=0.1.0",
|
||||
]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["src"]
|
||||
|
||||
[tool.black]
|
||||
line-length = 100
|
||||
target-version = ['py39']
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 100
|
||||
target-version = "py39"
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = ["tests"]
|
||||
python_files = ["test_*.py"]
|
||||
python_classes = ["Test*"]
|
||||
python_functions = ["test_*"]
|
||||
addopts = "-v --strict-markers"
|
||||
pythonpath = ["src"]
|
||||
6
requirements-dev.txt
Normal file
6
requirements-dev.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
pytest>=7.4.0
|
||||
pytest-asyncio>=0.21.0
|
||||
pytest-cov>=4.1.0
|
||||
httpx>=0.26.0
|
||||
black>=23.12.0
|
||||
ruff>=0.1.0
|
||||
16
requirements.txt
Normal file
16
requirements.txt
Normal file
@@ -0,0 +1,16 @@
|
||||
# 核心依赖 - 与 pyproject.toml 保持同步
|
||||
fastapi>=0.109.0
|
||||
uvicorn[standard]>=0.27.0
|
||||
pydantic>=2.5.0
|
||||
pydantic-settings>=2.0.0
|
||||
prometheus-client>=0.19.0
|
||||
python-json-logger>=2.0.7
|
||||
|
||||
# Redis - 任务队列和指标存储
|
||||
redis>=5.0.0
|
||||
|
||||
# YAML 配置解析
|
||||
pyyaml>=6.0.0
|
||||
|
||||
# HTTP 客户端(Webhook 回调)
|
||||
httpx>=0.27.0
|
||||
35
scripts/export_openapi.py
Normal file
35
scripts/export_openapi.py
Normal file
@@ -0,0 +1,35 @@
|
||||
#!/usr/bin/env python3
|
||||
"""导出 OpenAPI 规范到 JSON 文件"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# 添加 src 到路径
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from functional_scaffold.main import app
|
||||
|
||||
|
||||
def export_openapi():
    """Dump the FastAPI app's OpenAPI schema to docs/swagger/openapi.json.

    Prints a short summary (schema version, API title/version, endpoint
    count) after writing so CI logs show what was exported.
    """
    schema = app.openapi()

    # Make sure the destination directory exists before writing.
    out_dir = Path(__file__).parent.parent / "docs" / "swagger"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Write pretty-printed JSON; ensure_ascii=False keeps CJK text readable.
    out_path = out_dir / "openapi.json"
    with open(out_path, "w", encoding="utf-8") as fp:
        json.dump(schema, fp, indent=2, ensure_ascii=False)

    info = schema.get("info", {})
    print(f"OpenAPI schema exported to: {out_path}")
    print(f"Schema version: {schema.get('openapi')}")
    print(f"API title: {info.get('title')}")
    print(f"API version: {info.get('version')}")
    print(f"Endpoints: {len(schema.get('paths', {}))}")


if __name__ == "__main__":
    export_openapi()
|
||||
22
scripts/generate_traffic.sh
Executable file
22
scripts/generate_traffic.sh
Executable file
@@ -0,0 +1,22 @@
|
||||
#!/bin/bash
# Test-traffic generator: POSTs a random number to the local /invoke
# endpoint in an endless loop, with a random delay between requests.
# Stop with Ctrl+C.

echo "开始生成测试流量..."
echo "按 Ctrl+C 停止"

count=0
while true; do
    # Pick a random number in [1, 1000] as the request payload.
    number=$((RANDOM % 1000 + 1))

    # Fire the request; the response body is discarded.
    curl -s -X POST http://localhost:8111/invoke \
        -H "Content-Type: application/json" \
        -d "{\"number\": $number}" > /dev/null

    count=$((count + 1))
    echo "[$count] 已发送请求: number=$number"

    # Sleep a random 0.5-2 seconds so the traffic pattern isn't uniform.
    sleep $(awk -v min=0.5 -v max=2 'BEGIN{srand(); print min+rand()*(max-min)}')
done
|
||||
24
scripts/run_dev.sh
Executable file
24
scripts/run_dev.sh
Executable file
@@ -0,0 +1,24 @@
|
||||
#!/bin/bash
# Development startup script: bootstraps a virtualenv, installs the project
# in editable mode with dev extras, then runs uvicorn with auto-reload.

set -e

echo "Starting FunctionalScaffold in development mode..."

# Create the virtual environment on first run only.
if [ ! -d "venv" ]; then
    echo "Creating virtual environment..."
    python3 -m venv venv
fi

# Activate the virtual environment.
source venv/bin/activate

# Editable install so local code changes are picked up without reinstalling.
echo "Installing dependencies..."
pip install -e ".[dev]"

# Launch the API server; --reload restarts it on source changes.
echo "Starting server on http://localhost:8000"
echo "API docs available at http://localhost:8000/docs"
uvicorn functional_scaffold.main:app --reload --host 0.0.0.0 --port 8000
|
||||
28
scripts/run_tests.sh
Executable file
28
scripts/run_tests.sh
Executable file
@@ -0,0 +1,28 @@
|
||||
#!/bin/bash
# Test runner: lint with ruff, format-check with black (both non-fatal),
# then run pytest with terminal + HTML coverage reports.

set -e

echo "Running tests for FunctionalScaffold..."

# Activate the virtual environment if one exists.
if [ -d "venv" ]; then
    source venv/bin/activate
fi

# Code-quality checks; `|| true` keeps style failures from aborting the run.
echo "Running code quality checks..."
echo "- Checking with ruff..."
ruff check src/ tests/ || true

echo "- Checking formatting with black..."
black --check src/ tests/ || true

# Run the test suite with coverage (set -e makes test failures fatal here).
echo ""
echo "Running tests..."
pytest tests/ -v --cov=src/functional_scaffold --cov-report=term --cov-report=html

echo ""
echo "Tests completed!"
echo "Coverage report available at: htmlcov/index.html"
|
||||
104
scripts/test_concurrency.sh
Executable file
104
scripts/test_concurrency.sh
Executable file
@@ -0,0 +1,104 @@
|
||||
#!/bin/bash
# Async-job concurrency test: creates more jobs than the concurrency limit
# allows, samples the concurrency-status endpoint while they run, waits for
# all jobs to reach a terminal state, then prints a success/failure tally.
# Requires the service on localhost:8000 and `jq` on PATH.

set -e

BASE_URL="http://localhost:8000"

echo "=== 异步任务并发控制测试 ==="
echo ""

# 1. Abort early if the service is not reachable.
echo "1. 检查服务状态..."
if ! curl -s "${BASE_URL}/healthz" > /dev/null; then
    echo "❌ 服务未运行,请先启动服务"
    exit 1
fi
echo "✅ 服务正常运行"
echo ""

# 2. Baseline concurrency status before any jobs are queued.
echo "2. 查询初始并发状态..."
curl -s "${BASE_URL}/jobs/concurrency/status" | jq '.'
echo ""

# 3. Queue 15 jobs — more than the limit, so some must wait.
echo "3. 创建 15 个任务(测试并发限制)..."
JOB_IDS=()
for i in {1..15}; do
    # Larger numbers keep PrimeChecker busy longer per job.
    NUMBER=$((10000 + i * 1000))
    RESPONSE=$(curl -s -X POST "${BASE_URL}/jobs" \
        -H "Content-Type: application/json" \
        -d "{\"algorithm\": \"PrimeChecker\", \"params\": {\"number\": ${NUMBER}}}")

    JOB_ID=$(echo "$RESPONSE" | jq -r '.job_id')
    JOB_IDS+=("$JOB_ID")
    echo " 创建任务 ${i}/15: job_id=${JOB_ID}"

    # Brief pause to avoid hammering the API.
    sleep 0.1
done
echo ""

# 4. Sample the concurrency status while jobs are still executing.
echo "4. 查询并发状态(任务执行中)..."
for i in {1..5}; do
    echo " 第 ${i} 次查询:"
    STATUS=$(curl -s "${BASE_URL}/jobs/concurrency/status")
    echo " $(echo "$STATUS" | jq -c '.')"
    sleep 1
done
echo ""

# 5. Poll until every job reports completed or failed.
echo "5. 等待任务完成..."
COMPLETED=0
TOTAL=${#JOB_IDS[@]}

while [ $COMPLETED -lt $TOTAL ]; do
    COMPLETED=0
    for JOB_ID in "${JOB_IDS[@]}"; do
        STATUS=$(curl -s "${BASE_URL}/jobs/${JOB_ID}" | jq -r '.status')
        if [ "$STATUS" = "completed" ] || [ "$STATUS" = "failed" ]; then
            ((COMPLETED++))
        fi
    done

    echo " 进度: ${COMPLETED}/${TOTAL} 任务完成"

    # Show live concurrency status alongside the progress line.
    CONCURRENCY=$(curl -s "${BASE_URL}/jobs/concurrency/status")
    echo " 并发状态: $(echo "$CONCURRENCY" | jq -c '.')"

    if [ $COMPLETED -lt $TOTAL ]; then
        sleep 2
    fi
done
echo ""

# 6. Final concurrency status — expected to be back to idle.
echo "6. 查询最终并发状态..."
curl -s "${BASE_URL}/jobs/concurrency/status" | jq '.'
echo ""

# 7. Tally terminal states across all created jobs.
echo "7. 任务结果统计..."
COMPLETED_COUNT=0
FAILED_COUNT=0

for JOB_ID in "${JOB_IDS[@]}"; do
    STATUS=$(curl -s "${BASE_URL}/jobs/${JOB_ID}" | jq -r '.status')
    if [ "$STATUS" = "completed" ]; then
        ((COMPLETED_COUNT++))
    elif [ "$STATUS" = "failed" ]; then
        ((FAILED_COUNT++))
    fi
done

echo " 总任务数: ${TOTAL}"
echo " 成功: ${COMPLETED_COUNT}"
echo " 失败: ${FAILED_COUNT}"
echo ""

echo "=== 测试完成 ==="
|
||||
262
scripts/test_metrics.py
Executable file
262
scripts/test_metrics.py
Executable file
@@ -0,0 +1,262 @@
|
||||
#!/usr/bin/env python3
|
||||
"""指标方案测试脚本"""
|
||||
|
||||
import requests
|
||||
import time
|
||||
import sys
|
||||
from typing import Literal
|
||||
|
||||
MetricsBackend = Literal["pushgateway", "redis", "memory"]
|
||||
|
||||
|
||||
def test_pushgateway():
    """Exercise the Pushgateway metrics pipeline end to end.

    Checks the Pushgateway is up, drives traffic through the app, then
    verifies the expected series appear in both the Pushgateway and
    Prometheus. Returns False on any hard failure, True otherwise.
    """
    print("\n=== 测试 Pushgateway 方案 ===\n")

    # Step 1: verify the Pushgateway itself is reachable.
    try:
        resp = requests.get("http://localhost:9091/metrics", timeout=2)
        print(f"✓ Pushgateway 运行正常 (状态码: {resp.status_code})")
    except Exception as e:
        print(f"✗ Pushgateway 未运行: {e}")
        return False

    # Step 2: drive a handful of requests through the application.
    print("\n发送测试请求...")
    for i in range(5):
        try:
            resp = requests.post(
                "http://localhost:8000/invoke",
                json={"number": 17},
                timeout=5,
            )
            print(f" 请求 {i+1}: {resp.status_code}")
            time.sleep(0.5)
        except Exception as e:
            print(f" 请求 {i+1} 失败: {e}")

    # Step 3: give the app a moment to push its metrics.
    print("\n等待指标推送...")
    time.sleep(2)

    # Step 4: confirm the expected series showed up in the Pushgateway.
    try:
        resp = requests.get("http://localhost:9091/metrics", timeout=2)
        body = resp.text

        for metric_name in ("http_requests_total", "algorithm_executions_total"):
            if metric_name in body:
                print(f"✓ 找到 {metric_name} 指标")
                # Dump the non-comment sample lines for this series.
                for line in body.split("\n"):
                    if metric_name in line and not line.startswith("#"):
                        print(f" {line}")
            else:
                print(f"✗ 未找到 {metric_name} 指标")

    except Exception as e:
        print(f"✗ 获取指标失败: {e}")
        return False

    # Step 5: check that Prometheus scraped the series as well.
    print("\n检查 Prometheus...")
    try:
        resp = requests.get(
            "http://localhost:9090/api/v1/query",
            params={"query": "http_requests_total"},
            timeout=5,
        )
        data = resp.json()
        if data["status"] == "success" and data["data"]["result"]:
            print(f"✓ Prometheus 成功抓取指标,找到 {len(data['data']['result'])} 条记录")
            for result in data["data"]["result"][:3]:
                print(f" {result['metric']} = {result['value'][1]}")
        else:
            print("✗ Prometheus 未找到指标")
    except Exception as e:
        print(f"✗ Prometheus 查询失败: {e}")

    return True
|
||||
|
||||
|
||||
def test_redis() -> bool:
    """Exercise the Redis-backed metrics pipeline end to end.

    Steps: verify Redis connectivity, clear stale metric keys, drive test
    traffic through the app, confirm counters appear in Redis, and finally
    check (best-effort) that the Redis exporter re-exposes them.

    Returns:
        bool: False on a hard failure (redis library missing, Redis
        unreachable, Redis inspection failed); True otherwise. The exporter
        check is informational only and does not fail the test.
    """
    # 1. Verify the Redis server is reachable before doing anything else.
    try:
        import redis

        client = redis.Redis(host="localhost", port=6379, db=0, decode_responses=True)
        client.ping()
        print("✓ Redis 运行正常")
    except ImportError:
        print("✗ Redis 库未安装,请运行: pip install redis")
        return False
    except Exception as e:
        print(f"✗ Redis 未运行: {e}")
        return False

    # 2. Drop leftover metric keys so counts reflect only this run.
    print("\n清空旧数据...")
    try:
        keys = client.keys("metrics:*")
        if keys:
            client.delete(*keys)
            print(f" 删除了 {len(keys)} 个键")
    except Exception as e:
        print(f" 清空失败: {e}")

    # 3. Generate traffic so the application records metrics into Redis.
    print("\n发送测试请求...")
    for i in range(5):
        try:
            response = requests.post(
                "http://localhost:8000/invoke",
                json={"number": 17},
                timeout=5,
            )
            print(f" 请求 {i+1}: {response.status_code}")
            # Pace the requests so the resulting counters are easy to eyeball.
            time.sleep(0.5)
        except Exception as e:
            print(f" 请求 {i+1} 失败: {e}")

    # 4. Inspect the Redis hashes the app writes its counters into.
    print("\n检查 Redis 指标...")
    try:
        # Request counter hash: field -> count. Fields ending in ":timestamp"
        # are bookkeeping entries and are skipped when printing.
        counter_data = client.hgetall("metrics:request_counter")
        if counter_data:
            print(f"✓ 找到 {len(counter_data)} 个请求计数器指标")
            for key, value in list(counter_data.items())[:5]:
                if not key.endswith(":timestamp"):
                    print(f" {key} = {value}")
        else:
            print("✗ 未找到请求计数器指标")

        # Algorithm execution counter hash, same layout as above.
        algo_data = client.hgetall("metrics:algorithm_counter")
        if algo_data:
            print(f"✓ 找到 {len(algo_data)} 个算法计数器指标")
            for key, value in list(algo_data.items())[:5]:
                if not key.endswith(":timestamp"):
                    print(f" {key} = {value}")
        else:
            print("✗ 未找到算法计数器指标")

    except Exception as e:
        print(f"✗ 检查 Redis 失败: {e}")
        return False

    # 5. Best-effort: confirm the Redis exporter re-exposes the counters in
    # Prometheus text format. Failure here is reported but not fatal.
    print("\n检查 Redis Exporter...")
    try:
        response = requests.get("http://localhost:8001/metrics", timeout=2)
        metrics = response.text

        if "http_requests_total" in metrics:
            print("✓ Exporter 成功导出 http_requests_total")
            for line in metrics.split("\n"):
                if "http_requests_total" in line and not line.startswith("#"):
                    print(f" {line}")
                    break
        else:
            print("✗ Exporter 未导出 http_requests_total")

    except Exception as e:
        print(f"✗ Redis Exporter 未运行: {e}")

    return True
|
||||
|
||||
|
||||
def test_memory():
    """测试原有的内存方案"""
    print("\n=== 测试内存方案(原有方案)===\n")

    # Drive a handful of requests through the app so it records counters
    # in its in-process (per-instance) metrics store.
    print("发送测试请求...")
    for i in range(5):
        try:
            response = requests.post(
                "http://localhost:8000/invoke",
                json={"number": 17},
                timeout=5,
            )
        except Exception as e:
            print(f" 请求 {i+1} 失败: {e}")
        else:
            print(f" 请求 {i+1}: {response.status_code}")
            time.sleep(0.5)

    # Scrape the application's own /metrics endpoint and look for the counter.
    print("\n检查应用 /metrics 端点...")
    try:
        metrics = requests.get("http://localhost:8000/metrics", timeout=2).text
    except Exception as e:
        print(f"✗ 获取指标失败: {e}")
        return False

    if "http_requests_total" in metrics:
        print("✓ 找到 http_requests_total 指标")
        # Show only the first concrete sample line (skip '#' comment lines).
        line = next(
            (
                candidate
                for candidate in metrics.split("\n")
                if "http_requests_total" in candidate and not candidate.startswith("#")
            ),
            None,
        )
        if line is not None:
            print(f" {line}")
    else:
        print("✗ 未找到指标")

    print("\n⚠️ 注意:内存方案在多实例部署时,每个实例的指标是独立的")
    return True
|
||||
|
||||
|
||||
def main():
    """CLI entry point: choose a metrics backend and run its test.

    The backend comes from ``argv[1]`` when given, otherwise from an
    interactive prompt. Exits with status 0 on a passing test, 1 on a
    failing test or an unknown backend, so the script is usable from CI.
    """
    print("=" * 60)
    print("FunctionalScaffold 指标方案测试")
    print("=" * 60)

    if len(sys.argv) > 1:
        backend = sys.argv[1]
    else:
        print("\n请选择要测试的方案:")
        print("1. Pushgateway(推荐)")
        print("2. Redis + Exporter")
        print("3. Memory(原有方案)")
        choice = input("\n输入选项 (1/2/3): ").strip()

        backend_map = {"1": "pushgateway", "2": "redis", "3": "memory"}
        backend = backend_map.get(choice, "pushgateway")

    print(f"\n选择的方案: {backend}")

    # Dispatch to the selected backend's test routine.
    if backend == "pushgateway":
        success = test_pushgateway()
    elif backend == "redis":
        success = test_redis()
    elif backend == "memory":
        success = test_memory()
    else:
        # Only reachable via an arbitrary argv[1]; the interactive path
        # always maps to a known backend.
        print(f"未知的方案: {backend}")
        sys.exit(1)

    # Report the outcome.
    print("\n" + "=" * 60)
    if success:
        print("✓ 测试通过")
    else:
        print("✗ 测试失败")
    print("=" * 60)

    # Fix: the script previously always exited 0, even when the test
    # failed. Propagate the result so shell callers / CI can detect it.
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
|
||||
39
scripts/test_metrics_filtering.sh
Executable file
39
scripts/test_metrics_filtering.sh
Executable file
@@ -0,0 +1,39 @@
|
||||
#!/bin/bash
# Test metric filtering and path normalization.
#
# Exercises endpoints that the metrics middleware should skip
# (/healthz, /readyz, /metrics), a normal endpoint (/invoke), and
# parameterized job paths that should be normalized to /jobs/{job_id},
# then dumps http_requests_total so the labels can be inspected manually.

echo "=== 测试指标过滤和路径规范化 ==="
echo ""

# The service is assumed to already be running at this address.
BASE_URL="http://localhost:8000"

echo "1. 访问健康检查端点(应该被跳过,不记录指标)"
curl -s "$BASE_URL/healthz" > /dev/null
curl -s "$BASE_URL/readyz" > /dev/null
echo " ✓ 已访问 /healthz 和 /readyz"
echo ""

echo "2. 访问普通端点(应该记录指标)"
curl -s -X POST "$BASE_URL/invoke" \
  -H "Content-Type: application/json" \
  -d '{"number": 17}' > /dev/null
echo " ✓ 已访问 POST /invoke"
echo ""

echo "3. 访问任务端点(应该规范化为 /jobs/{job_id})"
curl -s "$BASE_URL/jobs/a1b2c3d4e5f6" > /dev/null
curl -s "$BASE_URL/jobs/xyz123456789" > /dev/null
echo " ✓ 已访问 GET /jobs/a1b2c3d4e5f6 和 GET /jobs/xyz123456789"
echo ""

echo "4. 查看指标输出"
echo " 查找 http_requests_total 指标:"
# Show only concrete labelled samples (drop Prometheus '#' comment lines).
curl -s "$BASE_URL/metrics" | grep 'http_requests_total{' | grep -v '#'
echo ""
echo " 预期结果:"
echo " - 应该看到 endpoint=\"/invoke\" 的记录"
echo " - 应该看到 endpoint=\"/jobs/{job_id}\" 的记录(而不是具体的 job_id)"
echo " - 不应该看到 endpoint=\"/healthz\" 或 endpoint=\"/readyz\" 的记录"
echo " - 不应该看到 endpoint=\"/metrics\" 的记录"
echo ""
echo "=== 测试完成 ==="
|
||||
69
scripts/test_request_id_filter.sh
Executable file
69
scripts/test_request_id_filter.sh
Executable file
@@ -0,0 +1,69 @@
|
||||
#!/bin/bash
# Grafana Request ID filter test.
#
# Fires one request at the app, extracts its request_id, then queries Loki
# for log lines containing that ID and prints them, so the same ID can be
# tried interactively in the Grafana dashboard.

set -e

echo "========================================="
echo "Grafana Request ID 过滤功能测试"
echo "========================================="
echo ""

# ANSI color codes.
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo "1. 生成测试请求..."
echo "-------------------"
RESPONSE=$(curl -X POST http://localhost:8111/invoke \
  -H "Content-Type: application/json" \
  -d '{"number": 43}' \
  -s)

REQUEST_ID=$(echo "$RESPONSE" | jq -r '.request_id')
echo -e "${GREEN}✓ 请求成功${NC}"
echo -e "${BLUE}Request ID: $REQUEST_ID${NC}"

echo ""
echo "2. 等待日志收集 (5秒)..."
sleep 5

echo ""
echo "3. 测试 Loki 过滤..."
echo "-------------------"

# Count log lines that contain this specific request_id.
LOG_COUNT=$(curl -G -s "http://localhost:3100/loki/api/v1/query_range" \
  --data-urlencode "query={job=\"functional-scaffold-app\"} |= \"$REQUEST_ID\"" \
  | jq '.data.result[0].values | length')

if [ "$LOG_COUNT" -gt 0 ]; then
  echo -e "${GREEN}✓ 找到 $LOG_COUNT 条日志${NC}"
else
  echo -e "${YELLOW}⚠ 没有找到日志,可能需要等待更长时间${NC}"
fi

echo ""
echo "4. 显示日志内容..."
echo "-------------------"
# Each entry in .values is a [timestamp, line] pair; take the line (index 1).
# Fix: the original filter '.values[].[-1]' is invalid jq syntax.
curl -G -s "http://localhost:3100/loki/api/v1/query_range" \
  --data-urlencode "query={job=\"functional-scaffold-app\"} |= \"$REQUEST_ID\"" \
  | jq -r '.data.result[0].values[][1]' \
  | jq -r '.message' \
  | nl

echo ""
echo "========================================="
echo "测试完成!"
echo "========================================="
echo ""
echo "在 Grafana 中测试:"
echo " 1. 访问: http://localhost:3000"
echo " 2. 进入 '日志监控' 仪表板"
echo " 3. 在顶部 'Request ID' 输入框中输入:"
echo -e " ${BLUE}$REQUEST_ID${NC}"
echo " 4. 按回车,查看过滤后的日志"
echo ""
echo "清空 Request ID 输入框可以查看所有日志"
echo ""
|
||||
100
scripts/verify_loki.sh
Executable file
100
scripts/verify_loki.sh
Executable file
@@ -0,0 +1,100 @@
|
||||
#!/bin/bash
# Loki integration verification script.
#
# Checks that Loki and Promtail are healthy, generates a log line through
# the app, verifies it can be queried back from Loki, and confirms the
# Grafana datasources (Loki + Prometheus) are provisioned.

set -e

echo "========================================="
echo "Loki 日志收集系统验证"
echo "========================================="
echo ""

# ANSI color codes.
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Show the state of the docker-compose stack for context.
echo "1. 检查服务状态..."
echo "-------------------"
docker-compose ps

echo ""
echo "2. 检查 Loki 健康状态..."
echo "-------------------"
if curl -s http://localhost:3100/ready | grep -q "ready"; then
  echo -e "${GREEN}✓ Loki 服务正常${NC}"
else
  echo -e "${RED}✗ Loki 服务异常${NC}"
  exit 1
fi

echo ""
echo "3. 检查 Promtail 健康状态..."
echo "-------------------"
if curl -s http://localhost:9080/ready | grep -q "ready"; then
  echo -e "${GREEN}✓ Promtail 服务正常${NC}"
else
  echo -e "${RED}✗ Promtail 服务异常${NC}"
  exit 1
fi

echo ""
echo "4. 生成测试日志..."
echo "-------------------"
curl -X POST http://localhost:8111/invoke \
  -H "Content-Type: application/json" \
  -d '{"algorithm": "PrimeChecker", "params": {"number": 17}}' \
  -s -o /dev/null -w "HTTP Status: %{http_code}\n"

echo ""
echo "5. 等待日志收集 (5秒)..."
sleep 5

echo ""
echo "6. 查询 Loki 日志..."
echo "-------------------"
LOGS=$(curl -G -s "http://localhost:3100/loki/api/v1/query_range" \
  --data-urlencode 'query={job="functional-scaffold-app"}' \
  --data-urlencode 'limit=5')

# A non-empty .data.result array means at least one stream matched.
if echo "$LOGS" | jq -e '.data.result | length > 0' > /dev/null 2>&1; then
  echo -e "${GREEN}✓ 成功查询到日志${NC}"
  echo ""
  echo "最近的日志条目:"
  echo "$LOGS" | jq -r '.data.result[0].values[-1][1]' | head -3
else
  echo -e "${YELLOW}⚠ 暂时没有查询到日志,可能需要等待更长时间${NC}"
fi

echo ""
echo "7. 检查 Grafana 数据源..."
echo "-------------------"
# Default admin/admin credentials; adjust if Grafana was reconfigured.
DATASOURCES=$(curl -s -u admin:admin http://localhost:3000/api/datasources)
if echo "$DATASOURCES" | jq -e '.[] | select(.name == "Loki")' > /dev/null 2>&1; then
  echo -e "${GREEN}✓ Loki 数据源已配置${NC}"
else
  echo -e "${RED}✗ Loki 数据源未配置${NC}"
fi

if echo "$DATASOURCES" | jq -e '.[] | select(.name == "Prometheus")' > /dev/null 2>&1; then
  echo -e "${GREEN}✓ Prometheus 数据源已配置${NC}"
else
  echo -e "${RED}✗ Prometheus 数据源未配置${NC}"
fi

echo ""
echo "========================================="
echo "验证完成!"
echo "========================================="
echo ""
echo "访问地址:"
echo " - Grafana: http://localhost:3000 (admin/admin)"
echo " - Loki: http://localhost:3100"
echo " - Promtail: http://localhost:9080"
echo ""
echo "查看日志:"
echo " 1. 访问 Grafana Explore: http://localhost:3000/explore"
echo " 2. 选择 Loki 数据源"
echo " 3. 输入查询: {job=\"functional-scaffold-app\"}"
echo ""
|
||||
3
src/functional_scaffold/__init__.py
Normal file
3
src/functional_scaffold/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""FunctionalScaffold - 算法工程化 Serverless 脚手架"""
|
||||
|
||||
__version__ = "1.0.0"
|
||||
6
src/functional_scaffold/algorithms/__init__.py
Normal file
6
src/functional_scaffold/algorithms/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""算法模块"""
|
||||
|
||||
from .base import BaseAlgorithm
|
||||
from .prime_checker import PrimeChecker
|
||||
|
||||
__all__ = ["BaseAlgorithm", "PrimeChecker"]
|
||||
75
src/functional_scaffold/algorithms/base.py
Normal file
75
src/functional_scaffold/algorithms/base.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""算法基类"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict
|
||||
import time
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseAlgorithm(ABC):
    """Abstract base class that every algorithm implementation must inherit.

    Subclasses implement :meth:`process`; callers invoke :meth:`execute`,
    which wraps the algorithm with timing, logging, metrics and a uniform
    success/error result envelope.
    """

    def __init__(self):
        # Identifiers reported back in execution metadata.
        self.name = self.__class__.__name__
        self.version = "1.0.0"

    @abstractmethod
    def process(self, *args, **kwargs) -> Dict[str, Any]:
        """
        Core algorithm logic; subclasses must implement this.

        Returns:
            Dict[str, Any]: the algorithm's result payload.
        """
        pass

    def execute(self, *args, **kwargs) -> Dict[str, Any]:
        """
        Run the algorithm with instrumentation and error handling.

        Never raises: any exception from :meth:`process` is caught and
        converted into a ``{"success": False, ...}`` envelope.

        Returns:
            Dict[str, Any]: on success ``{"success": True, "result": ...,
            "metadata": ...}``; on failure ``{"success": False, "error": ...,
            "metadata": ...}``. Metadata carries algorithm name, version and
            elapsed_time in seconds.
        """
        # Imported lazily — presumably to avoid a circular import between
        # the algorithms package and core.metrics_unified; TODO confirm.
        from ..core.metrics_unified import incr, observe

        start_time = time.time()
        status = "success"  # overwritten on failure; consumed in `finally`

        try:
            logger.info(f"Starting algorithm: {self.name}")
            result = self.process(*args, **kwargs)
            elapsed_time = time.time() - start_time

            logger.info(f"Algorithm {self.name} completed successfully in {elapsed_time:.3f}s")

            return {
                "success": True,
                "result": result,
                "metadata": {
                    "algorithm": self.name,
                    "version": self.version,
                    "elapsed_time": elapsed_time,
                },
            }

        except Exception as e:
            status = "error"
            elapsed_time = time.time() - start_time
            logger.error(f"Algorithm {self.name} failed: {str(e)}", exc_info=True)

            return {
                "success": False,
                "error": str(e),
                "metadata": {
                    "algorithm": self.name,
                    "version": self.version,
                    "elapsed_time": elapsed_time,
                },
            }
        finally:
            # Record execution metrics on both paths. The elapsed time is
            # recomputed here so the metric covers everything up to this
            # point (it may differ slightly from the metadata value).
            elapsed_time = time.time() - start_time
            incr("algorithm_executions_total", {"algorithm": self.name, "status": status})
            observe("algorithm_execution_duration_seconds", {"algorithm": self.name}, elapsed_time)
|
||||
97
src/functional_scaffold/algorithms/prime_checker.py
Normal file
97
src/functional_scaffold/algorithms/prime_checker.py
Normal file
@@ -0,0 +1,97 @@
|
||||
"""质数判断算法"""
|
||||
|
||||
from typing import Dict, Any, List
|
||||
from .base import BaseAlgorithm
|
||||
from ..core.metrics_unified import incr
|
||||
|
||||
|
||||
class PrimeChecker(BaseAlgorithm):
    """
    Primality-check algorithm.

    Uses trial division to decide whether an integer is prime, and returns
    the list of proper factors when it is not.
    """

    def process(self, number: int) -> Dict[str, Any]:
        """
        Decide whether *number* is prime.

        Args:
            number: the integer to test.

        Returns:
            Dict[str, Any]: result payload with keys
            - number: the input
            - is_prime: whether it is prime
            - factors: factors excluding 1 and number (empty if prime)
            - reason: explanation (only for numbers < 2)
            - algorithm: name of the method used

        Raises:
            ValueError: if the input is not an integer.
        """
        # NOTE(review): isinstance(True, int) is True in Python, so booleans
        # slip through this guard; confirm whether that is acceptable.
        if not isinstance(number, int):
            incr('prime_check', {"status": "invalid_input"})
            raise ValueError(f"Input must be an integer, got {type(number).__name__}")

        # Numbers below 2 are not prime by definition.
        if number < 2:
            incr('prime_check', {"status": "number_little_two"})
            return {
                "number": number,
                "is_prime": False,
                "reason": "Numbers less than 2 are not prime",
                "factors": [],
                "algorithm": "trial_division",
            }

        is_prime = self._is_prime(number)

        # Only compute the factor list for composite numbers.
        factors = [] if is_prime else self._get_factors(number)
        incr('prime_check', {"status": "success"})
        return {
            "number": number,
            "is_prime": is_prime,
            "factors": factors,
            "algorithm": "trial_division",
        }

    def _is_prime(self, n: int) -> bool:
        """
        Trial-division primality test for n >= 2.

        Args:
            n: the integer to test (callers guarantee n >= 2).

        Returns:
            bool: True if n is prime.
        """
        if n == 2:
            return True
        if n % 2 == 0:
            return False

        # Only odd candidates up to sqrt(n) need checking.
        for i in range(3, int(n**0.5) + 1, 2):
            if n % i == 0:
                return False

        return True

    def _get_factors(self, n: int) -> List[int]:
        """
        Return all factors of *n* excluding 1 and n itself, in ascending order.

        Improvement: collects divisor pairs (i, n // i) while scanning only up
        to sqrt(n) — O(sqrt n) instead of the previous O(n) full scan — while
        producing exactly the same sorted list.

        Args:
            n: the integer to factor (callers guarantee n >= 2).

        Returns:
            List[int]: ascending proper factors; empty for primes.
        """
        small = []
        large = []
        i = 2
        while i * i <= n:
            if n % i == 0:
                small.append(i)
                partner = n // i
                if partner != i:  # avoid duplicating the square root
                    large.append(partner)
            i += 1
        # small is ascending, large is descending — reverse and concatenate.
        return small + large[::-1]
|
||||
6
src/functional_scaffold/api/__init__.py
Normal file
6
src/functional_scaffold/api/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""API 模块"""
|
||||
|
||||
from .routes import router
|
||||
from .models import InvokeRequest, InvokeResponse, HealthResponse, ErrorResponse
|
||||
|
||||
__all__ = ["router", "InvokeRequest", "InvokeResponse", "HealthResponse", "ErrorResponse"]
|
||||
26
src/functional_scaffold/api/dependencies.py
Normal file
26
src/functional_scaffold/api/dependencies.py
Normal file
@@ -0,0 +1,26 @@
|
||||
"""API 依赖注入"""
|
||||
|
||||
from fastapi import Header, HTTPException
|
||||
from typing import Optional
|
||||
from ..core.tracing import set_request_id, generate_request_id, get_request_id as get_current_request_id
|
||||
|
||||
|
||||
async def get_request_id(x_request_id: Optional[str] = Header(None)) -> str:
    """
    Resolve the request ID for the current request.

    Resolution order:
    1. The ContextVar previously populated by middleware, if any.
    2. The incoming ``X-Request-ID`` header.
    3. A freshly generated ID.

    The winning value is stored into the ContextVar so downstream code
    (logging, tracing) sees a consistent ID for the whole request.

    Args:
        x_request_id: value of the ``X-Request-ID`` request header, if sent.

    Returns:
        str: the request ID.
    """
    # Middleware may already have set the ID — reuse it for consistency.
    existing_request_id = get_current_request_id()
    if existing_request_id:
        return existing_request_id

    # Otherwise prefer the client-supplied header, else generate a new one.
    request_id = x_request_id or generate_request_id()
    set_request_id(request_id)
    return request_id
|
||||
172
src/functional_scaffold/api/models.py
Normal file
172
src/functional_scaffold/api/models.py
Normal file
@@ -0,0 +1,172 @@
|
||||
"""API 数据模型"""
|
||||
|
||||
from enum import Enum
|
||||
from pydantic import BaseModel, Field, ConfigDict
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
|
||||
class InvokeRequest(BaseModel):
    """Request body for the synchronous /invoke endpoint."""

    model_config = ConfigDict(json_schema_extra={"example": {"number": 17}})

    # The integer whose primality is to be checked.
    number: int = Field(..., description="待判断的整数")
|
||||
|
||||
|
||||
class InvokeResponse(BaseModel):
    """Response body for a successful synchronous /invoke call."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "request_id": "550e8400-e29b-41d4-a716-446655440000",
                "status": "success",
                "result": {
                    "number": 17,
                    "is_prime": True,
                    "factors": [],
                    "algorithm": "trial_division",
                },
                "metadata": {
                    "algorithm": "PrimeChecker",
                    "version": "1.0.0",
                    "elapsed_time": 0.001,
                },
            }
        }
    )

    request_id: str = Field(..., description="请求唯一标识")
    status: str = Field(..., description="处理状态")
    result: Dict[str, Any] = Field(..., description="算法执行结果")
    metadata: Dict[str, Any] = Field(..., description="元数据信息")
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
    """Response body for the /healthz liveness probe."""

    status: str = Field(..., description="健康状态")
    timestamp: float = Field(..., description="时间戳")
|
||||
|
||||
|
||||
class ReadinessResponse(BaseModel):
    """Response body for the /readyz readiness probe."""

    status: str = Field(..., description="就绪状态")
    timestamp: float = Field(..., description="时间戳")
    # Per-subsystem check outcomes, keyed by check name.
    checks: Optional[Dict[str, bool]] = Field(None, description="各项检查结果")
|
||||
|
||||
|
||||
class ErrorResponse(BaseModel):
    """Standard error payload returned by all API endpoints."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "error": "VALIDATION_ERROR",
                "message": "number must be an integer",
                "details": {"field": "number", "value": "abc"},
                "request_id": "550e8400-e29b-41d4-a716-446655440000",
            }
        }
    )

    error: str = Field(..., description="错误代码")
    message: str = Field(..., description="错误消息")
    details: Optional[Dict[str, Any]] = Field(None, description="错误详情")
    request_id: Optional[str] = Field(None, description="请求ID")
|
||||
|
||||
|
||||
class JobStatus(str, Enum):
    """Lifecycle states of an asynchronous job."""

    PENDING = "pending"  # queued, waiting to run
    RUNNING = "running"  # currently executing
    COMPLETED = "completed"  # finished successfully
    FAILED = "failed"  # execution failed
|
||||
|
||||
|
||||
class JobRequest(BaseModel):
    """Request body for creating an asynchronous job via POST /jobs."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "algorithm": "PrimeChecker",
                "params": {"number": 17},
                "webhook": "https://example.com/callback",
            }
        }
    )

    algorithm: str = Field(..., description="算法名称")
    params: Dict[str, Any] = Field(..., description="算法参数")
    # Optional callback URL invoked when the job completes.
    webhook: Optional[str] = Field(None, description="回调 URL")
|
||||
|
||||
|
||||
class JobCreateResponse(BaseModel):
    """Response body returned (202) when an asynchronous job is created."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "job_id": "a1b2c3d4e5f6",
                "status": "pending",
                "message": "任务已创建",
                "created_at": "2026-02-02T10:00:00Z",
            }
        }
    )

    job_id: str = Field(..., description="任务唯一标识")
    status: JobStatus = Field(..., description="任务状态")
    message: str = Field(..., description="状态消息")
    created_at: str = Field(..., description="创建时间(ISO 8601)")
|
||||
|
||||
|
||||
class JobStatusResponse(BaseModel):
    """Response body for GET /jobs/{job_id} status queries."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "job_id": "a1b2c3d4e5f6",
                "status": "completed",
                "algorithm": "PrimeChecker",
                "created_at": "2026-02-02T10:00:00Z",
                "started_at": "2026-02-02T10:00:01Z",
                "completed_at": "2026-02-02T10:00:02Z",
                "result": {"number": 17, "is_prime": True},
                "error": None,
                "metadata": {"elapsed_time": 0.001},
            }
        }
    )

    job_id: str = Field(..., description="任务唯一标识")
    status: JobStatus = Field(..., description="任务状态")
    algorithm: str = Field(..., description="算法名称")
    created_at: str = Field(..., description="创建时间(ISO 8601)")
    started_at: Optional[str] = Field(None, description="开始执行时间(ISO 8601)")
    completed_at: Optional[str] = Field(None, description="完成时间(ISO 8601)")
    # result is populated only for completed jobs, error only for failed ones.
    result: Optional[Dict[str, Any]] = Field(None, description="执行结果(仅完成时返回)")
    error: Optional[str] = Field(None, description="错误信息(仅失败时返回)")
    metadata: Optional[Dict[str, Any]] = Field(None, description="元数据信息")
|
||||
|
||||
|
||||
class ConcurrencyStatusResponse(BaseModel):
    """Response body describing the job manager's concurrency state."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "max_concurrent": 10,
                "available_slots": 7,
                "running_jobs": 3,
            }
        }
    )

    max_concurrent: int = Field(..., description="最大并发任务数")
    available_slots: int = Field(..., description="当前可用槽位数")
    running_jobs: int = Field(..., description="当前运行中的任务数")
|
||||
348
src/functional_scaffold/api/routes.py
Normal file
348
src/functional_scaffold/api/routes.py
Normal file
@@ -0,0 +1,348 @@
|
||||
"""API 路由"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Depends, status
|
||||
import time
|
||||
import logging
|
||||
|
||||
from .models import (
|
||||
InvokeRequest,
|
||||
InvokeResponse,
|
||||
HealthResponse,
|
||||
ReadinessResponse,
|
||||
ErrorResponse,
|
||||
JobRequest,
|
||||
JobCreateResponse,
|
||||
JobStatusResponse,
|
||||
JobStatus,
|
||||
ConcurrencyStatusResponse,
|
||||
)
|
||||
from .dependencies import get_request_id
|
||||
from ..algorithms.prime_checker import PrimeChecker
|
||||
from ..core.errors import ValidationError, AlgorithmError
|
||||
from ..core.job_manager import get_job_manager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/invoke",
    response_model=InvokeResponse,
    status_code=status.HTTP_200_OK,
    summary="同步调用算法",
    description="同步调用质数判断算法,立即返回结果",
    responses={
        200: {"description": "成功", "model": InvokeResponse},
        400: {"description": "请求参数错误", "model": ErrorResponse},
        500: {"description": "服务器内部错误", "model": ErrorResponse},
    },
)
async def invoke_algorithm(
    request: InvokeRequest,
    request_id: str = Depends(get_request_id),
):
    """
    Synchronously run the prime-checker algorithm and return the result.

    - **number**: the integer to test for primality
    """
    try:
        logger.info(f"Processing request {request_id} with number={request.number}")

        # A fresh, stateless checker instance per request.
        checker = PrimeChecker()
        execution_result = checker.execute(request.number)

        # execute() never raises — failures come back as success=False,
        # so re-surface them here as an AlgorithmError (-> 500 below).
        if not execution_result["success"]:
            raise AlgorithmError(
                execution_result.get("error", "Algorithm execution failed"),
                details=execution_result.get("metadata", {}),
            )

        return InvokeResponse(
            request_id=request_id,
            status="success",
            result=execution_result["result"],
            metadata=execution_result["metadata"],
        )

    except ValidationError as e:
        # Client-side input problem -> 400 with the structured error payload.
        logger.warning(f"Validation error for request {request_id}: {e.message}")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=e.to_dict(),
        )

    except AlgorithmError as e:
        logger.error(f"Algorithm error for request {request_id}: {e.message}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=e.to_dict(),
        )

    except Exception as e:
        # Catch-all boundary: log with traceback, return a generic 500.
        logger.error(f"Unexpected error for request {request_id}: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail={
                "error": "INTERNAL_ERROR",
                "message": str(e),
                "request_id": request_id,
            },
        )
|
||||
|
||||
|
||||
@router.get(
    "/healthz",
    response_model=HealthResponse,
    summary="健康检查",
    description="检查服务是否存活",
)
async def health_check():
    """Liveness probe endpoint.

    Always reports the service as healthy with the current timestamp;
    intended for use as a Kubernetes liveness probe.
    """
    now = time.time()
    return HealthResponse(status="healthy", timestamp=now)
|
||||
|
||||
|
||||
@router.get(
    "/readyz",
    response_model=ReadinessResponse,
    summary="就绪检查",
    description="检查服务是否就绪",
)
async def readiness_check():
    """Readiness probe endpoint.

    Aggregates individual subsystem checks; the service is "ready" only
    when every check passes. Intended for use as a readiness probe.
    """
    # Additional checks (database connectivity, external services, ...)
    # can be added to this mapping.
    checks = {"algorithm": True}

    ready = all(checks.values())

    return ReadinessResponse(
        status="ready" if ready else "not_ready",
        timestamp=time.time(),
        checks=checks,
    )
|
||||
|
||||
|
||||
@router.post(
    "/jobs",
    response_model=JobCreateResponse,
    status_code=status.HTTP_202_ACCEPTED,
    summary="创建异步任务",
    description="创建异步任务,立即返回任务 ID,任务在后台执行",
    responses={
        202: {"description": "任务已创建", "model": JobCreateResponse},
        400: {"description": "请求参数错误", "model": ErrorResponse},
        404: {"description": "算法不存在", "model": ErrorResponse},
        503: {"description": "服务不可用", "model": ErrorResponse},
    },
)
async def create_job(
    request: JobRequest,
    request_id: str = Depends(get_request_id),
):
    """
    Create an asynchronous job; returns 202 with a job ID immediately.

    - **algorithm**: algorithm name (e.g. PrimeChecker)
    - **params**: algorithm parameters
    - **webhook**: optional callback URL invoked on completion
    """
    try:
        job_manager = await get_job_manager()

        # Bail out early (503) if the job backend is down.
        if not job_manager.is_available():
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail={
                    "error": "SERVICE_UNAVAILABLE",
                    "message": "任务服务暂不可用,请稍后重试",
                    "request_id": request_id,
                },
            )

        # Reject unknown algorithm names with 404 and list the valid ones.
        available_algorithms = job_manager.get_available_algorithms()
        if request.algorithm not in available_algorithms:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail={
                    "error": "ALGORITHM_NOT_FOUND",
                    "message": f"算法 '{request.algorithm}' 不存在",
                    "details": {"available_algorithms": available_algorithms},
                    "request_id": request_id,
                },
            )

        # Persist the job record first...
        job_id = await job_manager.create_job(
            algorithm=request.algorithm,
            params=request.params,
            webhook=request.webhook,
            request_id=request_id,
        )

        # ...then read it back (for created_at)...
        job_data = await job_manager.get_job(job_id)

        # ...and enqueue it for a worker to pick up.
        await job_manager.enqueue_job(job_id)

        logger.info(f"异步任务已创建并入队: job_id={job_id}, request_id={request_id}")

        return JobCreateResponse(
            job_id=job_id,
            status=JobStatus.PENDING,
            message="任务已创建",
            created_at=job_data["created_at"],
        )

    except HTTPException:
        # Re-raise deliberate HTTP errors unchanged.
        raise

    except Exception as e:
        # Catch-all boundary: log with traceback, return a generic 500.
        logger.error(f"创建任务失败: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail={
                "error": "INTERNAL_ERROR",
                "message": str(e),
                "request_id": request_id,
            },
        )
|
||||
|
||||
|
||||
@router.get(
    "/jobs/{job_id}",
    response_model=JobStatusResponse,
    summary="查询任务状态",
    description="查询异步任务的执行状态和结果",
    responses={
        200: {"description": "成功", "model": JobStatusResponse},
        404: {"description": "任务不存在或已过期", "model": ErrorResponse},
        503: {"description": "服务不可用", "model": ErrorResponse},
    },
)
async def get_job_status(job_id: str):
    """
    Query the status and result of an asynchronous job.

    - **job_id**: the job's unique identifier
    """
    try:
        job_manager = await get_job_manager()

        # Bail out early (503) if the job backend is down.
        if not job_manager.is_available():
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail={
                    "error": "SERVICE_UNAVAILABLE",
                    "message": "任务服务暂不可用,请稍后重试",
                },
            )

        # Missing record covers both "never existed" and "expired".
        job_data = await job_manager.get_job(job_id)

        if not job_data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail={
                    "error": "JOB_NOT_FOUND",
                    "message": f"任务 '{job_id}' 不存在或已过期",
                },
            )

        return JobStatusResponse(
            job_id=job_data["job_id"],
            status=JobStatus(job_data["status"]),
            algorithm=job_data["algorithm"],
            created_at=job_data["created_at"],
            started_at=job_data["started_at"],
            completed_at=job_data["completed_at"],
            result=job_data["result"],
            error=job_data["error"],
            metadata=job_data["metadata"],
        )

    except HTTPException:
        # Re-raise deliberate HTTP errors unchanged.
        raise

    except Exception as e:
        # Catch-all boundary: log with traceback, return a generic 500.
        logger.error(f"查询任务状态失败: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail={
                "error": "INTERNAL_ERROR",
                "message": str(e),
            },
        )
|
||||
|
||||
|
||||
@router.get(
    "/jobs/concurrency/status",
    response_model=ConcurrencyStatusResponse,
    summary="查询并发状态",
    description="查询任务管理器的并发执行状态",
    responses={
        200: {"description": "成功", "model": ConcurrencyStatusResponse},
        503: {"description": "服务不可用", "model": ErrorResponse},
    },
)
async def get_concurrency_status():
    """
    Report the job manager's concurrency state.

    Returns the maximum concurrent job count, the number of currently
    available slots, and the number of jobs currently running.
    """
    try:
        job_manager = await get_job_manager()

        # Bail out early (503) if the job backend is down.
        if not job_manager.is_available():
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail={
                    "error": "SERVICE_UNAVAILABLE",
                    "message": "任务管理器不可用",
                },
            )

        concurrency_status = job_manager.get_concurrency_status()

        return ConcurrencyStatusResponse(
            max_concurrent=concurrency_status["max_concurrent"],
            available_slots=concurrency_status["available_slots"],
            running_jobs=concurrency_status["running_jobs"],
        )

    except HTTPException:
        # Re-raise deliberate HTTP errors unchanged.
        raise

    except Exception as e:
        # Catch-all boundary: log with traceback, return a generic 500.
        logger.error(f"查询并发状态失败: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail={
                "error": "INTERNAL_ERROR",
                "message": str(e),
            },
        )
|
||||
82
src/functional_scaffold/config.py
Normal file
82
src/functional_scaffold/config.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""配置管理模块"""
|
||||
|
||||
from pydantic_settings import BaseSettings
|
||||
from pydantic import ConfigDict
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application settings, loaded from the environment / ``.env`` file."""

    model_config = ConfigDict(env_file=".env", case_sensitive=False)

    # Application identity
    app_name: str = "FunctionalScaffold"
    app_version: str = "1.0.0"
    app_env: str = "development"

    # HTTP server
    host: str = "0.0.0.0"
    port: int = 8000
    workers: int = 4

    # Logging
    log_level: str = "INFO"
    log_format: str = "json"
    log_file_enabled: bool = False
    log_file_path: str = "/var/log/app/app.log"

    # Metrics toggle
    metrics_enabled: bool = True

    # Tracing
    tracing_enabled: bool = False
    jaeger_endpoint: Optional[str] = None

    # External services (example: object storage)
    oss_endpoint: Optional[str] = None
    oss_access_key_id: Optional[str] = None
    oss_access_key_secret: Optional[str] = None
    oss_bucket_name: Optional[str] = None

    database_url: Optional[str] = None

    # Redis
    redis_host: str = "localhost"
    redis_port: int = 6379
    redis_db: int = 0
    redis_password: Optional[str] = None

    # Metrics registry
    metrics_config_path: str = "config/metrics.yaml"
    metrics_instance_id: Optional[str] = None  # defaults to the hostname

    # Async jobs
    job_result_ttl: int = 1800  # result cache TTL in seconds (30 minutes)
    webhook_max_retries: int = 3  # maximum webhook delivery attempts
    webhook_timeout: int = 10  # webhook request timeout (seconds)
    max_concurrent_jobs: int = 10  # maximum concurrently running jobs

    # Worker
    worker_poll_interval: float = 0.1  # worker polling interval (seconds)
    job_queue_key: str = "job:queue"  # pending-queue Redis key
    job_concurrency_key: str = "job:concurrency"  # global concurrency counter key
    job_lock_ttl: int = 300  # job lock TTL (seconds)
    job_max_retries: int = 3  # maximum job retries
    job_execution_timeout: int = 300  # job execution timeout (seconds)

    # Processing-queue keys
    job_processing_key: str = "job:processing"  # in-flight queue
    job_processing_ts_key: str = "job:processing:ts"  # dequeue-timestamp ZSET
    job_dlq_key: str = "job:dlq"  # dead-letter queue

    # Lock tuning
    job_lock_buffer: int = 60  # extra TTL buffer on job locks (seconds)

    # Stale-job sweeper
    job_sweeper_enabled: bool = True  # enable the recovery sweeper
    job_sweeper_interval: int = 60  # sweep interval (seconds)


# Process-wide settings instance.
settings = Settings()
|
||||
21
src/functional_scaffold/core/__init__.py
Normal file
21
src/functional_scaffold/core/__init__.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""核心功能模块"""
|
||||
|
||||
from .errors import (
|
||||
FunctionalScaffoldError,
|
||||
ValidationError,
|
||||
AlgorithmError,
|
||||
ConfigurationError,
|
||||
)
|
||||
from .logging import setup_logging
|
||||
from .metrics import metrics_registry, track_request, track_algorithm_execution
|
||||
|
||||
__all__ = [
|
||||
"FunctionalScaffoldError",
|
||||
"ValidationError",
|
||||
"AlgorithmError",
|
||||
"ConfigurationError",
|
||||
"setup_logging",
|
||||
"metrics_registry",
|
||||
"track_request",
|
||||
"track_algorithm_execution",
|
||||
]
|
||||
47
src/functional_scaffold/core/errors.py
Normal file
47
src/functional_scaffold/core/errors.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""错误处理模块"""
|
||||
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
|
||||
class FunctionalScaffoldError(Exception):
    """Base class for all application-specific exceptions.

    Carries a machine-readable error code and an optional details payload
    alongside the human-readable message.
    """

    def __init__(
        self,
        message: str,
        error_code: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
    ):
        # Fall back to a generic code / empty payload when none is supplied.
        self.message = message
        self.error_code = error_code or "INTERNAL_ERROR"
        self.details = details or {}
        super().__init__(message)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the error as a plain dictionary (e.g. for JSON responses)."""
        return {
            "error": self.error_code,
            "message": self.message,
            "details": self.details,
        }
|
||||
|
||||
|
||||
class ValidationError(FunctionalScaffoldError):
    """Raised when request parameters fail validation."""

    def __init__(self, message: str, details: Optional[Dict[str, Any]] = None):
        # Pin the error code; callers may only customize message and details.
        super().__init__(message, details=details, error_code="VALIDATION_ERROR")
|
||||
|
||||
|
||||
class AlgorithmError(FunctionalScaffoldError):
    """Raised when an algorithm fails during execution."""

    def __init__(self, message: str, details: Optional[Dict[str, Any]] = None):
        # Pin the error code; callers may only customize message and details.
        super().__init__(message, details=details, error_code="ALGORITHM_ERROR")
|
||||
|
||||
|
||||
class ConfigurationError(FunctionalScaffoldError):
    """Raised when application configuration is invalid or missing."""

    def __init__(self, message: str, details: Optional[Dict[str, Any]] = None):
        # Pin the error code; callers may only customize message and details.
        super().__init__(message, details=details, error_code="CONFIGURATION_ERROR")
|
||||
856
src/functional_scaffold/core/job_manager.py
Normal file
856
src/functional_scaffold/core/job_manager.py
Normal file
@@ -0,0 +1,856 @@
|
||||
"""异步任务管理模块
|
||||
|
||||
基于 Redis 的异步任务管理,支持任务创建、执行、状态查询和 Webhook 回调。
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import secrets
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, List, Optional, Type
|
||||
|
||||
import httpx
|
||||
import redis.asyncio as aioredis
|
||||
|
||||
from ..algorithms.base import BaseAlgorithm
|
||||
from ..config import settings
|
||||
from ..core.metrics_unified import incr, observe
|
||||
from ..core.tracing import set_request_id
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class JobManager:
|
||||
"""异步任务管理器"""
|
||||
|
||||
# Lua 脚本:安全释放锁(验证 token)
|
||||
RELEASE_LOCK_SCRIPT = """
|
||||
local current = redis.call('GET', KEYS[1])
|
||||
if current == ARGV[1] then
|
||||
return redis.call('DEL', KEYS[1])
|
||||
end
|
||||
return 0
|
||||
"""
|
||||
|
||||
# Lua 脚本:锁续租(验证 token 后延长 TTL)
|
||||
RENEW_LOCK_SCRIPT = """
|
||||
local current = redis.call('GET', KEYS[1])
|
||||
if current == ARGV[1] then
|
||||
return redis.call('EXPIRE', KEYS[1], ARGV[2])
|
||||
end
|
||||
return 0
|
||||
"""
|
||||
|
||||
def __init__(self):
    """Create an unconnected manager; call initialize() before use."""
    # Connections and clients are created lazily in initialize().
    self._redis_client: Optional[aioredis.Redis] = None
    self._http_client: Optional[httpx.AsyncClient] = None
    self._algorithm_registry: Dict[str, Type[BaseAlgorithm]] = {}
    # Process-local concurrency guard; configured in initialize().
    self._semaphore: Optional[asyncio.Semaphore] = None
    self._max_concurrent_jobs: int = 0
|
||||
|
||||
async def initialize(self) -> None:
    """Set up the Redis connection, HTTP client, semaphore and algorithm registry.

    A failed Redis connection is logged and leaves the manager in a degraded
    (unavailable) state rather than raising.
    """
    try:
        client = aioredis.Redis(
            host=settings.redis_host,
            port=settings.redis_port,
            db=settings.redis_db,
            password=settings.redis_password if settings.redis_password else None,
            decode_responses=True,
            socket_connect_timeout=5,
            socket_timeout=5,
        )
        # Fail fast: a ping verifies the connection actually works.
        await client.ping()
        self._redis_client = client
        logger.info(f"任务管理器 Redis 连接成功: {settings.redis_host}:{settings.redis_port}")
    except Exception as e:
        logger.error(f"任务管理器 Redis 连接失败: {e}")
        self._redis_client = None

    self._http_client = httpx.AsyncClient(timeout=settings.webhook_timeout)

    # Per-process concurrency limit enforced via an asyncio semaphore.
    self._max_concurrent_jobs = settings.max_concurrent_jobs
    self._semaphore = asyncio.Semaphore(self._max_concurrent_jobs)
    logger.info(f"任务并发限制已设置: {self._max_concurrent_jobs}")

    self._register_algorithms()
|
||||
|
||||
async def shutdown(self) -> None:
    """Release the Redis connection and the HTTP client, if open."""
    redis_client = self._redis_client
    if redis_client:
        # NOTE(review): redis-py 5.x prefers aclose(); close() still works — confirm version.
        await redis_client.close()
        logger.info("任务管理器 Redis 连接已关闭")

    http_client = self._http_client
    if http_client:
        await http_client.aclose()
        logger.info("任务管理器 HTTP 客户端已关闭")
|
||||
|
||||
def _register_algorithms(self) -> None:
    """Register every concrete BaseAlgorithm subclass exported by ..algorithms."""
    from .. import algorithms as algorithms_module
    from ..algorithms import __all__ as algorithm_names

    for name in algorithm_names:
        candidate = getattr(algorithms_module, name, None)
        # Only register real subclasses; skip non-classes and the abstract base itself.
        if (
            isinstance(candidate, type)
            and issubclass(candidate, BaseAlgorithm)
            and candidate is not BaseAlgorithm
        ):
            self._algorithm_registry[name] = candidate
            logger.debug(f"已注册算法: {name}")

    logger.info(f"已注册 {len(self._algorithm_registry)} 个算法")
|
||||
|
||||
def get_available_algorithms(self) -> List[str]:
    """Return the names of all registered algorithms."""
    return [*self._algorithm_registry]
|
||||
|
||||
def _generate_job_id(self) -> str:
    """Return a random 12-hex-character job identifier (6 random bytes)."""
    return secrets.token_hex(6)
|
||||
|
||||
def _get_timestamp(self) -> str:
    """Return the current UTC time as an ISO-8601 string."""
    now = datetime.now(timezone.utc)
    return now.isoformat()
|
||||
|
||||
async def create_job(
    self,
    algorithm: str,
    params: Dict[str, Any],
    webhook: Optional[str] = None,
    request_id: Optional[str] = None,
) -> str:
    """Create a new pending job and persist it to Redis.

    Args:
        algorithm: Name of a registered algorithm.
        params: Parameters forwarded to the algorithm.
        webhook: Optional callback URL invoked on completion.
        request_id: Optional request ID for log correlation.

    Returns:
        The generated job ID.

    Raises:
        RuntimeError: If Redis is unavailable.
        ValueError: If the algorithm is not registered.
    """
    if self._redis_client is None:
        raise RuntimeError("Redis 不可用,无法创建任务")
    if algorithm not in self._algorithm_registry:
        raise ValueError(f"算法 '{algorithm}' 不存在")

    job_id = self._generate_job_id()

    # Everything is stored as strings in a Redis hash; "" marks unset fields.
    record = {
        "status": "pending",
        "algorithm": algorithm,
        "params": json.dumps(params),
        "webhook": webhook or "",
        "request_id": request_id or "",
        "created_at": self._get_timestamp(),
        "started_at": "",
        "completed_at": "",
        "result": "",
        "error": "",
        "metadata": "",
    }
    await self._redis_client.hset(f"job:{job_id}", mapping=record)

    incr("jobs_created_total", {"algorithm": algorithm})
    logger.info(f"任务已创建: job_id={job_id}, algorithm={algorithm}")
    return job_id
|
||||
|
||||
async def get_job(self, job_id: str) -> Optional[Dict[str, Any]]:
    """Fetch and decode a job record from Redis.

    Args:
        job_id: Job identifier.

    Returns:
        The decoded job record, or None when Redis is down or the job
        does not exist (e.g. expired).
    """
    if self._redis_client is None:
        return None

    raw = await self._redis_client.hgetall(f"job:{job_id}")
    if not raw:
        return None

    def _loads(field: str) -> Optional[Any]:
        # JSON-encoded fields; "" means unset, bad JSON decodes to None.
        text = raw.get(field)
        if not text:
            return None
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            return None

    return {
        "job_id": job_id,
        "status": raw.get("status", ""),
        "algorithm": raw.get("algorithm", ""),
        "request_id": raw.get("request_id") or None,
        "created_at": raw.get("created_at", ""),
        "started_at": raw.get("started_at") or None,
        "completed_at": raw.get("completed_at") or None,
        "result": _loads("result"),
        "error": raw.get("error") or None,
        "metadata": _loads("metadata"),
    }
|
||||
|
||||
async def execute_job(self, job_id: str) -> None:
    """Execute a job in the background, bounded by the process-local semaphore.

    Loads the job record, runs the registered algorithm, persists the
    outcome (with a TTL so finished jobs expire), records metrics, and
    fires the webhook callback if one was registered.

    Args:
        job_id: Job identifier.
    """
    if not self._redis_client:
        logger.error(f"Redis 不可用,无法执行任务: {job_id}")
        return

    if not self._semaphore:
        logger.error(f"并发控制未初始化,无法执行任务: {job_id}")
        return

    key = f"job:{job_id}"
    job_data = await self._redis_client.hgetall(key)
    if not job_data:
        logger.error(f"任务不存在: {job_id}")
        return

    algorithm_name = job_data.get("algorithm", "")
    webhook_url = job_data.get("webhook", "")
    request_id = job_data.get("request_id", "")

    # Propagate the originating request ID into this task's log context.
    if request_id:
        set_request_id(request_id)

    try:
        params = json.loads(job_data.get("params", "{}"))
    except json.JSONDecodeError:
        params = {}

    # Bound concurrent executions within this process.
    async with self._semaphore:
        started_at = self._get_timestamp()
        await self._redis_client.hset(
            key, mapping={"status": "running", "started_at": started_at}
        )

        logger.info(f"开始执行任务: job_id={job_id}, algorithm={algorithm_name}")

        # FIX: use the module-level `time` import; the original re-imported
        # `time` inside the function, shadowing the existing import for no gain.
        start_time = time.time()
        status = "completed"
        result_data = None
        error_msg = None
        metadata = None

        try:
            algorithm_cls = self._algorithm_registry.get(algorithm_name)
            if not algorithm_cls:
                raise ValueError(f"算法 '{algorithm_name}' 不存在")

            algorithm = algorithm_cls()

            # PrimeChecker takes a single positional argument; all other
            # algorithms receive the params dict as keyword arguments.
            if algorithm_name == "PrimeChecker":
                execution_result = algorithm.execute(params.get("number", 0))
            else:
                execution_result = algorithm.execute(**params)

            if execution_result.get("success"):
                result_data = execution_result.get("result", {})
                metadata = execution_result.get("metadata", {})
            else:
                status = "failed"
                error_msg = execution_result.get("error", "算法执行失败")
                metadata = execution_result.get("metadata", {})

        except Exception as e:
            status = "failed"
            error_msg = str(e)
            logger.error(f"任务执行失败: job_id={job_id}, error={e}", exc_info=True)

        elapsed_time = time.time() - start_time
        completed_at = self._get_timestamp()

        # Persist the outcome; "" marks unset fields in the hash.
        update_data = {
            "status": status,
            "completed_at": completed_at,
            "result": json.dumps(result_data) if result_data else "",
            "error": error_msg or "",
            "metadata": json.dumps(metadata) if metadata else "",
        }
        await self._redis_client.hset(key, mapping=update_data)

        # Results expire so finished jobs don't accumulate in Redis.
        await self._redis_client.expire(key, settings.job_result_ttl)

        incr("jobs_completed_total", {"algorithm": algorithm_name, "status": status})
        observe("job_execution_duration_seconds", {"algorithm": algorithm_name}, elapsed_time)

        logger.info(
            f"任务执行完成: job_id={job_id}, status={status}, elapsed={elapsed_time:.3f}s"
        )

        # NOTE(review): the webhook (with retries) is sent while still holding
        # a semaphore slot — confirm this is intentional.
        if webhook_url:
            await self._send_webhook(job_id, webhook_url)
|
||||
|
||||
async def _send_webhook(self, job_id: str, webhook_url: str) -> None:
    """POST the job outcome to the caller-supplied webhook, retrying on failure.

    Retries follow a capped backoff schedule (1s, 5s, 15s). Delivery
    outcomes are recorded in the webhook_deliveries_total metric.

    Args:
        job_id: Job identifier.
        webhook_url: Destination URL for the callback.
    """
    if self._http_client is None:
        logger.warning("HTTP 客户端不可用,无法发送 Webhook")
        return

    job_data = await self.get_job(job_id)
    if not job_data:
        logger.error(f"无法获取任务数据用于 Webhook: {job_id}")
        return

    payload = {
        "job_id": job_data["job_id"],
        "status": job_data["status"],
        "algorithm": job_data["algorithm"],
        "result": job_data["result"],
        "error": job_data["error"],
        "metadata": job_data["metadata"],
        "completed_at": job_data["completed_at"],
    }

    backoff = [1, 5, 15]
    max_retries = settings.webhook_max_retries

    for attempt in range(max_retries):
        try:
            response = await self._http_client.post(
                webhook_url,
                json=payload,
                headers={"Content-Type": "application/json"},
            )
            if response.status_code < 400:
                incr("webhook_deliveries_total", {"status": "success"})
                logger.info(
                    f"Webhook 发送成功: job_id={job_id}, url={webhook_url}, "
                    f"status_code={response.status_code}"
                )
                return
            # Non-2xx/3xx responses count as failures and fall through to retry.
            logger.warning(
                f"Webhook 响应错误: job_id={job_id}, status_code={response.status_code}"
            )
        except Exception as e:
            logger.warning(
                f"Webhook 发送失败 (尝试 {attempt + 1}/{max_retries}): "
                f"job_id={job_id}, error={e}"
            )

        # Sleep before the next attempt (skipped after the final one).
        if attempt < max_retries - 1:
            await asyncio.sleep(backoff[min(attempt, len(backoff) - 1)])

    incr("webhook_deliveries_total", {"status": "failed"})
    logger.error(f"Webhook 发送最终失败: job_id={job_id}, url={webhook_url}")
|
||||
|
||||
def is_available(self) -> bool:
    """Whether the manager holds a live Redis connection."""
    return self._redis_client is not None
|
||||
|
||||
async def enqueue_job(self, job_id: str) -> bool:
    """Push a job ID onto the pending queue.

    Args:
        job_id: Job identifier.

    Returns:
        True when the job was enqueued, False otherwise.
    """
    if self._redis_client is None:
        logger.error(f"Redis 不可用,无法入队任务: {job_id}")
        return False

    try:
        await self._redis_client.lpush(settings.job_queue_key, job_id)
    except Exception as e:
        logger.error(f"任务入队失败: job_id={job_id}, error={e}")
        return False

    logger.info(f"任务已入队: job_id={job_id}")
    return True
|
||||
|
||||
async def dequeue_job(self, timeout: int = 5) -> Optional[str]:
    """Atomically move the next job from the pending to the processing queue.

    BLMOVE guarantees the job ID is never lost between queues even if this
    worker crashes right after the pop; the dequeue timestamp is tracked in
    a ZSET so stale in-flight jobs can be recovered later.

    Args:
        timeout: Blocking timeout in seconds.

    Returns:
        The job ID, or None on timeout, error, or missing Redis connection.
    """
    if self._redis_client is None:
        return None

    try:
        job_id = await self._redis_client.blmove(
            settings.job_queue_key,  # source: job:queue
            settings.job_processing_key,  # destination: job:processing
            timeout,
            "RIGHT",
            "LEFT",
        )
        if job_id:
            # Record when the job entered processing (for the sweeper).
            await self._redis_client.zadd(
                settings.job_processing_ts_key, {job_id: time.time()}
            )
            logger.debug(f"任务已转移到处理队列: {job_id}")
        return job_id
    except Exception as e:
        logger.error(f"任务出队失败: error={e}")
        return None
|
||||
|
||||
async def acquire_job_lock(self, job_id: str) -> Optional[str]:
    """Take the per-job distributed lock via SET NX with a random token.

    Args:
        job_id: Job identifier.

    Returns:
        The lock token on success; None when the lock is already held or
        Redis fails.
    """
    if self._redis_client is None:
        return None

    lock_key = f"job:lock:{job_id}"
    token = secrets.token_hex(16)
    # The TTL covers the whole execution window plus a safety buffer.
    ttl = settings.job_execution_timeout + settings.job_lock_buffer

    try:
        if await self._redis_client.set(lock_key, token, nx=True, ex=ttl):
            logger.debug(f"获取任务锁成功: job_id={job_id}")
            return token
        return None
    except Exception as e:
        logger.error(f"获取任务锁失败: job_id={job_id}, error={e}")
        return None
|
||||
|
||||
async def release_job_lock(self, job_id: str, lock_token: Optional[str] = None) -> bool:
    """Release the per-job lock, verifying ownership when a token is given.

    With a token the release goes through a Lua script that only deletes
    the key when the stored token matches; without one the key is deleted
    unconditionally (backward-compatible path).

    Args:
        job_id: Job identifier.
        lock_token: Token returned by acquire_job_lock; falsy skips verification.

    Returns:
        True when the lock was released.
    """
    if self._redis_client is None:
        return False

    lock_key = f"job:lock:{job_id}"
    try:
        if not lock_token:
            # Backward-compatible path: delete without ownership check.
            await self._redis_client.delete(lock_key)
            logger.debug(f"释放任务锁成功(无 token 验证): job_id={job_id}")
            return True

        outcome = await self._redis_client.eval(
            self.RELEASE_LOCK_SCRIPT, 1, lock_key, lock_token
        )
        if outcome == 1:
            logger.debug(f"释放任务锁成功: job_id={job_id}")
            return True
        logger.warning(f"释放任务锁失败(token 不匹配): job_id={job_id}")
        return False
    except Exception as e:
        logger.error(f"释放任务锁失败: job_id={job_id}, error={e}")
        return False
|
||||
|
||||
async def increment_concurrency(self) -> int:
    """Atomically bump the global (cross-process) concurrency counter.

    Returns:
        The counter value after the increment, or 0 on failure.
    """
    if self._redis_client is None:
        return 0

    try:
        return await self._redis_client.incr(settings.job_concurrency_key)
    except Exception as e:
        logger.error(f"增加并发计数失败: error={e}")
        return 0
|
||||
|
||||
async def decrement_concurrency(self) -> int:
    """Atomically decrement the global concurrency counter, clamping at zero.

    Returns:
        The counter value after the decrement, or 0 on failure.
    """
    if self._redis_client is None:
        return 0

    try:
        count = await self._redis_client.decr(settings.job_concurrency_key)
        if count < 0:
            # NOTE(review): this clamp is not atomic with the DECR — a
            # concurrent INCR between the two calls can be overwritten.
            await self._redis_client.set(settings.job_concurrency_key, 0)
            return 0
        return count
    except Exception as e:
        logger.error(f"减少并发计数失败: error={e}")
        return 0
|
||||
|
||||
async def get_global_concurrency(self) -> int:
    """Read the current global concurrency counter.

    Returns:
        The counter value; 0 when unset or on failure.
    """
    if self._redis_client is None:
        return 0

    try:
        raw = await self._redis_client.get(settings.job_concurrency_key)
        return int(raw) if raw else 0
    except Exception as e:
        logger.error(f"获取并发计数失败: error={e}")
        return 0
|
||||
|
||||
async def can_execute(self) -> bool:
    """Whether the global concurrency counter is below the configured maximum.

    Returns:
        True when a new job may start.
    """
    running = await self.get_global_concurrency()
    return running < settings.max_concurrent_jobs
|
||||
|
||||
async def get_job_retry_count(self, job_id: str) -> int:
    """Read a job's retry counter from its Redis hash.

    Args:
        job_id: Job identifier.

    Returns:
        The retry count; 0 when unset or on any error.
    """
    if self._redis_client is None:
        return 0

    try:
        raw = await self._redis_client.hget(f"job:{job_id}", "retry_count")
        return int(raw) if raw else 0
    except Exception:
        # Best-effort read: treat any failure as "no retries recorded".
        return 0
|
||||
|
||||
async def increment_job_retry(self, job_id: str) -> int:
    """Increment a job's retry counter.

    Args:
        job_id: Job identifier.

    Returns:
        The retry count after the increment, or 0 on failure.
    """
    if not self._redis_client:
        return 0

    key = f"job:{job_id}"
    try:
        # FIX: HINCRBY already returns the post-increment value; the original
        # issued a second HGET round-trip, which was both redundant and racy
        # (another worker's increment could land between the two calls).
        return await self._redis_client.hincrby(key, "retry_count", 1)
    except Exception as e:
        logger.error(f"增加重试次数失败: job_id={job_id}, error={e}")
        return 0
|
||||
|
||||
async def ack_job(self, job_id: str) -> bool:
    """Acknowledge a finished job by removing it from the processing structures.

    Args:
        job_id: Job identifier.

    Returns:
        True on success.
    """
    if self._redis_client is None:
        return False

    try:
        # Remove from the in-flight list and its timestamp ZSET atomically.
        async with self._redis_client.pipeline(transaction=True) as pipe:
            pipe.lrem(settings.job_processing_key, 1, job_id)
            pipe.zrem(settings.job_processing_ts_key, job_id)
            await pipe.execute()
    except Exception as e:
        logger.error(f"确认任务失败: job_id={job_id}, error={e}")
        return False

    logger.debug(f"任务已确认完成: job_id={job_id}")
    return True
|
||||
|
||||
async def nack_job(self, job_id: str, requeue: bool = True) -> bool:
    """Reject a job: remove it from processing, then requeue or dead-letter it.

    Jobs still under the retry limit go back onto the pending queue (when
    requeue is allowed); everything else lands in the dead-letter queue.

    Args:
        job_id: Job identifier.
        requeue: Whether requeuing is allowed at all.

    Returns:
        True on success.
    """
    if self._redis_client is None:
        return False

    try:
        retry_count = await self.get_job_retry_count(job_id)
        should_requeue = requeue and retry_count < settings.job_max_retries

        async with self._redis_client.pipeline(transaction=True) as pipe:
            pipe.lrem(settings.job_processing_key, 1, job_id)
            pipe.zrem(settings.job_processing_ts_key, job_id)
            if should_requeue:
                pipe.lpush(settings.job_queue_key, job_id)
                logger.info(f"任务重新入队: job_id={job_id}, retry_count={retry_count}")
            else:
                pipe.lpush(settings.job_dlq_key, job_id)
                logger.warning(f"任务进入死信队列: job_id={job_id}, retry_count={retry_count}")
            await pipe.execute()
        return True
    except Exception as e:
        logger.error(f"拒绝任务失败: job_id={job_id}, error={e}")
        return False
|
||||
|
||||
async def renew_job_lock(self, job_id: str, lock_token: str) -> bool:
    """Extend the TTL of a held job lock (ownership verified via Lua).

    Args:
        job_id: Job identifier.
        lock_token: Token proving lock ownership.

    Returns:
        True when the TTL was extended.
    """
    if self._redis_client is None:
        return False

    lock_key = f"job:lock:{job_id}"
    ttl = settings.job_execution_timeout + settings.job_lock_buffer
    try:
        renewed = await self._redis_client.eval(
            self.RENEW_LOCK_SCRIPT, 1, lock_key, lock_token, ttl
        )
    except Exception as e:
        logger.error(f"锁续租失败: job_id={job_id}, error={e}")
        return False

    if renewed == 1:
        logger.debug(f"锁续租成功: job_id={job_id}")
        return True
    logger.warning(f"锁续租失败(token 不匹配或锁已过期): job_id={job_id}")
    return False
|
||||
|
||||
async def recover_stale_jobs(self) -> int:
    """Recover jobs whose processing time exceeded the execution timeout.

    Scans the processing-timestamp ZSET for entries older than the
    execution timeout (plus the lock buffer) and, per job, either requeues
    it or dead-letters it depending on its retry count.

    Returns:
        Number of jobs recovered.
    """
    if self._redis_client is None:
        return 0

    timeout = settings.job_execution_timeout + settings.job_lock_buffer
    cutoff = time.time() - timeout

    try:
        # Everything dequeued before the cutoff is considered stale.
        stale_jobs = await self._redis_client.zrangebyscore(
            settings.job_processing_ts_key, "-inf", cutoff
        )

        recovered = 0
        for job_id in stale_jobs:
            # FIX: increment_job_retry already returns the post-increment
            # count; the original issued an extra get_job_retry_count
            # round-trip for a value it already had.
            retry_count = await self.increment_job_retry(job_id)

            async with self._redis_client.pipeline(transaction=True) as pipe:
                pipe.lrem(settings.job_processing_key, 1, job_id)
                pipe.zrem(settings.job_processing_ts_key, job_id)
                if retry_count < settings.job_max_retries:
                    pipe.lpush(settings.job_queue_key, job_id)
                    logger.info(f"超时任务重新入队: job_id={job_id}, retry_count={retry_count}")
                else:
                    pipe.lpush(settings.job_dlq_key, job_id)
                    logger.warning(
                        f"超时任务进入死信队列: job_id={job_id}, retry_count={retry_count}"
                    )
                await pipe.execute()
            recovered += 1

        if recovered > 0:
            logger.info(f"回收超时任务完成: 共 {recovered} 个")
        return recovered
    except Exception as e:
        logger.error(f"回收超时任务失败: error={e}")
        return 0
|
||||
|
||||
def get_concurrency_status(self) -> Dict[str, int]:
    """Snapshot the process-local concurrency state.

    Returns:
        Dict with:
        - max_concurrent: configured maximum concurrent jobs
        - available_slots: free semaphore slots
        - running_jobs: jobs currently holding a slot
    """
    if self._semaphore is None:
        return {"max_concurrent": 0, "available_slots": 0, "running_jobs": 0}

    limit = self._max_concurrent_jobs
    # NOTE(review): Semaphore._value is a private attribute; asyncio exposes
    # no public counter, so this relies on a CPython implementation detail.
    free = self._semaphore._value
    return {
        "max_concurrent": limit,
        "available_slots": free,
        "running_jobs": limit - free,
    }
|
||||
|
||||
async def collect_queue_metrics(self) -> Dict[str, Any]:
    """Collect queue-depth and wait-time metrics and push them to the registry.

    Returns:
        Dict with queue_length, processing_length, dlq_length and
        oldest_waiting_seconds — all zero when Redis is unavailable or on
        error.
    """
    zeros = {
        "queue_length": 0,
        "processing_length": 0,
        "dlq_length": 0,
        "oldest_waiting_seconds": 0,
    }
    if self._redis_client is None:
        return zeros

    try:
        # Batch the four reads into a single non-transactional pipeline.
        async with self._redis_client.pipeline(transaction=False) as pipe:
            pipe.llen(settings.job_queue_key)
            pipe.llen(settings.job_processing_key)
            pipe.llen(settings.job_dlq_key)
            pipe.zrange(settings.job_processing_ts_key, 0, 0, withscores=True)
            pending, processing, dlq, oldest = await pipe.execute()

        pending = pending or 0
        processing = processing or 0
        dlq = dlq or 0

        # The first ZSET entry, if any, is (job_id, dequeue_timestamp).
        oldest_waiting = 0
        if oldest:
            oldest_waiting = time.time() - oldest[0][1]

        from .metrics_unified import set as metrics_set

        metrics_set("job_queue_length", {"queue": "pending"}, pending)
        metrics_set("job_queue_length", {"queue": "processing"}, processing)
        metrics_set("job_queue_length", {"queue": "dlq"}, dlq)
        metrics_set("job_oldest_waiting_seconds", None, oldest_waiting)

        return {
            "queue_length": pending,
            "processing_length": processing,
            "dlq_length": dlq,
            "oldest_waiting_seconds": oldest_waiting,
        }
    except Exception as e:
        logger.error(f"收集队列指标失败: error={e}")
        return zeros
|
||||
|
||||
|
||||
# 全局单例
|
||||
_job_manager: Optional[JobManager] = None
|
||||
|
||||
|
||||
async def get_job_manager() -> JobManager:
    """Return the process-wide JobManager, creating it on first use.

    Returns:
        The initialized singleton instance.
    """
    global _job_manager
    if _job_manager is None:
        # FIX: only publish the singleton after initialize() succeeds. The
        # original assigned the global first, so a failing initialize()
        # cached a broken half-initialized manager forever.
        manager = JobManager()
        await manager.initialize()
        _job_manager = manager
    # NOTE(review): not lock-guarded — two coroutines racing here could each
    # build a manager; confirm this is only called during startup.
    return _job_manager
|
||||
|
||||
|
||||
async def shutdown_job_manager() -> None:
    """Dispose of the global JobManager singleton, if one exists."""
    global _job_manager
    manager = _job_manager
    if manager is not None:
        await manager.shutdown()
        _job_manager = None
|
||||
98
src/functional_scaffold/core/logging.py
Normal file
98
src/functional_scaffold/core/logging.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""日志配置模块"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from logging.handlers import RotatingFileHandler
|
||||
from pythonjsonlogger.json import JsonFormatter
|
||||
|
||||
from .tracing import get_request_id
|
||||
|
||||
|
||||
class RequestIdFilter(logging.Filter):
    """Logging filter that injects the current request ID into every record."""

    def filter(self, record: logging.LogRecord) -> bool:
        """Attach a ``request_id`` attribute to the record.

        Args:
            record: The log record being emitted.

        Returns:
            Always True — no record is ever dropped.
        """
        rid = get_request_id()
        # "-" marks records emitted outside any request context.
        record.request_id = rid if rid else "-"
        return True
|
||||
|
||||
|
||||
def setup_logging(
    level: str = "INFO",
    format_type: str = "json",
    logger_name: Optional[str] = None,
    file_path: Optional[str] = None,
) -> logging.Logger:
    """Configure and return a logger with console (and optional file) output.

    Args:
        level: Log level name (DEBUG, INFO, WARNING, ERROR, CRITICAL).
        format_type: Output format — 'json' for structured logs, anything
            else falls back to a plain text formatter.
        logger_name: Logger name; None configures the root logger.
        file_path: Log file path; None disables file output.

    Returns:
        logging.Logger: The configured logger (existing handlers are removed).
    """
    numeric_level = getattr(logging, level.upper())
    target = logging.getLogger(logger_name)
    target.setLevel(numeric_level)

    # Drop any handlers configured earlier so repeated calls don't duplicate output.
    target.handlers.clear()

    if format_type == "json":
        formatter = JsonFormatter(
            "%(asctime)s %(name)s %(levelname)s %(message)s %(request_id)s",
            timestamp=True,
        )
    else:
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - [%(request_id)s] - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )

    # Every handler gets the request-id filter so %(request_id)s is always set.
    rid_filter = RequestIdFilter()

    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(numeric_level)
    stream_handler.setFormatter(formatter)
    stream_handler.addFilter(rid_filter)
    target.addHandler(stream_handler)

    if file_path:
        # Make sure the log directory exists before opening the file.
        Path(file_path).parent.mkdir(parents=True, exist_ok=True)

        # Rotate at 100 MB, keeping 5 backups.
        rotating = RotatingFileHandler(
            file_path,
            maxBytes=100 * 1024 * 1024,  # 100MB
            backupCount=5,
            encoding="utf-8",
        )
        rotating.setLevel(numeric_level)
        rotating.setFormatter(formatter)
        rotating.addFilter(rid_filter)
        target.addHandler(rotating)

    return target
|
||||
111
src/functional_scaffold/core/metrics.py
Normal file
111
src/functional_scaffold/core/metrics.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""Prometheus 指标模块"""
|
||||
|
||||
from prometheus_client import Counter, Histogram, Gauge, CollectorRegistry
|
||||
from functools import wraps
|
||||
import time
|
||||
from typing import Callable
|
||||
|
||||
# 创建指标注册表
|
||||
metrics_registry = CollectorRegistry()
|
||||
|
||||
# 请求计数器
|
||||
request_counter = Counter(
|
||||
"http_requests_total",
|
||||
"Total HTTP requests",
|
||||
["method", "endpoint", "status"],
|
||||
registry=metrics_registry,
|
||||
)
|
||||
|
||||
# 请求延迟直方图
|
||||
request_latency = Histogram(
|
||||
"http_request_duration_seconds",
|
||||
"HTTP request latency",
|
||||
["method", "endpoint"],
|
||||
registry=metrics_registry,
|
||||
)
|
||||
|
||||
# 算法执行计数器
|
||||
algorithm_counter = Counter(
|
||||
"algorithm_executions_total",
|
||||
"Total algorithm executions",
|
||||
["algorithm", "status"],
|
||||
registry=metrics_registry,
|
||||
)
|
||||
|
||||
# 算法执行延迟
|
||||
algorithm_latency = Histogram(
|
||||
"algorithm_execution_duration_seconds",
|
||||
"Algorithm execution latency",
|
||||
["algorithm"],
|
||||
registry=metrics_registry,
|
||||
)
|
||||
|
||||
# 当前处理中的请求数
|
||||
in_progress_requests = Gauge(
|
||||
"http_requests_in_progress",
|
||||
"Number of HTTP requests in progress",
|
||||
registry=metrics_registry,
|
||||
)
|
||||
|
||||
|
||||
def track_request(method: str, endpoint: str):
    """Decorator factory: record Prometheus metrics around an async HTTP handler.

    Increments the in-progress gauge for the duration of the call, then in
    ``finally`` records the request counter (status "success"/"error") and
    the latency histogram. Exceptions always propagate to the caller.

    Args:
        method: HTTP method label value.
        endpoint: Endpoint path label value.
    """

    def decorator(func: Callable):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            in_progress_requests.inc()
            start_time = time.time()
            # Default to "error" so the finally block is well-defined even when
            # a BaseException escapes (e.g. asyncio.CancelledError), which the
            # original "except Exception" clause did not catch — that left
            # `status` unbound and raised UnboundLocalError in finally.
            status = "error"
            try:
                result = await func(*args, **kwargs)
                status = "success"
                return result
            finally:
                elapsed = time.time() - start_time
                request_counter.labels(method=method, endpoint=endpoint, status=status).inc()
                request_latency.labels(method=method, endpoint=endpoint).observe(elapsed)
                in_progress_requests.dec()

        return wrapper

    return decorator
|
||||
|
||||
|
||||
def track_algorithm_execution(algorithm_name: str):
    """Decorator factory: record Prometheus metrics around a sync algorithm call.

    Records the execution counter (status "success"/"error") and the latency
    histogram in ``finally``. Exceptions always propagate to the caller.

    Args:
        algorithm_name: Algorithm name used as the metric label value.
    """

    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            # Default to "error" so the finally block is well-defined even when
            # a BaseException escapes (e.g. KeyboardInterrupt), which the
            # original "except Exception" clause did not catch — that left
            # `status` unbound and raised UnboundLocalError in finally.
            status = "error"
            try:
                result = func(*args, **kwargs)
                status = "success"
                return result
            finally:
                elapsed = time.time() - start_time
                algorithm_counter.labels(algorithm=algorithm_name, status=status).inc()
                algorithm_latency.labels(algorithm=algorithm_name).observe(elapsed)

        return wrapper

    return decorator
|
||||
162
src/functional_scaffold/core/metrics_pushgateway.py
Normal file
162
src/functional_scaffold/core/metrics_pushgateway.py
Normal file
@@ -0,0 +1,162 @@
|
||||
"""基于 Pushgateway 的 Prometheus 指标模块"""
|
||||
|
||||
from prometheus_client import Counter, Histogram, Gauge, CollectorRegistry, push_to_gateway
|
||||
from functools import wraps
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
import os
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 创建指标注册表
|
||||
metrics_registry = CollectorRegistry()
|
||||
|
||||
# Pushgateway 配置
|
||||
PUSHGATEWAY_URL = os.getenv("PUSHGATEWAY_URL", "localhost:9091")
|
||||
JOB_NAME = os.getenv("METRICS_JOB_NAME", "functional_scaffold")
|
||||
INSTANCE_ID = os.getenv("INSTANCE_ID", os.getenv("HOSTNAME", "unknown"))
|
||||
|
||||
# 请求计数器
|
||||
request_counter = Counter(
|
||||
"http_requests_total",
|
||||
"Total HTTP requests",
|
||||
["method", "endpoint", "status", "instance"],
|
||||
registry=metrics_registry,
|
||||
)
|
||||
|
||||
# 请求延迟直方图
|
||||
request_latency = Histogram(
|
||||
"http_request_duration_seconds",
|
||||
"HTTP request latency",
|
||||
["method", "endpoint", "instance"],
|
||||
registry=metrics_registry,
|
||||
)
|
||||
|
||||
# 算法执行计数器
|
||||
algorithm_counter = Counter(
|
||||
"algorithm_executions_total",
|
||||
"Total algorithm executions",
|
||||
["algorithm", "status", "instance"],
|
||||
registry=metrics_registry,
|
||||
)
|
||||
|
||||
# 算法执行延迟
|
||||
algorithm_latency = Histogram(
|
||||
"algorithm_execution_duration_seconds",
|
||||
"Algorithm execution latency",
|
||||
["algorithm", "instance"],
|
||||
registry=metrics_registry,
|
||||
)
|
||||
|
||||
# 当前处理中的请求数
|
||||
in_progress_requests = Gauge(
|
||||
"http_requests_in_progress",
|
||||
"Number of HTTP requests in progress",
|
||||
["instance"],
|
||||
registry=metrics_registry,
|
||||
)
|
||||
|
||||
|
||||
def push_metrics(grouping_key: Optional[dict] = None) -> None:
    """Push the collected registry to the configured Pushgateway.

    Delivery failures are logged and swallowed so metric publishing can
    never break the caller.

    Args:
        grouping_key: Extra grouping-key labels, merged on top of the
            always-present ``instance`` label.
    """
    try:
        # The instance label is always part of the grouping key; caller-supplied
        # keys may add to or override it.
        grouping = {"instance": INSTANCE_ID, **(grouping_key or {})}

        push_to_gateway(
            PUSHGATEWAY_URL,
            job=JOB_NAME,
            registry=metrics_registry,
            grouping_key=grouping,
        )
        logger.debug(f"成功推送指标到 Pushgateway: {PUSHGATEWAY_URL}")
    except Exception as e:
        logger.error(f"推送指标到 Pushgateway 失败: {e}")
|
||||
|
||||
|
||||
def track_request(method: str, endpoint: str, auto_push: bool = True):
    """Decorator factory: record request metrics and optionally push them.

    Tracks the per-instance in-progress gauge, request counter (status
    "success"/"error") and latency histogram, then pushes the registry to
    the Pushgateway when ``auto_push`` is set. Exceptions always propagate.

    Args:
        method: HTTP method label value.
        endpoint: Endpoint path label value.
        auto_push: Whether to push metrics to the Pushgateway after each call.
    """

    def decorator(func: Callable):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            in_progress_requests.labels(instance=INSTANCE_ID).inc()
            start_time = time.time()
            # Default to "error" so the finally block is well-defined even when
            # a BaseException escapes (e.g. asyncio.CancelledError), which the
            # original "except Exception" clause did not catch — that left
            # `status` unbound and raised UnboundLocalError in finally.
            status = "error"
            try:
                result = await func(*args, **kwargs)
                status = "success"
                return result
            finally:
                elapsed = time.time() - start_time
                request_counter.labels(
                    method=method, endpoint=endpoint, status=status, instance=INSTANCE_ID
                ).inc()
                request_latency.labels(
                    method=method, endpoint=endpoint, instance=INSTANCE_ID
                ).observe(elapsed)
                in_progress_requests.labels(instance=INSTANCE_ID).dec()

                # Push after every request so the gateway sees fresh values.
                if auto_push:
                    push_metrics()

        return wrapper

    return decorator
|
||||
|
||||
|
||||
def track_algorithm_execution(algorithm_name: str, auto_push: bool = True):
    """Decorator factory: record algorithm metrics and optionally push them.

    Records the per-instance execution counter (status "success"/"error")
    and latency histogram, then pushes the registry to the Pushgateway when
    ``auto_push`` is set. Exceptions always propagate.

    Args:
        algorithm_name: Algorithm name used as the metric label value.
        auto_push: Whether to push metrics to the Pushgateway after each call.
    """

    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            # Default to "error" so the finally block is well-defined even when
            # a BaseException escapes (e.g. KeyboardInterrupt), which the
            # original "except Exception" clause did not catch — that left
            # `status` unbound and raised UnboundLocalError in finally.
            status = "error"
            try:
                result = func(*args, **kwargs)
                status = "success"
                return result
            finally:
                elapsed = time.time() - start_time
                algorithm_counter.labels(
                    algorithm=algorithm_name, status=status, instance=INSTANCE_ID
                ).inc()
                algorithm_latency.labels(
                    algorithm=algorithm_name, instance=INSTANCE_ID
                ).observe(elapsed)

                # Push after every execution so the gateway sees fresh values.
                if auto_push:
                    push_metrics()

        return wrapper

    return decorator
|
||||
247
src/functional_scaffold/core/metrics_redis.py
Normal file
247
src/functional_scaffold/core/metrics_redis.py
Normal file
@@ -0,0 +1,247 @@
|
||||
"""基于 Redis 的指标记录模块"""
|
||||
|
||||
from functools import wraps
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
import os
|
||||
import logging
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
import redis
|
||||
REDIS_AVAILABLE = True
|
||||
except ImportError:
|
||||
REDIS_AVAILABLE = False
|
||||
logging.warning("Redis 未安装,指标将无法记录到 Redis")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Redis 配置
|
||||
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
|
||||
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
|
||||
REDIS_DB = int(os.getenv("REDIS_METRICS_DB", "0"))
|
||||
REDIS_PASSWORD = os.getenv("REDIS_PASSWORD", None)
|
||||
INSTANCE_ID = os.getenv("INSTANCE_ID", os.getenv("HOSTNAME", "unknown"))
|
||||
|
||||
# Redis 键前缀
|
||||
METRICS_PREFIX = "metrics:"
|
||||
REQUEST_COUNTER_KEY = f"{METRICS_PREFIX}request_counter"
|
||||
REQUEST_LATENCY_KEY = f"{METRICS_PREFIX}request_latency"
|
||||
ALGORITHM_COUNTER_KEY = f"{METRICS_PREFIX}algorithm_counter"
|
||||
ALGORITHM_LATENCY_KEY = f"{METRICS_PREFIX}algorithm_latency"
|
||||
IN_PROGRESS_KEY = f"{METRICS_PREFIX}in_progress"
|
||||
|
||||
|
||||
class RedisMetricsClient:
    """Client that records metric values into Redis.

    Counters and gauges are stored as hash fields; histogram samples are
    kept in per-label sorted sets plus count/sum hashes.

    NOTE(review): every hash field is named ``f"{key}:{label_key}"`` — i.e.
    the hash key name itself is embedded in the field. Any reader (see the
    exporter module) must strip that prefix before parsing labels — confirm
    the two modules agree on this scheme.
    """

    def __init__(self) -> None:
        # Fail fast if the optional redis dependency is missing.
        if not REDIS_AVAILABLE:
            raise ImportError("需要安装 redis 库: pip install redis")

        self.client = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            db=REDIS_DB,
            password=REDIS_PASSWORD,
            decode_responses=True,
        )
        self.instance_id = INSTANCE_ID

    def increment_counter(self, key: str, labels: dict, value: int = 1) -> None:
        """Increment a counter stored as a Redis hash field.

        Errors are logged and swallowed so metric recording never breaks
        the caller.

        Args:
            key: Metric hash key.
            labels: Label dictionary (instance label is added automatically).
            value: Amount to add.
        """
        try:
            # Hash field is the label combination, prefixed with the hash key
            # itself (see class NOTE above).
            label_key = self._make_label_key(labels)
            full_key = f"{key}:{label_key}"
            self.client.hincrby(key, full_key, value)

            # Record the last-update time as a sibling field; readers skip
            # fields ending in ":timestamp".
            self.client.hset(key, f"{full_key}:timestamp", int(time.time()))
        except Exception as e:
            logger.error(f"Redis 计数器增加失败: {e}")

    def observe_histogram(self, key: str, labels: dict, value: float) -> None:
        """Record one histogram observation.

        Args:
            key: Metric key.
            labels: Label dictionary.
            value: Observed value (e.g. latency in seconds).
        """
        try:
            label_key = self._make_label_key(labels)
            full_key = f"{key}:{label_key}"

            # Raw samples go into a sorted set scored by timestamp so the
            # exporter can compute quantiles/buckets.
            timestamp = time.time()
            self.client.zadd(full_key, {f"{timestamp}:{value}": timestamp})

            # Keep only the most recent hour of samples.
            cutoff = timestamp - 3600
            self.client.zremrangebyscore(full_key, "-inf", cutoff)

            # Also maintain count/sum hashes for cheap aggregate stats.
            # NOTE(review): the hash field here equals the zset key above —
            # readers should use the field directly as the zset key.
            self.client.hincrby(f"{key}:count", full_key, 1)
            self.client.hincrbyfloat(f"{key}:sum", full_key, value)
        except Exception as e:
            logger.error(f"Redis 直方图记录失败: {e}")

    def set_gauge(self, key: str, labels: dict, value: float) -> None:
        """Set a gauge value.

        Args:
            key: Metric hash key.
            labels: Label dictionary.
            value: Value to store.
        """
        try:
            label_key = self._make_label_key(labels)
            full_key = f"{key}:{label_key}"
            self.client.hset(key, full_key, value)
            self.client.hset(key, f"{full_key}:timestamp", int(time.time()))
        except Exception as e:
            logger.error(f"Redis 仪表盘设置失败: {e}")

    def increment_gauge(self, key: str, labels: dict, value: float = 1) -> None:
        """Increment a gauge value by *value* (may be negative)."""
        try:
            label_key = self._make_label_key(labels)
            full_key = f"{key}:{label_key}"
            self.client.hincrbyfloat(key, full_key, value)
        except Exception as e:
            logger.error(f"Redis 仪表盘增加失败: {e}")

    def decrement_gauge(self, key: str, labels: dict, value: float = 1) -> None:
        """Decrement a gauge value (delegates to increment_gauge with -value)."""
        self.increment_gauge(key, labels, -value)

    def _make_label_key(self, labels: dict) -> str:
        """Build a deterministic field-name fragment from a label dict.

        Args:
            labels: Label dictionary.

        Returns:
            str: ``"k1=v1,k2=v2,..."`` with the instance label merged in and
            keys sorted so equal label sets always map to the same field.
        """
        # Always tag with this process's instance id.
        labels_with_instance = {**labels, "instance": self.instance_id}
        # Sort by key so the field name is order-independent.
        sorted_labels = sorted(labels_with_instance.items())
        return ",".join(f"{k}={v}" for k, v in sorted_labels)

    def get_metrics_summary(self) -> dict:
        """Return raw counter/gauge hashes (debugging aid).

        Returns:
            dict: Raw hash contents for request counter, algorithm counter
            and in-progress gauge; empty dict on any Redis error.
        """
        try:
            return {
                "request_counter": self.client.hgetall(REQUEST_COUNTER_KEY),
                "algorithm_counter": self.client.hgetall(ALGORITHM_COUNTER_KEY),
                "in_progress": self.client.hgetall(IN_PROGRESS_KEY),
            }
        except Exception as e:
            logger.error(f"获取指标摘要失败: {e}")
            return {}
|
||||
|
||||
|
||||
# Module-level singleton, created lazily by get_redis_client().
_redis_client: Optional[RedisMetricsClient] = None


def get_redis_client() -> RedisMetricsClient:
    """Return the shared RedisMetricsClient, creating it on first use.

    Returns:
        RedisMetricsClient: The process-wide client instance.
    """
    global _redis_client
    client = _redis_client
    if client is None:
        client = RedisMetricsClient()
        _redis_client = client
    return client
|
||||
|
||||
|
||||
def track_request(method: str, endpoint: str):
    """Decorator factory: record request metrics in Redis around an async handler.

    Tracks the in-progress gauge for the call's duration, then in
    ``finally`` records the success/error counter and the latency
    histogram via the shared RedisMetricsClient. Exceptions propagate.

    Args:
        method: HTTP method label value.
        endpoint: Endpoint path label value.
    """

    def decorator(func: Callable):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            client = get_redis_client()
            labels = {"method": method, "endpoint": endpoint}

            client.increment_gauge(IN_PROGRESS_KEY, labels)
            start_time = time.time()
            # Default to "error" so the finally block is well-defined even when
            # a BaseException escapes (e.g. asyncio.CancelledError), which the
            # original "except Exception" clause did not catch — that left
            # `status` unbound and raised UnboundLocalError in finally.
            status = "error"
            try:
                result = await func(*args, **kwargs)
                status = "success"
                return result
            finally:
                elapsed = time.time() - start_time

                counter_labels = {**labels, "status": status}
                client.increment_counter(REQUEST_COUNTER_KEY, counter_labels)
                client.observe_histogram(REQUEST_LATENCY_KEY, labels, elapsed)
                client.decrement_gauge(IN_PROGRESS_KEY, labels)

        return wrapper

    return decorator
|
||||
|
||||
|
||||
def track_algorithm_execution(algorithm_name: str):
    """Decorator factory: record algorithm metrics in Redis around a sync call.

    Records the success/error counter and latency histogram in ``finally``
    via the shared RedisMetricsClient. Exceptions propagate.

    Args:
        algorithm_name: Algorithm name used as the metric label value.
    """

    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            client = get_redis_client()
            labels = {"algorithm": algorithm_name}
            start_time = time.time()
            # Default to "error" so the finally block is well-defined even when
            # a BaseException escapes (e.g. KeyboardInterrupt), which the
            # original "except Exception" clause did not catch — that left
            # `status` unbound and raised UnboundLocalError in finally.
            status = "error"
            try:
                result = func(*args, **kwargs)
                status = "success"
                return result
            finally:
                elapsed = time.time() - start_time

                counter_labels = {**labels, "status": status}
                client.increment_counter(ALGORITHM_COUNTER_KEY, counter_labels)
                client.observe_histogram(ALGORITHM_LATENCY_KEY, labels, elapsed)

        return wrapper

    return decorator
|
||||
247
src/functional_scaffold/core/metrics_redis_exporter.py
Normal file
247
src/functional_scaffold/core/metrics_redis_exporter.py
Normal file
@@ -0,0 +1,247 @@
|
||||
"""Redis 指标 Exporter - 将 Redis 中的指标转换为 Prometheus 格式"""
|
||||
|
||||
from prometheus_client import Counter, Histogram, Gauge, CollectorRegistry, generate_latest
|
||||
from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily, HistogramMetricFamily
|
||||
import redis
|
||||
import os
|
||||
import logging
|
||||
from typing import Dict, List, Tuple
|
||||
import time
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Redis 配置
|
||||
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
|
||||
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
|
||||
REDIS_DB = int(os.getenv("REDIS_METRICS_DB", "0"))
|
||||
REDIS_PASSWORD = os.getenv("REDIS_PASSWORD", None)
|
||||
|
||||
# Redis 键前缀
|
||||
METRICS_PREFIX = "metrics:"
|
||||
REQUEST_COUNTER_KEY = f"{METRICS_PREFIX}request_counter"
|
||||
REQUEST_LATENCY_KEY = f"{METRICS_PREFIX}request_latency"
|
||||
ALGORITHM_COUNTER_KEY = f"{METRICS_PREFIX}algorithm_counter"
|
||||
ALGORITHM_LATENCY_KEY = f"{METRICS_PREFIX}algorithm_latency"
|
||||
IN_PROGRESS_KEY = f"{METRICS_PREFIX}in_progress"
|
||||
|
||||
|
||||
class RedisMetricsCollector:
    """Collect metric data written by the Redis metrics client as Prometheus families.

    The writer names every hash field ``f"{redis_key}:{label_key}"`` — the
    hash key itself is embedded in the field. All readers here therefore
    strip that prefix before parsing labels (the original parsed the raw
    field, leaking the prefix into the first label name), and histogram
    collection uses the field directly as the sorted-set key (the original
    prepended the prefix a second time, so buckets were always empty).
    """

    def __init__(self):
        self.redis_client = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            db=REDIS_DB,
            password=REDIS_PASSWORD,
            decode_responses=True,
        )

    def collect(self):
        """Yield all metric families read from Redis."""
        try:
            # Counters
            yield from self._collect_counter(
                REQUEST_COUNTER_KEY,
                "http_requests_total",
                "Total HTTP requests",
            )
            yield from self._collect_counter(
                ALGORITHM_COUNTER_KEY,
                "algorithm_executions_total",
                "Total algorithm executions",
            )

            # Histograms
            yield from self._collect_histogram(
                REQUEST_LATENCY_KEY,
                "http_request_duration_seconds",
                "HTTP request latency",
            )
            yield from self._collect_histogram(
                ALGORITHM_LATENCY_KEY,
                "algorithm_execution_duration_seconds",
                "Algorithm execution latency",
            )

            # Gauges
            yield from self._collect_gauge(
                IN_PROGRESS_KEY,
                "http_requests_in_progress",
                "Number of HTTP requests in progress",
            )

        except Exception as e:
            logger.error(f"收集指标失败: {e}")

    def _strip_prefix(self, field: str, redis_key: str) -> str:
        """Remove the embedded ``"<redis_key>:"`` prefix from a hash field name."""
        prefix = f"{redis_key}:"
        return field[len(prefix):] if field.startswith(prefix) else field

    def _collect_counter(self, redis_key: str, metric_name: str, description: str):
        """Collect one counter metric family from a Redis hash."""
        try:
            data = self.redis_client.hgetall(redis_key)
            if not data:
                return

            # Parse labels and values, skipping the ":timestamp" bookkeeping fields.
            metrics_data = []
            for field, value in data.items():
                if field.endswith(":timestamp"):
                    continue
                labels = self._parse_labels(self._strip_prefix(field, redis_key))
                metrics_data.append((labels, float(value)))

            if metrics_data:
                # Label names are taken from the first sample; the writer sorts
                # label keys, so equal label sets produce a consistent order.
                label_names = list(metrics_data[0][0].keys())
                counter = CounterMetricFamily(metric_name, description, labels=label_names)
                for labels, value in metrics_data:
                    counter.add_metric(list(labels.values()), value)
                yield counter

        except Exception as e:
            logger.error(f"收集计数器 {redis_key} 失败: {e}")

    def _collect_histogram(self, redis_key: str, metric_name: str, description: str):
        """Collect one histogram metric family from count/sum hashes plus sample zsets."""
        try:
            count_data = self.redis_client.hgetall(f"{redis_key}:count")
            sum_data = self.redis_client.hgetall(f"{redis_key}:sum")

            if not count_data:
                return

            metrics_data = []
            for field in count_data.keys():
                labels = self._parse_labels(self._strip_prefix(field, redis_key))
                count = float(count_data.get(field, 0))
                sum_value = float(sum_data.get(field, 0))

                # The hash field *is* the sorted-set key the writer used —
                # do not prepend the prefix again.
                latencies = self._get_latencies(field)
                buckets = self._calculate_buckets(latencies)

                metrics_data.append((labels, count, sum_value, buckets))

            if metrics_data:
                label_names = list(metrics_data[0][0].keys())
                histogram = HistogramMetricFamily(
                    metric_name, description, labels=label_names
                )
                for labels, count, sum_value, buckets in metrics_data:
                    histogram.add_metric(
                        list(labels.values()),
                        buckets=buckets,
                        sum_value=sum_value,
                    )
                yield histogram

        except Exception as e:
            logger.error(f"收集直方图 {redis_key} 失败: {e}")

    def _collect_gauge(self, redis_key: str, metric_name: str, description: str):
        """Collect one gauge metric family from a Redis hash."""
        try:
            data = self.redis_client.hgetall(redis_key)
            if not data:
                return

            metrics_data = []
            for field, value in data.items():
                if field.endswith(":timestamp"):
                    continue
                labels = self._parse_labels(self._strip_prefix(field, redis_key))
                metrics_data.append((labels, float(value)))

            if metrics_data:
                label_names = list(metrics_data[0][0].keys())
                gauge = GaugeMetricFamily(metric_name, description, labels=label_names)
                for labels, value in metrics_data:
                    gauge.add_metric(list(labels.values()), value)
                yield gauge

        except Exception as e:
            logger.error(f"收集仪表盘 {redis_key} 失败: {e}")

    def _parse_labels(self, label_key: str) -> Dict[str, str]:
        """Parse a label-key string into a dictionary.

        Args:
            label_key: e.g. ``"endpoint=/invoke,instance=host1,method=GET"``
                (the ``"<redis_key>:"`` prefix must already be stripped).

        Returns:
            Dict[str, str]: Label name → value mapping.
        """
        labels = {}
        for pair in label_key.split(","):
            if "=" in pair:
                key, value = pair.split("=", 1)
                labels[key] = value
        return labels

    def _get_latencies(self, key: str) -> List[float]:
        """Read raw latency samples from a sorted set.

        Members are stored as ``"timestamp:value"``; only the value part is
        returned, sorted ascending. Returns [] on any Redis error.
        """
        try:
            data = self.redis_client.zrange(key, 0, -1)
            latencies = []
            for item in data:
                if ":" in item:
                    _, value = item.rsplit(":", 1)
                    latencies.append(float(value))
            return sorted(latencies)
        except Exception as e:
            logger.error(f"获取延迟数据失败: {e}")
            return []

    def _calculate_buckets(
        self, latencies: List[float]
    ) -> List[Tuple[str, float]]:
        """Build cumulative Prometheus histogram buckets from raw samples.

        Args:
            latencies: Latency samples in seconds.

        Returns:
            List[Tuple[str, float]]: ``[(upper_bound, cumulative_count), ...]``
            ending with the "+Inf" bucket.
        """
        if not latencies:
            return [("+Inf", 0)]

        # Bucket boundaries in seconds (standard Prometheus defaults).
        buckets_boundaries = [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]
        buckets = []

        for boundary in buckets_boundaries:
            count = sum(1 for lat in latencies if lat <= boundary)
            buckets.append((str(boundary), count))

        # The "+Inf" bucket always counts every sample.
        buckets.append(("+Inf", len(latencies)))

        return buckets
|
||||
|
||||
|
||||
# Shared collector instance used by get_metrics().
redis_collector = RedisMetricsCollector()


def get_metrics() -> bytes:
    """Render the Redis-backed metrics in Prometheus exposition format.

    A fresh registry is built per call so only this collector's output is
    included.

    Returns:
        bytes: Prometheus text-format payload.
    """
    registry = CollectorRegistry()
    registry.register(redis_collector)
    payload = generate_latest(registry)
    return payload


if __name__ == "__main__":
    # Manual smoke test: dump the current metrics to stdout.
    print(get_metrics().decode("utf-8"))
|
||||
605
src/functional_scaffold/core/metrics_unified.py
Normal file
605
src/functional_scaffold/core/metrics_unified.py
Normal file
@@ -0,0 +1,605 @@
|
||||
"""统一指标管理模块
|
||||
|
||||
基于 Redis 的指标收集方案,支持多实例部署和 YAML 配置。
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import socket
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
from functools import wraps
|
||||
import time
|
||||
|
||||
import yaml
|
||||
import redis
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MetricsManager:
|
||||
"""统一指标管理器
|
||||
|
||||
支持从 YAML 配置文件加载指标定义,使用 Redis 存储指标数据,
|
||||
并导出 Prometheus 格式的指标。
|
||||
"""
|
||||
|
||||
def __init__(self, config_path: Optional[str] = None):
|
||||
"""初始化指标管理器
|
||||
|
||||
Args:
|
||||
config_path: 配置文件路径,默认从 settings 获取
|
||||
"""
|
||||
from ..config import settings
|
||||
|
||||
self.config_path = config_path or settings.metrics_config_path
|
||||
self.instance_id = settings.metrics_instance_id or socket.gethostname()
|
||||
self.config: Dict[str, Any] = {}
|
||||
self.metrics_definitions: Dict[str, Dict[str, Any]] = {}
|
||||
self._redis_client: Optional[redis.Redis] = None
|
||||
self._redis_available = False
|
||||
|
||||
# 加载配置
|
||||
self._load_config()
|
||||
# 初始化 Redis 连接
|
||||
self._init_redis()
|
||||
# 注册指标定义
|
||||
self._register_metrics()
|
||||
|
||||
def _load_config(self) -> None:
|
||||
"""加载 YAML 配置文件"""
|
||||
# 尝试多个路径
|
||||
paths_to_try = [
|
||||
Path(self.config_path),
|
||||
Path.cwd() / self.config_path,
|
||||
Path(__file__).parent.parent.parent.parent / self.config_path,
|
||||
]
|
||||
|
||||
for path in paths_to_try:
|
||||
if path.exists():
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
# 处理环境变量替换 ${VAR:default}
|
||||
content = self._substitute_env_vars(content)
|
||||
self.config = yaml.safe_load(content) or {}
|
||||
logger.info(f"已加载指标配置文件: {path}")
|
||||
return
|
||||
|
||||
logger.warning(f"未找到指标配置文件: {self.config_path},使用默认配置")
|
||||
self.config = self._get_default_config()
|
||||
|
||||
def _substitute_env_vars(self, content: str) -> str:
|
||||
"""替换配置中的环境变量
|
||||
|
||||
支持格式: ${VAR_NAME:default_value}
|
||||
"""
|
||||
pattern = r"\$\{([^}:]+)(?::([^}]*))?\}"
|
||||
|
||||
def replacer(match):
|
||||
var_name = match.group(1)
|
||||
default_value = match.group(2) or ""
|
||||
return os.environ.get(var_name, default_value)
|
||||
|
||||
return re.sub(pattern, replacer, content)
|
||||
|
||||
def _get_default_config(self) -> Dict[str, Any]:
|
||||
"""获取默认配置"""
|
||||
return {
|
||||
"redis": {
|
||||
"host": "localhost",
|
||||
"port": 6379,
|
||||
"db": 0,
|
||||
"password": "",
|
||||
},
|
||||
"global": {
|
||||
"prefix": "functional_scaffold",
|
||||
"instance_label": True,
|
||||
},
|
||||
"builtin_metrics": {
|
||||
"http_requests": {
|
||||
"enabled": True,
|
||||
"name": "http_requests_total",
|
||||
"type": "counter",
|
||||
"description": "HTTP 请求总数",
|
||||
"labels": ["method", "endpoint", "status"],
|
||||
},
|
||||
"http_latency": {
|
||||
"enabled": True,
|
||||
"name": "http_request_duration_seconds",
|
||||
"type": "histogram",
|
||||
"description": "HTTP 请求延迟",
|
||||
"labels": ["method", "endpoint"],
|
||||
"buckets": [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
|
||||
},
|
||||
"http_in_progress": {
|
||||
"enabled": True,
|
||||
"name": "http_requests_in_progress",
|
||||
"type": "gauge",
|
||||
"description": "当前进行中的 HTTP 请求数",
|
||||
"labels": [],
|
||||
},
|
||||
"algorithm_executions": {
|
||||
"enabled": True,
|
||||
"name": "algorithm_executions_total",
|
||||
"type": "counter",
|
||||
"description": "算法执行总数",
|
||||
"labels": ["algorithm", "status"],
|
||||
},
|
||||
"algorithm_latency": {
|
||||
"enabled": True,
|
||||
"name": "algorithm_execution_duration_seconds",
|
||||
"type": "histogram",
|
||||
"description": "算法执行延迟",
|
||||
"labels": ["algorithm"],
|
||||
"buckets": [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 60],
|
||||
},
|
||||
},
|
||||
"custom_metrics": {},
|
||||
}
|
||||
|
||||
def _init_redis(self) -> None:
|
||||
"""初始化 Redis 连接"""
|
||||
from ..config import settings
|
||||
|
||||
redis_config = self.config.get("redis", {})
|
||||
host = redis_config.get("host") or settings.redis_host
|
||||
port = int(redis_config.get("port") or settings.redis_port)
|
||||
db = int(redis_config.get("db") or settings.redis_db)
|
||||
password = redis_config.get("password") or settings.redis_password
|
||||
|
||||
try:
|
||||
self._redis_client = redis.Redis(
|
||||
host=host,
|
||||
port=port,
|
||||
db=db,
|
||||
password=password if password else None,
|
||||
decode_responses=True,
|
||||
socket_connect_timeout=5,
|
||||
socket_timeout=5,
|
||||
)
|
||||
# 测试连接
|
||||
self._redis_client.ping()
|
||||
self._redis_available = True
|
||||
logger.info(f"Redis 连接成功: {host}:{port}/{db}")
|
||||
except redis.ConnectionError as e:
|
||||
logger.warning(f"Redis 连接失败: {e},指标将不会被收集")
|
||||
self._redis_available = False
|
||||
except Exception as e:
|
||||
logger.warning(f"Redis 初始化异常: {e},指标将不会被收集")
|
||||
self._redis_available = False
|
||||
|
||||
def _register_metrics(self) -> None:
|
||||
"""注册所有指标定义"""
|
||||
# 注册内置指标
|
||||
builtin = self.config.get("builtin_metrics", {})
|
||||
for key, metric_def in builtin.items():
|
||||
if metric_def.get("enabled", True):
|
||||
name = metric_def.get("name", key)
|
||||
self.metrics_definitions[name] = {
|
||||
"type": metric_def.get("type", "counter"),
|
||||
"description": metric_def.get("description", ""),
|
||||
"labels": metric_def.get("labels", []),
|
||||
"buckets": metric_def.get("buckets", []),
|
||||
}
|
||||
|
||||
# 注册自定义指标
|
||||
custom = self.config.get("custom_metrics", {})
|
||||
for key, metric_def in custom.items():
|
||||
name = metric_def.get("name", key)
|
||||
self.metrics_definitions[name] = {
|
||||
"type": metric_def.get("type", "counter"),
|
||||
"description": metric_def.get("description", ""),
|
||||
"labels": metric_def.get("labels", []),
|
||||
"buckets": metric_def.get("buckets", []),
|
||||
}
|
||||
|
||||
logger.info(f"已注册 {len(self.metrics_definitions)} 个指标定义")
|
||||
|
||||
def _labels_to_key(self, labels: Optional[Dict[str, str]]) -> str:
|
||||
"""将标签字典转换为 Redis key 的一部分"""
|
||||
if not labels:
|
||||
return ""
|
||||
sorted_items = sorted(labels.items())
|
||||
return ",".join(f"{k}={v}" for k, v in sorted_items)
|
||||
|
||||
def _key_to_prometheus_labels(self, key: str) -> str:
|
||||
"""将 Redis key 格式转换为 Prometheus 标签格式(带引号)
|
||||
|
||||
输入: endpoint=/healthz,method=GET,status=success
|
||||
输出: endpoint="/healthz",method="GET",status="success"
|
||||
"""
|
||||
if not key or key == "_default_":
|
||||
return ""
|
||||
parts = []
|
||||
for pair in key.split(","):
|
||||
if "=" in pair:
|
||||
k, v = pair.split("=", 1)
|
||||
# 转义值中的特殊字符
|
||||
v = v.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
|
||||
parts.append(f'{k}="{v}"')
|
||||
return ",".join(parts)
|
||||
|
||||
def _validate_metric(self, name: str, expected_type: str) -> bool:
|
||||
"""验证指标是否已定义且类型正确"""
|
||||
if name not in self.metrics_definitions:
|
||||
logger.warning(f"指标 '{name}' 未在配置中定义")
|
||||
return False
|
||||
if self.metrics_definitions[name]["type"] != expected_type:
|
||||
logger.warning(
|
||||
f"指标 '{name}' 类型不匹配: 期望 {expected_type}, "
|
||||
f"实际 {self.metrics_definitions[name]['type']}"
|
||||
)
|
||||
return False
|
||||
return True
|
||||
|
||||
# === 简单 API(业务代码使用)===
|
||||
|
||||
def incr(self, name: str, labels: Optional[Dict[str, str]] = None, value: int = 1) -> None:
|
||||
"""增加计数器
|
||||
|
||||
Args:
|
||||
name: 指标名称
|
||||
labels: 标签字典
|
||||
value: 增加的值,默认为 1
|
||||
"""
|
||||
if not self._redis_available:
|
||||
return
|
||||
|
||||
if not self._validate_metric(name, "counter"):
|
||||
return
|
||||
|
||||
try:
|
||||
key = f"metrics:counter:{name}"
|
||||
field = self._labels_to_key(labels) or "_default_"
|
||||
self._redis_client.hincrbyfloat(key, field, value)
|
||||
except Exception as e:
|
||||
logger.error(f"增加计数器失败: {e}")
|
||||
|
||||
def set(self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
    """Set a gauge metric to an absolute value.

    Args:
        name: Metric name; must be declared as a gauge.
        labels: Optional label dict.
        value: Value to store.

    Errors are logged and swallowed so metrics never break callers.
    """
    if not (self._redis_available and self._validate_metric(name, "gauge")):
        return
    try:
        field = self._labels_to_key(labels) or "_default_"
        self._redis_client.hset(f"metrics:gauge:{name}", field, value)
    except Exception as e:
        logger.error(f"设置仪表盘失败: {e}")
|
||||
|
||||
def gauge_incr(
    self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 1
) -> None:
    """Increase a gauge metric by ``value``.

    Args:
        name: Metric name; must be declared as a gauge.
        labels: Optional label dict.
        value: Amount to add, default 1.

    Errors are logged and swallowed so metrics never break callers.
    """
    if not (self._redis_available and self._validate_metric(name, "gauge")):
        return
    try:
        field = self._labels_to_key(labels) or "_default_"
        self._redis_client.hincrbyfloat(f"metrics:gauge:{name}", field, value)
    except Exception as e:
        logger.error(f"增加仪表盘失败: {e}")
|
||||
|
||||
def gauge_decr(
    self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 1
) -> None:
    """Decrease a gauge metric by ``value``.

    Delegates to gauge_incr with a negated amount so both directions
    share the same validation and error handling.
    """
    self.gauge_incr(name=name, labels=labels, value=-value)
|
||||
|
||||
def observe(self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
    """Record one histogram observation.

    Args:
        name: Metric name; must be declared as a histogram.
        labels: Optional label dict.
        value: Observed value.

    All updates (count, sum, matching buckets, +Inf) are batched in a
    single Redis pipeline. Errors are logged and swallowed.
    """
    if not (self._redis_available and self._validate_metric(name, "histogram")):
        return
    try:
        field = self._labels_to_key(labels) or "_default_"
        prefix = f"metrics:histogram:{name}"

        pipe = self._redis_client.pipeline()
        pipe.hincrbyfloat(f"{prefix}:count", field, 1)
        pipe.hincrbyfloat(f"{prefix}:sum", field, value)

        # Every bucket whose upper bound covers the value gets a hit;
        # the implicit +Inf bucket covers everything.
        for upper in self.metrics_definitions[name].get("buckets", []):
            if value <= upper:
                pipe.hincrbyfloat(f"{prefix}:bucket:{upper}", field, 1)
        pipe.hincrbyfloat(f"{prefix}:bucket:+Inf", field, 1)

        pipe.execute()
    except Exception as e:
        logger.error(f"记录直方图失败: {e}")
|
||||
|
||||
# === 导出方法 ===
|
||||
|
||||
def export(self) -> str:
    """Render all declared metrics in the Prometheus text exposition format.

    Returns:
        The metrics payload as a string; a comment-only payload when
        Redis is unavailable or the export fails mid-way.
    """
    if not self._redis_available:
        return "# Redis 不可用,无法导出指标\n"

    output: List[str] = []
    try:
        for name, definition in self.metrics_definitions.items():
            metric_type = definition["type"]
            # HELP/TYPE headers precede every metric family.
            output.append(f"# HELP {name} {definition['description']}")
            output.append(f"# TYPE {name} {metric_type}")

            if metric_type == "counter":
                output += self._export_counter(name)
            elif metric_type == "gauge":
                output += self._export_gauge(name)
            elif metric_type == "histogram":
                output += self._export_histogram(name, definition)

            output.append("")  # blank line between metric families
    except Exception as e:
        logger.error(f"导出指标失败: {e}")
        return f"# 导出指标失败: {e}\n"

    return "\n".join(output)
|
||||
|
||||
def _export_counter(self, name: str) -> List[str]:
    """Render one counter family as Prometheus sample lines."""
    samples: List[str] = []
    stored = self._redis_client.hgetall(f"metrics:counter:{name}")
    for field, value in stored.items():
        if field == "_default_":
            samples.append(f"{name} {value}")
        else:
            label_text = self._key_to_prometheus_labels(field)
            samples.append(f"{name}{{{label_text}}} {value}")
    return samples
|
||||
|
||||
def _export_gauge(self, name: str) -> List[str]:
    """Render one gauge family as Prometheus sample lines."""
    samples: List[str] = []
    stored = self._redis_client.hgetall(f"metrics:gauge:{name}")
    for field, value in stored.items():
        if field == "_default_":
            samples.append(f"{name} {value}")
        else:
            label_text = self._key_to_prometheus_labels(field)
            samples.append(f"{name}{{{label_text}}} {value}")
    return samples
|
||||
|
||||
def _export_histogram(self, name: str, definition: Dict[str, Any]) -> List[str]:
    """Render one histogram family as Prometheus sample lines.

    Fetches each bucket hash once with HGETALL instead of issuing one
    HGET per (label, bucket) pair, so the number of Redis round-trips is
    proportional to the bucket count, not labels x buckets.

    Args:
        name: Histogram metric name.
        definition: Metric definition dict (provides "buckets").

    Returns:
        Sample lines: per-bucket counts, the +Inf bucket, _count and _sum
        for every label combination seen in the count hash.
    """
    lines: List[str] = []
    buckets = definition.get("buckets", [])

    # 获取所有标签组合 (all label combinations come from the count hash)
    count_data = self._redis_client.hgetall(f"metrics:histogram:{name}:count")
    sum_data = self._redis_client.hgetall(f"metrics:histogram:{name}:sum")
    # One HGETALL per bucket, shared across all label combinations.
    bucket_data = {
        bucket: self._redis_client.hgetall(f"metrics:histogram:{name}:bucket:{bucket}")
        for bucket in buckets
    }
    inf_data = self._redis_client.hgetall(f"metrics:histogram:{name}:bucket:+Inf")

    for label_key in count_data.keys():
        prom_labels = self._key_to_prometheus_labels(label_key)

        # Per-bucket samples (missing/empty field counts as "0").
        for bucket in buckets:
            bucket_value = bucket_data[bucket].get(label_key) or "0"
            if label_key == "_default_":
                lines.append(f'{name}_bucket{{le="{bucket}"}} {bucket_value}')
            else:
                lines.append(f'{name}_bucket{{{prom_labels},le="{bucket}"}} {bucket_value}')

        # +Inf bucket always present.
        inf_value = inf_data.get(label_key) or "0"
        if label_key == "_default_":
            lines.append(f'{name}_bucket{{le="+Inf"}} {inf_value}')
        else:
            lines.append(f'{name}_bucket{{{prom_labels},le="+Inf"}} {inf_value}')

        # count and sum samples.
        count_value = count_data.get(label_key, "0")
        sum_value = sum_data.get(label_key, "0")
        if label_key == "_default_":
            lines.append(f"{name}_count {count_value}")
            lines.append(f"{name}_sum {sum_value}")
        else:
            lines.append(f"{name}_count{{{prom_labels}}} {count_value}")
            lines.append(f"{name}_sum{{{prom_labels}}} {sum_value}")

    return lines
|
||||
|
||||
def is_available(self) -> bool:
    """Report whether the Redis metrics backend is usable."""
    available = self._redis_available
    return available
|
||||
|
||||
def reset(self) -> None:
    """Delete every metric key (intended for tests).

    Uses SCAN (scan_iter) instead of KEYS so a large keyspace does not
    block the Redis server while matching.
    """
    if not self._redis_available:
        return

    try:
        # Collect all metric-related keys without a blocking KEYS call.
        keys = list(self._redis_client.scan_iter(match="metrics:*"))
        if keys:
            self._redis_client.delete(*keys)
        logger.info("已重置所有指标")
    except Exception as e:
        logger.error(f"重置指标失败: {e}")
|
||||
|
||||
|
||||
# 全局单例
|
||||
_manager: Optional[MetricsManager] = None
|
||||
|
||||
|
||||
def get_metrics_manager() -> MetricsManager:
    """Return the process-wide MetricsManager, creating it on first use."""
    global _manager
    if _manager is not None:
        return _manager
    _manager = MetricsManager()
    return _manager
|
||||
|
||||
|
||||
def reset_metrics_manager() -> None:
|
||||
"""重置指标管理器单例(主要用于测试)"""
|
||||
global _manager
|
||||
_manager = None
|
||||
|
||||
|
||||
# === 便捷函数(业务代码直接调用)===
|
||||
|
||||
|
||||
def incr(name: str, labels: Optional[Dict[str, str]] = None, value: int = 1) -> None:
    """Module-level shortcut for ``MetricsManager.incr``.

    Args:
        name: Metric name.
        labels: Optional label dict.
        value: Amount to add, default 1.
    """
    manager = get_metrics_manager()
    manager.incr(name, labels, value)
|
||||
|
||||
|
||||
def set(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
    """Module-level shortcut for ``MetricsManager.set``.

    NOTE: intentionally shadows the ``set`` builtin inside this module to
    mirror the manager's API; callers import it from this module by name.

    Args:
        name: Metric name.
        labels: Optional label dict.
        value: Value to store.
    """
    manager = get_metrics_manager()
    manager.set(name, labels, value)
|
||||
|
||||
|
||||
def gauge_incr(name: str, labels: Optional[Dict[str, str]] = None, value: float = 1) -> None:
    """Module-level shortcut for ``MetricsManager.gauge_incr``.

    Args:
        name: Metric name.
        labels: Optional label dict.
        value: Amount to add, default 1.
    """
    manager = get_metrics_manager()
    manager.gauge_incr(name, labels, value)
|
||||
|
||||
|
||||
def gauge_decr(name: str, labels: Optional[Dict[str, str]] = None, value: float = 1) -> None:
    """Module-level shortcut for ``MetricsManager.gauge_decr``.

    Args:
        name: Metric name.
        labels: Optional label dict.
        value: Amount to subtract, default 1.
    """
    manager = get_metrics_manager()
    manager.gauge_decr(name, labels, value)
|
||||
|
||||
|
||||
def observe(name: str, labels: Optional[Dict[str, str]] = None, value: float = 0) -> None:
    """Module-level shortcut for ``MetricsManager.observe``.

    Args:
        name: Metric name.
        labels: Optional label dict.
        value: Observed value.
    """
    manager = get_metrics_manager()
    manager.observe(name, labels, value)
|
||||
|
||||
|
||||
def export() -> str:
    """Module-level shortcut for ``MetricsManager.export``.

    Returns:
        Prometheus text-format metrics string.
    """
    manager = get_metrics_manager()
    return manager.export()
|
||||
|
||||
|
||||
def is_available() -> bool:
    """Module-level shortcut for ``MetricsManager.is_available``."""
    manager = get_metrics_manager()
    return manager.is_available()
|
||||
|
||||
|
||||
# === 装饰器(兼容旧 API)===
|
||||
|
||||
|
||||
def track_algorithm_execution(algorithm_name: str):
    """Decorator: record execution count and latency for an algorithm.

    Emits ``algorithm_executions_total`` (labelled with success/error)
    and ``algorithm_execution_duration_seconds`` for every call, whether
    the wrapped function returns or raises.

    Args:
        algorithm_name: Label value identifying the algorithm.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            status = "success"

            try:
                return func(*args, **kwargs)
            except Exception:
                status = "error"
                # Bare raise preserves the original traceback; `raise e`
                # would add a redundant frame.
                raise
            finally:
                # Metrics are recorded on both success and failure paths.
                elapsed = time.time() - start_time
                incr("algorithm_executions_total", {"algorithm": algorithm_name, "status": status})
                observe(
                    "algorithm_execution_duration_seconds",
                    {"algorithm": algorithm_name},
                    elapsed,
                )

        return wrapper

    return decorator
|
||||
39
src/functional_scaffold/core/tracing.py
Normal file
39
src/functional_scaffold/core/tracing.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""分布式追踪模块"""
|
||||
|
||||
import uuid
|
||||
from contextvars import ContextVar
|
||||
from typing import Optional
|
||||
|
||||
# 使用 ContextVar 存储请求ID,支持异步上下文
|
||||
request_id_var: ContextVar[Optional[str]] = ContextVar("request_id", default=None)
|
||||
|
||||
|
||||
def generate_request_id() -> str:
    """Create a new globally-unique request ID (UUID4 in canonical form)."""
    return f"{uuid.uuid4()}"
|
||||
|
||||
|
||||
def get_request_id() -> Optional[str]:
    """Return the request ID bound to the current async context, if any."""
    current = request_id_var.get()
    return current
|
||||
|
||||
|
||||
def set_request_id(request_id: str) -> None:
    """Bind ``request_id`` to the current async context for log correlation."""
    request_id_var.set(request_id)
|
||||
|
||||
|
||||
class TracingContext:
    """Context manager that binds a request ID for the duration of a block.

    A falsy/missing request_id is replaced with a freshly generated one.
    On exit the previous ContextVar value is restored via the stored
    token, so nested tracing contexts unwind correctly.
    """

    def __init__(self, request_id: Optional[str] = None):
        # Falsy values (None, "") fall back to a generated ID.
        self.request_id = request_id or generate_request_id()
        self.token = None

    def __enter__(self):
        self.token = request_id_var.set(self.request_id)
        return self.request_id

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.token is not None:
            request_id_var.reset(self.token)
|
||||
203
src/functional_scaffold/main.py
Normal file
203
src/functional_scaffold/main.py
Normal file
@@ -0,0 +1,203 @@
|
||||
"""FastAPI 应用入口"""
|
||||
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import Response
|
||||
import logging
|
||||
import time
|
||||
|
||||
from .api import router
|
||||
from .config import settings
|
||||
from .core.logging import setup_logging
|
||||
from .core.tracing import generate_request_id, set_request_id, get_request_id
|
||||
from .core.metrics_unified import (
|
||||
get_metrics_manager,
|
||||
incr,
|
||||
observe,
|
||||
gauge_incr,
|
||||
gauge_decr,
|
||||
export,
|
||||
)
|
||||
from .core.job_manager import get_job_manager, shutdown_job_manager
|
||||
|
||||
# Configure application-wide logging before anything else emits log lines.
setup_logging(
    level=settings.log_level,
    format_type=settings.log_format,
    file_path=settings.log_file_path if settings.log_file_enabled else None,
)
logger = logging.getLogger(__name__)

# Create the FastAPI application with the standard docs endpoints.
app = FastAPI(
    title=settings.app_name,
    description="算法工程化 Serverless 脚手架 - 提供标准化的算法服务接口",
    version=settings.app_version,
    docs_url="/docs",
    redoc_url="/redoc",
    openapi_url="/openapi.json",
)

# CORS middleware.
# NOTE(review): wildcard allow_origins together with allow_credentials=True
# is rejected by browsers per the CORS spec — confirm whether credentialed
# cross-origin requests are actually required here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
|
||||
# 请求日志中间件
|
||||
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log every HTTP request/response and bind a request ID to the context.

    The ID comes from the incoming ``x-request-id`` header when present,
    otherwise a new one is generated. It is also echoed back on the
    response so clients and log aggregators can correlate entries.
    """
    # Take the caller-supplied correlation ID, or mint a fresh one.
    request_id = request.headers.get("x-request-id") or generate_request_id()
    set_request_id(request_id)

    logger.info(f"Request: {request.method} {request.url.path}")
    response = await call_next(request)
    logger.info(f"Response: {response.status_code}")
    # Propagate the correlation ID to the client.
    response.headers["X-Request-ID"] = request_id
    return response
|
||||
|
||||
|
||||
def normalize_path(path: str) -> str:
    """Collapse path parameters into a template so metric labels stay bounded.

    Args:
        path: Raw request path.

    Returns:
        The templated path.

    Examples:
        /jobs/a1b2c3d4e5f6 -> /jobs/{job_id}
        /invoke -> /invoke
    """
    # Anything under /jobs/ with a non-empty suffix is a job lookup.
    prefix = "/jobs/"
    if path.startswith(prefix) and len(path) > len(prefix):
        return "/jobs/{job_id}"
    return path
|
||||
|
||||
|
||||
# 指标跟踪中间件
|
||||
@app.middleware("http")
async def track_metrics(request: Request, call_next):
    """Record request count, latency and an in-flight gauge per HTTP call.

    Success/error status is derived from the HTTP status code (>= 400 is
    error) or from an exception escaping the handler. Metrics are always
    recorded in ``finally`` so failures are counted too.
    """
    if not settings.metrics_enabled:
        return await call_next(request)

    # Probe/scrape endpoints would pollute the metrics — skip them.
    skip_paths = {"/metrics", "/readyz", "/healthz"}
    if request.url.path in skip_paths:
        return await call_next(request)

    gauge_incr("http_requests_in_progress")
    start_time = time.time()
    status = "success"

    try:
        response = await call_next(request)
        # HTTP status code decides success vs error.
        if response.status_code >= 400:
            status = "error"
        return response
    except Exception:
        status = "error"
        # Bare raise preserves the original traceback; `raise e` would
        # add a redundant frame.
        raise
    finally:
        elapsed = time.time() - start_time
        # Use the templated path so label cardinality stays bounded.
        normalized_path = normalize_path(request.url.path)
        incr(
            "http_requests_total",
            {"method": request.method, "endpoint": normalized_path, "status": status},
        )
        observe(
            "http_request_duration_seconds",
            {"method": request.method, "endpoint": normalized_path},
            elapsed,
        )
        gauge_decr("http_requests_in_progress")
|
||||
|
||||
|
||||
# 注册路由
|
||||
app.include_router(router, tags=["Algorithm"])
|
||||
|
||||
|
||||
# Prometheus 指标端点
|
||||
@app.get(
    "/metrics",
    tags=["Monitoring"],
    summary="Prometheus 指标",
    description="导出 Prometheus 格式的监控指标",
)
async def metrics():
    """Prometheus scrape endpoint.

    Returns the application's metrics in text exposition format, or a
    404 plain-text response when metrics collection is disabled.
    """
    if not settings.metrics_enabled:
        return Response(content="Metrics disabled", status_code=404)

    payload = export()
    return Response(
        content=payload,
        media_type="text/plain; version=0.0.4; charset=utf-8",
    )
|
||||
|
||||
|
||||
# 启动事件
|
||||
@app.on_event("startup")
async def startup_event():
    """Run once at application startup: log config and warm up subsystems."""
    logger.info(f"Starting {settings.app_name} v{settings.app_version}")
    logger.info(f"Environment: {settings.app_env}")
    logger.info(f"Metrics enabled: {settings.metrics_enabled}")

    # Warm up the metrics manager so Redis availability is known early.
    if settings.metrics_enabled:
        manager = get_metrics_manager()
        if manager.is_available():
            logger.info("Redis 指标收集已启用")
        else:
            logger.warning("Redis 不可用,指标将不会被收集")

    # Bring up the async job manager; degrade gracefully if it fails.
    try:
        job_manager = await get_job_manager()
        if job_manager.is_available():
            logger.info("异步任务管理器已启用")
        else:
            logger.warning("Redis 不可用,异步任务功能将不可用")
    except Exception as e:
        logger.warning(f"任务管理器初始化失败: {e}")
|
||||
|
||||
|
||||
# 关闭事件
|
||||
@app.on_event("shutdown")
async def shutdown_event():
    """Run once at application shutdown: release background resources."""
    logger.info(f"Shutting down {settings.app_name}")

    # Close the job manager; cleanup errors must not block shutdown.
    try:
        await shutdown_job_manager()
        logger.info("任务管理器已关闭")
    except Exception as e:
        logger.warning(f"任务管理器关闭失败: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Dev-mode entry point; auto-reload only in the development environment.
    import uvicorn

    uvicorn.run(
        "functional_scaffold.main:app",
        host=settings.host,
        port=settings.port,
        reload=settings.app_env == "development",
        log_level=settings.log_level.lower(),
    )
|
||||
5
src/functional_scaffold/utils/__init__.py
Normal file
5
src/functional_scaffold/utils/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""工具函数模块"""
|
||||
|
||||
from .validators import validate_integer, validate_positive_integer
|
||||
|
||||
__all__ = ["validate_integer", "validate_positive_integer"]
|
||||
51
src/functional_scaffold/utils/validators.py
Normal file
51
src/functional_scaffold/utils/validators.py
Normal file
@@ -0,0 +1,51 @@
|
||||
"""参数校验工具"""
|
||||
|
||||
from typing import Any
|
||||
from ..core.errors import ValidationError
|
||||
|
||||
|
||||
def validate_integer(value: Any, field_name: str = "value") -> int:
    """Validate that ``value`` is a true integer.

    Args:
        value: Value under validation.
        field_name: Field name used in the error message.

    Returns:
        int: The validated integer.

    Raises:
        ValidationError: If the value is not an integer (bool included).
    """
    # bool is a subclass of int, so it must be rejected explicitly.
    is_real_int = isinstance(value, int) and not isinstance(value, bool)
    if not is_real_int:
        raise ValidationError(
            f"{field_name} must be an integer",
            details={"field": field_name, "value": value, "type": type(value).__name__},
        )
    return value
|
||||
|
||||
|
||||
def validate_positive_integer(value: Any, field_name: str = "value") -> int:
    """Validate that ``value`` is a strictly positive integer.

    Args:
        value: Value under validation.
        field_name: Field name used in the error message.

    Returns:
        int: The validated positive integer.

    Raises:
        ValidationError: If the value is not an integer or is <= 0.
    """
    checked = validate_integer(value, field_name)

    if checked <= 0:
        raise ValidationError(
            f"{field_name} must be a positive integer",
            details={"field": field_name, "value": checked},
        )

    return checked
|
||||
308
src/functional_scaffold/worker.py
Normal file
308
src/functional_scaffold/worker.py
Normal file
@@ -0,0 +1,308 @@
|
||||
"""Worker 进程模块
|
||||
|
||||
基于 Redis 队列的任务 Worker,支持分布式锁和全局并发控制。
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from .config import settings
|
||||
from .core.job_manager import JobManager
|
||||
from .core.logging import setup_logging
|
||||
from .core.tracing import set_request_id
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class JobWorker:
    """Redis-queue-backed job worker.

    Pulls jobs from the Redis queue and executes them with:
    - a distributed lock preventing duplicate execution
    - a global concurrency limit
    - a bounded retry mechanism
    - periodic lock renewal for long-running jobs
    - a sweeper that reclaims timed-out jobs
    - graceful shutdown
    """

    def __init__(self):
        # Lazily-created manager; populated by initialize().
        self._job_manager: Optional[JobManager] = None
        # Main-loop flag; cleared by shutdown() to stop run().
        self._running: bool = False
        # ID of the job currently being executed, if any.
        self._current_job_id: Optional[str] = None
        # Token proving ownership of the current job's distributed lock.
        self._current_lock_token: Optional[str] = None
        # Background task that periodically renews the job lock.
        self._lock_renewal_task: Optional[asyncio.Task] = None
        # Background task that reclaims timed-out jobs.
        self._sweeper_task: Optional[asyncio.Task] = None

    async def initialize(self) -> None:
        """Create and initialize the JobManager this worker drives."""
        self._job_manager = JobManager()
        await self._job_manager.initialize()
        logger.info("Worker 初始化完成")

    async def shutdown(self) -> None:
        """Stop the loop, cancel background tasks and close the manager."""
        logger.info("Worker 正在关闭...")
        self._running = False

        # Cancel the stale-job sweeper task.
        if self._sweeper_task and not self._sweeper_task.done():
            self._sweeper_task.cancel()
            try:
                await self._sweeper_task
            except asyncio.CancelledError:
                pass

        # Cancel the lock-renewal task.
        if self._lock_renewal_task and not self._lock_renewal_task.done():
            self._lock_renewal_task.cancel()
            try:
                await self._lock_renewal_task
            except asyncio.CancelledError:
                pass

        # Wait for the in-flight job to finish (best effort: only logged here).
        if self._current_job_id:
            logger.info(f"等待当前任务完成: {self._current_job_id}")

        if self._job_manager:
            await self._job_manager.shutdown()

        logger.info("Worker 已关闭")

    async def run(self) -> None:
        """Main worker loop: poll the queue until shutdown() is called."""
        self._running = True
        logger.info(
            f"Worker 启动,轮询间隔: {settings.worker_poll_interval}s,"
            f"最大并发: {settings.max_concurrent_jobs}"
        )

        # Start the timed-out-job sweeper if enabled.
        if settings.job_sweeper_enabled:
            self._sweeper_task = asyncio.create_task(self._sweeper_loop())
            logger.info(f"超时任务回收器已启动,扫描间隔: {settings.job_sweeper_interval}s")

        while self._running:
            try:
                await self._process_next_job()
            except Exception as e:
                logger.error(f"Worker 循环异常: {e}", exc_info=True)
                # Back off on errors so a persistent failure does not hot-loop;
                # the dequeue call itself blocks up to the poll interval.
                await asyncio.sleep(settings.worker_poll_interval)

    async def _process_next_job(self) -> None:
        """Dequeue one job and run it under lock and concurrency control."""
        if not self._job_manager:
            logger.error("JobManager 未初始化")
            await asyncio.sleep(settings.worker_poll_interval)
            return

        # Blocking pop from the queue (moves the job to the processing list).
        job_id = await self._job_manager.dequeue_job(timeout=int(settings.worker_poll_interval))

        if not job_id:
            return

        # Bind the job's request_id (fall back to the job id) for log correlation.
        job_data = await self._job_manager.get_job(job_id)
        if job_data:
            request_id = job_data.get("request_id") or job_id
            set_request_id(request_id)
        else:
            set_request_id(job_id)

        logger.info(f"从队列获取任务: {job_id}")

        # Try to acquire the distributed lock (returns a token on success).
        lock_token = await self._job_manager.acquire_job_lock(job_id)
        if not lock_token:
            logger.warning(f"无法获取任务锁,任务可能正在被其他 Worker 执行: {job_id}")
            # The job stays in the processing queue; the sweeper reclaims it.
            return

        self._current_lock_token = lock_token

        # Keep the lock alive while the job runs.
        self._lock_renewal_task = asyncio.create_task(self._lock_renewal_loop(job_id, lock_token))

        try:
            # Enforce the global concurrency limit.
            if not await self._job_manager.can_execute():
                logger.info(f"达到并发限制,任务 NACK 重新入队: {job_id}")
                await self._job_manager.nack_job(job_id, requeue=True)
                return

            # Count this job against the global concurrency limit.
            await self._job_manager.increment_concurrency()
            self._current_job_id = job_id

            try:
                # Execute the job (with retry bookkeeping inside).
                success = await self._execute_with_retry(job_id)
                if success:
                    await self._job_manager.ack_job(job_id)
                else:
                    # NOTE(review): _execute_with_retry already increments the
                    # retry count via _handle_job_failure — this second
                    # increment + requeue looks like double retry accounting;
                    # confirm against JobManager semantics.
                    await self._job_manager.increment_job_retry(job_id)
                    await self._job_manager.nack_job(job_id, requeue=True)
            finally:
                # Always release the concurrency slot.
                await self._job_manager.decrement_concurrency()
                self._current_job_id = None

        finally:
            # Stop renewing the lock.
            if self._lock_renewal_task and not self._lock_renewal_task.done():
                self._lock_renewal_task.cancel()
                try:
                    await self._lock_renewal_task
                except asyncio.CancelledError:
                    pass
            self._lock_renewal_task = None

            # Release the distributed lock.
            await self._job_manager.release_job_lock(job_id, lock_token)
            self._current_lock_token = None

    async def _execute_with_retry(self, job_id: str) -> bool:
        """Run one job under a hard timeout.

        Returns:
            bool: True when the job completed; False on timeout or
            exception (failure bookkeeping via _handle_job_failure).
        """
        if not self._job_manager:
            return False

        try:
            # Execute the job with an overall deadline.
            await asyncio.wait_for(
                self._job_manager.execute_job(job_id),
                timeout=settings.job_execution_timeout,
            )
            return True
        except asyncio.TimeoutError:
            logger.error(f"任务执行超时: {job_id}")
            await self._handle_job_failure(job_id, "任务执行超时")
            return False
        except Exception as e:
            logger.error(f"任务执行异常: {job_id}, error={e}", exc_info=True)
            await self._handle_job_failure(job_id, str(e))
            return False

    async def _handle_job_failure(self, job_id: str, error: str) -> None:
        """Requeue a failed job, or mark it failed once retries are exhausted."""
        if not self._job_manager:
            return

        retry_count = await self._job_manager.increment_job_retry(job_id)

        if retry_count < settings.job_max_retries:
            logger.info(f"任务将重试 ({retry_count}/{settings.job_max_retries}): {job_id}")
            # Put the job back on the queue for another attempt.
            await self._job_manager.enqueue_job(job_id)
        else:
            logger.error(f"任务达到最大重试次数,标记为失败: {job_id}")
            # Mark the job failed directly in its Redis hash.
            if self._job_manager._redis_client:
                key = f"job:{job_id}"
                await self._job_manager._redis_client.hset(
                    key,
                    mapping={
                        "status": "failed",
                        "error": f"达到最大重试次数 ({settings.job_max_retries}): {error}",
                    },
                )

    async def _lock_renewal_loop(self, job_id: str, lock_token: str) -> None:
        """Periodically renew the job lock so long jobs do not lose it.

        Args:
            job_id: Job ID.
            lock_token: Lock token proving ownership.
        """
        # Renew at half the lock TTL (execution timeout plus buffer).
        interval = (settings.job_execution_timeout + settings.job_lock_buffer) / 2
        while True:
            try:
                await asyncio.sleep(interval)
                if not self._job_manager:
                    break
                if not await self._job_manager.renew_job_lock(job_id, lock_token):
                    logger.error(f"锁续租失败,可能已被其他进程获取: {job_id}")
                    break
                logger.debug(f"锁续租成功: {job_id}")
            except asyncio.CancelledError:
                logger.debug(f"锁续租协程已取消: {job_id}")
                break
            except Exception as e:
                logger.error(f"锁续租异常: {job_id}, error={e}")
                break

    async def _sweeper_loop(self) -> None:
        """Reclaim timed-out jobs and collect queue metrics on a timer."""
        while self._running:
            try:
                await asyncio.sleep(settings.job_sweeper_interval)
                if not self._job_manager:
                    continue

                # Reclaim jobs whose processing deadline has passed.
                recovered = await self._job_manager.recover_stale_jobs()
                if recovered > 0:
                    logger.info(f"回收超时任务: {recovered} 个")
                    # Record the recovery metric (local import avoids a cycle).
                    from .core.metrics_unified import incr

                    incr("job_recovered_total", None, recovered)

                # Collect queue monitoring metrics.
                await self._job_manager.collect_queue_metrics()

            except asyncio.CancelledError:
                logger.debug("超时任务回收协程已取消")
                break
            except Exception as e:
                logger.error(f"超时任务回收异常: {e}")
|
||||
|
||||
|
||||
def setup_signal_handlers(worker: JobWorker, loop: asyncio.AbstractEventLoop) -> None:
    """Install SIGTERM/SIGINT handlers that trigger a graceful worker shutdown."""

    def _on_signal(sig: signal.Signals) -> None:
        logger.info(f"收到信号 {sig.name},准备关闭...")
        # Shutdown runs as a task so the signal handler returns immediately.
        loop.create_task(worker.shutdown())

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, _on_signal, sig)
|
||||
|
||||
|
||||
async def main() -> None:
    """Worker entry point: configure logging, then run until shutdown or fatal error."""
    # Configure logging before any component logs.
    setup_logging(level=settings.log_level, format_type=settings.log_format)

    worker = JobWorker()

    # Install graceful-shutdown signal handling on the running loop.
    loop = asyncio.get_running_loop()
    setup_signal_handlers(worker, loop)

    try:
        await worker.initialize()
        await worker.run()
    except Exception as e:
        logger.error(f"Worker 异常退出: {e}", exc_info=True)
        sys.exit(1)
    finally:
        # Always attempt a clean shutdown, even after a fatal error.
        await worker.shutdown()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the worker as a standalone process.
    asyncio.run(main())
|
||||
1
tests/__init__.py
Normal file
1
tests/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""测试模块"""
|
||||
23
tests/conftest.py
Normal file
23
tests/conftest.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""pytest 配置"""
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
from functional_scaffold.main import app
|
||||
|
||||
|
||||
@pytest.fixture
def client():
    """HTTP test client bound to the FastAPI app."""
    return TestClient(app)
|
||||
|
||||
|
||||
@pytest.fixture
def sample_prime_numbers():
    """Sample prime numbers (first 15 primes)."""
    return [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47]
|
||||
|
||||
|
||||
@pytest.fixture
def sample_composite_numbers():
    """Sample composite numbers."""
    return [4, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25]
|
||||
77
tests/test_algorithms.py
Normal file
77
tests/test_algorithms.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""算法单元测试"""
|
||||
|
||||
import pytest
|
||||
from functional_scaffold.algorithms.prime_checker import PrimeChecker
|
||||
|
||||
|
||||
class TestPrimeChecker:
    """Unit tests for the prime-checking algorithm."""

    def setup_method(self):
        """Create a fresh checker before each test method."""
        self.checker = PrimeChecker()

    def test_prime_numbers(self, sample_prime_numbers):
        """Primes are detected with no factors and the expected algorithm tag."""
        for num in sample_prime_numbers:
            result = self.checker.process(num)
            assert result["is_prime"] is True
            assert result["number"] == num
            assert result["factors"] == []
            assert result["algorithm"] == "trial_division"

    def test_composite_numbers(self, sample_composite_numbers):
        """Composites are detected and report at least one factor."""
        for num in sample_composite_numbers:
            result = self.checker.process(num)
            assert result["is_prime"] is False
            assert result["number"] == num
            assert len(result["factors"]) > 0
            assert result["algorithm"] == "trial_division"

    def test_edge_cases(self):
        """Boundary values: 0, 1, 2 and negatives."""
        # 0 is not prime.
        result = self.checker.process(0)
        assert result["is_prime"] is False
        assert "reason" in result

        # 1 is not prime.
        result = self.checker.process(1)
        assert result["is_prime"] is False
        assert "reason" in result

        # 2 is prime.
        result = self.checker.process(2)
        assert result["is_prime"] is True

        # Negative numbers are not prime.
        result = self.checker.process(-5)
        assert result["is_prime"] is False

    def test_large_prime(self):
        """A larger prime is still detected."""
        large_prime = 7919  # the 1000th prime
        result = self.checker.process(large_prime)
        assert result["is_prime"] is True

    def test_invalid_input(self):
        """Non-integer inputs raise ValueError."""
        with pytest.raises(ValueError):
            self.checker.process("not a number")

        with pytest.raises(ValueError):
            self.checker.process(3.14)

        with pytest.raises(ValueError):
            self.checker.process(None)

    def test_execute_method(self):
        """execute() wraps process() with success flag and metadata."""
        result = self.checker.execute(17)

        assert result["success"] is True
        assert "result" in result
        assert "metadata" in result
        assert result["metadata"]["algorithm"] == "PrimeChecker"
        assert "elapsed_time" in result["metadata"]
|
||||
107
tests/test_api.py
Normal file
107
tests/test_api.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""API 集成测试"""
|
||||
|
||||
import pytest
|
||||
from fastapi import status
|
||||
|
||||
|
||||
class TestInvokeEndpoint:
    """Integration tests for the /invoke endpoint."""

    def test_invoke_prime_number(self, client):
        """A prime input yields success with empty factors."""
        response = client.post("/invoke", json={"number": 17})

        assert response.status_code == status.HTTP_200_OK
        data = response.json()

        assert "request_id" in data
        assert data["status"] == "success"
        assert data["result"]["number"] == 17
        assert data["result"]["is_prime"] is True
        assert data["result"]["factors"] == []

    def test_invoke_composite_number(self, client):
        """A composite input yields success with non-empty factors."""
        response = client.post("/invoke", json={"number": 12})

        assert response.status_code == status.HTTP_200_OK
        data = response.json()

        assert data["status"] == "success"
        assert data["result"]["number"] == 12
        assert data["result"]["is_prime"] is False
        assert len(data["result"]["factors"]) > 0

    def test_invoke_edge_cases(self, client):
        """Boundary inputs 0, 1 and 2."""
        # 0 is not prime.
        response = client.post("/invoke", json={"number": 0})
        assert response.status_code == status.HTTP_200_OK
        assert response.json()["result"]["is_prime"] is False

        # 1 is not prime.
        response = client.post("/invoke", json={"number": 1})
        assert response.status_code == status.HTTP_200_OK
        assert response.json()["result"]["is_prime"] is False

        # 2 is prime.
        response = client.post("/invoke", json={"number": 2})
        assert response.status_code == status.HTTP_200_OK
        assert response.json()["result"]["is_prime"] is True

    def test_invoke_invalid_input(self, client):
        """Invalid payloads are rejected with 422."""
        # Missing required field.
        response = client.post("/invoke", json={})
        assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY

        # Wrong data type.
        response = client.post("/invoke", json={"number": "not a number"})
        assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY

        # Float instead of int.
        response = client.post("/invoke", json={"number": 3.14})
        assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY
|
||||
|
||||
|
||||
class TestHealthEndpoints:
    """Tests for the liveness/readiness probe endpoints."""

    def test_healthz(self, client):
        """GET /healthz reports healthy and includes a timestamp."""
        response = client.get("/healthz")
        assert response.status_code == status.HTTP_200_OK

        payload = response.json()
        assert payload["status"] == "healthy"
        assert "timestamp" in payload

    def test_readyz(self, client):
        """GET /readyz reports ready with a timestamp and per-check detail."""
        response = client.get("/readyz")
        assert response.status_code == status.HTTP_200_OK

        payload = response.json()
        assert payload["status"] == "ready"
        assert "timestamp" in payload
        assert "checks" in payload
|
||||
|
||||
|
||||
class TestMetricsEndpoint:
    """Tests for the Prometheus /metrics endpoint."""

    def test_metrics(self, client):
        """GET /metrics returns 200 with a text/plain content type."""
        response = client.get("/metrics")

        assert response.status_code == status.HTTP_200_OK
        assert "text/plain" in response.headers["content-type"]
|
||||
|
||||
|
||||
class TestJobsEndpoint:
    """Tests for the async job endpoints."""

    # Detailed coverage lives in test_job_manager.py.
    pass
|
||||
1170
tests/test_job_manager.py
Normal file
1170
tests/test_job_manager.py
Normal file
File diff suppressed because it is too large
Load Diff
273
tests/test_metrics_unified.py
Normal file
273
tests/test_metrics_unified.py
Normal file
@@ -0,0 +1,273 @@
|
||||
"""metrics_unified 模块单元测试"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
|
||||
class TestMetricsManager:
    """Unit tests for the MetricsManager class."""

    @pytest.fixture
    def mock_redis(self):
        """Patch redis.Redis with a MagicMock standing in for the client."""
        with patch("redis.Redis") as mock:
            mock_instance = MagicMock()
            mock_instance.ping.return_value = True
            mock_instance.hincrbyfloat.return_value = 1.0
            mock_instance.hset.return_value = True
            mock_instance.hgetall.return_value = {}
            mock_instance.hget.return_value = "0"
            mock_instance.keys.return_value = []
            mock_instance.pipeline.return_value = MagicMock()
            mock.return_value = mock_instance
            yield mock_instance

    @pytest.fixture
    def manager(self, mock_redis):
        """Create a fresh MetricsManager backed by the mocked Redis."""
        from functional_scaffold.core.metrics_unified import (
            MetricsManager,
            reset_metrics_manager,
        )

        # Reset the module-level singleton so each test starts clean.
        reset_metrics_manager()
        return MetricsManager()

    def test_init_loads_default_config(self, manager):
        """Initialization loads a default configuration."""
        assert manager.config is not None
        assert "builtin_metrics" in manager.config or len(manager.metrics_definitions) > 0

    def test_metrics_definitions_registered(self, manager):
        """The built-in metric definitions are registered."""
        assert "http_requests_total" in manager.metrics_definitions
        assert "http_request_duration_seconds" in manager.metrics_definitions
        assert "algorithm_executions_total" in manager.metrics_definitions

    def test_incr_counter(self, manager, mock_redis):
        """incr() on a counter writes through to Redis HINCRBYFLOAT."""
        manager.incr("http_requests_total", {"method": "GET", "endpoint": "/", "status": "success"})
        mock_redis.hincrbyfloat.assert_called()

    def test_incr_with_invalid_metric_type(self, manager, mock_redis):
        """incr() on a non-counter metric performs no counter write.

        Fix: the original test only had a comment saying the absence of a
        Redis call should be verified, but contained no assertion at all.
        We compare the call count before and after so fixture-time calls
        (if any) cannot cause a false failure.
        """
        calls_before = mock_redis.hincrbyfloat.call_count
        # http_request_duration_seconds is a histogram, so incr() must be a no-op.
        manager.incr("http_request_duration_seconds", {})
        assert mock_redis.hincrbyfloat.call_count == calls_before

    def test_set_gauge(self, manager, mock_redis):
        """set() on a gauge writes through to Redis HSET."""
        manager.set("http_requests_in_progress", {}, 5)
        mock_redis.hset.assert_called()

    def test_gauge_incr(self, manager, mock_redis):
        """gauge_incr() increments via Redis HINCRBYFLOAT."""
        manager.gauge_incr("http_requests_in_progress", {}, 1)
        mock_redis.hincrbyfloat.assert_called()

    def test_gauge_decr(self, manager, mock_redis):
        """gauge_decr() decrements via Redis HINCRBYFLOAT."""
        manager.gauge_decr("http_requests_in_progress", {}, 1)
        mock_redis.hincrbyfloat.assert_called()

    def test_observe_histogram(self, manager, mock_redis):
        """observe() batches histogram updates through a Redis pipeline."""
        mock_pipeline = MagicMock()
        mock_redis.pipeline.return_value = mock_pipeline

        manager.observe("http_request_duration_seconds", {"method": "GET", "endpoint": "/"}, 0.05)

        mock_redis.pipeline.assert_called()
        mock_pipeline.execute.assert_called()

    def test_labels_to_key(self, manager):
        """_labels_to_key() serializes label pairs into the storage key."""
        labels = {"method": "GET", "endpoint": "/api"}
        key = manager._labels_to_key(labels)
        assert "method=GET" in key
        assert "endpoint=/api" in key

    def test_labels_to_key_empty(self, manager):
        """_labels_to_key() maps both None and {} to an empty key."""
        assert manager._labels_to_key(None) == ""
        assert manager._labels_to_key({}) == ""

    def test_is_available(self, manager):
        """is_available() reflects the (mocked) successful Redis ping."""
        assert manager.is_available() is True
|
||||
|
||||
|
||||
class TestConvenienceFunctions:
    """Tests for the module-level convenience wrappers."""

    @pytest.fixture(autouse=True)
    def setup(self):
        """Reset the singleton manager before every test."""
        from functional_scaffold.core.metrics_unified import reset_metrics_manager

        reset_metrics_manager()

    @patch("redis.Redis")
    def test_incr_function(self, mock_redis_class):
        """The incr() wrapper forwards counter increments to Redis."""
        fake_client = MagicMock()
        fake_client.ping.return_value = True
        mock_redis_class.return_value = fake_client

        from functional_scaffold.core.metrics_unified import incr, reset_metrics_manager

        reset_metrics_manager()
        incr("http_requests_total", {"method": "GET", "endpoint": "/", "status": "success"})

        fake_client.hincrbyfloat.assert_called()

    @patch("redis.Redis")
    def test_set_function(self, mock_redis_class):
        """The set() wrapper forwards gauge writes to Redis."""
        fake_client = MagicMock()
        fake_client.ping.return_value = True
        mock_redis_class.return_value = fake_client

        from functional_scaffold.core.metrics_unified import reset_metrics_manager, set

        reset_metrics_manager()
        set("http_requests_in_progress", {}, 10)

        fake_client.hset.assert_called()

    @patch("redis.Redis")
    def test_observe_function(self, mock_redis_class):
        """The observe() wrapper records histogram samples via a pipeline."""
        fake_client = MagicMock()
        fake_client.ping.return_value = True
        fake_pipeline = MagicMock()
        fake_client.pipeline.return_value = fake_pipeline
        mock_redis_class.return_value = fake_client

        from functional_scaffold.core.metrics_unified import observe, reset_metrics_manager

        reset_metrics_manager()
        observe("http_request_duration_seconds", {"method": "GET", "endpoint": "/"}, 0.1)

        fake_client.pipeline.assert_called()
|
||||
|
||||
|
||||
class TestExport:
    """Tests for Prometheus-format export."""

    @patch("redis.Redis")
    def test_export_counter(self, mock_redis_class):
        """Counter series are exported with HELP/TYPE headers."""
        fake_client = MagicMock()
        fake_client.ping.return_value = True
        fake_client.hgetall.return_value = {"method=GET,endpoint=/,status=success": "10"}
        mock_redis_class.return_value = fake_client

        from functional_scaffold.core.metrics_unified import export, reset_metrics_manager

        reset_metrics_manager()
        output = export()

        assert "http_requests_total" in output
        assert "HELP" in output
        assert "TYPE" in output

    @patch("redis.Redis")
    def test_export_histogram(self, mock_redis_class):
        """Histogram series are exported from the count/sum sub-keys."""
        fake_client = MagicMock()
        fake_client.ping.return_value = True

        def fake_hgetall(key):
            # Serve distinct hashes depending on which sub-key is requested.
            if "count" in key:
                return {"method=GET,endpoint=/": "5"}
            if "sum" in key:
                return {"method=GET,endpoint=/": "0.5"}
            return {}

        fake_client.hgetall.side_effect = fake_hgetall
        fake_client.hget.return_value = "3"
        mock_redis_class.return_value = fake_client

        from functional_scaffold.core.metrics_unified import export, reset_metrics_manager

        reset_metrics_manager()
        output = export()

        assert "http_request_duration_seconds" in output
|
||||
|
||||
|
||||
class TestEnvVarSubstitution:
    """Tests for ${VAR:default} substitution in the metrics configuration."""

    def test_substitute_env_vars(self):
        """A set variable is substituted; a missing one falls back to its default.

        Fix: the original deleted TEST_VAR after the assertions with a bare
        `del os.environ[...]`, so a failing assertion leaked the variable
        into subsequent tests. Cleanup now runs in a `finally` block.
        """
        import os
        from functional_scaffold.core.metrics_unified import MetricsManager

        os.environ["TEST_VAR"] = "test_value"
        try:
            # __new__ bypasses __init__ so the helper is exercised in isolation.
            manager = MetricsManager.__new__(MetricsManager)

            assert manager._substitute_env_vars("${TEST_VAR:default}") == "test_value"
            assert manager._substitute_env_vars("${NONEXISTENT_VAR:default_value}") == "default_value"
        finally:
            # pop() with a default cannot raise even if the key is already gone.
            os.environ.pop("TEST_VAR", None)
|
||||
|
||||
|
||||
class TestTrackAlgorithmExecution:
    """Tests for the track_algorithm_execution decorator."""

    @staticmethod
    def _fake_redis_client():
        """Build a mocked Redis client with a working ping and pipeline."""
        client = MagicMock()
        client.ping.return_value = True
        client.pipeline.return_value = MagicMock()
        return client

    @patch("redis.Redis")
    def test_decorator_success(self, mock_redis_class):
        """The decorator is transparent: the wrapped return value passes through."""
        mock_redis_class.return_value = self._fake_redis_client()

        from functional_scaffold.core.metrics_unified import (
            reset_metrics_manager,
            track_algorithm_execution,
        )

        reset_metrics_manager()

        @track_algorithm_execution("test_algo")
        def test_func():
            return "result"

        assert test_func() == "result"

    @patch("redis.Redis")
    def test_decorator_error(self, mock_redis_class):
        """Exceptions raised by the wrapped function propagate unchanged."""
        mock_redis_class.return_value = self._fake_redis_client()

        from functional_scaffold.core.metrics_unified import (
            reset_metrics_manager,
            track_algorithm_execution,
        )

        reset_metrics_manager()

        @track_algorithm_execution("test_algo")
        def test_func():
            raise ValueError("test error")

        with pytest.raises(ValueError):
            test_func()
|
||||
97
tests/test_middleware.py
Normal file
97
tests/test_middleware.py
Normal file
@@ -0,0 +1,97 @@
|
||||
"""中间件测试"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from functional_scaffold.main import app, normalize_path
|
||||
|
||||
|
||||
class TestNormalizePath:
    """Tests for the path-normalization helper."""

    def test_normalize_jobs_path(self):
        """Any /jobs/<id> path collapses to the /jobs/{job_id} template."""
        for raw_path in ("/jobs/a1b2c3d4e5f6", "/jobs/123456789012", "/jobs/xyz"):
            assert normalize_path(raw_path) == "/jobs/{job_id}"

    def test_normalize_other_paths(self):
        """Non-job paths pass through unchanged."""
        for raw_path in ("/invoke", "/healthz", "/readyz", "/metrics", "/docs"):
            assert normalize_path(raw_path) == raw_path

    def test_normalize_jobs_root(self):
        """The bare /jobs collection path is not templated."""
        assert normalize_path("/jobs") == "/jobs"
|
||||
|
||||
|
||||
class TestMetricsMiddleware:
    """Tests for the HTTP metrics middleware."""

    @patch("functional_scaffold.main.incr")
    @patch("functional_scaffold.main.observe")
    @patch("functional_scaffold.main.gauge_incr")
    @patch("functional_scaffold.main.gauge_decr")
    def test_skip_health_endpoints(self, mock_gauge_decr, mock_gauge_incr, mock_observe, mock_incr):
        """Probe and metrics endpoints are excluded from instrumentation."""
        client = TestClient(app)

        for path in ("/healthz", "/readyz", "/metrics"):
            client.get(path)

        # None of the metric hooks may fire for excluded paths.
        for hook in (mock_incr, mock_observe, mock_gauge_incr, mock_gauge_decr):
            hook.assert_not_called()

    @patch("functional_scaffold.main.incr")
    @patch("functional_scaffold.main.observe")
    @patch("functional_scaffold.main.gauge_incr")
    @patch("functional_scaffold.main.gauge_decr")
    def test_record_normal_endpoints(self, mock_gauge_decr, mock_gauge_incr, mock_observe, mock_incr):
        """A regular request triggers each metric hook exactly once."""
        client = TestClient(app)

        client.post("/invoke", json={"number": 17})

        mock_gauge_incr.assert_called_once()
        mock_gauge_decr.assert_called_once()
        mock_incr.assert_called_once()
        mock_observe.assert_called_once()

        # The counter must be labeled with the literal endpoint path.
        assert mock_incr.call_args[0][1]["endpoint"] == "/invoke"

    @patch("functional_scaffold.main.incr")
    @patch("functional_scaffold.main.observe")
    @patch("functional_scaffold.main.gauge_incr")
    @patch("functional_scaffold.main.gauge_decr")
    @patch("functional_scaffold.core.job_manager.get_job_manager")
    def test_normalize_job_path(self, mock_get_manager, mock_gauge_decr, mock_gauge_incr, mock_observe, mock_incr):
        """Job-detail URLs are templated before being used as a metric label."""
        job_manager = MagicMock()
        job_manager.get_job.return_value = None
        mock_get_manager.return_value = job_manager

        client = TestClient(app)

        # The request 404s, but the middleware still records metrics.
        client.get("/jobs/a1b2c3d4e5f6")

        mock_incr.assert_called_once()
        assert mock_incr.call_args[0][1]["endpoint"] == "/jobs/{job_id}"
|
||||
Reference in New Issue
Block a user