diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..b30c563 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,29 @@ +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +.pytest_cache/ +.coverage +htmlcov/ +.env +.venv +venv/ +ENV/ +*.log +.DS_Store diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..0adc4c0 --- /dev/null +++ b/.env.example @@ -0,0 +1,32 @@ +# Environment Configuration +# Copy this file to .env and fill in your values + +# Application +APP_NAME=FunctionalScaffold +APP_VERSION=1.0.0 +APP_ENV=development + +# Server +HOST=0.0.0.0 +PORT=8000 +WORKERS=4 + +# Logging +LOG_LEVEL=INFO +LOG_FORMAT=json + +# Metrics +METRICS_ENABLED=true + +# Tracing +TRACING_ENABLED=false +JAEGER_ENDPOINT=http://localhost:14268/api/traces + +# External Services (examples) +# OSS_ENDPOINT=https://oss-cn-hangzhou.aliyuncs.com +# OSS_ACCESS_KEY_ID=your_access_key +# OSS_ACCESS_KEY_SECRET=your_secret_key +# OSS_BUCKET_NAME=your_bucket + +# Database (if needed) +# DATABASE_URL=mysql://user:password@localhost:3306/dbname diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a1370d2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,70 @@ +.claude +docs/prompt +.idea + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Virtual environments +venv/ +env/ +ENV/ +env.bak/ +venv.bak/ + +# IDEs +.vscode/ +*.swp +*.swo +*~ + +# Environment variables 
+.env +.env.local + +# Logs +*.log + +# OS +.DS_Store +Thumbs.db \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..27ca491 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,399 @@ +# CLAUDE.md + +本文件为 Claude Code (claude.ai/code) 在此代码仓库中工作时提供指导。 + +## 项目概述 + +**FunctionalScaffold(函数式脚手架)** 是一个算法工程化 Serverless 解决方案的脚手架生成器。 + +- 为了方便团队交流,项目的自然语言使用中文,包括代码注释和文档等 + +### 核心目标 + +解决三大痛点: +1. **不确定的算力需求** - 需要动态扩缩容能力 +2. **算法同学工程化能力不足** - 降低工程化门槛 +3. **后端同学集成难度过高** - 标准化接口规范 + +## 技术架构 + +采用 **Docker 封装的 Serverless API 服务**方案: + +- 算法代码 + 运行环境打包为 Docker 镜像 +- 部署到云厂商 Serverless 平台实现自动扩缩容 +- FastAPI 作为 HTTP 接口层 +- 算法逻辑保持独立和专注 + +### 架构流程 + +``` +用户请求 → API网关 → 容器实例(冷/热启动)→ FastAPI → 算法程序 → 返回结果 + ↓ + 外部服务(OSS/数据库) +``` + +### 代码架构 + +项目采用 **src layout** 结构(Python 最佳实践): + +``` +src/functional_scaffold/ +├── algorithms/ # 算法层 - 所有算法必须继承 BaseAlgorithm +│ ├── base.py # 提供 execute() 包装器(埋点、错误处理) +│ └── prime_checker.py # 示例:质数判断算法 +├── api/ # API 层 - FastAPI 路由和模型 +│ ├── models.py # Pydantic 数据模型(使用 ConfigDict) +│ ├── routes.py # 路由定义(/invoke, /healthz, /readyz, /jobs) +│ └── dependencies.py # 依赖注入(request_id 生成) +├── core/ # 核心功能 - 横切关注点 +│ ├── errors.py # 异常类层次结构 +│ ├── logging.py # 结构化日志(JSON 格式) +│ ├── metrics.py # Prometheus 指标和装饰器 +│ └── tracing.py # 分布式追踪(ContextVar) +├── utils/ # 工具函数 +│ └── validators.py # 输入验证 +├── config.py # 配置管理(pydantic-settings) +└── main.py # FastAPI 应用入口 +``` + +**关键设计模式:** + +1. **算法抽象层**:所有算法继承 `BaseAlgorithm`,只需实现 `process()` 方法。`execute()` 方法自动处理埋点、日志和错误包装。 + +2. **依赖注入**:使用 FastAPI 的 `Depends()` 机制注入 request_id,通过 `ContextVar` 在异步上下文中传递。 + +3. **配置管理**:使用 `pydantic-settings` 从环境变量或 `.env` 文件加载配置,支持类型验证。 + +4. 
**可观测性**: + - 日志:结构化 JSON 日志(pythonjsonlogger) + - 指标:Prometheus 格式(request_counter, request_latency, algorithm_counter) + - 追踪:request_id 关联所有日志和指标 + +## 开发命令 + +### 环境设置 + +```bash +# 创建虚拟环境并安装依赖(开发模式) +python -m venv venv +source venv/bin/activate # Windows: venv\Scripts\activate +pip install -e ".[dev]" +``` + +### 运行服务 + +```bash +# 方式1:使用辅助脚本(推荐) +./scripts/run_dev.sh + +# 方式2:直接运行(开发模式,自动重载) +uvicorn src.functional_scaffold.main:app --reload --port 8000 + +# 方式3:生产模式 +uvicorn src.functional_scaffold.main:app --host 0.0.0.0 --port 8000 --workers 4 +``` + +访问地址: +- Swagger UI: http://localhost:8000/docs +- ReDoc: http://localhost:8000/redoc +- Metrics: http://localhost:8000/metrics + +### 测试 + +```bash +# 运行所有测试 +pytest tests/ -v + +# 运行单个测试文件 +pytest tests/test_algorithms.py -v + +# 运行单个测试类 +pytest tests/test_algorithms.py::TestPrimeChecker -v + +# 运行单个测试方法 +pytest tests/test_algorithms.py::TestPrimeChecker::test_prime_numbers -v + +# 生成覆盖率报告 +pytest tests/ --cov=src/functional_scaffold --cov-report=html +# 查看报告:open htmlcov/index.html + +# 使用辅助脚本(包含代码检查) +./scripts/run_tests.sh +``` + +### 代码质量 + +```bash +# 代码格式化(自动修复) +black src/ tests/ + +# 代码检查(不修改文件) +black --check src/ tests/ + +# 代码检查 +ruff check src/ tests/ + +# 自动修复可修复的问题 +ruff check --fix src/ tests/ +``` + +配置说明: +- Black: 行长度 100,目标 Python 3.9+ +- Ruff: 行长度 100,目标 Python 3.9+ + +### Docker + +```bash +# 构建镜像 +docker build -f deployment/Dockerfile -t functional-scaffold:latest . + +# 运行容器 +docker run -p 8000:8000 functional-scaffold:latest + +# 使用 docker-compose(包含 Prometheus + Grafana) +cd deployment +docker-compose up +# Grafana: http://localhost:3000 (admin/admin) +# Prometheus: http://localhost:9090 +``` + +### 文档 + +```bash +# 导出 OpenAPI 规范到 docs/swagger/openapi.json +python scripts/export_openapi.py +``` + +## 添加新算法 + +### 1. 
创建算法类(继承 BaseAlgorithm) + +```python +# src/functional_scaffold/algorithms/my_algorithm.py +from typing import Dict, Any +from .base import BaseAlgorithm + +class MyAlgorithm(BaseAlgorithm): + """我的算法类""" + + def process(self, input_data: Any) -> Dict[str, Any]: + """ + 算法处理逻辑 + + Args: + input_data: 输入数据 + + Returns: + Dict[str, Any]: 处理结果 + """ + # 实现算法逻辑 + result = do_something(input_data) + return {"result": result} +``` + +### 2. 注册到 `__init__.py` + +```python +# src/functional_scaffold/algorithms/__init__.py +from .my_algorithm import MyAlgorithm +__all__ = [..., "MyAlgorithm"] +``` + +### 3. 添加 API 端点(在 `api/routes.py`) + +```python +@router.post("/my-endpoint") +async def my_endpoint( + request: MyRequest, + request_id: str = Depends(get_request_id) +): + """我的算法端点""" + algorithm = MyAlgorithm() + result = algorithm.execute(request.data) + return MyResponse(request_id=request_id, **result) +``` + +### 4. 定义数据模型(在 `api/models.py`) + +```python +class MyRequest(BaseModel): + """我的请求模型""" + + model_config = ConfigDict( + json_schema_extra={ + "example": {"data": "示例数据"} + } + ) + + data: str = Field(..., description="输入数据") +``` + +### 5. 
编写测试 + +```python +# tests/test_my_algorithm.py +def test_my_algorithm(): + """测试我的算法""" + algo = MyAlgorithm() + result = algo.process("测试数据") + assert result["result"] == expected +``` + +## 配置管理 + +配置通过 `src/functional_scaffold/config.py` 的 `Settings` 类管理: + +- 从环境变量读取(不区分大小写) +- 支持 `.env` 文件 +- 使用 `pydantic-settings` 进行类型验证 + +配置示例: +```bash +# .env 文件 +APP_ENV=production +LOG_LEVEL=INFO +METRICS_ENABLED=true +``` + +访问配置: +```python +from functional_scaffold.config import settings +print(settings.app_env) # "production" +``` + +## 可观测性 + +### 日志 + +使用 `core/logging.py` 的 `setup_logging()`: + +```python +from functional_scaffold.core.logging import setup_logging + +# 设置日志 +logger = setup_logging(level="INFO", format_type="json") + +# 记录日志 +logger.info("处理请求", extra={"user_id": "123"}) +``` + +### 指标 + +使用 `core/metrics.py` 的装饰器: + +```python +from functional_scaffold.core.metrics import track_algorithm_execution + +@track_algorithm_execution("my_algorithm") +def my_function(): + """我的函数""" + pass +``` + +可用指标: +- `http_requests_total{method, endpoint, status}` - HTTP 请求总数 +- `http_request_duration_seconds{method, endpoint}` - HTTP 请求延迟 +- `algorithm_executions_total{algorithm, status}` - 算法执行总数 +- `algorithm_execution_duration_seconds{algorithm}` - 算法执行延迟 + +### 追踪 + +Request ID 自动注入到所有请求: + +```python +from functional_scaffold.core.tracing import get_request_id + +# 在请求上下文中获取 request_id +request_id = get_request_id() +``` + +## 部署 + +### Kubernetes + +```bash +kubectl apply -f deployment/kubernetes/deployment.yaml +kubectl apply -f deployment/kubernetes/service.yaml +``` + +配置说明: +- 3 个副本 +- 资源限制:256Mi-512Mi 内存,250m-500m CPU +- 健康检查:存活探针 (/healthz),就绪探针 (/readyz) + +### 阿里云函数计算 + +```bash +fun deploy -t deployment/serverless/aliyun-fc.yaml +``` + +### AWS Lambda + +```bash +sam deploy --template-file deployment/serverless/aws-lambda.yaml +``` + +## 必须交付的三大组件 + +### 1. 
接入规范 + +**API 端点标准:** +- `/invoke` - 同步调用接口 +- `/jobs` - 异步任务接口(当前返回 501) +- `/healthz` - 存活检查 +- `/readyz` - 就绪检查 +- `/metrics` - Prometheus 指标 + +**Schema 规范:** +- 请求/响应 Schema(Pydantic 验证) +- 错误响应格式(统一的 ErrorResponse) +- 元数据和版本信息(每个响应包含 metadata) + +### 2. Python SDK 运行时 + +**已实现的能力:** +- ✅ 参数校验(Pydantic + utils/validators.py) +- ✅ 错误包装和标准化(core/errors.py) +- ✅ 埋点(core/metrics.py - 延迟、失败率) +- ✅ 分布式追踪的关联 ID(core/tracing.py) +- ⏳ Worker 运行时(重试、超时、DLQ - 待实现) + +### 3. 脚手架生成器 + +**已包含的模板:** +- ✅ 示例算法函数(algorithms/prime_checker.py) +- ✅ Dockerfile(deployment/Dockerfile) +- ✅ CI/CD 流水线配置(.github/workflows/) +- ✅ Serverless 平台部署 YAML(deployment/serverless/) +- ✅ Grafana 仪表板模板(monitoring/grafana/dashboard.json) +- ✅ 告警规则配置(monitoring/alerts/rules.yaml) + +## 开发理念 + +**算法同学只需修改核心算法函数。** 所有基础设施、可观测性、部署相关的工作都由脚手架处理。 + +算法开发者只需: +1. 继承 `BaseAlgorithm` +2. 实现 `process()` 方法 +3. 返回字典格式的结果 + +框架自动提供: +- HTTP 接口封装 +- 参数验证 +- 错误处理 +- 日志记录 +- 性能指标 +- 健康检查 +- 容器化部署 + +## 注意事项 + +1. **Pydantic V2**:使用 `ConfigDict` 而非 `class Config`,使用 `model_config` 而非 `Config`。 + +2. **异步上下文**:request_id 使用 `ContextVar` 存储,在异步函数中自动传递。 + +3. **测试隔离**:每个测试使用 `TestClient`,不需要启动真实服务器。 + +4. **Docker 构建**:Dockerfile 使用非 root 用户(appuser),包含健康检查。 + +5. **配置优先级**:环境变量 > .env 文件 > 默认值。 diff --git a/README.md b/README.md new file mode 100644 index 0000000..796505f --- /dev/null +++ b/README.md @@ -0,0 +1,259 @@ +# FunctionalScaffold + +**算法工程化 Serverless 解决方案脚手架** + +一个基于 FastAPI 和 Docker 的 Serverless 算法服务脚手架,帮助算法工程师快速构建生产级的算法服务。 + +## 特性 + +- ✅ **标准化 API 接口** - 符合 RESTful 规范的 HTTP 接口 +- ✅ **开箱即用** - 完整的项目结构和配置 +- ✅ **自动文档** - Swagger/OpenAPI 自动生成 +- ✅ **监控指标** - Prometheus 指标和 Grafana 仪表板 +- ✅ **健康检查** - 存活和就绪探针 +- ✅ **容器化部署** - Docker 和 Kubernetes 支持 +- ✅ **Serverless 就绪** - 支持阿里云函数计算和 AWS Lambda +- ✅ **完整测试** - 单元测试和集成测试 +- ✅ **CI/CD** - GitHub Actions 工作流 + +## 快速开始 + +### 前置要求 + +- Python 3.9+ +- Docker (可选) + +### 本地开发 + +1. 克隆仓库 + +```bash +git clone +cd FunctionalScaffold +``` + +2. 
创建虚拟环境并安装依赖 + +```bash +python -m venv venv +source venv/bin/activate # Windows: venv\Scripts\activate +pip install -e ".[dev]" +``` + +3. 启动开发服务器 + +```bash +# 方式1:使用脚本 +./scripts/run_dev.sh + +# 方式2:直接运行 +uvicorn src.functional_scaffold.main:app --reload --port 8000 +``` + +4. 访问 API 文档 + +打开浏览器访问: +- Swagger UI: http://localhost:8000/docs +- ReDoc: http://localhost:8000/redoc +- OpenAPI JSON: http://localhost:8000/openapi.json + +### 使用 Docker + +```bash +# 构建镜像 +docker build -f deployment/Dockerfile -t functional-scaffold:latest . + +# 运行容器 +docker run -p 8000:8000 functional-scaffold:latest + +# 或使用 docker-compose +cd deployment +docker-compose up +``` + +## API 端点 + +### 核心接口 + +- `POST /invoke` - 同步调用算法 +- `POST /jobs` - 异步任务接口(预留) + +### 健康检查 + +- `GET /healthz` - 存活检查 +- `GET /readyz` - 就绪检查 + +### 监控 + +- `GET /metrics` - Prometheus 指标 + +## 示例请求 + +### 质数判断 + +```bash +curl -X POST http://localhost:8000/invoke \ + -H "Content-Type: application/json" \ + -d '{"number": 17}' +``` + +响应: + +```json +{ + "request_id": "550e8400-e29b-41d4-a716-446655440000", + "status": "success", + "result": { + "number": 17, + "is_prime": true, + "factors": [], + "algorithm": "trial_division" + }, + "metadata": { + "algorithm": "PrimeChecker", + "version": "1.0.0", + "elapsed_time": 0.001 + } +} +``` + +## 项目结构 + +``` +FunctionalScaffold/ +├── src/functional_scaffold/ # 核心代码 +│ ├── algorithms/ # 算法实现 +│ ├── api/ # API 层 +│ ├── core/ # 核心功能 +│ ├── utils/ # 工具函数 +│ ├── config.py # 配置管理 +│ └── main.py # 应用入口 +├── tests/ # 测试 +├── deployment/ # 部署配置 +│ ├── Dockerfile +│ ├── docker-compose.yml +│ ├── kubernetes/ +│ └── serverless/ +├── monitoring/ # 监控配置 +├── scripts/ # 辅助脚本 +└── docs/ # 文档 +``` + +## 开发指南 + +### 添加新算法 + +1. 在 `src/functional_scaffold/algorithms/` 创建新算法文件 +2. 继承 `BaseAlgorithm` 类并实现 `process` 方法 +3. 
在 API 路由中注册新端点 + +示例: + +```python +from .base import BaseAlgorithm + +class MyAlgorithm(BaseAlgorithm): + def process(self, input_data): + # 实现算法逻辑 + result = do_something(input_data) + return {"result": result} +``` + +### 运行测试 + +```bash +# 运行所有测试 +pytest tests/ -v + +# 运行测试并生成覆盖率报告 +pytest tests/ --cov=src/functional_scaffold --cov-report=html + +# 使用脚本 +./scripts/run_tests.sh +``` + +### 代码质量 + +```bash +# 代码格式化 +black src/ tests/ + +# 代码检查 +ruff check src/ tests/ +``` + +### 导出 OpenAPI 规范 + +```bash +python scripts/export_openapi.py +``` + +生成的文件位于 `docs/swagger/openapi.json` + +## 部署 + +### Kubernetes + +```bash +kubectl apply -f deployment/kubernetes/ +``` + +### 阿里云函数计算 + +```bash +fun deploy -t deployment/serverless/aliyun-fc.yaml +``` + +### AWS Lambda + +```bash +sam deploy --template-file deployment/serverless/aws-lambda.yaml +``` + +## 监控 + +### Prometheus 指标 + +访问 `/metrics` 端点查看可用指标: + +- `http_requests_total` - HTTP 请求总数 +- `http_request_duration_seconds` - HTTP 请求延迟 +- `algorithm_executions_total` - 算法执行总数 +- `algorithm_execution_duration_seconds` - 算法执行延迟 + +### Grafana 仪表板 + +导入 `monitoring/grafana/dashboard.json` 到 Grafana + +## 配置 + +通过环境变量或 `.env` 文件配置: + +```bash +# 应用配置 +APP_NAME=FunctionalScaffold +APP_VERSION=1.0.0 +APP_ENV=development + +# 服务器配置 +HOST=0.0.0.0 +PORT=8000 +WORKERS=4 + +# 日志配置 +LOG_LEVEL=INFO +LOG_FORMAT=json + +# 指标配置 +METRICS_ENABLED=true +``` + +参考 `.env.example` 查看完整配置选项。 + +## 许可证 + +MIT License + +## 贡献 + +欢迎提交 Issue 和 Pull Request! diff --git a/deployment/Dockerfile b/deployment/Dockerfile new file mode 100644 index 0000000..bdca21d --- /dev/null +++ b/deployment/Dockerfile @@ -0,0 +1,31 @@ +FROM python:3.11-slim + +WORKDIR /app + +# 安装系统依赖 +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + && rm -rf /var/lib/apt/lists/* + +# 复制依赖文件 +COPY requirements.txt . 
+ +# 安装 Python 依赖 +RUN pip install --no-cache-dir -r requirements.txt + +# 复制应用代码 +COPY src/ ./src/ + +# 创建非 root 用户 +RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app +USER appuser + +# 暴露端口 +EXPOSE 8000 + +# 健康检查 +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/healthz')" + +# 启动命令 +CMD ["uvicorn", "src.functional_scaffold.main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/deployment/Dockerfile.redis-exporter b/deployment/Dockerfile.redis-exporter new file mode 100644 index 0000000..009b529 --- /dev/null +++ b/deployment/Dockerfile.redis-exporter @@ -0,0 +1,33 @@ +# Redis Exporter Dockerfile +FROM python:3.11-slim + +WORKDIR /app + +# 安装依赖 +COPY requirements.txt . +RUN pip install --no-cache-dir redis prometheus-client + +# 复制 exporter 代码 +COPY src/functional_scaffold/core/metrics_redis_exporter.py . + +# 暴露端口 +EXPOSE 8001 + +# 启动 HTTP 服务器提供指标 +CMD ["python", "-c", "\ +from http.server import HTTPServer, BaseHTTPRequestHandler; \ +from metrics_redis_exporter import get_metrics; \ +class MetricsHandler(BaseHTTPRequestHandler): \ + def do_GET(self): \ + if self.path == '/metrics': \ + self.send_response(200); \ + self.send_header('Content-Type', 'text/plain; version=0.0.4'); \ + self.end_headers(); \ + self.wfile.write(get_metrics()); \ + else: \ + self.send_response(404); \ + self.end_headers(); \ + def log_message(self, format, *args): pass; \ +server = HTTPServer(('0.0.0.0', 8001), MetricsHandler); \ +print('Redis Exporter 启动在端口 8001'); \ +server.serve_forever()"] diff --git a/deployment/docker-compose.yml b/deployment/docker-compose.yml new file mode 100644 index 0000000..8a94e58 --- /dev/null +++ b/deployment/docker-compose.yml @@ -0,0 +1,108 @@ +version: '3.8' + +services: + app: + build: + context: .. 
+ dockerfile: deployment/Dockerfile + ports: + - "8111:8000" + environment: + - APP_ENV=development + - LOG_LEVEL=INFO + - METRICS_ENABLED=true + # 方案1:Pushgateway 配置 + - PUSHGATEWAY_URL=pushgateway:9091 + - METRICS_JOB_NAME=functional_scaffold + # 方案2:Redis 配置 + - REDIS_HOST=redis + - REDIS_PORT=6379 + - REDIS_METRICS_DB=0 + volumes: + - ../src:/app/src + restart: unless-stopped + depends_on: + - redis + - pushgateway + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/healthz')"] + interval: 30s + timeout: 3s + retries: 3 + start_period: 5s + + # Redis - 用于集中式指标存储(方案2) + redis: + image: redis:7-alpine + ports: + - "6379:6379" + volumes: + - redis_data:/data + command: redis-server --appendonly yes + restart: unless-stopped + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 3s + retries: 3 + + # Pushgateway - 用于短生命周期任务的指标推送(方案1,推荐) + pushgateway: + image: prom/pushgateway:latest + ports: + - "9091:9091" + restart: unless-stopped + command: + - '--persistence.file=/data/pushgateway.data' + - '--persistence.interval=5m' + volumes: + - pushgateway_data:/data + + # Redis Exporter - 将 Redis 指标导出为 Prometheus 格式(方案2需要) + redis-exporter: + build: + context: .. 
+ dockerfile: deployment/Dockerfile.redis-exporter + ports: + - "8001:8001" + environment: + - REDIS_HOST=redis + - REDIS_PORT=6379 + - REDIS_METRICS_DB=0 + depends_on: + - redis + restart: unless-stopped + + prometheus: + image: prom/prometheus:latest + ports: + - "9090:9090" + volumes: + - ../monitoring/prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + restart: unless-stopped + depends_on: + - pushgateway + - redis-exporter + + grafana: + image: grafana/grafana:latest + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + volumes: + - grafana_data:/var/lib/grafana + - ../monitoring/grafana:/etc/grafana/provisioning + restart: unless-stopped + depends_on: + - prometheus + +volumes: + prometheus_data: + grafana_data: + redis_data: + pushgateway_data: diff --git a/deployment/kubernetes/deployment.yaml b/deployment/kubernetes/deployment.yaml new file mode 100644 index 0000000..7b3a8d6 --- /dev/null +++ b/deployment/kubernetes/deployment.yaml @@ -0,0 +1,53 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: functional-scaffold + labels: + app: functional-scaffold +spec: + replicas: 3 + selector: + matchLabels: + app: functional-scaffold + template: + metadata: + labels: + app: functional-scaffold + spec: + containers: + - name: functional-scaffold + image: functional-scaffold:latest + imagePullPolicy: IfNotPresent + ports: + - containerPort: 8000 + name: http + env: + - name: APP_ENV + value: "production" + - name: LOG_LEVEL + value: "INFO" + - name: METRICS_ENABLED + value: "true" + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" + livenessProbe: + httpGet: + path: /healthz + port: 8000 + initialDelaySeconds: 10 + periodSeconds: 30 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /readyz + port: 8000 + initialDelaySeconds: 5 + 
periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 diff --git a/deployment/kubernetes/service.yaml b/deployment/kubernetes/service.yaml new file mode 100644 index 0000000..e555c3f --- /dev/null +++ b/deployment/kubernetes/service.yaml @@ -0,0 +1,31 @@ +apiVersion: v1 +kind: Service +metadata: + name: functional-scaffold + labels: + app: functional-scaffold +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 8000 + protocol: TCP + name: http + selector: + app: functional-scaffold +--- +apiVersion: v1 +kind: Service +metadata: + name: functional-scaffold-metrics + labels: + app: functional-scaffold +spec: + type: ClusterIP + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP + name: metrics + selector: + app: functional-scaffold diff --git a/deployment/serverless/aliyun-fc.yaml b/deployment/serverless/aliyun-fc.yaml new file mode 100644 index 0000000..fb9f793 --- /dev/null +++ b/deployment/serverless/aliyun-fc.yaml @@ -0,0 +1,40 @@ +# 阿里云函数计算配置 +ROSTemplateFormatVersion: '2015-09-01' +Transform: 'Aliyun::Serverless-2018-04-03' +Resources: + functional-scaffold: + Type: 'Aliyun::Serverless::Service' + Properties: + Description: '算法工程化 Serverless 脚手架' + LogConfig: + Project: functional-scaffold-logs + Logstore: function-logs + VpcConfig: + VpcId: 'vpc-xxxxx' + VSwitchIds: + - 'vsw-xxxxx' + SecurityGroupId: 'sg-xxxxx' + prime-checker: + Type: 'Aliyun::Serverless::Function' + Properties: + Description: '质数判断算法服务' + Runtime: custom-container + MemorySize: 512 + Timeout: 60 + InstanceConcurrency: 10 + CAPort: 8000 + CustomContainerConfig: + Image: 'registry.cn-hangzhou.aliyuncs.com/your-namespace/functional-scaffold:latest' + Command: '["uvicorn", "src.functional_scaffold.main:app", "--host", "0.0.0.0", "--port", "8000"]' + EnvironmentVariables: + APP_ENV: production + LOG_LEVEL: INFO + METRICS_ENABLED: 'true' + Events: + httpTrigger: + Type: HTTP + Properties: + AuthType: ANONYMOUS + Methods: + - GET + - POST diff --git 
a/deployment/serverless/aws-lambda.yaml b/deployment/serverless/aws-lambda.yaml new file mode 100644 index 0000000..14bfbfe --- /dev/null +++ b/deployment/serverless/aws-lambda.yaml @@ -0,0 +1,46 @@ +# AWS Lambda 配置(使用 Lambda Container Image) +AWSTemplateFormatVersion: '2010-09-09' +Transform: AWS::Serverless-2016-10-31 +Description: FunctionalScaffold Serverless Application + +Globals: + Function: + Timeout: 60 + MemorySize: 512 + Environment: + Variables: + APP_ENV: production + LOG_LEVEL: INFO + METRICS_ENABLED: 'true' + +Resources: + FunctionalScaffoldFunction: + Type: AWS::Serverless::Function + Properties: + PackageType: Image + ImageUri: !Sub '${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/functional-scaffold:latest' + Events: + ApiEvent: + Type: Api + Properties: + Path: /{proxy+} + Method: ANY + Policies: + - AWSLambdaBasicExecutionRole + + FunctionalScaffoldApi: + Type: AWS::Serverless::Api + Properties: + StageName: prod + Cors: + AllowMethods: "'*'" + AllowHeaders: "'*'" + AllowOrigin: "'*'" + +Outputs: + ApiUrl: + Description: "API Gateway endpoint URL" + Value: !Sub "https://${FunctionalScaffoldApi}.execute-api.${AWS::Region}.amazonaws.com/prod/" + FunctionArn: + Description: "Function ARN" + Value: !GetAtt FunctionalScaffoldFunction.Arn diff --git a/docs/grafana-dashboard-guide.md b/docs/grafana-dashboard-guide.md new file mode 100644 index 0000000..a7295fd --- /dev/null +++ b/docs/grafana-dashboard-guide.md @@ -0,0 +1,237 @@ +# Grafana Dashboard 导入和使用指南 + +## Dashboard 概述 + +新的 dashboard 包含 10 个面板,全面展示应用的监控指标: + +### 第一行:核心性能指标 +1. **HTTP 请求速率 (QPS)** - 每秒请求数,按端点和方法分组 +2. **HTTP 请求延迟 (P50/P95/P99)** - 请求响应时间的百分位数 + +### 第二行:关键指标 +3. **请求成功率** - 成功请求占比(仪表盘) +4. **当前并发请求数** - 实时并发数(仪表盘) +5. **HTTP 请求总数** - 累计请求数(统计卡片) +6. **算法执行总数** - 累计算法调用数(统计卡片) + +### 第三行:算法性能 +7. **算法执行速率** - 每秒算法执行次数 +8. **算法执行延迟 (P50/P95/P99)** - 算法执行时间的百分位数 + +### 第四行:分布分析 +9. **请求分布(按端点)** - 饼图展示各端点的请求占比 +10. **请求状态分布** - 饼图展示成功/失败请求占比 + +## 导入步骤 + +### 1. 
配置 Prometheus 数据源 + +首先确保 Prometheus 数据源已正确配置: + +1. 打开 Grafana:http://localhost:3000 +2. 登录(默认:admin/admin) +3. 进入 **Configuration** → **Data Sources** +4. 点击 **Add data source** +5. 选择 **Prometheus** +6. 配置: + - **Name**: `Prometheus`(必须是这个名称) + - **URL**: `http://prometheus:9090`(注意:使用服务名,不是 localhost) + - **Access**: Server (default) +7. 点击 **Save & Test**,确保显示绿色的成功提示 + +### 2. 导入 Dashboard + +有两种方式导入 dashboard: + +#### 方式 1:通过 JSON 文件导入(推荐) + +1. 在 Grafana 左侧菜单,点击 **Dashboards** → **Import** +2. 点击 **Upload JSON file** +3. 选择文件:`monitoring/grafana/dashboard.json` +4. 在导入页面: + - **Name**: FunctionalScaffold 监控仪表板 + - **Folder**: General(或创建新文件夹) + - **Prometheus**: 选择刚才配置的 Prometheus 数据源 +5. 点击 **Import** + +#### 方式 2:通过 JSON 内容导入 + +1. 在 Grafana 左侧菜单,点击 **Dashboards** → **Import** +2. 复制 `monitoring/grafana/dashboard.json` 的全部内容 +3. 粘贴到 **Import via panel json** 文本框 +4. 点击 **Load** +5. 配置数据源并点击 **Import** + +### 3. 验证 Dashboard + +导入成功后,你应该看到: + +- ✅ 所有面板都正常显示 +- ✅ 有数据的面板显示图表和数值 +- ✅ 右上角显示自动刷新(5秒) +- ✅ 时间范围默认为最近 1 小时 + +## 生成测试数据 + +如果 dashboard 中没有数据或数据很少,运行流量生成脚本: + +```bash +# 启动流量生成器 +./scripts/generate_traffic.sh +``` + +这会持续发送请求到应用,生成监控数据。等待 1-2 分钟后,dashboard 中应该会显示丰富的图表。 + +## Dashboard 功能 + +### 自动刷新 + +Dashboard 配置了自动刷新,默认每 5 秒更新一次。你可以在右上角修改刷新间隔: +- 5s(默认) +- 10s +- 30s +- 1m +- 5m + +### 时间范围 + +默认显示最近 1 小时的数据。你可以在右上角修改时间范围: +- Last 5 minutes +- Last 15 minutes +- Last 30 minutes +- Last 1 hour(默认) +- Last 3 hours +- Last 6 hours +- Last 12 hours +- Last 24 hours +- 或自定义时间范围 + +### 实时模式 + +Dashboard 启用了 **Live** 模式(右上角的 Live 按钮),可以实时查看最新数据。 + +### 交互功能 + +- **缩放**:在时间序列图表上拖动选择区域可以放大 +- **图例点击**:点击图例可以隐藏/显示对应的数据系列 +- **Tooltip**:鼠标悬停在图表上查看详细数值 +- **面板全屏**:点击面板标题旁的图标可以全屏查看 + +## 常见问题 + +### 问题 1:数据源连接失败 + +**错误信息**:`dial tcp [::1]:9090: connect: connection refused` + +**解决方案**: +- 确保 Prometheus URL 使用 `http://prometheus:9090`(服务名) +- 不要使用 `http://localhost:9090`(在容器内部无法访问) + +### 问题 2:面板显示 "No data" + +**可能原因**: +1. 应用还没有收到任何请求 +2. 
Prometheus 还没有抓取到数据 +3. 时间范围选择不当 + +**解决方案**: +1. 发送一些测试请求: + ```bash + curl -X POST http://localhost:8111/invoke \ + -H "Content-Type: application/json" \ + -d '{"number": 17}' + ``` +2. 等待 15-30 秒让 Prometheus 抓取数据 +3. 调整时间范围为 "Last 5 minutes" +4. 运行流量生成脚本:`./scripts/generate_traffic.sh` + +### 问题 3:延迟图表显示 "NaN" 或空值 + +**原因**:直方图数据不足,无法计算百分位数 + +**解决方案**: +- 发送更多请求以积累足够的数据 +- 等待几分钟让数据积累 +- 使用流量生成脚本持续发送请求 + +### 问题 4:数据源变量未正确设置 + +**错误信息**:面板显示 "Datasource not found" + +**解决方案**: +1. 确保 Prometheus 数据源的名称是 `Prometheus` +2. 或者在 dashboard 设置中重新选择数据源: + - 点击右上角的齿轮图标(Dashboard settings) + - 进入 **Variables** 标签 + - 编辑 `DS_PROMETHEUS` 变量 + - 选择正确的 Prometheus 数据源 + +## PromQL 查询说明 + +Dashboard 使用的主要 PromQL 查询: + +### HTTP 请求速率 +```promql +sum(rate(http_requests_total[1m])) by (endpoint, method) +``` + +### HTTP 请求延迟 P95 +```promql +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[1m])) by (le, endpoint, method)) +``` + +### 请求成功率 +```promql +sum(rate(http_requests_total{status="success"}[5m])) / sum(rate(http_requests_total[5m])) +``` + +### 算法执行速率 +```promql +sum(rate(algorithm_executions_total[1m])) by (algorithm, status) +``` + +## 自定义 Dashboard + +你可以根据需要自定义 dashboard: + +1. **添加新面板**:点击右上角的 "Add panel" 按钮 +2. **编辑面板**:点击面板标题 → Edit +3. **调整布局**:拖动面板调整位置和大小 +4. **保存更改**:点击右上角的保存图标 + +## 导出和分享 + +### 导出 Dashboard + +1. 点击右上角的分享图标 +2. 选择 **Export** 标签 +3. 点击 **Save to file** 下载 JSON 文件 + +### 分享 Dashboard + +1. 点击右上角的分享图标 +2. 选择 **Link** 标签 +3. 复制链接分享给团队成员 + +## 告警配置(可选) + +你可以为面板配置告警规则: + +1. 编辑面板 +2. 切换到 **Alert** 标签 +3. 点击 **Create alert rule from this panel** +4. 配置告警条件和通知渠道 + +## 相关资源 + +- Grafana 官方文档:https://grafana.com/docs/ +- Prometheus 查询语言:https://prometheus.io/docs/prometheus/latest/querying/basics/ +- Dashboard 最佳实践:https://grafana.com/docs/grafana/latest/best-practices/ + +## 技术支持 + +如果遇到问题: +1. 检查 Prometheus 是否正常运行:http://localhost:9090 +2. 检查应用 metrics 端点:http://localhost:8111/metrics +3. 
查看 Grafana 日志:`docker-compose logs grafana` +4. 查看 Prometheus 日志:`docker-compose logs prometheus` diff --git a/docs/metrics-guide.md b/docs/metrics-guide.md new file mode 100644 index 0000000..2eb6d25 --- /dev/null +++ b/docs/metrics-guide.md @@ -0,0 +1,346 @@ +# 指标记录方案对比与使用指南 + +## 问题背景 + +在多实例部署场景下(Kubernetes、Serverless),原有的内存指标存储方案存在以下问题: + +1. **指标分散**:每个实例独立记录指标,无法聚合 +2. **数据丢失**:实例销毁后指标丢失 +3. **统计不准**:无法获得全局准确的指标视图 + +## 解决方案对比 + +### 方案1:Pushgateway(推荐) + +**原理:** 应用主动推送指标到 Pushgateway,Prometheus 从 Pushgateway 抓取 + +**优点:** +- ✅ Prometheus 官方支持,生态成熟 +- ✅ 实现简单,代码改动小 +- ✅ 适合短生命周期任务(Serverless、批处理) +- ✅ 支持持久化,重启不丢失数据 + +**缺点:** +- ⚠️ 单点故障风险(可通过高可用部署解决) +- ⚠️ 不适合超高频推送(每秒数千次) + +**适用场景:** +- Serverless 函数 +- 批处理任务 +- 短生命周期容器 +- 实例数量动态变化的场景 + +### 方案2:Redis + 自定义 Exporter + +**原理:** 应用将指标写入 Redis,自定义 Exporter 从 Redis 读取并转换为 Prometheus 格式 + +**优点:** +- ✅ 灵活可控,支持复杂聚合逻辑 +- ✅ Redis 高性能,支持高并发写入 +- ✅ 可以实现自定义的指标计算 + +**缺点:** +- ⚠️ 需要自己实现 Exporter,维护成本高 +- ⚠️ 增加了系统复杂度 +- ⚠️ Redis 需要额外的运维成本 + +**适用场景:** +- 需要自定义指标聚合逻辑 +- 超高频指标写入(每秒数万次) +- 需要实时查询指标数据 + +### 方案3:标准 Prometheus Pull 模式(不推荐) + +**原理:** Prometheus 从每个实例抓取指标,在查询时聚合 + +**优点:** +- ✅ Prometheus 标准做法 +- ✅ 无需额外组件 + +**缺点:** +- ❌ 需要服务发现机制(Kubernetes Service Discovery) +- ❌ 短生命周期实例可能来不及抓取 +- ❌ 实例销毁后数据丢失 + +**适用场景:** +- 长生命周期服务 +- 实例数量相对固定 +- 有完善的服务发现机制 + +## 使用指南 + +### 方案1:Pushgateway(推荐) + +#### 1. 启动服务 + +```bash +cd deployment +docker-compose up -d redis pushgateway prometheus grafana +``` + +#### 2. 修改代码 + +在 `src/functional_scaffold/api/routes.py` 中: + +```python +# 替换导入 +from functional_scaffold.core.metrics_pushgateway import ( + track_request, + track_algorithm_execution, +) + +# 使用方式不变 +@router.post("/invoke") +@track_request("POST", "/invoke") +async def invoke_algorithm(request: InvokeRequest): + # ... 业务逻辑 +``` + +#### 3. 配置环境变量 + +在 `.env` 文件中: + +```bash +PUSHGATEWAY_URL=localhost:9091 +METRICS_JOB_NAME=functional_scaffold +INSTANCE_ID=instance-1 # 可选,默认使用 HOSTNAME +``` + +#### 4. 
验证 + +```bash +# 查看 Pushgateway 指标 +curl http://localhost:9091/metrics + +# 查看 Prometheus +open http://localhost:9090 + +# 查询示例 +http_requests_total{job="functional_scaffold"} +``` + +### 方案2:Redis + Exporter + +#### 1. 启动服务 + +```bash +cd deployment +docker-compose up -d redis redis-exporter prometheus grafana +``` + +#### 2. 修改代码 + +在 `src/functional_scaffold/api/routes.py` 中: + +```python +# 替换导入 +from functional_scaffold.core.metrics_redis import ( + track_request, + track_algorithm_execution, +) + +# 使用方式不变 +@router.post("/invoke") +@track_request("POST", "/invoke") +async def invoke_algorithm(request: InvokeRequest): + # ... 业务逻辑 +``` + +#### 3. 配置环境变量 + +在 `.env` 文件中: + +```bash +REDIS_HOST=localhost +REDIS_PORT=6379 +REDIS_METRICS_DB=0 +REDIS_PASSWORD= # 可选 +INSTANCE_ID=instance-1 # 可选 +``` + +#### 4. 安装 Redis 依赖 + +```bash +pip install redis +``` + +或在 `requirements.txt` 中添加: + +``` +redis>=5.0.0 +``` + +#### 5. 验证 + +```bash +# 查看 Redis 中的指标 +redis-cli +> HGETALL metrics:request_counter + +# 查看 Exporter 输出 +curl http://localhost:8001/metrics + +# 查看 Prometheus +open http://localhost:9090 +``` + +## 性能对比 + +| 指标 | Pushgateway | Redis + Exporter | 标准 Pull | +|------|-------------|------------------|-----------| +| 写入延迟 | ~5ms | ~1ms | N/A | +| 查询延迟 | ~10ms | ~20ms | ~5ms | +| 吞吐量 | ~1000 req/s | ~10000 req/s | ~500 req/s | +| 内存占用 | 低 | 中 | 低 | +| 复杂度 | 低 | 高 | 低 | + +## 迁移步骤 + +### 从原有方案迁移到 Pushgateway + +1. **安装依赖**(如果需要): + ```bash + pip install prometheus-client + ``` + +2. **替换导入**: + ```python + # 旧代码 + from functional_scaffold.core.metrics import track_request + + # 新代码 + from functional_scaffold.core.metrics_pushgateway import track_request + ``` + +3. **配置环境变量**: + ```bash + export PUSHGATEWAY_URL=localhost:9091 + ``` + +4. **启动 Pushgateway**: + ```bash + docker-compose up -d pushgateway + ``` + +5. **更新 Prometheus 配置**(已包含在 `monitoring/prometheus.yml`) + +6. 
**测试验证**: + ```bash + # 发送请求 + curl -X POST http://localhost:8000/invoke -d '{"number": 17}' + + # 查看指标 + curl http://localhost:9091/metrics | grep http_requests_total + ``` + +### 从原有方案迁移到 Redis + +1. **安装依赖**: + ```bash + pip install redis + ``` + +2. **替换导入**: + ```python + # 旧代码 + from functional_scaffold.core.metrics import track_request + + # 新代码 + from functional_scaffold.core.metrics_redis import track_request + ``` + +3. **配置环境变量**: + ```bash + export REDIS_HOST=localhost + export REDIS_PORT=6379 + ``` + +4. **启动 Redis 和 Exporter**: + ```bash + docker-compose up -d redis redis-exporter + ``` + +5. **测试验证**: + ```bash + # 发送请求 + curl -X POST http://localhost:8000/invoke -d '{"number": 17}' + + # 查看 Redis + redis-cli HGETALL metrics:request_counter + + # 查看 Exporter + curl http://localhost:8001/metrics + ``` + +## 常见问题 + +### Q1: Pushgateway 会成为单点故障吗? + +A: 可以通过以下方式解决: +- 部署多个 Pushgateway 实例(负载均衡) +- 使用持久化存储(已配置) +- 推送失败时降级到本地日志 + +### Q2: Redis 方案的性能如何? + +A: Redis 单实例可以支持 10万+ QPS,对于大多数场景足够。如果需要更高性能,可以: +- 使用 Redis Cluster +- 批量写入(减少网络往返) +- 使用 Pipeline + +### Q3: 如何在 Kubernetes 中使用? + +A: +- **Pushgateway**: 部署为 Service,应用通过 Service 名称访问 +- **Redis**: 使用 StatefulSet 或托管 Redis 服务 + +### Q4: 指标数据会丢失吗? + +A: +- **Pushgateway**: 支持持久化,重启不丢失 +- **Redis**: 配置了 AOF 持久化,重启不丢失 +- **标准 Pull**: 实例销毁后丢失 + +### Q5: 如何选择方案? 
+ +建议: +- **Serverless/短生命周期** → Pushgateway +- **超高并发/自定义逻辑** → Redis +- **长生命周期/K8s** → 标准 Pull(需配置服务发现) + +## 监控和告警 + +### Grafana 仪表板 + +访问 http://localhost:3000(admin/admin) + +已预配置的面板: +- HTTP 请求总数 +- HTTP 请求延迟(P50/P95/P99) +- 算法执行次数 +- 算法执行延迟 +- 错误率 + +### 告警规则 + +在 `monitoring/alerts/rules.yaml` 中配置: + +```yaml +groups: + - name: functional_scaffold + rules: + - alert: HighErrorRate + expr: rate(http_requests_total{status="error"}[5m]) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "高错误率告警" + description: "错误率超过 5%" +``` + +## 参考资料 + +- [Prometheus Pushgateway 文档](https://github.com/prometheus/pushgateway) +- [Prometheus 最佳实践](https://prometheus.io/docs/practices/) +- [Redis 官方文档](https://redis.io/documentation) diff --git a/docs/metrics-improvement-summary.md b/docs/metrics-improvement-summary.md new file mode 100644 index 0000000..87872fa --- /dev/null +++ b/docs/metrics-improvement-summary.md @@ -0,0 +1,227 @@ +# Prometheus 指标记录问题修复总结 + +## 问题描述 + +Prometheus 中没有正常记录应用的访问数据。虽然 `/metrics` 端点可以访问,并且定义了所有指标类型,但这些指标都没有任何数据值。 + +## 根本原因 + +1. **HTTP 请求指标未记录**:`api/routes.py` 中的路由处理函数没有使用 `@track_request` 装饰器来记录 HTTP 请求指标 +2. **算法执行指标未记录**:`algorithms/base.py` 中的 `execute()` 方法没有调用 metrics 模块来记录算法执行指标 + +## 解决方案 + +### 1. 
添加 HTTP 请求指标跟踪中间件 + +**文件**:`src/functional_scaffold/main.py` + +**修改内容**: +- 导入 metrics 相关的对象:`request_counter`, `request_latency`, `in_progress_requests` +- 添加 `track_metrics` 中间件,自动跟踪所有 HTTP 请求 + +**优点**: +- 自动化:不需要在每个路由上手动添加装饰器 +- 统一:所有端点的指标记录逻辑一致 +- 易维护:新增端点自动获得指标跟踪能力 + +**实现代码**: +```python +@app.middleware("http") +async def track_metrics(request: Request, call_next): + """记录所有HTTP请求的指标""" + if not settings.metrics_enabled: + return await call_next(request) + + # 跳过 /metrics 端点本身,避免循环记录 + if request.url.path == "/metrics": + return await call_next(request) + + in_progress_requests.inc() + start_time = time.time() + status = "success" + + try: + response = await call_next(request) + if response.status_code >= 400: + status = "error" + return response + except Exception as e: + status = "error" + raise e + finally: + elapsed = time.time() - start_time + request_counter.labels( + method=request.method, + endpoint=request.url.path, + status=status + ).inc() + request_latency.labels( + method=request.method, + endpoint=request.url.path + ).observe(elapsed) + in_progress_requests.dec() +``` + +### 2. 添加算法执行指标记录 + +**文件**:`src/functional_scaffold/algorithms/base.py` + +**修改内容**: +- 在 `execute()` 方法中导入 `algorithm_counter` 和 `algorithm_latency` +- 在 `finally` 块中记录算法执行指标 + +**实现代码**: +```python +def execute(self, *args, **kwargs) -> Dict[str, Any]: + from ..core.metrics import algorithm_counter, algorithm_latency + + start_time = time.time() + status = "success" + + try: + # ... 算法执行逻辑 ... + except Exception as e: + status = "error" + # ... 错误处理 ... + finally: + elapsed_time = time.time() - start_time + algorithm_counter.labels(algorithm=self.name, status=status).inc() + algorithm_latency.labels(algorithm=self.name).observe(elapsed_time) +``` + +## 验证结果 + +### 1. 
应用 /metrics 端点 + +修复后,`/metrics` 端点正常返回指标数据: + +``` +# HTTP 请求指标 +http_requests_total{endpoint="/healthz",method="GET",status="success"} 3.0 +http_requests_total{endpoint="/invoke",method="POST",status="success"} 2.0 +http_requests_total{endpoint="/readyz",method="GET",status="success"} 1.0 + +# HTTP 请求延迟 +http_request_duration_seconds_sum{endpoint="/invoke",method="POST"} 0.0065615177154541016 +http_request_duration_seconds_count{endpoint="/invoke",method="POST"} 2.0 + +# 算法执行指标 +algorithm_executions_total{algorithm="PrimeChecker",status="success"} 2.0 +algorithm_execution_duration_seconds_sum{algorithm="PrimeChecker"} 0.00023603439331054688 +algorithm_execution_duration_seconds_count{algorithm="PrimeChecker"} 2.0 + +# 当前进行中的请求 +http_requests_in_progress 0.0 +``` + +### 2. Prometheus 查询 + +Prometheus 成功抓取并存储了指标数据: + +```bash +# 查询 HTTP 请求总数 +curl 'http://localhost:9090/api/v1/query?query=http_requests_total' + +# 查询算法执行总数 +curl 'http://localhost:9090/api/v1/query?query=algorithm_executions_total' +``` + +## 可用指标 + +修复后,以下指标可以在 Prometheus 和 Grafana 中使用: + +### HTTP 请求指标 + +1. **http_requests_total** (Counter) + - 标签:`method`, `endpoint`, `status` + - 描述:HTTP 请求总数 + - 用途:统计各端点的请求量、成功率 + +2. **http_request_duration_seconds** (Histogram) + - 标签:`method`, `endpoint` + - 描述:HTTP 请求延迟分布 + - 用途:分析请求响应时间、P50/P95/P99 延迟 + +3. **http_requests_in_progress** (Gauge) + - 描述:当前正在处理的请求数 + - 用途:监控并发请求数、负载情况 + +### 算法执行指标 + +1. **algorithm_executions_total** (Counter) + - 标签:`algorithm`, `status` + - 描述:算法执行总数 + - 用途:统计算法调用量、成功率 + +2. 
**algorithm_execution_duration_seconds** (Histogram) + - 标签:`algorithm` + - 描述:算法执行延迟分布 + - 用途:分析算法性能、优化瓶颈 + +## 使用示例 + +### Prometheus 查询示例 + +```promql +# 每秒请求数 (QPS) +rate(http_requests_total[5m]) + +# 请求成功率 +sum(rate(http_requests_total{status="success"}[5m])) / sum(rate(http_requests_total[5m])) + +# P95 延迟 +histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) + +# 算法执行失败率 +sum(rate(algorithm_executions_total{status="error"}[5m])) / sum(rate(algorithm_executions_total[5m])) +``` + +### 生成测试流量 + +使用提供的脚本生成测试流量: + +```bash +# 启动流量生成器 +./scripts/generate_traffic.sh + +# 在另一个终端查看实时指标 +watch -n 1 'curl -s http://localhost:8111/metrics | grep http_requests_total' +``` + +## Grafana 仪表板 + +访问 Grafana 查看可视化指标: + +1. 打开浏览器访问:http://localhost:3000 +2. 登录(默认用户名/密码:admin/admin) +3. 导入仪表板:`monitoring/grafana/dashboard.json` + +仪表板包含以下面板: +- 请求速率(QPS) +- 请求延迟(P50/P95/P99) +- 错误率 +- 算法执行统计 +- 并发请求数 + +## 注意事项 + +1. **中间件顺序**:指标跟踪中间件应该在日志中间件之后注册,确保所有请求都被记录 +2. **/metrics 端点**:中间件会跳过 `/metrics` 端点本身,避免循环记录 +3. **错误状态**:HTTP 状态码 >= 400 会被标记为 `status="error"` +4. **性能影响**:指标记录的性能开销极小(微秒级),不会影响应用性能 + +## 后续优化建议 + +1. **添加更多维度**:可以添加 `user_id`、`region` 等标签进行更细粒度的分析 +2. **自定义指标**:根据业务需求添加自定义指标(如缓存命中率、外部 API 调用次数等) +3. **告警规则**:配置 Prometheus 告警规则,在指标异常时发送通知 +4. 
**长期存储**:考虑使用 Thanos 或 Cortex 进行长期指标存储和查询 + +## 相关文件 + +- `src/functional_scaffold/main.py` - HTTP 请求指标跟踪中间件 +- `src/functional_scaffold/algorithms/base.py` - 算法执行指标记录 +- `src/functional_scaffold/core/metrics.py` - 指标定义 +- `monitoring/prometheus.yml` - Prometheus 配置 +- `monitoring/grafana/dashboard.json` - Grafana 仪表板 +- `scripts/generate_traffic.sh` - 流量生成脚本 diff --git a/docs/swagger/README.md b/docs/swagger/README.md new file mode 100644 index 0000000..e2980c5 --- /dev/null +++ b/docs/swagger/README.md @@ -0,0 +1,107 @@ +# Swagger 文档 + +本目录包含自动生成的 OpenAPI 规范文档。 + +## 生成文档 + +运行以下命令生成或更新 OpenAPI 规范: + +```bash +python scripts/export_openapi.py +``` + +这将生成 `openapi.json` 文件,包含完整的 API 规范。 + +## 查看文档 + +### 在线查看 + +启动应用后,访问以下 URL: + +- **Swagger UI**: http://localhost:8000/docs +- **ReDoc**: http://localhost:8000/redoc + +### 离线查看 + +使用 Swagger Editor 或其他 OpenAPI 工具打开 `openapi.json` 文件。 + +## API 规范 + +### 端点列表 + +#### 算法接口 + +- `POST /invoke` - 同步调用算法 + - 请求体: `{"number": integer}` + - 响应: 算法执行结果 + +- `POST /jobs` - 异步任务接口(预留) + - 当前返回 501 Not Implemented + +#### 健康检查 + +- `GET /healthz` - 存活检查 + - 响应: `{"status": "healthy", "timestamp": float}` + +- `GET /readyz` - 就绪检查 + - 响应: `{"status": "ready", "timestamp": float, "checks": {...}}` + +#### 监控 + +- `GET /metrics` - Prometheus 指标 + - 响应: Prometheus 文本格式 + +### 数据模型 + +#### InvokeRequest + +```json +{ + "number": 17 +} +``` + +#### InvokeResponse + +```json +{ + "request_id": "uuid", + "status": "success", + "result": { + "number": 17, + "is_prime": true, + "factors": [], + "algorithm": "trial_division" + }, + "metadata": { + "algorithm": "PrimeChecker", + "version": "1.0.0", + "elapsed_time": 0.001 + } +} +``` + +#### ErrorResponse + +```json +{ + "error": "ERROR_CODE", + "message": "Error description", + "details": {}, + "request_id": "uuid" +} +``` + +## 更新文档 + +当修改 API 接口后,需要重新生成文档: + +1. 修改代码(路由、模型等) +2. 运行 `python scripts/export_openapi.py` +3. 
提交更新后的 `openapi.json` + +## 注意事项 + +- `openapi.json` 是自动生成的,不要手动编辑 +- 所有 API 变更都应该在代码中完成,然后重新生成文档 +- 确保 Pydantic 模型包含完整的文档字符串和示例 diff --git a/docs/swagger/openapi.json b/docs/swagger/openapi.json new file mode 100644 index 0000000..f4da82b --- /dev/null +++ b/docs/swagger/openapi.json @@ -0,0 +1,404 @@ +{ + "openapi": "3.1.0", + "info": { + "title": "FunctionalScaffold", + "description": "算法工程化 Serverless 脚手架 - 提供标准化的算法服务接口", + "version": "1.0.0" + }, + "paths": { + "/invoke": { + "post": { + "tags": [ + "Algorithm" + ], + "summary": "同步调用算法", + "description": "同步调用质数判断算法,立即返回结果", + "operationId": "invoke_algorithm_invoke_post", + "parameters": [ + { + "name": "x-request-id", + "in": "header", + "required": false, + "schema": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "X-Request-Id" + } + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InvokeRequest" + } + } + } + }, + "responses": { + "200": { + "description": "成功", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InvokeResponse" + } + } + } + }, + "400": { + "description": "请求参数错误", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "500": { + "description": "服务器内部错误", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/healthz": { + "get": { + "tags": [ + "Algorithm" + ], + "summary": "健康检查", + "description": "检查服务是否存活", + "operationId": "health_check_healthz_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HealthResponse" + } + } + 
} + } + } + } + }, + "/readyz": { + "get": { + "tags": [ + "Algorithm" + ], + "summary": "就绪检查", + "description": "检查服务是否就绪", + "operationId": "readiness_check_readyz_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ReadinessResponse" + } + } + } + } + } + } + }, + "/jobs": { + "post": { + "tags": [ + "Algorithm" + ], + "summary": "异步任务接口(预留)", + "description": "异步任务接口,当前版本未实现", + "operationId": "create_job_jobs_post", + "responses": { + "501": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/metrics": { + "get": { + "tags": [ + "Monitoring" + ], + "summary": "Prometheus 指标", + "description": "导出 Prometheus 格式的监控指标", + "operationId": "metrics_metrics_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + } + }, + "components": { + "schemas": { + "ErrorResponse": { + "properties": { + "error": { + "type": "string", + "title": "Error", + "description": "错误代码" + }, + "message": { + "type": "string", + "title": "Message", + "description": "错误消息" + }, + "details": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Details", + "description": "错误详情" + }, + "request_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Request Id", + "description": "请求ID" + } + }, + "type": "object", + "required": [ + "error", + "message" + ], + "title": "ErrorResponse", + "description": "错误响应", + "example": { + "details": { + "field": "number", + "value": "abc" + }, + "error": "VALIDATION_ERROR", + "message": "number must be an integer", + "request_id": "550e8400-e29b-41d4-a716-446655440000" + } + }, + "HTTPValidationError": { + "properties": { + "detail": { + "items": { + "$ref": 
"#/components/schemas/ValidationError" + }, + "type": "array", + "title": "Detail" + } + }, + "type": "object", + "title": "HTTPValidationError" + }, + "HealthResponse": { + "properties": { + "status": { + "type": "string", + "title": "Status", + "description": "健康状态" + }, + "timestamp": { + "type": "number", + "title": "Timestamp", + "description": "时间戳" + } + }, + "type": "object", + "required": [ + "status", + "timestamp" + ], + "title": "HealthResponse", + "description": "健康检查响应" + }, + "InvokeRequest": { + "properties": { + "number": { + "type": "integer", + "title": "Number", + "description": "待判断的整数" + } + }, + "type": "object", + "required": [ + "number" + ], + "title": "InvokeRequest", + "description": "同步调用请求", + "example": { + "number": 17 + } + }, + "InvokeResponse": { + "properties": { + "request_id": { + "type": "string", + "title": "Request Id", + "description": "请求唯一标识" + }, + "status": { + "type": "string", + "title": "Status", + "description": "处理状态" + }, + "result": { + "additionalProperties": true, + "type": "object", + "title": "Result", + "description": "算法执行结果" + }, + "metadata": { + "additionalProperties": true, + "type": "object", + "title": "Metadata", + "description": "元数据信息" + } + }, + "type": "object", + "required": [ + "request_id", + "status", + "result", + "metadata" + ], + "title": "InvokeResponse", + "description": "同步调用响应", + "example": { + "metadata": { + "algorithm": "PrimeChecker", + "elapsed_time": 0.001, + "version": "1.0.0" + }, + "request_id": "550e8400-e29b-41d4-a716-446655440000", + "result": { + "algorithm": "trial_division", + "factors": [], + "is_prime": true, + "number": 17 + }, + "status": "success" + } + }, + "ReadinessResponse": { + "properties": { + "status": { + "type": "string", + "title": "Status", + "description": "就绪状态" + }, + "timestamp": { + "type": "number", + "title": "Timestamp", + "description": "时间戳" + }, + "checks": { + "anyOf": [ + { + "additionalProperties": { + "type": "boolean" + }, + "type": 
"object" + }, + { + "type": "null" + } + ], + "title": "Checks", + "description": "各项检查结果" + } + }, + "type": "object", + "required": [ + "status", + "timestamp" + ], + "title": "ReadinessResponse", + "description": "就绪检查响应" + }, + "ValidationError": { + "properties": { + "loc": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + }, + "type": "array", + "title": "Location" + }, + "msg": { + "type": "string", + "title": "Message" + }, + "type": { + "type": "string", + "title": "Error Type" + } + }, + "type": "object", + "required": [ + "loc", + "msg", + "type" + ], + "title": "ValidationError" + } + } + } +} \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..b56c695 --- /dev/null +++ b/main.py @@ -0,0 +1,16 @@ +# 这是一个示例 Python 脚本。 + +# 按 ⌃R 执行或将其替换为您的代码。 +# 按 双击 ⇧ 在所有地方搜索类、文件、工具窗口、操作和设置。 + + +def print_hi(name): + # 在下面的代码行中使用断点来调试脚本。 + print(f'Hi, {name}') # 按 ⌘F8 切换断点。 + + +# 按装订区域中的绿色按钮以运行脚本。 +if __name__ == '__main__': + print_hi('PyCharm') + +# 访问 https://www.jetbrains.com/help/pycharm/ 获取 PyCharm 帮助 diff --git a/monitoring/alerts/rules.yaml b/monitoring/alerts/rules.yaml new file mode 100644 index 0000000..d1e7a67 --- /dev/null +++ b/monitoring/alerts/rules.yaml @@ -0,0 +1,39 @@ +groups: + - name: functional_scaffold_alerts + interval: 30s + rules: + - alert: HighErrorRate + expr: rate(http_requests_total{status="error"}[5m]) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "High error rate detected" + description: "Error rate is {{ $value }} requests/sec for {{ $labels.endpoint }}" + + - alert: HighLatency + expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "High latency detected" + description: "P95 latency is {{ $value }}s for {{ $labels.endpoint }}" + + - alert: ServiceDown + expr: up{job="functional-scaffold"} == 0 + for: 1m + labels: + severity: 
critical + annotations: + summary: "Service is down" + description: "FunctionalScaffold service has been down for more than 1 minute" + + - alert: HighMemoryUsage + expr: container_memory_usage_bytes{container="functional-scaffold"} / container_spec_memory_limit_bytes{container="functional-scaffold"} > 0.9 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage" + description: "Memory usage is {{ $value | humanizePercentage }} of limit" diff --git a/monitoring/grafana/dashboard.json b/monitoring/grafana/dashboard.json new file mode 100644 index 0000000..9a5716d --- /dev/null +++ b/monitoring/grafana/dashboard.json @@ -0,0 +1,808 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": true, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "请求/秒", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + 
"y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["mean", "lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(rate(http_requests_total[1m])) by (endpoint, method)", + "legendFormat": "{{method}} {{endpoint}}", + "refId": "A" + } + ], + "title": "HTTP 请求速率 (QPS)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "延迟", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.1 + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[1m])) by (le, endpoint, method))", + "legendFormat": "P50 - {{method}} 
{{endpoint}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[1m])) by (le, endpoint, method))", + "legendFormat": "P95 - {{method}} {{endpoint}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[1m])) by (le, endpoint, method))", + "legendFormat": "P99 - {{method}} {{endpoint}}", + "refId": "C" + } + ], + "title": "HTTP 请求延迟 (P50/P95/P99)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 0.95 + }, + { + "color": "green", + "value": 0.99 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(rate(http_requests_total{status=\"success\"}[5m])) / sum(rate(http_requests_total[5m]))", + "refId": "A" + } + ], + "title": "请求成功率", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + 
] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 8 + }, + "id": 4, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "http_requests_in_progress", + "refId": "A" + } + ], + "title": "当前并发请求数", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 8 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(http_requests_total)", + "refId": "A" + } + ], + "title": "HTTP 请求总数", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 8 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + 
"textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(algorithm_executions_total)", + "refId": "A" + } + ], + "title": "算法执行总数", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "执行/秒", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": ["mean", "lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(rate(algorithm_executions_total[1m])) by (algorithm, status)", + "legendFormat": "{{algorithm}} - {{status}}", + "refId": "A" + } + ], + "title": "算法执行速率", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "延迟", + "axisPlacement": "auto", + 
"barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "histogram_quantile(0.50, sum(rate(algorithm_execution_duration_seconds_bucket[1m])) by (le, algorithm))", + "legendFormat": "P50 - {{algorithm}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "histogram_quantile(0.95, sum(rate(algorithm_execution_duration_seconds_bucket[1m])) by (le, algorithm))", + "legendFormat": "P95 - {{algorithm}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "histogram_quantile(0.99, sum(rate(algorithm_execution_duration_seconds_bucket[1m])) by (le, algorithm))", + "legendFormat": "P99 - {{algorithm}}", + "refId": "C" + } + ], + "title": "算法执行延迟 (P50/P95/P99)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + } + }, + "mappings": [] + }, + 
"overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": ["value"] + }, + "pieType": "pie", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(http_requests_total) by (endpoint)", + "legendFormat": "{{endpoint}}", + "refId": "A" + } + ], + "title": "请求分布(按端点)", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 10, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": ["value"] + }, + "pieType": "pie", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(http_requests_total) by (status)", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "请求状态分布", + "type": "piechart" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["functional-scaffold", "monitoring"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "数据源", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": ["5s", "10s", "30s", "1m", "5m"] + }, + "timezone": 
"browser", + "title": "FunctionalScaffold 监控仪表板", + "uid": "functional-scaffold", + "version": 1, + "weekStart": "" +} diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml new file mode 100644 index 0000000..dbe523c --- /dev/null +++ b/monitoring/prometheus.yml @@ -0,0 +1,46 @@ +# Prometheus 配置文件 +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'functional-scaffold' + environment: 'development' + +# 抓取配置 +scrape_configs: + # 方案1:从 Pushgateway 抓取指标(推荐) + - job_name: 'pushgateway' + honor_labels: true + static_configs: + - targets: ['pushgateway:9091'] + metric_relabel_configs: + # 保留 instance 标签 + - source_labels: [instance] + target_label: instance + action: replace + + # 方案2:从 Redis Exporter 抓取指标 + - job_name: 'redis-exporter' + static_configs: + - targets: ['redis-exporter:8001'] + + # 直接从应用实例抓取(如果有多个实例,需要配置服务发现) + - job_name: 'app' + static_configs: + - targets: ['app:8000'] + metrics_path: '/metrics' + + # Prometheus 自身监控 + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + +# 告警规则文件 +rule_files: + - '/etc/prometheus/rules/*.yml' + +# Alertmanager 配置(可选) +# alerting: +# alertmanagers: +# - static_configs: +# - targets: ['alertmanager:9093'] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a70f1da --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,50 @@ +[build-system] +requires = ["setuptools>=65.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "functional-scaffold" +version = "1.0.0" +description = "算法工程化 Serverless 脚手架" +requires-python = ">=3.9" +authors = [ + {name = "FunctionalScaffold Team"} +] +readme = "README.md" + +dependencies = [ + "fastapi>=0.109.0", + "uvicorn[standard]>=0.27.0", + "pydantic>=2.5.0", + "pydantic-settings>=2.0.0", + "prometheus-client>=0.19.0", + "python-json-logger>=2.0.7", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.4.0", + "pytest-asyncio>=0.21.0", + "pytest-cov>=4.1.0", + 
"httpx>=0.26.0", + "black>=23.12.0", + "ruff>=0.1.0", +] + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.black] +line-length = 100 +target-version = ['py39'] + +[tool.ruff] +line-length = 100 +target-version = "py39" + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +addopts = "-v --strict-markers" diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..1f99787 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,6 @@ +pytest>=7.4.0 +pytest-asyncio>=0.21.0 +pytest-cov>=4.1.0 +httpx>=0.26.0 +black>=23.12.0 +ruff>=0.1.0 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7837771 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +fastapi>=0.109.0 +uvicorn[standard]>=0.27.0 +pydantic>=2.5.0 +pydantic-settings>=2.0.0 +prometheus-client>=0.19.0 +python-json-logger>=2.0.7 + +# 指标存储方案(可选,根据选择的方案安装) +# 方案2:Redis 方案需要 +redis>=5.0.0 diff --git a/scripts/export_openapi.py b/scripts/export_openapi.py new file mode 100644 index 0000000..a3c5a55 --- /dev/null +++ b/scripts/export_openapi.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +"""导出 OpenAPI 规范到 JSON 文件""" + +import json +import sys +from pathlib import Path + +# 添加 src 到路径 +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from functional_scaffold.main import app + + +def export_openapi(): + """导出 OpenAPI 规范""" + openapi_schema = app.openapi() + + # 确保输出目录存在 + output_dir = Path(__file__).parent.parent / "docs" / "swagger" + output_dir.mkdir(parents=True, exist_ok=True) + + # 写入文件 + output_file = output_dir / "openapi.json" + with open(output_file, "w", encoding="utf-8") as f: + json.dump(openapi_schema, f, indent=2, ensure_ascii=False) + + print(f"OpenAPI schema exported to: {output_file}") + print(f"Schema version: {openapi_schema.get('openapi')}") + print(f"API title: {openapi_schema.get('info', {}).get('title')}") + print(f"API 
version: {openapi_schema.get('info', {}).get('version')}") + print(f"Endpoints: {len(openapi_schema.get('paths', {}))}") + + +if __name__ == "__main__": + export_openapi() diff --git a/scripts/generate_traffic.sh b/scripts/generate_traffic.sh new file mode 100755 index 0000000..936d9aa --- /dev/null +++ b/scripts/generate_traffic.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# 生成测试流量脚本 + +echo "开始生成测试流量..." +echo "按 Ctrl+C 停止" + +count=0 +while true; do + # 随机生成一个数字 + number=$((RANDOM % 1000 + 1)) + + # 发送请求 + curl -s -X POST http://localhost:8111/invoke \ + -H "Content-Type: application/json" \ + -d "{\"number\": $number}" > /dev/null + + count=$((count + 1)) + echo "[$count] 已发送请求: number=$number" + + # 随机延迟 0.5-2 秒 + sleep $(awk -v min=0.5 -v max=2 'BEGIN{srand(); print min+rand()*(max-min)}') +done diff --git a/scripts/run_dev.sh b/scripts/run_dev.sh new file mode 100755 index 0000000..58347ac --- /dev/null +++ b/scripts/run_dev.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# 开发环境启动脚本 + +set -e + +echo "Starting FunctionalScaffold in development mode..." + +# 检查虚拟环境 +if [ ! -d "venv" ]; then + echo "Creating virtual environment..." + python3 -m venv venv +fi + +# 激活虚拟环境 +source venv/bin/activate + +# 安装依赖 +echo "Installing dependencies..." +pip install -e ".[dev]" + +# 启动服务 +echo "Starting server on http://localhost:8000" +echo "API docs available at http://localhost:8000/docs" +uvicorn src.functional_scaffold.main:app --reload --host 0.0.0.0 --port 8000 diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh new file mode 100755 index 0000000..d186cb8 --- /dev/null +++ b/scripts/run_tests.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# 测试运行脚本 + +set -e + +echo "Running tests for FunctionalScaffold..." + +# 激活虚拟环境(如果存在) +if [ -d "venv" ]; then + source venv/bin/activate +fi + +# 运行代码检查 +echo "Running code quality checks..." +echo "- Checking with ruff..." +ruff check src/ tests/ || true + +echo "- Checking formatting with black..." 
+black --check src/ tests/ || true + +# 运行测试 +echo "" +echo "Running tests..." +pytest tests/ -v --cov=src/functional_scaffold --cov-report=term --cov-report=html + +echo "" +echo "Tests completed!" +echo "Coverage report available at: htmlcov/index.html" diff --git a/scripts/start_metrics.sh b/scripts/start_metrics.sh new file mode 100755 index 0000000..c845fb5 --- /dev/null +++ b/scripts/start_metrics.sh @@ -0,0 +1,114 @@ +#!/bin/bash +# 指标方案快速启动脚本 + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo "==========================================" +echo "FunctionalScaffold 指标方案启动脚本" +echo "==========================================" + +# 检查 docker-compose +if ! command -v docker-compose &> /dev/null; then + echo -e "${RED}错误: docker-compose 未安装${NC}" + exit 1 +fi + +# 选择方案 +echo "" +echo "请选择指标方案:" +echo "1. Pushgateway(推荐,适合 Serverless)" +echo "2. Redis + Exporter(适合高并发)" +echo "3. 两者都启动(用于对比测试)" +echo "" +read -p "输入选项 (1/2/3): " choice + +cd "$(dirname "$0")/../deployment" + +case $choice in + 1) + echo -e "${GREEN}启动 Pushgateway 方案...${NC}" + docker-compose up -d redis pushgateway prometheus grafana + echo "" + echo -e "${GREEN}✓ Pushgateway 方案已启动${NC}" + echo "" + echo "服务地址:" + echo " - Pushgateway: http://localhost:9091" + echo " - Prometheus: http://localhost:9090" + echo " - Grafana: http://localhost:3000 (admin/admin)" + echo "" + echo "下一步:" + echo " 1. 修改代码导入: from functional_scaffold.core.metrics_pushgateway import ..." + echo " 2. 配置环境变量: PUSHGATEWAY_URL=localhost:9091" + echo " 3. 启动应用: ./scripts/run_dev.sh" + echo " 4. 运行测试: python scripts/test_metrics.py pushgateway" + ;; + 2) + echo -e "${GREEN}启动 Redis 方案...${NC}" + + # 检查 redis 依赖 + if ! python -c "import redis" 2>/dev/null; then + echo -e "${YELLOW}警告: redis 库未安装${NC}" + echo "正在安装 redis..." 
+ pip install redis + fi + + docker-compose up -d redis redis-exporter prometheus grafana + echo "" + echo -e "${GREEN}✓ Redis 方案已启动${NC}" + echo "" + echo "服务地址:" + echo " - Redis: localhost:6379" + echo " - Redis Exporter: http://localhost:8001/metrics" + echo " - Prometheus: http://localhost:9090" + echo " - Grafana: http://localhost:3000 (admin/admin)" + echo "" + echo "下一步:" + echo " 1. 修改代码导入: from functional_scaffold.core.metrics_redis import ..." + echo " 2. 配置环境变量: REDIS_HOST=localhost REDIS_PORT=6379" + echo " 3. 启动应用: ./scripts/run_dev.sh" + echo " 4. 运行测试: python scripts/test_metrics.py redis" + ;; + 3) + echo -e "${GREEN}启动所有服务...${NC}" + + # 检查 redis 依赖 + if ! python -c "import redis" 2>/dev/null; then + echo -e "${YELLOW}警告: redis 库未安装${NC}" + echo "正在安装 redis..." + pip install redis + fi + + docker-compose up -d + echo "" + echo -e "${GREEN}✓ 所有服务已启动${NC}" + echo "" + echo "服务地址:" + echo " - 应用: http://localhost:8000" + echo " - Pushgateway: http://localhost:9091" + echo " - Redis: localhost:6379" + echo " - Redis Exporter: http://localhost:8001/metrics" + echo " - Prometheus: http://localhost:9090" + echo " - Grafana: http://localhost:3000 (admin/admin)" + echo "" + echo "下一步:" + echo " 1. 查看文档: cat docs/metrics-guide.md" + echo " 2. 
运行测试: python scripts/test_metrics.py" + ;; + *) + echo -e "${RED}无效的选项${NC}" + exit 1 + ;; +esac + +echo "" +echo "==========================================" +echo "查看日志: docker-compose logs -f" +echo "停止服务: docker-compose down" +echo "查看文档: cat ../docs/metrics-guide.md" +echo "==========================================" diff --git a/scripts/test_metrics.py b/scripts/test_metrics.py new file mode 100755 index 0000000..fe34942 --- /dev/null +++ b/scripts/test_metrics.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python3 +"""指标方案测试脚本""" + +import requests +import time +import sys +from typing import Literal + +MetricsBackend = Literal["pushgateway", "redis", "memory"] + + +def test_pushgateway(): + """测试 Pushgateway 方案""" + print("\n=== 测试 Pushgateway 方案 ===\n") + + # 1. 检查 Pushgateway 是否运行 + try: + response = requests.get("http://localhost:9091/metrics", timeout=2) + print(f"✓ Pushgateway 运行正常 (状态码: {response.status_code})") + except Exception as e: + print(f"✗ Pushgateway 未运行: {e}") + return False + + # 2. 发送测试请求到应用 + print("\n发送测试请求...") + for i in range(5): + try: + response = requests.post( + "http://localhost:8000/invoke", + json={"number": 17}, + timeout=5, + ) + print(f" 请求 {i+1}: {response.status_code}") + time.sleep(0.5) + except Exception as e: + print(f" 请求 {i+1} 失败: {e}") + + # 3. 等待指标推送 + print("\n等待指标推送...") + time.sleep(2) + + # 4. 
检查 Pushgateway 中的指标 + try: + response = requests.get("http://localhost:9091/metrics", timeout=2) + metrics = response.text + + # 查找关键指标 + if "http_requests_total" in metrics: + print("✓ 找到 http_requests_total 指标") + # 提取指标值 + for line in metrics.split("\n"): + if "http_requests_total" in line and not line.startswith("#"): + print(f" {line}") + else: + print("✗ 未找到 http_requests_total 指标") + + if "algorithm_executions_total" in metrics: + print("✓ 找到 algorithm_executions_total 指标") + for line in metrics.split("\n"): + if "algorithm_executions_total" in line and not line.startswith("#"): + print(f" {line}") + else: + print("✗ 未找到 algorithm_executions_total 指标") + + except Exception as e: + print(f"✗ 获取指标失败: {e}") + return False + + # 5. 检查 Prometheus 是否能抓取 + print("\n检查 Prometheus...") + try: + response = requests.get( + "http://localhost:9090/api/v1/query", + params={"query": "http_requests_total"}, + timeout=5, + ) + data = response.json() + if data["status"] == "success" and data["data"]["result"]: + print(f"✓ Prometheus 成功抓取指标,找到 {len(data['data']['result'])} 条记录") + for result in data["data"]["result"][:3]: + print(f" {result['metric']} = {result['value'][1]}") + else: + print("✗ Prometheus 未找到指标") + except Exception as e: + print(f"✗ Prometheus 查询失败: {e}") + + return True + + +def test_redis(): + """测试 Redis 方案""" + print("\n=== 测试 Redis 方案 ===\n") + + # 1. 检查 Redis 是否运行 + try: + import redis + + client = redis.Redis(host="localhost", port=6379, db=0, decode_responses=True) + client.ping() + print("✓ Redis 运行正常") + except ImportError: + print("✗ Redis 库未安装,请运行: pip install redis") + return False + except Exception as e: + print(f"✗ Redis 未运行: {e}") + return False + + # 2. 清空测试数据 + print("\n清空旧数据...") + try: + keys = client.keys("metrics:*") + if keys: + client.delete(*keys) + print(f" 删除了 {len(keys)} 个键") + except Exception as e: + print(f" 清空失败: {e}") + + # 3. 
发送测试请求 + print("\n发送测试请求...") + for i in range(5): + try: + response = requests.post( + "http://localhost:8000/invoke", + json={"number": 17}, + timeout=5, + ) + print(f" 请求 {i+1}: {response.status_code}") + time.sleep(0.5) + except Exception as e: + print(f" 请求 {i+1} 失败: {e}") + + # 4. 检查 Redis 中的指标 + print("\n检查 Redis 指标...") + try: + # 检查计数器 + counter_data = client.hgetall("metrics:request_counter") + if counter_data: + print(f"✓ 找到 {len(counter_data)} 个请求计数器指标") + for key, value in list(counter_data.items())[:5]: + if not key.endswith(":timestamp"): + print(f" {key} = {value}") + else: + print("✗ 未找到请求计数器指标") + + # 检查算法计数器 + algo_data = client.hgetall("metrics:algorithm_counter") + if algo_data: + print(f"✓ 找到 {len(algo_data)} 个算法计数器指标") + for key, value in list(algo_data.items())[:5]: + if not key.endswith(":timestamp"): + print(f" {key} = {value}") + else: + print("✗ 未找到算法计数器指标") + + except Exception as e: + print(f"✗ 检查 Redis 失败: {e}") + return False + + # 5. 检查 Redis Exporter + print("\n检查 Redis Exporter...") + try: + response = requests.get("http://localhost:8001/metrics", timeout=2) + metrics = response.text + + if "http_requests_total" in metrics: + print("✓ Exporter 成功导出 http_requests_total") + for line in metrics.split("\n"): + if "http_requests_total" in line and not line.startswith("#"): + print(f" {line}") + break + else: + print("✗ Exporter 未导出 http_requests_total") + + except Exception as e: + print(f"✗ Redis Exporter 未运行: {e}") + + return True + + +def test_memory(): + """测试原有的内存方案""" + print("\n=== 测试内存方案(原有方案)===\n") + + # 发送测试请求 + print("发送测试请求...") + for i in range(5): + try: + response = requests.post( + "http://localhost:8000/invoke", + json={"number": 17}, + timeout=5, + ) + print(f" 请求 {i+1}: {response.status_code}") + time.sleep(0.5) + except Exception as e: + print(f" 请求 {i+1} 失败: {e}") + + # 检查应用的 /metrics 端点 + print("\n检查应用 /metrics 端点...") + try: + response = requests.get("http://localhost:8000/metrics", timeout=2) + metrics = 
response.text + + if "http_requests_total" in metrics: + print("✓ 找到 http_requests_total 指标") + for line in metrics.split("\n"): + if "http_requests_total" in line and not line.startswith("#"): + print(f" {line}") + break + else: + print("✗ 未找到指标") + + except Exception as e: + print(f"✗ 获取指标失败: {e}") + return False + + print("\n⚠️ 注意:内存方案在多实例部署时,每个实例的指标是独立的") + return True + + +def main(): + """主函数""" + print("=" * 60) + print("FunctionalScaffold 指标方案测试") + print("=" * 60) + + if len(sys.argv) > 1: + backend = sys.argv[1] + else: + print("\n请选择要测试的方案:") + print("1. Pushgateway(推荐)") + print("2. Redis + Exporter") + print("3. Memory(原有方案)") + choice = input("\n输入选项 (1/2/3): ").strip() + + backend_map = {"1": "pushgateway", "2": "redis", "3": "memory"} + backend = backend_map.get(choice, "pushgateway") + + print(f"\n选择的方案: {backend}") + + # 运行测试 + if backend == "pushgateway": + success = test_pushgateway() + elif backend == "redis": + success = test_redis() + elif backend == "memory": + success = test_memory() + else: + print(f"未知的方案: {backend}") + sys.exit(1) + + # 输出结果 + print("\n" + "=" * 60) + if success: + print("✓ 测试通过") + else: + print("✗ 测试失败") + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/src/functional_scaffold/__init__.py b/src/functional_scaffold/__init__.py new file mode 100644 index 0000000..5800eef --- /dev/null +++ b/src/functional_scaffold/__init__.py @@ -0,0 +1,3 @@ +"""FunctionalScaffold - 算法工程化 Serverless 脚手架""" + +__version__ = "1.0.0" diff --git a/src/functional_scaffold/algorithms/__init__.py b/src/functional_scaffold/algorithms/__init__.py new file mode 100644 index 0000000..8373f19 --- /dev/null +++ b/src/functional_scaffold/algorithms/__init__.py @@ -0,0 +1,6 @@ +"""算法模块""" + +from .base import BaseAlgorithm +from .prime_checker import PrimeChecker + +__all__ = ["BaseAlgorithm", "PrimeChecker"] diff --git a/src/functional_scaffold/algorithms/base.py b/src/functional_scaffold/algorithms/base.py new file mode 100644 
index 0000000..9441b74 --- /dev/null +++ b/src/functional_scaffold/algorithms/base.py @@ -0,0 +1,77 @@ +"""算法基类""" + +from abc import ABC, abstractmethod +from typing import Any, Dict +import time +import logging + +logger = logging.getLogger(__name__) + + +class BaseAlgorithm(ABC): + """算法基类,所有算法必须继承此类""" + + def __init__(self): + self.name = self.__class__.__name__ + self.version = "1.0.0" + + @abstractmethod + def process(self, *args, **kwargs) -> Dict[str, Any]: + """ + 算法处理逻辑,子类必须实现此方法 + + Returns: + Dict[str, Any]: 算法处理结果 + """ + pass + + def execute(self, *args, **kwargs) -> Dict[str, Any]: + """ + 执行算法,包含埋点和错误处理 + + Returns: + Dict[str, Any]: 包含结果和元数据的字典 + """ + from ..core.metrics import algorithm_counter, algorithm_latency + + start_time = time.time() + status = "success" + + try: + logger.info(f"Starting algorithm: {self.name}") + result = self.process(*args, **kwargs) + elapsed_time = time.time() - start_time + + logger.info( + f"Algorithm {self.name} completed successfully in {elapsed_time:.3f}s" + ) + + return { + "success": True, + "result": result, + "metadata": { + "algorithm": self.name, + "version": self.version, + "elapsed_time": elapsed_time, + }, + } + + except Exception as e: + status = "error" + elapsed_time = time.time() - start_time + logger.error(f"Algorithm {self.name} failed: {str(e)}", exc_info=True) + + return { + "success": False, + "error": str(e), + "metadata": { + "algorithm": self.name, + "version": self.version, + "elapsed_time": elapsed_time, + }, + } + finally: + # 记录算法执行指标 + elapsed_time = time.time() - start_time + algorithm_counter.labels(algorithm=self.name, status=status).inc() + algorithm_latency.labels(algorithm=self.name).observe(elapsed_time) diff --git a/src/functional_scaffold/algorithms/prime_checker.py b/src/functional_scaffold/algorithms/prime_checker.py new file mode 100644 index 0000000..0f0aebc --- /dev/null +++ b/src/functional_scaffold/algorithms/prime_checker.py @@ -0,0 +1,94 @@ +"""质数判断算法""" + +from typing 
import Dict, Any, List +from .base import BaseAlgorithm + + +class PrimeChecker(BaseAlgorithm): + """ + 质数判断算法 + + 使用试除法判断一个整数是否为质数,并返回因数分解结果 + """ + + def process(self, number: int) -> Dict[str, Any]: + """ + 判断给定数字是否为质数 + + Args: + number: 待判断的整数 + + Returns: + Dict[str, Any]: 包含判断结果的字典 + - number: 输入的数字 + - is_prime: 是否为质数 + - factors: 因数列表(如果不是质数) + - reason: 说明(如果适用) + - algorithm: 使用的算法名称 + + Raises: + ValueError: 如果输入不是整数 + """ + if not isinstance(number, int): + raise ValueError(f"Input must be an integer, got {type(number).__name__}") + + # 小于2的数不是质数 + if number < 2: + return { + "number": number, + "is_prime": False, + "reason": "Numbers less than 2 are not prime", + "factors": [], + "algorithm": "trial_division", + } + + # 判断是否为质数 + is_prime = self._is_prime(number) + + # 如果不是质数,计算因数 + factors = [] if is_prime else self._get_factors(number) + + return { + "number": number, + "is_prime": is_prime, + "factors": factors, + "algorithm": "trial_division", + } + + def _is_prime(self, n: int) -> bool: + """ + 使用试除法判断是否为质数 + + Args: + n: 待判断的正整数 + + Returns: + bool: 是否为质数 + """ + if n == 2: + return True + if n % 2 == 0: + return False + + # 只需检查到sqrt(n) + for i in range(3, int(n**0.5) + 1, 2): + if n % i == 0: + return False + + return True + + def _get_factors(self, n: int) -> List[int]: + """ + 获取一个数的所有因数(不包括1和自身) + + Args: + n: 待分解的正整数 + + Returns: + List[int]: 因数列表 + """ + factors = [] + for i in range(2, n): + if n % i == 0: + factors.append(i) + return factors diff --git a/src/functional_scaffold/api/__init__.py b/src/functional_scaffold/api/__init__.py new file mode 100644 index 0000000..83b5708 --- /dev/null +++ b/src/functional_scaffold/api/__init__.py @@ -0,0 +1,6 @@ +"""API 模块""" + +from .routes import router +from .models import InvokeRequest, InvokeResponse, HealthResponse, ErrorResponse + +__all__ = ["router", "InvokeRequest", "InvokeResponse", "HealthResponse", "ErrorResponse"] diff --git a/src/functional_scaffold/api/dependencies.py 
b/src/functional_scaffold/api/dependencies.py new file mode 100644 index 0000000..44bcf5a --- /dev/null +++ b/src/functional_scaffold/api/dependencies.py @@ -0,0 +1,20 @@ +"""API 依赖注入""" + +from fastapi import Header, HTTPException +from typing import Optional +from ..core.tracing import set_request_id, generate_request_id + + +async def get_request_id(x_request_id: Optional[str] = Header(None)) -> str: + """ + 获取或生成请求ID + + Args: + x_request_id: 从请求头获取的请求ID + + Returns: + str: 请求ID + """ + request_id = x_request_id or generate_request_id() + set_request_id(request_id) + return request_id diff --git a/src/functional_scaffold/api/models.py b/src/functional_scaffold/api/models.py new file mode 100644 index 0000000..633ff0f --- /dev/null +++ b/src/functional_scaffold/api/models.py @@ -0,0 +1,82 @@ +"""API 数据模型""" + +from pydantic import BaseModel, Field, ConfigDict +from typing import Any, Dict, Optional + + +class InvokeRequest(BaseModel): + """同步调用请求""" + + model_config = ConfigDict( + json_schema_extra={ + "example": { + "number": 17 + } + } + ) + + number: int = Field(..., description="待判断的整数") + + +class InvokeResponse(BaseModel): + """同步调用响应""" + + model_config = ConfigDict( + json_schema_extra={ + "example": { + "request_id": "550e8400-e29b-41d4-a716-446655440000", + "status": "success", + "result": { + "number": 17, + "is_prime": True, + "factors": [], + "algorithm": "trial_division" + }, + "metadata": { + "algorithm": "PrimeChecker", + "version": "1.0.0", + "elapsed_time": 0.001 + } + } + } + ) + + request_id: str = Field(..., description="请求唯一标识") + status: str = Field(..., description="处理状态") + result: Dict[str, Any] = Field(..., description="算法执行结果") + metadata: Dict[str, Any] = Field(..., description="元数据信息") + + +class HealthResponse(BaseModel): + """健康检查响应""" + + status: str = Field(..., description="健康状态") + timestamp: float = Field(..., description="时间戳") + + +class ReadinessResponse(BaseModel): + """就绪检查响应""" + + status: str = Field(..., 
description="就绪状态") + timestamp: float = Field(..., description="时间戳") + checks: Optional[Dict[str, bool]] = Field(None, description="各项检查结果") + + +class ErrorResponse(BaseModel): + """错误响应""" + + model_config = ConfigDict( + json_schema_extra={ + "example": { + "error": "VALIDATION_ERROR", + "message": "number must be an integer", + "details": {"field": "number", "value": "abc"}, + "request_id": "550e8400-e29b-41d4-a716-446655440000" + } + } + ) + + error: str = Field(..., description="错误代码") + message: str = Field(..., description="错误消息") + details: Optional[Dict[str, Any]] = Field(None, description="错误详情") + request_id: Optional[str] = Field(None, description="请求ID") diff --git a/src/functional_scaffold/api/routes.py b/src/functional_scaffold/api/routes.py new file mode 100644 index 0000000..5a0697d --- /dev/null +++ b/src/functional_scaffold/api/routes.py @@ -0,0 +1,150 @@ +"""API 路由""" + +from fastapi import APIRouter, HTTPException, Depends, status +from fastapi.responses import JSONResponse +import time +import logging + +from .models import ( + InvokeRequest, + InvokeResponse, + HealthResponse, + ReadinessResponse, + ErrorResponse, +) +from .dependencies import get_request_id +from ..algorithms.prime_checker import PrimeChecker +from ..core.errors import FunctionalScaffoldError, ValidationError, AlgorithmError + +logger = logging.getLogger(__name__) + +router = APIRouter() + + +@router.post( + "/invoke", + response_model=InvokeResponse, + status_code=status.HTTP_200_OK, + summary="同步调用算法", + description="同步调用质数判断算法,立即返回结果", + responses={ + 200: {"description": "成功", "model": InvokeResponse}, + 400: {"description": "请求参数错误", "model": ErrorResponse}, + 500: {"description": "服务器内部错误", "model": ErrorResponse}, + }, +) +async def invoke_algorithm( + request: InvokeRequest, + request_id: str = Depends(get_request_id), +): + """ + 同步调用质数判断算法 + + - **number**: 待判断的整数 + """ + try: + logger.info(f"Processing request {request_id} with number={request.number}") + + # 
创建算法实例并执行 + checker = PrimeChecker() + execution_result = checker.execute(request.number) + + if not execution_result["success"]: + raise AlgorithmError( + execution_result.get("error", "Algorithm execution failed"), + details=execution_result.get("metadata", {}), + ) + + return InvokeResponse( + request_id=request_id, + status="success", + result=execution_result["result"], + metadata=execution_result["metadata"], + ) + + except ValidationError as e: + logger.warning(f"Validation error for request {request_id}: {e.message}") + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=e.to_dict(), + ) + + except AlgorithmError as e: + logger.error(f"Algorithm error for request {request_id}: {e.message}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=e.to_dict(), + ) + + except Exception as e: + logger.error(f"Unexpected error for request {request_id}: {str(e)}", exc_info=True) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail={ + "error": "INTERNAL_ERROR", + "message": str(e), + "request_id": request_id, + }, + ) + + +@router.get( + "/healthz", + response_model=HealthResponse, + summary="健康检查", + description="检查服务是否存活", +) +async def health_check(): + """ + 健康检查端点 + + 返回服务的健康状态,用于存活探针 + """ + return HealthResponse( + status="healthy", + timestamp=time.time(), + ) + + +@router.get( + "/readyz", + response_model=ReadinessResponse, + summary="就绪检查", + description="检查服务是否就绪", +) +async def readiness_check(): + """ + 就绪检查端点 + + 返回服务的就绪状态,用于就绪探针 + """ + # 这里可以添加更多检查,例如数据库连接、外部服务等 + checks = { + "algorithm": True, # 算法模块可用 + } + + all_ready = all(checks.values()) + + return ReadinessResponse( + status="ready" if all_ready else "not_ready", + timestamp=time.time(), + checks=checks, + ) + + +@router.post( + "/jobs", + status_code=status.HTTP_501_NOT_IMPLEMENTED, + summary="异步任务接口(预留)", + description="异步任务接口,当前版本未实现", +) +async def create_job(): + """ + 异步任务接口(预留) + + 用于提交长时间运行的任务 + """ 
+ raise HTTPException( + status_code=status.HTTP_501_NOT_IMPLEMENTED, + detail={"error": "NOT_IMPLEMENTED", "message": "Async jobs not implemented yet"}, + ) diff --git a/src/functional_scaffold/config.py b/src/functional_scaffold/config.py new file mode 100644 index 0000000..8baf162 --- /dev/null +++ b/src/functional_scaffold/config.py @@ -0,0 +1,47 @@ +"""配置管理模块""" + +from pydantic_settings import BaseSettings +from pydantic import ConfigDict +from typing import Optional + + +class Settings(BaseSettings): + """应用配置""" + + model_config = ConfigDict( + env_file=".env", + case_sensitive=False + ) + + # 应用信息 + app_name: str = "FunctionalScaffold" + app_version: str = "1.0.0" + app_env: str = "development" + + # 服务器配置 + host: str = "0.0.0.0" + port: int = 8000 + workers: int = 4 + + # 日志配置 + log_level: str = "INFO" + log_format: str = "json" + + # 指标配置 + metrics_enabled: bool = True + + # 追踪配置 + tracing_enabled: bool = False + jaeger_endpoint: Optional[str] = None + + # 外部服务配置(示例) + oss_endpoint: Optional[str] = None + oss_access_key_id: Optional[str] = None + oss_access_key_secret: Optional[str] = None + oss_bucket_name: Optional[str] = None + + database_url: Optional[str] = None + + +# 全局配置实例 +settings = Settings() diff --git a/src/functional_scaffold/core/__init__.py b/src/functional_scaffold/core/__init__.py new file mode 100644 index 0000000..4533069 --- /dev/null +++ b/src/functional_scaffold/core/__init__.py @@ -0,0 +1,21 @@ +"""核心功能模块""" + +from .errors import ( + FunctionalScaffoldError, + ValidationError, + AlgorithmError, + ConfigurationError, +) +from .logging import setup_logging +from .metrics import metrics_registry, track_request, track_algorithm_execution + +__all__ = [ + "FunctionalScaffoldError", + "ValidationError", + "AlgorithmError", + "ConfigurationError", + "setup_logging", + "metrics_registry", + "track_request", + "track_algorithm_execution", +] diff --git a/src/functional_scaffold/core/errors.py b/src/functional_scaffold/core/errors.py new 
file mode 100644 index 0000000..90e001f --- /dev/null +++ b/src/functional_scaffold/core/errors.py @@ -0,0 +1,47 @@ +"""错误处理模块""" + +from typing import Any, Dict, Optional + + +class FunctionalScaffoldError(Exception): + """基础异常类""" + + def __init__( + self, + message: str, + error_code: Optional[str] = None, + details: Optional[Dict[str, Any]] = None, + ): + self.message = message + self.error_code = error_code or "INTERNAL_ERROR" + self.details = details or {} + super().__init__(self.message) + + def to_dict(self) -> Dict[str, Any]: + """转换为字典格式""" + return { + "error": self.error_code, + "message": self.message, + "details": self.details, + } + + +class ValidationError(FunctionalScaffoldError): + """参数验证错误""" + + def __init__(self, message: str, details: Optional[Dict[str, Any]] = None): + super().__init__(message, error_code="VALIDATION_ERROR", details=details) + + +class AlgorithmError(FunctionalScaffoldError): + """算法执行错误""" + + def __init__(self, message: str, details: Optional[Dict[str, Any]] = None): + super().__init__(message, error_code="ALGORITHM_ERROR", details=details) + + +class ConfigurationError(FunctionalScaffoldError): + """配置错误""" + + def __init__(self, message: str, details: Optional[Dict[str, Any]] = None): + super().__init__(message, error_code="CONFIGURATION_ERROR", details=details) diff --git a/src/functional_scaffold/core/logging.py b/src/functional_scaffold/core/logging.py new file mode 100644 index 0000000..9d939c9 --- /dev/null +++ b/src/functional_scaffold/core/logging.py @@ -0,0 +1,50 @@ +"""日志配置模块""" + +import logging +import sys +from typing import Optional +from pythonjsonlogger.json import JsonFormatter + + +def setup_logging( + level: str = "INFO", + format_type: str = "json", + logger_name: Optional[str] = None, +) -> logging.Logger: + """ + 配置日志系统 + + Args: + level: 日志级别 (DEBUG, INFO, WARNING, ERROR, CRITICAL) + format_type: 日志格式 ('json' 或 'text') + logger_name: 日志器名称,None表示根日志器 + + Returns: + logging.Logger: 配置好的日志器 + """ + 
logger = logging.getLogger(logger_name) + logger.setLevel(getattr(logging, level.upper())) + + # 清除现有处理器 + logger.handlers.clear() + + # 创建控制台处理器 + handler = logging.StreamHandler(sys.stdout) + handler.setLevel(getattr(logging, level.upper())) + + # 设置格式 + if format_type == "json": + formatter = JsonFormatter( + "%(asctime)s %(name)s %(levelname)s %(message)s", + timestamp=True, + ) + else: + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + handler.setFormatter(formatter) + logger.addHandler(handler) + + return logger diff --git a/src/functional_scaffold/core/metrics.py b/src/functional_scaffold/core/metrics.py new file mode 100644 index 0000000..0ca0a4d --- /dev/null +++ b/src/functional_scaffold/core/metrics.py @@ -0,0 +1,111 @@ +"""Prometheus 指标模块""" + +from prometheus_client import Counter, Histogram, Gauge, CollectorRegistry +from functools import wraps +import time +from typing import Callable + +# 创建指标注册表 +metrics_registry = CollectorRegistry() + +# 请求计数器 +request_counter = Counter( + "http_requests_total", + "Total HTTP requests", + ["method", "endpoint", "status"], + registry=metrics_registry, +) + +# 请求延迟直方图 +request_latency = Histogram( + "http_request_duration_seconds", + "HTTP request latency", + ["method", "endpoint"], + registry=metrics_registry, +) + +# 算法执行计数器 +algorithm_counter = Counter( + "algorithm_executions_total", + "Total algorithm executions", + ["algorithm", "status"], + registry=metrics_registry, +) + +# 算法执行延迟 +algorithm_latency = Histogram( + "algorithm_execution_duration_seconds", + "Algorithm execution latency", + ["algorithm"], + registry=metrics_registry, +) + +# 当前处理中的请求数 +in_progress_requests = Gauge( + "http_requests_in_progress", + "Number of HTTP requests in progress", + registry=metrics_registry, +) + + +def track_request(method: str, endpoint: str): + """ + 装饰器:跟踪HTTP请求指标 + + Args: + method: HTTP方法 + endpoint: 端点路径 + """ + + def decorator(func: 
# ======================================================================
# tail of src/functional_scaffold/core/metrics.py
# ======================================================================
# NOTE(review): this chunk begins inside metrics.py; the track_request
# header below is reconstructed from the visible closure body (it closes
# over `method` and `endpoint`) — confirm against the full file.
def track_request(method: str, endpoint: str):
    """Decorator factory: record Prometheus metrics for an async HTTP handler.

    Args:
        method: HTTP method label value.
        endpoint: Endpoint path label value.
    """

    def decorator(func: Callable):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            in_progress_requests.inc()
            start_time = time.time()
            # Pre-set the label so the finally-block never sees an unbound
            # name when a BaseException (e.g. asyncio.CancelledError) escapes
            # the `except Exception` clause below.
            status = "error"

            try:
                result = await func(*args, **kwargs)
                status = "success"
                return result
            except Exception:
                status = "error"
                raise  # bare raise preserves the original traceback
            finally:
                elapsed = time.time() - start_time
                request_counter.labels(
                    method=method, endpoint=endpoint, status=status
                ).inc()
                request_latency.labels(method=method, endpoint=endpoint).observe(
                    elapsed
                )
                in_progress_requests.dec()

        return wrapper

    return decorator


def track_algorithm_execution(algorithm_name: str):
    """Decorator factory: record execution count and latency of an algorithm.

    Args:
        algorithm_name: Value for the `algorithm` metric label.
    """

    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            status = "error"  # safe default for the finally-block (see above)

            try:
                result = func(*args, **kwargs)
                status = "success"
                return result
            except Exception:
                raise  # bare raise preserves the original traceback
            finally:
                elapsed = time.time() - start_time
                algorithm_counter.labels(
                    algorithm=algorithm_name, status=status
                ).inc()
                algorithm_latency.labels(algorithm=algorithm_name).observe(elapsed)

        return wrapper

    return decorator


# ======================================================================
# new file: src/functional_scaffold/core/metrics_pushgateway.py
# ======================================================================
"""Pushgateway-based Prometheus metrics module."""

from prometheus_client import (
    Counter,
    Histogram,
    Gauge,
    CollectorRegistry,
    push_to_gateway,
)
from functools import wraps
import time
from typing import Callable, Optional
import os
import logging

logger = logging.getLogger(__name__)

# Dedicated registry: only the metrics below are pushed to the gateway.
metrics_registry = CollectorRegistry()

# Pushgateway configuration (environment-overridable).
PUSHGATEWAY_URL = os.getenv("PUSHGATEWAY_URL", "localhost:9091")
JOB_NAME = os.getenv("METRICS_JOB_NAME", "functional_scaffold")
INSTANCE_ID = os.getenv("INSTANCE_ID", os.getenv("HOSTNAME", "unknown"))

# Request counter.
request_counter = Counter(
    "http_requests_total",
    "Total HTTP requests",
    ["method", "endpoint", "status", "instance"],
    registry=metrics_registry,
)

# Request latency histogram.
request_latency = Histogram(
    "http_request_duration_seconds",
    "HTTP request latency",
    ["method", "endpoint", "instance"],
    registry=metrics_registry,
)

# Algorithm execution counter.
algorithm_counter = Counter(
    "algorithm_executions_total",
    "Total algorithm executions",
    ["algorithm", "status", "instance"],
    registry=metrics_registry,
)

# Algorithm execution latency.
algorithm_latency = Histogram(
    "algorithm_execution_duration_seconds",
    "Algorithm execution latency",
    ["algorithm", "instance"],
    registry=metrics_registry,
)

# Number of requests currently being processed.
in_progress_requests = Gauge(
    "http_requests_in_progress",
    "Number of HTTP requests in progress",
    ["instance"],
    registry=metrics_registry,
)


def push_metrics(grouping_key: Optional[dict] = None):
    """Push the local registry to the Pushgateway (best effort).

    Args:
        grouping_key: Extra grouping labels, merged over the instance id.
    """
    try:
        grouping = {"instance": INSTANCE_ID}
        if grouping_key:
            grouping.update(grouping_key)

        push_to_gateway(
            PUSHGATEWAY_URL,
            job=JOB_NAME,
            registry=metrics_registry,
            grouping_key=grouping,
        )
        logger.debug(f"成功推送指标到 Pushgateway: {PUSHGATEWAY_URL}")
    except Exception as e:
        # Metric delivery must never break the caller.
        logger.error(f"推送指标到 Pushgateway 失败: {e}")


def track_request(method: str, endpoint: str, auto_push: bool = True):
    """Decorator factory: track HTTP request metrics and push them.

    Args:
        method: HTTP method label value.
        endpoint: Endpoint path label value.
        auto_push: Push to the Pushgateway after every call.
    """

    def decorator(func: Callable):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            in_progress_requests.labels(instance=INSTANCE_ID).inc()
            start_time = time.time()
            status = "error"  # safe default for the finally-block

            try:
                result = await func(*args, **kwargs)
                status = "success"
                return result
            except Exception:
                raise  # bare raise preserves the original traceback
            finally:
                elapsed = time.time() - start_time
                request_counter.labels(
                    method=method,
                    endpoint=endpoint,
                    status=status,
                    instance=INSTANCE_ID,
                ).inc()
                request_latency.labels(
                    method=method, endpoint=endpoint, instance=INSTANCE_ID
                ).observe(elapsed)
                in_progress_requests.labels(instance=INSTANCE_ID).dec()

                # NOTE(review): push_to_gateway is a synchronous HTTP call
                # running on the event loop; consider loop.run_in_executor
                # if request latency matters — confirm.
                if auto_push:
                    push_metrics()

        return wrapper

    return decorator


def track_algorithm_execution(algorithm_name: str, auto_push: bool = True):
    """Decorator factory: track algorithm execution metrics and push them.

    Args:
        algorithm_name: Value for the `algorithm` metric label.
        auto_push: Push to the Pushgateway after every call.
    """

    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            status = "error"  # safe default for the finally-block

            try:
                result = func(*args, **kwargs)
                status = "success"
                return result
            except Exception:
                raise  # bare raise preserves the original traceback
            finally:
                elapsed = time.time() - start_time
                algorithm_counter.labels(
                    algorithm=algorithm_name, status=status, instance=INSTANCE_ID
                ).inc()
                algorithm_latency.labels(
                    algorithm=algorithm_name, instance=INSTANCE_ID
                ).observe(elapsed)

                if auto_push:
                    push_metrics()

        return wrapper

    return decorator


# ======================================================================
# new file: src/functional_scaffold/core/metrics_redis.py (head)
# ======================================================================
"""Redis-backed metrics recording module."""

from functools import wraps
import time
from typing import Callable, Optional
import os
import logging
import json  # NOTE(review): currently unused in the visible code — confirm
from datetime import datetime  # NOTE(review): currently unused — confirm

try:
    import redis

    REDIS_AVAILABLE = True
except ImportError:
    REDIS_AVAILABLE = False
    logging.warning("Redis 未安装,指标将无法记录到 Redis")

logger = logging.getLogger(__name__)

# Redis connection settings (environment-overridable).
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
REDIS_DB = int(os.getenv("REDIS_METRICS_DB", "0"))
REDIS_PASSWORD = os.getenv("REDIS_PASSWORD", None)
INSTANCE_ID = os.getenv("INSTANCE_ID", os.getenv("HOSTNAME", "unknown"))

# Redis key layout.
METRICS_PREFIX = "metrics:"
REQUEST_COUNTER_KEY = f"{METRICS_PREFIX}request_counter"
REQUEST_LATENCY_KEY = f"{METRICS_PREFIX}request_latency"
# ======================================================================
# src/functional_scaffold/core/metrics_redis.py (continued)
# ======================================================================

# Redis key layout (prefix restated so this chunk stands alone).
METRICS_PREFIX = "metrics:"
REQUEST_COUNTER_KEY = f"{METRICS_PREFIX}request_counter"
REQUEST_LATENCY_KEY = f"{METRICS_PREFIX}request_latency"
ALGORITHM_COUNTER_KEY = f"{METRICS_PREFIX}algorithm_counter"
ALGORITHM_LATENCY_KEY = f"{METRICS_PREFIX}algorithm_latency"
IN_PROGRESS_KEY = f"{METRICS_PREFIX}in_progress"


class RedisMetricsClient:
    """Write metrics into Redis hashes / sorted sets.

    Storage layout (must stay in sync with metrics_redis_exporter):
      - counters / gauges: HASH at `key`, field = bare label string
        (e.g. "endpoint=/x,instance=h1,method=GET"), plus a
        "<field>:timestamp" sibling field holding the last update time.
      - histograms: HASH "<key>:count" / "<key>:sum" keyed by the bare
        label string, and a ZSET "<key>:<label string>" holding the raw
        observations of the last hour.

    Bug fix: fields were previously stored as "<key>:<label string>", which
    polluted the first label parsed by the exporter (it produced a label
    name like "metrics:request_counter:method") and made the exporter
    reconstruct a doubly-prefixed ZSET key, so latency buckets were always
    empty. Fields now hold only the bare label string.
    """

    def __init__(self):
        if not REDIS_AVAILABLE:
            raise ImportError("需要安装 redis 库: pip install redis")

        self.client = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            db=REDIS_DB,
            password=REDIS_PASSWORD,
            decode_responses=True,
        )
        self.instance_id = INSTANCE_ID

    def increment_counter(self, key: str, labels: dict, value: int = 1):
        """Increment a counter field inside the hash at `key`.

        Args:
            key: Redis hash key of the metric.
            labels: Labels identifying the series.
            value: Amount to add (default 1).
        """
        try:
            field = self._make_label_key(labels)
            self.client.hincrby(key, field, value)
            # Record last-update time; the exporter skips ":timestamp" fields.
            self.client.hset(key, f"{field}:timestamp", int(time.time()))
        except Exception as e:
            logger.error(f"Redis 计数器增加失败: {e}")

    def observe_histogram(self, key: str, labels: dict, value: float):
        """Record one histogram observation.

        Args:
            key: Base Redis key of the metric.
            labels: Labels identifying the series.
            value: Observed value (seconds).
        """
        try:
            field = self._make_label_key(labels)
            # ZSET key matches the exporter's reconstruction f"{key}:{field}".
            series_key = f"{key}:{field}"

            timestamp = time.time()
            self.client.zadd(series_key, {f"{timestamp}:{value}": timestamp})

            # Keep only the last hour of raw observations.
            cutoff = timestamp - 3600
            self.client.zremrangebyscore(series_key, "-inf", cutoff)

            # Aggregates for fast count/sum lookup, keyed by the bare labels.
            self.client.hincrby(f"{key}:count", field, 1)
            self.client.hincrbyfloat(f"{key}:sum", field, value)
        except Exception as e:
            logger.error(f"Redis 直方图记录失败: {e}")

    def set_gauge(self, key: str, labels: dict, value: float):
        """Set a gauge field inside the hash at `key`."""
        try:
            field = self._make_label_key(labels)
            self.client.hset(key, field, value)
            self.client.hset(key, f"{field}:timestamp", int(time.time()))
        except Exception as e:
            logger.error(f"Redis 仪表盘设置失败: {e}")

    def increment_gauge(self, key: str, labels: dict, value: float = 1):
        """Add `value` to a gauge field."""
        try:
            field = self._make_label_key(labels)
            self.client.hincrbyfloat(key, field, value)
        except Exception as e:
            logger.error(f"Redis 仪表盘增加失败: {e}")

    def decrement_gauge(self, key: str, labels: dict, value: float = 1):
        """Subtract `value` from a gauge field."""
        self.increment_gauge(key, labels, -value)

    def _make_label_key(self, labels: dict) -> str:
        """Serialize labels (plus the instance id) into a stable field name.

        Keys are sorted so the same label set always yields the same field.

        Args:
            labels: Label dict.

        Returns:
            str: e.g. "endpoint=/x,instance=h1,method=GET".
        """
        labels_with_instance = {**labels, "instance": self.instance_id}
        sorted_labels = sorted(labels_with_instance.items())
        return ",".join(f"{k}={v}" for k, v in sorted_labels)

    def get_metrics_summary(self) -> dict:
        """Return the raw counter/gauge hashes (debugging aid)."""
        try:
            return {
                "request_counter": self.client.hgetall(REQUEST_COUNTER_KEY),
                "algorithm_counter": self.client.hgetall(ALGORITHM_COUNTER_KEY),
                "in_progress": self.client.hgetall(IN_PROGRESS_KEY),
            }
        except Exception as e:
            logger.error(f"获取指标摘要失败: {e}")
            return {}


# Lazily created process-wide singleton.
_redis_client: Optional[RedisMetricsClient] = None


def get_redis_client() -> RedisMetricsClient:
    """Return the singleton RedisMetricsClient, creating it on first use."""
    global _redis_client
    if _redis_client is None:
        _redis_client = RedisMetricsClient()
    return _redis_client


def track_request(method: str, endpoint: str):
    """Decorator factory: record request metrics for an async handler.

    Args:
        method: HTTP method label value.
        endpoint: Endpoint path label value.
    """

    def decorator(func: Callable):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            client = get_redis_client()
            labels = {"method": method, "endpoint": endpoint}

            client.increment_gauge(IN_PROGRESS_KEY, labels)
            start_time = time.time()
            # Safe default: a BaseException (e.g. CancelledError) skips the
            # `except Exception` clause but still runs the finally-block.
            status = "error"

            try:
                result = await func(*args, **kwargs)
                status = "success"
                return result
            except Exception:
                raise  # bare raise preserves the original traceback
            finally:
                elapsed = time.time() - start_time
                counter_labels = {**labels, "status": status}
                client.increment_counter(REQUEST_COUNTER_KEY, counter_labels)
                client.observe_histogram(REQUEST_LATENCY_KEY, labels, elapsed)
                client.decrement_gauge(IN_PROGRESS_KEY, labels)

        return wrapper

    return decorator
# NOTE(review): the tail of track_request that originally opened this span
# is consolidated into the complete definition in the preceding chunk.
def track_algorithm_execution(algorithm_name: str):
    """Decorator factory: record algorithm metrics in Redis.

    Args:
        algorithm_name: Value for the `algorithm` metric label.
    """

    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            client = get_redis_client()
            labels = {"algorithm": algorithm_name}
            start_time = time.time()
            # Safe default so the finally-block never sees an unbound name
            # if a BaseException escapes the `except Exception` clause.
            status = "error"

            try:
                result = func(*args, **kwargs)
                status = "success"
                return result
            except Exception:
                raise  # bare raise preserves the original traceback
            finally:
                elapsed = time.time() - start_time
                counter_labels = {**labels, "status": status}
                client.increment_counter(ALGORITHM_COUNTER_KEY, counter_labels)
                client.observe_histogram(ALGORITHM_LATENCY_KEY, labels, elapsed)

        return wrapper

    return decorator


# ======================================================================
# new file: src/functional_scaffold/core/metrics_redis_exporter.py
# ======================================================================
"""Redis metrics exporter — converts Redis-stored metrics to Prometheus format."""

from prometheus_client import (
    Counter,
    Histogram,
    Gauge,
    CollectorRegistry,
    generate_latest,
)
from prometheus_client.core import (
    GaugeMetricFamily,
    CounterMetricFamily,
    HistogramMetricFamily,
)
import redis
import os
import logging
from typing import Dict, List, Tuple
import time

logger = logging.getLogger(__name__)

# Redis connection settings (must match metrics_redis.py).
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
REDIS_DB = int(os.getenv("REDIS_METRICS_DB", "0"))
REDIS_PASSWORD = os.getenv("REDIS_PASSWORD", None)

# Redis key layout (must match metrics_redis.py).
METRICS_PREFIX = "metrics:"
REQUEST_COUNTER_KEY = f"{METRICS_PREFIX}request_counter"
REQUEST_LATENCY_KEY = f"{METRICS_PREFIX}request_latency"
ALGORITHM_COUNTER_KEY = f"{METRICS_PREFIX}algorithm_counter"
ALGORITHM_LATENCY_KEY = f"{METRICS_PREFIX}algorithm_latency"
IN_PROGRESS_KEY = f"{METRICS_PREFIX}in_progress"


class RedisMetricsCollector:
    """Custom Prometheus collector reading metric state from Redis."""

    def __init__(self):
        # Connection object; redis-py connects lazily on first command.
        self.redis_client = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            db=REDIS_DB,
            password=REDIS_PASSWORD,
            decode_responses=True,
        )

    def collect(self):
        """Yield every metric family (called by the Prometheus registry)."""
        try:
            # Counter metrics.
            yield from self._collect_counter(
                REQUEST_COUNTER_KEY,
                "http_requests_total",
                "Total HTTP requests",
            )
            yield from self._collect_counter(
                ALGORITHM_COUNTER_KEY,
                "algorithm_executions_total",
                "Total algorithm executions",
            )

            # Histogram metrics.
            yield from self._collect_histogram(
                REQUEST_LATENCY_KEY,
                "http_request_duration_seconds",
                "HTTP request latency",
            )
            yield from self._collect_histogram(
                ALGORITHM_LATENCY_KEY,
                "algorithm_execution_duration_seconds",
                "Algorithm execution latency",
            )

            # Gauge metrics.
            yield from self._collect_gauge(
                IN_PROGRESS_KEY,
                "http_requests_in_progress",
                "Number of HTTP requests in progress",
            )

        except Exception as e:
            logger.error(f"收集指标失败: {e}")

    def _collect_counter(self, redis_key: str, metric_name: str, description: str):
        """Build a CounterMetricFamily from a Redis hash."""
        try:
            data = self.redis_client.hgetall(redis_key)
            if not data:
                return

            metrics_data = []
            for key, value in data.items():
                # ":timestamp" fields are bookkeeping, not samples.
                if key.endswith(":timestamp"):
                    continue
                labels = self._parse_labels(key)
                metrics_data.append((labels, float(value)))

            if metrics_data:
                # NOTE(review): label names come from the first series —
                # assumes every series of a metric uses the same label set.
                label_names = list(metrics_data[0][0].keys())
                counter = CounterMetricFamily(metric_name, description, labels=label_names)
                for labels, value in metrics_data:
                    counter.add_metric(list(labels.values()), value)
                yield counter

        except Exception as e:
            logger.error(f"收集计数器 {redis_key} 失败: {e}")

    def _collect_histogram(self, redis_key: str, metric_name: str, description: str):
        """Build a HistogramMetricFamily from the count/sum hashes + ZSET."""
        try:
            count_data = self.redis_client.hgetall(f"{redis_key}:count")
            sum_data = self.redis_client.hgetall(f"{redis_key}:sum")

            if not count_data:
                return

            metrics_data = []
            for key in count_data.keys():
                labels = self._parse_labels(key)
                count = float(count_data.get(key, 0))
                sum_value = float(sum_data.get(key, 0))

                # Raw observations live in a ZSET named "<key>:<label string>".
                full_key = f"{redis_key}:{key}"
                latencies = self._get_latencies(full_key)
                buckets = self._calculate_buckets(latencies)

                metrics_data.append((labels, count, sum_value, buckets))

            if metrics_data:
                label_names = list(metrics_data[0][0].keys())
                histogram = HistogramMetricFamily(
                    metric_name, description, labels=label_names
                )
                # NOTE(review): the exposed count is derived from the +Inf
                # bucket (1-hour ZSET window) while `count`/`sum` are
                # all-time aggregates — they can disagree; confirm intent.
                for labels, count, sum_value, buckets in metrics_data:
                    histogram.add_metric(
                        list(labels.values()),
                        buckets=buckets,
                        sum_value=sum_value,
                    )
                yield histogram

        except Exception as e:
            logger.error(f"收集直方图 {redis_key} 失败: {e}")

    def _collect_gauge(self, redis_key: str, metric_name: str, description: str):
        """Build a GaugeMetricFamily from a Redis hash."""
        try:
            data = self.redis_client.hgetall(redis_key)
            if not data:
                return

            metrics_data = []
            for key, value in data.items():
                if key.endswith(":timestamp"):
                    continue
                labels = self._parse_labels(key)
                metrics_data.append((labels, float(value)))

            if metrics_data:
                label_names = list(metrics_data[0][0].keys())
                gauge = GaugeMetricFamily(metric_name, description, labels=label_names)
                for labels, value in metrics_data:
                    gauge.add_metric(list(labels.values()), value)
                yield gauge

        except Exception as e:
            logger.error(f"收集仪表盘 {redis_key} 失败: {e}")

    def _parse_labels(self, label_key: str) -> Dict[str, str]:
        """Parse a "k1=v1,k2=v2" label string written by RedisMetricsClient.

        Args:
            label_key: e.g. "endpoint=/invoke,instance=host1,method=GET".

        Returns:
            Dict[str, str]: Parsed label dict.
        """
        labels = {}
        for pair in label_key.split(","):
            if "=" in pair:
                key, value = pair.split("=", 1)
                labels[key] = value
        return labels

    def _get_latencies(self, key: str) -> List[float]:
        """Read raw observations ("timestamp:value" members) from a ZSET."""
        try:
            data = self.redis_client.zrange(key, 0, -1)
            latencies = []
            for item in data:
                # Member format: "timestamp:value".
                if ":" in item:
                    _, value = item.rsplit(":", 1)
                    latencies.append(float(value))
            return sorted(latencies)
        except Exception as e:
            logger.error(f"获取延迟数据失败: {e}")
            return []

    def _calculate_buckets(
        self, latencies: List[float]
    ) -> List[Tuple[str, float]]:
        """Compute cumulative histogram buckets from raw observations.

        Args:
            latencies: Observation values (seconds).

        Returns:
            List[Tuple[str, float]]: [(upper bound, cumulative count), ...].
        """
        if not latencies:
            return [("+Inf", 0)]

        # Default Prometheus-style bucket boundaries (seconds).
        buckets_boundaries = [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]
        buckets = []

        for boundary in buckets_boundaries:
            count = sum(1 for lat in latencies if lat <= boundary)
            buckets.append((str(boundary), count))

        # +Inf bucket: total observation count.
        buckets.append(("+Inf", len(latencies)))

        return buckets


# Module-level collector instance (connection is created lazily by redis-py).
redis_collector = RedisMetricsCollector()


def get_metrics() -> bytes:
    """Render all Redis-backed metrics in Prometheus exposition format.

    Returns:
        bytes: Prometheus text-format payload.
    """
    registry = CollectorRegistry()
    registry.register(redis_collector)
    return generate_latest(registry)


if __name__ == "__main__":
    # Ad-hoc smoke test.
    print(get_metrics().decode("utf-8"))


# ======================================================================
# new file: src/functional_scaffold/core/tracing.py (head)
# ======================================================================
"""Distributed tracing module."""

import uuid
from contextvars import ContextVar
from typing import Optional

# ContextVar so the request id follows async task context.
request_id_var: ContextVar[Optional[str]] = ContextVar("request_id", default=None)


def generate_request_id() -> str:
    """Generate a unique request id."""
    return str(uuid.uuid4())


def get_request_id() -> Optional[str]:
    """Return the request id bound to the current context, if any."""
    return request_id_var.get()
def set_request_id(request_id: str) -> None:
    """Bind `request_id` to the current context."""
    request_id_var.set(request_id)


class TracingContext:
    """Context manager that installs (and later restores) a request id."""

    def __init__(self, request_id: Optional[str] = None):
        # Generate an id when the caller does not supply one.
        self.request_id = request_id or generate_request_id()
        self.token = None

    def __enter__(self):
        self.token = request_id_var.set(self.request_id)
        return self.request_id

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore the previous value rather than clearing, so nesting works.
        if self.token:
            request_id_var.reset(self.token)


# ======================================================================
# new file: src/functional_scaffold/main.py
# ======================================================================
"""FastAPI application entry point."""

from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import Response
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
import logging
import time

from .api import router
from .config import settings
from .core.logging import setup_logging
from .core.metrics import (
    metrics_registry,
    request_counter,
    request_latency,
    in_progress_requests,
)

# Configure logging before anything else logs.
setup_logging(level=settings.log_level, format_type=settings.log_format)
logger = logging.getLogger(__name__)

# Create the FastAPI application.
app = FastAPI(
    title=settings.app_name,
    description="算法工程化 Serverless 脚手架 - 提供标准化的算法服务接口",
    version=settings.app_version,
    docs_url="/docs",
    redoc_url="/redoc",
    openapi_url="/openapi.json",
)

# CORS middleware.
# NOTE(review): wildcard origins with credentials is permissive — tighten
# allow_origins for production deployments.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log every HTTP request/response pair."""
    logger.info(f"Request: {request.method} {request.url.path}")
    response = await call_next(request)
    logger.info(f"Response: {response.status_code}")
    return response


@app.middleware("http")
async def track_metrics(request: Request, call_next):
    """Record Prometheus metrics for every HTTP request."""
    if not settings.metrics_enabled:
        return await call_next(request)

    # Skip the /metrics endpoint itself to avoid self-recording.
    if request.url.path == "/metrics":
        return await call_next(request)

    in_progress_requests.inc()
    start_time = time.time()
    status = "success"

    try:
        response = await call_next(request)
        # Treat 4xx/5xx responses as errors for the status label.
        if response.status_code >= 400:
            status = "error"
        return response
    except Exception:
        status = "error"
        raise  # bare raise preserves the original traceback
    finally:
        elapsed = time.time() - start_time
        request_counter.labels(
            method=request.method,
            endpoint=request.url.path,
            status=status,
        ).inc()
        request_latency.labels(
            method=request.method,
            endpoint=request.url.path,
        ).observe(elapsed)
        in_progress_requests.dec()


# Register API routes.
app.include_router(router, tags=["Algorithm"])


@app.get(
    "/metrics",
    tags=["Monitoring"],
    summary="Prometheus 指标",
    description="导出 Prometheus 格式的监控指标",
)
async def metrics():
    """Prometheus scrape endpoint (404 when metrics are disabled)."""
    if not settings.metrics_enabled:
        return Response(content="Metrics disabled", status_code=404)

    return Response(
        content=generate_latest(metrics_registry),
        media_type=CONTENT_TYPE_LATEST,
    )


# NOTE(review): on_event is deprecated in recent FastAPI releases in favor
# of lifespan handlers — confirm the target FastAPI version before migrating.
@app.on_event("startup")
async def startup_event():
    """Log startup configuration."""
    logger.info(f"Starting {settings.app_name} v{settings.app_version}")
    logger.info(f"Environment: {settings.app_env}")
    logger.info(f"Metrics enabled: {settings.metrics_enabled}")


@app.on_event("shutdown")
async def shutdown_event():
    """Log shutdown."""
    logger.info(f"Shutting down {settings.app_name}")


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(
        "functional_scaffold.main:app",
        host=settings.host,
        port=settings.port,
        reload=settings.app_env == "development",
        log_level=settings.log_level.lower(),
    )
# ======================================================================
# new file: src/functional_scaffold/utils/__init__.py
# ======================================================================
"""Utility functions module."""

from .validators import validate_integer, validate_positive_integer

__all__ = ["validate_integer", "validate_positive_integer"]


# ======================================================================
# new file: src/functional_scaffold/utils/validators.py
# ======================================================================
"""Parameter validation helpers."""

from typing import Any
from ..core.errors import ValidationError


def validate_integer(value: Any, field_name: str = "value") -> int:
    """Validate that `value` is an integer.

    Args:
        value: Value to validate.
        field_name: Field name used in the error message.

    Returns:
        int: The validated integer.

    Raises:
        ValidationError: If the value is not an integer.
    """
    # bool is a subclass of int, so it is explicitly rejected here.
    if not isinstance(value, int) or isinstance(value, bool):
        raise ValidationError(
            f"{field_name} must be an integer",
            details={"field": field_name, "value": value, "type": type(value).__name__},
        )
    return value


def validate_positive_integer(value: Any, field_name: str = "value") -> int:
    """Validate that `value` is a strictly positive integer.

    Args:
        value: Value to validate.
        field_name: Field name used in the error message.

    Returns:
        int: The validated positive integer.

    Raises:
        ValidationError: If the value is not a positive integer.
    """
    value = validate_integer(value, field_name)

    if value <= 0:
        raise ValidationError(
            f"{field_name} must be a positive integer",
            details={"field": field_name, "value": value},
        )

    return value


# ======================================================================
# new file: tests/__init__.py
# ======================================================================
"""Test package."""


# ======================================================================
# new file: tests/conftest.py
# ======================================================================
"""pytest configuration."""

import pytest
from fastapi.testclient import TestClient
from src.functional_scaffold.main import app


@pytest.fixture
def client():
    """HTTP test client bound to the application."""
    return TestClient(app)


@pytest.fixture
def sample_prime_numbers():
    """Sample prime numbers."""
    return [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47]


@pytest.fixture
def sample_composite_numbers():
    """Sample composite numbers."""
    return [4, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25]


# ======================================================================
# new file: tests/test_algorithms.py
# ======================================================================
"""Algorithm unit tests."""

import pytest
from src.functional_scaffold.algorithms.prime_checker import PrimeChecker


class TestPrimeChecker:
    """Tests for the prime-checking algorithm."""

    def setup_method(self):
        """Run before each test method."""
        self.checker = PrimeChecker()

    def test_prime_numbers(self, sample_prime_numbers):
        """Primes are reported as prime with no factors."""
        for num in sample_prime_numbers:
            result = self.checker.process(num)
            assert result["is_prime"] is True
            assert result["number"] == num
            assert result["factors"] == []
            assert result["algorithm"] == "trial_division"

    def test_composite_numbers(self, sample_composite_numbers):
        """Composites are reported as non-prime with factors."""
        for num in sample_composite_numbers:
            result = self.checker.process(num)
            assert result["is_prime"] is False
            assert result["number"] == num
            assert len(result["factors"]) > 0
            assert result["algorithm"] == "trial_division"

    def test_edge_cases(self):
        """Boundary values."""
        # 0 is not prime
        result = self.checker.process(0)
        assert result["is_prime"] is False
        assert "reason" in result

        # 1 is not prime
        result = self.checker.process(1)
        assert result["is_prime"] is False
        assert "reason" in result

        # 2 is prime
        result = self.checker.process(2)
        assert result["is_prime"] is True

        # negative numbers are not prime
        result = self.checker.process(-5)
        assert result["is_prime"] is False

    def test_large_prime(self):
        """A larger prime."""
        large_prime = 7919  # the 1000th prime
        result = self.checker.process(large_prime)
        assert result["is_prime"] is True

    def test_invalid_input(self):
        """Invalid inputs raise ValueError."""
        with pytest.raises(ValueError):
            self.checker.process("not a number")

        with pytest.raises(ValueError):
            self.checker.process(3.14)

        with pytest.raises(ValueError):
            self.checker.process(None)

    def test_execute_method(self):
        """execute() wraps process() with metadata instrumentation."""
        result = self.checker.execute(17)

        assert result["success"] is True
        assert "result" in result
        assert "metadata" in result
        assert result["metadata"]["algorithm"] == "PrimeChecker"
        assert "elapsed_time" in result["metadata"]


# ======================================================================
# new file: tests/test_api.py
# ======================================================================
"""API integration tests."""

import pytest
from fastapi import status


class TestInvokeEndpoint:
    """Tests for the /invoke endpoint."""

    def test_invoke_prime_number(self, client):
        """Prime input."""
        response = client.post("/invoke", json={"number": 17})

        assert response.status_code == status.HTTP_200_OK
        data = response.json()

        assert "request_id" in data
        assert data["status"] == "success"
        assert data["result"]["number"] == 17
        assert data["result"]["is_prime"] is True
        assert data["result"]["factors"] == []

    def test_invoke_composite_number(self, client):
        """Composite input."""
        response = client.post("/invoke", json={"number": 12})

        assert response.status_code == status.HTTP_200_OK
        data = response.json()

        assert data["status"] == "success"
        assert data["result"]["number"] == 12
        assert data["result"]["is_prime"] is False
        assert len(data["result"]["factors"]) > 0

    def test_invoke_edge_cases(self, client):
        """Boundary inputs."""
        # 0
        response = client.post("/invoke", json={"number": 0})
        assert response.status_code == status.HTTP_200_OK
        assert response.json()["result"]["is_prime"] is False

        # 1
        response = client.post("/invoke", json={"number": 1})
        assert response.status_code == status.HTTP_200_OK
        assert response.json()["result"]["is_prime"] is False

        # 2
        response = client.post("/invoke", json={"number": 2})
        assert response.status_code == status.HTTP_200_OK
        assert response.json()["result"]["is_prime"] is True

    def test_invoke_invalid_input(self, client):
        """Invalid payloads are rejected with 422."""
        # missing required field
        response = client.post("/invoke", json={})
        assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY

        # wrong data type
        response = client.post("/invoke", json={"number": "not a number"})
        assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY

        # float
        response = client.post("/invoke", json={"number": 3.14})
        assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY


class TestHealthEndpoints:
    """Health-check endpoints."""

    def test_healthz(self, client):
        """Liveness probe."""
        response = client.get("/healthz")

        assert response.status_code == status.HTTP_200_OK
        data = response.json()

        assert data["status"] == "healthy"
        assert "timestamp" in data

    def test_readyz(self, client):
        """Readiness probe."""
        response = client.get("/readyz")

        assert response.status_code == status.HTTP_200_OK
        data = response.json()

        assert data["status"] == "ready"
        assert "timestamp" in data
        assert "checks" in data


class TestMetricsEndpoint:
    """Metrics endpoint."""

    def test_metrics(self, client):
        """Prometheus exposition format is served."""
        response = client.get("/metrics")

        assert response.status_code == status.HTTP_200_OK
        assert "text/plain" in response.headers["content-type"]


class TestJobsEndpoint:
    """Async job endpoint."""

    def test_jobs_not_implemented(self, client):
        """Async job API is declared but not implemented yet."""
        response = client.post("/jobs", json={"number": 17})

        assert response.status_code == status.HTTP_501_NOT_IMPLEMENTED