diff --git a/README.md b/README.md index b711185..6447e4e 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ | [API 参考](docs/api-reference.md) | 完整的 API 文档 | | [监控指南](docs/monitoring.md) | 监控和告警配置 | | [API 规范](docs/api/README.md) | OpenAPI 规范说明 | +| [Kubernetes 部署](docs/kubernetes-deployment.md) | K8s 集群部署指南 | | [日志集成(Loki)](docs/loki-quick-reference.md) | 日志收集部署说明 | ## 快速开始 diff --git a/deployment/kubernetes/deployment.yaml b/deployment/kubernetes/deployment.yaml index 7b3a8d6..e15a4a5 100644 --- a/deployment/kubernetes/deployment.yaml +++ b/deployment/kubernetes/deployment.yaml @@ -1,33 +1,70 @@ +# Kubernetes 部署配置 +# 包含:ConfigMap、API Deployment、Worker Deployment、Redis Deployment + +--- +# ConfigMap - 共享配置 +apiVersion: v1 +kind: ConfigMap +metadata: + name: functional-scaffold-config + labels: + app: functional-scaffold +data: + APP_ENV: "production" + LOG_LEVEL: "INFO" + LOG_FORMAT: "json" + METRICS_ENABLED: "true" + # Redis 配置(指向集群内 Redis 服务) + REDIS_HOST: "functional-scaffold-redis" + REDIS_PORT: "6379" + REDIS_DB: "0" + # 异步任务配置 + MAX_CONCURRENT_JOBS: "10" + JOB_RESULT_TTL: "1800" + WEBHOOK_MAX_RETRIES: "3" + WEBHOOK_TIMEOUT: "10" + # Worker 配置 + WORKER_POLL_INTERVAL: "1.0" + JOB_QUEUE_KEY: "job:queue" + JOB_CONCURRENCY_KEY: "job:concurrency" + JOB_LOCK_TTL: "300" + JOB_MAX_RETRIES: "3" + JOB_EXECUTION_TIMEOUT: "300" + +--- +# API Deployment - HTTP 服务 apiVersion: apps/v1 kind: Deployment metadata: - name: functional-scaffold + name: functional-scaffold-api labels: app: functional-scaffold + component: api spec: replicas: 3 selector: matchLabels: app: functional-scaffold + component: api template: metadata: labels: app: functional-scaffold + component: api spec: containers: - - name: functional-scaffold + - name: api image: functional-scaffold:latest imagePullPolicy: IfNotPresent ports: - containerPort: 8000 name: http env: - - name: APP_ENV - value: "production" - - name: LOG_LEVEL - value: "INFO" - - name: METRICS_ENABLED - value: "true" + - name: 
RUN_MODE + value: "api" + envFrom: + - configMapRef: + name: functional-scaffold-config resources: requests: memory: "256Mi" @@ -51,3 +88,116 @@ spec: periodSeconds: 10 timeoutSeconds: 3 failureThreshold: 3 + +--- +# Worker Deployment - 异步任务处理 +apiVersion: apps/v1 +kind: Deployment +metadata: + name: functional-scaffold-worker + labels: + app: functional-scaffold + component: worker +spec: + replicas: 2 + selector: + matchLabels: + app: functional-scaffold + component: worker + template: + metadata: + labels: + app: functional-scaffold + component: worker + spec: + containers: + - name: worker + image: functional-scaffold:latest + imagePullPolicy: IfNotPresent + env: + - name: RUN_MODE + value: "worker" + envFrom: + - configMapRef: + name: functional-scaffold-config + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" + # Worker 没有 HTTP 端口,使用命令探针 + livenessProbe: + exec: + command: + - python + - -c + - "import redis; r = redis.Redis(host='functional-scaffold-redis'); r.ping()" + initialDelaySeconds: 10 + periodSeconds: 30 + timeoutSeconds: 5 + failureThreshold: 3 + +--- +# Redis Deployment - 任务队列和状态存储 +apiVersion: apps/v1 +kind: Deployment +metadata: + name: functional-scaffold-redis + labels: + app: functional-scaffold + component: redis +spec: + replicas: 1 + selector: + matchLabels: + app: functional-scaffold + component: redis + template: + metadata: + labels: + app: functional-scaffold + component: redis + spec: + containers: + - name: redis + image: redis:7-alpine + ports: + - containerPort: 6379 + name: redis + command: + - redis-server + - --appendonly + - "yes" + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "200m" + livenessProbe: + exec: + command: + - redis-cli + - ping + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + exec: + command: + - redis-cli + - ping + initialDelaySeconds: 5 + periodSeconds: 5 + 
timeoutSeconds: 3 + failureThreshold: 3 + volumeMounts: + - name: redis-data + mountPath: /data + volumes: + - name: redis-data + emptyDir: {} \ No newline at end of file diff --git a/deployment/kubernetes/service.yaml b/deployment/kubernetes/service.yaml index e555c3f..2ab8d49 100644 --- a/deployment/kubernetes/service.yaml +++ b/deployment/kubernetes/service.yaml @@ -1,9 +1,15 @@ +# Kubernetes Service 配置 +# 包含:API Service、Metrics Service、Redis Service + +--- +# API Service - 对外暴露 HTTP 服务 apiVersion: v1 kind: Service metadata: - name: functional-scaffold + name: functional-scaffold-api labels: app: functional-scaffold + component: api spec: type: ClusterIP ports: @@ -13,13 +19,21 @@ spec: name: http selector: app: functional-scaffold + component: api + --- +# Metrics Service - Prometheus 抓取指标 apiVersion: v1 kind: Service metadata: name: functional-scaffold-metrics labels: app: functional-scaffold + component: api + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8000" + prometheus.io/path: "/metrics" spec: type: ClusterIP ports: @@ -29,3 +43,24 @@ spec: name: metrics selector: app: functional-scaffold + component: api + +--- +# Redis Service - 内部 Redis 服务 +apiVersion: v1 +kind: Service +metadata: + name: functional-scaffold-redis + labels: + app: functional-scaffold + component: redis +spec: + type: ClusterIP + ports: + - port: 6379 + targetPort: 6379 + protocol: TCP + name: redis + selector: + app: functional-scaffold + component: redis \ No newline at end of file diff --git a/docs/kubernetes-deployment.md b/docs/kubernetes-deployment.md new file mode 100644 index 0000000..64432c6 --- /dev/null +++ b/docs/kubernetes-deployment.md @@ -0,0 +1,307 @@ +# Kubernetes 部署指南 + +本文档介绍如何在 Kubernetes 集群中部署 FunctionalScaffold 服务。 + +## 架构概览 + +``` + ┌─────────────────┐ + │ Ingress/LB │ + └────────┬────────┘ + │ + ┌────────▼────────┐ + │ API Service │ + │ (ClusterIP) │ + └────────┬────────┘ + │ + ┌──────────────┼──────────────┐ + │ │ │ + ┌──────▼──────┐ 
┌─────▼─────┐ ┌─────▼─────┐ + │ API Pod 1 │ │ API Pod 2 │ │ API Pod 3 │ + └─────────────┘ └───────────┘ └───────────┘ + │ + ┌────────▼────────┐ + │ Redis Service │ + └────────┬────────┘ + │ + ┌──────────────┼──────────────┐ + │ │ │ + ┌──────▼──────┐ ┌─────▼─────┐ │ + │ Worker Pod 1│ │Worker Pod2│ │ + └─────────────┘ └───────────┘ │ + ┌──────▼──────┐ + │ Redis Pod │ + └─────────────┘ +``` + +## 组件说明 + +| 组件 | 副本数 | 说明 | +|------|--------|------| +| **API Deployment** | 3 | HTTP 服务,处理同步请求和任务创建 | +| **Worker Deployment** | 2 | 异步任务处理,从 Redis 队列消费任务 | +| **Redis Deployment** | 1 | 任务队列和状态存储 | +| **ConfigMap** | - | 共享配置管理 | + +## 快速部署 + +```bash +# 部署所有资源 +kubectl apply -f deployment/kubernetes/deployment.yaml +kubectl apply -f deployment/kubernetes/service.yaml + +# 查看部署状态 +kubectl get pods -l app=functional-scaffold +kubectl get svc -l app=functional-scaffold +``` + +## 配置文件说明 + +### deployment.yaml + +包含以下资源: + +#### ConfigMap + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: functional-scaffold-config +data: + APP_ENV: "production" + LOG_LEVEL: "INFO" + REDIS_HOST: "functional-scaffold-redis" + # ... 
更多配置 +``` + +主要配置项: + +| 配置项 | 默认值 | 说明 | +|--------|--------|------| +| `APP_ENV` | production | 运行环境 | +| `LOG_LEVEL` | INFO | 日志级别 | +| `REDIS_HOST` | functional-scaffold-redis | Redis 服务地址 | +| `MAX_CONCURRENT_JOBS` | 10 | 最大并发任务数 | +| `JOB_EXECUTION_TIMEOUT` | 300 | 任务执行超时(秒) | + +#### API Deployment + +- **副本数**: 3 +- **资源限制**: 256Mi-512Mi 内存,250m-500m CPU +- **健康检查**: `/healthz`(存活)、`/readyz`(就绪) +- **环境变量**: `RUN_MODE=api` + +#### Worker Deployment + +- **副本数**: 2 +- **资源限制**: 256Mi-512Mi 内存,250m-500m CPU +- **健康检查**: exec 探针检查 Redis 连接 +- **环境变量**: `RUN_MODE=worker` + +#### Redis Deployment + +- **副本数**: 1 +- **资源限制**: 128Mi-256Mi 内存,100m-200m CPU +- **持久化**: AOF 模式(appendonly yes) +- **存储**: emptyDir(开发环境) + +### service.yaml + +| Service | 类型 | 端口 | 说明 | +|---------|------|------|------| +| `functional-scaffold-api` | ClusterIP | 80 → 8000 | API 服务 | +| `functional-scaffold-metrics` | ClusterIP | 8000 | Prometheus 指标 | +| `functional-scaffold-redis` | ClusterIP | 6379 | Redis 服务 | + +## 生产环境建议 + +### 1. 使用外部 Redis + +生产环境建议使用托管 Redis 服务(如阿里云 Redis、AWS ElastiCache): + +```yaml +# 修改 ConfigMap +data: + REDIS_HOST: "r-xxxxx.redis.rds.aliyuncs.com" + REDIS_PORT: "6379" + REDIS_PASSWORD: "" # 使用 Secret 管理 +``` + +### 2. 使用 Secret 管理敏感信息 + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: functional-scaffold-secrets +type: Opaque +stringData: + REDIS_PASSWORD: "your-password" + DATABASE_URL: "postgresql://..." +``` + +在 Deployment 中引用: + +```yaml +envFrom: +- configMapRef: + name: functional-scaffold-config +- secretRef: + name: functional-scaffold-secrets +``` + +### 3. 配置 HPA 自动扩缩容 + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: functional-scaffold-api-hpa +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: functional-scaffold-api + minReplicas: 2 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 +``` + +### 4. 
配置 PDB 保证可用性 + +```yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: functional-scaffold-api-pdb +spec: + minAvailable: 2 + selector: + matchLabels: + app: functional-scaffold + component: api +``` + +### 5. 使用 PVC 持久化 Redis 数据 + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: redis-data-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi +``` + +## 监控集成 + +### Prometheus 抓取配置 + +`functional-scaffold-metrics` Service 已添加 Prometheus 注解: + +```yaml +annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8000" + prometheus.io/path: "/metrics" +``` + +### ServiceMonitor(如使用 Prometheus Operator) + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: functional-scaffold +spec: + selector: + matchLabels: + app: functional-scaffold + component: api + endpoints: + - port: metrics + path: /metrics + interval: 30s +``` + +## 常用命令 + +```bash +# 查看所有资源 +kubectl get all -l app=functional-scaffold + +# 查看 Pod 日志 +kubectl logs -l app=functional-scaffold,component=api -f +kubectl logs -l app=functional-scaffold,component=worker -f + +# 扩缩容 +kubectl scale deployment functional-scaffold-api --replicas=5 +kubectl scale deployment functional-scaffold-worker --replicas=3 + +# 滚动更新 +kubectl set image deployment/functional-scaffold-api \ + api=functional-scaffold:v2.0.0 + +# 回滚 +kubectl rollout undo deployment/functional-scaffold-api + +# 查看部署历史 +kubectl rollout history deployment/functional-scaffold-api + +# 进入 Pod 调试 +kubectl exec -it <pod-name> -- /bin/sh + +# 端口转发(本地调试) +kubectl port-forward svc/functional-scaffold-api 8000:80 +``` + +## 故障排查 + +### Pod 启动失败 + +```bash +# 查看 Pod 事件 +kubectl describe pod <pod-name> + +# 查看 Pod 日志 +kubectl logs <pod-name> --previous +``` + +### Redis 连接失败 + +```bash +# 检查 Redis Service +kubectl get svc functional-scaffold-redis + +# 测试 Redis 连接 +kubectl run redis-test --rm -it --image=redis:7-alpine -- \ + redis-cli -h functional-scaffold-redis ping +``` + 
+### Worker 不消费任务 + +```bash +# 检查 Worker 日志 +kubectl logs -l component=worker -f + +# 检查 Redis 队列 +kubectl exec -it <redis-pod-name> -- redis-cli LLEN job:queue +``` + +## 相关文档 + +- [快速入门](getting-started.md) +- [监控指南](monitoring.md) +- [并发控制](concurrency-control.md) +- [日志集成](loki-quick-reference.md) \ No newline at end of file diff --git a/docs/loki-implementation-summary.md b/docs/loki-implementation-summary.md deleted file mode 100644 index b3b7fd0..0000000 --- a/docs/loki-implementation-summary.md +++ /dev/null @@ -1,238 +0,0 @@ -# Loki 日志收集系统集成 - 实施总结 - -## 实施完成 - -已成功集成 Grafana Loki 日志收集系统到 FunctionalScaffold 项目。 - -## 新增文件 - -### 1. 监控配置文件 - -| 文件 | 说明 | -|------|------| -| `monitoring/loki.yaml` | Loki 服务配置(7天保留期,10MB/s速率限制)| -| `monitoring/promtail.yaml` | Promtail 日志采集配置(支持 Docker stdio 和文件两种模式)| - -### 2. Grafana Provisioning - -| 文件 | 说明 | -|------|------| -| `monitoring/grafana/datasources/prometheus.yaml` | Prometheus 数据源自动配置 | -| `monitoring/grafana/datasources/loki.yaml` | Loki 数据源自动配置 | -| `monitoring/grafana/dashboards/provider.yaml` | Dashboard 自动加载配置 | -| `monitoring/grafana/dashboards/logs-dashboard.json` | 日志监控仪表板 | -| `monitoring/grafana/dashboards/dashboard.json` | 原有监控仪表板(已移动)| - -### 3. 文档和脚本 - -| 文件 | 说明 | -|------|------| -| `docs/loki-integration.md` | Loki 使用完整文档(包含查询示例、故障排查等)| -| `scripts/verify_loki.sh` | Loki 集成验证脚本 | - -## 修改文件 - -### 1. Docker Compose 配置 - -**文件**: `deployment/docker-compose.yml` - -**变更**: -- 添加 `loki` 服务(端口 3100) -- 添加 `promtail` 服务(端口 9080) -- 更新 `app` 服务: - - 添加日志文件配置环境变量 - - 添加 `app_logs` 卷挂载 - - 添加 Promtail 标签 -- 更新 `grafana` 服务: - - 修改 provisioning 卷挂载结构 - - 添加对 Loki 的依赖 -- 添加 `loki_data` 和 `app_logs` 卷 - -### 2. 
应用代码 - -**文件**: `src/functional_scaffold/core/logging.py` - -**变更**: -- 添加 `file_path` 参数支持 -- 实现 `RotatingFileHandler`(100MB,5个备份) -- 支持同时输出到控制台和文件 - -**文件**: `src/functional_scaffold/config.py` - -**变更**: -- 添加 `log_file_enabled` 配置(默认 False) -- 添加 `log_file_path` 配置(默认 `/var/log/app/app.log`) - -**文件**: `src/functional_scaffold/main.py` - -**变更**: -- 更新 `setup_logging()` 调用,传入文件路径参数 - -## 架构特点 - -### 1. 双模式日志收集 - -**模式 1: Docker stdio 收集(默认)** -- ✅ 无需修改应用代码 -- ✅ 自动收集容器标准输出 -- ✅ 性能影响极小 -- ✅ 推荐用于生产环境 - -**模式 2: 文件收集(备用)** -- ✅ 日志持久化到文件 -- ✅ 支持日志轮转 -- ✅ 适合需要本地日志的场景 -- ⚙️ 需要设置 `LOG_FILE_ENABLED=true` - -### 2. 自动化配置 - -- ✅ Grafana 数据源自动加载 -- ✅ Dashboard 自动加载 -- ✅ 无需手动配置 - -### 3. 结构化日志 - -- ✅ JSON 格式日志 -- ✅ 自动提取字段(level, logger, request_id 等) -- ✅ 支持 LogQL 查询 - -## 使用方式 - -### 快速启动 - -```bash -cd deployment -docker-compose up -d -``` - -### 访问服务 - -- **Grafana**: http://localhost:3000 (admin/admin) -- **Loki API**: http://localhost:3100 -- **Promtail**: http://localhost:9080 - -### 查看日志 - -**方式 1: Grafana 日志仪表板** -1. 访问 http://localhost:3000 -2. 进入 "日志监控" 仪表板 - -**方式 2: Grafana Explore** -1. 访问 http://localhost:3000/explore -2. 选择 Loki 数据源 -3. 
输入查询: `{job="functional-scaffold-app"}` - -### 验证集成 - -```bash -./scripts/verify_loki.sh -``` - -## LogQL 查询示例 - -```logql -# 查询所有日志 -{job="functional-scaffold-app"} - -# 查询错误日志 -{job="functional-scaffold-app", level="ERROR"} - -# 按 request_id 过滤 -{job="functional-scaffold-app"} | json | request_id = "abc123" - -# 统计日志量 -sum by (level) (count_over_time({job="functional-scaffold-app"}[5m])) -``` - -## 配置说明 - -### 日志保留期 - -默认 7 天,可在 `monitoring/loki.yaml` 中修改: - -```yaml -limits_config: - retention_period: 168h # 7 天 -``` - -### 日志文件模式 - -在 `deployment/docker-compose.yml` 中启用: - -```yaml -environment: - - LOG_FILE_ENABLED=true - - LOG_FILE_PATH=/var/log/app/app.log -``` - -### 日志级别 - -在 `deployment/docker-compose.yml` 中调整: - -```yaml -environment: - - LOG_LEVEL=INFO # DEBUG, INFO, WARNING, ERROR, CRITICAL -``` - -## 监控指标 - -Loki 集成后,可以在 Grafana 中查看: - -- **日志流**: 实时日志流 -- **日志量趋势**: 按时间和级别统计 -- **日志级别分布**: INFO/WARNING/ERROR 分布 -- **错误日志**: 只显示 ERROR 级别 - -## 故障排查 - -### 看不到日志 - -1. 检查服务状态: `docker-compose ps` -2. 查看 Promtail 日志: `docker-compose logs promtail` -3. 验证容器标签: `docker inspect | grep Labels` -4. 查询 Loki API: `curl http://localhost:3100/loki/api/v1/label/job/values` - -### Docker socket 权限问题 - -```bash -sudo chmod 666 /var/run/docker.sock -``` - -### 日志量过大 - -1. 调整保留期为 3 天 -2. 降低摄入速率限制 -3. 添加日志过滤规则 - -详细故障排查请参考 `docs/loki-integration.md`。 - -## 性能影响 - -- **CPU**: < 5% 额外开销 -- **内存**: Loki ~200MB, Promtail ~50MB -- **磁盘**: 取决于日志量,7天约 1-5GB -- **网络**: 本地通信,影响极小 - -## 下一步 - -可选的增强功能: - -1. **告警规则**: 配置基于日志的告警 -2. **日志导出**: 定期导出日志到对象存储 -3. **多租户**: 配置 Loki 多租户模式 -4. **长期存储**: 配置 S3/OSS 作为后端存储 - -## 参考文档 - -- 完整使用文档: `docs/loki-integration.md` -- Loki 官方文档: https://grafana.com/docs/loki/latest/ -- LogQL 查询语言: https://grafana.com/docs/loki/latest/logql/ - -## 总结 - -✅ **完成**: Loki 日志收集系统已成功集成 -✅ **测试**: 可通过 `./scripts/verify_loki.sh` 验证 -✅ **文档**: 提供完整的使用和故障排查文档 -✅ **生产就绪**: 支持双模式收集,配置灵活 - -集成已完成,可以开始使用 Loki 进行日志收集和分析!