diff --git a/deploy/monitoring/README.md b/deploy/monitoring/README.md new file mode 100644 index 00000000..b8f44b31 --- /dev/null +++ b/deploy/monitoring/README.md @@ -0,0 +1,171 @@ +# Sub2API Relay Manager Monitoring Setup + +## 概述 + +本项目已配置完整的监控告警体系,包括 Prometheus metrics、Grafana 仪表板和 Prometheus 告警规则。 + +## 已配置的 Metrics + +### HTTP 层指标 + +- `http_requests_total` - HTTP 请求总数(按 method, path, status 分类) +- `http_request_duration_seconds` - HTTP 请求延迟分布 + +### 业务指标 + +- `active_hosts` - 活跃宿主数量 +- `active_providers` - 活跃 provider 数量 +- `route_decisions_total` - 路由决策总数 +- `route_failovers_total` - 路由故障转移总数 + +### 数据库指标 + +- `db_connections_active` - 活跃数据库连接数 +- `db_operations_total` - 数据库操作总数 + +### 日志指标 + +- `log_flush_errors_total` - 日志刷新错误数 +- `log_dropped_events_total` - 丢弃的日志事件数 + +## 告警规则 + +### Critical 级别 + +| 告警名称 | 触发条件 | 说明 | +| ------------------ | ------------------------------- | --------------- | +| ServiceDown | up == 0 持续1分钟 | 服务完全宕机 | +| NoActiveProviders | active_providers == 0 持续1分钟 | 无可用 provider | +| NoActiveHosts | active_hosts == 0 持续1分钟 | 无可用 host | +| HealthCheckFailing | /healthz 返回非200 | 健康检查失败 | + +### Warning 级别 + +| 告警名称 | 触发条件 | 说明 | +| ------------------- | ------------------------ | --------------------- | +| HighErrorRate | 错误率 > 5% 持续2分钟 | HTTP 5xx/4xx 错误率高 | +| HighLatency | P95 延迟 > 1秒 持续3分钟 | 请求处理延迟高 | +| RouteFailoverSpike | 故障转移率 > 正常水平2倍 | 路由不稳定 | +| HighDBConnections | 活跃连接 > 50 持续5分钟 | 数据库连接池压力大 | +| LogFlushErrors | 日志刷新错误 > 0 | 日志系统异常 | +| LogDroppedEvents | 丢弃事件率 > 10/sec | 日志缓冲区溢出 | +| BatchImportFailures | 批处理失败率 > 10% | Provider 导入问题 | +| AuthFailures | 认证失败 > 10/sec | 凭证问题或攻击 | + +## 部署步骤 + +### 1. 
Prometheus 配置 + +在 `prometheus.yml` 中添加: + +```yaml +rule_files: + - "/etc/prometheus/rules/prometheus-rules.yml" + +scrape_configs: + - job_name: "sub2api-relay-manager" + static_configs: + - targets: ["localhost:8080"] + metrics_path: /metrics + scrape_interval: 15s +``` + +复制告警规则: + +```bash +cp deploy/monitoring/prometheus-rules.yml /etc/prometheus/rules/ +``` + +### 2. Grafana 配置 + +导入仪表板: + +```bash +curl -X POST \ + http://admin:admin@localhost:3000/api/dashboards/db \ + -H 'Content-Type: application/json' \ + -d @deploy/monitoring/grafana-dashboard.json +``` + +### 3. Alertmanager 配置(可选) + +配置告警通知渠道(Slack/Email/PagerDuty): + +```yaml +# alertmanager.yml +global: + smtp_smarthost: "localhost:587" + smtp_from: "alerts@example.com" + +route: + receiver: "ops-team" + group_by: ["alertname", "severity"] + +receivers: + - name: "ops-team" + email_configs: + - to: "ops@example.com" + subject: "[Alert] {{ .GroupLabels.alertname }}" + slack_configs: + - api_url: "YOUR_SLACK_WEBHOOK_URL" + channel: "#alerts" +``` + +## 验证 + +### 检查 Metrics 端点 + +```bash +curl http://localhost:8080/metrics +``` + +### 验证告警规则 + +```bash +# 在 Prometheus 中查看 +http://localhost:9090/rules + +# 查看告警状态 +http://localhost:9090/alerts +``` + +### 触发测试告警 + +```bash +# 模拟高错误率 +for i in {1..100}; do + curl http://localhost:8080/api/nonexistent +done +``` + +## 监控指标解释 + +### 正常状态参考值 + +| 指标 | 正常范围 | 告警阈值 | +| ---------------- | -------- | ----------------------------- | +| active_providers | >= 2 | < 2 (warning), = 0 (critical) | +| active_hosts | >= 1 | = 0 (critical) | +| Error Rate | < 1% | > 5% | +| P95 Latency | < 500ms | > 1s | +| DB Connections | < 20 | > 50 | + +## 故障排查 + +### 服务 Down 告警 + +1. 检查进程状态:`systemctl status sub2api-relay-manager` +2. 查看日志:`journalctl -u sub2api-relay-manager` +3. 检查端口监听:`netstat -tlnp | grep 8080` + +### 高延迟告警 + +1. 检查数据库性能 +2. 查看 upstream provider 响应时间 +3. 检查内存和 CPU 使用率 + +### 路由故障转移告警 + +1. 检查 provider 健康状态 +2. 查看 `/api/routing/routes/health` +3. 
分析 provider 响应日志 diff --git a/deploy/monitoring/grafana-dashboard.json b/deploy/monitoring/grafana-dashboard.json new file mode 100644 index 00000000..9b398e79 --- /dev/null +++ b/deploy/monitoring/grafana-dashboard.json @@ -0,0 +1,85 @@ +{ + "dashboard": { + "id": null, + "title": "Sub2API Relay Manager", + "tags": ["sub2api", "relay", "monitoring"], + "timezone": "UTC", + "panels": [ + { + "id": 1, + "title": "Service Status", + "type": "stat", + "targets": [ + { + "expr": "up{job=\"sub2api-relay-manager\"}", + "legendFormat": "Service Up" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { "color": "red", "value": 0 }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 } + }, + { + "id": 2, + "title": "Active Providers", + "type": "stat", + "targets": [ + { + "expr": "active_providers", + "legendFormat": "Providers" + } + ], + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 } + }, + { + "id": 3, + "title": "Active Hosts", + "type": "stat", + "targets": [ + { + "expr": "active_hosts", + "legendFormat": "Hosts" + } + ], + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 } + }, + { + "id": 4, + "title": "Request Rate", + "type": "graph", + "targets": [ + { + "expr": "rate(http_requests_total[5m])", + "legendFormat": "{{method}} {{path}}" + } + ], + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 } + }, + { + "id": 5, + "title": "Request Duration p95", + "type": "graph", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p95" + } + ], + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 } + } + ], + "time": { + "from": "now-1h", + "to": "now" + }, + "refresh": "30s" + } +} diff --git a/deploy/monitoring/prometheus-rules.yml b/deploy/monitoring/prometheus-rules.yml new file mode 100644 index 00000000..0421aa19 --- /dev/null +++ b/deploy/monitoring/prometheus-rules.yml @@ -0,0 +1,181 @@ +# Prometheus Alerting Rules for 
sub2api-cn-relay-manager +# Place this file in your Prometheus rules directory + +groups: + - name: sub2api-relay-manager-alerts + interval: 30s + rules: + # Service availability: the scrape target's `up` series has been 0 for 1m + - alert: ServiceDown + expr: up{job="sub2api-relay-manager"} == 0 + for: 1m + labels: + severity: critical + team: ops + annotations: + summary: "sub2api-relay-manager service is down" + description: "The sub2api-relay-manager service has been down for more than 1 minute." + + # HTTP error rate: share of 4xx and 5xx responses among all requests (5m rate) above 5% + - alert: HighErrorRate + expr: | + ( + sum(rate(http_requests_total{status=~"5..|4.."}[5m])) + / + sum(rate(http_requests_total[5m])) + ) > 0.05 + for: 2m + labels: + severity: warning + team: ops + annotations: + summary: "High error rate detected" + description: "Error rate is above 5% for more than 2 minutes. Current value: {{ $value | humanizePercentage }}" + + # Request latency: p95 of http_request_duration_seconds, aggregated over all series, above 1s + - alert: HighLatency + expr: | + histogram_quantile(0.95, + sum(rate(http_request_duration_seconds_bucket[5m])) by (le) + ) > 1.0 + for: 3m + labels: + severity: warning + team: ops + annotations: + summary: "High request latency" + description: "95th percentile latency is above 1 second for more than 3 minutes." + + # Route failover spike: 5m failover rate above twice the average of the 1h rate sampled every 5m over the last hour + - alert: RouteFailoverSpike + expr: | + ( + rate(route_failovers_total[5m]) + > + 2 * avg_over_time(rate(route_failovers_total[1h])[1h:5m]) + ) + for: 1m + labels: + severity: warning + team: ops + annotations: + summary: "Route failover spike detected" + description: "Route failovers have spiked above normal levels. Current rate: {{ $value }}" + + # Active provider count: critical when active_providers has been 0 for 1m + - alert: NoActiveProviders + expr: active_providers == 0 + for: 1m + labels: + severity: critical + team: ops + annotations: + summary: "No active providers" + description: "There are no active providers configured. The system cannot route requests."
+ + - alert: LowActiveProviders + expr: active_providers < 2 + for: 5m + labels: + severity: warning + team: ops + annotations: + summary: "Low number of active providers" + description: "Only {{ $value }} active provider(s) detected. Consider adding more for redundancy." + + # Active host count: critical when active_hosts has been 0 for 1m + - alert: NoActiveHosts + expr: active_hosts == 0 + for: 1m + labels: + severity: critical + team: ops + annotations: + summary: "No active hosts" + description: "There are no active hosts. The system cannot import providers." + + # Database connections: more than 50 active connections for 5m + - alert: HighDBConnections + expr: db_connections_active > 50 + for: 5m + labels: + severity: warning + team: ops + annotations: + summary: "High database connection count" + description: "Active DB connections: {{ $value }}. Consider connection pool tuning." + + # Database write rate: despite the alert name, this measures INSERT/UPDATE/DELETE throughput (ops/sec), not failed operations + - alert: DBOperationErrors + expr: | + rate(db_operations_total{operation=~"INSERT|UPDATE|DELETE"}[5m]) + > 100 + for: 2m + labels: + severity: warning + team: ops + annotations: + summary: "High database write rate" + description: "DB write operations are above threshold: {{ $value }} ops/sec" + + # Logging pipeline: any flush error in the last 5m, or more than 10 dropped events/sec + - alert: LogFlushErrors + expr: rate(log_flush_errors_total[5m]) > 0 + for: 1m + labels: + severity: warning + team: ops + annotations: + summary: "Log flush errors detected" + description: "Log flush errors have been detected. Check log storage/backend." + + - alert: LogDroppedEvents + expr: | + rate(log_dropped_events_total[5m]) > 10 + for: 1m + labels: + severity: warning + team: ops + annotations: + summary: "Log events being dropped" + description: "Log events are being dropped at {{ $value }} events/sec. Check log buffer capacity." + + # Batch import failure ratio. Both sides are aggregated with sum(): a bare rate()/rate() matches series on identical label sets, so status="failed" would only be divided by itself (ratio 1) and the 10% threshold would never be meaningful + - alert: BatchImportFailures + expr: | + ( + sum(rate(route_decisions_total{status="failed"}[5m])) + / + sum(rate(route_decisions_total[5m])) + ) > 0.1 + for: 5m + labels: + severity: warning + team: ops + annotations: + summary: "High batch import failure rate" + description: "Batch import failure rate is above 10%. 
Check provider configurations." + + # API auth failures: total 401 responses/sec summed across all methods and paths, so failures spread over many endpoints still count toward the threshold + - alert: AuthFailures + expr: | + sum(rate(http_requests_total{status="401"}[5m])) > 10 + for: 2m + labels: + severity: warning + team: security + annotations: + summary: "High authentication failure rate" + description: "Auth failures detected. Possible credential issues or attacks." + + # Health check: http_requests_total is a counter and never decreases, so alert on new non-200 /healthz responses within the last 5m instead of the raw counter value (which would keep firing forever after one failure) + - alert: HealthCheckFailing + expr: | + increase(http_requests_total{path="/healthz",status!="200"}[5m]) > 0 + for: 30s + labels: + severity: critical + team: ops + annotations: + summary: "Health check failing" + description: "The /healthz endpoint is returning non-200 status."