docs: project docs, scripts, deployment configs, and evidence

This commit is contained in:
2026-04-02 11:22:17 +08:00
parent 4718980ab5
commit bbeeb63dfa
396 changed files with 165018 additions and 0 deletions

View File

@@ -0,0 +1,133 @@
# Prometheus alerting rules for the user-management service.
# Every rule carries `service: user-management` plus a `severity` label
# (critical / warning / info).
groups:
  - name: user-ms-alerts
    # Evaluation interval for every rule in this group.
    interval: 30s
    rules:
      # High error rate: share of HTTP requests answered with a 5xx status
      # over the last 5 minutes exceeds 5%.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高错误率告警"
          description: "过去5分钟错误率超过5%,当前值: {{ $value | humanizePercentage }}"

      # High response time: P95 request latency, computed per `path` from the
      # duration histogram, stays above 1 second.
      - alert: HighResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
          ) > 1
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高响应时间告警"
          description: "API P95响应时间超过1秒,路径: {{ $labels.path }},当前值: {{ $value }}s"

      # Low cache hit rate: hits / total cache operations drops below 70%.
      - alert: LowCacheHitRate
        expr: |
          (
            sum(rate(cache_hits_total[5m]))
            /
            sum(rate(cache_operations_total[5m]))
          ) < 0.7
        for: 10m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "低缓存命中率告警"
          description: "缓存命中率低于70%,当前值: {{ $value | humanizePercentage }}"

      # High CPU usage.
      # NOTE(review): rate() of a CPU-seconds counter yields CPU-seconds per
      # second, so 0.8 presumably means 80% of ONE core, not of the whole
      # host - confirm this is the intended threshold.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) > 0.8
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高CPU使用率告警"
          description: "CPU使用率超过80%,当前值: {{ $value | humanizePercentage }}"

      # High memory usage: used bytes / total bytes exceeds 85%.
      # NOTE(review): a one-to-one binary operation only matches series whose
      # label sets are identical. If the two metrics are scraped from
      # different targets this expression returns nothing and the alert can
      # never fire - confirm, and add on(...)/ignoring(...) if needed.
      - alert: HighMemoryUsage
        expr: |
          (
            system_memory_usage_bytes
            /
            node_memory_MemTotal_bytes
          ) > 0.85
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高内存使用率告警"
          description: "内存使用率超过85%,当前值: {{ $value | humanizePercentage }}"

      # Database connection pool nearly exhausted: active / max above 90%.
      - alert: DatabaseConnectionPoolExhausted
        expr: |
          (
            db_connections_active
            /
            db_connections_max
          ) > 0.9
        for: 3m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "数据库连接池耗尽告警"
          description: "数据库连接池使用率超过90%,当前值: {{ $value | humanizePercentage }}"

      # Low number of online users: the 5-minute active-user gauge stays
      # below 10 for half an hour.
      - alert: LowOnlineUsers
        expr: active_users{period="5m"} < 10
        for: 30m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "在线用户数告警"
          description: "过去5分钟活跃用户数低于10,当前值: {{ $value }}"

      # High login failure rate: more than 30% of logins fail.
      - alert: HighLoginFailureRate
        expr: |
          (
            sum(rate(user_logins_total{status="failed"}[5m]))
            /
            sum(rate(user_logins_total[5m]))
          ) > 0.3
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高登录失败率告警"
          description: "登录失败率超过30%,可能存在暴力破解,当前值: {{ $value | humanizePercentage }}"

      # Unusual API request rate: current QPS deviates from its 1-hour
      # average by more than 50%.
      # The average is taken with a subquery ([1h:] = last hour, sampled at
      # the default resolution) fed to avg_over_time(); PromQL has no
      # `avg(... over 1h)` syntax.
      - alert: UnusualAPIRequestRate
        expr: |
          abs(
            sum(rate(http_requests_total[5m]))
            -
            avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
          )
          /
          avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
          > 0.5
        for: 5m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "API请求量异常告警"
          description: "API请求量与1小时平均值偏差超过50%,当前值: {{ $value | humanizePercentage }}"