docs: project docs, scripts, deployment configs, and evidence
This commit is contained in:
133
deployment/alertmanager/alerts.yml
Normal file
133
deployment/alertmanager/alerts.yml
Normal file
@@ -0,0 +1,133 @@
|
||||
# Prometheus alerting rules for the user-management service.
# Every rule carries a `service: user-management` label and a `severity`
# label (critical / warning / info).
groups:
  - name: user-ms-alerts
    # How often the rules in this group are evaluated.
    interval: 30s
    rules:
      # High error rate: share of HTTP 5xx responses among all requests
      # over the last 5 minutes is above 5%.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高错误率告警"
          description: "过去5分钟错误率超过5%,当前值: {{ $value | humanizePercentage }}"

      # High response time: per-path P95 request latency is above 1 second.
      - alert: HighResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
          ) > 1
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高响应时间告警"
          description: "API P95响应时间超过1秒,路径: {{ $labels.path }},当前值: {{ $value }}s"

      # Low cache hit rate: hits / total cache operations is below 70%.
      - alert: LowCacheHitRate
        expr: |
          (
            sum(rate(cache_hits_total[5m]))
            /
            sum(rate(cache_operations_total[5m]))
          ) < 0.7
        for: 10m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "低缓存命中率告警"
          description: "缓存命中率低于70%,当前值: {{ $value | humanizePercentage }}"

      # High CPU usage.
      # NOTE(review): rate(process_cpu_seconds_total) is CPU-seconds per second
      # for each process (i.e. cores used), so 0.8 means 80% of ONE core, not
      # 80% of the host - confirm this is the intended threshold.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) > 0.8
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高CPU使用率告警"
          description: "CPU使用率超过80%,当前值: {{ $value | humanizePercentage }}"

      # High memory usage: used bytes / total bytes is above 85%.
      # NOTE(review): the two metrics presumably come from different exporters;
      # the division only yields samples when both sides carry identical label
      # sets - verify, or add on(...)/ignoring(...) matching.
      - alert: HighMemoryUsage
        expr: |
          (
            system_memory_usage_bytes /
            (node_memory_MemTotal_bytes)
          ) > 0.85
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高内存使用率告警"
          description: "内存使用率超过85%,当前值: {{ $value | humanizePercentage }}"

      # Database connection pool: active / max connections is above 90%.
      - alert: DatabaseConnectionPoolExhausted
        expr: |
          (
            db_connections_active /
            db_connections_max
          ) > 0.9
        for: 3m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "数据库连接池耗尽告警"
          description: "数据库连接池使用率超过90%,当前值: {{ $value | humanizePercentage }}"

      # Low number of online users: the 5-minute active-user gauge stays
      # below 10 for 30 minutes.
      - alert: LowOnlineUsers
        expr: active_users{period="5m"} < 10
        for: 30m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "在线用户数告警"
          description: "过去5分钟活跃用户数低于10,当前值: {{ $value }}"

      # High login failure rate: failed logins / all logins is above 30%.
      - alert: HighLoginFailureRate
        expr: |
          (
            sum(rate(user_logins_total{status="failed"}[5m]))
            /
            sum(rate(user_logins_total[5m]))
          ) > 0.3
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高登录失败率告警"
          description: "登录失败率超过30%,可能存在暴力破解,当前值: {{ $value | humanizePercentage }}"

      # Unusual API request rate: the current request rate deviates from its
      # 1-hour average by more than 50%.
      # The 1-hour average is computed with a subquery ([1h:]) fed into
      # avg_over_time; PromQL has no "avg(... over 1h)" syntax.
      - alert: UnusualAPIRequestRate
        expr: |
          abs(
            sum(rate(http_requests_total[5m]))
            -
            avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
          )
          /
          avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
          > 0.5
        for: 5m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "API请求量异常告警"
          description: "API请求量与1小时平均值偏差超过50%,当前值: {{ $value | humanizePercentage }}"
|
||||
Reference in New Issue
Block a user