docs: project docs, scripts, deployment configs, and evidence
This commit is contained in:
11
deployment/alertmanager/alertmanager.env.example
Normal file
11
deployment/alertmanager/alertmanager.env.example
Normal file
@@ -0,0 +1,11 @@
|
||||
# Example environment file for injecting Alertmanager notification channel settings.
# In production, source these values from a secrets manager, a CI/CD secret store,
# or another environment-specific secure deployment mechanism.

# Notification recipients (one per receiver).
ALERTMANAGER_DEFAULT_TO=ops@example.com
ALERTMANAGER_CRITICAL_TO=oncall-critical@example.com
ALERTMANAGER_WARNING_TO=oncall-warning@example.com

# Sender address and SMTP relay settings shared by all receivers.
ALERTMANAGER_FROM=alertmanager@example.com
ALERTMANAGER_SMARTHOST=smtp.example.com:587
ALERTMANAGER_AUTH_USERNAME=alertmanager@example.com
# Placeholder only; never commit a real password.
ALERTMANAGER_AUTH_PASSWORD=replace-with-secret
|
||||
84
deployment/alertmanager/alertmanager.yml
Normal file
84
deployment/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,84 @@
|
||||
global:
  resolve_timeout: 5m

# NOTE:
# This file is a template. In production the `${ALERTMANAGER_*}` variables must
# be injected and rendered first, and only the rendered result should be handed
# to Alertmanager.
|
||||
|
||||
# Alert routing tree.
route:
  group_by:
    - alertname
    - service
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  # Receiver used when no child route below matches.
  receiver: default

  # Child routes: dispatch by the `severity` label.
  routes:
    # Critical alerts (shorter group_wait than the root route).
    - match:
        severity: critical
      receiver: critical-alerts
      group_wait: 10s
      # Keep evaluating the sibling routes that follow after this one matches.
      continue: true

    # Warning alerts.
    - match:
        severity: warning
      receiver: warning-alerts
      continue: true
|
||||
|
||||
# Notification receivers.
# Every receiver sends e-mail through the same SMTP relay; only the recipient
# and the Subject header differ. Placeholders stay single-quoted so that the
# rendered values are taken literally (no backslash escape processing).
receivers:
  # Default receiver.
  # NOTE(review): the Subject renders the notification status, but e-mail
  # receivers do not send resolved notifications unless `send_resolved: true`
  # is set — confirm whether resolved mails are wanted here.
  - name: default
    email_configs:
      - to: '${ALERTMANAGER_DEFAULT_TO}'
        from: '${ALERTMANAGER_FROM}'
        smarthost: '${ALERTMANAGER_SMARTHOST}'
        auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
        auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
        headers:
          Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}'

  # Receiver for critical alerts.
  - name: critical-alerts
    email_configs:
      - to: '${ALERTMANAGER_CRITICAL_TO}'
        from: '${ALERTMANAGER_FROM}'
        smarthost: '${ALERTMANAGER_SMARTHOST}'
        auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
        auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
        headers:
          Subject: '[CRITICAL] {{ .GroupLabels.alertname }}'

  # Receiver for warning alerts.
  - name: warning-alerts
    email_configs:
      - to: '${ALERTMANAGER_WARNING_TO}'
        from: '${ALERTMANAGER_FROM}'
        smarthost: '${ALERTMANAGER_SMARTHOST}'
        auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
        auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
        headers:
          Subject: '[WARNING] {{ .GroupLabels.alertname }}'
|
||||
|
||||
# Inhibition rules.
inhibit_rules:
  # While a critical alert is firing, suppress warning alerts that carry the
  # same `service` label value.
  - source_match:
      severity: critical
    target_match:
      severity: warning
    equal:
      - service
|
||||
|
||||
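# Silences (illustrative example only — do NOT uncomment).
# Alertmanager's configuration file has no `silences` section; an unknown
# top-level key makes the configuration fail to load. Silences are created at
# runtime through the web UI, `amtool silence add`, or the HTTP API. The block
# below only documents the matchers and time window of a typical silence.
# silences:
#   - matchers:
#       - name: alertname
#         value: LowOnlineUsers
#       - name: severity
#         value: info
#     startsAt: "2026-03-12T00:00:00+08:00"
#     endsAt: "2026-03-12T23:59:59+08:00"
#     comment: "维护期间静默低在线用户告警"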
|
||||
133
deployment/alertmanager/alerts.yml
Normal file
133
deployment/alertmanager/alerts.yml
Normal file
@@ -0,0 +1,133 @@
|
||||
groups:
|
||||
- name: user-ms-alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# High error rate: share of 5xx responses among all HTTP requests over the
# last 5 minutes exceeds 5%.
- alert: HighErrorRate
  expr: |
    (
      sum(rate(http_requests_total{status=~"5.."}[5m]))
      /
      sum(rate(http_requests_total[5m]))
    ) > 0.05
  for: 5m
  labels:
    severity: critical
    service: user-management
  annotations:
    summary: '高错误率告警'
    description: '过去5分钟错误率超过5%,当前值: {{ $value | humanizePercentage }}'
|
||||
|
||||
# High response time: P95 request duration above 1 second, evaluated per
# `path` (one alert instance per path).
- alert: HighResponseTime
  expr: |
    histogram_quantile(
      0.95,
      sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
    ) > 1
  for: 5m
  labels:
    severity: warning
    service: user-management
  annotations:
    summary: '高响应时间告警'
    description: 'API P95响应时间超过1秒,路径: {{ $labels.path }},当前值: {{ $value }}s'
|
||||
|
||||
# Low cache hit rate: hits divided by all cache operations stays below 70%.
- alert: LowCacheHitRate
  expr: |
    (
      sum(rate(cache_hits_total[5m]))
      /
      sum(rate(cache_operations_total[5m]))
    ) < 0.7
  for: 10m
  labels:
    severity: warning
    service: user-management
  annotations:
    summary: '低缓存命中率告警'
    description: '缓存命中率低于70%,当前值: {{ $value | humanizePercentage }}'
|
||||
|
||||
# High CPU usage.
# The rate of `process_cpu_seconds_total` is CPU seconds consumed per second,
# i.e. the number of cores the process keeps busy; 0.8 means 80% of one core,
# not 80% of the whole host.
- alert: HighCPUUsage
  expr: rate(process_cpu_seconds_total[5m]) > 0.8
  for: 5m
  labels:
    severity: warning
    service: user-management
  annotations:
    summary: '高CPU使用率告警'
    description: 'CPU使用率超过80%,当前值: {{ $value | humanizePercentage }}'
|
||||
|
||||
# High memory usage: used bytes divided by total bytes exceeds 85%.
# NOTE(review): the two metrics look like they come from different exporters.
# A plain `/` only pairs series whose label sets match exactly; if the labels
# differ the expression returns nothing and the alert never fires — verify.
- alert: HighMemoryUsage
  expr: |
    (
      system_memory_usage_bytes
      /
      (node_memory_MemTotal_bytes)
    ) > 0.85
  for: 5m
  labels:
    severity: critical
    service: user-management
  annotations:
    summary: '高内存使用率告警'
    description: '内存使用率超过85%,当前值: {{ $value | humanizePercentage }}'
|
||||
|
||||
# Database connection pool nearly exhausted: active connections exceed 90% of
# the configured maximum.
- alert: DatabaseConnectionPoolExhausted
  expr: |
    (
      db_connections_active
      /
      db_connections_max
    ) > 0.9
  for: 3m
  labels:
    severity: critical
    service: user-management
  annotations:
    summary: '数据库连接池耗尽告警'
    description: '数据库连接池使用率超过90%,当前值: {{ $value | humanizePercentage }}'
|
||||
|
||||
# Low online users: the 5-minute active-user gauge stays below 10 for
# 30 minutes. Informational only.
- alert: LowOnlineUsers
  expr: active_users{period="5m"} < 10
  for: 30m
  labels:
    severity: info
    service: user-management
  annotations:
    summary: '在线用户数告警'
    description: '过去5分钟活跃用户数低于10,当前值: {{ $value }}'
|
||||
|
||||
# High login failure rate: failed logins exceed 30% of all login attempts,
# which may indicate a brute-force attempt.
- alert: HighLoginFailureRate
  expr: |
    (
      sum(rate(user_logins_total{status="failed"}[5m]))
      /
      sum(rate(user_logins_total[5m]))
    ) > 0.3
  for: 5m
  labels:
    severity: warning
    service: user-management
  annotations:
    summary: '高登录失败率告警'
    description: '登录失败率超过30%,可能存在暴力破解,当前值: {{ $value | humanizePercentage }}'
|
||||
|
||||
# Unusual API request rate: the current request rate deviates from its
# 1-hour average by more than 50% (in either direction).
# `avg(... over 1h)` is not valid PromQL; the 1-hour average of an instant
# expression is taken with a subquery (`<expr>[1h:]`) fed to avg_over_time.
# The subquery step is left empty so it follows the global evaluation interval.
- alert: UnusualAPIRequestRate
  expr: |
    abs(
      sum(rate(http_requests_total[5m]))
      -
      avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
    )
    /
    avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
    > 0.5
  for: 5m
  labels:
    severity: info
    service: user-management
  annotations:
    summary: 'API请求量异常告警'
    description: 'API请求量与1小时平均值偏差超过50%,当前值: {{ $value | humanizePercentage }}'
|
||||
Reference in New Issue
Block a user