docs: project docs, scripts, deployment configs, and evidence
This commit is contained in:
11
deployment/alertmanager/alertmanager.env.example
Normal file
11
deployment/alertmanager/alertmanager.env.example
Normal file
@@ -0,0 +1,11 @@
|
||||
# Example values for the ALERTMANAGER_* variables used to configure
# Alertmanager notification channels.
# These are placeholders only: production should source the real values from a
# secrets manager, a CI/CD secret store, or another environment-specific secure
# deployment mechanism.

# SMTP sender, relay and credentials.
ALERTMANAGER_FROM=alertmanager@example.com
ALERTMANAGER_SMARTHOST=smtp.example.com:587
ALERTMANAGER_AUTH_USERNAME=alertmanager@example.com
ALERTMANAGER_AUTH_PASSWORD=replace-with-secret

# Notification recipients (default, critical and warning channels).
ALERTMANAGER_DEFAULT_TO=ops@example.com
ALERTMANAGER_CRITICAL_TO=oncall-critical@example.com
ALERTMANAGER_WARNING_TO=oncall-warning@example.com
|
||||
84
deployment/alertmanager/alertmanager.yml
Normal file
84
deployment/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,84 @@
|
||||
# NOTE: this file is a template. In production the `${ALERTMANAGER_*}`
# variables must be injected and rendered first; only the rendered result is
# then handed to Alertmanager.
global:
  resolve_timeout: 5m
|
||||
|
||||
# Alert routing tree. The root route groups alerts by alertname and service
# and names 'default' as its receiver.
route:
  receiver: 'default'
  group_by: ['alertname', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h

  # Child routes dispatch by severity. `matchers` is used instead of the
  # deprecated `match` field; the matching semantics are the same.
  routes:
    # Critical alerts: group_wait is shortened from 30s to 10s.
    - matchers:
        - 'severity = "critical"'
      receiver: 'critical-alerts'
      group_wait: 10s
      continue: true

    # Warning alerts.
    - matchers:
        - 'severity = "warning"'
      receiver: 'warning-alerts'
      continue: true
|
||||
|
||||
# Notification receivers. All three send e-mail through the same SMTP relay,
# sender and credentials; only the recipient and the Subject header differ.
# NOTE(review): `send_resolved` is not set on any e-mail config — confirm
# whether resolved notifications are wanted (the default receiver's Subject
# renders the group status, which suggests they might be).
receivers:
  # Default receiver: Subject is prefixed with the upper-cased group status.
  - name: 'default'
    email_configs:
      - smarthost: '${ALERTMANAGER_SMARTHOST}'
        auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
        auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
        from: '${ALERTMANAGER_FROM}'
        to: '${ALERTMANAGER_DEFAULT_TO}'
        headers:
          Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}'

  # Receiver for critical alerts: fixed [CRITICAL] Subject prefix.
  - name: 'critical-alerts'
    email_configs:
      - smarthost: '${ALERTMANAGER_SMARTHOST}'
        auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
        auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
        from: '${ALERTMANAGER_FROM}'
        to: '${ALERTMANAGER_CRITICAL_TO}'
        headers:
          Subject: '[CRITICAL] {{ .GroupLabels.alertname }}'

  # Receiver for warning alerts: fixed [WARNING] Subject prefix.
  - name: 'warning-alerts'
    email_configs:
      - smarthost: '${ALERTMANAGER_SMARTHOST}'
        auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
        auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
        from: '${ALERTMANAGER_FROM}'
        to: '${ALERTMANAGER_WARNING_TO}'
        headers:
          Subject: '[WARNING] {{ .GroupLabels.alertname }}'
|
||||
|
||||
# Inhibition rules.
inhibit_rules:
  # While a critical alert is firing, suppress warning alerts that carry the
  # same `service` label value. `source_matchers` / `target_matchers` are used
  # instead of the deprecated `source_match` / `target_match` fields.
  - source_matchers:
      - 'severity = "critical"'
    target_matchers:
      - 'severity = "warning"'
    equal: ['service']
|
||||
|
||||
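# Silence example (for reference only — do NOT uncomment).
# Silences are not a section of the Alertmanager configuration file; they are
# created at runtime through the Alertmanager UI, its HTTP API, or
# `amtool silence add`. Uncommenting the block below would make the
# configuration fail to load. It only illustrates the fields such a silence
# would carry (here: muting the info-level LowOnlineUsers alert during a
# maintenance window):
#
# silences:
#   - matchers:
#       - name: alertname
#         value: LowOnlineUsers
#       - name: severity
#         value: info
#     startsAt: "2026-03-12T00:00:00+08:00"
#     endsAt: "2026-03-12T23:59:59+08:00"
#     comment: "维护期间静默低在线用户告警"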
|
||||
133
deployment/alertmanager/alerts.yml
Normal file
133
deployment/alertmanager/alerts.yml
Normal file
@@ -0,0 +1,133 @@
|
||||
# Prometheus alerting rules for the user-management service.
# Every rule sets `service: user-management` plus a `severity` label
# (critical / warning / info).
groups:
  - name: user-ms-alerts
    interval: 30s
    rules:
      # High error rate: share of 5xx responses over 5m exceeds 5%.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高错误率告警"
          description: "过去5分钟错误率超过5%,当前值: {{ $value | humanizePercentage }}"

      # High response time: per-path P95 latency above 1 second.
      - alert: HighResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
          ) > 1
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高响应时间告警"
          description: "API P95响应时间超过1秒,路径: {{ $labels.path }},当前值: {{ $value }}s"

      # Low cache hit rate: hits / operations below 70%.
      - alert: LowCacheHitRate
        expr: |
          (
            sum(rate(cache_hits_total[5m]))
            /
            sum(rate(cache_operations_total[5m]))
          ) < 0.7
        for: 10m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "低缓存命中率告警"
          description: "缓存命中率低于70%,当前值: {{ $value | humanizePercentage }}"

      # High CPU usage: process CPU seconds consumed per second above 0.8.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) > 0.8
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高CPU使用率告警"
          description: "CPU使用率超过80%,当前值: {{ $value | humanizePercentage }}"

      # High memory usage: used / total memory above 85%.
      # NOTE(review): the two metrics look like they come from different
      # exporters. If their label sets differ, this division matches no series
      # and the alert can never fire — confirm, and add an explicit
      # `on(...)` / `ignoring(...)` clause if needed.
      - alert: HighMemoryUsage
        expr: |
          (
            system_memory_usage_bytes /
            (node_memory_MemTotal_bytes)
          ) > 0.85
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高内存使用率告警"
          description: "内存使用率超过85%,当前值: {{ $value | humanizePercentage }}"

      # Database connection pool: active / max connections above 90%.
      - alert: DatabaseConnectionPoolExhausted
        expr: |
          (
            db_connections_active /
            db_connections_max
          ) > 0.9
        for: 3m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "数据库连接池耗尽告警"
          description: "数据库连接池使用率超过90%,当前值: {{ $value | humanizePercentage }}"

      # Low online users: fewer than 10 active users for 30 minutes.
      - alert: LowOnlineUsers
        expr: active_users{period="5m"} < 10
        for: 30m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "在线用户数告警"
          description: "过去5分钟活跃用户数低于10,当前值: {{ $value }}"

      # High login failure rate: failed / total logins above 30%.
      - alert: HighLoginFailureRate
        expr: |
          (
            sum(rate(user_logins_total{status="failed"}[5m]))
            /
            sum(rate(user_logins_total[5m]))
          ) > 0.3
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高登录失败率告警"
          description: "登录失败率超过30%,可能存在暴力破解,当前值: {{ $value | humanizePercentage }}"

      # Unusual API request rate: current request rate deviates from its
      # 1-hour average by more than 50%.
      # The 1h average is taken with a subquery (`[1h:]`) fed to
      # avg_over_time(); the former `avg(... over 1h)` form is not valid
      # PromQL, and an invalid expression prevents the rule file from loading.
      - alert: UnusualAPIRequestRate
        expr: |
          abs(
            sum(rate(http_requests_total[5m]))
            -
            avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
          )
          /
          avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
          > 0.5
        for: 5m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "API请求量异常告警"
          description: "API请求量与1小时平均值偏差超过50%,当前值: {{ $value | humanizePercentage }}"
|
||||
143
deployment/grafana/dashboards/user-management.json
Normal file
143
deployment/grafana/dashboards/user-management.json
Normal file
@@ -0,0 +1,143 @@
|
||||
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": "-- Grafana --",
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "gnetId": null,
  "graphTooltip": 0,
  "id": null,
  "links": [],
  "panels": [
    {
      "datasource": "Prometheus",
      "fieldConfig": {
        "defaults": {
          "color": { "mode": "thresholds" },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "red", "value": null },
              { "color": "yellow", "value": 0.8 },
              { "color": "green", "value": 0.9 }
            ]
          },
          "unit": "percentunit"
        }
      },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
      "id": 1,
      "options": {
        "colorMode": "background",
        "graphMode": "area",
        "justifyMode": "auto",
        "orientation": "auto",
        "reduceOptions": {
          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
        "text": {},
        "textMode": "value"
      },
      "pluginVersion": "7.5.0",
      "targets": [
        {
          "expr": "sum(rate(http_requests_total{status=~\"2..\"}[5m])) / sum(rate(http_requests_total[5m]))",
          "legendFormat": "成功率",
          "refId": "A"
        }
      ],
      "title": "HTTP 请求成功率",
      "type": "stat"
    },
    {
      "datasource": "Prometheus",
      "fieldConfig": {
        "defaults": {
          "color": { "mode": "thresholds" },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "red", "value": 100 }
            ]
          },
          "unit": "ms"
        }
      },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
      "id": 2,
      "options": {
        "colorMode": "background",
        "graphMode": "area",
        "justifyMode": "auto",
        "orientation": "auto",
        "reduceOptions": {
          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
        "text": {},
        "textMode": "value"
      },
      "pluginVersion": "7.5.0",
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) * 1000",
          "legendFormat": "P95 响应时间",
          "refId": "A"
        }
      ],
      "title": "P95 响应时间",
      "type": "stat"
    }
  ],
  "refresh": "10s",
  "schemaVersion": 27,
  "style": "dark",
  "tags": ["user-management"],
  "title": "用户管理系统监控仪表板",
  "uid": "user-ms-dashboard",
  "version": 1
}
|
||||
Reference in New Issue
Block a user