- Remove old review reports (keep latest only) - Move docs/ to deploy/docs-backup/ - Move performance-testing/ to deploy/ - Clean up test output files - Organize root directory
463 lines
16 KiB
YAML
463 lines
16 KiB
YAML
# Sub2API 单机版轻量告警规则
# 针对 2核4G 环境优化,避免告警风暴

groups:
# ==================== 系统资源告警 ====================
# Host resource alerts (CPU, memory, root filesystem), evaluated every 60s.
- name: system-alerts
  interval: 60s
  rules:
    # Non-idle CPU above 80%, averaged per instance, sustained for 5 minutes.
    # rate() is used instead of irate(): irate() only looks at the last two
    # samples in the window, so one quiet scrape can reset the `for` timer.
    - alert: HighCPUUsage
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
      for: 5m
      labels:
        severity: warning
        category: infrastructure
      annotations:
        summary: "CPU 使用率过高"
        description: "实例 {{ $labels.instance }} CPU 使用率超过 80%,当前值: {{ $value }}%"

    # Used memory (total minus available) above 85% of total for 5 minutes.
    - alert: HighMemoryUsage
      expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
      for: 5m
      labels:
        severity: warning
        category: infrastructure
      annotations:
        summary: "内存使用率过高"
        description: "实例 {{ $labels.instance }} 内存使用率超过 85%,当前值: {{ $value }}%"

    # Less than 10% of the root filesystem ("/") is still available.
    - alert: DiskSpaceLow
      expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 10
      for: 5m
      labels:
        severity: critical
        category: infrastructure
      annotations:
        summary: "磁盘空间不足"
        description: "实例 {{ $labels.instance }} 磁盘剩余空间不足 10%,当前值: {{ $value }}%"

    # Linear extrapolation of the last 6h of free space on "/" reaches zero
    # within the next 24h (24*3600 seconds); must hold for 1h before firing.
    - alert: DiskWillFillIn24Hours
      expr: predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[6h], 24*3600) < 0
      for: 1h
      labels:
        severity: warning
        category: infrastructure
      annotations:
        summary: "磁盘预计24小时内写满"
        description: "实例 {{ $labels.instance }} 按当前趋势,磁盘将在24小时内写满"

# ==================== 应用健康告警 ====================
# Application availability and latency alerts for the "sub2api-app" scrape
# job, evaluated every 30s.
- name: application-alerts
  interval: 30s
  rules:
    # The application target has failed its scrape for at least 1 minute.
    - alert: Sub2APIDown
      expr: up{job="sub2api-app"} == 0
      for: 1m
      labels:
        severity: critical
        category: availability
      annotations:
        summary: "Sub2API 服务不可用"
        description: "Sub2API 应用实例 {{ $labels.instance }} 已宕机超过 1 分钟"

    # Share of 5xx responses among all requests (5m rate) above 5%.
    - alert: HighErrorRate
      expr: |
        (
          sum(rate(sub2api_http_requests_total{job="sub2api-app",status=~"5.."}[5m]))
          /
          sum(rate(sub2api_http_requests_total{job="sub2api-app"}[5m]))
        ) > 0.05
      for: 2m
      labels:
        severity: warning
        category: availability
      annotations:
        summary: "错误率过高"
        description: "API 错误率超过 5%,当前值: {{ $value | humanizePercentage }}"

    # 99th-percentile request duration, from the histogram buckets, over 5s.
    - alert: HighLatencyP99
      expr: histogram_quantile(0.99, sum(rate(sub2api_http_request_duration_seconds_bucket{job="sub2api-app"}[5m])) by (le)) > 5
      for: 3m
      labels:
        severity: warning
        category: performance
      annotations:
        summary: "P99 延迟过高"
        description: "API P99 延迟超过 5 秒,当前值: {{ $value }}s"

    # 95th-percentile request duration over 2s; informational only.
    - alert: HighLatencyP95
      expr: histogram_quantile(0.95, sum(rate(sub2api_http_request_duration_seconds_bucket{job="sub2api-app"}[5m])) by (le)) > 2
      for: 5m
      labels:
        severity: info
        category: performance
      annotations:
        summary: "P95 延迟升高"
        description: "API P95 延迟超过 2 秒,当前值: {{ $value }}s"

# ==================== 数据库告警 ====================
# Database connection-pool alerts, evaluated every 60s.
- name: database-alerts
  interval: 60s
  rules:
    # Active connections above 80% of the pool maximum for 5 minutes.
    # The two operands carry different `state` label values ("active" vs
    # "max"); binary operators match series on their full label set, so
    # `ignoring(state)` is required for the division to produce any result.
    - alert: DatabaseConnectionsHigh
      expr: |
        (
          sub2api_db_connections{state="active"}
          / ignoring(state)
          sub2api_db_connections{state="max"}
        ) > 0.8
      for: 5m
      labels:
        severity: warning
        category: database
      annotations:
        summary: "数据库连接池使用率过高"
        description: "数据库连接池使用率超过 80%,当前值: {{ $value | humanizePercentage }}"

# ==================== 业务指标告警 ====================
# Business-level alerts (upstream success ratio, rate limiting), evaluated
# every 60s.
- name: business-alerts
  interval: 60s
  rules:
    # Ratio of upstream requests with status="success" to all upstream
    # requests (10m rate) below 95%.
    - alert: LowUpstreamSuccessRate
      expr: |
        (
          sum(rate(sub2api_upstream_requests_total{status="success"}[10m]))
          /
          sum(rate(sub2api_upstream_requests_total[10m]))
        ) < 0.95
      for: 5m
      labels:
        severity: warning
        category: business
      annotations:
        summary: "上游服务成功率下降"
        description: "上游服务成功率低于 95%,当前值: {{ $value | humanizePercentage }}"

    # More than 10 rate-limit hits per second (5m rate), evaluated per
    # series; informational only.
    - alert: HighRateLimitHits
      expr: rate(sub2api_rate_limit_hits_total[5m]) > 10
      for: 5m
      labels:
        severity: info
        category: business
      annotations:
        summary: "限流触发频繁"
        description: "限流触发频率过高,当前: {{ $value }}/s"

# ==================== Prometheus 自身监控告警 ====================
# Alerts about the Prometheus server itself and its scrape targets,
# evaluated every 60s.
# NOTE(review): the prometheus-self-monitoring group in this file defines
# near-identical rules (target down, TSDB reload failures, rule evaluation
# failures, storage above 1.8GB), including a second alert named
# PrometheusTargetMissing. Confirm whether both groups are intended.
- name: prometheus-alerts
  interval: 60s
  rules:
    # Any scrape target down for 5 minutes.
    # NOTE(review): `up == 0` also matches job="sub2api-app", which
    # Sub2APIDown already covers, so an application outage fires both.
    - alert: PrometheusTargetMissing
      expr: up == 0
      for: 5m
      labels:
        severity: warning
        category: monitoring
      annotations:
        summary: "Prometheus 抓取目标丢失"
        description: "{{ $labels.job }} 实例 {{ $labels.instance }} 无法访问"

    # Resident memory of the job="prometheus" process, converted from
    # bytes to MiB, above 100.
    - alert: PrometheusHighMemoryUsage
      expr: |
        (
          process_resident_memory_bytes{job="prometheus"} / 1024 / 1024
        ) > 100
      for: 5m
      labels:
        severity: warning
        category: monitoring
      annotations:
        summary: "Prometheus 内存使用过高"
        description: "Prometheus 内存使用超过 100MB,当前: {{ $value }}MB"

    # The TSDB reload failure counter is increasing.
    - alert: PrometheusTSDBReloadsFailing
      expr: rate(prometheus_tsdb_reloads_failures_total[5m]) > 0
      for: 5m
      labels:
        severity: critical
        category: monitoring
      annotations:
        summary: "Prometheus TSDB 重载失败"
        description: "Prometheus 时序数据库重载失败,可能需要人工介入"

    # The rule evaluation failure counter is increasing.
    - alert: PrometheusRuleEvaluationFailures
      expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0
      for: 5m
      labels:
        severity: warning
        category: monitoring
      annotations:
        summary: "Prometheus 规则评估失败"
        description: "告警规则评估失败,请检查规则配置"

    # Size of persisted TSDB blocks, converted from bytes to GiB, above
    # 1.8. The description refers to a 2GB limit.
    - alert: PrometheusDiskSpaceLow
      expr: |
        (
          prometheus_tsdb_storage_blocks_bytes / 1024 / 1024 / 1024
        ) > 1.8
      for: 5m
      labels:
        severity: critical
        category: monitoring
      annotations:
        summary: "Prometheus 磁盘空间不足"
        description: "Prometheus 数据占用超过 1.8GB,接近 2GB 限制"

# ==================== 证书过期告警 ====================
# TLS certificate expiry alerts based on the application-exported
# sub2api_tls_certificate_expiry_timestamp metric.
- name: certificate-alerts
  interval: 3600s  # evaluated once per hour
  rules:
    # Certificate expires in less than 7 days.
    # The expression is kept in seconds (threshold 7 * 86400) because the
    # description pipes $value into humanizeDuration, which interprets its
    # input as seconds; dividing by 86400 first would render days as "Ns".
    - alert: TLSCertificateExpiringSoon7Days
      expr: sub2api_tls_certificate_expiry_timestamp - time() < 7 * 86400
      for: 1h
      labels:
        severity: warning
        category: security
      annotations:
        summary: "TLS 证书即将过期 (7天内)"
        description: "{{ $labels.domain }} 证书将在 {{ $value | humanizeDuration }} 后过期"

    # Certificate expires in less than 3 days; escalated to critical.
    # Kept in seconds for the same humanizeDuration reason as above.
    - alert: TLSCertificateExpiringSoon3Days
      expr: sub2api_tls_certificate_expiry_timestamp - time() < 3 * 86400
      for: 1h
      labels:
        severity: critical
        category: security
      annotations:
        summary: "TLS 证书即将过期 (3天内)"
        description: "{{ $labels.domain }} 证书将在 {{ $value | humanizeDuration }} 后过期,请立即续期"

    # Expiry timestamp is already in the past.
    - alert: TLSCertificateExpired
      expr: sub2api_tls_certificate_expiry_timestamp - time() < 0
      for: 1m
      labels:
        severity: critical
        category: security
      annotations:
        summary: "TLS 证书已过期"
        description: "{{ $labels.domain }} 证书已过期,服务可能无法正常访问"

# ==================== 备份任务告警 ====================
# Backup job heartbeat alerts based on
# sub2api_job_heartbeat_last_success_timestamp.
# NOTE(review): this group selects job_name="database_backup", while the
# backup-and-job-alerts group in this file selects job_name="db-backup" and
# also defines an alert named DatabaseBackupMissing with a different
# meaning. Confirm which label value the backup script actually reports.
- name: backup-alerts
  interval: 3600s  # evaluated once per hour
  rules:
    # Last successful database backup is older than 90000s (25 hours).
    - alert: DatabaseBackupFailed
      expr: |
        time() - sub2api_job_heartbeat_last_success_timestamp{job_name="database_backup"} > 90000
      for: 1h
      labels:
        severity: warning
        category: backup
      annotations:
        summary: "数据库备份失败"
        description: "数据库备份已超过 25 小时未成功执行"

    # The heartbeat series for the database backup does not exist at all.
    - alert: DatabaseBackupMissing
      expr: absent(sub2api_job_heartbeat_last_success_timestamp{job_name="database_backup"})
      for: 1h
      labels:
        severity: info
        category: backup
      annotations:
        summary: "数据库备份监控未配置"
        description: "未检测到数据库备份心跳指标,请检查备份脚本是否上报"

    # Last successful Grafana config backup is older than 172800s (48 hours).
    - alert: GrafanaConfigBackupFailed
      expr: |
        time() - sub2api_job_heartbeat_last_success_timestamp{job_name="grafana_backup"} > 172800
      for: 1h
      labels:
        severity: info
        category: backup
      annotations:
        summary: "Grafana 配置备份失败"
        description: "Grafana 配置备份已超过 48 小时未成功执行"

# ==================== SLO 基础告警 (简化版) ====================
# Simplified SLO burn-rate alerts, evaluated every 60s.
# Availability SLO: 99.9% over a 30-day window, i.e. an error budget of
# 0.001 (0.1% of requests may fail). An alert threshold is
# burn_rate * error_budget.
- name: slo-alerts
  interval: 60s
  rules:
    # Fast burn: a burn rate of 14.4 sustained over 1h consumes 2% of the
    # 30-day budget (14.4 * 1h / 720h). Threshold = 14.4 * 0.001 = 0.0144.
    # The previous threshold (0.0144 * 14.4, about 0.207) applied the burn
    # rate twice and only fired above a ~20.7% error ratio.
    # NOTE(review): at 14.4x the full budget lasts roughly 50h, not 1h as
    # the description text suggests; wording left unchanged.
    - alert: SLOErrorBudgetBurnRateFast
      expr: |
        (
          sum(rate(sub2api_http_requests_total{job="sub2api-app",status=~"5.."}[1h]))
          /
          sum(rate(sub2api_http_requests_total{job="sub2api-app"}[1h]))
        ) > 14.4 * 0.001
      for: 5m
      labels:
        severity: critical
        category: slo
        slo: availability
      annotations:
        summary: "SLO 错误预算快速燃烧"
        description: "可用性错误预算正在快速消耗,可能在1小时内耗尽"

    # Slow burn: a burn rate of 6 sustained over 6h consumes 5% of the
    # 30-day budget (6 * 6h / 720h). Threshold = 6 * 0.001 = 0.006.
    # The previous threshold (0.0144 * 6, about 0.086) was 14.4x too high.
    # NOTE(review): at 6x the full budget lasts roughly 5 days, not 6h as
    # the description text suggests; wording left unchanged.
    - alert: SLOErrorBudgetBurnRateSlow
      expr: |
        (
          sum(rate(sub2api_http_requests_total{job="sub2api-app",status=~"5.."}[6h]))
          /
          sum(rate(sub2api_http_requests_total{job="sub2api-app"}[6h]))
        ) > 6 * 0.001
      for: 30m
      labels:
        severity: warning
        category: slo
        slo: availability
      annotations:
        summary: "SLO 错误预算慢速燃烧"
        description: "可用性错误预算正在持续消耗,可能在6小时内耗尽"

# ==================== Prometheus 自身监控 ====================
# Prometheus self-monitoring alerts, evaluated every 60s.
# NOTE(review): the prometheus-alerts group in this file already defines an
# alert named PrometheusTargetMissing (warning, for: 5m) plus overlapping
# TSDB, rule-evaluation and storage rules. Confirm whether both are wanted.
- name: prometheus-self-monitoring
  interval: 60s
  rules:
    # A scrape target has been down for 3 minutes.
    - alert: PrometheusTargetMissing
      expr: up == 0
      for: 3m
      labels:
        severity: critical
        category: monitoring
      annotations:
        summary: "监控采集目标下线"
        description: "目标 {{ $labels.job }}/{{ $labels.instance }} 已无法采集超过 3 分钟"

    # Scrapes taking longer than 10s (may affect data precision).
    - alert: PrometheusScrapeDurationHigh
      expr: scrape_duration_seconds > 10
      for: 5m
      labels:
        severity: warning
        category: monitoring
      annotations:
        summary: "Prometheus 采集耗时过长"
        description: "目标 {{ $labels.job }} 采集耗时 {{ $value }}s,超过 10s"

    # Any TSDB reload failure within the last hour; fires immediately.
    - alert: PrometheusTSDBReloadFailing
      expr: increase(prometheus_tsdb_reloads_failures_total[1h]) > 0
      for: 0m
      labels:
        severity: critical
        category: monitoring
      annotations:
        summary: "Prometheus TSDB 重载失败"
        description: "过去 1 小时内 TSDB 重载出现失败,可能影响数据持久化"

    # WAL corruption counter increased in the last 5 minutes; fires
    # immediately.
    - alert: PrometheusWALCorruption
      expr: increase(prometheus_tsdb_wal_corruptions_total[5m]) > 0
      for: 0m
      labels:
        severity: critical
        category: monitoring
      annotations:
        summary: "Prometheus WAL 损坏"
        description: "Prometheus WAL 出现损坏,请立即检查数据完整性"

    # Alerting/recording rule evaluations are failing.
    - alert: PrometheusRuleEvaluationFailing
      expr: increase(prometheus_rule_evaluation_failures_total[5m]) > 0
      for: 5m
      labels:
        severity: warning
        category: monitoring
      annotations:
        summary: "告警规则评估失败"
        description: "Prometheus 有告警规则评估失败,可能导致漏报"

    # Persisted TSDB block size, converted from bytes to GiB, above 1.8.
    # The description refers to a 2GB cap.
    - alert: PrometheusStorageFull
      expr: |
        (prometheus_tsdb_storage_blocks_bytes / 1024 / 1024 / 1024) > 1.8
      for: 10m
      labels:
        severity: warning
        category: monitoring
      annotations:
        summary: "Prometheus 存储接近上限"
        description: "Prometheus 存储已用 {{ $value | humanize }}GB,接近 2GB 上限,请及时清理"

# ==================== 证书过期告警 ====================
# TLS certificate expiry alerts based on probe_ssl_earliest_cert_expiry.
# Group names must be unique within a rule file and this file also defines
# a `certificate-alerts` group, so this group carries a distinct name.
- name: certificate-probe-alerts
  interval: 300s  # evaluated every 5 minutes; higher frequency is not needed
  rules:
    # Certificate expires in less than 7 days.
    # The expression is kept in seconds (threshold 7 * 86400) because the
    # description pipes $value into humanizeDuration, which interprets its
    # input as seconds; dividing by 86400 first would render days as "Ns".
    - alert: TLSCertificateExpiringSoon
      expr: probe_ssl_earliest_cert_expiry - time() < 7 * 86400
      for: 1h
      labels:
        severity: warning
        category: security
      annotations:
        summary: "TLS 证书即将过期"
        description: "{{ $labels.instance }} 的证书将在 {{ $value | humanizeDuration }} 后过期,请及时续期"

    # Certificate expires in less than 3 days; escalated to critical and
    # fired immediately. Kept in seconds for humanizeDuration as above.
    - alert: TLSCertificateExpiringCritical
      expr: probe_ssl_earliest_cert_expiry - time() < 3 * 86400
      for: 0m
      labels:
        severity: critical
        category: security
      annotations:
        summary: "TLS 证书紧急过期警告"
        description: "{{ $labels.instance }} 的证书将在 {{ $value | humanizeDuration }} 后过期!"

    # Certificate has already expired.
    - alert: TLSCertificateExpired
      expr: probe_ssl_earliest_cert_expiry - time() <= 0
      for: 0m
      labels:
        severity: critical
        category: security
      annotations:
        summary: "TLS 证书已过期!"
        description: "{{ $labels.instance }} 的 TLS 证书已过期,请立即续期"

# ==================== 备份与定时任务监控 ====================
# Backup and scheduled-job heartbeat alerts, evaluated every 300s.
# NOTE(review): this group selects job_name="db-backup", while the
# backup-alerts group in this file selects job_name="database_backup" and
# also defines an alert named DatabaseBackupMissing. Confirm which label
# value the backup job actually reports.
- name: backup-and-job-alerts
  interval: 300s
  rules:
    # No successful database backup for more than 90000s (25 hours, i.e. a
    # daily schedule plus 1 hour of tolerance); fires immediately.
    - alert: DatabaseBackupMissing
      expr: time() - sub2api_job_heartbeat_last_success_timestamp{job_name="db-backup"} > 90000
      for: 0m
      labels:
        severity: warning
        category: backup
      annotations:
        summary: "数据库备份超时未完成"
        description: "距上次成功备份已超过 25 小时,请检查备份任务"

    # No successful database backup for more than 176400s (49 hours);
    # escalated to critical.
    - alert: DatabaseBackupCriticallyMissing
      expr: time() - sub2api_job_heartbeat_last_success_timestamp{job_name="db-backup"} > 176400
      for: 0m
      labels:
        severity: critical
        category: backup
      annotations:
        summary: "数据库备份连续 2 天未完成!"
        description: "距上次成功备份已超过 49 小时,存在数据丢失风险"

    # The OpsMetricsCollector job heartbeat is older than 180s (3 minutes).
    - alert: OpsCollectorJobStale
      expr: time() - sub2api_job_heartbeat_last_success_timestamp{job_name="ops-collector"} > 180
      for: 5m
      labels:
        severity: warning
        category: monitoring
      annotations:
        summary: "OpsMetricsCollector 心跳超时"
        description: "OpsMetricsCollector 定时任务已超过 3 分钟未上报心跳,可能已停止"
|