Files
Developer 349d783fd1 refactor: clean up project structure
- Remove old review reports (keep latest only)
- Move docs/ to deploy/docs-backup/
- Move performance-testing/ to deploy/
- Clean up test output files
- Organize root directory
2026-04-06 23:36:03 +08:00

146 lines
3.8 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# =============================================================================
# Alertmanager 配置文件
# =============================================================================
# Global defaults shared by every receiver unless a receiver overrides them.
global:
  # SMTP relay and sender address used by all email_configs.
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@sub2api.org'
  # SMTP credentials are left empty here.
  # NOTE(review): presumably the local relay accepts unauthenticated mail;
  # confirm, or inject real credentials at deploy time.
  smtp_auth_username: ''
  smtp_auth_password: ''
  # Default Slack incoming-webhook URL used by all slack_configs.
  # NOTE(review): Alertmanager does not expand ${VAR} placeholders on its
  # own; this file presumably has to be rendered (e.g. with envsubst) before
  # Alertmanager loads it. Confirm against the deployment scripts.
  slack_api_url: '${SLACK_WEBHOOK_URL}'
  # Time after which an alert is declared resolved if it has not been
  # updated and carries no explicit end time.
  resolve_timeout: 5m
# Notification template files loaded by Alertmanager (glob pattern).
templates:
  - '/etc/alertmanager/templates/*.tmpl'
# Routing tree.
# Child routes are evaluated top to bottom. Every specific route sets
# `continue: true`, so an alert keeps being matched against the routes that
# follow it and can therefore be delivered to several receivers.
route:
  # Alerts sharing these labels are batched into one notification.
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  # NOTE(review): the last child route below has no matcher and therefore
  # matches every alert. Alertmanager only falls back to this root receiver
  # when no child route matches, so 'default' looks unreachable as written.
  # Confirm whether alerts that are neither critical/high/SLO/infra should
  # still reach the 'default' email receiver.
  receiver: 'default'
  routes:
    # Critical alerts: notify immediately and re-notify every 5 minutes.
    - match:
        severity: critical
      receiver: 'critical'
      group_wait: 0s
      repeat_interval: 5m
      continue: true
    # High alerts: notify quickly and re-notify every 30 minutes.
    - match:
        severity: high
      receiver: 'high'
      group_wait: 30s
      repeat_interval: 30m
      continue: true
    # SLO-related alerts.
    - match:
        slo: api-availability
      receiver: 'slo-team'
      continue: true
    # Infrastructure alerts, selected by alert name prefix.
    - match_re:
        alertname: 'Database.*|Redis.*|HighCPU.*|HighMemory.*'
      receiver: 'infra-team'
      continue: true
    # Catch-all: every alert is bridged back to the built-in
    # ops_alert_events. This is the last route, so it keeps the default
    # `continue: false`.
    - receiver: 'ops-bridge'
# Receivers: the notification targets referenced by name from the routing
# tree.
receivers:
  # Email to the on-call address.
  - name: 'default'
    email_configs:
      - to: 'oncall@sub2api.org'
        send_resolved: true
        headers:
          Subject: '[Alert] {{ .GroupLabels.alertname }}'

  # Critical alerts fan out to email, Slack and a webhook.
  - name: 'critical'
    email_configs:
      - to: 'sre-lead@sub2api.org'
        send_resolved: true
    slack_configs:
      - channel: '#alerts-critical'
        send_resolved: true
        title: '🔴 CRITICAL: {{ .GroupLabels.alertname }}'
        # One summary/description/runbook entry per alert in the group.
        text: |
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          *Runbook:* {{ .Annotations.runbook_url }}
          {{ end }}
    webhook_configs:
      # NOTE(review): Alertmanager does not expand ${VAR} placeholders on its
      # own, so this URL presumably has to be rendered before the file is
      # loaded. This is a generic webhook rather than a pagerduty_configs
      # entry; confirm the target endpoint accepts Alertmanager's webhook
      # payload.
      - url: '${PAGERDUTY_WEBHOOK_URL}'
        send_resolved: true

  # High alerts go to email and Slack.
  - name: 'high'
    email_configs:
      - to: 'oncall@sub2api.org'
        send_resolved: true
    slack_configs:
      - channel: '#alerts-high'
        send_resolved: true
        title: '🟠 HIGH: {{ .GroupLabels.alertname }}'
        # One summary/description entry per alert in the group.
        text: |
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          {{ end }}

  # SLO alerts go to the SLO team mailbox.
  - name: 'slo-team'
    email_configs:
      - to: 'slo-team@sub2api.org'
        send_resolved: true

  # Infrastructure alerts go to the infra Slack channel.
  - name: 'infra-team'
    slack_configs:
      - channel: '#infra-alerts'
        send_resolved: true
        title: '🔧 INFRA: {{ .GroupLabels.alertname }}'

  # ops-bridge: writes Prometheus alerts back into the built-in
  # ops_alert_events table, so operators can see every alert in the existing
  # Ops Dashboard.
  # The bearer token must be injected through an environment variable:
  #   ALERTMANAGER_INTERNAL_TOKEN=<same value as app INTERNAL_WEBHOOK_TOKEN>
  # NOTE(review): Alertmanager does not expand ${VAR} placeholders on its
  # own; confirm the file is rendered before it is loaded.
  - name: 'ops-bridge'
    webhook_configs:
      - url: 'http://host.docker.internal:8080/admin/ops/prometheus-alerts'
        send_resolved: true
        # Upper bound on the number of alerts sent in one webhook message.
        max_alerts: 50
        http_config:
          bearer_token: '${ALERTMANAGER_INTERNAL_TOKEN}'
# Inhibition rules: while a source alert is firing, matching target alerts
# that share the same values for the `equal` labels are muted.
inhibit_rules:
  # A critical alert mutes the high-severity alert with the same
  # alertname/cluster/service.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'high'
    equal: ['alertname', 'cluster', 'service']
  # A high alert mutes the medium-severity alert with the same
  # alertname/cluster/service.
  - source_match:
      severity: 'high'
    target_match:
      severity: 'medium'
    equal: ['alertname', 'cluster', 'service']
  # While the application is down, mute the application-level alerts for the
  # same job to avoid an alert storm.
  - source_match:
      alertname: 'Sub2APIDown'
    target_match_re:
      alertname: 'HighErrorRate|HighLatency.*|SLOErrorBudget.*'
    equal: ['job']