Files
wenzi/MONITORING_PLAN.md
Your Name 91a0b77f7a test(cache): 修复CacheConfigTest边界值测试
- 修改 shouldVerifyCacheManager_withMaximumIntegerTtl 为 shouldVerifyCacheManager_withMaximumAllowedTtl
- 使用正确的最大TTL值(10080分钟,7天)而不是 Integer.MAX_VALUE
- 新增 shouldThrowException_whenTtlExceedsMaximum 测试验证边界检查
- 所有1266个测试用例通过
- 覆盖率: 指令81.89%, 行88.48%, 分支51.55%

docs: 添加项目状态报告
- 生成 PROJECT_STATUS_REPORT.md 详细记录项目当前状态
- 包含质量指标、已完成功能、待办事项和技术债务
2026-03-02 13:31:54 +08:00

26 KiB
Raw Blame History

🦟 蚊子项目 - 生产环境监控方案

📊 监控架构概览

本文档提供蚊子项目的完整监控方案,包括指标采集、日志聚合、告警配置等。

监控架构

┌─────────────────────────────────────────────────────────┐
│                    应用层 (Mosquito)                   │
│  Spring Boot Actuator → Prometheus → Alertmanager      │
└───────────────────┬───────────────────────────────────┘
                    │
        ┌───────────┼───────────┐
        │           │           │
┌───────▼─────────▼────────────▼────────┐
│           日志聚合层                    │
│  Application → Loki → Grafana          │
└──────────────────┬──────────────────────┘
                   │
        ┌──────────┼──────────┐
        │          │          │
┌───────▼─────────▼─────────▼────────┐
│           可视化告警层                 │
│     Grafana + Alertmanager            │
└───────────────────────────────────────┘

🔍 一、应用监控

1. Spring Boot Actuator配置

1.1 添加依赖

<!-- pom.xml -->
<!-- Actuator: provides the management endpoints used by the monitoring setup. -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<!-- Micrometer registry for Prometheus. -->
<dependency>
    <groupId>io.micrometer</groupId>
    <artifactId>micrometer-registry-prometheus</artifactId>
</dependency>
<!-- NOTE(review): no InfluxDB endpoint is configured anywhere in this plan. With this
     registry on the classpath the application will presumably try to push metrics to a
     default InfluxDB address; confirm the dependency is needed or disable its export. -->
<dependency>
    <groupId>io.micrometer</groupId>
    <artifactId>micrometer-registry-influx</artifactId>
</dependency>

1.2 配置Actuator端点

# application-prod.properties
# Actuator endpoint exposure
management.endpoints.web.exposure.include=health,info,metrics,prometheus,loggers
management.endpoint.health.show-details=when-authorized
management.endpoint.health.show-components=when-authorized
management.health.defaults.enabled=true

# Built-in health indicators
management.health.db.enabled=true
management.health.redis.enabled=true
management.health.diskSpace.enabled=true
management.health.diskSpace.threshold=1GB

# Prometheus export
management.metrics.export.prometheus.enabled=true
# Common tags take one property per tag key. Writing "mosquito,environment=prod" on a
# single line would create one tag "application" whose value is that whole string.
management.metrics.tags.application=mosquito
management.metrics.tags.environment=prod
# Publish histogram buckets for HTTP server requests. The alert rules and dashboards
# use histogram_quantile() over http_server_requests_seconds_bucket, which is only
# exported when this is enabled.
management.metrics.distribution.percentiles-histogram.http.server.requests=true

# Liveness/readiness probe endpoints
management.endpoint.health.probes.enabled=true

2. 自定义健康检查

// SystemHealthIndicator.java
package com.mosquito.project.health;

import org.springframework.boot.actuate.health.Health;
import org.springframework.boot.actuate.health.HealthIndicator;
import org.springframework.stereotype.Component;

import java.io.File;

@Component
public class SystemHealthIndicator implements HealthIndicator {
    
    @Override
    public Health health() {
        // 检查磁盘空间
        File disk = new File("/");
        long freeSpace = disk.getFreeSpace();
        long totalSpace = disk.getTotalSpace();
        double freeSpacePercent = (double) freeSpace / totalSpace * 100;
        
        if (freeSpacePercent < 10) {
            return Health.down()
                .withDetail("disk.free", freeSpace / (1024 * 1024 * 1024) + " GB")
                .withDetail("disk.total", totalSpace / (1024 * 1024 * 1024) + " GB")
                .withDetail("disk.free.percent", freeSpacePercent)
                .build();
        }
        
        return Health.up()
            .withDetail("disk.free", freeSpace / (1024 * 1024 * 1024) + " GB")
            .withDetail("disk.total", totalSpace / (1024 * 1024 * 1024) + " GB")
            .withDetail("disk.free.percent", freeSpacePercent)
            .build();
    }
}
// CacheHealthIndicator.java
package com.mosquito.project.health;

import org.springframework.boot.actuate.health.Health;
import org.springframework.boot.actuate.health.HealthIndicator;
import org.springframework.data.redis.connection.RedisConnection;
import org.springframework.data.redis.connection.RedisConnectionFactory;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.stereotype.Component;

@Component
public class CacheHealthIndicator implements HealthIndicator {
    
    private final RedisTemplate<String, Object> redisTemplate;
    
    public CacheHealthIndicator(RedisTemplate<String, Object> redisTemplate) {
        this.redisTemplate = redisTemplate;
    }
    
    @Override
    public Health health() {
        try {
            // 测试Redis连接
            redisTemplate.getConnectionFactory().getConnection().ping();
            
            // 获取Redis信息
            Object info = redisTemplate.getConnectionFactory()
                .getConnection()
                .info("memory");
            
            return Health.up()
                .withDetail("redis", "connected")
                .withDetail("info", info)
                .build();
        } catch (Exception e) {
            return Health.down()
                .withDetail("error", e.getMessage())
                .build();
        }
    }
}

3. 自定义指标

// BusinessMetrics.java
package com.mosquito.project.metrics;

import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Timer;
import org.springframework.stereotype.Component;

import java.util.concurrent.TimeUnit;

/**
 * Registers and updates the Mosquito business meters.
 *
 * <p>All meters are registered once in the constructor against the supplied
 * {@link MeterRegistry}:
 * <ul>
 *   <li>{@code mosquito.share_link_created} (tag {@code type=shortlink})</li>
 *   <li>{@code mosquito.poster_generated} (tag {@code format=image})</li>
 *   <li>{@code mosquito.leaderboard_accessed}</li>
 *   <li>{@code mosquito.api_response_time} with the 0.5, 0.95 and 0.99 percentiles</li>
 * </ul>
 *
 * <p>NOTE(review): the {@code activityId}, {@code template} and {@code endpoint}
 * arguments are accepted but not attached as tags, so these meters cannot be broken
 * down per activity, template or endpoint. Confirm whether such tags are wanted before
 * building queries that group by them.
 */
@Component
public class BusinessMetrics {

    private final Counter shareLinkCreated;
    private final Counter posterGenerated;
    private final Counter leaderboardAccessed;
    private final Timer apiResponseTime;

    /**
     * @param registry registry that all business meters are registered with
     */
    public BusinessMetrics(MeterRegistry registry) {
        this.shareLinkCreated = Counter.builder("mosquito.share_link_created")
            .description("Total number of share links created")
            .tag("type", "shortlink")
            .register(registry);

        this.posterGenerated = Counter.builder("mosquito.poster_generated")
            .description("Total number of posters generated")
            .tag("format", "image")
            .register(registry);

        this.leaderboardAccessed = Counter.builder("mosquito.leaderboard_accessed")
            .description("Total number of leaderboard accesses")
            .register(registry);

        this.apiResponseTime = Timer.builder("mosquito.api_response_time")
            .description("API response time")
            .publishPercentiles(0.5, 0.95, 0.99)
            .register(registry);
    }

    /**
     * Counts one created share link.
     *
     * @param activityId activity the link belongs to; currently not recorded
     */
    public void incrementShareLinkCreated(String activityId) {
        shareLinkCreated.increment();
    }

    /**
     * Counts one generated poster.
     *
     * @param template poster template that was used; currently not recorded
     */
    public void incrementPosterGenerated(String template) {
        posterGenerated.increment();
    }

    /** Counts one leaderboard access. */
    public void incrementLeaderboardAccessed() {
        leaderboardAccessed.increment();
    }

    /**
     * Records one API call duration.
     *
     * @param endpoint endpoint that was called; currently not recorded
     * @param duration elapsed time in milliseconds
     */
    public void recordApiResponseTime(String endpoint, long duration) {
        apiResponseTime.record(duration, TimeUnit.MILLISECONDS);
    }

    /**
     * Exposes the API response timer so callers can stop a {@code Timer.Sample}
     * against it, as the controller usage example does.
     *
     * @return the {@code mosquito.api_response_time} timer
     */
    public Timer getApiResponseTime() {
        return apiResponseTime;
    }
}
// 使用示例 - ActivityController.java
/**
 * Usage example: a controller that records business metrics around a service call.
 */
@RestController
@RequestMapping("/api/v1/activities")
public class ActivityController {

    // NOTE(review): the service type name is inferred from the "activityService"
    // reference in the handler; adjust it to the project's actual service class.
    private final ActivityService activityService;
    private final BusinessMetrics businessMetrics;

    /**
     * @param activityService service that loads leaderboard data
     * @param businessMetrics business meters updated by this controller
     */
    public ActivityController(ActivityService activityService, BusinessMetrics businessMetrics) {
        this.activityService = activityService;
        this.businessMetrics = businessMetrics;
    }

    /**
     * Returns the leaderboard of an activity and records access count and latency.
     *
     * @param id activity identifier taken from the request path
     * @return 200 response containing the leaderboard entries
     */
    @GetMapping("/{id}/leaderboard")
    public ResponseEntity<List<LeaderboardEntry>> getLeaderboard(@PathVariable Long id) {
        Timer.Sample sample = Timer.start();

        try {
            List<LeaderboardEntry> leaderboard = activityService.getLeaderboard(id);
            // Counted only after the lookup succeeded, as before.
            businessMetrics.incrementLeaderboardAccessed();
            return ResponseEntity.ok(leaderboard);
        } finally {
            // Stops the sample exactly once on both the success and the failure path.
            sample.stop(businessMetrics.getApiResponseTime());
        }
    }
}

📈 二、Prometheus配置

1. Prometheus部署

1.1 Docker部署Prometheus

# docker-compose.prometheus.yml
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: mosquito-prometheus
    restart: unless-stopped
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus_data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - monitoring

  alertmanager:
    image: prom/alertmanager:latest
    container_name: mosquito-alertmanager
    restart: unless-stopped
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager_data:/alertmanager
    ports:
      - "9093:9093"
    networks:
      - monitoring

  node_exporter:
    image: prom/node-exporter:latest
    container_name: mosquito-node-exporter
    restart: unless-stopped
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Tell the filesystem collector where the host root is mounted (see volumes).
      # Without this flag the reported filesystems are the container's own, so
      # queries on mountpoint="/" would not describe the host disk.
      - '--path.rootfs=/rootfs'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - "9100:9100"
    networks:
      - monitoring

volumes:
  prometheus_data:
    driver: local
  alertmanager_data:
    driver: local

networks:
  monitoring:
    driver: bridge

1.2 Prometheus配置文件

# prometheus/prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'mosquito-prod'
    environment: 'production'

# Alertmanager配置
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

# 告警规则文件
rule_files:
  - "alerts.yml"

# 抓取配置
scrape_configs:
  # Mosquito应用指标
  - job_name: 'mosquito'
    metrics_path: '/actuator/prometheus'
    scrape_interval: 10s
    static_configs:
      - targets: ['mosquito-app:8080']
        labels:
          application: 'mosquito'
          environment: 'production'
          
  # Node Exporter系统指标
  - job_name: 'node_exporter'
    static_configs:
      - targets: ['node_exporter:9100']
        labels:
          environment: 'production'
          
  # PostgreSQL指标
  - job_name: 'postgres_exporter'
    static_configs:
      - targets: ['postgres-exporter:9187']
        labels:
          environment: 'production'
          
  # Redis指标
  - job_name: 'redis_exporter'
    static_configs:
      - targets: ['redis-exporter:9121']
        labels:
          environment: 'production'

1.3 告警规则配置

# prometheus/alerts.yml
groups:
  - name: mosquito_alerts
    interval: 30s
    rules:
      # 应用可用性告警
      - alert: ApplicationDown
        expr: up{job="mosquito"} == 0
        for: 1m
        labels:
          severity: critical
          component: application
        annotations:
          summary: "Mosquito应用已宕机"
          description: "应用 {{ $labels.instance }} 已经宕机超过1分钟"
          
      # 高错误率告警
      - alert: HighErrorRate
        # Aggregate per instance: a bare sum() drops every label, which would leave
        # {{ $labels.instance }} in the description empty.
        expr: |
          (
            sum by (instance) (rate(http_server_requests_seconds_count{job="mosquito",status=~"5.."}[5m]))
            /
            sum by (instance) (rate(http_server_requests_seconds_count{job="mosquito"}[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "高HTTP错误率"
          description: "应用 {{ $labels.instance }} 错误率超过5%,当前值: {{ $value | humanizePercentage }}"
          
      # 慢响应时间告警
      - alert: HighResponseTime
        expr: |
          histogram_quantile(0.95, 
            sum(rate(http_server_requests_seconds_bucket{job="mosquito"}[5m])) by (le, instance)
          ) > 1.0
        for: 10m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "API响应时间过长"
          description: "应用 {{ $labels.instance }} P95响应时间超过1秒当前值: {{ $value }}s"
          
      # 高CPU使用率告警
      - alert: HighCPUUsage
        # Micrometer exposes process CPU load as the gauge process_cpu_usage (0..1,
        # relative to all cores). process_cpu_seconds_total is not exported by the
        # Actuator/Micrometer Prometheus endpoint, so a rule on it would never fire.
        expr: |
          (
            process_cpu_usage{job="mosquito"} * 100
          ) > 80
        for: 10m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "高CPU使用率"
          description: "实例 {{ $labels.instance }} CPU使用率超过80%,当前值: {{ $value }}%"
          
      # 高内存使用率告警
      - alert: HighMemoryUsage
        # jvm_memory_* has one series per memory pool (label "id"). Sum the heap pools
        # per instance so the ratio describes the whole heap rather than a single pool.
        # Pools without a limit report max = -1 and are filtered out with "> 0".
        expr: |
          (
            sum by (instance) (jvm_memory_used_bytes{job="mosquito",area="heap"})
            /
            sum by (instance) (jvm_memory_max_bytes{job="mosquito",area="heap"} > 0)
          ) * 100 > 90
        for: 5m
        labels:
          severity: warning
          component: jvm
        annotations:
          summary: "高内存使用率"
          description: "实例 {{ $labels.instance }} 堆内存使用率超过90%,当前值: {{ $value }}%"
          
      # 数据库连接池告警
      - alert: HighDatabaseConnectionPoolUsage
        expr: |
          (
            hikaricp_connections_active{job="mosquito"}
            / 
            hikaricp_connections_max{job="mosquito"}
          ) * 100 > 80
        for: 5m
        labels:
          severity: warning
          component: database
        annotations:
          summary: "高数据库连接池使用率"
          description: "数据库连接池使用率超过80%,当前值: {{ $value }}%"
          
      # Redis连接失败告警
      - alert: RedisConnectionFailure
        expr: |
          up{job="redis_exporter"} == 0
        for: 1m
        labels:
          severity: critical
          component: cache
        annotations:
          summary: "Redis连接失败"
          description: "无法连接到Redis服务器"
          
      # GC时间过长告警
      - alert: LongGCPauseTime
        # Average pause length = pause seconds / number of pauses over the window.
        # rate(..._sum) alone is the fraction of wall-clock time spent in GC, which is
        # not comparable to the 100ms pause threshold stated in the description.
        expr: |
          (
            sum by (instance) (rate(jvm_gc_pause_seconds_sum{job="mosquito"}[5m]))
            /
            sum by (instance) (rate(jvm_gc_pause_seconds_count{job="mosquito"}[5m]))
          ) > 0.1
        for: 10m
        labels:
          severity: warning
          component: jvm
        annotations:
          summary: "GC停顿时间过长"
          description: "实例 {{ $labels.instance }} 平均GC停顿时间超过100ms,当前值: {{ $value }}s"
          
      # 磁盘空间不足告警
      - alert: LowDiskSpace
        expr: |
          (
            node_filesystem_avail_bytes{mountpoint="/"}
            / 
            node_filesystem_size_bytes{mountpoint="/"}
          ) * 100 < 10
        for: 5m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "磁盘空间不足"
          description: "磁盘 {{ $labels.device }} 剩余空间少于10%,当前值: {{ $value }}%"

📊 三、Grafana仪表板

1. 应用性能仪表板

{
  "dashboard": {
    "title": "Mosquito Application Performance",
    "panels": [
      {
        "title": "请求速率",
        "type": "graph",
        "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "sum by (method, uri) (rate(http_server_requests_seconds_count{job='mosquito'}[5m]))",
            "legendFormat": "{{method}} {{uri}}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "reqps"
          }
        }
      },
      {
        "title": "响应时间分布",
        "type": "graph",
        "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "histogram_quantile(0.50, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
            "legendFormat": "P50"
          },
          {
            "expr": "histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
            "legendFormat": "P95"
          },
          {
            "expr": "histogram_quantile(0.99, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
            "legendFormat": "P99"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s"
          }
        }
      },
      {
        "title": "错误率",
        "type": "stat",
        "gridPos": {"x": 0, "y": 8, "w": 6, "h": 4},
        "targets": [
          {
            "expr": "sum(rate(http_server_requests_seconds_count{job='mosquito',status=~'5..'}[5m])) / sum(rate(http_server_requests_seconds_count{job='mosquito'}[5m]))"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percentunit",
            "max": 1,
            "thresholds": {
              "steps": [
                {"color": "green", "value": 0},
                {"color": "yellow", "value": 0.01},
                {"color": "red", "value": 0.05}
              ]
            }
          }
        }
      },
      {
        "title": "JVM堆内存使用",
        "type": "graph",
        "gridPos": {"x": 6, "y": 8, "w": 18, "h": 4},
        "targets": [
          {
            "expr": "jvm_memory_used_bytes{job='mosquito',area='heap'}",
            "legendFormat": "已使用"
          },
          {
            "expr": "jvm_memory_max_bytes{job='mosquito',area='heap'}",
            "legendFormat": "最大值"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "bytes"
          }
        }
      },
      {
        "title": "数据库连接池",
        "type": "graph",
        "gridPos": {"x": 0, "y": 12, "w": 12, "h": 6},
        "targets": [
          {
            "expr": "hikaricp_connections_active{job='mosquito'}",
            "legendFormat": "活跃连接"
          },
          {
            "expr": "hikaricp_connections_idle{job='mosquito'}",
            "legendFormat": "空闲连接"
          },
          {
            "expr": "hikaricp_connections_max{job='mosquito'}",
            "legendFormat": "最大连接"
          }
        ]
      },
      {
        "title": "Redis连接状态",
        "type": "stat",
        "gridPos": {"x": 12, "y": 12, "w": 12, "h": 6},
        "targets": [
          {
            "expr": "up{job='redis_exporter'}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "mappings": [
              {"value": 1, "text": "正常"},
              {"value": 0, "text": "异常"}
            ],
            "thresholds": {
              "steps": [
                {"color": "red", "value": 0},
                {"color": "green", "value": 1}
              ]
            }
          }
        }
      }
    ]
  }
}

2. 业务指标仪表板

{
  "dashboard": {
    "title": "Mosquito Business Metrics",
    "panels": [
      {
        "title": "分享链接创建趋势",
        "type": "graph",
        "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "sum(increase(mosquito_share_link_created_total[1h]))",
            "legendFormat": "{{activity}}"
          }
        ]
      },
      {
        "title": "海报生成次数",
        "type": "stat",
        "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "sum(increase(mosquito_poster_generated_total[24h]))"
          }
        ]
      },
      {
        "title": "排行榜访问热度",
        "type": "heatmap",
        "gridPos": {"x": 0, "y": 8, "w": 24, "h": 8},
        "targets": [
          {
            "expr": "sum by (activity_id) (rate(mosquito_leaderboard_accessed_total[1h]))"
          }
        ]
      }
    ]
  }
}

🚨 四、告警通知配置

1. Alertmanager配置

# alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
  # The 'critical-alerts' receiver uses email_configs, which needs an SMTP relay and a
  # sender address. Placeholder values: replace them with the real mail settings (and
  # add smtp_auth_username / smtp_auth_password if the relay requires authentication).
  smtp_smarthost: 'smtp.yourcompany.com:587'
  smtp_from: 'alertmanager@yourcompany.com'

templates:
  - '/etc/alertmanager/templates/*.tmpl'

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
      continue: true
    
    - match:
        severity: warning
      receiver: 'warning-alerts'
    
    - match:
        alertname: 'ApplicationDown'
      receiver: 'pagerduty'

# Notification targets referenced by the routing tree.
# The message templates are YAML block scalars ("|"): everything inside them is sent
# literally, so they must not end with a quote character.
receivers:
  # Fallback receiver for alerts that match no specific route.
  - name: 'default'
    slack_configs:
      - channel: '#mosquito-alerts'
        send_resolved: true
        title: '{{ .GroupLabels.alertname }}'
        text: |
          告警: {{ range .Alerts }}{{ .Annotations.summary }}
          详情: {{ .Annotations.description }}
          状态: {{ .Status }}
          {{ end }}

  # Critical alerts go to a dedicated Slack channel and to the ops mailbox.
  - name: 'critical-alerts'
    slack_configs:
      - channel: '#mosquito-critical'
        send_resolved: true
        title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
        color: 'danger'
        text: |
          紧急告警:
          {{ range .Alerts }}
          - {{ .Annotations.summary }}
          - {{ .Annotations.description }}
          - 实例: {{ .Labels.instance }}
          - 时间: {{ .StartsAt }}
          {{ end }}
    email_configs:
      - to: 'ops-team@yourcompany.com'
        send_resolved: true
        headers:
          Subject: '🚨 CRITICAL: Mosquito Production Alert'

  # Warnings share the general alert channel.
  - name: 'warning-alerts'
    slack_configs:
      - channel: '#mosquito-alerts'
        send_resolved: true
        title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
        color: 'warning'
        text: |
          警告:
          {{ range .Alerts }}
          - {{ .Annotations.summary }}
          - {{ .Annotations.description }}
          {{ end }}

  # Paging target; the service key is a placeholder.
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
        severity: 'critical'

2. PagerDuty集成

# pagerduty配置示例
pagerduty_configs:
  - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
    description: '{{ .GroupLabels.alertname }}'
    details:
      firing: '{{ template "pagerduty.default.instances" .Alerts.Firing }}'
      resolved: '{{ template "pagerduty.default.instances" .Alerts.Resolved }}'
      num_firing: '{{ .Alerts.Firing | len }}'
      num_resolved: '{{ .Alerts.Resolved | len }}'

📝 五、日志聚合配置

1. Loki配置

# loki-config.yml
# Single-tenant mode. Loki runs multi-tenant by default and then rejects pushes that
# carry no tenant ID; the Promtail client in this plan does not send one.
auth_enabled: false

server:
  http_listen_port: 3100

# Single-instance ingester: in-memory ring, no replication.
ingester:
  lifecycler:
    ring:
      replication_factor: 1
      kvstore:
        store: inmemory
  chunk_idle_period: 1h
  chunk_retain_period: 1m
  max_transfer_retries: 0

# Index and chunks are both kept on the local filesystem.
schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/boltdb-shipper-active
    cache_location: /loki/boltdb-shipper-cache
    shared_store: filesystem
  filesystem:
    directory: /loki/chunks

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h

chunk_store_config:
  max_look_back_period: 0s

# Log retention: delete data older than 30 days.
table_manager:
  retention_deletes_enabled: true
  retention_period: 30d

2. Promtail配置

# promtail-config.yml
server:
  http_listen_port: 9080

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: mosquito
    static_configs:
      - targets:
          - localhost
        labels:
          job: mosquito
          app: mosquito-api
          env: production
          # Required: the files to tail. Without __path__ this job discovers nothing.
          # NOTE(review): placeholder path; point it at the application's log files.
          __path__: /var/log/mosquito/*.log

    pipeline_stages:
      # Structured (JSON) log lines: pull the fields into the extracted map.
      - json:
          expressions:
            level: level
            message: message
            exception: exception

      # Plain-text log lines. Inside a single-quoted YAML scalar a backslash is
      # literal, so escapes are written once (\d, not \\d); doubled backslashes would
      # make the regex look for a literal backslash and never match.
      - regex:
          expression: '(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}) (?P<level>\w+) .*? - (?P<message>.*)'

      # Promote the level to a label after both extraction stages have run, so it is
      # set for JSON and plain-text lines alike.
      - labels:
          level: level

      - output:
          source: message

📊 六、监控指标总结

核心监控指标

类别 指标 告警阈值
可用性 应用启动状态 down > 1min
性能 API响应时间(P95) > 1.0s
性能 API响应时间(P99) > 2.0s(alerts.yml 中尚未定义对应的告警规则,需补充)
错误 HTTP 5xx错误率 > 5%
系统 CPU使用率 > 80%
系统 JVM堆内存使用率 > 90%
系统 磁盘剩余空间 < 10%
数据库 连接池使用率 > 80%
缓存 Redis连接状态 down > 1min
JVM GC停顿时间 > 100ms

业务监控指标

类别 指标 说明
用户行为 分享链接创建次数 总计和分活动
用户行为 海报生成次数 按模板类型
用户行为 排行榜访问次数 按活动ID
业务逻辑 活动创建失败率 失败/总数
业务逻辑 API密钥生成趋势 按时间段

监控检查清单

监控系统检查

  • Prometheus正常运行
  • Alertmanager配置正确
  • Grafana仪表板可用
  • Loki日志聚合正常
  • 告警通知渠道畅通

监控指标检查

  • 应用指标采集正常
  • 系统指标采集正常
  • 业务指标采集正常
  • 告警规则生效
  • 数据保留策略配置

告警通知检查

  • Slack通知正常
  • 邮件通知正常
  • PagerDuty集成正常
  • 告警分级正确
  • 告警抑制正常

监控方案版本: v2.0.0
最后更新: 2026-01-22
维护团队: DevOps Team