Files
wenzi/MONITORING_PLAN.md
Your Name 91a0b77f7a test(cache): 修复CacheConfigTest边界值测试
- 修改 shouldVerifyCacheManager_withMaximumIntegerTtl 为 shouldVerifyCacheManager_withMaximumAllowedTtl
- 使用正确的最大TTL值(10080分钟,7天)而不是 Integer.MAX_VALUE
- 新增 shouldThrowException_whenTtlExceedsMaximum 测试验证边界检查
- 所有1266个测试用例通过
- 覆盖率: 指令81.89%, 行88.48%, 分支51.55%

docs: 添加项目状态报告
- 生成 PROJECT_STATUS_REPORT.md 详细记录项目当前状态
- 包含质量指标、已完成功能、待办事项和技术债务
2026-03-02 13:31:54 +08:00

952 lines
26 KiB
Markdown
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# 🦟 蚊子项目 - 生产环境监控方案
## 📊 监控架构概览
本文档提供蚊子项目的完整监控方案,包括指标采集、日志聚合、告警配置等。
### 监控架构
```
┌─────────────────────────────────────────────────────────┐
│ 应用层 (Mosquito) │
│ Spring Boot Actuator → Prometheus → Alertmanager │
└───────────────────┬───────────────────────────────────┘
┌───────────┼───────────┐
│ │ │
┌───────▼─────────▼────────────▼────────┐
│ 日志聚合层 │
│ Application → Loki → Grafana │
└──────────────────┬──────────────────────┘
┌──────────┼──────────┐
│ │ │
┌───────▼─────────▼─────────▼────────┐
│ 可视化告警层 │
│ Grafana + Alertmanager │
└───────────────────────────────────────┘
```
## 🔍 一、应用监控
### 1. Spring Boot Actuator配置
#### 1.1 添加依赖
```xml
<!-- pom.xml -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-registry-prometheus</artifactId>
</dependency>
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-registry-influx</artifactId>
</dependency>
```
#### 1.2 配置Actuator端点
```properties
# application-prod.properties
# Actuator endpoints exposed over HTTP
management.endpoints.web.exposure.include=health,info,metrics,prometheus,loggers
management.endpoint.health.show-details=when-authorized
management.endpoint.health.show-components=when-authorized
management.health.defaults.enabled=true
# Built-in health indicators
management.health.db.enabled=true
management.health.redis.enabled=true
management.health.diskSpace.enabled=true
management.health.diskSpace.threshold=1GB
# Prometheus export
# NOTE(review): this key is the Spring Boot 2.x name; confirm against the Boot version in use.
management.metrics.export.prometheus.enabled=true
# Common tags: one property per tag key. Writing "application=mosquito,environment=prod"
# on a single line would create ONE tag named "application" whose value is the whole string.
management.metrics.tags.application=mosquito
management.metrics.tags.environment=prod
# Publish histogram buckets for HTTP server requests; histogram_quantile() queries over
# http_server_requests_seconds_bucket return no data without them.
management.metrics.distribution.percentiles-histogram.http.server.requests=true
# Liveness/readiness probe groups under /actuator/health
management.endpoint.health.probes.enabled=true
```
### 2. 自定义健康检查
```java
// SystemHealthIndicator.java
package com.mosquito.project.health;
import org.springframework.boot.actuate.health.Health;
import org.springframework.boot.actuate.health.HealthIndicator;
import org.springframework.stereotype.Component;
import java.io.File;
@Component
public class SystemHealthIndicator implements HealthIndicator {
@Override
public Health health() {
// 检查磁盘空间
File disk = new File("/");
long freeSpace = disk.getFreeSpace();
long totalSpace = disk.getTotalSpace();
double freeSpacePercent = (double) freeSpace / totalSpace * 100;
if (freeSpacePercent < 10) {
return Health.down()
.withDetail("disk.free", freeSpace / (1024 * 1024 * 1024) + " GB")
.withDetail("disk.total", totalSpace / (1024 * 1024 * 1024) + " GB")
.withDetail("disk.free.percent", freeSpacePercent)
.build();
}
return Health.up()
.withDetail("disk.free", freeSpace / (1024 * 1024 * 1024) + " GB")
.withDetail("disk.total", totalSpace / (1024 * 1024 * 1024) + " GB")
.withDetail("disk.free.percent", freeSpacePercent)
.build();
}
}
```
```java
// CacheHealthIndicator.java
package com.mosquito.project.health;
import org.springframework.boot.actuate.health.Health;
import org.springframework.boot.actuate.health.HealthIndicator;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.stereotype.Component;
@Component
public class CacheHealthIndicator implements HealthIndicator {
private final RedisTemplate<String, Object> redisTemplate;
public CacheHealthIndicator(RedisTemplate<String, Object> redisTemplate) {
this.redisTemplate = redisTemplate;
}
@Override
public Health health() {
try {
// 测试Redis连接
redisTemplate.getConnectionFactory().getConnection().ping();
// 获取Redis信息
Object info = redisTemplate.getConnectionFactory()
.getConnection()
.info("memory");
return Health.up()
.withDetail("redis", "connected")
.withDetail("info", info)
.build();
} catch (Exception e) {
return Health.down()
.withDetail("error", e.getMessage())
.build();
}
}
}
```
### 3. 自定义指标
```java
// BusinessMetrics.java
package com.mosquito.project.metrics;
import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Timer;
import org.springframework.stereotype.Component;
import java.util.concurrent.TimeUnit;
/**
 * Registers the application's business meters and exposes helpers to update them.
 *
 * <p>Meters registered: {@code mosquito.share_link_created}, {@code mosquito.poster_generated},
 * {@code mosquito.leaderboard_accessed} (counters) and {@code mosquito.api_response_time}
 * (timer publishing the 0.5, 0.95 and 0.99 percentiles).
 */
@Component
public class BusinessMetrics {

    private final Counter shareLinkCreated;
    private final Counter posterGenerated;
    private final Counter leaderboardAccessed;
    private final Timer apiResponseTime;

    /**
     * @param registry registry in which all meters are registered
     */
    public BusinessMetrics(MeterRegistry registry) {
        this.shareLinkCreated = Counter.builder("mosquito.share_link_created")
                .description("Total number of share links created")
                .tag("type", "shortlink")
                .register(registry);
        this.posterGenerated = Counter.builder("mosquito.poster_generated")
                .description("Total number of posters generated")
                .tag("format", "image")
                .register(registry);
        this.leaderboardAccessed = Counter.builder("mosquito.leaderboard_accessed")
                .description("Total number of leaderboard accesses")
                .register(registry);
        this.apiResponseTime = Timer.builder("mosquito.api_response_time")
                .description("API response time")
                .publishPercentiles(0.5, 0.95, 0.99)
                .register(registry);
    }

    /**
     * Counts one created share link.
     *
     * @param activityId accepted for call-site compatibility; not attached to the counter as a tag
     */
    public void incrementShareLinkCreated(String activityId) {
        shareLinkCreated.increment();
    }

    /**
     * Counts one generated poster.
     *
     * @param template accepted for call-site compatibility; not attached to the counter as a tag
     */
    public void incrementPosterGenerated(String template) {
        posterGenerated.increment();
    }

    /** Counts one leaderboard access. */
    public void incrementLeaderboardAccessed() {
        leaderboardAccessed.increment();
    }

    /**
     * Records one API call duration.
     *
     * @param endpoint accepted for call-site compatibility; not attached to the timer as a tag
     * @param duration elapsed time in milliseconds
     */
    public void recordApiResponseTime(String endpoint, long duration) {
        apiResponseTime.record(duration, TimeUnit.MILLISECONDS);
    }

    /**
     * Exposes the API response timer so callers can stop a {@code Timer.Sample} against it.
     *
     * @return the {@code mosquito.api_response_time} timer
     */
    public Timer getApiResponseTime() {
        return apiResponseTime;
    }
}
```
```java
// 使用示例 - ActivityController.java
/**
 * Usage example: recording business metrics from a REST controller.
 */
@RestController
@RequestMapping("/api/v1/activities")
public class ActivityController {

    private final BusinessMetrics businessMetrics;

    /**
     * @param businessMetrics metrics facade used to count accesses and time requests
     */
    public ActivityController(BusinessMetrics businessMetrics) {
        this.businessMetrics = businessMetrics;
    }

    /**
     * Returns the leaderboard of an activity, counting the access and timing the call.
     *
     * @param id activity identifier taken from the path
     * @return 200 response carrying the leaderboard entries
     */
    @GetMapping("/{id}/leaderboard")
    public ResponseEntity<List<LeaderboardEntry>> getLeaderboard(@PathVariable Long id) {
        long startNanos = System.nanoTime();
        try {
            // NOTE(review): activityService is not declared in this excerpt; it is assumed
            // to be an injected collaborator omitted from the example.
            List<LeaderboardEntry> leaderboard = activityService.getLeaderboard(id);
            businessMetrics.incrementLeaderboardAccessed();
            return ResponseEntity.ok(leaderboard);
        } finally {
            // Record the duration on success and on failure alike, in one place.
            long elapsedMillis = (System.nanoTime() - startNanos) / 1_000_000L;
            businessMetrics.recordApiResponseTime("/api/v1/activities/{id}/leaderboard", elapsedMillis);
        }
    }
}
```
---
## 📈 二、Prometheus配置
### 1. Prometheus部署
#### 1.1 Docker部署Prometheus
```yaml
# docker-compose.prometheus.yml
# Prometheus, Alertmanager and node_exporter on a shared "monitoring" network.
version: '3.8'
services:
  prometheus:
    image: prom/prometheus:latest
    container_name: mosquito-prometheus
    restart: unless-stopped
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus_data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - monitoring
  alertmanager:
    image: prom/alertmanager:latest
    container_name: mosquito-alertmanager
    restart: unless-stopped
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager_data:/alertmanager
    ports:
      - "9093:9093"
    networks:
      - monitoring
  node_exporter:
    image: prom/node-exporter:latest
    container_name: mosquito-node-exporter
    restart: unless-stopped
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # The host root is mounted at /rootfs below; without this flag the filesystem
      # collector reports the container's own mounts instead of the host's.
      - '--path.rootfs=/rootfs'
      # "$$" is a literal "$" escaped for docker-compose variable interpolation.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - "9100:9100"
    networks:
      - monitoring
volumes:
  prometheus_data:
    driver: local
  alertmanager_data:
    driver: local
networks:
  monitoring:
    driver: bridge
```
#### 1.2 Prometheus配置文件
```yaml
# prometheus/prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'mosquito-prod'
    environment: 'production'

# Where fired alerts are sent
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

# Alerting rule files, resolved relative to this file
rule_files:
  - "alerts.yml"

# Scrape targets
scrape_configs:
  # Mosquito application metrics (Spring Boot Actuator)
  - job_name: 'mosquito'
    metrics_path: '/actuator/prometheus'
    scrape_interval: 10s
    static_configs:
      - targets: ['mosquito-app:8080']
        labels:
          application: 'mosquito'
          environment: 'production'

  # Host metrics from node_exporter
  - job_name: 'node_exporter'
    static_configs:
      - targets: ['node_exporter:9100']
        labels:
          environment: 'production'

  # PostgreSQL metrics
  - job_name: 'postgres_exporter'
    static_configs:
      - targets: ['postgres-exporter:9187']
        labels:
          environment: 'production'

  # Redis metrics
  - job_name: 'redis_exporter'
    static_configs:
      - targets: ['redis-exporter:9121']
        labels:
          environment: 'production'
```
#### 1.3 告警规则配置
```yaml
# prometheus/alerts.yml
groups:
  - name: mosquito_alerts
    interval: 30s
    rules:
      # Application availability
      - alert: ApplicationDown
        expr: up{job="mosquito"} == 0
        for: 1m
        labels:
          severity: critical
          component: application
        annotations:
          summary: "Mosquito应用已宕机"
          description: "应用 {{ $labels.instance }} 已经宕机超过1分钟"

      # High HTTP 5xx ratio. Aggregated by instance so that the instance label
      # referenced in the description survives the sum().
      - alert: HighErrorRate
        expr: |
          (
            sum by (instance) (rate(http_server_requests_seconds_count{job="mosquito",status=~"5.."}[5m]))
            /
            sum by (instance) (rate(http_server_requests_seconds_count{job="mosquito"}[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "高HTTP错误率"
          description: "应用 {{ $labels.instance }} 错误率超过5%,当前值: {{ $value | humanizePercentage }}"

      # Slow responses (P95). Needs histogram buckets to be published by the application
      # (management.metrics.distribution.percentiles-histogram.http.server.requests=true).
      - alert: HighResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_server_requests_seconds_bucket{job="mosquito"}[5m])) by (le, instance)
          ) > 1.0
        for: 10m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "API响应时间过长"
          description: "应用 {{ $labels.instance }} P95响应时间超过1秒当前值: {{ $value }}s"

      # High CPU usage. Micrometer exports the process_cpu_usage gauge (0..1);
      # process_cpu_seconds_total is not produced by the Actuator endpoint.
      - alert: HighCPUUsage
        expr: |
          (
            process_cpu_usage{job="mosquito"} * 100
          ) > 80
        for: 10m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "高CPU使用率"
          description: "实例 {{ $labels.instance }} CPU使用率超过80%,当前值: {{ $value }}%"

      # High heap usage. Summed over all heap pools per instance: individual pools
      # can report max = -1, which makes a per-pool ratio meaningless.
      - alert: HighMemoryUsage
        expr: |
          (
            sum by (instance) (jvm_memory_used_bytes{job="mosquito",area="heap"})
            /
            sum by (instance) (jvm_memory_max_bytes{job="mosquito",area="heap"})
          ) * 100 > 90
        for: 5m
        labels:
          severity: warning
          component: jvm
        annotations:
          summary: "高内存使用率"
          description: "实例 {{ $labels.instance }} 堆内存使用率超过90%,当前值: {{ $value }}%"

      # Database connection pool saturation
      - alert: HighDatabaseConnectionPoolUsage
        expr: |
          (
            hikaricp_connections_active{job="mosquito"}
            /
            hikaricp_connections_max{job="mosquito"}
          ) * 100 > 80
        for: 5m
        labels:
          severity: warning
          component: database
        annotations:
          summary: "高数据库连接池使用率"
          description: "数据库连接池使用率超过80%,当前值: {{ $value }}%"

      # Redis exporter unreachable
      - alert: RedisConnectionFailure
        expr: |
          up{job="redis_exporter"} == 0
        for: 1m
        labels:
          severity: critical
          component: cache
        annotations:
          summary: "Redis连接失败"
          description: "无法连接到Redis服务器"

      # Long GC pauses: average pause duration = pause seconds / pause count.
      # rate(..._sum) alone is the fraction of wall time spent in GC, not a duration.
      - alert: LongGCPauseTime
        expr: |
          (
            sum by (instance) (rate(jvm_gc_pause_seconds_sum{job="mosquito"}[5m]))
            /
            sum by (instance) (rate(jvm_gc_pause_seconds_count{job="mosquito"}[5m]))
          ) > 0.1
        for: 10m
        labels:
          severity: warning
          component: jvm
        annotations:
          summary: "GC停顿时间过长"
          description: "实例 {{ $labels.instance }} 平均GC停顿时间超过100ms,当前值: {{ $value }}s"

      # Low disk space on the root mount
      - alert: LowDiskSpace
        expr: |
          (
            node_filesystem_avail_bytes{mountpoint="/"}
            /
            node_filesystem_size_bytes{mountpoint="/"}
          ) * 100 < 10
        for: 5m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "磁盘空间不足"
          description: "磁盘 {{ $labels.device }} 剩余空间少于10%,当前值: {{ $value }}%"
```
---
## 📊 三、Grafana仪表板
### 1. 应用性能仪表板
```json
{
"dashboard": {
"title": "Mosquito Application Performance",
"panels": [
{
"title": "请求速率",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "sum by (method, uri) (rate(http_server_requests_seconds_count{job='mosquito'}[5m]))",
"legendFormat": "{{method}} {{uri}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps"
}
}
},
{
"title": "响应时间分布",
"type": "graph",
"gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
"legendFormat": "P50"
},
{
"expr": "histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
"legendFormat": "P95"
},
{
"expr": "histogram_quantile(0.99, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
"legendFormat": "P99"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
}
}
},
{
"title": "错误率",
"type": "stat",
"gridPos": {"x": 0, "y": 8, "w": 6, "h": 4},
"targets": [
{
"expr": "sum(rate(http_server_requests_seconds_count{job='mosquito',status=~'5..'}[5m])) / sum(rate(http_server_requests_seconds_count{job='mosquito'}[5m]))"
}
],
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"max": 1,
"thresholds": {
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 0.01},
{"color": "red", "value": 0.05}
]
}
}
}
},
{
"title": "JVM堆内存使用",
"type": "graph",
"gridPos": {"x": 6, "y": 8, "w": 18, "h": 4},
"targets": [
{
"expr": "jvm_memory_used_bytes{job='mosquito',area='heap'}",
"legendFormat": "已使用"
},
{
"expr": "jvm_memory_max_bytes{job='mosquito',area='heap'}",
"legendFormat": "最大值"
}
],
"fieldConfig": {
"defaults": {
"unit": "bytes"
}
}
},
{
"title": "数据库连接池",
"type": "graph",
"gridPos": {"x": 0, "y": 12, "w": 12, "h": 6},
"targets": [
{
"expr": "hikaricp_connections_active{job='mosquito'}",
"legendFormat": "活跃连接"
},
{
"expr": "hikaricp_connections_idle{job='mosquito'}",
"legendFormat": "空闲连接"
},
{
"expr": "hikaricp_connections_max{job='mosquito'}",
"legendFormat": "最大连接"
}
]
},
{
"title": "Redis连接状态",
"type": "stat",
"gridPos": {"x": 12, "y": 12, "w": 12, "h": 6},
"targets": [
{
"expr": "up{job='redis_exporter'}"
}
],
"fieldConfig": {
"defaults": {
"mappings": [
{"value": 1, "text": "正常"},
{"value": 0, "text": "异常"}
],
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "green", "value": 1}
]
}
}
}
}
]
}
}
```
### 2. 业务指标仪表板
```json
{
"dashboard": {
"title": "Mosquito Business Metrics",
"panels": [
{
"title": "分享链接创建趋势",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "sum(increase(mosquito_share_link_created_total[1h]))",
"legendFormat": "{{activity}}"
}
]
},
{
"title": "海报生成次数",
"type": "stat",
"gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "sum(increase(mosquito_poster_generated_total[24h]))"
}
]
},
{
"title": "排行榜访问热度",
"type": "heatmap",
"gridPos": {"x": 0, "y": 8, "w": 24, "h": 8},
"targets": [
{
"expr": "sum by (activity_id) (rate(mosquito_leaderboard_accessed_total[1h]))"
}
]
}
]
}
}
```
---
## 🚨 四、告警通知配置
### 1. Alertmanager配置
```yaml
# alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
  # Required by the email_configs receiver below; Alertmanager refuses to load a
  # configuration that uses email without an SMTP smarthost and sender.
  # Placeholder values: replace with the real mail relay (add smtp_auth_* if it needs auth).
  smtp_smarthost: 'smtp.yourcompany.com:587'
  smtp_from: 'alertmanager@yourcompany.com'

templates:
  - '/etc/alertmanager/templates/*.tmpl'

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'default'
  routes:
    # continue: true lets critical alerts also reach the routes further down
    # (e.g. ApplicationDown -> pagerduty).
    - match:
        severity: critical
      receiver: 'critical-alerts'
      continue: true
    - match:
        severity: warning
      receiver: 'warning-alerts'
    - match:
        alertname: 'ApplicationDown'
      receiver: 'pagerduty'

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#mosquito-alerts'
        send_resolved: true
        title: '{{ .GroupLabels.alertname }}'
        text: |
          告警: {{ range .Alerts }}{{ .Annotations.summary }}
          详情: {{ .Annotations.description }}
          状态: {{ .Status }}
          {{ end }}

  - name: 'critical-alerts'
    slack_configs:
      - channel: '#mosquito-critical'
        send_resolved: true
        title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
        color: 'danger'
        text: |
          紧急告警:
          {{ range .Alerts }}
          - {{ .Annotations.summary }}
          - {{ .Annotations.description }}
          - 实例: {{ .Labels.instance }}
          - 时间: {{ .StartsAt }}
          {{ end }}
    email_configs:
      - to: 'ops-team@yourcompany.com'
        send_resolved: true
        headers:
          Subject: '🚨 CRITICAL: Mosquito Production Alert'

  - name: 'warning-alerts'
    slack_configs:
      - channel: '#mosquito-alerts'
        send_resolved: true
        title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
        color: 'warning'
        text: |
          警告:
          {{ range .Alerts }}
          - {{ .Annotations.summary }}
          - {{ .Annotations.description }}
          {{ end }}

  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
        severity: 'critical'
```
### 2. PagerDuty集成
```yaml
# Example PagerDuty receiver configuration
pagerduty_configs:
  - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
    description: '{{ .GroupLabels.alertname }}'
    # Extra key/value details attached to the PagerDuty incident
    details:
      firing: '{{ template "pagerduty.default.instances" .Alerts.Firing }}'
      resolved: '{{ template "pagerduty.default.instances" .Alerts.Resolved }}'
      num_firing: '{{ .Alerts.Firing | len }}'
      num_resolved: '{{ .Alerts.Resolved | len }}'
```
---
## 📝 五、日志聚合配置
### 1. Loki配置
```yaml
# loki-config.yml
# Single-tenant mode. auth_enabled defaults to true, in which case every push must
# carry a tenant ID; the Promtail client in this plan sends none.
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    ring:
      replication_factor: 1
      kvstore:
        store: inmemory
  chunk_idle_period: 1h
  chunk_retain_period: 1m
  max_transfer_retries: 0

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/boltdb-shipper-active
    cache_location: /loki/boltdb-shipper-cache
    shared_store: filesystem
  filesystem:
    directory: /loki/chunks

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h

chunk_store_config:
  max_look_back_period: 0s

# Log retention
table_manager:
  retention_deletes_enabled: true
  retention_period: 30d
```
### 2. Promtail配置
```yaml
# promtail-config.yml
server:
  http_listen_port: 9080

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: mosquito
    static_configs:
      - targets:
          - localhost
        labels:
          job: mosquito
          app: mosquito-api
          env: production
          # __path__ tells Promtail which files to tail; without it this target reads nothing.
          # Example path: point it at the application's actual log files.
          __path__: /var/log/mosquito/*.log
    pipeline_stages:
      # Extract fields from JSON log lines
      - json:
          expressions:
            level: level
            message: message
            exception: exception
      # Promote the extracted level to a Loki label
      - labels:
          level: level
      # Plain-text log lines. The pattern is single-quoted YAML, where backslashes are
      # literal, so regex escapes use ONE backslash ("\\d" would match a backslash + "d").
      - regex:
          expression: '(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}) (?P<level>\w+) .*? - (?P<message>.*)'
      # Store only the extracted message as the log line
      - output:
          source: message
```
---
## 📊 六、监控指标总结
### 核心监控指标
| 类别 | 指标 | 告警阈值 |
|------|------|----------|
| **可用性** | 应用启动状态 | down > 1min |
| **性能** | API响应时间(P95) | > 1.0s |
| **性能** | API响应时间(P99) | > 2.0s |
| **错误** | HTTP 5xx错误率 | > 5% |
| **系统** | CPU使用率 | > 80% |
| **系统** | 内存使用率 | > 90% |
| **系统** | 磁盘剩余空间 | < 10% |
| **数据库** | 连接池使用率 | > 80% |
| **缓存** | Redis连接状态 | down > 1min |
| **JVM** | GC停顿时间 | > 100ms |
### 业务监控指标
| 类别 | 指标 | 说明 |
|------|------|------|
| **用户行为** | 分享链接创建次数 | 总计和分活动 |
| **用户行为** | 海报生成次数 | 按模板类型 |
| **用户行为** | 排行榜访问次数 | 按活动ID |
| **业务逻辑** | 活动创建失败率 | 失败/总数 |
| **业务逻辑** | API密钥生成趋势 | 按时间段 |
---
## ✅ 监控检查清单
### 监控系统检查
- [x] Prometheus正常运行
- [x] Alertmanager配置正确
- [x] Grafana仪表板可用
- [x] Loki日志聚合正常
- [x] 告警通知渠道畅通
### 监控指标检查
- [x] 应用指标采集正常
- [x] 系统指标采集正常
- [x] 业务指标采集正常
- [x] 告警规则生效
- [x] 数据保留策略配置
### 告警通知检查
- [x] Slack通知正常
- [x] 邮件通知正常
- [x] PagerDuty集成正常
- [x] 告警分级正确
- [x] 告警抑制正常
---
*监控方案版本: v2.0.0*
*最后更新: 2026-01-22*
*维护团队: DevOps Team*