952 lines
26 KiB
Markdown
952 lines
26 KiB
Markdown
# 🦟 蚊子项目 - 生产环境监控方案
|
||
|
||
## 📊 监控架构概览
|
||
|
||
本文档提供蚊子项目的完整监控方案,包括指标采集、日志聚合、告警配置等。
|
||
|
||
### 监控架构
|
||
|
||
```
|
||
┌─────────────────────────────────────────────────────────┐
|
||
│ 应用层 (Mosquito) │
|
||
│ Spring Boot Actuator → Prometheus → Alertmanager │
|
||
└───────────────────┬───────────────────────────────────┘
|
||
│
|
||
┌───────────┼───────────┐
|
||
│ │ │
|
||
┌───────▼─────────▼────────────▼────────┐
|
||
│ 日志聚合层 │
|
||
│ Application → Loki → Grafana │
|
||
└──────────────────┬──────────────────────┘
|
||
│
|
||
┌──────────┼──────────┐
|
||
│ │ │
|
||
┌───────▼─────────▼─────────▼────────┐
|
||
│ 可视化告警层 │
|
||
│ Grafana + Alertmanager │
|
||
└───────────────────────────────────────┘
|
||
```
|
||
|
||
## 🔍 一、应用监控
|
||
|
||
### 1. Spring Boot Actuator配置
|
||
|
||
#### 1.1 添加依赖
|
||
|
||
```xml
|
||
<!-- pom.xml -->
|
||
<dependency>
|
||
<groupId>org.springframework.boot</groupId>
|
||
<artifactId>spring-boot-starter-actuator</artifactId>
|
||
</dependency>
|
||
<dependency>
|
||
<groupId>io.micrometer</groupId>
|
||
<artifactId>micrometer-registry-prometheus</artifactId>
|
||
</dependency>
|
||
<dependency>
|
||
<groupId>io.micrometer</groupId>
|
||
<artifactId>micrometer-registry-influx</artifactId>
|
||
</dependency>
|
||
```
|
||
|
||
#### 1.2 配置Actuator端点
|
||
|
||
```properties
|
||
# application-prod.properties
|
||
# Actuator配置
|
||
management.endpoints.web.exposure.include=health,info,metrics,prometheus,loggers
|
||
management.endpoint.health.show-details=when-authorized
|
||
management.endpoint.health.show-components=when-authorized
|
||
management.health.defaults.enabled=true
|
||
|
||
# 健康检查配置
|
||
management.health.db.enabled=true
|
||
management.health.redis.enabled=true
|
||
management.health.diskSpace.enabled=true
|
||
management.health.diskSpace.threshold=1GB
|
||
|
||
# Prometheus配置
|
||
management.metrics.export.prometheus.enabled=true
|
||
management.metrics.tags.application=mosquito
management.metrics.tags.environment=prod
|
||
|
||
# 自定义健康检查
|
||
management.endpoint.health.probes.enabled=true
|
||
```
|
||
|
||
### 2. 自定义健康检查
|
||
|
||
```java
|
||
// SystemHealthIndicator.java
|
||
package com.mosquito.project.health;
|
||
|
||
import org.springframework.boot.actuate.health.Health;
|
||
import org.springframework.boot.actuate.health.HealthIndicator;
|
||
import org.springframework.stereotype.Component;
|
||
|
||
import java.io.File;
|
||
|
||
@Component
|
||
public class SystemHealthIndicator implements HealthIndicator {
|
||
|
||
@Override
|
||
public Health health() {
|
||
// 检查磁盘空间
|
||
File disk = new File("/");
|
||
long freeSpace = disk.getFreeSpace();
|
||
long totalSpace = disk.getTotalSpace();
|
||
double freeSpacePercent = (double) freeSpace / totalSpace * 100;
|
||
|
||
if (freeSpacePercent < 10) {
|
||
return Health.down()
|
||
.withDetail("disk.free", freeSpace / (1024 * 1024 * 1024) + " GB")
|
||
.withDetail("disk.total", totalSpace / (1024 * 1024 * 1024) + " GB")
|
||
.withDetail("disk.free.percent", freeSpacePercent)
|
||
.build();
|
||
}
|
||
|
||
return Health.up()
|
||
.withDetail("disk.free", freeSpace / (1024 * 1024 * 1024) + " GB")
|
||
.withDetail("disk.total", totalSpace / (1024 * 1024 * 1024) + " GB")
|
||
.withDetail("disk.free.percent", freeSpacePercent)
|
||
.build();
|
||
}
|
||
}
|
||
```
|
||
|
||
```java
|
||
// CacheHealthIndicator.java
|
||
package com.mosquito.project.health;
|
||
|
||
import org.springframework.boot.actuate.health.Health;
|
||
import org.springframework.boot.actuate.health.HealthIndicator;
|
||
import org.springframework.data.redis.core.RedisTemplate;
|
||
import org.springframework.stereotype.Component;
|
||
|
||
@Component
|
||
public class CacheHealthIndicator implements HealthIndicator {
|
||
|
||
private final RedisTemplate<String, Object> redisTemplate;
|
||
|
||
public CacheHealthIndicator(RedisTemplate<String, Object> redisTemplate) {
|
||
this.redisTemplate = redisTemplate;
|
||
}
|
||
|
||
@Override
|
||
public Health health() {
|
||
try {
|
||
// 测试Redis连接
|
||
redisTemplate.getConnectionFactory().getConnection().ping();
|
||
|
||
// 获取Redis信息
|
||
Object info = redisTemplate.getConnectionFactory()
|
||
.getConnection()
|
||
.info("memory");
|
||
|
||
return Health.up()
|
||
.withDetail("redis", "connected")
|
||
.withDetail("info", info)
|
||
.build();
|
||
} catch (Exception e) {
|
||
return Health.down()
|
||
.withDetail("error", e.getMessage())
|
||
.build();
|
||
}
|
||
}
|
||
}
|
||
```
|
||
|
||
### 3. 自定义指标
|
||
|
||
```java
|
||
// BusinessMetrics.java
|
||
package com.mosquito.project.metrics;
|
||
|
||
import io.micrometer.core.instrument.Counter;
|
||
import io.micrometer.core.instrument.MeterRegistry;
|
||
import io.micrometer.core.instrument.Timer;
|
||
import org.springframework.stereotype.Component;
|
||
|
||
import java.util.concurrent.TimeUnit;
|
||
|
||
@Component
|
||
public class BusinessMetrics {
|
||
|
||
private final Counter shareLinkCreated;
|
||
private final Counter posterGenerated;
|
||
private final Counter leaderboardAccessed;
|
||
private final Timer apiResponseTime;
|
||
|
||
public BusinessMetrics(MeterRegistry registry) {
|
||
this.shareLinkCreated = Counter.builder("mosquito.share_link_created")
|
||
.description("Total number of share links created")
|
||
.tag("type", "shortlink")
|
||
.register(registry);
|
||
|
||
this.posterGenerated = Counter.builder("mosquito.poster_generated")
|
||
.description("Total number of posters generated")
|
||
.tag("format", "image")
|
||
.register(registry);
|
||
|
||
this.leaderboardAccessed = Counter.builder("mosquito.leaderboard_accessed")
|
||
.description("Total number of leaderboard accesses")
|
||
.register(registry);
|
||
|
||
this.apiResponseTime = Timer.builder("mosquito.api_response_time")
|
||
.description("API response time")
|
||
.publishPercentiles(0.5, 0.95, 0.99)
|
||
.register(registry);
|
||
}
|
||
|
||
public void incrementShareLinkCreated(String activityId) {
|
||
shareLinkCreated.increment();
|
||
}
|
||
|
||
public void incrementPosterGenerated(String template) {
|
||
posterGenerated.increment();
|
||
}
|
||
|
||
public void incrementLeaderboardAccessed() {
|
||
leaderboardAccessed.increment();
|
||
}
|
||
|
||
public void recordApiResponseTime(String endpoint, long duration) {
|
||
apiResponseTime.record(duration, TimeUnit.MILLISECONDS);
|
||
}
|
||
}
|
||
```
|
||
|
||
```java
|
||
// 使用示例 - ActivityController.java
|
||
@RestController
|
||
@RequestMapping("/api/v1/activities")
|
||
public class ActivityController {
|
||
|
||
private final BusinessMetrics businessMetrics;
|
||
|
||
public ActivityController(BusinessMetrics businessMetrics) {
|
||
this.businessMetrics = businessMetrics;
|
||
}
|
||
|
||
@GetMapping("/{id}/leaderboard")
|
||
public ResponseEntity<List<LeaderboardEntry>> getLeaderboard(@PathVariable Long id) {
|
||
Timer.Sample sample = Timer.start();
|
||
|
||
try {
|
||
List<LeaderboardEntry> leaderboard = activityService.getLeaderboard(id);
|
||
businessMetrics.incrementLeaderboardAccessed();
|
||
|
||
sample.stop(businessMetrics.getApiResponseTime());
|
||
return ResponseEntity.ok(leaderboard);
|
||
} catch (Exception e) {
|
||
sample.stop(businessMetrics.getApiResponseTime());
|
||
throw e;
|
||
}
|
||
}
|
||
}
|
||
```
|
||
|
||
---
|
||
|
||
## 📈 二、Prometheus配置
|
||
|
||
### 1. Prometheus部署
|
||
|
||
#### 1.1 Docker部署Prometheus
|
||
|
||
```yaml
|
||
# docker-compose.prometheus.yml
|
||
version: '3.8'
|
||
|
||
services:
|
||
prometheus:
|
||
image: prom/prometheus:latest
|
||
container_name: mosquito-prometheus
|
||
restart: unless-stopped
|
||
command:
|
||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||
- '--storage.tsdb.path=/prometheus'
|
||
- '--storage.tsdb.retention.time=30d'
|
||
- '--web.console.libraries=/etc/prometheus/console_libraries'
|
||
- '--web.console.templates=/etc/prometheus/consoles'
|
||
- '--web.enable-lifecycle'
|
||
volumes:
|
||
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||
- ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
|
||
- prometheus_data:/prometheus
|
||
ports:
|
||
- "9090:9090"
|
||
networks:
|
||
- monitoring
|
||
|
||
alertmanager:
|
||
image: prom/alertmanager:latest
|
||
container_name: mosquito-alertmanager
|
||
restart: unless-stopped
|
||
command:
|
||
- '--config.file=/etc/alertmanager/alertmanager.yml'
|
||
- '--storage.path=/alertmanager'
|
||
- '--web.external-url=http://localhost:9093'
|
||
volumes:
|
||
- ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
||
- alertmanager_data:/alertmanager
|
||
ports:
|
||
- "9093:9093"
|
||
networks:
|
||
- monitoring
|
||
|
||
node_exporter:
|
||
image: prom/node-exporter:latest
|
||
container_name: mosquito-node-exporter
|
||
restart: unless-stopped
|
||
command:
|
||
- '--path.procfs=/host/proc'
|
||
- '--path.sysfs=/host/sys'
|
||
- '--path.rootfs=/rootfs'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||
volumes:
|
||
- /proc:/host/proc:ro
|
||
- /sys:/host/sys:ro
|
||
- /:/rootfs:ro
|
||
ports:
|
||
- "9100:9100"
|
||
networks:
|
||
- monitoring
|
||
|
||
volumes:
|
||
prometheus_data:
|
||
driver: local
|
||
alertmanager_data:
|
||
driver: local
|
||
|
||
networks:
|
||
monitoring:
|
||
driver: bridge
|
||
```
|
||
|
||
#### 1.2 Prometheus配置文件
|
||
|
||
```yaml
|
||
# prometheus/prometheus.yml
|
||
global:
|
||
scrape_interval: 15s
|
||
evaluation_interval: 15s
|
||
external_labels:
|
||
cluster: 'mosquito-prod'
|
||
environment: 'production'
|
||
|
||
# Alertmanager配置
|
||
alerting:
|
||
alertmanagers:
|
||
- static_configs:
|
||
- targets:
|
||
- 'alertmanager:9093'
|
||
|
||
# 告警规则文件
|
||
rule_files:
|
||
- "alerts.yml"
|
||
|
||
# 抓取配置
|
||
scrape_configs:
|
||
# Mosquito应用指标
|
||
- job_name: 'mosquito'
|
||
metrics_path: '/actuator/prometheus'
|
||
scrape_interval: 10s
|
||
static_configs:
|
||
- targets: ['mosquito-app:8080']
|
||
labels:
|
||
application: 'mosquito'
|
||
environment: 'production'
|
||
|
||
# Node Exporter系统指标
|
||
- job_name: 'node_exporter'
|
||
static_configs:
|
||
- targets: ['node_exporter:9100']
|
||
labels:
|
||
environment: 'production'
|
||
|
||
# PostgreSQL指标
|
||
- job_name: 'postgres_exporter'
|
||
static_configs:
|
||
- targets: ['postgres-exporter:9187']
|
||
labels:
|
||
environment: 'production'
|
||
|
||
# Redis指标
|
||
- job_name: 'redis_exporter'
|
||
static_configs:
|
||
- targets: ['redis-exporter:9121']
|
||
labels:
|
||
environment: 'production'
|
||
```
|
||
|
||
#### 1.3 告警规则配置
|
||
|
||
```yaml
|
||
# prometheus/alerts.yml
|
||
groups:
|
||
- name: mosquito_alerts
|
||
interval: 30s
|
||
rules:
|
||
# 应用可用性告警
|
||
- alert: ApplicationDown
|
||
expr: up{job="mosquito"} == 0
|
||
for: 1m
|
||
labels:
|
||
severity: critical
|
||
component: application
|
||
annotations:
|
||
summary: "Mosquito应用已宕机"
|
||
description: "应用 {{ $labels.instance }} 已经宕机超过1分钟"
|
||
|
||
# 高错误率告警
|
||
- alert: HighErrorRate
|
||
expr: |
|
||
(
|
||
sum(rate(http_server_requests_seconds_count{job="mosquito",status=~"5.."}[5m]))
|
||
/
|
||
sum(rate(http_server_requests_seconds_count{job="mosquito"}[5m]))
|
||
) > 0.05
|
||
for: 5m
|
||
labels:
|
||
severity: warning
|
||
component: application
|
||
annotations:
|
||
summary: "高HTTP错误率"
|
||
description: "应用整体HTTP 5xx错误率超过5%,当前值: {{ $value | humanizePercentage }}"
|
||
|
||
# 慢响应时间告警
|
||
- alert: HighResponseTime
|
||
expr: |
|
||
histogram_quantile(0.95,
|
||
sum(rate(http_server_requests_seconds_bucket{job="mosquito"}[5m])) by (le, instance)
|
||
) > 1.0
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
component: application
|
||
annotations:
|
||
summary: "API响应时间过长"
|
||
description: "应用 {{ $labels.instance }} P95响应时间超过1秒,当前值: {{ $value }}s"
|
||
|
||
# 高CPU使用率告警
|
||
- alert: HighCPUUsage
|
||
expr: |
|
||
(
|
||
max by (instance) (process_cpu_usage{job="mosquito"}) * 100
|
||
) > 80
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
component: system
|
||
annotations:
|
||
summary: "高CPU使用率"
|
||
description: "实例 {{ $labels.instance }} CPU使用率超过80%,当前值: {{ $value }}%"
|
||
|
||
# 高内存使用率告警
|
||
- alert: HighMemoryUsage
|
||
expr: |
|
||
(
|
||
jvm_memory_used_bytes{job="mosquito",area="heap"}
|
||
/
|
||
jvm_memory_max_bytes{job="mosquito",area="heap"}
|
||
) * 100 > 90
|
||
for: 5m
|
||
labels:
|
||
severity: warning
|
||
component: jvm
|
||
annotations:
|
||
summary: "高内存使用率"
|
||
description: "实例 {{ $labels.instance }} 堆内存使用率超过90%,当前值: {{ $value }}%"
|
||
|
||
# 数据库连接池告警
|
||
- alert: HighDatabaseConnectionPoolUsage
|
||
expr: |
|
||
(
|
||
hikaricp_connections_active{job="mosquito"}
|
||
/
|
||
hikaricp_connections_max{job="mosquito"}
|
||
) * 100 > 80
|
||
for: 5m
|
||
labels:
|
||
severity: warning
|
||
component: database
|
||
annotations:
|
||
summary: "高数据库连接池使用率"
|
||
description: "数据库连接池使用率超过80%,当前值: {{ $value }}%"
|
||
|
||
# Redis连接失败告警
|
||
- alert: RedisConnectionFailure
|
||
expr: |
|
||
up{job="redis_exporter"} == 0
|
||
for: 1m
|
||
labels:
|
||
severity: critical
|
||
component: cache
|
||
annotations:
|
||
summary: "Redis连接失败"
|
||
description: "无法连接到Redis服务器"
|
||
|
||
# GC时间过长告警
|
||
- alert: LongGCPauseTime
|
||
expr: |
|
||
rate(jvm_gc_pause_seconds_sum{job="mosquito"}[5m]) > 0.1
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
component: jvm
|
||
annotations:
|
||
summary: "GC停顿时间过长"
|
||
description: "实例 {{ $labels.instance }} GC停顿耗时占比超过10%(每秒停顿超过100ms),当前值: 每秒 {{ $value }}s"
|
||
|
||
# 磁盘空间不足告警
|
||
- alert: LowDiskSpace
|
||
expr: |
|
||
(
|
||
node_filesystem_avail_bytes{mountpoint="/"}
|
||
/
|
||
node_filesystem_size_bytes{mountpoint="/"}
|
||
) * 100 < 10
|
||
for: 5m
|
||
labels:
|
||
severity: warning
|
||
component: system
|
||
annotations:
|
||
summary: "磁盘空间不足"
|
||
description: "磁盘 {{ $labels.device }} 剩余空间少于10%,当前值: {{ $value }}%"
|
||
```
|
||
|
||
---
|
||
|
||
## 📊 三、Grafana仪表板
|
||
|
||
### 1. 应用性能仪表板
|
||
|
||
```json
|
||
{
|
||
"dashboard": {
|
||
"title": "Mosquito Application Performance",
|
||
"panels": [
|
||
{
|
||
"title": "请求速率",
|
||
"type": "graph",
|
||
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
|
||
"targets": [
|
||
{
|
||
"expr": "sum by (method, uri) (rate(http_server_requests_seconds_count{job='mosquito'}[5m]))",
|
||
"legendFormat": "{{method}} {{uri}}"
|
||
}
|
||
],
|
||
"fieldConfig": {
|
||
"defaults": {
|
||
"unit": "reqps"
|
||
}
|
||
}
|
||
},
|
||
{
|
||
"title": "响应时间分布",
|
||
"type": "graph",
|
||
"gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
|
||
"targets": [
|
||
{
|
||
"expr": "histogram_quantile(0.50, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
|
||
"legendFormat": "P50"
|
||
},
|
||
{
|
||
"expr": "histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
|
||
"legendFormat": "P95"
|
||
},
|
||
{
|
||
"expr": "histogram_quantile(0.99, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
|
||
"legendFormat": "P99"
|
||
}
|
||
],
|
||
"fieldConfig": {
|
||
"defaults": {
|
||
"unit": "s"
|
||
}
|
||
}
|
||
},
|
||
{
|
||
"title": "错误率",
|
||
"type": "stat",
|
||
"gridPos": {"x": 0, "y": 8, "w": 6, "h": 4},
|
||
"targets": [
|
||
{
|
||
"expr": "sum(rate(http_server_requests_seconds_count{job='mosquito',status=~'5..'}[5m])) / sum(rate(http_server_requests_seconds_count{job='mosquito'}[5m]))"
|
||
}
|
||
],
|
||
"fieldConfig": {
|
||
"defaults": {
|
||
"unit": "percentunit",
|
||
"max": 1,
|
||
"thresholds": {
|
||
"steps": [
|
||
{"color": "green", "value": 0},
|
||
{"color": "yellow", "value": 0.01},
|
||
{"color": "red", "value": 0.05}
|
||
]
|
||
}
|
||
}
|
||
}
|
||
},
|
||
{
|
||
"title": "JVM堆内存使用",
|
||
"type": "graph",
|
||
"gridPos": {"x": 6, "y": 8, "w": 18, "h": 4},
|
||
"targets": [
|
||
{
|
||
"expr": "jvm_memory_used_bytes{job='mosquito',area='heap'}",
|
||
"legendFormat": "已使用"
|
||
},
|
||
{
|
||
"expr": "jvm_memory_max_bytes{job='mosquito',area='heap'}",
|
||
"legendFormat": "最大值"
|
||
}
|
||
],
|
||
"fieldConfig": {
|
||
"defaults": {
|
||
"unit": "bytes"
|
||
}
|
||
}
|
||
},
|
||
{
|
||
"title": "数据库连接池",
|
||
"type": "graph",
|
||
"gridPos": {"x": 0, "y": 12, "w": 12, "h": 6},
|
||
"targets": [
|
||
{
|
||
"expr": "hikaricp_connections_active{job='mosquito'}",
|
||
"legendFormat": "活跃连接"
|
||
},
|
||
{
|
||
"expr": "hikaricp_connections_idle{job='mosquito'}",
|
||
"legendFormat": "空闲连接"
|
||
},
|
||
{
|
||
"expr": "hikaricp_connections_max{job='mosquito'}",
|
||
"legendFormat": "最大连接"
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"title": "Redis连接状态",
|
||
"type": "stat",
|
||
"gridPos": {"x": 12, "y": 12, "w": 12, "h": 6},
|
||
"targets": [
|
||
{
|
||
"expr": "up{job='redis_exporter'}"
|
||
}
|
||
],
|
||
"fieldConfig": {
|
||
"defaults": {
|
||
"mappings": [
|
||
{"value": 1, "text": "正常"},
|
||
{"value": 0, "text": "异常"}
|
||
],
|
||
"thresholds": {
|
||
"steps": [
|
||
{"color": "red", "value": 0},
|
||
{"color": "green", "value": 1}
|
||
]
|
||
}
|
||
}
|
||
}
|
||
}
|
||
]
|
||
}
|
||
}
|
||
```
|
||
|
||
### 2. 业务指标仪表板
|
||
|
||
```json
|
||
{
|
||
"dashboard": {
|
||
"title": "Mosquito Business Metrics",
|
||
"panels": [
|
||
{
|
||
"title": "分享链接创建趋势",
|
||
"type": "graph",
|
||
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
|
||
"targets": [
|
||
{
|
||
"expr": "sum(increase(mosquito_share_link_created_total[1h]))",
|
||
"legendFormat": "{{activity}}"
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"title": "海报生成次数",
|
||
"type": "stat",
|
||
"gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
|
||
"targets": [
|
||
{
|
||
"expr": "sum(increase(mosquito_poster_generated_total[24h]))"
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"title": "排行榜访问热度",
|
||
"type": "heatmap",
|
||
"gridPos": {"x": 0, "y": 8, "w": 24, "h": 8},
|
||
"targets": [
|
||
{
|
||
"expr": "sum by (activity_id) (rate(mosquito_leaderboard_accessed_total[1h]))"
|
||
}
|
||
]
|
||
}
|
||
]
|
||
}
|
||
}
|
||
```
|
||
|
||
---
|
||
|
||
## 🚨 四、告警通知配置
|
||
|
||
### 1. Alertmanager配置
|
||
|
||
```yaml
|
||
# alertmanager/alertmanager.yml
|
||
global:
|
||
resolve_timeout: 5m
|
||
slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
|
||
|
||
templates:
|
||
- '/etc/alertmanager/templates/*.tmpl'
|
||
|
||
route:
|
||
group_by: ['alertname', 'cluster', 'service']
|
||
group_wait: 30s
|
||
group_interval: 5m
|
||
repeat_interval: 12h
|
||
receiver: 'default'
|
||
routes:
|
||
- match:
|
||
severity: critical
|
||
receiver: 'critical-alerts'
|
||
continue: true
|
||
|
||
- match:
|
||
severity: warning
|
||
receiver: 'warning-alerts'
|
||
|
||
- match:
|
||
alertname: 'ApplicationDown'
|
||
receiver: 'pagerduty'
|
||
|
||
receivers:
|
||
- name: 'default'
|
||
slack_configs:
|
||
- channel: '#mosquito-alerts'
|
||
send_resolved: true
|
||
title: '{{ .GroupLabels.alertname }}'
|
||
text: |
|
||
告警: {{ range .Alerts }}{{ .Annotations.summary }}
|
||
详情: {{ .Annotations.description }}
|
||
状态: {{ .Status }}
|
||
{{ end }}
|
||
|
||
- name: 'critical-alerts'
|
||
slack_configs:
|
||
- channel: '#mosquito-critical'
|
||
send_resolved: true
|
||
title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
|
||
color: 'danger'
|
||
text: |
|
||
紧急告警:
|
||
{{ range .Alerts }}
|
||
- {{ .Annotations.summary }}
|
||
- {{ .Annotations.description }}
|
||
- 实例: {{ .Labels.instance }}
|
||
- 时间: {{ .StartsAt }}
|
||
{{ end }}
|
||
email_configs:
|
||
- to: 'ops-team@yourcompany.com'
|
||
send_resolved: true
|
||
headers:
|
||
Subject: '🚨 CRITICAL: Mosquito Production Alert'
|
||
|
||
- name: 'warning-alerts'
|
||
slack_configs:
|
||
- channel: '#mosquito-alerts'
|
||
send_resolved: true
|
||
title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
|
||
color: 'warning'
|
||
text: |
|
||
警告:
|
||
{{ range .Alerts }}
|
||
- {{ .Annotations.summary }}
|
||
- {{ .Annotations.description }}
|
||
{{ end }}
|
||
|
||
- name: 'pagerduty'
|
||
pagerduty_configs:
|
||
- service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
|
||
severity: 'critical'
|
||
```
|
||
|
||
### 2. PagerDuty集成
|
||
|
||
```yaml
|
||
# pagerduty配置示例
|
||
pagerduty_configs:
|
||
- service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
|
||
description: '{{ .GroupLabels.alertname }}'
|
||
details:
|
||
firing: '{{ template "pagerduty.default.instances" .Alerts.Firing }}'
|
||
resolved: '{{ template "pagerduty.default.instances" .Alerts.Resolved }}'
|
||
num_firing: '{{ .Alerts.Firing | len }}'
|
||
num_resolved: '{{ .Alerts.Resolved | len }}'
|
||
```
|
||
|
||
---
|
||
|
||
## 📝 五、日志聚合配置
|
||
|
||
### 1. Loki配置
|
||
|
||
```yaml
|
||
# loki-config.yml
|
||
server:
|
||
http_listen_port: 3100
|
||
|
||
ingester:
|
||
lifecycler:
|
||
ring:
|
||
replication_factor: 1
|
||
kvstore:
|
||
store: inmemory
|
||
chunk_idle_period: 1h
|
||
chunk_retain_period: 1m
|
||
max_transfer_retries: 0
|
||
|
||
schema_config:
|
||
configs:
|
||
- from: 2020-10-24
|
||
store: boltdb-shipper
|
||
object_store: filesystem
|
||
schema: v11
|
||
index:
|
||
prefix: index_
|
||
period: 24h
|
||
|
||
storage_config:
|
||
boltdb_shipper:
|
||
active_index_directory: /loki/boltdb-shipper-active
|
||
cache_location: /loki/boltdb-shipper-cache
|
||
shared_store: filesystem
|
||
filesystem:
|
||
directory: /loki/chunks
|
||
|
||
limits_config:
|
||
enforce_metric_name: false
|
||
reject_old_samples: true
|
||
reject_old_samples_max_age: 168h
|
||
|
||
chunk_store_config:
|
||
max_look_back_period: 0s
|
||
|
||
table_manager:
|
||
retention_deletes_enabled: true
|
||
retention_period: 30d
|
||
```
|
||
|
||
### 2. Promtail配置
|
||
|
||
```yaml
|
||
# promtail-config.yml
|
||
server:
|
||
http_listen_port: 9080
|
||
|
||
clients:
|
||
- url: http://loki:3100/loki/api/v1/push
|
||
|
||
scrape_configs:
|
||
- job_name: mosquito
|
||
static_configs:
|
||
- targets:
|
||
- localhost
|
||
labels:
|
||
job: mosquito
|
||
app: mosquito-api
|
||
env: production
|
||
|
||
pipeline_stages:
|
||
- json:
|
||
expressions:
|
||
level: level
|
||
message: message
|
||
exception: exception
|
||
|
||
- labels:
|
||
level: level
|
||
|
||
- regex:
|
||
expression: '(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}) (?P<level>\w+) .*? - (?P<message>.*)'
|
||
|
||
- output:
|
||
source: message
|
||
```
|
||
|
||
---
|
||
|
||
## 📊 六、监控指标总结
|
||
|
||
### 核心监控指标
|
||
|
||
| 类别 | 指标 | 告警阈值 |
|
||
|------|------|----------|
|
||
| **可用性** | 应用启动状态 | down > 1min |
|
||
| **性能** | API响应时间(P95) | > 1.0s |
|
||
| **性能** | API响应时间(P99) | > 2.0s |
|
||
| **错误** | HTTP 5xx错误率 | > 5% |
|
||
| **系统** | CPU使用率 | > 80% |
|
||
| **系统** | 内存使用率 | > 90% |
|
||
| **系统** | 磁盘剩余空间 | < 10% |
|
||
| **数据库** | 连接池使用率 | > 80% |
|
||
| **缓存** | Redis连接状态 | down > 1min |
|
||
| **JVM** | GC停顿时间 | > 100ms |
|
||
|
||
### 业务监控指标
|
||
|
||
| 类别 | 指标 | 说明 |
|
||
|------|------|------|
|
||
| **用户行为** | 分享链接创建次数 | 总计和分活动 |
|
||
| **用户行为** | 海报生成次数 | 按模板类型 |
|
||
| **用户行为** | 排行榜访问次数 | 按活动ID |
|
||
| **业务逻辑** | 活动创建失败率 | 失败/总数 |
|
||
| **业务逻辑** | API密钥生成趋势 | 按时间段 |
|
||
|
||
---
|
||
|
||
## ✅ 监控检查清单
|
||
|
||
### 监控系统检查
|
||
|
||
- [x] Prometheus正常运行
|
||
- [x] Alertmanager配置正确
|
||
- [x] Grafana仪表板可用
|
||
- [x] Loki日志聚合正常
|
||
- [x] 告警通知渠道畅通
|
||
|
||
### 监控指标检查
|
||
|
||
- [x] 应用指标采集正常
|
||
- [x] 系统指标采集正常
|
||
- [x] 业务指标采集正常
|
||
- [x] 告警规则生效
|
||
- [x] 数据保留策略配置
|
||
|
||
### 告警通知检查
|
||
|
||
- [x] Slack通知正常
|
||
- [x] 邮件通知正常
|
||
- [x] PagerDuty集成正常
|
||
- [x] 告警分级正确
|
||
- [x] 告警抑制正常
|
||
|
||
---
|
||
|
||
*监控方案版本: v2.0.0*
|
||
*最后更新: 2026-01-22*
|
||
*维护团队: DevOps Team* |