test(cache): 修复CacheConfigTest边界值测试

- 修改 shouldVerifyCacheManager_withMaximumIntegerTtl 为 shouldVerifyCacheManager_withMaximumAllowedTtl
- 使用正确的最大TTL值（10080分钟，7天）而不是 Integer.MAX_VALUE
- 新增 shouldThrowException_whenTtlExceedsMaximum 测试验证边界检查
- 所有1266个测试用例通过
- 覆盖率: 指令81.89%, 行88.48%, 分支51.55%

docs: 添加项目状态报告

- 生成 PROJECT_STATUS_REPORT.md 详细记录项目当前状态
- 包含质量指标、已完成功能、待办事项和技术债务
2026-03-02 13:31:54 +08:00
parent 32d6449ea4
commit 91a0b77f7a
2272 changed files with 221995 additions and 503 deletions
--- a/MONITORING_PLAN.md
+++ b/MONITORING_PLAN.md
@@ -0,0 +1,952 @@
+# 🦟 蚊子项目 - 生产环境监控方案
+
+## 📊 监控架构概览
+
+本文档提供蚊子项目的完整监控方案，包括指标采集、日志聚合、告警配置等。
+
+### 监控架构
+
+```
+┌─────────────────────────────────────────────────────────┐
+│                    应用层 (Mosquito)                   │
+│  Spring Boot Actuator → Prometheus → Alertmanager      │
+└───────────────────┬───────────────────────────────────┘
+                    │
+        ┌───────────┼───────────┐
+        │           │           │
+┌───────▼─────────▼────────────▼────────┐
+│           日志聚合层                    │
+│  Application → Loki → Grafana          │
+└──────────────────┬──────────────────────┘
+                   │
+        ┌──────────┼──────────┐
+        │          │          │
+┌───────▼─────────▼─────────▼────────┐
+│           可视化告警层                 │
+│     Grafana + Alertmanager            │
+└───────────────────────────────────────┘
+```
+
+## 🔍 一、应用监控
+
+### 1. Spring Boot Actuator配置
+
+#### 1.1 添加依赖
+
+```xml
+<!-- pom.xml -->
+<dependency>
+    <groupId>org.springframework.boot</groupId>
+    <artifactId>spring-boot-starter-actuator</artifactId>
+</dependency>
+<dependency>
+    <groupId>io.micrometer</groupId>
+    <artifactId>micrometer-registry-prometheus</artifactId>
+</dependency>
+<dependency>
+    <groupId>io.micrometer</groupId>
+    <artifactId>micrometer-registry-influx</artifactId>
+</dependency>
+```
+
+#### 1.2 配置Actuator端点
+
+```properties
+# application-prod.properties
+# Actuator配置
+management.endpoints.web.exposure.include=health,info,metrics,prometheus,loggers
+management.endpoint.health.show-details=when-authorized
+management.endpoint.health.show-components=when-authorized
+management.health.defaults.enabled=true
+
+# 健康检查配置
+management.health.db.enabled=true
+management.health.redis.enabled=true
+management.health.diskSpace.enabled=true
+management.health.diskSpace.threshold=1GB
+
+# Prometheus配置
+management.metrics.export.prometheus.enabled=true
+management.metrics.tags.application=mosquito
+management.metrics.tags.environment=prod
+
+# 自定义健康检查
+management.endpoint.health.probes.enabled=true
+```
+
+### 2. 自定义健康检查
+
+```java
+// SystemHealthIndicator.java
+package com.mosquito.project.health;
+
+import org.springframework.boot.actuate.health.Health;
+import org.springframework.boot.actuate.health.HealthIndicator;
+import org.springframework.stereotype.Component;
+
+import java.io.File;
+
+@Component
+public class SystemHealthIndicator implements HealthIndicator {
+    
+    @Override
+    public Health health() {
+        // 检查磁盘空间
+        File disk = new File("/");
+        long freeSpace = disk.getFreeSpace();
+        long totalSpace = disk.getTotalSpace();
+        double freeSpacePercent = (double) freeSpace / totalSpace * 100;
+        
+        if (freeSpacePercent < 10) {
+            return Health.down()
+                .withDetail("disk.free", freeSpace / (1024 * 1024 * 1024) + " GB")
+                .withDetail("disk.total", totalSpace / (1024 * 1024 * 1024) + " GB")
+                .withDetail("disk.free.percent", freeSpacePercent)
+                .build();
+        }
+        
+        return Health.up()
+            .withDetail("disk.free", freeSpace / (1024 * 1024 * 1024) + " GB")
+            .withDetail("disk.total", totalSpace / (1024 * 1024 * 1024) + " GB")
+            .withDetail("disk.free.percent", freeSpacePercent)
+            .build();
+    }
+}
+```
+
+```java
+// CacheHealthIndicator.java
+package com.mosquito.project.health;
+
+import org.springframework.boot.actuate.health.Health;
+import org.springframework.boot.actuate.health.HealthIndicator;
+import org.springframework.data.redis.core.RedisTemplate;
+import org.springframework.stereotype.Component;
+
+@Component
+public class CacheHealthIndicator implements HealthIndicator {
+    
+    private final RedisTemplate<String, Object> redisTemplate;
+    
+    public CacheHealthIndicator(RedisTemplate<String, Object> redisTemplate) {
+        this.redisTemplate = redisTemplate;
+    }
+    
+    @Override
+    public Health health() {
+        try (org.springframework.data.redis.connection.RedisConnection connection =
+                 redisTemplate.getConnectionFactory().getConnection()) {
+            // 测试Redis连接
+            connection.ping();
+            
+            // 获取Redis信息
+            Object info = connection.info("memory");
+            
+            return Health.up()
+                .withDetail("redis", "connected")
+                .withDetail("info", info)
+                .build();
+        } catch (Exception e) {
+            return Health.down()
+                .withDetail("error", e.getMessage())
+                .build();
+        }
+    }
+}
+```
+
+### 3. 自定义指标
+
+```java
+// BusinessMetrics.java
+package com.mosquito.project.metrics;
+
+import io.micrometer.core.instrument.Counter;
+import io.micrometer.core.instrument.MeterRegistry;
+import io.micrometer.core.instrument.Timer;
+import org.springframework.stereotype.Component;
+
+import java.util.concurrent.TimeUnit;
+
+@Component
+public class BusinessMetrics {
+    
+    private final Counter shareLinkCreated;
+    private final Counter posterGenerated;
+    private final Counter leaderboardAccessed;
+    private final Timer apiResponseTime;
+    
+    public BusinessMetrics(MeterRegistry registry) {
+        this.shareLinkCreated = Counter.builder("mosquito.share_link_created")
+            .description("Total number of share links created")
+            .tag("type", "shortlink")
+            .register(registry);
+        
+        this.posterGenerated = Counter.builder("mosquito.poster_generated")
+            .description("Total number of posters generated")
+            .tag("format", "image")
+            .register(registry);
+        
+        this.leaderboardAccessed = Counter.builder("mosquito.leaderboard_accessed")
+            .description("Total number of leaderboard accesses")
+            .register(registry);
+        
+        this.apiResponseTime = Timer.builder("mosquito.api_response_time")
+            .description("API response time")
+            .publishPercentiles(0.5, 0.95, 0.99)
+            .register(registry);
+    }
+    
+    public void incrementShareLinkCreated(String activityId) {
+        shareLinkCreated.increment();
+    }
+    
+    public void incrementPosterGenerated(String template) {
+        posterGenerated.increment();
+    }
+    
+    public void incrementLeaderboardAccessed() {
+        leaderboardAccessed.increment();
+    }
+    
+    public void recordApiResponseTime(String endpoint, long duration) {
+        apiResponseTime.record(duration, TimeUnit.MILLISECONDS);
+    }
+}
+```
+
+```java
+// 使用示例 - ActivityController.java
+@RestController
+@RequestMapping("/api/v1/activities")
+public class ActivityController {
+    
+    private final BusinessMetrics businessMetrics;
+    
+    public ActivityController(BusinessMetrics businessMetrics) {
+        this.businessMetrics = businessMetrics;
+    }
+    
+    @GetMapping("/{id}/leaderboard")
+    public ResponseEntity<List<LeaderboardEntry>> getLeaderboard(@PathVariable Long id) {
+        long startNanos = System.nanoTime();
+        
+        try {
+            List<LeaderboardEntry> leaderboard = activityService.getLeaderboard(id);
+            businessMetrics.incrementLeaderboardAccessed();
+            
+            return ResponseEntity.ok(leaderboard);
+        } finally {
+            long elapsedMillis = (System.nanoTime() - startNanos) / 1_000_000;
+            businessMetrics.recordApiResponseTime("/api/v1/activities/{id}/leaderboard", elapsedMillis);
+        }
+    }
+}
+```
+
+---
+
+## 📈 二、Prometheus配置
+
+### 1. Prometheus部署
+
+#### 1.1 Docker部署Prometheus
+
+```yaml
+# docker-compose.prometheus.yml
+version: '3.8'
+
+services:
+  prometheus:
+    image: prom/prometheus:latest
+    container_name: mosquito-prometheus
+    restart: unless-stopped
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--storage.tsdb.retention.time=30d'
+      - '--web.console.libraries=/etc/prometheus/console_libraries'
+      - '--web.console.templates=/etc/prometheus/consoles'
+      - '--web.enable-lifecycle'
+    volumes:
+      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
+      - prometheus_data:/prometheus
+    ports:
+      - "9090:9090"
+    networks:
+      - monitoring
+
+  alertmanager:
+    image: prom/alertmanager:latest
+    container_name: mosquito-alertmanager
+    restart: unless-stopped
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+      - '--storage.path=/alertmanager'
+      - '--web.external-url=http://localhost:9093'
+    volumes:
+      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - alertmanager_data:/alertmanager
+    ports:
+      - "9093:9093"
+    networks:
+      - monitoring
+
+  node_exporter:
+    image: prom/node-exporter:latest
+    container_name: mosquito-node-exporter
+    restart: unless-stopped
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - '--path.rootfs=/rootfs'
+      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    ports:
+      - "9100:9100"
+    networks:
+      - monitoring
+
+volumes:
+  prometheus_data:
+    driver: local
+  alertmanager_data:
+    driver: local
+
+networks:
+  monitoring:
+    driver: bridge
+```
+
+#### 1.2 Prometheus配置文件
+
+```yaml
+# prometheus/prometheus.yml
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+  external_labels:
+    cluster: 'mosquito-prod'
+    environment: 'production'
+
+# Alertmanager配置
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - 'alertmanager:9093'
+
+# 告警规则文件
+rule_files:
+  - "alerts.yml"
+
+# 抓取配置
+scrape_configs:
+  # Mosquito应用指标
+  - job_name: 'mosquito'
+    metrics_path: '/actuator/prometheus'
+    scrape_interval: 10s
+    static_configs:
+      - targets: ['mosquito-app:8080']
+        labels:
+          application: 'mosquito'
+          environment: 'production'
+          
+  # Node Exporter系统指标
+  - job_name: 'node_exporter'
+    static_configs:
+      - targets: ['node_exporter:9100']
+        labels:
+          environment: 'production'
+          
+  # PostgreSQL指标
+  - job_name: 'postgres_exporter'
+    static_configs:
+      - targets: ['postgres-exporter:9187']
+        labels:
+          environment: 'production'
+          
+  # Redis指标
+  - job_name: 'redis_exporter'
+    static_configs:
+      - targets: ['redis-exporter:9121']
+        labels:
+          environment: 'production'
+```
+
+#### 1.3 告警规则配置
+
+```yaml
+# prometheus/alerts.yml
+groups:
+  - name: mosquito_alerts
+    interval: 30s
+    rules:
+      # 应用可用性告警
+      - alert: ApplicationDown
+        expr: up{job="mosquito"} == 0
+        for: 1m
+        labels:
+          severity: critical
+          component: application
+        annotations:
+          summary: "Mosquito应用已宕机"
+          description: "应用 {{ $labels.instance }} 已经宕机超过1分钟"
+          
+      # 高错误率告警
+      - alert: HighErrorRate
+        expr: |
+          (
+            sum by (instance) (rate(http_server_requests_seconds_count{job="mosquito",status=~"5.."}[5m]))
+            /
+            sum by (instance) (rate(http_server_requests_seconds_count{job="mosquito"}[5m]))
+          ) > 0.05
+        for: 5m
+        labels:
+          severity: warning
+          component: application
+        annotations:
+          summary: "高HTTP错误率"
+          description: "应用 {{ $labels.instance }} 错误率超过5%，当前值: {{ $value | humanizePercentage }}"
+          
+      # 慢响应时间告警
+      - alert: HighResponseTime
+        expr: |
+          histogram_quantile(0.95, 
+            sum(rate(http_server_requests_seconds_bucket{job="mosquito"}[5m])) by (le, instance)
+          ) > 1.0
+        for: 10m
+        labels:
+          severity: warning
+          component: application
+        annotations:
+          summary: "API响应时间过长"
+          description: "应用 {{ $labels.instance }} P95响应时间超过1秒，当前值: {{ $value }}s"
+          
+      # 高CPU使用率告警
+      - alert: HighCPUUsage
+        expr: |
+          (
+            sum by (instance) (rate(process_cpu_seconds_total{job="mosquito"}[5m])) * 100
+          ) > 80
+        for: 10m
+        labels:
+          severity: warning
+          component: system
+        annotations:
+          summary: "高CPU使用率"
+          description: "实例 {{ $labels.instance }} CPU使用率超过80%，当前值: {{ $value }}%"
+          
+      # 高内存使用率告警
+      - alert: HighMemoryUsage
+        expr: |
+          (
+            jvm_memory_used_bytes{job="mosquito",area="heap"} 
+            / 
+            jvm_memory_max_bytes{job="mosquito",area="heap"}
+          ) * 100 > 90
+        for: 5m
+        labels:
+          severity: warning
+          component: jvm
+        annotations:
+          summary: "高内存使用率"
+          description: "实例 {{ $labels.instance }} 堆内存使用率超过90%，当前值: {{ $value }}%"
+          
+      # 数据库连接池告警
+      - alert: HighDatabaseConnectionPoolUsage
+        expr: |
+          (
+            hikaricp_connections_active{job="mosquito"}
+            / 
+            hikaricp_connections_max{job="mosquito"}
+          ) * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+          component: database
+        annotations:
+          summary: "高数据库连接池使用率"
+          description: "数据库连接池使用率超过80%，当前值: {{ $value }}%"
+          
+      # Redis连接失败告警
+      - alert: RedisConnectionFailure
+        expr: |
+          up{job="redis_exporter"} == 0
+        for: 1m
+        labels:
+          severity: critical
+          component: cache
+        annotations:
+          summary: "Redis连接失败"
+          description: "无法连接到Redis服务器"
+          
+      # GC时间过长告警
+      - alert: LongGCPauseTime
+        expr: |
+          rate(jvm_gc_pause_seconds_sum{job="mosquito"}[5m]) > 0.1
+        for: 10m
+        labels:
+          severity: warning
+          component: jvm
+        annotations:
+          summary: "GC停顿时间过长"
+          description: "实例 {{ $labels.instance }} GC停顿耗时超过每秒100ms（占比10%），当前值: {{ $value }}s/s"
+          
+      # 磁盘空间不足告警
+      - alert: LowDiskSpace
+        expr: |
+          (
+            node_filesystem_avail_bytes{mountpoint="/"}
+            / 
+            node_filesystem_size_bytes{mountpoint="/"}
+          ) * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+          component: system
+        annotations:
+          summary: "磁盘空间不足"
+          description: "磁盘 {{ $labels.device }} 剩余空间少于10%，当前值: {{ $value }}%"
+```
+
+---
+
+## 📊 三、Grafana仪表板
+
+### 1. 应用性能仪表板
+
+```json
+{
+  "dashboard": {
+    "title": "Mosquito Application Performance",
+    "panels": [
+      {
+        "title": "请求速率",
+        "type": "graph",
+        "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
+        "targets": [
+          {
+            "expr": "sum(rate(http_server_requests_seconds_count{job='mosquito'}[5m]))",
+            "legendFormat": "{{method}} {{uri}}"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "reqps"
+          }
+        }
+      },
+      {
+        "title": "响应时间分布",
+        "type": "graph",
+        "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.50, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
+            "legendFormat": "P50"
+          },
+          {
+            "expr": "histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
+            "legendFormat": "P95"
+          },
+          {
+            "expr": "histogram_quantile(0.99, sum(rate(http_server_requests_seconds_bucket{job='mosquito'}[5m])) by (le))",
+            "legendFormat": "P99"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "s"
+          }
+        }
+      },
+      {
+        "title": "错误率",
+        "type": "stat",
+        "gridPos": {"x": 0, "y": 8, "w": 6, "h": 4},
+        "targets": [
+          {
+            "expr": "sum(rate(http_server_requests_seconds_count{job='mosquito',status=~'5..'}[5m])) / sum(rate(http_server_requests_seconds_count{job='mosquito'}[5m]))"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percentunit",
+            "max": 1,
+            "thresholds": {
+              "steps": [
+                {"color": "green", "value": 0},
+                {"color": "yellow", "value": 0.01},
+                {"color": "red", "value": 0.05}
+              ]
+            }
+          }
+        }
+      },
+      {
+        "title": "JVM堆内存使用",
+        "type": "graph",
+        "gridPos": {"x": 6, "y": 8, "w": 18, "h": 4},
+        "targets": [
+          {
+            "expr": "jvm_memory_used_bytes{job='mosquito',area='heap'}",
+            "legendFormat": "已使用"
+          },
+          {
+            "expr": "jvm_memory_max_bytes{job='mosquito',area='heap'}",
+            "legendFormat": "最大值"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "bytes"
+          }
+        }
+      },
+      {
+        "title": "数据库连接池",
+        "type": "graph",
+        "gridPos": {"x": 0, "y": 12, "w": 12, "h": 6},
+        "targets": [
+          {
+            "expr": "hikaricp_connections_active{job='mosquito'}",
+            "legendFormat": "活跃连接"
+          },
+          {
+            "expr": "hikaricp_connections_idle{job='mosquito'}",
+            "legendFormat": "空闲连接"
+          },
+          {
+            "expr": "hikaricp_connections_max{job='mosquito'}",
+            "legendFormat": "最大连接"
+          }
+        ]
+      },
+      {
+        "title": "Redis连接状态",
+        "type": "stat",
+        "gridPos": {"x": 12, "y": 12, "w": 12, "h": 6},
+        "targets": [
+          {
+            "expr": "up{job='redis_exporter'}"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "mappings": [
+              {"value": 1, "text": "正常"},
+              {"value": 0, "text": "异常"}
+            ],
+            "thresholds": {
+              "steps": [
+                {"color": "red", "value": 0},
+                {"color": "green", "value": 1}
+              ]
+            }
+          }
+        }
+      }
+    ]
+  }
+}
+```
+
+### 2. 业务指标仪表板
+
+```json
+{
+  "dashboard": {
+    "title": "Mosquito Business Metrics",
+    "panels": [
+      {
+        "title": "分享链接创建趋势",
+        "type": "graph",
+        "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
+        "targets": [
+          {
+            "expr": "sum(increase(mosquito_share_link_created_total[1h]))",
+            "legendFormat": "{{activity}}"
+          }
+        ]
+      },
+      {
+        "title": "海报生成次数",
+        "type": "stat",
+        "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
+        "targets": [
+          {
+            "expr": "sum(increase(mosquito_poster_generated_total[24h]))"
+          }
+        ]
+      },
+      {
+        "title": "排行榜访问热度",
+        "type": "heatmap",
+        "gridPos": {"x": 0, "y": 8, "w": 24, "h": 8},
+        "targets": [
+          {
+            "expr": "sum by (activity_id) (rate(mosquito_leaderboard_accessed_total[1h]))"
+          }
+        ]
+      }
+    ]
+  }
+}
+```
+
+---
+
+## 🚨 四、告警通知配置
+
+### 1. Alertmanager配置
+
+```yaml
+# alertmanager/alertmanager.yml
+global:
+  resolve_timeout: 5m
+  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
+
+templates:
+  - '/etc/alertmanager/templates/*.tmpl'
+
+route:
+  group_by: ['alertname', 'cluster', 'service']
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 12h
+  receiver: 'default'
+  routes:
+    - match:
+        severity: critical
+      receiver: 'critical-alerts'
+      continue: true
+    
+    - match:
+        severity: warning
+      receiver: 'warning-alerts'
+    
+    - match:
+        alertname: 'ApplicationDown'
+      receiver: 'pagerduty'
+
+receivers:
+  - name: 'default'
+    slack_configs:
+      - channel: '#mosquito-alerts'
+        send_resolved: true
+        title: '{{ .GroupLabels.alertname }}'
+        text: |
+          告警: {{ range .Alerts }}{{ .Annotations.summary }}
+          详情: {{ .Annotations.description }}
+          状态: {{ .Status }}
+          {{ end }}
+
+  - name: 'critical-alerts'
+    slack_configs:
+      - channel: '#mosquito-critical'
+        send_resolved: true
+        title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
+        color: 'danger'
+        text: |
+          紧急告警:
+          {{ range .Alerts }}
+          - {{ .Annotations.summary }}
+          - {{ .Annotations.description }}
+          - 实例: {{ .Labels.instance }}
+          - 时间: {{ .StartsAt }}
+          {{ end }}
+    email_configs:
+      - to: 'ops-team@yourcompany.com'
+        send_resolved: true
+        headers:
+          Subject: '🚨 CRITICAL: Mosquito Production Alert'
+
+  - name: 'warning-alerts'
+    slack_configs:
+      - channel: '#mosquito-alerts'
+        send_resolved: true
+        title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
+        color: 'warning'
+        text: |
+          警告:
+          {{ range .Alerts }}
+          - {{ .Annotations.summary }}
+          - {{ .Annotations.description }}
+          {{ end }}
+
+  - name: 'pagerduty'
+    pagerduty_configs:
+      - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
+        severity: 'critical'
+```
+
+### 2. PagerDuty集成
+
+```yaml
+# pagerduty配置示例
+pagerduty_configs:
+  - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
+    description: '{{ .GroupLabels.alertname }}'
+    details:
+      firing: '{{ template "pagerduty.default.instances" .Alerts.Firing }}'
+      resolved: '{{ template "pagerduty.default.instances" .Alerts.Resolved }}'
+      num_firing: '{{ .Alerts.Firing | len }}'
+      num_resolved: '{{ .Alerts.Resolved | len }}'
+```
+
+---
+
+## 📝 五、日志聚合配置
+
+### 1. Loki配置
+
+```yaml
+# loki-config.yml
+server:
+  http_listen_port: 3100
+
+ingester:
+  lifecycler:
+    ring:
+      replication_factor: 1
+      kvstore:
+        store: inmemory
+  chunk_idle_period: 1h
+  chunk_retain_period: 1m
+  max_transfer_retries: 0
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: boltdb-shipper
+      object_store: filesystem
+      schema: v11
+      index:
+        prefix: index_
+        period: 24h
+
+storage_config:
+  boltdb_shipper:
+    active_index_directory: /loki/boltdb-shipper-active
+    cache_location: /loki/boltdb-shipper-cache
+    shared_store: filesystem
+  filesystem:
+    directory: /loki/chunks
+
+limits_config:
+  enforce_metric_name: false
+  reject_old_samples: true
+  reject_old_samples_max_age: 168h
+
+chunk_store_config:
+  max_look_back_period: 0s
+
+table_manager:
+  retention_deletes_enabled: true
+  retention_period: 30d
+```
+
+### 2. Promtail配置
+
+```yaml
+# promtail-config.yml
+server:
+  http_listen_port: 9080
+
+clients:
+  - url: http://loki:3100/loki/api/v1/push
+
+scrape_configs:
+  - job_name: mosquito
+    static_configs:
+      - targets:
+          - localhost
+        labels:
+          job: mosquito
+          app: mosquito-api
+          env: production
+    
+    pipeline_stages:
+      - json:
+          expressions:
+            level: level
+            message: message
+            exception: exception
+      
+      - labels:
+          level: level
+      
+      - regex:
+          expression: '(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}) (?P<level>\w+) .*? - (?P<message>.*)'
+      
+      - output:
+          source: message
+```
+
+---
+
+## 📊 六、监控指标总结
+
+### 核心监控指标
+
+| 类别 | 指标 | 告警阈值 |
+|------|------|----------|
+| **可用性** | 应用启动状态 | down > 1min |
+| **性能** | API响应时间(P95) | > 1.0s |
+| **性能** | API响应时间(P99) | > 2.0s |
+| **错误** | HTTP 5xx错误率 | > 5% |
+| **系统** | CPU使用率 | > 80% |
+| **系统** | 内存使用率 | > 90% |
+| **系统** | 磁盘剩余空间 | < 10% |
+| **数据库** | 连接池使用率 | > 80% |
+| **缓存** | Redis连接状态 | down > 1min |
+| **JVM** | GC停顿时间 | > 100ms |
+
+### 业务监控指标
+
+| 类别 | 指标 | 说明 |
+|------|------|------|
+| **用户行为** | 分享链接创建次数 | 总计和分活动 |
+| **用户行为** | 海报生成次数 | 按模板类型 |
+| **用户行为** | 排行榜访问次数 | 按活动ID |
+| **业务逻辑** | 活动创建失败率 | 失败/总数 |
+| **业务逻辑** | API密钥生成趋势 | 按时间段 |
+
+---
+
+## ✅ 监控检查清单
+
+### 监控系统检查
+
+- [x] Prometheus正常运行
+- [x] Alertmanager配置正确
+- [x] Grafana仪表板可用
+- [x] Loki日志聚合正常
+- [x] 告警通知渠道畅通
+
+### 监控指标检查
+
+- [x] 应用指标采集正常
+- [x] 系统指标采集正常
+- [x] 业务指标采集正常
+- [x] 告警规则生效
+- [x] 数据保留策略配置
+
+### 告警通知检查
+
+- [x] Slack通知正常
+- [x] 邮件通知正常
+- [x] PagerDuty集成正常
+- [x] 告警分级正确
+- [x] 告警抑制正常
+
+---
+
+*监控方案版本: v2.0.0*  
+*最后更新: 2026-01-22*  
+*维护团队: DevOps Team*