package monitoring

import (
	"errors"
	"sync"

	"github.com/prometheus/client_golang/prometheus"
)

// SLOMetrics groups the Prometheus collectors related to service level
// objectives (SLOs). They are the raw inputs for SLO measurement and for
// computing error-budget burn rates.
type SLOMetrics struct {
	// Cache hit statistics (referenced by alerts.yml but previously undefined).
	CacheHitsTotal       *prometheus.CounterVec
	CacheOperationsTotal *prometheus.CounterVec

	// Database connection pool state (referenced by alerts.yml but previously undefined).
	DBConnectionsActive prometheus.Gauge
	DBConnectionsMax    prometheus.Gauge

	// Token operations.
	TokenRefreshTotal *prometheus.CounterVec

	// Account security events.
	AccountLockTotal     prometheus.Counter
	AnomalyDetectedTotal *prometheus.CounterVec

	// Error budget burn rate (optional, intended for custom dashboards).
	ErrorBudgetBurnRate *prometheus.GaugeVec

	// registry is the private registry created by NewSLOMetrics.
	registry *prometheus.Registry
	// once is not referenced anywhere in this file; it is kept in case other
	// files of the package rely on it.
	once sync.Once
}

var (
	globalSLOMetrics     *SLOMetrics
	globalSLOMetricsOnce sync.Once

	// globalSLOMetricsRegisterErrs holds the errors hit while registering the
	// global collectors with prometheus.DefaultRegisterer. It is written only
	// inside globalSLOMetricsOnce.
	globalSLOMetricsRegisterErrs []error
)

// NewSLOMetrics creates an SLOMetrics instance whose collectors are registered
// with a fresh private registry, so that independent instances (for example in
// tests) do not conflict with each other.
func NewSLOMetrics() *SLOMetrics {
	reg := prometheus.NewRegistry()
	m := &SLOMetrics{registry: reg}

	m.CacheHitsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "cache_hits_total",
			Help: "Total number of cache hits",
		},
		[]string{"level", "operation"}, // level: l1/l2, operation: get/set
	)

	m.CacheOperationsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "cache_operations_total",
			Help: "Total number of cache operations",
		},
		[]string{"level", "operation"},
	)

	m.DBConnectionsActive = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: "db_connections_active",
			Help: "Number of active database connections",
		},
	)

	m.DBConnectionsMax = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: "db_connections_max",
			Help: "Maximum number of database connections configured",
		},
	)

	m.TokenRefreshTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "token_refresh_total",
			Help: "Total number of token refresh attempts",
		},
		[]string{"status"}, // success/failure/rate_limited
	)

	m.AccountLockTotal = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: "account_lock_total",
			Help: "Total number of account lockout events due to failed login attempts",
		},
	)

	m.AnomalyDetectedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "anomaly_detected_total",
			Help: "Total number of anomaly login detections",
		},
		[]string{"type"}, // geo_anomaly/device_anomaly/brute_force/suspicious_ip
	)

	m.ErrorBudgetBurnRate = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "error_budget_burn_rate",
			Help: "Current error budget burn rate multiplier (1.0 = nominal consumption)",
		},
		[]string{"slo"}, // api-availability/api-latency/login-success-rate
	)

	reg.MustRegister(
		m.CacheHitsTotal,
		m.CacheOperationsTotal,
		m.DBConnectionsActive,
		m.DBConnectionsMax,
		m.TokenRefreshTotal,
		m.AccountLockTotal,
		m.AnomalyDetectedTotal,
		m.ErrorBudgetBurnRate,
	)

	return m
}

// sharedRegistrar registers collectors with a shared Registerer on a
// best-effort basis and keeps track of the failures.
type sharedRegistrar struct {
	shared  prometheus.Registerer
	private *prometheus.Registry
	errs    []error
}

// register registers c with the shared Registerer.
//
// If an equivalent collector is already registered there, adopt is called with
// that existing collector; when adopt accepts it, the private registry is
// re-pointed at the adopted collector so both registries expose the collector
// that is actually being updated. Every other failure is appended to s.errs.
func (s *sharedRegistrar) register(c prometheus.Collector, adopt func(prometheus.Collector) bool) {
	err := s.shared.Register(c)
	if err == nil {
		return
	}

	var dup prometheus.AlreadyRegisteredError
	if errors.As(err, &dup) && adopt(dup.ExistingCollector) {
		s.private.Unregister(c)
		if regErr := s.private.Register(dup.ExistingCollector); regErr != nil {
			s.errs = append(s.errs, regErr)
		}
		return
	}

	s.errs = append(s.errs, err)
}

// adoptCounterVec returns an adopt callback that stores an existing
// *prometheus.CounterVec into *dst.
func adoptCounterVec(dst **prometheus.CounterVec) func(prometheus.Collector) bool {
	return func(c prometheus.Collector) bool {
		v, ok := c.(*prometheus.CounterVec)
		if ok {
			*dst = v
		}
		return ok
	}
}

// adoptGaugeVec returns an adopt callback that stores an existing
// *prometheus.GaugeVec into *dst.
func adoptGaugeVec(dst **prometheus.GaugeVec) func(prometheus.Collector) bool {
	return func(c prometheus.Collector) bool {
		v, ok := c.(*prometheus.GaugeVec)
		if ok {
			*dst = v
		}
		return ok
	}
}

// adoptCounter returns an adopt callback that stores an existing
// prometheus.Counter into *dst.
func adoptCounter(dst *prometheus.Counter) func(prometheus.Collector) bool {
	return func(c prometheus.Collector) bool {
		v, ok := c.(prometheus.Counter)
		if ok {
			*dst = v
		}
		return ok
	}
}

// adoptGauge returns an adopt callback that stores an existing
// prometheus.Gauge into *dst.
func adoptGauge(dst *prometheus.Gauge) func(prometheus.Collector) bool {
	return func(c prometheus.Collector) bool {
		v, ok := c.(prometheus.Gauge)
		if ok {
			*dst = v
		}
		return ok
	}
}

// GetGlobalSLOMetrics returns the process-wide SLOMetrics singleton (intended
// for production use).
//
// On first call the collectors are additionally registered with
// prometheus.DefaultRegisterer so that they are exposed through the default
// gatherer. Registration is best-effort and never panics:
//   - if an equivalent collector is already registered, the singleton adopts
//     that collector so recorded values reach the default registry;
//   - any other failure is retained and can be inspected through
//     GlobalSLOMetricsRegisterErrors.
func GetGlobalSLOMetrics() *SLOMetrics {
	globalSLOMetricsOnce.Do(func() {
		m := NewSLOMetrics()
		r := &sharedRegistrar{shared: prometheus.DefaultRegisterer, private: m.registry}

		r.register(m.CacheHitsTotal, adoptCounterVec(&m.CacheHitsTotal))
		r.register(m.CacheOperationsTotal, adoptCounterVec(&m.CacheOperationsTotal))
		r.register(m.DBConnectionsActive, adoptGauge(&m.DBConnectionsActive))
		r.register(m.DBConnectionsMax, adoptGauge(&m.DBConnectionsMax))
		r.register(m.TokenRefreshTotal, adoptCounterVec(&m.TokenRefreshTotal))
		r.register(m.AccountLockTotal, adoptCounter(&m.AccountLockTotal))
		r.register(m.AnomalyDetectedTotal, adoptCounterVec(&m.AnomalyDetectedTotal))
		r.register(m.ErrorBudgetBurnRate, adoptGaugeVec(&m.ErrorBudgetBurnRate))

		globalSLOMetricsRegisterErrs = r.errs
		globalSLOMetrics = m
	})
	return globalSLOMetrics
}

// GlobalSLOMetricsRegisterErrors returns a copy of the errors encountered while
// registering the global collectors with prometheus.DefaultRegisterer. The
// result is empty when every collector was registered or adopted successfully.
// It initializes the global singleton if that has not happened yet.
func GlobalSLOMetricsRegisterErrors() []error {
	GetGlobalSLOMetrics() // guarantees the once-guarded initialization has completed
	return append([]error(nil), globalSLOMetricsRegisterErrs...)
}

// GetRegistry returns the private registry created for this instance (intended
// for tests).
func (m *SLOMetrics) GetRegistry() *prometheus.Registry {
	return m.registry
}

// RecordCacheHit records a cache hit. A hit also counts as a cache operation,
// so both counters are incremented for the given level and operation.
func (m *SLOMetrics) RecordCacheHit(level, operation string) {
	m.CacheHitsTotal.WithLabelValues(level, operation).Inc()
	m.CacheOperationsTotal.WithLabelValues(level, operation).Inc()
}

// RecordCacheMiss records a cache miss: only the operations counter is
// incremented for the given level and operation.
func (m *SLOMetrics) RecordCacheMiss(level, operation string) {
	m.CacheOperationsTotal.WithLabelValues(level, operation).Inc()
}

// RecordTokenRefresh records one token refresh attempt with the given status.
func (m *SLOMetrics) RecordTokenRefresh(status string) {
	m.TokenRefreshTotal.WithLabelValues(status).Inc()
}

// RecordAccountLock records one account lockout event.
func (m *SLOMetrics) RecordAccountLock() {
	m.AccountLockTotal.Inc()
}

// RecordAnomaly records one anomaly detection event of the given type.
func (m *SLOMetrics) RecordAnomaly(anomalyType string) {
	m.AnomalyDetectedTotal.WithLabelValues(anomalyType).Inc()
}

// SetDBConnections updates the database connection pool gauges with the
// current active count and the configured maximum.
func (m *SLOMetrics) SetDBConnections(active, max float64) {
	m.DBConnectionsActive.Set(active)
	m.DBConnectionsMax.Set(max)
}

// SetErrorBudgetBurnRate sets the error budget burn rate for the given SLO.
func (m *SLOMetrics) SetErrorBudgetBurnRate(slo string, burnRate float64) {
	m.ErrorBudgetBurnRate.WithLabelValues(slo).Set(burnRate)
}