P3-C: 三服务可观测面统一 - metrics端点统一/健康检查别名/traceID透传

Gateway:
- remote_runtime.go: P3-C-08 从请求上下文透传 X-Request-Id 到 platform-token-runtime

Supply-api:
- 新建 internal/metrics/metrics.go: HTTP请求计数/latency/token发布/worker queue指标 (Prometheus-text)
- 新建 internal/metrics/metrics_test.go: 6个测试覆盖
- bootstrap.go: 注册 /metrics (P3-C-01/04)、/health、/healthz 别名 (P3-C-05)

Platform-token-runtime:
- bootstrap.go: 添加 /health 和 /livez 别名 (P3-C-05)

三服务 /metrics 统一为 text/plain; version=0.0.4
三服务 /health 端点统一别名
Gateway → platform-token-runtime 透传 trace ID
This commit is contained in:
Your Name
2026-04-21 18:40:43 +08:00
parent 472d9ad4c1
commit e249a9160b
5 changed files with 195 additions and 1 deletions

View File

@@ -239,7 +239,12 @@ func (r *RemoteTokenRuntime) Verify(ctx context.Context, rawToken string) (Verif
return VerifiedToken{}, err
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("X-Request-Id", fmt.Sprintf("gateway-introspect-%d", r.now().UnixNano()))
// P3-C-08: 从请求上下文透传 trace ID,避免生成新的 ID 截断链路
if reqID, ok := RequestIDFromContext(ctx); ok && reqID != "" {
req.Header.Set("X-Request-Id", reqID)
} else {
req.Header.Set("X-Request-Id", fmt.Sprintf("gateway-introspect-%d", r.now().UnixNano()))
}
start := time.Now()
resp, err := r.httpClient.Do(req)

View File

@@ -99,6 +99,17 @@ func BuildServer(cfg Config) (*http.Server, error) {
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte(`{"status":"UP"}`))
})
// P3-C-05: /health 和 /livez 别名(统一路径,对齐 gateway/supply-api)
mux.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte(`{"status":"UP"}`))
})
mux.HandleFunc("/livez", func(w http.ResponseWriter, _ *http.Request) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte(`{"status":"UP"}`))
})
// P3-B: /metrics 端点(Prometheus-text 格式)
mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain; version=0.0.4")

View File

@@ -9,6 +9,7 @@ import (
"lijiaoqiao/supply-api/internal/config"
"lijiaoqiao/supply-api/internal/httpapi"
"lijiaoqiao/supply-api/internal/metrics"
"lijiaoqiao/supply-api/internal/middleware"
"lijiaoqiao/supply-api/internal/pkg/logging"
)
@@ -156,6 +157,14 @@ func buildRouteMux(opts buildRouteMuxOptions) *http.ServeMux {
mux := http.NewServeMux()
healthHandler := httpapi.NewHealthHandlerWithDefaults(opts.DBHealthCheck, opts.RedisHealthCheck)
healthHandler.RegisterRoutes(mux)
// P3-C: /metrics 端点(Prometheus-text 格式)
mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain; version=0.0.4")
_, _ = w.Write([]byte(metrics.Export()))
})
// P3-C-05: /health 别名(统一路径,对齐 gateway/platform-token-runtime)
mux.HandleFunc("/health", healthHandler.ServeHealth)
mux.HandleFunc("/healthz", healthHandler.ServeHealth)
opts.SupplyAPI.Register(mux)
opts.AlertAPI.Register(mux)
if opts.IAMHandler != nil {

View File

@@ -0,0 +1,104 @@
package metrics
import (
"strconv"
"sync/atomic"
"time"
)
// SupplyAPIMetrics is the metrics collector for supply-api.
//
// P3-C: part of the unified observability surface; it mirrors the metrics
// style used by gateway and platform-token-runtime. All counters and gauges
// are typed atomics, so they are updated without an additional lock.
type SupplyAPIMetrics struct {
	// HTTP request counters: all requests, successful ones, failed ones.
	httpRequests, httpRequestsOK, httpRequestsError atomic.Int64

	// HTTP latency: running sum in nanoseconds and the number of samples.
	httpLatencySum, httpLatencyCount atomic.Int64

	// Token publish counters: publish operations and publish failures.
	tokenPublishes, tokenPublishFail atomic.Int64

	// Worker queue gauges: current queue size and number of busy workers.
	queueSize, workersBusy atomic.Int64

	// startAt is the instant uptime is measured from.
	startAt time.Time
}
// global is the process-wide collector that every package-level helper in
// this package reads and updates. It is built by a declaration initializer
// rather than an init function, so it is ready before any other package-level
// initializer that depends on it runs. startAt records when the package was
// initialized.
var global = &SupplyAPIMetrics{startAt: time.Now()}
// IncHTTPRequest records one HTTP request on the global collector.
func IncHTTPRequest() {
	global.httpRequests.Add(1)
}
// IncHTTPOK records one successful HTTP request on the global collector.
func IncHTTPOK() {
	global.httpRequestsOK.Add(1)
}
// IncHTTPError records one failed HTTP request on the global collector.
// It does not touch the total request counter.
func IncHTTPError() {
	global.httpRequestsError.Add(1)
}
// IncLatency records one latency sample, given in nanoseconds: ns is added
// to the running sum and the sample count grows by one.
func IncLatency(ns int64) {
	m := global
	m.httpLatencySum.Add(ns)
	m.httpLatencyCount.Add(1)
}
// IncTokenPublish records one token publish operation.
func IncTokenPublish() {
	global.tokenPublishes.Add(1)
}
// IncTokenPublishFail records one failed token publish. It increments both
// the total publish counter and the failure counter.
// NOTE(review): a caller that also calls IncTokenPublish for the same
// operation would count it twice in the total; confirm against call sites.
func IncTokenPublishFail() {
	m := global
	m.tokenPublishes.Add(1)
	m.tokenPublishFail.Add(1)
}
// SetQueueSize stores n as the current queue size, replacing the previous value.
func SetQueueSize(n int64) {
	global.queueSize.Store(n)
}
// SetWorkersBusy stores n as the number of busy workers, replacing the
// previous value.
func SetWorkersBusy(n int64) {
	global.workersBusy.Store(n)
}
// Export returns a snapshot of the global collector in Prometheus text
// format. Every metric is written as a HELP line, a TYPE line and one sample
// line; the output ends with a newline.
//
// The average latency is the latency sum divided by the sample count,
// converted from nanoseconds to milliseconds; it is 0 while no sample has
// been recorded. Each field is read with its own atomic load, so the snapshot
// is not a single consistent cut across all metrics.
func Export() string {
	snap := global

	uptimeSec := time.Since(snap.startAt).Seconds()

	// Guard the division: with no samples the average stays at zero.
	var meanNs float64
	if samples := snap.httpLatencyCount.Load(); samples > 0 {
		meanNs = float64(snap.httpLatencySum.Load()) / float64(samples)
	}
	meanMs := meanNs / 1e6

	// Render the values first, in the order they appear in the output.
	uptimeText := strconv.FormatFloat(uptimeSec, 'f', 3, 64)
	requestsText := strconv.FormatInt(snap.httpRequests.Load(), 10)
	okText := strconv.FormatInt(snap.httpRequestsOK.Load(), 10)
	errorText := strconv.FormatInt(snap.httpRequestsError.Load(), 10)
	latencyText := strconv.FormatFloat(meanMs, 'f', 3, 64)
	publishText := strconv.FormatInt(snap.tokenPublishes.Load(), 10)
	publishFailText := strconv.FormatInt(snap.tokenPublishFail.Load(), 10)
	queueText := strconv.FormatInt(snap.queueSize.Load(), 10)
	busyText := strconv.FormatInt(snap.workersBusy.Load(), 10)

	return "# HELP supply_api_uptime_seconds Time since service start\n" +
		"# TYPE supply_api_uptime_seconds gauge\n" +
		"supply_api_uptime_seconds " + uptimeText + "\n" +
		"# HELP supply_api_http_requests_total Total HTTP requests received\n" +
		"# TYPE supply_api_http_requests_total counter\n" +
		"supply_api_http_requests_total " + requestsText + "\n" +
		"# HELP supply_api_http_requests_ok_total Successful HTTP requests (2xx/3xx)\n" +
		"# TYPE supply_api_http_requests_ok_total counter\n" +
		"supply_api_http_requests_ok_total " + okText + "\n" +
		"# HELP supply_api_http_requests_error_total Failed HTTP requests (4xx/5xx)\n" +
		"# TYPE supply_api_http_requests_error_total counter\n" +
		"supply_api_http_requests_error_total " + errorText + "\n" +
		"# HELP supply_api_http_latency_ms_avg Average HTTP request latency in milliseconds\n" +
		"# TYPE supply_api_http_latency_ms_avg gauge\n" +
		"supply_api_http_latency_ms_avg " + latencyText + "\n" +
		"# HELP supply_api_token_publishes_total Total token publish operations\n" +
		"# TYPE supply_api_token_publishes_total counter\n" +
		"supply_api_token_publishes_total " + publishText + "\n" +
		"# HELP supply_api_token_publish_fail_total Token publish failures\n" +
		"# TYPE supply_api_token_publish_fail_total counter\n" +
		"supply_api_token_publish_fail_total " + publishFailText + "\n" +
		"# HELP supply_api_queue_size Current worker queue size\n" +
		"# TYPE supply_api_queue_size gauge\n" +
		"supply_api_queue_size " + queueText + "\n" +
		"# HELP supply_api_workers_busy Number of busy workers\n" +
		"# TYPE supply_api_workers_busy gauge\n" +
		"supply_api_workers_busy " + busyText + "\n"
}

View File

@@ -0,0 +1,65 @@
package metrics
import (
"strings"
"testing"
)
// TestExport_ContainsUptime checks that the exported text mentions the
// uptime metric.
func TestExport_ContainsUptime(t *testing.T) {
	if got := Export(); !strings.Contains(got, "supply_api_uptime_seconds") {
		t.Fatal("missing uptime metric")
	}
}
// TestExport_ContainsHTTPMetrics checks that every HTTP metric name appears
// in the exported text, reporting each missing name separately.
func TestExport_ContainsHTTPMetrics(t *testing.T) {
	wanted := []string{
		"supply_api_http_requests_total",
		"supply_api_http_requests_ok_total",
		"supply_api_http_requests_error_total",
		"supply_api_http_latency_ms_avg",
	}
	got := Export()
	for _, name := range wanted {
		if strings.Contains(got, name) {
			continue
		}
		t.Errorf("missing metric: %s", name)
	}
}
// TestExport_ContainsTokenPublishMetrics checks that both token publish
// metric names appear in the exported text.
func TestExport_ContainsTokenPublishMetrics(t *testing.T) {
	wanted := []string{
		"supply_api_token_publishes_total",
		"supply_api_token_publish_fail_total",
	}
	got := Export()
	for _, name := range wanted {
		if strings.Contains(got, name) {
			continue
		}
		t.Errorf("missing metric: %s", name)
	}
}
// TestExport_PrometheusFormat checks that the uptime metric is preceded by
// its HELP and TYPE annotation lines.
func TestExport_PrometheusFormat(t *testing.T) {
	got := Export()
	checks := []struct {
		needle  string
		failure string
	}{
		{"# HELP supply_api_uptime_seconds", "missing HELP line"},
		{"# TYPE supply_api_uptime_seconds gauge", "missing TYPE line"},
	}
	for _, c := range checks {
		if !strings.Contains(got, c.needle) {
			t.Error(c.failure)
		}
	}
}
// TestIncTokenPublish_IncrementsCounter checks that IncTokenPublish raises
// the publish counter by exactly one. It compares against the value read
// before the call, because the collector is shared by all tests.
func TestIncTokenPublish_IncrementsCounter(t *testing.T) {
	want := global.tokenPublishes.Load() + 1
	IncTokenPublish()
	if got := global.tokenPublishes.Load(); got != want {
		t.Errorf("expected %d, got %d", want, got)
	}
}
// TestSetQueueSize_SetsValue checks that SetQueueSize stores the given value
// in the queue size gauge.
func TestSetQueueSize_SetsValue(t *testing.T) {
	const want int64 = 42
	SetQueueSize(want)
	got := global.queueSize.Load()
	if got != want {
		t.Errorf("expected 42, got %d", got)
	}
}