From dd6f332b531865579b8f239e1f051dc9b07e37db Mon Sep 17 00:00:00 2001 From: phamnazage-jpg Date: Mon, 8 Jun 2026 14:49:06 +0800 Subject: [PATCH] feat: close v3 slo gates and lifecycle metrics --- deploy/monitoring/prometheus-rules.yml | 193 ++++++------ docs/2026-06-04-SLO_AND_OBSERVABILITY.md | 49 +++- docs/2026-06-05-VNEXT_COMPLETION_CHECKLIST.md | 33 ++- docs/EXECUTION_BOARD.md | 12 +- .../app/key_self_service_handlers_test.go | 275 ++++++++++++++++++ internal/app/key_self_service_svc.go | 70 +++-- internal/app/key_self_service_test.go | 148 ++++++++++ internal/app/route_resolve_api.go | 4 +- internal/app/route_resolve_api_test.go | 3 + .../app/user_key_operation_metrics_test.go | 55 ++++ internal/metrics/metrics.go | 4 +- internal/metrics/metrics_test.go | 3 + scripts/test/verify_quality_gates.sh | 3 + scripts/test/verify_vnext_slo_release_gate.sh | 79 +++++ 14 files changed, 775 insertions(+), 156 deletions(-) create mode 100644 internal/app/key_self_service_handlers_test.go create mode 100644 internal/app/user_key_operation_metrics_test.go create mode 100755 scripts/test/verify_vnext_slo_release_gate.sh diff --git a/deploy/monitoring/prometheus-rules.yml b/deploy/monitoring/prometheus-rules.yml index 0421aa19..e0ade408 100644 --- a/deploy/monitoring/prometheus-rules.yml +++ b/deploy/monitoring/prometheus-rules.yml @@ -1,11 +1,10 @@ # Prometheus Alerting Rules for sub2api-cn-relay-manager -# Place this file in your Prometheus rules directory +# Aligned with current vNext.3 metrics semantics (2026-06-08) groups: - name: sub2api-relay-manager-alerts interval: 30s rules: - # 服务可用性告警 - alert: ServiceDown expr: up{job="sub2api-relay-manager"} == 0 for: 1m @@ -16,53 +15,110 @@ groups: summary: "sub2api-relay-manager service is down" description: "The sub2api-relay-manager service has been down for more than 1 minute." 
- # HTTP错误率告警 - alert: HighErrorRate expr: | ( - sum(rate(http_requests_total{status=~"5..|4.."}[5m])) - / - sum(rate(http_requests_total[5m])) + sum(rate(http_requests_total{status=~"4..|5.."}[5m])) + / + clamp_min(sum(rate(http_requests_total[5m])), 0.001) ) > 0.05 for: 2m labels: severity: warning team: ops annotations: - summary: "High error rate detected" - description: "Error rate is above 5% for more than 2 minutes. Current value: {{ $value | humanizePercentage }}" + summary: "High HTTP error rate detected" + description: "HTTP 4xx/5xx error rate is above 5% for more than 2 minutes." - # 请求延迟告警 - - alert: HighLatency - expr: | - histogram_quantile(0.95, - sum(rate(http_request_duration_seconds_bucket[5m])) by (le) - ) > 1.0 - for: 3m - labels: - severity: warning - team: ops - annotations: - summary: "High request latency" - description: "95th percentile latency is above 1 second for more than 3 minutes." - - # 路由故障转移告警 - - alert: RouteFailoverSpike + - alert: UserKeyChatSuccessRateLow expr: | ( - rate(route_failovers_total[5m]) - > - 2 * avg_over_time(rate(route_failovers_total[1h])[1h:5m]) - ) - for: 1m + sum(rate(user_key_chat_requests_total{result="ok"}[10m])) + / + clamp_min(sum(rate(user_key_chat_requests_total[10m])), 0.001) + ) < 0.95 + and sum(rate(user_key_chat_requests_total[10m])) > 0 + for: 10m + labels: + severity: critical + team: ops + annotations: + summary: "User-key chat success rate below SLO" + description: "Recent user-key chat success rate is below 95% for 10 minutes." + + - alert: UserKeyChatP95LatencyHigh + expr: | + histogram_quantile(0.95, + sum(rate(http_request_duration_seconds_bucket{path="/v1/chat/completions"}[10m])) by (le) + ) > 5 + for: 10m labels: severity: warning team: ops annotations: - summary: "Route failover spike detected" - description: "Route failovers have spiked above normal levels. 
Current rate: {{ $value }}" + summary: "User-key chat P95 latency is high" + description: "P95 latency for /v1/chat/completions exceeds 5 seconds for 10 minutes." + + - alert: UserKeyCreateFailures + expr: | + sum(rate(user_key_operations_total{operation="create",result!~"success|rate_limited"}[10m])) > 0.02 + for: 10m + labels: + severity: critical + team: ops + annotations: + summary: "User-key create failures detected" + description: "Non-rate-limit create failures are occurring on the self-service path." + + - alert: UserKeyResetFailures + expr: | + sum(rate(user_key_operations_total{operation="reset",result!~"success|rate_limited"}[10m])) > 0.02 + for: 10m + labels: + severity: warning + team: ops + annotations: + summary: "User-key reset failures detected" + description: "Non-rate-limit reset failures are occurring on the self-service path." + + - alert: UserKeyQuotaExhaustedSpike + expr: | + sum(rate(user_key_chat_requests_total{result="quota_exhausted"}[10m])) > 0.05 + for: 10m + labels: + severity: warning + team: ops + annotations: + summary: "Quota exhausted events are rising" + description: "quota_exhausted responses are rising on the public user-key gateway path." + + - alert: UserKeyAuthFailuresSpike + expr: | + sum(rate(user_key_chat_requests_total{result=~"unauthorized|invalid_api_key"}[10m])) > 0.05 + for: 10m + labels: + severity: warning + team: security + annotations: + summary: "User-key auth failures are rising" + description: "unauthorized/invalid_api_key outcomes are rising on the public gateway path." + + - alert: RouteFailoverShareHigh + expr: | + ( + sum(rate(route_decisions_total{status="failover"}[10m])) + / + clamp_min(sum(rate(route_decisions_total[10m])), 0.001) + ) > 0.20 + and sum(rate(route_decisions_total[10m])) > 0 + for: 10m + labels: + severity: warning + team: ops + annotations: + summary: "Route failover share is high" + description: "More than 20% of recent route decisions are failovers." 
- # 活跃Provider数量告警 - alert: NoActiveProviders expr: active_providers == 0 for: 1m @@ -73,17 +129,6 @@ groups: summary: "No active providers" description: "There are no active providers configured. The system cannot route requests." - - alert: LowActiveProviders - expr: active_providers < 2 - for: 5m - labels: - severity: warning - team: ops - annotations: - summary: "Low number of active providers" - description: "Only {{ $value }} active provider(s) detected. Consider adding more for redundancy." - - # 活跃Host告警 - alert: NoActiveHosts expr: active_hosts == 0 for: 1m @@ -94,31 +139,6 @@ groups: summary: "No active hosts" description: "There are no active hosts. The system cannot import providers." - # 数据库连接告警 - - alert: HighDBConnections - expr: db_connections_active > 50 - for: 5m - labels: - severity: warning - team: ops - annotations: - summary: "High database connection count" - description: "Active DB connections: {{ $value }}. Consider connection pool tuning." - - # 数据库操作错误告警 - - alert: DBOperationErrors - expr: | - rate(db_operations_total{operation=~"INSERT|UPDATE|DELETE"}[5m]) - > 100 - for: 2m - labels: - severity: warning - team: ops - annotations: - summary: "High database write rate" - description: "DB write operations are above threshold: {{ $value }} ops/sec" - - # 日志系统告警 - alert: LogFlushErrors expr: rate(log_flush_errors_total[5m]) > 0 for: 1m @@ -130,48 +150,17 @@ groups: description: "Log flush errors have been detected. Check log storage/backend." - alert: LogDroppedEvents - expr: | - rate(log_dropped_events_total[5m]) > 10 + expr: rate(log_dropped_events_total[5m]) > 10 for: 1m labels: severity: warning team: ops annotations: summary: "Log events being dropped" - description: "Log events are being dropped at {{ $value }} events/sec. Check log buffer capacity." + description: "Log events are being dropped. Check log buffer capacity." 
- # 批处理导入告警 - - alert: BatchImportFailures - expr: | - ( - rate(route_decisions_total{status="failed"}[5m]) - / - rate(route_decisions_total[5m]) - ) > 0.1 - for: 5m - labels: - severity: warning - team: ops - annotations: - summary: "High batch import failure rate" - description: "Batch import failure rate is above 10%. Check provider configurations." - - # API认证失败告警 - - alert: AuthFailures - expr: | - rate(http_requests_total{status="401"}[5m]) > 10 - for: 2m - labels: - severity: warning - team: security - annotations: - summary: "High authentication failure rate" - description: "Auth failures detected. Possible credential issues or attacks." - - # 健康检查告警 - alert: HealthCheckFailing - expr: | - http_requests_total{path="/healthz",status!="200"} > 0 + expr: http_requests_total{path="/healthz",status!="200"} > 0 for: 30s labels: severity: critical diff --git a/docs/2026-06-04-SLO_AND_OBSERVABILITY.md b/docs/2026-06-04-SLO_AND_OBSERVABILITY.md index cb5c12af..ac2768a4 100644 --- a/docs/2026-06-04-SLO_AND_OBSERVABILITY.md +++ b/docs/2026-06-04-SLO_AND_OBSERVABILITY.md @@ -1,7 +1,7 @@ # SLO and Observability 日期:2026-06-04 -状态:待审核 +状态:已落地(V3-2) 适用版本:vNext.3 ## 目的 @@ -107,3 +107,50 @@ vNext.3 引入正式发布门禁前,至少要求: 本文件属于 vNext.3 设计文档。 当前 vNext.1 不进入实现,但必须在规划阶段明确其后续必备性,避免将来“功能可用但不可运营”。 + +## 2026-06-08 当前落地口径 + +### 已落地指标 + +- `http_requests_total{method,path,status}` + - `status` 使用数值字符串(如 `200/403/500`) + - `path` 优先使用 `r.Pattern`,避免动态 path 高基数 +- `http_request_duration_seconds{method,path}` +- `route_decisions_total{logical_group,status}` + - `status in (sticky_hit, bind, fallback, failover)` +- `route_failovers_total` +- `user_key_operations_total{operation,result}` + - `operation in (create, reset, pause, resume, delete)` + - `result` 已覆盖 `success / unauthorized / bad_request / rate_limited / open_store_error / get_key_error / not_found / rate_limit_store_error / resolve_host_error / resolve_shadow_group_error / ensure_access_error / pause_access_error / 
resume_access_error / db_tx_error` +- `user_key_chat_requests_total{result}` + - `result in (ok, unauthorized, invalid_api_key, key_paused, key_retired, quota_exhausted, bad_request, db_error, proxy_error)` + +### 已落地告警规则 + +文件:`deploy/monitoring/prometheus-rules.yml` + +- `HighErrorRate` +- `UserKeyChatSuccessRateLow` +- `UserKeyChatP95LatencyHigh` +- `UserKeyCreateFailures` +- `UserKeyResetFailures` +- `UserKeyQuotaExhaustedSpike` +- `UserKeyAuthFailuresSpike` +- `RouteFailoverShareHigh` +- `NoActiveProviders` +- `NoActiveHosts` +- `LogFlushErrors` +- `LogDroppedEvents` +- `HealthCheckFailing` + +### 已落地发布门禁 + +脚本:`scripts/test/verify_vnext_slo_release_gate.sh` + +门禁要求: + +1. 核心 source-of-truth 文档存在 +2. metrics 接线与代码口径一致 +3. Prometheus 规则引用真实存在的指标与标签 +4. live governance artifact 满足 create `201` → chat `200` → pause `200` → chat-paused `403 key_paused` → resume `200` → chat `200` → delete `200` +5. `docs/EXECUTION_BOARD.md` 已同步 V3-2 完成态 diff --git a/docs/2026-06-05-VNEXT_COMPLETION_CHECKLIST.md b/docs/2026-06-05-VNEXT_COMPLETION_CHECKLIST.md index 31725af8..a7d24f7c 100644 --- a/docs/2026-06-05-VNEXT_COMPLETION_CHECKLIST.md +++ b/docs/2026-06-05-VNEXT_COMPLETION_CHECKLIST.md @@ -10,7 +10,7 @@ ## 一、先说结论 -当前状态:条件完成(全量 vNext) +当前状态:完成(全量 vNext) 说明: @@ -18,7 +18,7 @@ - vNext.2 已完成 V2-4 + V2-5:key self-service API、portal key 管理 UI、用户 portal reset 后首次调用 200 真实线上闭环。 - vNext.3 已完成 V3-1:key/account governance 的公网 create→chat→pause→chat-paused→resume→chat-resumed 真验闭环。 - vNext.3 / V3-2 已启动首批 SLO/观测最小闭环:HTTP metrics route pattern 归一化、route resolve/failover 指标接线、user-key lifecycle/chat outcome 指标接线与回归测试已完成。 -- 仍未完成的是更宽泛的后续治理/SLO 扩展范围;因此按“当前 CRM 网关路线”口径已完成,按“全量 vNext 后续扩展全部做完”口径仍是条件完成。 +- V3-2 的失败路径指标细化、Prometheus 告警规则、V3-2 release gate 已全部补齐;当前 CRM 网关路线与全量 vNext 后续扩展口径现已统一为完成。 ## 二、5 个核心问题 Checklist(全量 vNext 目标) @@ -30,7 +30,7 @@ | 2. 同模型多供应商池化 | 模型池抽象 + 映射 + 真实池化验收 | vNext.1 已闭环 | `model_pool.go`、pool 测试、真实验收脚本已存在 | | 3. 
插件前端承接用户弱能力 | Portal 能承接用户信息、模型、示例、key 信息 | V2-5 已完成 | `PORTAL_KEY_EXPERIENCE.md`、`deploy/tksea-portal/index.html`、`artifacts/portal-ui-v25/20260606_1009/99-summary.json` | | 4. 插件生成/申请 key 并交付 base URL/model/curl 示例 | key self-service API + 首次调用 200 闭环 | V2-4/V2-5 已完成 | `KEY_SELF_SERVICE_API.md`、`verify_user_key_self_service.sh`、`artifacts/user-key-self-service/20260605_195408/99-summary.json`、`artifacts/portal-ui-v25/20260606_1009/99-summary.json` | -| 5. key / 账号暂停、恢复、限额治理 | 三态模型 + 管理页动作 + 真实治理验收 | V3-1 已闭环 / V3-2 进行中 | `KEY_ACCOUNT_GOVERNANCE.md`、`artifacts/v3-governance-live/20260608_102323/99-summary.json`、`internal/metrics/metrics.go`、`internal/app/public_chat_metrics_test.go` | +| 5. key / 账号暂停、恢复、限额治理 | 三态模型 + 管理页动作 + 真实治理验收 | V3-1/V3-2 已闭环 | `KEY_ACCOUNT_GOVERNANCE.md`、`artifacts/v3-governance-live/20260608_102323/99-summary.json`、`internal/metrics/metrics.go`、`internal/app/public_chat_metrics_test.go`、`scripts/test/verify_vnext_slo_release_gate.sh` | ## 三、vNext.1 发布范围 Checklist @@ -96,19 +96,22 @@ - Task 4.1 状态模型与治理语义:已实现并接线到 CRM 网关 `POST /v1/chat/completions` - Task 4.2 管理页治理动作:已实现(pause / resume 同步宿主 managed user `allowed_groups`) - Task 4.3 真实治理验收:已完成,见 `artifacts/v3-governance-live/20260608_102323/99-summary.json` -- Task 4.4 SLO / 观测最小闭环(第一批):已完成首批接线 - - `internal/metrics/metrics.go` 新增 `user_key_operations_total`、`user_key_chat_requests_total` +- Task 4.4 SLO / 观测闭环:已完成 + - `internal/metrics/metrics.go` 新增 `user_key_operations_total`、`user_key_chat_requests_total`,并把 HTTP status label 收口为数值字符串 - `http_requests_total` 优先使用 `r.Pattern`,避免动态 path 高基数 - route resolve / failover、user-key self-service、public chat outcome 已接指标并补回归测试 + - user-key lifecycle 失败路径指标已补齐到 create/reset/pause/resume/delete + - `deploy/monitoring/prometheus-rules.yml` 已按当前指标口径重写 + - `scripts/test/verify_vnext_slo_release_gate.sh` 已落地并纳入总质量门 -状态:V3-1 已闭环;V3-2 首批 SLO/观测接线已完成,剩余治理/SLO 扩展项继续推进 +状态:V3-1 / V3-2 已闭环 ### Phase 5 - Task 5.1 默认链路准入规则:vNext.1 已闭环 - Task 5.2 多层验证:vNext.1 + 
V2-4 当前均已有真实 artifact -状态:部分完成(整体 vNext 仍未完成) +状态:完成(全量 vNext 已完成) ## 五、当前缺失文件 / 脚本 / 测试(按真实存在性校对) @@ -120,6 +123,9 @@ - `scripts/acceptance/verify_host_protocol_matrix.sh` — 已存在 - `scripts/acceptance/verify_user_key_self_service.sh` — 已从 skeleton 升级为真实验收脚本 - `internal/app/key_self_service_test.go` — 已存在 +- `internal/app/user_key_operation_metrics_test.go` — 已存在 +- `scripts/test/verify_vnext_slo_release_gate.sh` — 已存在 +- `deploy/monitoring/prometheus-rules.yml` — 已按 V3-2 指标口径重写 ### vNext.2 尚缺 @@ -145,16 +151,15 @@ ### 立即执行:收尾与同步 -1. 已完成 V3-1 公网真验闭环:create 201 → chat 200 → pause 200 → chat-paused 403 → resume 200 → chat 200 -2. 已确认 2026-06-06 的“pause 后仍 200”并非宿主 cache TTL,而是公网 `/v1/chat/completions` 当时仍走宿主、且 CRM `hosts.auth_token` 已过期 -3. 已补 remote43 nginx 精确路由与 host bearer 刷新;仓库同步更新部署脚本/示例 nginx -4. 下一步仅剩文档、commit、push 与后续 SLO 范围推进 +1. 运行完整质量门并确认 worktree clean +2. commit + push 本轮 V3-2 收尾变更 +3. 更新任务真相源为 completed ## 八、当前判定(唯一有效口径) - 按 vNext.1 发布范围:**完成** - 按 vNext.2 当前执行项:**完成**(V2-4 + V2-5 已真实闭环) -- 按全量 vNext 规划:**条件完成**(V3-1 核心代码+测试+线上真验已闭环;剩余仅是后续治理/SLO 扩展项,不再阻塞当前 CRM 网关路线) +- 按全量 vNext 规划:**完成**(V2-4 / V2-5 / V3-1 / V3-2 均已完成代码、门禁、文档与真实证据闭环) - 当前结论: - - V2-4 / V2-5 / V3-1 已真实闭环,可提交/推送 - - 若要宣告“全量 vNext 所有后续扩展都完成”,还需单独定义并交付 V3-2/SLO 范围 + - 全量 vNext 后续扩展已完成 + - 当前仅剩本轮变更的 commit / push 收尾动作 diff --git a/docs/EXECUTION_BOARD.md b/docs/EXECUTION_BOARD.md index 49f5b3cc..02e55c5b 100644 --- a/docs/EXECUTION_BOARD.md +++ b/docs/EXECUTION_BOARD.md @@ -129,18 +129,20 @@ - 目标:先把现有 CRM 网关与 user-key 自助链路接成可观测真相源,而不是停留在“有 /metrics 端点但关键路径不产生日志/指标”。 - 本轮代码接线: - - `internal/metrics/metrics.go`:新增 `user_key_operations_total`、`user_key_chat_requests_total`;HTTP metrics 优先使用 `r.Pattern`,避免动态 path 高基数 - - `internal/app/route_resolve_api.go`:resolve / failover 接入 route metrics - - `internal/app/key_self_service_svc.go`:create/reset/pause/resume/delete success metrics 接线 + - `internal/metrics/metrics.go`:`user_key_operations_total`、`user_key_chat_requests_total` 已接线;HTTP status 
label 改为数值字符串,HTTP path 优先使用 `r.Pattern`,避免动态 path 高基数 + - `internal/app/route_resolve_api.go`:route decision 语义收口为 `sticky_hit / bind / fallback / failover`,failover 不再和 fallback 混成单一状态 + - `internal/app/key_self_service_svc.go`:create/reset/pause/resume/delete 不只记录 success,还补齐 `open_store_error / get_key_error / not_found / rate_limit_store_error / resolve_host_error / resolve_shadow_group_error / ensure_access_error / pause_access_error / resume_access_error / db_tx_error` 等失败路径指标 - `internal/app/http_api.go`:`/v1/chat/completions` 接入 `unauthorized / invalid_api_key / key_paused / key_retired / quota_exhausted / bad_request / db_error / proxy_error / ok` outcome metrics - - `internal/app/public_chat_metrics_test.go`:新增 quota_exhausted 与 route pattern 回归测试 + - `deploy/monitoring/prometheus-rules.yml`:已按当前真实指标口径重写为 `UserKeyChatSuccessRateLow / UserKeyChatP95LatencyHigh / UserKeyCreateFailures / UserKeyResetFailures / UserKeyQuotaExhaustedSpike / UserKeyAuthFailuresSpike / RouteFailoverShareHigh` 等告警规则 + - `scripts/test/verify_vnext_slo_release_gate.sh`:新增 V3-2 发布门禁脚本,并已接入 `scripts/test/verify_quality_gates.sh` - 本轮门禁: - `go test ./internal/app ./internal/metrics -count=1` → PASS - `go test ./tests/integration/... 
-count=1` → PASS - `go vet ./...` → PASS - `go test -cover ./internal/...` → PASS(核心包 `access/provision/pack` 均 ≥ 70%) + - `bash ./scripts/test/verify_vnext_slo_release_gate.sh` → PASS(校验 metrics 接线 / 告警规则 / live governance artifact / 文档口径) - 当前结论: - - `部分闭环` —— 首批 SLO/观测接线已完成并过门禁;更宽泛的治理/SLO 扩展(失败路径细化、告警/发布门禁)继续推进 + - `已闭环` —— V3-2 的失败路径细化、告警规则、发布门禁均已落地;全量 vNext 后续扩展已收口到可验证完成态 - portal key 管理 UI 已完成实现、部署和真实公网验收: - 关键代码: diff --git a/internal/app/key_self_service_handlers_test.go b/internal/app/key_self_service_handlers_test.go new file mode 100644 index 00000000..7d33e1f2 --- /dev/null +++ b/internal/app/key_self_service_handlers_test.go @@ -0,0 +1,275 @@ +package app + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +func TestGeneratePlaintextKeyAndExtractSubjectID(t *testing.T) { + t.Parallel() + plaintext, fingerprint := generatePlaintextKey() + if !strings.HasPrefix(plaintext, "sk-") { + t.Fatalf("plaintext = %q, want sk- prefix", plaintext) + } + if !strings.HasPrefix(fingerprint, "sha256:") { + t.Fatalf("fingerprint = %q, want sha256 prefix", fingerprint) + } + + h := &UserKeyHandler{} + req := httptest.NewRequest(http.MethodGet, "/api/keys", nil) + req.Header.Set("Authorization", "Bearer abcdefgh12345678") + subjectID, httpErr := h.extractSubjectID(req) + if httpErr != nil { + t.Fatalf("extractSubjectID() unexpected error: %+v", httpErr) + } + if subjectID != "skeleton_user_abcdefgh" { + t.Fatalf("subjectID = %q, want skeleton_user_abcdefgh", subjectID) + } +} + +func TestHandleUserKeyListNotImplemented(t *testing.T) { + t.Parallel() + req := httptest.NewRequest(http.MethodGet, "/api/keys", nil) + rr := httptest.NewRecorder() + serveWithMetrics(t, req, rr, func(w http.ResponseWriter, r *http.Request) { + handleListUserKeys(w, r, nil) + }) + if rr.Code != http.StatusNotImplemented { + t.Fatalf("status = %d, want 501 body=%s", rr.Code, rr.Body.String()) + } +} + +func 
TestHandleUserKeyListSuccess(t *testing.T) { + t.Parallel() + h := &UserKeyHandler{ + listFn: func(ctx context.Context, subjectID string) ([]UserKeyMeta, error) { + if subjectID != "portal-user:1" { + t.Fatalf("subjectID = %q, want portal-user:1", subjectID) + } + return []UserKeyMeta{{KeyID: "key_1", AdminStatus: "active"}}, nil + }, + } + req := httptest.NewRequest(http.MethodGet, "/api/keys", nil) + req.Header.Set("X-Portal-Subject", "portal-user:1") + rr := httptest.NewRecorder() + serveWithMetrics(t, req, rr, func(w http.ResponseWriter, r *http.Request) { + handleListUserKeys(w, r, h) + }) + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, want 200 body=%s", rr.Code, rr.Body.String()) + } + if !strings.Contains(rr.Body.String(), "key_1") { + t.Fatalf("body missing key_1: %s", rr.Body.String()) + } +} + +func TestHandleGetUserKeyMissingKeyID(t *testing.T) { + t.Parallel() + h := &UserKeyHandler{getFn: func(context.Context, string, string) (UserKeyMeta, error) { + t.Fatal("getFn should not be called when key_id is missing") + return UserKeyMeta{}, nil + }} + req := httptest.NewRequest(http.MethodGet, "/api/keys/", nil) + req.Header.Set("X-Portal-Subject", "portal-user:1") + rr := httptest.NewRecorder() + serveWithMetrics(t, req, rr, func(w http.ResponseWriter, r *http.Request) { + handleGetUserKey(w, r, h) + }) + if rr.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want 400 body=%s", rr.Code, rr.Body.String()) + } +} + +func TestHandleUserKeyMutationHandlers(t *testing.T) { + t.Parallel() + meta := UserKeyMeta{KeyID: "key_1", MaskedPreview: "sk-****1234", AdminStatus: "active"} + cases := []struct { + name string + method string + path string + handlerFn func(http.ResponseWriter, *http.Request, *UserKeyHandler) + userHandler *UserKeyHandler + wantStatus int + wantBody string + }{ + { + name: "get-success", + method: http.MethodGet, + path: "/api/keys/key_1", + handlerFn: handleGetUserKey, + userHandler: &UserKeyHandler{getFn: func(ctx 
context.Context, keyID, subjectID string) (UserKeyMeta, error) { + if keyID != "key_1" || subjectID != "portal-user:1" { + t.Fatalf("getFn args = (%q,%q)", keyID, subjectID) + } + return meta, nil + }}, + wantStatus: http.StatusOK, + wantBody: "key_1", + }, + { + name: "reset-success", + method: http.MethodPost, + path: "/api/keys/key_1/reset", + handlerFn: handleResetUserKey, + userHandler: &UserKeyHandler{resetFn: func(ctx context.Context, keyID, subjectID string) (ResetUserKeyResponse, error) { + if keyID != "key_1" || subjectID != "portal-user:1" { + t.Fatalf("resetFn args = (%q,%q)", keyID, subjectID) + } + return ResetUserKeyResponse{PlaintextKey: "sk-new", MaskedPreview: "sk-****new", AdminStatus: "active"}, nil + }}, + wantStatus: http.StatusOK, + wantBody: "sk-new", + }, + { + name: "pause-success", + method: http.MethodPost, + path: "/api/keys/key_1/pause", + handlerFn: handlePauseUserKey, + userHandler: &UserKeyHandler{pauseFn: func(ctx context.Context, keyID, subjectID, reason string) (UserKeyMeta, error) { + if keyID != "key_1" || subjectID != "portal-user:1" || reason != "" { + t.Fatalf("pauseFn args = (%q,%q,%q)", keyID, subjectID, reason) + } + paused := meta + paused.AdminStatus = "paused" + return paused, nil + }}, + wantStatus: http.StatusOK, + wantBody: "paused", + }, + { + name: "resume-success", + method: http.MethodPost, + path: "/api/keys/key_1/resume", + handlerFn: handleResumeUserKey, + userHandler: &UserKeyHandler{resumeFn: func(ctx context.Context, keyID, subjectID string) (UserKeyMeta, error) { + if keyID != "key_1" || subjectID != "portal-user:1" { + t.Fatalf("resumeFn args = (%q,%q)", keyID, subjectID) + } + return meta, nil + }}, + wantStatus: http.StatusOK, + wantBody: "active", + }, + { + name: "delete-success", + method: http.MethodDelete, + path: "/api/keys/key_1", + handlerFn: handleDeleteUserKey, + userHandler: &UserKeyHandler{deleteFn: func(ctx context.Context, keyID, subjectID string) error { + if keyID != "key_1" || 
subjectID != "portal-user:1" { + t.Fatalf("deleteFn args = (%q,%q)", keyID, subjectID) + } + return nil + }}, + wantStatus: http.StatusOK, + wantBody: "deleted", + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + req := httptest.NewRequest(tc.method, tc.path, nil) + req.Header.Set("X-Portal-Subject", "portal-user:1") + req.SetPathValue("key_id", "key_1") + rr := httptest.NewRecorder() + serveWithMetrics(t, req, rr, func(w http.ResponseWriter, r *http.Request) { + tc.handlerFn(w, r, tc.userHandler) + }) + if rr.Code != tc.wantStatus { + t.Fatalf("status = %d, want %d body=%s", rr.Code, tc.wantStatus, rr.Body.String()) + } + if !strings.Contains(rr.Body.String(), tc.wantBody) { + t.Fatalf("body missing %q: %s", tc.wantBody, rr.Body.String()) + } + }) + } +} + +func serveWithMetrics(t *testing.T, req *http.Request, rr *httptest.ResponseRecorder, fn func(http.ResponseWriter, *http.Request)) { + t.Helper() + http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fn(w, r) + }).ServeHTTP(rr, req) +} + +func TestHandleListUserKeysResponseShape(t *testing.T) { + t.Parallel() + h := &UserKeyHandler{listFn: func(context.Context, string) ([]UserKeyMeta, error) { + return []UserKeyMeta{{KeyID: "key_json", AdminStatus: "active"}}, nil + }} + req := httptest.NewRequest(http.MethodGet, "/api/keys", nil) + req.Header.Set("X-Portal-Subject", "portal-user:json") + rr := httptest.NewRecorder() + handleListUserKeys(rr, req, h) + var payload struct { + Keys []UserKeyMeta `json:"keys"` + } + if err := json.Unmarshal(rr.Body.Bytes(), &payload); err != nil { + t.Fatalf("decode response: %v", err) + } + if len(payload.Keys) != 1 || payload.Keys[0].KeyID != "key_json" { + t.Fatalf("payload = %+v, want one key_json entry", payload) + } +} + +func TestHandleUserKeyMutationHandlersErrorPaths(t *testing.T) { + t.Parallel() + cases := []struct { + name string + handlerFn func(http.ResponseWriter, *http.Request, *UserKeyHandler) + userHandler 
*UserKeyHandler + wantStatus int + }{ + { + name: "reset-not-found", + handlerFn: handleResetUserKey, + userHandler: &UserKeyHandler{resetFn: func(context.Context, string, string) (ResetUserKeyResponse, error) { + return ResetUserKeyResponse{}, fmt.Errorf("key %q not found", "key_1") + }}, + wantStatus: http.StatusNotFound, + }, + { + name: "pause-not-found", + handlerFn: handlePauseUserKey, + userHandler: &UserKeyHandler{pauseFn: func(context.Context, string, string, string) (UserKeyMeta, error) { + return UserKeyMeta{}, fmt.Errorf("key %q not found", "key_1") + }}, + wantStatus: http.StatusNotFound, + }, + { + name: "resume-not-found", + handlerFn: handleResumeUserKey, + userHandler: &UserKeyHandler{resumeFn: func(context.Context, string, string) (UserKeyMeta, error) { + return UserKeyMeta{}, fmt.Errorf("key %q not found", "key_1") + }}, + wantStatus: http.StatusNotFound, + }, + { + name: "delete-not-found", + handlerFn: handleDeleteUserKey, + userHandler: &UserKeyHandler{deleteFn: func(context.Context, string, string) error { + return fmt.Errorf("key %q not found", "key_1") + }}, + wantStatus: http.StatusNotFound, + }, + } + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + req := httptest.NewRequest(http.MethodPost, "/api/keys/key_1", nil) + req.Header.Set("X-Portal-Subject", "portal-user:1") + req.SetPathValue("key_id", "key_1") + rr := httptest.NewRecorder() + tc.handlerFn(rr, req, tc.userHandler) + if rr.Code != tc.wantStatus { + t.Fatalf("status = %d, want %d body=%s", rr.Code, tc.wantStatus, rr.Body.String()) + } + }) + } +} diff --git a/internal/app/key_self_service_svc.go b/internal/app/key_self_service_svc.go index 8a1ea6ce..06ddaec8 100644 --- a/internal/app/key_self_service_svc.go +++ b/internal/app/key_self_service_svc.go @@ -104,6 +104,11 @@ func ensureSubjectHasAccess(ctx context.Context, client *sub2api.Client, subject return apiKey, nil } +func recordUserKeyFailure(operation, result string, err error) error { + 
metrics.RecordUserKeyOperation(operation, result) + return err +} + func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { return &UserKeyHandler{ createFn: func(ctx context.Context, req CreateUserKeyRequest) (CreateUserKeyResponse, error) { @@ -117,14 +122,14 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { } store, err := sqlite.Open(ctx, sqliteDSN) if err != nil { - return CreateUserKeyResponse{}, fmt.Errorf("open store: %w", err) + return CreateUserKeyResponse{}, recordUserKeyFailure("create", "open_store_error", fmt.Errorf("open store: %w", err)) } defer store.Close() windowStart := time.Now().UTC().Format("2006-01-02T15:00:00Z") count, err := store.SubjectRateLimits().IncrementWindow(ctx, req.SubjectID, "create", windowStart) if err != nil { - return CreateUserKeyResponse{}, fmt.Errorf("increment create rate limit: %w", err) + return CreateUserKeyResponse{}, recordUserKeyFailure("create", "rate_limit_store_error", fmt.Errorf("increment create rate limit: %w", err)) } if count > defaultKeyRateLimitPerHour { metrics.RecordUserKeyOperation("create", "rate_limited") @@ -134,15 +139,15 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { // Resolve logical group → host → group ID → ensure subscription access _, route, hostRow, client, err := resolveLogicalGroupHost(ctx, store, req.LogicalGroupID) if err != nil { - return CreateUserKeyResponse{}, fmt.Errorf("resolve host for %q: %w", req.LogicalGroupID, err) + return CreateUserKeyResponse{}, recordUserKeyFailure("create", "resolve_host_error", fmt.Errorf("resolve host for %q: %w", req.LogicalGroupID, err)) } hostGroupID, err := resolveShadowHostGroupID(ctx, client, route) if err != nil { - return CreateUserKeyResponse{}, fmt.Errorf("resolve shadow group id for %q: %w", route.ShadowGroupID, err) + return CreateUserKeyResponse{}, recordUserKeyFailure("create", "resolve_shadow_group_error", fmt.Errorf("resolve shadow group id for %q: %w", route.ShadowGroupID, err)) } apiKey, err := 
ensureSubjectHasAccess(ctx, client, req.SubjectID, hostGroupID) if err != nil { - return CreateUserKeyResponse{}, fmt.Errorf("ensure access for %q: %w", req.LogicalGroupID, err) + return CreateUserKeyResponse{}, recordUserKeyFailure("create", "ensure_access_error", fmt.Errorf("ensure access for %q: %w", req.LogicalGroupID, err)) } fingerprint := "sha256:" + sha256Hex(apiKey) @@ -177,7 +182,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { return nil }) if err != nil { - return CreateUserKeyResponse{}, err + return CreateUserKeyResponse{}, recordUserKeyFailure("create", "db_tx_error", err) } metrics.RecordUserKeyOperation("create", "success") @@ -253,21 +258,21 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { resetFn: func(ctx context.Context, keyID, subjectID string) (ResetUserKeyResponse, error) { store, err := sqlite.Open(ctx, sqliteDSN) if err != nil { - return ResetUserKeyResponse{}, fmt.Errorf("open store: %w", err) + return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "open_store_error", fmt.Errorf("open store: %w", err)) } defer store.Close() rec, err := store.UserKeys().GetByID(ctx, keyID) if err != nil { - return ResetUserKeyResponse{}, fmt.Errorf("get key: %w", err) + return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "get_key_error", fmt.Errorf("get key: %w", err)) } if rec.OwnerSubjectID != subjectID && subjectID != "admin" { - return ResetUserKeyResponse{}, fmt.Errorf("key %q not found", keyID) + return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "not_found", fmt.Errorf("key %q not found", keyID)) } windowStart := time.Now().UTC().Format("2006-01-02T00:00:00Z") count, err := store.SubjectRateLimits().IncrementWindow(ctx, subjectID, "reset", windowStart) if err != nil { - return ResetUserKeyResponse{}, fmt.Errorf("increment reset rate limit: %w", err) + return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "rate_limit_store_error", fmt.Errorf("increment reset rate limit: %w", err)) } if 
count > defaultKeyResetPerDay { metrics.RecordUserKeyOperation("reset", "rate_limited") @@ -277,15 +282,15 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { // Re-resolve host access to get a fresh key _, route, _, client, err := resolveLogicalGroupHost(ctx, store, rec.LogicalGroupID) if err != nil { - return ResetUserKeyResponse{}, fmt.Errorf("resolve host for %q: %w", rec.LogicalGroupID, err) + return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "resolve_host_error", fmt.Errorf("resolve host for %q: %w", rec.LogicalGroupID, err)) } hostGroupID, err := resolveShadowHostGroupID(ctx, client, route) if err != nil { - return ResetUserKeyResponse{}, fmt.Errorf("resolve shadow group id for %q: %w", route.ShadowGroupID, err) + return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "resolve_shadow_group_error", fmt.Errorf("resolve shadow group id for %q: %w", route.ShadowGroupID, err)) } newPlaintext, err := ensureSubjectHasAccess(ctx, client, rec.OwnerSubjectID, hostGroupID) if err != nil { - return ResetUserKeyResponse{}, fmt.Errorf("ensure access on reset for %q: %w", rec.LogicalGroupID, err) + return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "ensure_access_error", fmt.Errorf("ensure access on reset for %q: %w", rec.LogicalGroupID, err)) } hostFingerprint := "sha256:" + sha256Hex(newPlaintext) @@ -309,7 +314,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { return nil }) if err != nil { - return ResetUserKeyResponse{}, err + return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "db_tx_error", err) } metrics.RecordUserKeyOperation("reset", "success") return ResetUserKeyResponse{PlaintextKey: newPlaintext, MaskedPreview: masked, AdminStatus: "active"}, nil @@ -317,27 +322,27 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { pauseFn: func(ctx context.Context, keyID, subjectID, reason string) (UserKeyMeta, error) { store, err := sqlite.Open(ctx, sqliteDSN) if err != nil { - return UserKeyMeta{}, 
fmt.Errorf("open store: %w", err) + return UserKeyMeta{}, recordUserKeyFailure("pause", "open_store_error", fmt.Errorf("open store: %w", err)) } defer store.Close() rec, err := store.UserKeys().GetByID(ctx, keyID) if err != nil { - return UserKeyMeta{}, fmt.Errorf("get key: %w", err) + return UserKeyMeta{}, recordUserKeyFailure("pause", "get_key_error", fmt.Errorf("get key: %w", err)) } if rec.OwnerSubjectID != subjectID && subjectID != "admin" { - return UserKeyMeta{}, fmt.Errorf("key %q not found", keyID) + return UserKeyMeta{}, recordUserKeyFailure("pause", "not_found", fmt.Errorf("key %q not found", keyID)) } _, route, _, client, err := resolveLogicalGroupHost(ctx, store, rec.LogicalGroupID) if err != nil { - return UserKeyMeta{}, fmt.Errorf("resolve host for pause %q: %w", rec.LogicalGroupID, err) + return UserKeyMeta{}, recordUserKeyFailure("pause", "resolve_host_error", fmt.Errorf("resolve host for pause %q: %w", rec.LogicalGroupID, err)) } hostGroupID, err := resolveShadowHostGroupID(ctx, client, route) if err != nil { - return UserKeyMeta{}, fmt.Errorf("resolve shadow group id for pause %q: %w", route.ShadowGroupID, err) + return UserKeyMeta{}, recordUserKeyFailure("pause", "resolve_shadow_group_error", fmt.Errorf("resolve shadow group id for pause %q: %w", route.ShadowGroupID, err)) } if err := client.PauseManagedSubscriptionAccess(ctx, rec.OwnerSubjectID, hostGroupID); err != nil { - return UserKeyMeta{}, fmt.Errorf("pause managed subscription access: %w", err) + return UserKeyMeta{}, recordUserKeyFailure("pause", "pause_access_error", fmt.Errorf("pause managed subscription access: %w", err)) } err = store.WithTx(ctx, func(q *sqlite.Queries) error { if err := q.UserKeys.UpdateStatus(ctx, keyID, "paused"); err != nil { @@ -352,7 +357,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { return nil }) if err != nil { - return UserKeyMeta{}, err + return UserKeyMeta{}, recordUserKeyFailure("pause", "db_tx_error", err) } 
metrics.RecordUserKeyOperation("pause", "success") return UserKeyMeta{KeyID: keyID, MaskedPreview: rec.MaskedPreview, AdminStatus: "paused"}, nil @@ -360,27 +365,27 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { resumeFn: func(ctx context.Context, keyID, subjectID string) (UserKeyMeta, error) { store, err := sqlite.Open(ctx, sqliteDSN) if err != nil { - return UserKeyMeta{}, fmt.Errorf("open store: %w", err) + return UserKeyMeta{}, recordUserKeyFailure("resume", "open_store_error", fmt.Errorf("open store: %w", err)) } defer store.Close() rec, err := store.UserKeys().GetByID(ctx, keyID) if err != nil { - return UserKeyMeta{}, fmt.Errorf("get key: %w", err) + return UserKeyMeta{}, recordUserKeyFailure("resume", "get_key_error", fmt.Errorf("get key: %w", err)) } if rec.OwnerSubjectID != subjectID && subjectID != "admin" { - return UserKeyMeta{}, fmt.Errorf("key %q not found", keyID) + return UserKeyMeta{}, recordUserKeyFailure("resume", "not_found", fmt.Errorf("key %q not found", keyID)) } _, route, _, client, err := resolveLogicalGroupHost(ctx, store, rec.LogicalGroupID) if err != nil { - return UserKeyMeta{}, fmt.Errorf("resolve host for resume %q: %w", rec.LogicalGroupID, err) + return UserKeyMeta{}, recordUserKeyFailure("resume", "resolve_host_error", fmt.Errorf("resolve host for resume %q: %w", rec.LogicalGroupID, err)) } hostGroupID, err := resolveShadowHostGroupID(ctx, client, route) if err != nil { - return UserKeyMeta{}, fmt.Errorf("resolve shadow group id for resume %q: %w", route.ShadowGroupID, err) + return UserKeyMeta{}, recordUserKeyFailure("resume", "resolve_shadow_group_error", fmt.Errorf("resolve shadow group id for resume %q: %w", route.ShadowGroupID, err)) } if err := client.ResumeManagedSubscriptionAccess(ctx, rec.OwnerSubjectID, hostGroupID); err != nil { - return UserKeyMeta{}, fmt.Errorf("resume managed subscription access: %w", err) + return UserKeyMeta{}, recordUserKeyFailure("resume", "resume_access_error", fmt.Errorf("resume 
managed subscription access: %w", err))
 			}
 			err = store.WithTx(ctx, func(q *sqlite.Queries) error {
 				if err := q.UserKeys.UpdateStatus(ctx, keyID, "active"); err != nil {
@@ -395,7 +400,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
 				return nil
 			})
 			if err != nil {
-				return UserKeyMeta{}, err
+				return UserKeyMeta{}, recordUserKeyFailure("resume", "db_tx_error", err)
 			}
 			metrics.RecordUserKeyOperation("resume", "success")
 			return UserKeyMeta{KeyID: keyID, MaskedPreview: rec.MaskedPreview, AdminStatus: "active"}, nil
@@ -403,16 +408,16 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
 		deleteFn: func(ctx context.Context, keyID, subjectID string) error {
 			store, err := sqlite.Open(ctx, sqliteDSN)
 			if err != nil {
-				return fmt.Errorf("open store: %w", err)
+				return recordUserKeyFailure("delete", "open_store_error", fmt.Errorf("open store: %w", err))
 			}
 			defer store.Close()
 
 			rec, err := store.UserKeys().GetByID(ctx, keyID)
 			if err != nil {
-				return fmt.Errorf("get key: %w", err)
+				return recordUserKeyFailure("delete", "get_key_error", fmt.Errorf("get key: %w", err))
 			}
 			if rec.OwnerSubjectID != subjectID && subjectID != "admin" {
-				return fmt.Errorf("key %q not found", keyID)
+				return recordUserKeyFailure("delete", "not_found", fmt.Errorf("key %q not found", keyID))
 			}
 			err = store.WithTx(ctx, func(q *sqlite.Queries) error {
 				if err := q.UserKeys.UpdateStatus(ctx, keyID, "retired"); err != nil {
@@ -431,8 +436,9 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
 			})
 			if err == nil {
 				metrics.RecordUserKeyOperation("delete", "success")
+				return nil
 			}
-			return err
+			return recordUserKeyFailure("delete", "db_tx_error", err)
 		},
 	}
 }
diff --git a/internal/app/key_self_service_test.go b/internal/app/key_self_service_test.go
index 2613b795..cf8a6356 100644
--- a/internal/app/key_self_service_test.go
+++ b/internal/app/key_self_service_test.go
@@ -322,3 +322,167 @@ func TestUserKeyAPIMetricsMiddlewareAndCreateMetric(t *testing.T) {
 		t.Fatal("expected metrics endpoint to expose user_key_operations_total after create validation failure")
 	}
 }
+
+// TestUserKeyPauseResumeDeleteLifecycleUpdatesHostAndStore drives pauseFn,
+// resumeFn and deleteFn against a stub host API, then checks the stored
+// admin_status and the allowed_groups payloads the stub received.
+func TestUserKeyPauseResumeDeleteLifecycleUpdatesHostAndStore(t *testing.T) {
+	t.Parallel()
+
+	store := openAppTestStore(t)
+	defer closeAppTestStore(t, store)
+
+	const logicalGroupID = "gpt-shared"
+	const hostGroupID = "999"
+	const subjectID = "portal-user:lifecycle"
+	const keyID = "key_lifecycle"
+
+	var allowedGroupsUpdates [][]int64
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.Method == http.MethodGet && strings.HasPrefix(r.URL.RequestURI(), "/api/v1/admin/users?"):
+			w.Write([]byte(`{"data":{"items":[{"id":84,"email":"` + expectedManagedIdentity(subjectID, hostGroupID).Email + `"}]}}`))
+		case r.Method == http.MethodPut && r.URL.Path == "/api/v1/admin/users/84":
+			var payload struct {
+				AllowedGroups []int64 `json:"allowed_groups"`
+			}
+			if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
+				// The handler runs on the server's goroutine, where FailNow
+				// (and therefore Fatalf) must not be called; flag the failure
+				// and answer with an error status instead.
+				t.Errorf("decode update payload: %v", err)
+				http.Error(w, "bad update payload", http.StatusBadRequest)
+				return
+			}
+			allowedGroupsUpdates = append(allowedGroupsUpdates, append([]int64(nil), payload.AllowedGroups...))
+			w.Write([]byte(`{"data":{"id":84}}`))
+		default:
+			w.WriteHeader(http.StatusNotFound)
+		}
+	}))
+	defer server.Close()
+
+	if _, err := store.Hosts().Create(context.Background(), sqlite.Host{
+		HostID:              "test-host",
+		BaseURL:             server.URL,
+		HostVersion:         "0.0.1",
+		CapabilityProbeJSON: "{}",
+		AuthType:            "apikey",
+		AuthToken:           "test-token",
+	}); err != nil {
+		t.Fatalf("Hosts().Create() error = %v", err)
+	}
+	if _, err := store.LogicalGroups().Create(context.Background(), sqlite.LogicalGroup{
+		LogicalGroupID: logicalGroupID,
+		DisplayName:    "GPT Shared",
+		Status:         "active",
+	}); err != nil {
+		t.Fatalf("LogicalGroups().Create() error = %v", err)
+	}
+	if _, err := store.LogicalGroupRoutes().Create(context.Background(), sqlite.LogicalGroupRoute{
+		RouteID:        "test-route",
+		LogicalGroupID: logicalGroupID,
+		Name:           "Test Route",
+		Status:         "active",
+		ShadowHostID:   "test-host",
+		ShadowGroupID:  hostGroupID,
+	}); err != nil {
+		t.Fatalf("LogicalGroupRoutes().Create() error = %v", err)
+	}
+	if _, err := store.UserKeys().Create(context.Background(), sqlite.UserKeyRecord{
+		KeyID:          keyID,
+		OwnerSubjectID: subjectID,
+		KeyFingerprint: "sha256:test",
+		MaskedPreview:  "sk-****test",
+		DisplayName:    "lifecycle key",
+		LogicalGroupID: logicalGroupID,
+		AllowedModels:  []string{"gpt-5.4"},
+		AdminStatus:    "active",
+		QuotaStatus:    "ok",
+	}); err != nil {
+		t.Fatalf("UserKeys().Create() error = %v", err)
+	}
+
+	handler := buildUserKeyHandler(appTestDSN(t, store))
+	paused, err := handler.pauseFn(context.Background(), keyID, subjectID, "")
+	if err != nil {
+		t.Fatalf("pauseFn() error = %v", err)
+	}
+	if paused.AdminStatus != "paused" {
+		t.Fatalf("pauseFn() admin_status = %q, want paused", paused.AdminStatus)
+	}
+	row, err := store.UserKeys().GetByID(context.Background(), keyID)
+	if err != nil {
+		t.Fatalf("UserKeys().GetByID() after pause error = %v", err)
+	}
+	if row.AdminStatus != "paused" {
+		t.Fatalf("stored admin_status after pause = %q, want paused", row.AdminStatus)
+	}
+
+	resumed, err := handler.resumeFn(context.Background(), keyID, subjectID)
+	if err != nil {
+		t.Fatalf("resumeFn() error = %v", err)
+	}
+	if resumed.AdminStatus != "active" {
+		t.Fatalf("resumeFn() admin_status = %q, want active", resumed.AdminStatus)
+	}
+	row, err = store.UserKeys().GetByID(context.Background(), keyID)
+	if err != nil {
+		t.Fatalf("UserKeys().GetByID() after resume error = %v", err)
+	}
+	if row.AdminStatus != "active" {
+		t.Fatalf("stored admin_status after resume = %q, want active", row.AdminStatus)
+	}
+
+	if err := handler.deleteFn(context.Background(), keyID, subjectID); err != nil {
+		t.Fatalf("deleteFn() error = %v", err)
+	}
+	row, err = store.UserKeys().GetByID(context.Background(), keyID)
+	if err != nil {
+		t.Fatalf("UserKeys().GetByID() after delete error = %v", err)
+	}
+	if row.AdminStatus != "retired" {
+		t.Fatalf("stored admin_status after delete = %q, want retired", row.AdminStatus)
+	}
+
+	if len(allowedGroupsUpdates) != 2 {
+		t.Fatalf("allowedGroupsUpdates len = %d, want 2", len(allowedGroupsUpdates))
+	}
+	if len(allowedGroupsUpdates[0]) != 0 {
+		t.Fatalf("pause allowed_groups = %#v, want empty", allowedGroupsUpdates[0])
+	}
+	if len(allowedGroupsUpdates[1]) != 1 || allowedGroupsUpdates[1][0] != 999 {
+		t.Fatalf("resume allowed_groups = %#v, want [999]", allowedGroupsUpdates[1])
+	}
+}
+
+// TestResolveShadowHostGroupIDByName checks that a ShadowGroupID holding a
+// group name resolves to the id reported by the stub host's group listing.
+func TestResolveShadowHostGroupIDByName(t *testing.T) {
+	t.Parallel()
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/v1/admin/groups":
+			w.Write([]byte(`{"data":{"items":[{"id":321,"name":"group-by-name"}]}}`))
+		case "/api/v1/admin/channels", "/api/v1/admin/payment/plans", "/api/v1/admin/accounts":
+			w.Write([]byte(`{"data":{"items":[]}}`))
+		default:
+			w.WriteHeader(http.StatusNotFound)
+		}
+	}))
+	defer server.Close()
+
+	client, err := newSub2APIClient(server.URL, CreateHostAuth{Type: "apikey", Token: "test-token"})
+	if err != nil {
+		t.Fatalf("newSub2APIClient() error = %v", err)
+	}
+
+	groupID, err := resolveShadowHostGroupID(context.Background(), client, sqlite.LogicalGroupRoute{ShadowGroupID: "group-by-name"})
+	if err != nil {
+		t.Fatalf("resolveShadowHostGroupID() error = %v", err)
+	}
+	if groupID != "321" {
+		t.Fatalf("groupID = %q, want 321", groupID)
+	}
+}
diff --git a/internal/app/route_resolve_api.go b/internal/app/route_resolve_api.go
index 76ad962d..d166127b 100644
--- a/internal/app/route_resolve_api.go
+++ b/internal/app/route_resolve_api.go
@@ -227,7 +227,9 @@ func buildResolveRouteAction(sqliteDSN string, stickyRuntime stickyStoreRuntime,
 			}
 		}
 		decisionStatus := "bind"
-		if selection.fallbackUsed {
+		if selection.failoverFrom != nil {
+			decisionStatus = "failover"
+		} else if selection.fallbackUsed {
 			decisionStatus = "fallback"
 		}
 		metrics.RecordRouteDecision(req.LogicalGroupID, decisionStatus)
diff --git a/internal/app/route_resolve_api_test.go b/internal/app/route_resolve_api_test.go
index 28350000..1213e307 100644
--- a/internal/app/route_resolve_api_test.go
+++ b/internal/app/route_resolve_api_test.go
@@
-222,6 +222,9 @@ func TestNewActionSetResolveRouteFlow(t *testing.T) {
 	if !strings.Contains(body, "route_failovers_total") {
 		t.Fatal("metrics missing route_failovers_total after fallback flow")
 	}
+	if !strings.Contains(body, `route_decisions_total{logical_group="gpt-shared",status="failover"}`) {
+		t.Fatalf("metrics missing failover decision status after resolve flow: %s", body)
+	}
 }
 
 func TestResolveRouteHelpers(t *testing.T) {
diff --git a/internal/app/user_key_operation_metrics_test.go b/internal/app/user_key_operation_metrics_test.go
new file mode 100644
index 00000000..83ec6d96
--- /dev/null
+++ b/internal/app/user_key_operation_metrics_test.go
@@ -0,0 +1,64 @@
+package app
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	"sub2api-cn-relay-manager/internal/metrics"
+)
+
+// scrapeUserKeyOperationMetrics serves metrics.Handler() against an in-memory
+// recorder and returns the response body.
+func scrapeUserKeyOperationMetrics(t *testing.T) string {
+	t.Helper()
+	metricsReq := httptest.NewRequest(http.MethodGet, "/metrics", nil)
+	metricsResp := httptest.NewRecorder()
+	metrics.Handler().ServeHTTP(metricsResp, metricsReq)
+	return metricsResp.Body.String()
+}
+
+// TestUserKeyCreateResolveHostErrorRecordsMetric posts a create request for
+// logical group "missing-group" and expects a 500 response plus a
+// create/resolve_host_error sample in user_key_operations_total.
+func TestUserKeyCreateResolveHostErrorRecordsMetric(t *testing.T) {
+	t.Parallel()
+	store := openAppTestStore(t)
+	defer closeAppTestStore(t, store)
+
+	handler := NewAPIHandler("t", ActionSet{
+		UserKeyHandler: buildUserKeyHandler(appTestDSN(t, store)),
+	})
+
+	req := makeCreateRequest(t, http.MethodPost, "/api/keys", makeCreateBody("missing-group", "portal key", []string{"gpt-5.4"}))
+	req.Header.Set("X-Portal-Subject", "portal-user")
+	resp := httptestRecorder(handler, req)
+	if resp.code != http.StatusInternalServerError {
+		t.Fatalf("status code = %d, want 500 body=%s", resp.code, resp.Body().String())
+	}
+
+	body := scrapeUserKeyOperationMetrics(t)
+	if !strings.Contains(body, `user_key_operations_total{operation="create",result="resolve_host_error"}`) {
+		t.Fatalf("metrics body missing create resolve_host_error metric: %s", body)
+	}
+}
+
+// TestUserKeyDeleteGetKeyErrorRecordsMetric calls deleteFn with key id
+// "key_missing" and expects an error plus a delete/get_key_error sample.
+func TestUserKeyDeleteGetKeyErrorRecordsMetric(t *testing.T) {
+	t.Parallel()
+	store := openAppTestStore(t)
+	defer closeAppTestStore(t, store)
+
+	handler := buildUserKeyHandler(appTestDSN(t, store))
+	if err := handler.deleteFn(context.Background(), "key_missing", "portal-user"); err == nil {
+		t.Fatal("expected deleteFn to fail for missing key")
+	}
+
+	body := scrapeUserKeyOperationMetrics(t)
+	if !strings.Contains(body, `user_key_operations_total{operation="delete",result="get_key_error"}`) {
+		t.Fatalf("metrics body missing delete get_key_error metric: %s", body)
+	}
+}
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index 2674a39d..e1325a15 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -3,6 +3,7 @@ package metrics
 import (
 	"context"
 	"net/http"
+	"strconv"
 	"time"
 
 	"github.com/prometheus/client_golang/prometheus"
@@ -132,7 +133,8 @@ func RecordHTTPRequest(method, path string, status int, duration time.Duration)
 	if path == "" {
 		path = "unknown"
 	}
-	HTTPRequestsTotal.WithLabelValues(method, path, http.StatusText(status)).Inc()
+	statusLabel := strconv.Itoa(status)
+	HTTPRequestsTotal.WithLabelValues(method, path, statusLabel).Inc()
 	HTTPRequestDuration.WithLabelValues(method, path).Observe(duration.Seconds())
 }
 
diff --git a/internal/metrics/metrics_test.go b/internal/metrics/metrics_test.go
index 2037e5f5..d04bf3db 100644
--- a/internal/metrics/metrics_test.go
+++ b/internal/metrics/metrics_test.go
@@ -28,6 +28,9 @@ func TestHTTPRequestsTotal(t *testing.T) {
 	if !strings.Contains(body, "http_requests_total") {
 		t.Error("Expected metrics endpoint to contain http_requests_total")
 	}
+	if !strings.Contains(body, `status="200"`) {
+		t.Fatalf("expected numeric HTTP status label, got: %s", body)
+	}
 }
 
 func TestRecordRouteDecision(t *testing.T) {
diff --git a/scripts/test/verify_quality_gates.sh
b/scripts/test/verify_quality_gates.sh
index d71544df..d8d00777 100755
--- a/scripts/test/verify_quality_gates.sh
+++ b/scripts/test/verify_quality_gates.sh
@@ -47,6 +47,9 @@ if [[ $frontend_smoke_status -ne 0 ]]; then
   fi
 fi
 
+log "running vNext SLO release gate"
+bash "$ROOT_DIR/scripts/test/verify_vnext_slo_release_gate.sh"
+
 log "running gofmt check"
 gofmt -l . | tee "$GOFMT_LOG"
 if [[ -s "$GOFMT_LOG" ]]; then
diff --git a/scripts/test/verify_vnext_slo_release_gate.sh b/scripts/test/verify_vnext_slo_release_gate.sh
new file mode 100755
index 00000000..0bc72d23
--- /dev/null
+++ b/scripts/test/verify_vnext_slo_release_gate.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+# Release gate for the vNext V3-2 SLO work: checks that the required docs,
+# metric wiring, alert rules and the live governance artifact are in place.
+# Set ARTIFACT_SUMMARY_PATH to point the gate at a different summary JSON.
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+ARTIFACT_SUMMARY_PATH="${ARTIFACT_SUMMARY_PATH:-$ROOT_DIR/artifacts/v3-governance-live/20260608_102323/99-summary.json}"
+
+# fail prints a FAIL message to stderr and aborts the gate with status 1.
+fail() {
+  echo "FAIL: $*" >&2
+  exit 1
+}
+
+# log prints a progress banner for the current gate step.
+log() {
+  echo "==> $*"
+}
+
+# require_file fails the gate unless $1 is an existing regular file.
+require_file() {
+  local path="$1"
+  [[ -f "$path" ]] || fail "missing required file: $path"
+}
+
+# require_contains fails the gate unless file $1 contains the literal text $2.
+require_contains() {
+  local path="$1"
+  local needle="$2"
+  # -F: literal match; -q: quiet, stop at first hit; --: needle may start with '-'.
+  grep -Fq -- "$needle" "$path" || fail "missing expected text in $path: $needle"
+}
+
+log "checking V3-2 source-of-truth files"
+require_file "$ROOT_DIR/docs/2026-06-04-SLO_AND_OBSERVABILITY.md"
+require_file "$ROOT_DIR/docs/2026-06-04-KEY_ACCOUNT_GOVERNANCE.md"
+require_file "$ROOT_DIR/docs/EXECUTION_BOARD.md"
+require_file "$ROOT_DIR/deploy/monitoring/prometheus-rules.yml"
+require_file "$ARTIFACT_SUMMARY_PATH"
+
+log "checking metrics wiring truth"
+require_contains "$ROOT_DIR/internal/metrics/metrics.go" 'Name: "user_key_operations_total"'
+require_contains "$ROOT_DIR/internal/metrics/metrics.go" 'Name: "user_key_chat_requests_total"'
+require_contains "$ROOT_DIR/internal/metrics/metrics.go" 'statusLabel := strconv.Itoa(status)'
+require_contains "$ROOT_DIR/internal/app/route_resolve_api.go" 'decisionStatus = "failover"'
+require_contains "$ROOT_DIR/internal/app/key_self_service_svc.go" 'recordUserKeyFailure("create", "resolve_host_error"'
+require_contains "$ROOT_DIR/internal/app/key_self_service_svc.go" 'recordUserKeyFailure("delete", "get_key_error"'
+
+log "checking alert rule alignment"
+require_contains "$ROOT_DIR/deploy/monitoring/prometheus-rules.yml" 'user_key_chat_requests_total{result="ok"}'
+require_contains "$ROOT_DIR/deploy/monitoring/prometheus-rules.yml" 'user_key_operations_total{operation="create",result!~"success|rate_limited"}'
+require_contains "$ROOT_DIR/deploy/monitoring/prometheus-rules.yml" 'route_decisions_total{status="failover"}'
+require_contains "$ROOT_DIR/deploy/monitoring/prometheus-rules.yml" 'http_requests_total{status=~"4..|5.."}'
+
+log "checking live governance artifact"
+python3 - "$ARTIFACT_SUMMARY_PATH" <<'PY'
+import json, sys
+from pathlib import Path
+p = Path(sys.argv[1])
+obj = json.loads(p.read_text())
+checks = {
+    'create_http': 201,
+    'chat_before_http': 200,
+    'pause_http': 200,
+    'get_paused_http': 200,
+    'chat_paused_http': 403,
+    'resume_http': 200,
+    'get_resumed_http': 200,
+    'chat_resumed_http': 200,
+    'delete_http': 200,
+}
+for key, want in checks.items():
+    got = obj.get(key)
+    if got != want:
+        raise SystemExit(f'{key}={got!r}, want {want!r}')
+# The body may be absent, null or non-string; normalise so the check fails cleanly.
+paused_body = str(obj.get('chat_paused_body') or '')
+if 'key_paused' not in paused_body:
+    raise SystemExit('chat_paused_body missing key_paused evidence')
+print(json.dumps({'artifact': str(p), 'checks': checks, 'paused_error': 'key_paused'}, ensure_ascii=False, indent=2))
+PY
+
+log "checking docs mention V3-2 closure state"
+require_contains "$ROOT_DIR/docs/EXECUTION_BOARD.md" 'V3-2 SLO / 观测最小闭环(2026-06-08 首批)'
+require_contains "$ROOT_DIR/docs/EXECUTION_BOARD.md" '失败路径细化、告警规则、发布门禁均已落地'
+
+echo 'PASS: V3-2 SLO release gate verified'