feat: close v3 slo gates and lifecycle metrics
Some checks failed
CI / Build & Test (push) Has been cancelled
CI / Lint (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / Docker Build (push) Has been cancelled
CI / Release (push) Has been cancelled

This commit is contained in:
phamnazage-jpg
2026-06-08 14:49:06 +08:00
parent dbbb313a36
commit dd6f332b53
14 changed files with 775 additions and 156 deletions

View File

@@ -1,11 +1,10 @@
# Prometheus Alerting Rules for sub2api-cn-relay-manager
# Place this file in your Prometheus rules directory
# Aligned with current vNext.3 metrics semantics (2026-06-08)
groups:
- name: sub2api-relay-manager-alerts
interval: 30s
rules:
# 服务可用性告警
- alert: ServiceDown
expr: up{job="sub2api-relay-manager"} == 0
for: 1m
@@ -16,53 +15,110 @@ groups:
summary: "sub2api-relay-manager service is down"
description: "The sub2api-relay-manager service has been down for more than 1 minute."
# HTTP错误率告警
- alert: HighErrorRate
expr: |
(
sum(rate(http_requests_total{status=~"5..|4.."}[5m]))
/
sum(rate(http_requests_total[5m]))
sum(rate(http_requests_total{status=~"4..|5.."}[5m]))
/
clamp_min(sum(rate(http_requests_total[5m])), 0.001)
) > 0.05
for: 2m
labels:
severity: warning
team: ops
annotations:
summary: "High error rate detected"
description: "Error rate is above 5% for more than 2 minutes. Current value: {{ $value | humanizePercentage }}"
summary: "High HTTP error rate detected"
description: "HTTP 4xx/5xx error rate is above 5% for more than 2 minutes."
# 请求延迟告警
- alert: HighLatency
expr: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
) > 1.0
for: 3m
labels:
severity: warning
team: ops
annotations:
summary: "High request latency"
description: "95th percentile latency is above 1 second for more than 3 minutes."
# 路由故障转移告警
- alert: RouteFailoverSpike
- alert: UserKeyChatSuccessRateLow
expr: |
(
rate(route_failovers_total[5m])
>
2 * avg_over_time(rate(route_failovers_total[1h])[1h:5m])
)
for: 1m
sum(rate(user_key_chat_requests_total{result="ok"}[10m]))
/
clamp_min(sum(rate(user_key_chat_requests_total[10m])), 0.001)
) < 0.95
and sum(rate(user_key_chat_requests_total[10m])) > 0
for: 10m
labels:
severity: critical
team: ops
annotations:
summary: "User-key chat success rate below SLO"
description: "Recent user-key chat success rate is below 95% for 10 minutes."
- alert: UserKeyChatP95LatencyHigh
expr: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket{path="/v1/chat/completions"}[10m])) by (le)
) > 5
for: 10m
labels:
severity: warning
team: ops
annotations:
summary: "Route failover spike detected"
description: "Route failovers have spiked above normal levels. Current rate: {{ $value }}"
summary: "User-key chat P95 latency is high"
description: "P95 latency for /v1/chat/completions exceeds 5 seconds for 10 minutes."
# Fires when user-key create operations whose result is neither "success" nor
# "rate_limited" exceed 0.02/s (10m rate, summed over all series) for 10 minutes.
- alert: UserKeyCreateFailures
expr: |
sum(rate(user_key_operations_total{operation="create",result!~"success|rate_limited"}[10m])) > 0.02
for: 10m
labels:
severity: critical
team: ops
annotations:
summary: "User-key create failures detected"
description: "Non-rate-limit create failures are occurring on the self-service path."
# Fires when user-key reset operations whose result is neither "success" nor
# "rate_limited" exceed 0.02/s (10m rate, summed over all series) for 10 minutes.
- alert: UserKeyResetFailures
expr: |
sum(rate(user_key_operations_total{operation="reset",result!~"success|rate_limited"}[10m])) > 0.02
for: 10m
labels:
severity: warning
team: ops
annotations:
summary: "User-key reset failures detected"
description: "Non-rate-limit reset failures are occurring on the self-service path."
# Fires when chat requests with result="quota_exhausted" exceed 0.05/s
# (10m rate, summed over all series) for 10 minutes.
- alert: UserKeyQuotaExhaustedSpike
expr: |
sum(rate(user_key_chat_requests_total{result="quota_exhausted"}[10m])) > 0.05
for: 10m
labels:
severity: warning
team: ops
annotations:
summary: "Quota exhausted events are rising"
description: "quota_exhausted responses are rising on the public user-key gateway path."
# Fires when chat requests with result "unauthorized" or "invalid_api_key"
# exceed 0.05/s (10m rate, summed over all series) for 10 minutes.
# Labelled team=security, unlike the other user-key alerts, which are team=ops.
- alert: UserKeyAuthFailuresSpike
expr: |
sum(rate(user_key_chat_requests_total{result=~"unauthorized|invalid_api_key"}[10m])) > 0.05
for: 10m
labels:
severity: warning
team: security
annotations:
summary: "User-key auth failures are rising"
description: "unauthorized/invalid_api_key outcomes are rising on the public gateway path."
# Fires when route decisions with status="failover" make up more than 20% of
# all route decisions (10m rates) for 10 minutes.
# clamp_min keeps the denominator at or above 0.001; the trailing "and ... > 0"
# additionally requires a non-zero total decision rate before the alert can fire.
- alert: RouteFailoverShareHigh
expr: |
(
sum(rate(route_decisions_total{status="failover"}[10m]))
/
clamp_min(sum(rate(route_decisions_total[10m])), 0.001)
) > 0.20
and sum(rate(route_decisions_total[10m])) > 0
for: 10m
labels:
severity: warning
team: ops
annotations:
summary: "Route failover share is high"
description: "More than 20% of recent route decisions are failovers."
# 活跃Provider数量告警
- alert: NoActiveProviders
expr: active_providers == 0
for: 1m
@@ -73,17 +129,6 @@ groups:
summary: "No active providers"
description: "There are no active providers configured. The system cannot route requests."
- alert: LowActiveProviders
expr: active_providers < 2
for: 5m
labels:
severity: warning
team: ops
annotations:
summary: "Low number of active providers"
description: "Only {{ $value }} active provider(s) detected. Consider adding more for redundancy."
# 活跃Host告警
- alert: NoActiveHosts
expr: active_hosts == 0
for: 1m
@@ -94,31 +139,6 @@ groups:
summary: "No active hosts"
description: "There are no active hosts. The system cannot import providers."
# 数据库连接告警
- alert: HighDBConnections
expr: db_connections_active > 50
for: 5m
labels:
severity: warning
team: ops
annotations:
summary: "High database connection count"
description: "Active DB connections: {{ $value }}. Consider connection pool tuning."
# 数据库操作错误告警
- alert: DBOperationErrors
expr: |
rate(db_operations_total{operation=~"INSERT|UPDATE|DELETE"}[5m])
> 100
for: 2m
labels:
severity: warning
team: ops
annotations:
summary: "High database write rate"
description: "DB write operations are above threshold: {{ $value }} ops/sec"
# 日志系统告警
- alert: LogFlushErrors
expr: rate(log_flush_errors_total[5m]) > 0
for: 1m
@@ -130,48 +150,17 @@ groups:
description: "Log flush errors have been detected. Check log storage/backend."
- alert: LogDroppedEvents
expr: |
rate(log_dropped_events_total[5m]) > 10
expr: rate(log_dropped_events_total[5m]) > 10
for: 1m
labels:
severity: warning
team: ops
annotations:
summary: "Log events being dropped"
description: "Log events are being dropped at {{ $value }} events/sec. Check log buffer capacity."
description: "Log events are being dropped. Check log buffer capacity."
# 批处理导入告警
- alert: BatchImportFailures
expr: |
(
rate(route_decisions_total{status="failed"}[5m])
/
rate(route_decisions_total[5m])
) > 0.1
for: 5m
labels:
severity: warning
team: ops
annotations:
summary: "High batch import failure rate"
description: "Batch import failure rate is above 10%. Check provider configurations."
# API认证失败告警
- alert: AuthFailures
expr: |
rate(http_requests_total{status="401"}[5m]) > 10
for: 2m
labels:
severity: warning
team: security
annotations:
summary: "High authentication failure rate"
description: "Auth failures detected. Possible credential issues or attacks."
# 健康检查告警
- alert: HealthCheckFailing
expr: |
http_requests_total{path="/healthz",status!="200"} > 0
expr: http_requests_total{path="/healthz",status!="200"} > 0
for: 30s
labels:
severity: critical

View File

@@ -1,7 +1,7 @@
# SLO and Observability
日期:2026-06-04
状态:待审核
状态:已落地(V3-2)
适用版本:vNext.3
## 目的
@@ -107,3 +107,50 @@ vNext.3 引入正式发布门禁前,至少要求:
本文件属于 vNext.3 设计文档。
当前 vNext.1 不进入实现,但必须在规划阶段明确其后续必备性,避免将来“功能可用但不可运营”。
## 2026-06-08 当前落地口径
### 已落地指标
- `http_requests_total{method,path,status}`
- `status` 使用数值字符串(如 `200/403/500`)
- `path` 优先使用 `r.Pattern`,避免动态 path 高基数
- `http_request_duration_seconds{method,path}`
- `route_decisions_total{logical_group,status}`
- `status in (sticky_hit, bind, fallback, failover)`
- `route_failovers_total`
- `user_key_operations_total{operation,result}`
- `operation in (create, reset, pause, resume, delete)`
- `result` 已覆盖 `success / unauthorized / bad_request / rate_limited / open_store_error / get_key_error / not_found / rate_limit_store_error / resolve_host_error / resolve_shadow_group_error / ensure_access_error / pause_access_error / resume_access_error / db_tx_error`
- `user_key_chat_requests_total{result}`
- `result in (ok, unauthorized, invalid_api_key, key_paused, key_retired, quota_exhausted, bad_request, db_error, proxy_error)`
### 已落地告警规则
文件:`deploy/monitoring/prometheus-rules.yml`
- `HighErrorRate`
- `UserKeyChatSuccessRateLow`
- `UserKeyChatP95LatencyHigh`
- `UserKeyCreateFailures`
- `UserKeyResetFailures`
- `UserKeyQuotaExhaustedSpike`
- `UserKeyAuthFailuresSpike`
- `RouteFailoverShareHigh`
- `NoActiveProviders`
- `NoActiveHosts`
- `LogFlushErrors`
- `LogDroppedEvents`
- `HealthCheckFailing`
### 已落地发布门禁
脚本:`scripts/test/verify_vnext_slo_release_gate.sh`
门禁要求:
1. 核心 source-of-truth 文档存在
2. metrics 接线与代码口径一致
3. Prometheus 规则引用真实存在的指标与标签
4. live governance artifact 满足 create `201` → chat `200` → pause `200` → chat-paused `403 key_paused` → resume `200` → chat `200` → delete `200`
5. `docs/EXECUTION_BOARD.md` 已同步 V3-2 完成态

View File

@@ -10,7 +10,7 @@
## 一、先说结论
当前状态:条件完成(全量 vNext
当前状态:完成(全量 vNext)
说明:
@@ -18,7 +18,7 @@
- vNext.2 已完成 V2-4 + V2-5key self-service API、portal key 管理 UI、用户 portal reset 后首次调用 200 真实线上闭环。
- vNext.3 已完成 V3-1key/account governance 的公网 create→chat→pause→chat-paused→resume→chat-resumed 真验闭环。
- vNext.3 / V3-2 已启动首批 SLO/观测最小闭环HTTP metrics route pattern 归一化、route resolve/failover 指标接线、user-key lifecycle/chat outcome 指标接线与回归测试已完成。
- 仍未完成的是更宽泛的后续治理/SLO 扩展范围;因此按“当前 CRM 网关路线”口径已完成,按“全量 vNext 后续扩展全部做完”口径仍是条件完成。
- V3-2 的失败路径指标细化、Prometheus 告警规则、V3-2 release gate 已全部补齐;当前 CRM 网关路线全量 vNext 后续扩展口径现已统一为完成。
## 二、5 个核心问题 Checklist全量 vNext 目标)
@@ -30,7 +30,7 @@
| 2. 同模型多供应商池化 | 模型池抽象 + 映射 + 真实池化验收 | vNext.1 已闭环 | `model_pool.go`、pool 测试、真实验收脚本已存在 |
| 3. 插件前端承接用户弱能力 | Portal 能承接用户信息、模型、示例、key 信息 | V2-5 已完成 | `PORTAL_KEY_EXPERIENCE.md``deploy/tksea-portal/index.html``artifacts/portal-ui-v25/20260606_1009/99-summary.json` |
| 4. 插件生成/申请 key 并交付 base URL/model/curl 示例 | key self-service API + 首次调用 200 闭环 | V2-4/V2-5 已完成 | `KEY_SELF_SERVICE_API.md``verify_user_key_self_service.sh``artifacts/user-key-self-service/20260605_195408/99-summary.json``artifacts/portal-ui-v25/20260606_1009/99-summary.json` |
| 5. key / 账号暂停、恢复、限额治理 | 三态模型 + 管理页动作 + 真实治理验收 | V3-1 已闭环 / V3-2 进行中 | `KEY_ACCOUNT_GOVERNANCE.md``artifacts/v3-governance-live/20260608_102323/99-summary.json``internal/metrics/metrics.go``internal/app/public_chat_metrics_test.go` |
| 5. key / 账号暂停、恢复、限额治理 | 三态模型 + 管理页动作 + 真实治理验收 | V3-1/V3-2 已闭环 | `KEY_ACCOUNT_GOVERNANCE.md``artifacts/v3-governance-live/20260608_102323/99-summary.json``internal/metrics/metrics.go``internal/app/public_chat_metrics_test.go``scripts/test/verify_vnext_slo_release_gate.sh` |
## 三、vNext.1 发布范围 Checklist
@@ -96,19 +96,22 @@
- Task 4.1 状态模型与治理语义:已实现并接线到 CRM 网关 `POST /v1/chat/completions`
- Task 4.2 管理页治理动作已实现pause / resume 同步宿主 managed user `allowed_groups`
- Task 4.3 真实治理验收:已完成,见 `artifacts/v3-governance-live/20260608_102323/99-summary.json`
- Task 4.4 SLO / 观测最小闭环(第一批):已完成首批接线
- `internal/metrics/metrics.go` 新增 `user_key_operations_total``user_key_chat_requests_total`
- Task 4.4 SLO / 观测闭环:已完成
- `internal/metrics/metrics.go` 新增 `user_key_operations_total``user_key_chat_requests_total`,并把 HTTP status label 收口为数值字符串
- `http_requests_total` 优先使用 `r.Pattern`,避免动态 path 高基数
- route resolve / failover、user-key self-service、public chat outcome 已接指标并补回归测试
- user-key lifecycle 失败路径指标已补齐到 create/reset/pause/resume/delete
- `deploy/monitoring/prometheus-rules.yml` 已按当前指标口径重写
- `scripts/test/verify_vnext_slo_release_gate.sh` 已落地并纳入总质量门
状态V3-1 已闭环;V3-2 首批 SLO/观测接线已完成,剩余治理/SLO 扩展项继续推进
状态V3-1 / V3-2 已闭环
### Phase 5
- Task 5.1 默认链路准入规则vNext.1 已闭环
- Task 5.2 多层验证vNext.1 + V2-4 当前均已有真实 artifact
状态:部分完成(整体 vNext 仍未完成)
状态:完成(全量 vNext 完成)
## 五、当前缺失文件 / 脚本 / 测试(按真实存在性校对)
@@ -120,6 +123,9 @@
- `scripts/acceptance/verify_host_protocol_matrix.sh` — 已存在
- `scripts/acceptance/verify_user_key_self_service.sh` — 已从 skeleton 升级为真实验收脚本
- `internal/app/key_self_service_test.go` — 已存在
- `internal/app/user_key_operation_metrics_test.go` — 已存在
- `scripts/test/verify_vnext_slo_release_gate.sh` — 已存在
- `deploy/monitoring/prometheus-rules.yml` — 已按 V3-2 指标口径重写
### vNext.2 尚缺
@@ -145,16 +151,15 @@
### 立即执行:收尾与同步
1. 已完成 V3-1 公网真验闭环create 201 → chat 200 → pause 200 → chat-paused 403 → resume 200 → chat 200
2. 已确认 2026-06-06 的“pause 后仍 200”并非宿主 cache TTL而是公网 `/v1/chat/completions` 当时仍走宿主、且 CRM `hosts.auth_token` 已过期
3. 已补 remote43 nginx 精确路由与 host bearer 刷新;仓库同步更新部署脚本/示例 nginx
4. 下一步仅剩文档、commit、push 与后续 SLO 范围推进
1. 运行完整质量门并确认 worktree clean
2. commit + push 本轮 V3-2 收尾变更
3. 更新任务真相源为 completed
## 八、当前判定(唯一有效口径)
- 按 vNext.1 发布范围:**完成**
- 按 vNext.2 当前执行项:**完成**V2-4 + V2-5 已真实闭环)
- 按全量 vNext 规划:**条件完成**V3-1 核心代码+测试+线上真验已闭环;剩余仅是后续治理/SLO 扩展项,不再阻塞当前 CRM 网关路线
- 按全量 vNext 规划:**完成**V2-4 / V2-5 / V3-1 / V3-2 均已完成代码、门禁、文档与真实证据闭环
- 当前结论:
- V2-4 / V2-5 / V3-1 已真实闭环,可提交/推送
- 若要宣告“全量 vNext 所有后续扩展都完成”,还需单独定义并交付 V3-2/SLO 范围
- 全量 vNext 后续扩展已完成
- 当前仅剩本轮变更的 commit / push 收尾动作

View File

@@ -129,18 +129,20 @@
- 目标:先把现有 CRM 网关与 user-key 自助链路接成可观测真相源,而不是停留在“有 /metrics 端点但关键路径不产生日志/指标”。
- 本轮代码接线:
- `internal/metrics/metrics.go`新增 `user_key_operations_total``user_key_chat_requests_total`HTTP metrics 优先使用 `r.Pattern`,避免动态 path 高基数
- `internal/app/route_resolve_api.go`resolve / failover 接入 route metrics
- `internal/app/key_self_service_svc.go`create/reset/pause/resume/delete success metrics 接线
- `internal/metrics/metrics.go``user_key_operations_total``user_key_chat_requests_total` 已接线HTTP status label 改为数值字符串HTTP path 优先使用 `r.Pattern`,避免动态 path 高基数
- `internal/app/route_resolve_api.go`route decision 语义收口为 `sticky_hit / bind / fallback / failover`failover 不再和 fallback 混成单一状态
- `internal/app/key_self_service_svc.go`create/reset/pause/resume/delete 不只记录 success,还补齐 `open_store_error / get_key_error / not_found / rate_limit_store_error / resolve_host_error / resolve_shadow_group_error / ensure_access_error / pause_access_error / resume_access_error / db_tx_error` 等失败路径指标
- `internal/app/http_api.go``/v1/chat/completions` 接入 `unauthorized / invalid_api_key / key_paused / key_retired / quota_exhausted / bad_request / db_error / proxy_error / ok` outcome metrics
- `internal/app/public_chat_metrics_test.go`:新增 quota_exhausted 与 route pattern 回归测试
- `deploy/monitoring/prometheus-rules.yml`:已按当前真实指标口径重写为 `UserKeyChatSuccessRateLow / UserKeyChatP95LatencyHigh / UserKeyCreateFailures / UserKeyResetFailures / UserKeyQuotaExhaustedSpike / UserKeyAuthFailuresSpike / RouteFailoverShareHigh` 等告警规则
- `scripts/test/verify_vnext_slo_release_gate.sh`:新增 V3-2 发布门禁脚本,并已接入 `scripts/test/verify_quality_gates.sh`
- 本轮门禁:
- `go test ./internal/app ./internal/metrics -count=1` → PASS
- `go test ./tests/integration/... -count=1` → PASS
- `go vet ./...` → PASS
- `go test -cover ./internal/...` → PASS核心包 `access/provision/pack` 均 ≥ 70%
- `bash ./scripts/test/verify_vnext_slo_release_gate.sh` → PASS校验 metrics 接线 / 告警规则 / live governance artifact / 文档口径)
- 当前结论:
- `部分闭环` —— 首批 SLO/观测接线已完成并过门禁;更宽泛的治理/SLO 扩展(失败路径细化、告警/发布门禁)继续推进
- `闭环` —— V3-2 的失败路径细化、告警规则、发布门禁均已落地;全量 vNext 后续扩展已收口到可验证完成态
- portal key 管理 UI 已完成实现、部署和真实公网验收:
- 关键代码:

View File

@@ -0,0 +1,275 @@
package app
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"strings"
"testing"
)
// TestGeneratePlaintextKeyAndExtractSubjectID checks the prefixes of the two
// values returned by generatePlaintextKey and the subject ID that
// extractSubjectID derives from a bearer Authorization header.
func TestGeneratePlaintextKeyAndExtractSubjectID(t *testing.T) {
	t.Parallel()

	// Both generated values must carry their expected prefix.
	key, fp := generatePlaintextKey()
	if hasPrefix := strings.HasPrefix(key, "sk-"); !hasPrefix {
		t.Fatalf("plaintext = %q, want sk- prefix", key)
	}
	if hasPrefix := strings.HasPrefix(fp, "sha256:"); !hasPrefix {
		t.Fatalf("fingerprint = %q, want sha256 prefix", fp)
	}

	// A zero-value handler is enough to exercise subject extraction.
	const wantSubject = "skeleton_user_abcdefgh"
	handler := new(UserKeyHandler)
	request := httptest.NewRequest(http.MethodGet, "/api/keys", nil)
	request.Header.Set("Authorization", "Bearer abcdefgh12345678")

	gotSubject, httpErr := handler.extractSubjectID(request)
	switch {
	case httpErr != nil:
		t.Fatalf("extractSubjectID() unexpected error: %+v", httpErr)
	case gotSubject != wantSubject:
		t.Fatalf("subjectID = %q, want skeleton_user_abcdefgh", gotSubject)
	}
}
func TestHandleUserKeyListNotImplemented(t *testing.T) {
t.Parallel()
req := httptest.NewRequest(http.MethodGet, "/api/keys", nil)
rr := httptest.NewRecorder()
serveWithMetrics(t, req, rr, func(w http.ResponseWriter, r *http.Request) {
handleListUserKeys(w, r, nil)
})
if rr.Code != http.StatusNotImplemented {
t.Fatalf("status = %d, want 501 body=%s", rr.Code, rr.Body.String())
}
}
func TestHandleUserKeyListSuccess(t *testing.T) {
t.Parallel()
h := &UserKeyHandler{
listFn: func(ctx context.Context, subjectID string) ([]UserKeyMeta, error) {
if subjectID != "portal-user:1" {
t.Fatalf("subjectID = %q, want portal-user:1", subjectID)
}
return []UserKeyMeta{{KeyID: "key_1", AdminStatus: "active"}}, nil
},
}
req := httptest.NewRequest(http.MethodGet, "/api/keys", nil)
req.Header.Set("X-Portal-Subject", "portal-user:1")
rr := httptest.NewRecorder()
serveWithMetrics(t, req, rr, func(w http.ResponseWriter, r *http.Request) {
handleListUserKeys(w, r, h)
})
if rr.Code != http.StatusOK {
t.Fatalf("status = %d, want 200 body=%s", rr.Code, rr.Body.String())
}
if !strings.Contains(rr.Body.String(), "key_1") {
t.Fatalf("body missing key_1: %s", rr.Body.String())
}
}
func TestHandleGetUserKeyMissingKeyID(t *testing.T) {
t.Parallel()
h := &UserKeyHandler{getFn: func(context.Context, string, string) (UserKeyMeta, error) {
t.Fatal("getFn should not be called when key_id is missing")
return UserKeyMeta{}, nil
}}
req := httptest.NewRequest(http.MethodGet, "/api/keys/", nil)
req.Header.Set("X-Portal-Subject", "portal-user:1")
rr := httptest.NewRecorder()
serveWithMetrics(t, req, rr, func(w http.ResponseWriter, r *http.Request) {
handleGetUserKey(w, r, h)
})
if rr.Code != http.StatusBadRequest {
t.Fatalf("status = %d, want 400 body=%s", rr.Code, rr.Body.String())
}
}
// TestHandleUserKeyMutationHandlers is a table-driven test of the success path
// of the get / reset / pause / resume / delete handlers. Each case checks that
// the handler passes the key_id path value and the X-Portal-Subject header to
// the matching UserKeyHandler callback, and that the response has the expected
// status code and contains the expected body fragment.
//
// The UserKeyHandler for each case is built by a factory that receives the
// subtest's *testing.T. The callbacks run inside the subtest goroutine started
// by t.Run, and FailNow/Fatalf must be called on the T that owns the running
// goroutine; calling the parent's Fatalf from a subtest is reported by the
// testing package as a misuse and attributes the failure to the wrong test.
func TestHandleUserKeyMutationHandlers(t *testing.T) {
	t.Parallel()

	meta := UserKeyMeta{KeyID: "key_1", MaskedPreview: "sk-****1234", AdminStatus: "active"}
	cases := []struct {
		name       string
		method     string
		path       string
		handlerFn  func(http.ResponseWriter, *http.Request, *UserKeyHandler)
		newHandler func(t *testing.T) *UserKeyHandler
		wantStatus int
		wantBody   string
	}{
		{
			name:      "get-success",
			method:    http.MethodGet,
			path:      "/api/keys/key_1",
			handlerFn: handleGetUserKey,
			newHandler: func(t *testing.T) *UserKeyHandler {
				return &UserKeyHandler{getFn: func(ctx context.Context, keyID, subjectID string) (UserKeyMeta, error) {
					if keyID != "key_1" || subjectID != "portal-user:1" {
						t.Fatalf("getFn args = (%q,%q)", keyID, subjectID)
					}
					return meta, nil
				}}
			},
			wantStatus: http.StatusOK,
			wantBody:   "key_1",
		},
		{
			name:      "reset-success",
			method:    http.MethodPost,
			path:      "/api/keys/key_1/reset",
			handlerFn: handleResetUserKey,
			newHandler: func(t *testing.T) *UserKeyHandler {
				return &UserKeyHandler{resetFn: func(ctx context.Context, keyID, subjectID string) (ResetUserKeyResponse, error) {
					if keyID != "key_1" || subjectID != "portal-user:1" {
						t.Fatalf("resetFn args = (%q,%q)", keyID, subjectID)
					}
					return ResetUserKeyResponse{PlaintextKey: "sk-new", MaskedPreview: "sk-****new", AdminStatus: "active"}, nil
				}}
			},
			wantStatus: http.StatusOK,
			wantBody:   "sk-new",
		},
		{
			name:      "pause-success",
			method:    http.MethodPost,
			path:      "/api/keys/key_1/pause",
			handlerFn: handlePauseUserKey,
			newHandler: func(t *testing.T) *UserKeyHandler {
				return &UserKeyHandler{pauseFn: func(ctx context.Context, keyID, subjectID, reason string) (UserKeyMeta, error) {
					// The request has no body, so the reason is expected to be empty.
					if keyID != "key_1" || subjectID != "portal-user:1" || reason != "" {
						t.Fatalf("pauseFn args = (%q,%q,%q)", keyID, subjectID, reason)
					}
					paused := meta
					paused.AdminStatus = "paused"
					return paused, nil
				}}
			},
			wantStatus: http.StatusOK,
			wantBody:   "paused",
		},
		{
			name:      "resume-success",
			method:    http.MethodPost,
			path:      "/api/keys/key_1/resume",
			handlerFn: handleResumeUserKey,
			newHandler: func(t *testing.T) *UserKeyHandler {
				return &UserKeyHandler{resumeFn: func(ctx context.Context, keyID, subjectID string) (UserKeyMeta, error) {
					if keyID != "key_1" || subjectID != "portal-user:1" {
						t.Fatalf("resumeFn args = (%q,%q)", keyID, subjectID)
					}
					return meta, nil
				}}
			},
			wantStatus: http.StatusOK,
			wantBody:   "active",
		},
		{
			name:      "delete-success",
			method:    http.MethodDelete,
			path:      "/api/keys/key_1",
			handlerFn: handleDeleteUserKey,
			newHandler: func(t *testing.T) *UserKeyHandler {
				return &UserKeyHandler{deleteFn: func(ctx context.Context, keyID, subjectID string) error {
					if keyID != "key_1" || subjectID != "portal-user:1" {
						t.Fatalf("deleteFn args = (%q,%q)", keyID, subjectID)
					}
					return nil
				}}
			},
			wantStatus: http.StatusOK,
			wantBody:   "deleted",
		},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			// Bind the callbacks to this subtest's T before serving.
			userHandler := tc.newHandler(t)

			req := httptest.NewRequest(tc.method, tc.path, nil)
			req.Header.Set("X-Portal-Subject", "portal-user:1")
			// No mux is involved, so the path value is injected by hand.
			req.SetPathValue("key_id", "key_1")
			rr := httptest.NewRecorder()
			serveWithMetrics(t, req, rr, func(w http.ResponseWriter, r *http.Request) {
				tc.handlerFn(w, r, userHandler)
			})
			if rr.Code != tc.wantStatus {
				t.Fatalf("status = %d, want %d body=%s", rr.Code, tc.wantStatus, rr.Body.String())
			}
			if !strings.Contains(rr.Body.String(), tc.wantBody) {
				t.Fatalf("body missing %q: %s", tc.wantBody, rr.Body.String())
			}
		})
	}
}
func serveWithMetrics(t *testing.T, req *http.Request, rr *httptest.ResponseRecorder, fn func(http.ResponseWriter, *http.Request)) {
t.Helper()
http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fn(w, r)
}).ServeHTTP(rr, req)
}
// TestHandleListUserKeysResponseShape verifies that the list response is JSON
// with a top-level "keys" array holding the entries returned by listFn.
func TestHandleListUserKeysResponseShape(t *testing.T) {
	t.Parallel()

	handler := new(UserKeyHandler)
	handler.listFn = func(context.Context, string) ([]UserKeyMeta, error) {
		entry := UserKeyMeta{KeyID: "key_json", AdminStatus: "active"}
		return []UserKeyMeta{entry}, nil
	}

	request := httptest.NewRequest(http.MethodGet, "/api/keys", nil)
	request.Header.Set("X-Portal-Subject", "portal-user:json")
	recorder := httptest.NewRecorder()
	// The handler is called directly here, without serveWithMetrics.
	handleListUserKeys(recorder, request, handler)

	var decoded struct {
		Keys []UserKeyMeta `json:"keys"`
	}
	err := json.Unmarshal(recorder.Body.Bytes(), &decoded)
	if err != nil {
		t.Fatalf("decode response: %v", err)
	}
	if onlyKeyJSON := len(decoded.Keys) == 1 && decoded.Keys[0].KeyID == "key_json"; !onlyKeyJSON {
		t.Fatalf("payload = %+v, want one key_json entry", decoded)
	}
}
// TestHandleUserKeyMutationHandlersErrorPaths is a table-driven test checking
// that the reset / pause / resume / delete handlers answer 404 when the
// underlying callback returns a `key "..." not found` error.
func TestHandleUserKeyMutationHandlersErrorPaths(t *testing.T) {
	t.Parallel()

	// Every case fails with the same error text.
	notFound := func() error {
		return fmt.Errorf("key %q not found", "key_1")
	}

	type errorCase struct {
		name        string
		handlerFn   func(http.ResponseWriter, *http.Request, *UserKeyHandler)
		userHandler *UserKeyHandler
		wantStatus  int
	}
	table := []errorCase{
		{
			name:      "reset-not-found",
			handlerFn: handleResetUserKey,
			userHandler: &UserKeyHandler{resetFn: func(context.Context, string, string) (ResetUserKeyResponse, error) {
				return ResetUserKeyResponse{}, notFound()
			}},
			wantStatus: http.StatusNotFound,
		},
		{
			name:      "pause-not-found",
			handlerFn: handlePauseUserKey,
			userHandler: &UserKeyHandler{pauseFn: func(context.Context, string, string, string) (UserKeyMeta, error) {
				return UserKeyMeta{}, notFound()
			}},
			wantStatus: http.StatusNotFound,
		},
		{
			name:      "resume-not-found",
			handlerFn: handleResumeUserKey,
			userHandler: &UserKeyHandler{resumeFn: func(context.Context, string, string) (UserKeyMeta, error) {
				return UserKeyMeta{}, notFound()
			}},
			wantStatus: http.StatusNotFound,
		},
		{
			name:      "delete-not-found",
			handlerFn: handleDeleteUserKey,
			userHandler: &UserKeyHandler{deleteFn: func(context.Context, string, string) error {
				return notFound()
			}},
			wantStatus: http.StatusNotFound,
		},
	}

	for i := range table {
		c := table[i]
		t.Run(c.name, func(t *testing.T) {
			// All cases share one request shape; the handlers are invoked directly.
			request := httptest.NewRequest(http.MethodPost, "/api/keys/key_1", nil)
			request.Header.Set("X-Portal-Subject", "portal-user:1")
			request.SetPathValue("key_id", "key_1")
			recorder := httptest.NewRecorder()

			c.handlerFn(recorder, request, c.userHandler)

			if got := recorder.Code; got != c.wantStatus {
				t.Fatalf("status = %d, want %d body=%s", got, c.wantStatus, recorder.Body.String())
			}
		})
	}
}

View File

@@ -104,6 +104,11 @@ func ensureSubjectHasAccess(ctx context.Context, client *sub2api.Client, subject
return apiKey, nil
}
// recordUserKeyFailure passes operation and result to
// metrics.RecordUserKeyOperation and then returns err unchanged, so a failing
// return path can record its metric and produce its error in one expression.
// It does not inspect err; the metric is recorded even if err is nil.
func recordUserKeyFailure(operation, result string, err error) error {
metrics.RecordUserKeyOperation(operation, result)
return err
}
func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
return &UserKeyHandler{
createFn: func(ctx context.Context, req CreateUserKeyRequest) (CreateUserKeyResponse, error) {
@@ -117,14 +122,14 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
}
store, err := sqlite.Open(ctx, sqliteDSN)
if err != nil {
return CreateUserKeyResponse{}, fmt.Errorf("open store: %w", err)
return CreateUserKeyResponse{}, recordUserKeyFailure("create", "open_store_error", fmt.Errorf("open store: %w", err))
}
defer store.Close()
windowStart := time.Now().UTC().Format("2006-01-02T15:00:00Z")
count, err := store.SubjectRateLimits().IncrementWindow(ctx, req.SubjectID, "create", windowStart)
if err != nil {
return CreateUserKeyResponse{}, fmt.Errorf("increment create rate limit: %w", err)
return CreateUserKeyResponse{}, recordUserKeyFailure("create", "rate_limit_store_error", fmt.Errorf("increment create rate limit: %w", err))
}
if count > defaultKeyRateLimitPerHour {
metrics.RecordUserKeyOperation("create", "rate_limited")
@@ -134,15 +139,15 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
// Resolve logical group → host → group ID → ensure subscription access
_, route, hostRow, client, err := resolveLogicalGroupHost(ctx, store, req.LogicalGroupID)
if err != nil {
return CreateUserKeyResponse{}, fmt.Errorf("resolve host for %q: %w", req.LogicalGroupID, err)
return CreateUserKeyResponse{}, recordUserKeyFailure("create", "resolve_host_error", fmt.Errorf("resolve host for %q: %w", req.LogicalGroupID, err))
}
hostGroupID, err := resolveShadowHostGroupID(ctx, client, route)
if err != nil {
return CreateUserKeyResponse{}, fmt.Errorf("resolve shadow group id for %q: %w", route.ShadowGroupID, err)
return CreateUserKeyResponse{}, recordUserKeyFailure("create", "resolve_shadow_group_error", fmt.Errorf("resolve shadow group id for %q: %w", route.ShadowGroupID, err))
}
apiKey, err := ensureSubjectHasAccess(ctx, client, req.SubjectID, hostGroupID)
if err != nil {
return CreateUserKeyResponse{}, fmt.Errorf("ensure access for %q: %w", req.LogicalGroupID, err)
return CreateUserKeyResponse{}, recordUserKeyFailure("create", "ensure_access_error", fmt.Errorf("ensure access for %q: %w", req.LogicalGroupID, err))
}
fingerprint := "sha256:" + sha256Hex(apiKey)
@@ -177,7 +182,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
return nil
})
if err != nil {
return CreateUserKeyResponse{}, err
return CreateUserKeyResponse{}, recordUserKeyFailure("create", "db_tx_error", err)
}
metrics.RecordUserKeyOperation("create", "success")
@@ -253,21 +258,21 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
resetFn: func(ctx context.Context, keyID, subjectID string) (ResetUserKeyResponse, error) {
store, err := sqlite.Open(ctx, sqliteDSN)
if err != nil {
return ResetUserKeyResponse{}, fmt.Errorf("open store: %w", err)
return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "open_store_error", fmt.Errorf("open store: %w", err))
}
defer store.Close()
rec, err := store.UserKeys().GetByID(ctx, keyID)
if err != nil {
return ResetUserKeyResponse{}, fmt.Errorf("get key: %w", err)
return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "get_key_error", fmt.Errorf("get key: %w", err))
}
if rec.OwnerSubjectID != subjectID && subjectID != "admin" {
return ResetUserKeyResponse{}, fmt.Errorf("key %q not found", keyID)
return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "not_found", fmt.Errorf("key %q not found", keyID))
}
windowStart := time.Now().UTC().Format("2006-01-02T00:00:00Z")
count, err := store.SubjectRateLimits().IncrementWindow(ctx, subjectID, "reset", windowStart)
if err != nil {
return ResetUserKeyResponse{}, fmt.Errorf("increment reset rate limit: %w", err)
return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "rate_limit_store_error", fmt.Errorf("increment reset rate limit: %w", err))
}
if count > defaultKeyResetPerDay {
metrics.RecordUserKeyOperation("reset", "rate_limited")
@@ -277,15 +282,15 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
// Re-resolve host access to get a fresh key
_, route, _, client, err := resolveLogicalGroupHost(ctx, store, rec.LogicalGroupID)
if err != nil {
return ResetUserKeyResponse{}, fmt.Errorf("resolve host for %q: %w", rec.LogicalGroupID, err)
return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "resolve_host_error", fmt.Errorf("resolve host for %q: %w", rec.LogicalGroupID, err))
}
hostGroupID, err := resolveShadowHostGroupID(ctx, client, route)
if err != nil {
return ResetUserKeyResponse{}, fmt.Errorf("resolve shadow group id for %q: %w", route.ShadowGroupID, err)
return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "resolve_shadow_group_error", fmt.Errorf("resolve shadow group id for %q: %w", route.ShadowGroupID, err))
}
newPlaintext, err := ensureSubjectHasAccess(ctx, client, rec.OwnerSubjectID, hostGroupID)
if err != nil {
return ResetUserKeyResponse{}, fmt.Errorf("ensure access on reset for %q: %w", rec.LogicalGroupID, err)
return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "ensure_access_error", fmt.Errorf("ensure access on reset for %q: %w", rec.LogicalGroupID, err))
}
hostFingerprint := "sha256:" + sha256Hex(newPlaintext)
@@ -309,7 +314,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
return nil
})
if err != nil {
return ResetUserKeyResponse{}, err
return ResetUserKeyResponse{}, recordUserKeyFailure("reset", "db_tx_error", err)
}
metrics.RecordUserKeyOperation("reset", "success")
return ResetUserKeyResponse{PlaintextKey: newPlaintext, MaskedPreview: masked, AdminStatus: "active"}, nil
@@ -317,27 +322,27 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
pauseFn: func(ctx context.Context, keyID, subjectID, reason string) (UserKeyMeta, error) {
store, err := sqlite.Open(ctx, sqliteDSN)
if err != nil {
return UserKeyMeta{}, fmt.Errorf("open store: %w", err)
return UserKeyMeta{}, recordUserKeyFailure("pause", "open_store_error", fmt.Errorf("open store: %w", err))
}
defer store.Close()
rec, err := store.UserKeys().GetByID(ctx, keyID)
if err != nil {
return UserKeyMeta{}, fmt.Errorf("get key: %w", err)
return UserKeyMeta{}, recordUserKeyFailure("pause", "get_key_error", fmt.Errorf("get key: %w", err))
}
if rec.OwnerSubjectID != subjectID && subjectID != "admin" {
return UserKeyMeta{}, fmt.Errorf("key %q not found", keyID)
return UserKeyMeta{}, recordUserKeyFailure("pause", "not_found", fmt.Errorf("key %q not found", keyID))
}
_, route, _, client, err := resolveLogicalGroupHost(ctx, store, rec.LogicalGroupID)
if err != nil {
return UserKeyMeta{}, fmt.Errorf("resolve host for pause %q: %w", rec.LogicalGroupID, err)
return UserKeyMeta{}, recordUserKeyFailure("pause", "resolve_host_error", fmt.Errorf("resolve host for pause %q: %w", rec.LogicalGroupID, err))
}
hostGroupID, err := resolveShadowHostGroupID(ctx, client, route)
if err != nil {
return UserKeyMeta{}, fmt.Errorf("resolve shadow group id for pause %q: %w", route.ShadowGroupID, err)
return UserKeyMeta{}, recordUserKeyFailure("pause", "resolve_shadow_group_error", fmt.Errorf("resolve shadow group id for pause %q: %w", route.ShadowGroupID, err))
}
if err := client.PauseManagedSubscriptionAccess(ctx, rec.OwnerSubjectID, hostGroupID); err != nil {
return UserKeyMeta{}, fmt.Errorf("pause managed subscription access: %w", err)
return UserKeyMeta{}, recordUserKeyFailure("pause", "pause_access_error", fmt.Errorf("pause managed subscription access: %w", err))
}
err = store.WithTx(ctx, func(q *sqlite.Queries) error {
if err := q.UserKeys.UpdateStatus(ctx, keyID, "paused"); err != nil {
@@ -352,7 +357,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
return nil
})
if err != nil {
return UserKeyMeta{}, err
return UserKeyMeta{}, recordUserKeyFailure("pause", "db_tx_error", err)
}
metrics.RecordUserKeyOperation("pause", "success")
return UserKeyMeta{KeyID: keyID, MaskedPreview: rec.MaskedPreview, AdminStatus: "paused"}, nil
@@ -360,27 +365,27 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
resumeFn: func(ctx context.Context, keyID, subjectID string) (UserKeyMeta, error) {
store, err := sqlite.Open(ctx, sqliteDSN)
if err != nil {
return UserKeyMeta{}, fmt.Errorf("open store: %w", err)
return UserKeyMeta{}, recordUserKeyFailure("resume", "open_store_error", fmt.Errorf("open store: %w", err))
}
defer store.Close()
rec, err := store.UserKeys().GetByID(ctx, keyID)
if err != nil {
return UserKeyMeta{}, fmt.Errorf("get key: %w", err)
return UserKeyMeta{}, recordUserKeyFailure("resume", "get_key_error", fmt.Errorf("get key: %w", err))
}
if rec.OwnerSubjectID != subjectID && subjectID != "admin" {
return UserKeyMeta{}, fmt.Errorf("key %q not found", keyID)
return UserKeyMeta{}, recordUserKeyFailure("resume", "not_found", fmt.Errorf("key %q not found", keyID))
}
_, route, _, client, err := resolveLogicalGroupHost(ctx, store, rec.LogicalGroupID)
if err != nil {
return UserKeyMeta{}, fmt.Errorf("resolve host for resume %q: %w", rec.LogicalGroupID, err)
return UserKeyMeta{}, recordUserKeyFailure("resume", "resolve_host_error", fmt.Errorf("resolve host for resume %q: %w", rec.LogicalGroupID, err))
}
hostGroupID, err := resolveShadowHostGroupID(ctx, client, route)
if err != nil {
return UserKeyMeta{}, fmt.Errorf("resolve shadow group id for resume %q: %w", route.ShadowGroupID, err)
return UserKeyMeta{}, recordUserKeyFailure("resume", "resolve_shadow_group_error", fmt.Errorf("resolve shadow group id for resume %q: %w", route.ShadowGroupID, err))
}
if err := client.ResumeManagedSubscriptionAccess(ctx, rec.OwnerSubjectID, hostGroupID); err != nil {
return UserKeyMeta{}, fmt.Errorf("resume managed subscription access: %w", err)
return UserKeyMeta{}, recordUserKeyFailure("resume", "resume_access_error", fmt.Errorf("resume managed subscription access: %w", err))
}
err = store.WithTx(ctx, func(q *sqlite.Queries) error {
if err := q.UserKeys.UpdateStatus(ctx, keyID, "active"); err != nil {
@@ -395,7 +400,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
return nil
})
if err != nil {
return UserKeyMeta{}, err
return UserKeyMeta{}, recordUserKeyFailure("resume", "db_tx_error", err)
}
metrics.RecordUserKeyOperation("resume", "success")
return UserKeyMeta{KeyID: keyID, MaskedPreview: rec.MaskedPreview, AdminStatus: "active"}, nil
@@ -403,16 +408,16 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
deleteFn: func(ctx context.Context, keyID, subjectID string) error {
store, err := sqlite.Open(ctx, sqliteDSN)
if err != nil {
return fmt.Errorf("open store: %w", err)
return recordUserKeyFailure("delete", "open_store_error", fmt.Errorf("open store: %w", err))
}
defer store.Close()
rec, err := store.UserKeys().GetByID(ctx, keyID)
if err != nil {
return fmt.Errorf("get key: %w", err)
return recordUserKeyFailure("delete", "get_key_error", fmt.Errorf("get key: %w", err))
}
if rec.OwnerSubjectID != subjectID && subjectID != "admin" {
return fmt.Errorf("key %q not found", keyID)
return recordUserKeyFailure("delete", "not_found", fmt.Errorf("key %q not found", keyID))
}
err = store.WithTx(ctx, func(q *sqlite.Queries) error {
if err := q.UserKeys.UpdateStatus(ctx, keyID, "retired"); err != nil {
@@ -431,8 +436,9 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler {
})
if err == nil {
metrics.RecordUserKeyOperation("delete", "success")
return nil
}
return err
return recordUserKeyFailure("delete", "db_tx_error", err)
},
}
}

View File

@@ -322,3 +322,151 @@ func TestUserKeyAPIMetricsMiddlewareAndCreateMetric(t *testing.T) {
t.Fatal("expected metrics endpoint to expose user_key_operations_total after create validation failure")
}
}
// TestUserKeyPauseResumeDeleteLifecycleUpdatesHostAndStore walks one user key
// through pause -> resume -> delete using the closures returned by
// buildUserKeyHandler. After each step it checks the admin_status persisted in
// the store, and at the end it checks the allowed_groups payloads received by
// the fake host API.
func TestUserKeyPauseResumeDeleteLifecycleUpdatesHostAndStore(t *testing.T) {
	t.Parallel()

	ctx := context.Background()
	store := openAppTestStore(t)
	defer closeAppTestStore(t, store)

	const logicalGroupID = "gpt-shared"
	const hostGroupID = "999"
	const subjectID = "portal-user:lifecycle"
	const keyID = "key_lifecycle"

	// Every allowed_groups payload PUT to the fake host, in arrival order.
	var allowedGroupsUpdates [][]int64
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasPrefix(r.URL.RequestURI(), "/api/v1/admin/users?"):
			// User search: always answer with user 84, whose email is the
			// managed identity expected for this subject and host group.
			w.Write([]byte(`{"data":{"items":[{"id":84,"email":"` + expectedManagedIdentity(subjectID, hostGroupID).Email + `"}]}}`))
		case r.Method == http.MethodPut && r.URL.Path == "/api/v1/admin/users/84":
			var payload struct {
				AllowedGroups []int64 `json:"allowed_groups"`
			}
			if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
				// The handler runs on a server goroutine, where t.Fatalf
				// (FailNow) must not be called. Mark the test as failed and
				// reject the request so the caller observes an error.
				t.Errorf("decode update payload: %v", err)
				http.Error(w, "invalid update payload", http.StatusBadRequest)
				return
			}
			// Store a copy so the recorded value is independent of the decoder's slice.
			allowedGroupsUpdates = append(allowedGroupsUpdates, append([]int64(nil), payload.AllowedGroups...))
			w.Write([]byte(`{"data":{"id":84}}`))
		default:
			w.WriteHeader(http.StatusNotFound)
		}
	}))
	defer server.Close()

	// Fixtures: host -> logical group -> route -> key. Each insert is checked
	// so that a broken fixture fails here instead of surfacing later as an
	// unrelated pause/resume error.
	if _, err := store.Hosts().Create(ctx, sqlite.Host{
		HostID:              "test-host",
		BaseURL:             server.URL,
		HostVersion:         "0.0.1",
		CapabilityProbeJSON: "{}",
		AuthType:            "apikey",
		AuthToken:           "test-token",
	}); err != nil {
		t.Fatalf("Hosts().Create() error = %v", err)
	}
	if _, err := store.LogicalGroups().Create(ctx, sqlite.LogicalGroup{
		LogicalGroupID: logicalGroupID,
		DisplayName:    "GPT Shared",
		Status:         "active",
	}); err != nil {
		t.Fatalf("LogicalGroups().Create() error = %v", err)
	}
	if _, err := store.LogicalGroupRoutes().Create(ctx, sqlite.LogicalGroupRoute{
		RouteID:        "test-route",
		LogicalGroupID: logicalGroupID,
		Name:           "Test Route",
		Status:         "active",
		ShadowHostID:   "test-host",
		ShadowGroupID:  hostGroupID,
	}); err != nil {
		t.Fatalf("LogicalGroupRoutes().Create() error = %v", err)
	}
	if _, err := store.UserKeys().Create(ctx, sqlite.UserKeyRecord{
		KeyID:          keyID,
		OwnerSubjectID: subjectID,
		KeyFingerprint: "sha256:test",
		MaskedPreview:  "sk-****test",
		DisplayName:    "lifecycle key",
		LogicalGroupID: logicalGroupID,
		AllowedModels:  []string{"gpt-5.4"},
		AdminStatus:    "active",
		QuotaStatus:    "ok",
	}); err != nil {
		t.Fatalf("UserKeys().Create() error = %v", err)
	}

	handler := buildUserKeyHandler(appTestDSN(t, store))

	// Pause: returned metadata and stored row must both report "paused".
	paused, err := handler.pauseFn(ctx, keyID, subjectID, "")
	if err != nil {
		t.Fatalf("pauseFn() error = %v", err)
	}
	if paused.AdminStatus != "paused" {
		t.Fatalf("pauseFn() admin_status = %q, want paused", paused.AdminStatus)
	}
	row, err := store.UserKeys().GetByID(ctx, keyID)
	if err != nil {
		t.Fatalf("UserKeys().GetByID() after pause error = %v", err)
	}
	if row.AdminStatus != "paused" {
		t.Fatalf("stored admin_status after pause = %q, want paused", row.AdminStatus)
	}

	// Resume: returned metadata and stored row must both report "active".
	resumed, err := handler.resumeFn(ctx, keyID, subjectID)
	if err != nil {
		t.Fatalf("resumeFn() error = %v", err)
	}
	if resumed.AdminStatus != "active" {
		t.Fatalf("resumeFn() admin_status = %q, want active", resumed.AdminStatus)
	}
	row, err = store.UserKeys().GetByID(ctx, keyID)
	if err != nil {
		t.Fatalf("UserKeys().GetByID() after resume error = %v", err)
	}
	if row.AdminStatus != "active" {
		t.Fatalf("stored admin_status after resume = %q, want active", row.AdminStatus)
	}

	// Delete: the row is expected to remain readable with status "retired".
	if err := handler.deleteFn(ctx, keyID, subjectID); err != nil {
		t.Fatalf("deleteFn() error = %v", err)
	}
	row, err = store.UserKeys().GetByID(ctx, keyID)
	if err != nil {
		t.Fatalf("UserKeys().GetByID() after delete error = %v", err)
	}
	if row.AdminStatus != "retired" {
		t.Fatalf("stored admin_status after delete = %q, want retired", row.AdminStatus)
	}

	// Exactly two host updates are expected across the whole lifecycle:
	// pause sends an empty allowed_groups, resume sends the host group back.
	if len(allowedGroupsUpdates) != 2 {
		t.Fatalf("allowedGroupsUpdates len = %d, want 2", len(allowedGroupsUpdates))
	}
	if len(allowedGroupsUpdates[0]) != 0 {
		t.Fatalf("pause allowed_groups = %#v, want empty", allowedGroupsUpdates[0])
	}
	if len(allowedGroupsUpdates[1]) != 1 || allowedGroupsUpdates[1][0] != 999 {
		t.Fatalf("resume allowed_groups = %#v, want [999]", allowedGroupsUpdates[1])
	}
}
func TestResolveShadowHostGroupIDByName(t *testing.T) {
t.Parallel()
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.URL.Path {
case "/api/v1/admin/groups":
w.Write([]byte(`{"data":{"items":[{"id":321,"name":"group-by-name"}]}}`))
case "/api/v1/admin/channels", "/api/v1/admin/payment/plans", "/api/v1/admin/accounts":
w.Write([]byte(`{"data":{"items":[]}}`))
default:
w.WriteHeader(http.StatusNotFound)
}
}))
defer server.Close()
client, err := newSub2APIClient(server.URL, CreateHostAuth{Type: "apikey", Token: "test-token"})
if err != nil {
t.Fatalf("newSub2APIClient() error = %v", err)
}
groupID, err := resolveShadowHostGroupID(context.Background(), client, sqlite.LogicalGroupRoute{ShadowGroupID: "group-by-name"})
if err != nil {
t.Fatalf("resolveShadowHostGroupID() error = %v", err)
}
if groupID != "321" {
t.Fatalf("groupID = %q, want 321", groupID)
}
}

View File

@@ -227,7 +227,9 @@ func buildResolveRouteAction(sqliteDSN string, stickyRuntime stickyStoreRuntime,
}
}
decisionStatus := "bind"
if selection.fallbackUsed {
if selection.failoverFrom != nil {
decisionStatus = "failover"
} else if selection.fallbackUsed {
decisionStatus = "fallback"
}
metrics.RecordRouteDecision(req.LogicalGroupID, decisionStatus)

View File

@@ -222,6 +222,9 @@ func TestNewActionSetResolveRouteFlow(t *testing.T) {
if !strings.Contains(body, "route_failovers_total") {
t.Fatal("metrics missing route_failovers_total after fallback flow")
}
if !strings.Contains(body, `route_decisions_total{logical_group="gpt-shared",status="failover"}`) {
t.Fatalf("metrics missing failover decision status after resolve flow: %s", body)
}
}
func TestResolveRouteHelpers(t *testing.T) {

View File

@@ -0,0 +1,55 @@
package app
import (
"context"
"net/http"
"net/http/httptest"
"strings"
"testing"
"sub2api-cn-relay-manager/internal/metrics"
)
// TestUserKeyCreateResolveHostErrorRecordsMetric posts a create-key request
// for a logical group that was never inserted ("missing-group"), expects a 500
// response, and then verifies that the metrics endpoint exposes
// user_key_operations_total with operation="create" and
// result="resolve_host_error".
func TestUserKeyCreateResolveHostErrorRecordsMetric(t *testing.T) {
	t.Parallel()

	store := openAppTestStore(t)
	defer closeAppTestStore(t, store)

	keyHandler := buildUserKeyHandler(appTestDSN(t, store))
	api := NewAPIHandler("t", ActionSet{UserKeyHandler: keyHandler})

	createBody := makeCreateBody("missing-group", "portal key", []string{"gpt-5.4"})
	createReq := makeCreateRequest(t, http.MethodPost, "/api/keys", createBody)
	createReq.Header.Set("X-Portal-Subject", "portal-user")

	resp := httptestRecorder(api, createReq)
	if resp.code != http.StatusInternalServerError {
		t.Fatalf("status code = %d, want 500 body=%s", resp.code, resp.Body().String())
	}

	// Scrape the metrics handler directly and look for the failure series.
	scrape := httptest.NewRecorder()
	metrics.Handler().ServeHTTP(scrape, httptest.NewRequest(http.MethodGet, "/metrics", nil))
	body := scrape.Body.String()

	const wantSeries = `user_key_operations_total{operation="create",result="resolve_host_error"}`
	if strings.Contains(body, wantSeries) {
		return
	}
	t.Fatalf("metrics body missing create resolve_host_error metric: %s", body)
}
// TestUserKeyDeleteGetKeyErrorRecordsMetric calls deleteFn for a key id that
// was never created ("key_missing"), expects an error, and then verifies that
// the metrics endpoint exposes user_key_operations_total with
// operation="delete" and result="get_key_error".
func TestUserKeyDeleteGetKeyErrorRecordsMetric(t *testing.T) {
	t.Parallel()

	store := openAppTestStore(t)
	defer closeAppTestStore(t, store)

	keyHandler := buildUserKeyHandler(appTestDSN(t, store))
	deleteErr := keyHandler.deleteFn(context.Background(), "key_missing", "portal-user")
	if deleteErr == nil {
		t.Fatal("expected deleteFn to fail for missing key")
	}

	// Scrape the metrics handler directly and look for the failure series.
	scrape := httptest.NewRecorder()
	metrics.Handler().ServeHTTP(scrape, httptest.NewRequest(http.MethodGet, "/metrics", nil))
	body := scrape.Body.String()

	const wantSeries = `user_key_operations_total{operation="delete",result="get_key_error"}`
	if strings.Contains(body, wantSeries) {
		return
	}
	t.Fatalf("metrics body missing delete get_key_error metric: %s", body)
}

View File

@@ -3,6 +3,7 @@ package metrics
import (
"context"
"net/http"
"strconv"
"time"
"github.com/prometheus/client_golang/prometheus"
@@ -132,7 +133,8 @@ func RecordHTTPRequest(method, path string, status int, duration time.Duration)
if path == "" {
path = "unknown"
}
HTTPRequestsTotal.WithLabelValues(method, path, http.StatusText(status)).Inc()
statusLabel := strconv.Itoa(status)
HTTPRequestsTotal.WithLabelValues(method, path, statusLabel).Inc()
HTTPRequestDuration.WithLabelValues(method, path).Observe(duration.Seconds())
}

View File

@@ -28,6 +28,9 @@ func TestHTTPRequestsTotal(t *testing.T) {
if !strings.Contains(body, "http_requests_total") {
t.Error("Expected metrics endpoint to contain http_requests_total")
}
if !strings.Contains(body, `status="200"`) {
t.Fatalf("expected numeric HTTP status label, got: %s", body)
}
}
func TestRecordRouteDecision(t *testing.T) {

View File

@@ -47,6 +47,9 @@ if [[ $frontend_smoke_status -ne 0 ]]; then
fi
fi
log "running vNext SLO release gate"
bash "$ROOT_DIR/scripts/test/verify_vnext_slo_release_gate.sh"
log "running gofmt check"
gofmt -l . | tee "$GOFMT_LOG"
if [[ -s "$GOFMT_LOG" ]]; then

View File

@@ -0,0 +1,79 @@
#!/usr/bin/env bash
# Abort on any failing command, any use of an unset variable, and any failing
# stage of a pipeline.
set -euo pipefail
# Absolute path two directories above the directory that contains this script;
# every repository path checked below is built from it.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# Summary JSON inspected by the artifact check. Export ARTIFACT_SUMMARY_PATH
# before running to use a different file; otherwise the dated default below is used.
ARTIFACT_SUMMARY_PATH="${ARTIFACT_SUMMARY_PATH:-$ROOT_DIR/artifacts/v3-governance-live/20260608_102323/99-summary.json}"
# Print "FAIL: <message>" on stderr and terminate the script with status 1.
# All arguments are joined into a single message.
fail() {
  printf 'FAIL: %s\n' "$*" >&2
  exit 1
}
# Print a progress line of the form "==> <message>" on stdout.
# All arguments are joined into a single message.
log() {
  printf '==> %s\n' "$*"
}
# Abort via fail unless the given path exists and is a regular file.
#   $1 - path to check
require_file() {
  local candidate="$1"
  if [[ ! -f "$candidate" ]]; then
    fail "missing required file: $candidate"
  fi
}
# Abort via fail unless the file contains the given text as a literal
# (fixed-string) match on at least one line.
#   $1 - path of the file to search
#   $2 - literal text that must appear in the file
require_contains() {
  local path="$1"
  local needle="$2"
  # Report a missing file as such, rather than as "missing expected text".
  [[ -e "$path" ]] || fail "missing required file: $path"
  # "--" ends option parsing so a needle that starts with "-" is searched for
  # literally instead of being interpreted as a grep option.
  grep -qF -- "$needle" "$path" || fail "missing expected text in $path: $needle"
}
# Gate step 1: every document, rule file and artifact the gate relies on must exist.
log "checking V3-2 source-of-truth files"
required_paths=(
  "$ROOT_DIR/docs/2026-06-04-SLO_AND_OBSERVABILITY.md"
  "$ROOT_DIR/docs/2026-06-04-KEY_ACCOUNT_GOVERNANCE.md"
  "$ROOT_DIR/docs/EXECUTION_BOARD.md"
  "$ROOT_DIR/deploy/monitoring/prometheus-rules.yml"
  "$ARTIFACT_SUMMARY_PATH"
)
for required_path in "${required_paths[@]}"; do
  require_file "$required_path"
done

# Gate step 2: the Go sources must still contain the metric wiring snippets.
log "checking metrics wiring truth"
metrics_go="$ROOT_DIR/internal/metrics/metrics.go"
for expected_snippet in \
  'Name: "user_key_operations_total"' \
  'Name: "user_key_chat_requests_total"' \
  'statusLabel := strconv.Itoa(status)'; do
  require_contains "$metrics_go" "$expected_snippet"
done
require_contains "$ROOT_DIR/internal/app/route_resolve_api.go" 'decisionStatus = "failover"'
key_service_go="$ROOT_DIR/internal/app/key_self_service_svc.go"
for expected_snippet in \
  'recordUserKeyFailure("create", "resolve_host_error"' \
  'recordUserKeyFailure("delete", "get_key_error"'; do
  require_contains "$key_service_go" "$expected_snippet"
done

# Gate step 3: the alert rules must reference the same metric selectors.
log "checking alert rule alignment"
rules_yml="$ROOT_DIR/deploy/monitoring/prometheus-rules.yml"
for expected_snippet in \
  'user_key_chat_requests_total{result="ok"}' \
  'user_key_operations_total{operation="create",result!~"success|rate_limited"}' \
  'route_decisions_total{status="failover"}' \
  'http_requests_total{status=~"4..|5.."}'; do
  require_contains "$rules_yml" "$expected_snippet"
done

# Gate step 4: the recorded summary must show the expected HTTP status for each
# lifecycle step, and the paused chat response body must mention key_paused.
log "checking live governance artifact"
python3 - "$ARTIFACT_SUMMARY_PATH" <<'PY'
import json, sys
from pathlib import Path
p = Path(sys.argv[1])
obj = json.loads(p.read_text())
checks = {
    'create_http': 201,
    'chat_before_http': 200,
    'pause_http': 200,
    'get_paused_http': 200,
    'chat_paused_http': 403,
    'resume_http': 200,
    'get_resumed_http': 200,
    'chat_resumed_http': 200,
    'delete_http': 200,
}
for key, want in checks.items():
    got = obj.get(key)
    if got != want:
        raise SystemExit(f'{key}={got!r}, want {want!r}')
paused_body = obj.get('chat_paused_body', '')
if 'key_paused' not in paused_body:
    raise SystemExit('chat_paused_body missing key_paused evidence')
print(json.dumps({'artifact': str(p), 'checks': checks, 'paused_error': 'key_paused'}, ensure_ascii=False, indent=2))
PY

# Gate step 5: the execution board must record the V3-2 closure statements.
log "checking docs mention V3-2 closure state"
execution_board="$ROOT_DIR/docs/EXECUTION_BOARD.md"
for expected_snippet in \
  'V3-2 SLO / 观测最小闭环2026-06-08 首批)' \
  '失败路径细化、告警规则、发布门禁均已落地'; do
  require_contains "$execution_board" "$expected_snippet"
done

echo 'PASS: V3-2 SLO release gate verified'