feat: bootstrap supply intelligence baseline

This commit is contained in:
Your Name
2026-05-07 10:16:46 +08:00
commit afdbea6fb5
62 changed files with 9170 additions and 0 deletions

6
.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
bin/
.coverprofile
coverage.out
*.log
*.tmp
.DS_Store

21
README.md Normal file
View File

@@ -0,0 +1,21 @@
# supply-intelligence
Supply-Intelligence 项目代码仓。
当前阶段目标:先实现首个最小生产闭环:
1. 账号探针与状态写回
2. 模型发现与 candidate 闭环
3. 准入测试与 draft package 生成
4. package 发布与 gateway package event + ack
实现真源文档:
- `tech/CURRENT_SOURCE_OF_TRUTH_2026-05.md`
- `tech/BASELINE_TECHLEAD_V2.md`
- `tech/GATEWAY_CONSUMER_DECISION_2026-05.md`
- `tech/TEST_DESIGN.md`
- `tech/IMPLEMENTATION_TASK_BOARD_V1_2026-05.md`
运行约束:
- 首期不把 Redis / Temporal / WebSocket / 向量数据库作为硬前置
- 首期不做深自动注册主路径
- 首期默认 package 发布链路采用 event + ack

View File

@@ -0,0 +1,31 @@
package main
import (
"context"
"log"
"net/http"
"time"
"supply-intelligence/internal/app"
"supply-intelligence/internal/domain"
)
// main builds the application, seeds one routing state for account 1, starts
// the background gateway runtime and serves the HTTP API on :8080.
func main() {
	application := app.New()

	// Seed a single active routing state so the process starts with data.
	application.Repo.UpsertRoutingState(domain.AccountRoutingState{
		AccountID:      1,
		Platform:       "openai",
		AccountStatus:  domain.AccountStatusActive,
		RoutingEnabled: true,
		RiskScore:      10,
		ReasonCode:     "ok",
		LastProbeAt:    time.Now().UTC(),
		Version:        1,
	})

	application.StartBackground(context.Background())

	// An explicit http.Server lets us bound how long a client may take to send
	// its request headers; the package-level ListenAndServe has no timeouts.
	server := &http.Server{
		Addr:              ":8080",
		Handler:           application.Server.Routes(),
		ReadHeaderTimeout: 10 * time.Second,
	}

	log.Println("supply-intelligence listening on :8080")
	err := server.ListenAndServe() // always returns a non-nil error

	// log.Fatal terminates through os.Exit, which skips deferred calls, so the
	// background runtime has to be stopped explicitly before exiting.
	application.StopBackground()
	log.Fatal(err)
}

View File

@@ -0,0 +1,20 @@
# Gateway poller runtime boundary
Current repository stage:
- package publish writes a pending gateway event
- gateway consumer can poll, apply, and ack that event
- the repository implementation in this repo is still in-memory only
This means:
- `published` means the upstream package event was recorded
- `applied` / `failed` mean the in-process consumer flow handled the event during the current process lifetime
- this repo does not yet claim durable database persistence for gateway event ack state
Runtime shape in the current repo:
- HTTP debug/manual endpoint: `POST /internal/supply-intelligence/gateway/consume-once`
- background runtime path: application startup wires a minimal ticker-driven poller loop
Non-goals for the current stage:
- no MQ / Redis / external scheduler
- no claim that a full durable publish state machine is complete
- no claim that in-memory ack state survives restart

5
go.mod Normal file
View File

@@ -0,0 +1,5 @@
module supply-intelligence
go 1.22.2
require github.com/google/uuid v1.6.0 // indirect

2
go.sum Normal file
View File

@@ -0,0 +1,2 @@
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=

View File

@@ -0,0 +1,27 @@
package admission
import "context"
// CandidateRepository is the persistence contract the admission service uses
// to read candidates and write back their admission status.
type CandidateRepository interface {
// GetCandidateByIDContext returns the candidate with the given ID; the bool
// is false when no such candidate exists.
GetCandidateByIDContext(ctx context.Context, candidateID string) (Candidate, bool)
// UpdateCandidateStatus stores a new status for the candidate together with
// a failure code and summary (both empty when there is nothing to record).
UpdateCandidateStatus(ctx context.Context, candidateID string, status CandidateStatus, failureCode, failureSummary string) error
// ListCandidatesByStatus returns the candidates selected by the given status.
ListCandidatesByStatus(ctx context.Context, status CandidateStatus) []Candidate
}
// SupplyPackageRepository is the persistence contract for supply packages
// that the admission service creates once a candidate passes.
type SupplyPackageRepository interface {
// UpsertDraftPackage creates or updates the draft package for the
// platform/model pair and returns its package ID.
UpsertDraftPackage(ctx context.Context, platform, model string, source string) (packageID int64, err error)
// GetDraftPackage looks up the package for the platform/model pair; the
// bool is false when none exists.
GetDraftPackage(ctx context.Context, platform, model string) (DraftPackage, bool)
}
// DraftPackage represents a draft supply package created after admission passes.
// It is keyed by the Platform/Model pair in the repository interface above.
type DraftPackage struct {
PackageID int64 `json:"package_id"`
Platform string `json:"platform"`
Model string `json:"model"`
Status string `json:"status"` // one of: draft, active, deprecated
Source string `json:"source"`
// CreatedAt is carried as a string; its format is not fixed by this file.
CreatedAt string `json:"created_at"`
Version int64 `json:"version"`
}

View File

@@ -0,0 +1,131 @@
package admission
import (
"bytes"
"context"
"io"
"net/http"
"time"
)
// HTTPTestRunner implements TestRunner by making real HTTP requests.
type HTTPTestRunner struct {
// client performs the outbound requests.
client *http.Client
// now is a clock hook set by the constructor.
// NOTE(review): Run does not read it in the visible code — confirm it is still needed.
now func() time.Time
}
// NewHTTPTestRunner returns a runner backed by its own HTTP client with a
// 60-second overall request timeout and a clock that reports UTC time.
func NewHTTPTestRunner() *HTTPTestRunner {
	utcClock := func() time.Time { return time.Now().UTC() }
	httpClient := &http.Client{Timeout: 60 * time.Second}
	return &HTTPTestRunner{client: httpClient, now: utcClock}
}
// Run performs the HTTP request described by tc and reports the outcome.
//
// The request is bound to ctx. Headers from tc are applied first, and
// Content-Type defaults to application/json when the case does not set one.
// A result passes when the response status is in the 2xx range. Request
// construction and transport failures are reported through the Error field;
// latency is measured around the client call only.
func (r *HTTPTestRunner) Run(ctx context.Context, tc TestCase) TestCaseResult {
	// Leave the reader nil for body-less cases so no payload is sent.
	var payload io.Reader
	if len(tc.Body) > 0 {
		payload = bytes.NewBufferString(tc.Body)
	}

	request, err := http.NewRequestWithContext(ctx, tc.Method, tc.Endpoint, payload)
	if err != nil {
		return TestCaseResult{Error: err.Error()}
	}

	for name, value := range tc.Headers {
		request.Header.Set(name, value)
	}
	if request.Header.Get("Content-Type") == "" {
		request.Header.Set("Content-Type", "application/json")
	}

	began := time.Now()
	response, err := r.client.Do(request)
	elapsedMs := int(time.Since(began).Milliseconds())
	if err != nil {
		return TestCaseResult{Error: err.Error(), LatencyMs: elapsedMs}
	}
	defer response.Body.Close()

	// Only the first 4KB is read; a read error simply yields a shorter length.
	sample, _ := io.ReadAll(io.LimitReader(response.Body, 4096))

	outcome := TestCaseResult{
		StatusCode:  response.StatusCode,
		LatencyMs:   elapsedMs,
		ResponseLen: len(sample),
		Error:       "",
	}
	outcome.Passed = response.StatusCode >= 200 && response.StatusCode < 300
	return outcome
}
// BuildTestSuiteForPlatform returns the standard admission suite for the
// given platform. "openai" and "anthropic" have built-in suites; any other
// platform yields a suite with that platform name and an empty case list.
func BuildTestSuiteForPlatform(platform, baseURL, apiKey string) TestSuite {
	if platform == "openai" {
		return buildOpenAITestSuite(baseURL, apiKey)
	}
	if platform == "anthropic" {
		return buildAnthropicTestSuite(baseURL, apiKey)
	}
	return TestSuite{Platform: platform, Cases: []TestCase{}}
}
// buildOpenAITestSuite assembles the OpenAI suite: a models listing and a
// minimal chat completion, both authenticated with a bearer token.
// An empty baseURL falls back to https://api.openai.com.
func buildOpenAITestSuite(baseURL, apiKey string) TestSuite {
	root := baseURL
	if root == "" {
		root = "https://api.openai.com"
	}
	bearer := "Bearer " + apiKey

	listModels := TestCase{
		ID:          "openai-models-list",
		Name:        "List Models",
		Endpoint:    root + "/v1/models",
		Method:      http.MethodGet,
		Headers:     map[string]string{"Authorization": bearer},
		TimeoutSecs: 30,
	}
	chatCompletion := TestCase{
		ID:          "openai-chat-completion",
		Name:        "Chat Completion",
		Endpoint:    root + "/v1/chat/completions",
		Method:      http.MethodPost,
		Headers:     map[string]string{"Authorization": bearer, "Content-Type": "application/json"},
		Body:        `{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hello"}],"max_tokens":10}`,
		TimeoutSecs: 30,
	}

	return TestSuite{Platform: "openai", Cases: []TestCase{listModels, chatCompletion}}
}
// buildAnthropicTestSuite assembles the Anthropic suite: a single minimal
// messages request authenticated through the x-api-key header.
// An empty baseURL falls back to https://api.anthropic.com.
func buildAnthropicTestSuite(baseURL, apiKey string) TestSuite {
	root := baseURL
	if root == "" {
		root = "https://api.anthropic.com"
	}

	messages := TestCase{
		ID:       "anthropic-messages",
		Name:     "Claude Messages",
		Endpoint: root + "/v1/messages",
		Method:   http.MethodPost,
		Headers: map[string]string{
			"x-api-key":         apiKey,
			"anthropic-version": "2023-06-01",
			"Content-Type":      "application/json",
		},
		Body:        `{"model":"claude-3-5-haiku-20241022","messages":[{"role":"user","content":"hello"}],"max_tokens":10}`,
		TimeoutSecs: 30,
	}

	return TestSuite{Platform: "anthropic", Cases: []TestCase{messages}}
}

View File

@@ -0,0 +1,169 @@
package admission
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
)
func TestHTTPTestRunner_Run_Success(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write([]byte(`{"id":"model-1"}`))
}))
defer server.Close()
runner := NewHTTPTestRunner()
result := runner.Run(context.Background(), TestCase{
ID: "test-1",
Name: "Test Case",
Endpoint: server.URL,
Method: http.MethodGet,
TimeoutSecs: 30,
})
if !result.Passed {
t.Fatalf("expected pass, got failed: status=%d", result.StatusCode)
}
if result.StatusCode != http.StatusOK {
t.Fatalf("expected 200, got: %d", result.StatusCode)
}
if result.LatencyMs < 0 {
t.Fatalf("expected latency >= 0, got: %d", result.LatencyMs)
}
}
func TestHTTPTestRunner_Run_Non2xx_Fails(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
}))
defer server.Close()
runner := NewHTTPTestRunner()
result := runner.Run(context.Background(), TestCase{
ID: "test-2",
Name: "Test 500",
Endpoint: server.URL,
Method: http.MethodGet,
TimeoutSecs: 30,
})
if result.Passed {
t.Fatal("expected failure for 500")
}
if result.StatusCode != http.StatusInternalServerError {
t.Fatalf("expected 500, got: %d", result.StatusCode)
}
}
// TestHTTPTestRunner_Run_Timeout checks that a context deadline shorter than
// the server's response time surfaces as an error on the result.
func TestHTTPTestRunner_Run_Timeout(t *testing.T) {
	slow := func(w http.ResponseWriter, _ *http.Request) {
		time.Sleep(500 * time.Millisecond)
		w.WriteHeader(http.StatusOK)
	}
	srv := httptest.NewServer(http.HandlerFunc(slow))
	defer srv.Close()

	runner := NewHTTPTestRunner()
	shortCtx, release := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer release()

	tc := TestCase{
		ID:          "test-3",
		Name:        "Test Timeout",
		Endpoint:    srv.URL,
		Method:      http.MethodGet,
		TimeoutSecs: 1, // the 50ms context deadline is what actually fires
	}
	if got := runner.Run(shortCtx, tc); got.Error == "" {
		t.Fatal("expected error on timeout")
	}
}
// TestHTTPTestRunner_Run_ContextCanceled checks that running with an already
// cancelled context yields an error result.
func TestHTTPTestRunner_Run_ContextCanceled(t *testing.T) {
	verySlow := func(w http.ResponseWriter, _ *http.Request) {
		time.Sleep(5 * time.Second)
		w.WriteHeader(http.StatusOK)
	}
	srv := httptest.NewServer(http.HandlerFunc(verySlow))
	defer srv.Close()

	runner := NewHTTPTestRunner()
	deadCtx, abort := context.WithCancel(context.Background())
	abort() // cancelled before the request is issued

	tc := TestCase{
		ID:          "test-4",
		Name:        "Test Cancel",
		Endpoint:    srv.URL,
		Method:      http.MethodGet,
		TimeoutSecs: 30,
	}
	if got := runner.Run(deadCtx, tc); got.Error == "" {
		t.Fatal("expected error on context cancel")
	}
}
// TestBuildTestSuiteForPlatform_OpenAI checks the OpenAI suite's platform
// name, that it is non-empty, and that its first case is a GET.
func TestBuildTestSuiteForPlatform_OpenAI(t *testing.T) {
	got := BuildTestSuiteForPlatform("openai", "https://api.openai.com", "sk-test")

	if name := got.Platform; name != "openai" {
		t.Fatalf("expected openai, got: %s", name)
	}
	if len(got.Cases) == 0 {
		t.Fatal("expected at least 1 test case")
	}
	if method := got.Cases[0].Method; method != http.MethodGet {
		t.Fatalf("expected GET for models list, got: %s", method)
	}
}
// TestBuildTestSuiteForPlatform_Anthropic checks the Anthropic suite's
// platform name and that it contains at least one case.
func TestBuildTestSuiteForPlatform_Anthropic(t *testing.T) {
	got := BuildTestSuiteForPlatform("anthropic", "https://api.anthropic.com", "sk-ant-test")

	if name := got.Platform; name != "anthropic" {
		t.Fatalf("expected anthropic, got: %s", name)
	}
	if len(got.Cases) == 0 {
		t.Fatal("expected at least 1 test case")
	}
}
// TestBuildTestSuiteForPlatform_Unknown checks that an unrecognised platform
// produces a suite without any cases.
func TestBuildTestSuiteForPlatform_Unknown(t *testing.T) {
	got := BuildTestSuiteForPlatform("unknown", "", "")
	if count := len(got.Cases); count != 0 {
		t.Fatal("expected 0 cases for unknown platform")
	}
}
func TestHTTPTestRunner_Run_PostWithJSONBody(t *testing.T) {
var receivedBody string
var receivedContentType string
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
receivedContentType = r.Header.Get("Content-Type")
body := make([]byte, 1024)
n, _ := r.Body.Read(body)
receivedBody = string(body[:n])
w.WriteHeader(http.StatusOK)
}))
defer server.Close()
runner := NewHTTPTestRunner()
result := runner.Run(context.Background(), TestCase{
ID: "test-post",
Name: "POST JSON",
Endpoint: server.URL,
Method: http.MethodPost,
Headers: map[string]string{"Authorization": "Bearer token"},
Body: `{"model":"gpt-4","messages":[{"role":"user","content":"hi"}]}`,
TimeoutSecs: 30,
})
if !result.Passed {
t.Fatalf("expected pass: %+v", result)
}
if receivedContentType != "application/json" {
t.Fatalf("expected application/json, got: %s", receivedContentType)
}
_ = receivedBody // validated via status code pass check
}

View File

@@ -0,0 +1,166 @@
package admission
import (
	"context"
	"errors"
	"strconv"
	"strings"
	"time"
)
// Sentinel errors of the admission package; callers compare with errors.Is.
var (
// ErrCandidateNotFound is returned when the repository has no candidate with the requested ID.
ErrCandidateNotFound = errors.New("candidate not found")
// ErrInvalidCandidateID is returned when the candidate ID is empty.
ErrInvalidCandidateID = errors.New("invalid candidate id")
// ErrTestTimeout describes an admission test that timed out.
// NOTE(review): not returned anywhere in the visible code — confirm usage.
ErrTestTimeout = errors.New("admission test timed out")
// ErrCandidateNotRunnable is returned when the candidate is not in pending_admission.
ErrCandidateNotRunnable = errors.New("candidate not in runnable state")
)
// TestRunner executes a single test case and reports its outcome.
// Failures are conveyed through the returned TestCaseResult, not an error.
type TestRunner interface {
Run(ctx context.Context, tc TestCase) TestCaseResult
}
// TestCaseResult is the outcome of a single test case execution.
type TestCaseResult struct {
// Passed reports whether the case succeeded.
Passed bool
// StatusCode is the HTTP status of the response; zero when no response was received.
StatusCode int
// LatencyMs is the measured request duration in milliseconds.
LatencyMs int
// Error holds the failure message when the request could not be completed; empty otherwise.
Error string
// ResponseLen is the number of response body bytes that were read.
ResponseLen int
}
// Service orchestrates the admission testing workflow: it loads a candidate,
// runs the platform's test suite and persists the resulting status.
type Service struct {
candidateRepo CandidateRepository
packageRepo SupplyPackageRepository
testSuites map[string]TestSuite // key = platform
runner TestRunner
// now supplies the timestamp recorded on test results.
now func() time.Time
}
// NewService wires an admission Service from its repositories, the available
// test suites and the runner that executes individual cases. Suites are
// indexed by platform; when two suites share a platform the later one wins.
func NewService(candidateRepo CandidateRepository, packageRepo SupplyPackageRepository, suites []TestSuite, runner TestRunner) *Service {
	svc := &Service{
		candidateRepo: candidateRepo,
		packageRepo:   packageRepo,
		testSuites:    make(map[string]TestSuite),
		runner:        runner,
		now:           func() time.Time { return time.Now().UTC() },
	}
	for i := range suites {
		svc.testSuites[suites[i].Platform] = suites[i]
	}
	return svc
}
// RunAdmission executes the full admission test for a candidate and persists
// the resulting status.
//
// It returns ErrInvalidCandidateID for an empty ID, ErrCandidateNotFound when
// the repository has no such candidate, and ErrCandidateNotRunnable when the
// candidate is not in pending_admission. Any error from persisting the new
// status is returned to the caller, so a returned TestResult always reflects
// a status that was actually written.
//
// Outcomes:
//   - no suite registered for the platform: the candidate is admitted without
//     running any case;
//   - at least one failing case: the candidate is rejected, recording the code
//     and summary of the first failure;
//   - all cases pass: a draft package is upserted and the candidate admitted.
//     If the draft upsert fails the candidate is still admitted, with
//     "draft_generation_failed" and the error text stored on the candidate.
func (s *Service) RunAdmission(ctx context.Context, candidateID string) (*TestResult, error) {
	if candidateID == "" {
		return nil, ErrInvalidCandidateID
	}

	candidate, ok := s.candidateRepo.GetCandidateByIDContext(ctx, candidateID)
	if !ok {
		return nil, ErrCandidateNotFound
	}

	// Candidate must be in pending_admission state to run.
	if candidate.Status != CandidateStatusPendingAdmission {
		return nil, ErrCandidateNotRunnable
	}

	suite, ok := s.testSuites[candidate.Platform]
	if !ok {
		// No test suite for this platform — auto-pass (no known test cases).
		if err := s.candidateRepo.UpdateCandidateStatus(ctx, candidateID, CandidateStatusAdmitted, "", ""); err != nil {
			return nil, err
		}
		return &TestResult{
			CandidateID: candidateID,
			Status:      CandidateStatusAdmitted,
			TestedAt:    s.now(),
			Passed:      true,
		}, nil
	}

	// Execute every case; only the first failure is classified and summarised.
	var (
		failed         bool
		failureCode    string
		failureSummary string
	)
	for _, tc := range suite.Cases {
		// A non-positive TimeoutSecs would produce an already-expired context
		// and fail the case before it runs, so only bound the case when a
		// positive timeout is configured.
		caseCtx, cancel := ctx, context.CancelFunc(func() {})
		if tc.TimeoutSecs > 0 {
			caseCtx, cancel = context.WithTimeout(ctx, time.Duration(tc.TimeoutSecs)*time.Second)
		}
		result := s.runner.Run(caseCtx, tc)
		cancel()

		if result.Passed {
			continue
		}
		failed = true
		if failureCode == "" {
			failureCode = classifyFailure(result, tc)
			failureSummary = formatFailure(result, tc)
		}
	}

	testedAt := s.now()

	if failed {
		if err := s.candidateRepo.UpdateCandidateStatus(ctx, candidateID, CandidateStatusRejected, failureCode, failureSummary); err != nil {
			return nil, err
		}
		return &TestResult{
			CandidateID:    candidateID,
			Status:         CandidateStatusRejected,
			TestedAt:       testedAt,
			FailureCode:    failureCode,
			FailureSummary: failureSummary,
			Passed:         false,
		}, nil
	}

	// All cases passed — generate the draft package. A draft failure does not
	// block admission; it is recorded on the candidate instead.
	recordCode, recordSummary := "", ""
	if _, err := s.packageRepo.UpsertDraftPackage(ctx, candidate.Platform, candidate.Model, candidate.Source); err != nil {
		recordCode = "draft_generation_failed"
		recordSummary = err.Error()
	}
	if err := s.candidateRepo.UpdateCandidateStatus(ctx, candidateID, CandidateStatusAdmitted, recordCode, recordSummary); err != nil {
		return nil, err
	}

	return &TestResult{
		CandidateID: candidateID,
		Status:      CandidateStatusAdmitted,
		TestedAt:    testedAt,
		Passed:      true,
	}, nil
}
// classifyFailure maps a failed test case result to a failure code:
//
//   - "timeout"          the error text reports an exceeded context deadline
//   - "execution_error"  any other request-level error
//   - "upstream_error"   HTTP status >= 500
//   - "client_error"     HTTP status in 400..499
//   - "unknown_failure"  anything else
//
// tc is accepted for signature symmetry with formatFailure and is not used.
func classifyFailure(result TestCaseResult, tc TestCase) string {
	if result.Error != "" {
		// The HTTP client wraps the context error (e.g. `Get "<url>": context
		// deadline exceeded`), so match on a substring, not the whole text.
		if strings.Contains(result.Error, context.DeadlineExceeded.Error()) {
			return "timeout"
		}
		return "execution_error"
	}

	switch {
	case result.StatusCode >= 500:
		return "upstream_error"
	case result.StatusCode >= 400:
		return "client_error"
	default:
		return "unknown_failure"
	}
}
// formatFailure builds a human-readable failure summary of the form
// "<case name>: <error>" when the request itself failed, or
// "<case name>: status=<code>" when a response was received.
func formatFailure(result TestCaseResult, tc TestCase) string {
	if result.Error != "" {
		return tc.Name + ": " + result.Error
	}
	// strconv.Itoa renders the decimal code; string(rune(code)) would instead
	// produce the Unicode character with that code point.
	return tc.Name + ": status=" + strconv.Itoa(result.StatusCode)
}
// GetRunnableCandidates lists the candidates that are waiting for admission
// testing, i.e. those whose status is pending_admission.
func (s *Service) GetRunnableCandidates(ctx context.Context) []Candidate {
	pending := s.candidateRepo.ListCandidatesByStatus(ctx, CandidateStatusPendingAdmission)
	return pending
}

View File

@@ -0,0 +1,201 @@
package admission
import (
"context"
"errors"
"testing"
"time"
)
// mockCandidateRepo is an in-memory CandidateRepository for tests, keyed by
// candidate ID.
type mockCandidateRepo struct {
	candidates map[string]Candidate
}

// GetCandidateByIDContext returns the stored candidate and whether it exists.
func (r *mockCandidateRepo) GetCandidateByIDContext(ctx context.Context, candidateID string) (Candidate, bool) {
	found, exists := r.candidates[candidateID]
	return found, exists
}

// UpdateCandidateStatus overwrites status, reason code and update time of a
// known candidate. Unknown IDs are ignored and failureSummary is not stored.
func (r *mockCandidateRepo) UpdateCandidateStatus(ctx context.Context, candidateID string, status CandidateStatus, failureCode, failureSummary string) error {
	current, exists := r.candidates[candidateID]
	if !exists {
		return nil
	}
	current.Status = status
	current.ReasonCode = failureCode
	current.UpdatedAt = time.Now().UTC()
	r.candidates[candidateID] = current
	return nil
}

// ListCandidatesByStatus returns the candidates in the given status; an
// empty status matches every candidate.
func (r *mockCandidateRepo) ListCandidatesByStatus(ctx context.Context, status CandidateStatus) []Candidate {
	var matched []Candidate
	for id := range r.candidates {
		entry := r.candidates[id]
		if status != "" && entry.Status != status {
			continue
		}
		matched = append(matched, entry)
	}
	return matched
}
// mockPackageRepo is an in-memory SupplyPackageRepository for tests. Drafts
// are keyed by "platform/model" and IDs come from an incrementing counter.
type mockPackageRepo struct {
	drafts map[string]DraftPackage
	nextID int64
}

// UpsertDraftPackage stores a fresh draft under the platform/model key,
// assigning the next ID on every call, and never fails.
func (r *mockPackageRepo) UpsertDraftPackage(ctx context.Context, platform, model, source string) (int64, error) {
	r.nextID++
	assigned := r.nextID
	key := platform + "/" + model
	r.drafts[key] = DraftPackage{
		PackageID: assigned,
		Platform:  platform,
		Model:     model,
		Status:    "draft",
		Source:    source,
	}
	return assigned, nil
}

// GetDraftPackage returns the draft stored for the platform/model key.
func (r *mockPackageRepo) GetDraftPackage(ctx context.Context, platform, model string) (DraftPackage, bool) {
	key := platform + "/" + model
	draft, exists := r.drafts[key]
	return draft, exists
}
// mockTestRunner is a TestRunner whose outcomes are scripted per test case ID.
type mockTestRunner struct {
	results map[string]TestCaseResult
}

// Run returns the scripted result for tc.ID, or a passing 200 result when
// nothing is scripted for that case.
func (r *mockTestRunner) Run(ctx context.Context, tc TestCase) TestCaseResult {
	scripted, exists := r.results[tc.ID]
	if !exists {
		return TestCaseResult{Passed: true, StatusCode: 200}
	}
	return scripted
}
func TestRunAdmission_PassesAllCases(t *testing.T) {
candidateRepo := &mockCandidateRepo{candidates: map[string]Candidate{
"cand-1": {CandidateID: "cand-1", Platform: "openai", Model: "gpt-4", Status: CandidateStatusPendingAdmission},
}}
packageRepo := &mockPackageRepo{drafts: map[string]DraftPackage{}}
runner := &mockTestRunner{results: map[string]TestCaseResult{}}
suites := []TestSuite{{
Platform: "openai",
Cases: []TestCase{
{ID: "case-1", Name: "models endpoint", Endpoint: "/v1/models", Method: "GET", TimeoutSecs: 30},
},
}}
svc := NewService(candidateRepo, packageRepo, suites, runner)
result, err := svc.RunAdmission(context.Background(), "cand-1")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if !result.Passed {
t.Fatalf("expected pass, got failed: %+v", result)
}
if result.Status != CandidateStatusAdmitted {
t.Fatalf("expected admitted status, got: %s", result.Status)
}
if len(packageRepo.drafts) != 1 {
t.Fatalf("expected 1 draft package, got %d", len(packageRepo.drafts))
}
}
func TestRunAdmission_FailsOneCase(t *testing.T) {
candidateRepo := &mockCandidateRepo{candidates: map[string]Candidate{
"cand-2": {CandidateID: "cand-2", Platform: "openai", Model: "gpt-4", Status: CandidateStatusPendingAdmission},
}}
packageRepo := &mockPackageRepo{drafts: map[string]DraftPackage{}}
runner := &mockTestRunner{results: map[string]TestCaseResult{
"case-1": {Passed: false, StatusCode: 500, Error: ""},
}}
suites := []TestSuite{{
Platform: "openai",
Cases: []TestCase{
{ID: "case-1", Name: "models endpoint", Endpoint: "/v1/models", Method: "GET", TimeoutSecs: 30},
},
}}
svc := NewService(candidateRepo, packageRepo, suites, runner)
result, err := svc.RunAdmission(context.Background(), "cand-2")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if result.Passed {
t.Fatalf("expected failure, got pass")
}
if result.Status != CandidateStatusRejected {
t.Fatalf("expected rejected status, got: %s", result.Status)
}
if result.FailureCode == "" {
t.Fatalf("expected failure code to be set")
}
if len(packageRepo.drafts) != 0 {
t.Fatalf("expected 0 draft packages on failure, got %d", len(packageRepo.drafts))
}
}
func TestRunAdmission_CandidateNotFound(t *testing.T) {
candidateRepo := &mockCandidateRepo{candidates: map[string]Candidate{}}
packageRepo := &mockPackageRepo{drafts: map[string]DraftPackage{}}
runner := &mockTestRunner{results: map[string]TestCaseResult{}}
svc := NewService(candidateRepo, packageRepo, []TestSuite{}, runner)
_, err := svc.RunAdmission(context.Background(), "nonexistent")
if !errors.Is(err, ErrCandidateNotFound) {
t.Fatalf("expected ErrCandidateNotFound, got: %v", err)
}
}
func TestRunAdmission_CandidateNotRunnable(t *testing.T) {
candidateRepo := &mockCandidateRepo{candidates: map[string]Candidate{
"cand-3": {CandidateID: "cand-3", Platform: "openai", Model: "gpt-4", Status: CandidateStatusAdmitted},
}}
packageRepo := &mockPackageRepo{drafts: map[string]DraftPackage{}}
runner := &mockTestRunner{results: map[string]TestCaseResult{}}
svc := NewService(candidateRepo, packageRepo, []TestSuite{}, runner)
_, err := svc.RunAdmission(context.Background(), "cand-3")
if !errors.Is(err, ErrCandidateNotRunnable) {
t.Fatalf("expected ErrCandidateNotRunnable, got: %v", err)
}
}
func TestRunAdmission_NoTestSuite_AutoPass(t *testing.T) {
candidateRepo := &mockCandidateRepo{candidates: map[string]Candidate{
"cand-4": {CandidateID: "cand-4", Platform: "unknown-platform", Model: "some-model", Status: CandidateStatusPendingAdmission},
}}
packageRepo := &mockPackageRepo{drafts: map[string]DraftPackage{}}
runner := &mockTestRunner{results: map[string]TestCaseResult{}}
svc := NewService(candidateRepo, packageRepo, []TestSuite{}, runner) // no suites
result, err := svc.RunAdmission(context.Background(), "cand-4")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if !result.Passed {
t.Fatalf("expected auto-pass for unknown platform, got: %+v", result)
}
}
func TestGetRunnableCandidates(t *testing.T) {
candidateRepo := &mockCandidateRepo{candidates: map[string]Candidate{
"cand-1": {CandidateID: "cand-1", Status: CandidateStatusPendingAdmission},
"cand-2": {CandidateID: "cand-2", Status: CandidateStatusAdmitted},
"cand-3": {CandidateID: "cand-3", Status: CandidateStatusPendingAdmission},
}}
packageRepo := &mockPackageRepo{drafts: map[string]DraftPackage{}}
runner := &mockTestRunner{}
svc := NewService(candidateRepo, packageRepo, []TestSuite{}, runner)
candidates := svc.GetRunnableCandidates(context.Background())
if len(candidates) != 2 {
t.Fatalf("expected 2 pending candidates, got %d", len(candidates))
}
}

View File

@@ -0,0 +1,62 @@
package admission
import "time"
// ProbeClassification mirrors domain.ProbeClassification for internal use.
// It is the string label given to the outcome of a probe.
type ProbeClassification string
// Known probe classifications.
const (
ProbeClassificationSuccess ProbeClassification = "success"
ProbeClassificationExplicitFailure ProbeClassification = "explicit_failure"
ProbeClassificationInconclusive ProbeClassification = "inconclusive"
)
// CandidateStatus mirrors domain.DiscoveryCandidateStatus.
// It is the lifecycle state of a candidate as seen by the admission package.
type CandidateStatus string
// Candidate statuses used by admission: a candidate starts as
// pending_admission and ends as admitted or rejected.
const (
CandidateStatusPendingAdmission CandidateStatus = "pending_admission"
CandidateStatusAdmitted CandidateStatus = "admitted"
CandidateStatusRejected CandidateStatus = "rejected"
)
// Candidate represents a discovered model waiting for admission testing.
type Candidate struct {
CandidateID string `json:"candidate_id"`
AccountID int64 `json:"account_id"`
// Platform selects which test suite is run for the candidate.
Platform string `json:"platform"`
Model string `json:"model"`
Status CandidateStatus `json:"status"`
Source string `json:"source"`
// ReasonCode is omitted from JSON when empty.
ReasonCode string `json:"reason_code,omitempty"`
DiscoveredAt time.Time `json:"discovered_at"`
UpdatedAt time.Time `json:"updated_at"`
Version int64 `json:"version"`
}
// TestResult records the outcome of an admission test run for one candidate.
type TestResult struct {
CandidateID string `json:"candidate_id"`
Status CandidateStatus `json:"status"` // admitted or rejected
TestedAt time.Time `json:"tested_at"`
// FailureCode and FailureSummary describe the failure on a rejected run
// and are omitted from JSON when empty.
FailureCode string `json:"failure_code,omitempty"`
FailureSummary string `json:"failure_summary,omitempty"`
Passed bool `json:"passed"`
}
// TestCase defines a single HTTP test case within an admission test run.
type TestCase struct {
ID string `json:"id"`
// Name is the label used in failure summaries.
Name string `json:"name"`
// Endpoint is the full request URL.
Endpoint string `json:"endpoint"`
Method string `json:"method"`
Headers map[string]string `json:"headers,omitempty"`
// Body is the raw request payload; empty means no body is sent.
Body string `json:"body,omitempty"`
// TimeoutSecs is the per-case timeout in seconds.
TimeoutSecs int `json:"timeout_secs"`
}
// TestSuite is the collection of test cases that is run for one platform.
type TestSuite struct {
Platform string `json:"platform"`
Cases []TestCase `json:"cases"`
}

160
internal/app/app.go Normal file
View File

@@ -0,0 +1,160 @@
package app
import (
"context"
"time"
"supply-intelligence/internal/admission"
"supply-intelligence/internal/discovery"
"supply-intelligence/internal/domain"
"supply-intelligence/internal/gatewayconsumer"
"supply-intelligence/internal/httpapi"
"supply-intelligence/internal/poller"
"supply-intelligence/internal/probe"
"supply-intelligence/internal/publish"
"supply-intelligence/internal/repository"
)
// Application bundles the repository, the domain services, the gateway
// poller runtime and the HTTP server built by New.
type Application struct {
// Repo is the in-memory repository handed to the services built in New.
Repo *repository.MemoryRepository
ProbeService *probe.Service
PublishService *publish.Service
DiscoveryService *discovery.Service
GatewayConsumerService *gatewayconsumer.Service
GatewayPoller *poller.GatewayPackagePoller
// GatewayRuntime is what StartBackground/StopBackground start and stop.
GatewayRuntime *poller.Runtime
AdmissionService *admission.Service
Server *httpapi.Server
}
// New constructs the application around a fresh in-memory repository and
// wires every service, the gateway poller runtime (one-second interval) and
// the HTTP server. Admission is set up with the real HTTP runner and the
// built-in OpenAI and Anthropic suites, created here with empty API keys.
func New() *Application {
	application := &Application{Repo: repository.NewMemoryRepository()}
	store := application.Repo

	application.ProbeService = probe.NewService(store)
	application.PublishService = publish.NewService(store)
	application.DiscoveryService = discovery.NewService(store)
	application.GatewayConsumerService = gatewayconsumer.NewService(store)
	application.GatewayPoller = poller.NewGatewayPackagePoller(application.GatewayConsumerService)
	application.GatewayRuntime = poller.NewRuntime(application.GatewayPoller, time.Second)

	// Admission talks to the memory repository through two thin adapters.
	candidates := &admissionMemoryRepoAdapter{repo: store}
	packages := &admissionSupplyPackageAdapter{repo: store}
	httpRunner := admission.NewHTTPTestRunner()

	// Suites for known platforms (in real use, loaded from config).
	knownSuites := []admission.TestSuite{
		admission.BuildTestSuiteForPlatform("openai", "https://api.openai.com", ""),
		admission.BuildTestSuiteForPlatform("anthropic", "https://api.anthropic.com", ""),
	}
	application.AdmissionService = admission.NewService(candidates, packages, knownSuites, httpRunner)

	application.Server = httpapi.NewServer(
		store,
		application.ProbeService,
		application.PublishService,
		application.GatewayConsumerService,
		application.DiscoveryService,
		application.AdmissionService,
	)
	return application
}
// StartBackground starts the gateway runtime with ctx. It is a no-op on a nil
// application or when no runtime is configured.
func (a *Application) StartBackground(ctx context.Context) {
	if a != nil && a.GatewayRuntime != nil {
		a.GatewayRuntime.Start(ctx)
	}
}
// StopBackground stops the gateway runtime. It is a no-op on a nil
// application or when no runtime is configured.
func (a *Application) StopBackground() {
	if a != nil && a.GatewayRuntime != nil {
		a.GatewayRuntime.Stop()
	}
}
// IsInMemoryGatewayState reports whether the application has a repository
// wired in. Repo is typed as the in-memory repository, so a non-nil Repo is
// what this method treats as in-memory gateway state.
func (a *Application) IsInMemoryGatewayState() bool {
	if a == nil {
		return false
	}
	return a.Repo != nil
}
// --- Adapters that bridge MemoryRepository to the admission repository interfaces ---
// admissionMemoryRepoAdapter adapts MemoryRepository to admission.CandidateRepository
// by converting between domain discovery candidates and admission candidates.
type admissionMemoryRepoAdapter struct {
repo *repository.MemoryRepository
}
// GetCandidateByIDContext fetches a discovery candidate from the repository
// and converts it to the admission representation. The bool is false, and the
// candidate zero-valued, when the ID is unknown.
func (a *admissionMemoryRepoAdapter) GetCandidateByIDContext(ctx context.Context, candidateID string) (admission.Candidate, bool) {
	stored, found := a.repo.GetDiscoveryCandidateByIDContext(ctx, candidateID)
	if found {
		return toAdmissionCandidate(stored), true
	}
	return admission.Candidate{}, false
}
// UpdateCandidateStatus forwards the status change to the repository after
// converting the admission status to its domain counterpart.
func (a *admissionMemoryRepoAdapter) UpdateCandidateStatus(ctx context.Context, candidateID string, status admission.CandidateStatus, failureCode, failureSummary string) error {
	domainStatus := domain.DiscoveryCandidateStatus(status)
	return a.repo.UpdateCandidateStatus(ctx, candidateID, domainStatus, failureCode, failureSummary)
}
// ListCandidatesByStatus lists the repository's discovery candidates for the
// given status and converts each to the admission representation. The result
// is always a non-nil slice.
func (a *admissionMemoryRepoAdapter) ListCandidatesByStatus(ctx context.Context, status admission.CandidateStatus) []admission.Candidate {
	stored := a.repo.ListDiscoveryCandidatesContext(ctx, domain.DiscoveryCandidateStatus(status))
	converted := make([]admission.Candidate, 0, len(stored))
	for _, item := range stored {
		converted = append(converted, toAdmissionCandidate(item))
	}
	return converted
}
// toAdmissionCandidate copies a domain discovery candidate field by field
// into the admission package's Candidate, converting only the status type.
func toAdmissionCandidate(c domain.DiscoveryCandidate) admission.Candidate {
	var out admission.Candidate
	out.CandidateID = c.CandidateID
	out.AccountID = c.AccountID
	out.Platform = c.Platform
	out.Model = c.Model
	out.Status = admission.CandidateStatus(c.Status)
	out.Source = c.Source
	out.ReasonCode = c.ReasonCode
	out.DiscoveredAt = c.DiscoveredAt
	out.UpdatedAt = c.UpdatedAt
	out.Version = c.Version
	return out
}
// admissionSupplyPackageAdapter adapts MemoryRepository to
// admission.SupplyPackageRepository, mapping supply packages to draft packages.
type admissionSupplyPackageAdapter struct {
repo *repository.MemoryRepository
}
// UpsertDraftPackage returns the ID of the package already stored for the
// platform/model pair, whatever its status. Otherwise it stores a new package
// with status "draft" and returns the ID read back from the repository.
// ctx is not forwarded: the repository calls used here take no context.
func (a *admissionSupplyPackageAdapter) UpsertDraftPackage(ctx context.Context, platform, model, source string) (int64, error) {
	if current, found := a.repo.GetSupplyPackage(platform, model); found {
		return current.PackageID, nil
	}

	a.repo.UpsertSupplyPackage(domain.SupplyPackage{
		Platform: platform,
		Model:    model,
		Status:   "draft",
		Source:   source,
	})

	created, found := a.repo.GetSupplyPackage(platform, model)
	if !found {
		// NOTE(review): the package could not be read back, yet the caller
		// gets ID 0 with a nil error — consider returning an error here.
		return 0, nil
	}
	return created.PackageID, nil
}
// GetDraftPackage returns the package stored for the platform/model pair as
// an admission.DraftPackage. Only ID, platform, model, status and source are
// copied; CreatedAt and Version are left at their zero values. The lookup
// does not filter on status.
func (a *admissionSupplyPackageAdapter) GetDraftPackage(ctx context.Context, platform, model string) (admission.DraftPackage, bool) {
	stored, found := a.repo.GetSupplyPackage(platform, model)
	if !found {
		return admission.DraftPackage{}, false
	}

	var draft admission.DraftPackage
	draft.PackageID = stored.PackageID
	draft.Platform = stored.Platform
	draft.Model = stored.Model
	draft.Status = stored.Status
	draft.Source = stored.Source
	return draft, true
}

85
internal/app/app_test.go Normal file
View File

@@ -0,0 +1,85 @@
package app
import (
"context"
"testing"
"time"
"supply-intelligence/internal/domain"
)
// TestNewApplication checks that New returns an application with its
// repository, services, poller, runtime and server populated.
func TestNewApplication(t *testing.T) {
	application := New()
	if application == nil {
		t.Fatalf("expected application")
	}

	// Checked in order; the first missing component fails the test.
	components := []struct {
		missing bool
		message string
	}{
		{application.Repo == nil, "expected repository"},
		{application.ProbeService == nil, "expected probe service"},
		{application.PublishService == nil, "expected publish service"},
		{application.DiscoveryService == nil, "expected discovery service"},
		{application.GatewayConsumerService == nil, "expected gateway consumer service"},
		{application.GatewayPoller == nil, "expected gateway poller"},
		{application.GatewayRuntime == nil, "expected gateway runtime"},
		{application.Server == nil, "expected server"},
	}
	for _, component := range components {
		if component.missing {
			t.Fatalf("%s", component.message)
		}
	}
}
// TestApplicationStartBackgroundPollsEvents appends one pending package event
// and waits up to 1.5s for the background runtime to mark it applied.
func TestApplicationStartBackgroundPollsEvents(t *testing.T) {
	application := New()

	pendingEvent := domain.PackageChangeEvent{
		EventID:           "evt-app-runtime-1",
		EventType:         "supply_package_published",
		PackageID:         11,
		Platform:          "openai",
		Model:             "gpt-4.1-mini",
		OccurredAt:        time.Unix(2, 0).UTC(),
		Version:           1,
		GatewaySyncStatus: domain.GatewaySyncStatusPending,
	}
	application.Repo.AppendPackageEvent(pendingEvent)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	application.StartBackground(ctx)
	defer application.StopBackground()

	// Poll every 20ms until the event is applied or the deadline passes.
	for deadline := time.Now().Add(1500 * time.Millisecond); time.Now().Before(deadline); time.Sleep(20 * time.Millisecond) {
		events, _ := application.Repo.ListPackageEventsAfter("")
		if len(events) == 1 && events[0].GatewaySyncStatus == domain.GatewaySyncStatusApplied {
			return
		}
	}

	events, _ := application.Repo.ListPackageEventsAfter("")
	t.Fatalf("expected background runtime to apply event, got %+v", events)
}
// TestApplicationStartBackgroundHandlesNilRuntime checks that StartBackground
// tolerates a missing runtime and leaves it unset.
func TestApplicationStartBackgroundHandlesNilRuntime(t *testing.T) {
	application := New()
	application.GatewayRuntime = nil

	application.StartBackground(context.Background())

	if runtime := application.GatewayRuntime; runtime != nil {
		t.Fatalf("expected nil runtime guard to keep runtime nil")
	}
}
// TestApplicationReportsInMemoryGatewayState checks that a freshly built
// application reports in-memory gateway state.
func TestApplicationReportsInMemoryGatewayState(t *testing.T) {
	inMemory := New().IsInMemoryGatewayState()
	if !inMemory {
		t.Fatalf("expected in-memory gateway state")
	}
}

150
internal/control/module.go Normal file
View File

@@ -0,0 +1,150 @@
package control
import (
"sync"
"time"
)
// ModuleState represents the lifecycle state of a module.
type ModuleState string
// Lifecycle states: a gate starts active, moves to closing when Close is
// called, and becomes closed through MarkClosed.
const (
ModuleStateActive ModuleState = "active"
ModuleStateClosing ModuleState = "closing"
ModuleStateClosed ModuleState = "closed"
)
// ModuleGate controls the enable/disable/close lifecycle of a module.
// Its fields are guarded by mu.
type ModuleGate struct {
mu sync.RWMutex
// enabled is the configured on/off switch; MarkClosed clears it.
enabled bool
state ModuleState
// closedAt is set by Close to the UTC time of the active-to-closing transition.
closedAt *time.Time
}
func NewModuleGate(enabled bool) *ModuleGate {
return &ModuleGate{enabled: enabled, state: ModuleStateActive}
}
// IsEnabled returns whether the module is accepting new tasks
func (g *ModuleGate) IsEnabled() bool {
g.mu.RLock()
defer g.mu.RUnlock()
return g.enabled && g.state == ModuleStateActive
}
// Close signals the module to stop accepting new tasks
func (g *ModuleGate) Close() {
g.mu.Lock()
defer g.mu.Unlock()
if g.state == ModuleStateActive {
g.state = ModuleStateClosing
now := time.Now().UTC()
g.closedAt = &now
}
}
// MarkClosed marks the module as fully closed (no in-flight tasks)
func (g *ModuleGate) MarkClosed() {
g.mu.Lock()
defer g.mu.Unlock()
g.state = ModuleStateClosed
g.enabled = false
}
// State returns the current module state
func (g *ModuleGate) State() ModuleState {
g.mu.RLock()
defer g.mu.RUnlock()
return g.state
}
// ModuleController groups the gates of the four modules: probes, discovery,
// admission, and publish.
type ModuleController struct {
	probes    *ModuleGate
	discovery *ModuleGate
	admission *ModuleGate
	publish   *ModuleGate
}

// NewModuleController builds a controller whose four gates are all created
// with the same enabled flag.
func NewModuleController(enabled bool) *ModuleController {
	controller := new(ModuleController)
	controller.probes = NewModuleGate(enabled)
	controller.discovery = NewModuleGate(enabled)
	controller.admission = NewModuleGate(enabled)
	controller.publish = NewModuleGate(enabled)
	return controller
}
// ShutdownInitiate calls Close on every gate, in the order probes, discovery,
// admission, publish.
func (c *ModuleController) ShutdownInitiate() {
	for _, gate := range []*ModuleGate{c.probes, c.discovery, c.admission, c.publish} {
		gate.Close()
	}
}
// ShutdownComplete calls MarkClosed on every gate, in the order probes,
// discovery, admission, publish.
func (c *ModuleController) ShutdownComplete() {
	for _, gate := range []*ModuleGate{c.probes, c.discovery, c.admission, c.publish} {
		gate.MarkClosed()
	}
}
// IsInflight reports whether at least one gate is currently in the closing
// state. Gates are checked in the order probes, discovery, admission, publish,
// stopping at the first match.
func (c *ModuleController) IsInflight() bool {
	for _, gate := range []*ModuleGate{c.probes, c.discovery, c.admission, c.publish} {
		if gate.State() == ModuleStateClosing {
			return true
		}
	}
	return false
}
// GetModuleState returns the state of the gate named "probes", "discovery",
// "admission", or "publish". Any other name yields the empty ModuleState.
func (c *ModuleController) GetModuleState(name string) ModuleState {
	var gate *ModuleGate
	switch name {
	case "probes":
		gate = c.probes
	case "discovery":
		gate = c.discovery
	case "admission":
		gate = c.admission
	case "publish":
		gate = c.publish
	default:
		return ""
	}
	return gate.State()
}
// ModuleStatus is the JSON-serialisable view of all four module states.
type ModuleStatus struct {
	Probes    ModuleState `json:"probes"`
	Discovery ModuleState `json:"discovery"`
	Admission ModuleState `json:"admission"`
	Publish   ModuleState `json:"publish"`
}

// Status reads each gate's state in turn and returns them as a ModuleStatus.
// Each gate is locked separately, so the four reads are not one atomic
// snapshot.
func (c *ModuleController) Status() ModuleStatus {
	var snapshot ModuleStatus
	snapshot.Probes = c.probes.State()
	snapshot.Discovery = c.discovery.State()
	snapshot.Admission = c.admission.State()
	snapshot.Publish = c.publish.State()
	return snapshot
}
// RejectIfNotEnabled returns ErrModuleClosed when IsEnabled is false and nil
// otherwise. The moduleName argument is currently not used.
func (g *ModuleGate) RejectIfNotEnabled(moduleName string) error {
	if g.IsEnabled() {
		return nil
	}
	return ErrModuleClosed
}
// ModuleClosedError is the error type reported for a module that is not
// accepting new tasks.
type ModuleClosedError struct{}

// Error implements the error interface with a fixed message.
func (*ModuleClosedError) Error() string {
	return "module is not accepting new tasks"
}

// ErrModuleClosed is the package-level ModuleClosedError instance.
var ErrModuleClosed = &ModuleClosedError{}

View File

@@ -0,0 +1,124 @@
package control
import (
"testing"
"time"
)
// TestModuleGate_IsEnabled checks that a gate created enabled reports enabled.
func TestModuleGate_IsEnabled(t *testing.T) {
	if gate := NewModuleGate(true); !gate.IsEnabled() {
		t.Fatal("expected enabled")
	}
}
// TestModuleGate_IsDisabled checks that a gate created disabled reports
// disabled.
func TestModuleGate_IsDisabled(t *testing.T) {
	if gate := NewModuleGate(false); gate.IsEnabled() {
		t.Fatal("expected disabled")
	}
}
// TestModuleGate_Close checks that Close moves an active gate to closing.
func TestModuleGate_Close(t *testing.T) {
	gate := NewModuleGate(true)
	gate.Close()

	if gate.State() == ModuleStateClosing {
		return
	}
	t.Fatalf("expected closing, got: %s", gate.State())
}
// TestModuleGate_MarkClosed checks that Close followed by MarkClosed leaves
// the gate closed and no longer enabled.
func TestModuleGate_MarkClosed(t *testing.T) {
	gate := NewModuleGate(true)
	gate.Close()
	gate.MarkClosed()

	switch {
	case gate.State() != ModuleStateClosed:
		t.Fatalf("expected closed, got: %s", gate.State())
	case gate.IsEnabled():
		t.Fatal("expected not enabled after closed")
	}
}
// TestModuleGate_RejectIfNotEnabled checks that an enabled gate accepts work
// and that the same gate rejects it once Close has been called.
func TestModuleGate_RejectIfNotEnabled(t *testing.T) {
	gate := NewModuleGate(true)
	if err := gate.RejectIfNotEnabled("test"); err != nil {
		t.Fatal("expected no error when enabled")
	}

	gate.Close()
	if err := gate.RejectIfNotEnabled("test"); err == nil {
		t.Fatal("expected error when closing")
	}
}
// TestModuleController_ShutdownInitiate checks that the probes and discovery
// gates are closing after ShutdownInitiate.
func TestModuleController_ShutdownInitiate(t *testing.T) {
	controller := NewModuleController(true)
	controller.ShutdownInitiate()

	if controller.probes.State() != ModuleStateClosing {
		t.Fatalf("probes should be closing, got: %s", controller.probes.State())
	}
	if controller.discovery.State() != ModuleStateClosing {
		t.Fatalf("discovery should be closing, got: %s", controller.discovery.State())
	}
}
// TestModuleController_ShutdownComplete checks that the probes gate is closed
// after ShutdownInitiate followed by ShutdownComplete.
func TestModuleController_ShutdownComplete(t *testing.T) {
	controller := NewModuleController(true)
	controller.ShutdownInitiate()
	controller.ShutdownComplete()

	if controller.probes.State() == ModuleStateClosed {
		return
	}
	t.Fatalf("probes should be closed, got: %s", controller.probes.State())
}
// TestModuleController_IsInflight checks that IsInflight is true between
// ShutdownInitiate and ShutdownComplete and false afterwards.
func TestModuleController_IsInflight(t *testing.T) {
	controller := NewModuleController(true)

	controller.ShutdownInitiate()
	if inflight := controller.IsInflight(); !inflight {
		t.Fatal("expected in-flight during closing")
	}

	controller.ShutdownComplete()
	if inflight := controller.IsInflight(); inflight {
		t.Fatal("expected not in-flight after closed")
	}
}
// TestModuleController_GetModuleState checks the lookup for a known module
// name and for an unknown one.
func TestModuleController_GetModuleState(t *testing.T) {
	controller := NewModuleController(true)

	if controller.GetModuleState("probes") != ModuleStateActive {
		t.Fatalf("expected active, got: %s", controller.GetModuleState("probes"))
	}
	if unknown := controller.GetModuleState("unknown"); unknown != "" {
		t.Fatalf("expected empty for unknown module")
	}
}
// TestModuleController_Status checks that a new controller reports the probes
// module as active.
func TestModuleController_Status(t *testing.T) {
	snapshot := NewModuleController(true).Status()
	if snapshot.Probes == ModuleStateActive {
		return
	}
	t.Fatalf("expected active, got: %s", snapshot.Probes)
}
// TestModuleGate_ClosedAt verifies that Close records a UTC timestamp when the
// gate enters the closing state, and that a repeated Close keeps the original
// timestamp instead of overwriting it.
func TestModuleGate_ClosedAt(t *testing.T) {
	g := NewModuleGate(true)
	if g.closedAt != nil {
		t.Fatalf("expected nil closedAt before Close, got: %v", g.closedAt)
	}

	g.Close()
	if g.State() != ModuleStateClosing {
		t.Fatal("expected closing state")
	}
	// closedAt should be set when entering closing state
	if g.closedAt == nil {
		t.Fatal("expected closedAt to be set when closing")
	}
	if loc := g.closedAt.Location(); loc != time.UTC {
		t.Fatalf("expected closedAt in UTC, got: %v", loc)
	}

	// Close only acts on an active gate, so a second call must not replace
	// the recorded timestamp.
	recorded := g.closedAt
	g.Close()
	if g.closedAt != recorded {
		t.Fatal("expected repeated Close to keep the original closedAt")
	}
}

View File

@@ -0,0 +1,161 @@
package discovery
import (
"context"
"log"
"time"
"supply-intelligence/internal/integration"
)
// SchedulerTrigger identifies how a discovery run was invoked.
// The values are consecutive integers starting at 0 (iota).
type SchedulerTrigger int
const (
// TriggerManual is the zero value. Presumably an operator-initiated run;
// nothing in this file consumes the trigger values yet.
TriggerManual SchedulerTrigger = iota
// TriggerScheduled presumably marks a periodic run (verify against callers).
TriggerScheduled
// TriggerNewAccount presumably marks a run caused by a newly added account
// (verify against callers).
TriggerNewAccount
)
// SupplierAdapterRegistry maps a platform name to the adapter registered for
// it. The map is accessed without locking.
type SupplierAdapterRegistry struct {
	adapters map[string]integration.SupplierAdapter
}

// NewSupplierAdapterRegistry returns an empty registry.
func NewSupplierAdapterRegistry() *SupplierAdapterRegistry {
	registry := new(SupplierAdapterRegistry)
	registry.adapters = make(map[string]integration.SupplierAdapter)
	return registry
}

// Register stores adapter under the name returned by adapter.Platform(),
// replacing any adapter already stored under that name.
func (r *SupplierAdapterRegistry) Register(adapter integration.SupplierAdapter) {
	r.adapters[adapter.Platform()] = adapter
}

// Get returns the adapter registered for platform and whether one exists.
func (r *SupplierAdapterRegistry) Get(platform string) (integration.SupplierAdapter, bool) {
	found, exists := r.adapters[platform]
	return found, exists
}

// ListPlatforms returns the registered platform names. The order follows map
// iteration and is therefore not stable between calls.
func (r *SupplierAdapterRegistry) ListPlatforms() []string {
	names := make([]string, 0, len(r.adapters))
	for name := range r.adapters {
		names = append(names, name)
	}
	return names
}
// ScanResult holds the outcome of a platform scan
type ScanResult struct {
// Platform is the name of the platform that was scanned.
Platform string
// NewModels counts candidates that RecordCandidate reported as newly created.
NewModels int
// NOTE(review): ScanPlatform never populates RemovedModels in the code shown here.
RemovedModels []string // models that were in candidates but not in supplier list
// Errors collects error messages gathered during the scan, prefixed with
// the failing step ("GetModels: ", "RecordCandidate: ") when set by ScanPlatform.
Errors []string
}
// DiscoveryScheduler runs discovery scans against the adapters held in a
// SupplierAdapterRegistry and records what it finds through a Service.
type DiscoveryScheduler struct {
	service  *Service
	registry *SupplierAdapterRegistry
	now      func() time.Time // clock used to stamp discovered candidates
}

// NewDiscoveryScheduler wires a scheduler to the given service and registry
// and gives it a clock that returns the current time in UTC.
func NewDiscoveryScheduler(service *Service, registry *SupplierAdapterRegistry) *DiscoveryScheduler {
	scheduler := &DiscoveryScheduler{service: service, registry: registry}
	scheduler.now = func() time.Time { return time.Now().UTC() }
	return scheduler
}
// ScanAllPlatforms runs ScanPlatform for every platform in the registry and
// returns one ScanResult per platform that was attempted.
//
// A failure on one platform does not stop the run: it is recorded as a
// ScanResult whose Errors holds the error text, and scanning moves on to the
// next platform.
//
// If ctx is cancelled or past its deadline, the run stops before starting the
// next platform and returns the results gathered so far together with
// ctx.Err(). Otherwise the returned error is nil.
func (s *DiscoveryScheduler) ScanAllPlatforms(ctx context.Context) ([]ScanResult, error) {
	platforms := s.registry.ListPlatforms()
	results := make([]ScanResult, 0, len(platforms))
	for _, platform := range platforms {
		// Do not start further scans once the caller has given up; each one
		// would only fail against the dead context.
		if err := ctx.Err(); err != nil {
			return results, err
		}
		result, err := s.ScanPlatform(ctx, platform)
		if err != nil {
			results = append(results, ScanResult{Platform: platform, Errors: []string{err.Error()}})
			continue
		}
		results = append(results, *result)
	}
	return results, nil
}
// ScanPlatform runs discovery for one platform.
//
// It returns ErrPlatformNotSupported (with a nil result) when no adapter is
// registered for platform. When GetModels fails, the error is returned along
// with a result whose Errors already contains it. Failures while recording an
// individual model are appended to the result's Errors and do not abort the
// scan.
func (s *DiscoveryScheduler) ScanPlatform(ctx context.Context, platform string) (*ScanResult, error) {
	adapter, registered := s.registry.Get(platform)
	if !registered {
		return nil, ErrPlatformNotSupported
	}

	outcome := &ScanResult{Platform: platform}

	// Accounts come from loadAccountsForPlatform, which currently returns a
	// seeded placeholder rather than querying storage.
	accounts := s.loadAccountsForPlatform(ctx, platform)
	if len(accounts) == 0 {
		log.Printf("[discovery] no accounts registered for platform %s, skipping", platform)
		return outcome, nil
	}

	// Only the first account is queried for the model list.
	source := accounts[0]
	models, err := adapter.GetModels(ctx, source)
	if err != nil {
		outcome.Errors = append(outcome.Errors, "GetModels: "+err.Error())
		return outcome, err
	}
	log.Printf("[discovery] platform=%s found %d models", platform, len(models))

	// Every listed model is recorded as a candidate keyed by
	// "<platform>-<model id>".
	for _, model := range models {
		recorded, recordErr := s.service.RecordCandidate(ctx, RecordCandidateInput{
			CandidateID:  platform + "-" + model.ModelID,
			AccountID:    source.AccountID,
			Platform:     platform,
			Model:        model.ModelID,
			Source:       "official_api",
			DiscoveredAt: s.now(),
		})
		switch {
		case recordErr != nil:
			outcome.Errors = append(outcome.Errors, "RecordCandidate: "+recordErr.Error())
		case recorded.Created:
			outcome.NewModels++
			log.Printf("[discovery] new candidate: platform=%s model=%s", platform, model.ModelID)
		}
	}
	return outcome, nil
}
// loadAccountsForPlatform returns the supplier accounts to scan for platform.
// It is a stub: it always returns one placeholder account with ID 1, an empty
// API key, and the platform's default base URL. ctx is not used yet.
func (s *DiscoveryScheduler) loadAccountsForPlatform(ctx context.Context, platform string) []integration.SupplierAccount {
	// Production: query supply_accounts where platform = X and status = active.
	placeholder := integration.SupplierAccount{
		AccountID: 1,
		Platform:  platform,
		APIKey:    "",
		BaseURL:   defaultBaseURL(platform),
	}
	return []integration.SupplierAccount{placeholder}
}
// defaultBaseURL returns the built-in API base URL for "openai" and
// "anthropic", and the empty string for any other platform name.
func defaultBaseURL(platform string) string {
	if platform == "openai" {
		return "https://api.openai.com"
	}
	if platform == "anthropic" {
		return "https://api.anthropic.com"
	}
	return ""
}

View File

@@ -0,0 +1,99 @@
package discovery
import (
"context"
"errors"
"strings"
"time"
"supply-intelligence/internal/domain"
)
// Sentinel errors returned by the discovery package.
var (
// ErrInvalidCandidateInput is returned by RecordCandidate when the service
// is not usable (nil service or repository) or a required input field is
// missing.
ErrInvalidCandidateInput = errors.New("invalid candidate input")
// ErrPlatformNotSupported is returned by DiscoveryScheduler.ScanPlatform
// when no adapter is registered for the requested platform.
ErrPlatformNotSupported = errors.New("platform not supported in registry")
)
// CandidateRepository is the storage interface Service needs for discovery
// candidates. The boolean results report whether a candidate was found.
type CandidateRepository interface {
// GetDiscoveryCandidateByIDContext looks a candidate up by its candidate ID.
GetDiscoveryCandidateByIDContext(ctx context.Context, candidateID string) (domain.DiscoveryCandidate, bool)
// FindDiscoveryCandidateContext looks a candidate up by account, platform, and model.
FindDiscoveryCandidateContext(ctx context.Context, accountID int64, platform, model string) (domain.DiscoveryCandidate, bool)
// UpsertDiscoveryCandidateContext stores the candidate and returns the stored value.
UpsertDiscoveryCandidateContext(ctx context.Context, candidate domain.DiscoveryCandidate) domain.DiscoveryCandidate
// ListDiscoveryCandidatesContext lists candidates for the given status
// (the exact filter semantics live in the implementation).
ListDiscoveryCandidatesContext(ctx context.Context, status domain.DiscoveryCandidateStatus) []domain.DiscoveryCandidate
}
// Service records and lists discovery candidates through a CandidateRepository.
type Service struct {
repo CandidateRepository
// now supplies the fallback timestamp when the input has no DiscoveredAt.
now func() time.Time
}
// RecordCandidateInput is the request for Service.RecordCandidate.
// CandidateID, Platform, Model, and Source must be non-blank and AccountID
// must be positive; ReasonCode and DiscoveredAt are optional.
type RecordCandidateInput struct {
CandidateID string
AccountID int64
Platform string
Model string
Source string
ReasonCode string
// DiscoveredAt is converted to UTC; a zero value means "use the service clock".
DiscoveredAt time.Time
}
// RecordCandidateOutput is the result of Service.RecordCandidate.
type RecordCandidateOutput struct {
// Candidate is the stored candidate (new, updated, or pre-existing).
Candidate domain.DiscoveryCandidate `json:"candidate"`
// Created is true only when a new candidate was inserted.
Created bool `json:"created"`
}
// NewService returns a Service backed by repo whose clock reports the current
// time in UTC.
func NewService(repo CandidateRepository) *Service {
	utcNow := func() time.Time { return time.Now().UTC() }
	return &Service{repo: repo, now: utcNow}
}
// RecordCandidate stores a discovered model as a candidate.
//
// String inputs are trimmed first. ErrInvalidCandidateInput is returned when
// the service or its repository is nil, when CandidateID, Platform, Model, or
// Source is blank, or when AccountID is not positive.
//
// Resolution order:
//  1. A candidate with the same CandidateID already exists: it is returned
//     unchanged with Created=false.
//  2. A candidate with the same (AccountID, Platform, Model) exists: its
//     Source, ReasonCode, and UpdatedAt are overwritten, Version is
//     incremented, and the stored result is returned with Created=false.
//  3. Otherwise a new pending-admission candidate with Version 1 is stored
//     and returned with Created=true.
//
// The timestamp is input.DiscoveredAt in UTC, or the service clock when that
// is zero.
func (s *Service) RecordCandidate(ctx context.Context, input RecordCandidateInput) (RecordCandidateOutput, error) {
	var none RecordCandidateOutput
	if s == nil || s.repo == nil {
		return none, ErrInvalidCandidateInput
	}

	var (
		id       = strings.TrimSpace(input.CandidateID)
		platform = strings.TrimSpace(input.Platform)
		model    = strings.TrimSpace(input.Model)
		source   = strings.TrimSpace(input.Source)
		reason   = strings.TrimSpace(input.ReasonCode)
	)
	blankField := id == "" || platform == "" || model == "" || source == ""
	if blankField || input.AccountID <= 0 {
		return none, ErrInvalidCandidateInput
	}

	// Replay of a known candidate ID: hand back what is stored, untouched.
	if byID, found := s.repo.GetDiscoveryCandidateByIDContext(ctx, id); found {
		return RecordCandidateOutput{Candidate: byID, Created: false}, nil
	}

	seenAt := input.DiscoveredAt.UTC()
	if seenAt.IsZero() {
		seenAt = s.now()
	}

	// Same business key under a different ID: refresh metadata in place.
	byKey, duplicate := s.repo.FindDiscoveryCandidateContext(ctx, input.AccountID, platform, model)
	if duplicate {
		byKey.Source = source
		byKey.ReasonCode = reason
		byKey.UpdatedAt = seenAt
		byKey.Version++
		saved := s.repo.UpsertDiscoveryCandidateContext(ctx, byKey)
		return RecordCandidateOutput{Candidate: saved, Created: false}, nil
	}

	fresh := domain.DiscoveryCandidate{
		CandidateID:  id,
		AccountID:    input.AccountID,
		Platform:     platform,
		Model:        model,
		Source:       source,
		Status:       domain.DiscoveryCandidateStatusPendingAdmission,
		ReasonCode:   reason,
		DiscoveredAt: seenAt,
		UpdatedAt:    seenAt,
		Version:      1,
	}
	saved := s.repo.UpsertDiscoveryCandidateContext(ctx, fresh)
	return RecordCandidateOutput{Candidate: saved, Created: true}, nil
}
// ListCandidates forwards to the repository's candidate listing for status.
// It returns nil when the service or its repository is nil.
func (s *Service) ListCandidates(ctx context.Context, status domain.DiscoveryCandidateStatus) []domain.DiscoveryCandidate {
	if s != nil && s.repo != nil {
		return s.repo.ListDiscoveryCandidatesContext(ctx, status)
	}
	return nil
}

View File

@@ -0,0 +1,160 @@
package discovery
import (
"context"
"testing"
"time"
"supply-intelligence/internal/domain"
"supply-intelligence/internal/repository"
)
// TestRecordCandidateCreatesPendingAdmissionCandidate records a brand-new
// candidate and checks its Created flag, status, version, and timestamps.
func TestRecordCandidateCreatesPendingAdmissionCandidate(t *testing.T) {
	discoveredAt := time.Unix(100, 0).UTC()
	repo := repository.NewMemoryRepository()
	service := NewService(repo)

	input := RecordCandidateInput{
		CandidateID:  "cand-1",
		AccountID:    10,
		Platform:     "openai",
		Model:        "gpt-4.1-mini",
		Source:       "manual_seed",
		ReasonCode:   "new_model",
		DiscoveredAt: discoveredAt,
	}
	out, err := service.RecordCandidate(context.Background(), input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	candidate := out.Candidate
	switch {
	case !out.Created:
		t.Fatalf("expected created candidate")
	case candidate.Status != domain.DiscoveryCandidateStatusPendingAdmission:
		t.Fatalf("unexpected status: %q", candidate.Status)
	case candidate.Version != 1:
		t.Fatalf("unexpected version: %d", candidate.Version)
	case !candidate.DiscoveredAt.Equal(discoveredAt) || !candidate.UpdatedAt.Equal(discoveredAt):
		t.Fatalf("unexpected timestamps: %+v", candidate)
	}
}
// TestRecordCandidateIsIdempotentByCandidateID records the same candidate ID
// twice with different payloads and checks that the second call returns the
// originally stored candidate without creating a new one.
func TestRecordCandidateIsIdempotentByCandidateID(t *testing.T) {
	ctx := context.Background()
	repo := repository.NewMemoryRepository()
	service := NewService(repo)

	original := RecordCandidateInput{
		CandidateID: "cand-1",
		AccountID:   10,
		Platform:    "openai",
		Model:       "gpt-4.1-mini",
		Source:      "manual_seed",
	}
	first, err := service.RecordCandidate(ctx, original)
	if err != nil {
		t.Fatalf("unexpected first error: %v", err)
	}

	// Same ID, every other field different.
	replay := RecordCandidateInput{
		CandidateID: "cand-1",
		AccountID:   99,
		Platform:    "other",
		Model:       "other-model",
		Source:      "other_source",
	}
	second, err := service.RecordCandidate(ctx, replay)
	if err != nil {
		t.Fatalf("unexpected second error: %v", err)
	}
	if second.Created {
		t.Fatalf("expected idempotent replay")
	}

	want, got := first.Candidate, second.Candidate
	if got.AccountID != want.AccountID || got.Platform != want.Platform || got.Model != want.Model {
		t.Fatalf("expected original candidate to be preserved: %+v", got)
	}
}
// TestRecordCandidateDeduplicatesByBusinessKey records two candidates that
// share account, platform, and model but use different IDs, and checks that
// the second call updates the first candidate instead of creating another.
func TestRecordCandidateDeduplicatesByBusinessKey(t *testing.T) {
	ctx := context.Background()
	repo := repository.NewMemoryRepository()
	service := NewService(repo)
	firstAt, secondAt := time.Unix(100, 0).UTC(), time.Unix(200, 0).UTC()

	if _, err := service.RecordCandidate(ctx, RecordCandidateInput{
		CandidateID:  "cand-1",
		AccountID:    10,
		Platform:     "openai",
		Model:        "gpt-4.1-mini",
		Source:       "manual_seed",
		ReasonCode:   "first",
		DiscoveredAt: firstAt,
	}); err != nil {
		t.Fatalf("unexpected first error: %v", err)
	}

	out, err := service.RecordCandidate(ctx, RecordCandidateInput{
		CandidateID:  "cand-2",
		AccountID:    10,
		Platform:     "openai",
		Model:        "gpt-4.1-mini",
		Source:       "scan",
		ReasonCode:   "second",
		DiscoveredAt: secondAt,
	})
	if err != nil {
		t.Fatalf("unexpected second error: %v", err)
	}

	candidate := out.Candidate
	switch {
	case out.Created:
		t.Fatalf("expected business-key dedupe")
	case candidate.CandidateID != "cand-1":
		t.Fatalf("expected original candidate id to be retained: %+v", candidate)
	case candidate.Source != "scan" || candidate.ReasonCode != "second":
		t.Fatalf("expected metadata update: %+v", candidate)
	case candidate.Version != 2:
		t.Fatalf("expected version bump, got %d", candidate.Version)
	case !candidate.UpdatedAt.Equal(secondAt):
		t.Fatalf("expected updated timestamp to change: %+v", candidate)
	}
}
// TestRecordCandidateRejectsInvalidInput checks that an empty input is
// rejected with an error.
func TestRecordCandidateRejectsInvalidInput(t *testing.T) {
	repo := repository.NewMemoryRepository()
	service := NewService(repo)

	if _, err := service.RecordCandidate(context.Background(), RecordCandidateInput{}); err == nil {
		t.Fatalf("expected invalid input error")
	}
}
// TestListCandidatesFiltersByStatus seeds one pending-admission and one
// admitted candidate and checks that listing by pending-admission returns
// only the first.
func TestListCandidatesFiltersByStatus(t *testing.T) {
	ctx := context.Background()
	repo := repository.NewMemoryRepository()

	seeds := []domain.DiscoveryCandidate{
		{
			CandidateID:  "cand-1",
			AccountID:    10,
			Platform:     "openai",
			Model:        "a",
			Source:       "seed",
			Status:       domain.DiscoveryCandidateStatusPendingAdmission,
			DiscoveredAt: time.Unix(100, 0).UTC(),
			UpdatedAt:    time.Unix(100, 0).UTC(),
			Version:      1,
		},
		{
			CandidateID:  "cand-2",
			AccountID:    11,
			Platform:     "openai",
			Model:        "b",
			Source:       "seed",
			Status:       domain.DiscoveryCandidateStatusAdmitted,
			DiscoveredAt: time.Unix(200, 0).UTC(),
			UpdatedAt:    time.Unix(200, 0).UTC(),
			Version:      1,
		},
	}
	for _, seed := range seeds {
		repo.UpsertDiscoveryCandidateContext(ctx, seed)
	}

	service := NewService(repo)
	items := service.ListCandidates(ctx, domain.DiscoveryCandidateStatusPendingAdmission)
	if len(items) != 1 || items[0].CandidateID != "cand-1" {
		t.Fatalf("unexpected filtered items: %+v", items)
	}
}

132
internal/domain/types.go Normal file
View File

@@ -0,0 +1,132 @@
package domain
import "time"
// AccountStatus is the string-encoded lifecycle status of a supplier account.
type AccountStatus string
// Known AccountStatus values.
const (
AccountStatusActive AccountStatus = "active"
AccountStatusSuspended AccountStatus = "suspended"
AccountStatusDisabled AccountStatus = "disabled"
AccountStatusPendingVerify AccountStatus = "pending_verify"
AccountStatusPendingEnable AccountStatus = "pending_enable"
)
// ProbeClassification is the string-encoded verdict of an account probe.
type ProbeClassification string
// Known ProbeClassification values.
const (
ProbeClassificationSuccess ProbeClassification = "success"
ProbeClassificationExplicitFailure ProbeClassification = "explicit_failure"
ProbeClassificationInconclusive ProbeClassification = "inconclusive"
)
// DiscoveryCandidateStatus is the string-encoded admission status of a
// discovery candidate.
type DiscoveryCandidateStatus string
// Known DiscoveryCandidateStatus values.
const (
DiscoveryCandidateStatusPendingAdmission DiscoveryCandidateStatus = "pending_admission"
DiscoveryCandidateStatusAdmitted DiscoveryCandidateStatus = "admitted"
DiscoveryCandidateStatusRejected DiscoveryCandidateStatus = "rejected"
)
// GatewaySyncStatus is the string-encoded gateway synchronisation state of a
// package change event.
type GatewaySyncStatus string

// Known GatewaySyncStatus values.
const (
	GatewaySyncStatusPending GatewaySyncStatus = "pending"
	GatewaySyncStatusApplied GatewaySyncStatus = "applied"
	GatewaySyncStatusFailed  GatewaySyncStatus = "failed"
)

// GatewayAckResult is the string-encoded outcome carried by an acknowledgement.
type GatewayAckResult string

// Known GatewayAckResult values.
const (
	GatewayAckResultApplied GatewayAckResult = "applied"
	GatewayAckResultFailed  GatewayAckResult = "failed"
)

// SyncStatus maps an ack result to the matching sync status. Results other
// than applied and failed map to GatewaySyncStatusPending.
func (r GatewayAckResult) SyncStatus() GatewaySyncStatus {
	if r == GatewayAckResultApplied {
		return GatewaySyncStatusApplied
	}
	if r == GatewayAckResultFailed {
		return GatewaySyncStatusFailed
	}
	return GatewaySyncStatusPending
}
// ProbeResult is the classified outcome of probing one account.
type ProbeResult struct {
AccountID int64
Classification ProbeClassification
ReasonCode string
ObservedAt time.Time
}
// AccountRoutingState is the JSON-serialisable routing record kept per account:
// its status, whether routing is enabled, a risk score, the reason code, the
// time of the last probe, and a version number.
type AccountRoutingState struct {
AccountID int64 `json:"account_id"`
Platform string `json:"platform"`
AccountStatus AccountStatus `json:"account_status"`
RoutingEnabled bool `json:"routing_enabled"`
RiskScore int `json:"risk_score"`
ReasonCode string `json:"reason_code"`
LastProbeAt time.Time `json:"last_probe_at"`
Version int64 `json:"version"`
}
// PackageChangeEvent describes one package change to be synchronised to the
// gateway, together with its acknowledgement bookkeeping.
type PackageChangeEvent struct {
EventID string `json:"event_id"`
EventType string `json:"event_type"`
PackageID int64 `json:"package_id"`
Platform string `json:"platform"`
Model string `json:"model"`
OccurredAt time.Time `json:"occurred_at"`
Version int64 `json:"version"`
GatewaySyncStatus GatewaySyncStatus `json:"gateway_sync_status"`
// Consumer, ConsumerDetail, and AckedAt are omitted from JSON when empty/nil.
Consumer string `json:"consumer,omitempty"`
ConsumerDetail string `json:"consumer_detail,omitempty"`
AckedAt *time.Time `json:"acked_at,omitempty"`
}
// PackageChangeAck is a consumer's acknowledgement of a package change event.
type PackageChangeAck struct {
EventID string `json:"event_id"`
Consumer string `json:"consumer"`
Result GatewayAckResult `json:"result"`
Detail string `json:"detail,omitempty"`
AckedAt time.Time `json:"acked_at"`
// SyncState is serialised under the JSON key "gateway_sync_status".
SyncState GatewaySyncStatus `json:"gateway_sync_status"`
}
// GatewayAppliedSnapshot records, per consumer, the last package change event
// data written for it (event, package, platform, model, version, result) and
// when the snapshot was updated.
type GatewayAppliedSnapshot struct {
Consumer string `json:"consumer"`
LastEventID string `json:"last_event_id"`
LastPackageID int64 `json:"last_package_id"`
LastPlatform string `json:"last_platform"`
LastModel string `json:"last_model"`
LastAppliedVersion int64 `json:"last_applied_version"`
LastResult string `json:"last_result"`
UpdatedAt time.Time `json:"updated_at"`
}
// DiscoveryCandidate is a model discovered for an account on a platform,
// together with its admission status and versioning metadata.
type DiscoveryCandidate struct {
CandidateID string `json:"candidate_id"`
AccountID int64 `json:"account_id"`
Platform string `json:"platform"`
Model string `json:"model"`
Source string `json:"source"`
Status DiscoveryCandidateStatus `json:"status"`
// ReasonCode is omitted from JSON when empty.
ReasonCode string `json:"reason_code,omitempty"`
DiscoveredAt time.Time `json:"discovered_at"`
UpdatedAt time.Time `json:"updated_at"`
Version int64 `json:"version"`
}
// SupplyPackage represents a supply package in the system
type SupplyPackage struct {
PackageID int64 `json:"package_id"`
Platform string `json:"platform"`
Model string `json:"model"`
// Status is a free-form string here, not a dedicated enum type.
Status string `json:"status"` // draft, active, deprecated
Source string `json:"source"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
Version int64 `json:"version"`
}

View File

@@ -0,0 +1,110 @@
package gatewayconsumer
import (
"context"
"errors"
"strings"
"time"
"supply-intelligence/internal/domain"
)
// ErrInvalidConsumeInput is returned by ConsumeOnce when the service is not
// usable (nil service, repository, or applier) or when the applier returns a
// result other than applied or failed.
var ErrInvalidConsumeInput = errors.New("invalid consume input")
// PackageChangeRepository is the storage interface the gateway consumer needs.
type PackageChangeRepository interface {
// ListPackageEventsAfter returns events for the given cursor plus a second
// string that ConsumeOnce reports as the next cursor.
ListPackageEventsAfter(cursor string) ([]domain.PackageChangeEvent, string)
// AckPackageEvent records an acknowledgement and returns the updated event.
AckPackageEvent(eventID, consumer string, result domain.GatewayAckResult, detail string, ackedAt time.Time) (domain.PackageChangeEvent, error)
// UpsertGatewayAppliedSnapshot stores the snapshot and returns the stored value.
UpsertGatewayAppliedSnapshot(snapshot domain.GatewayAppliedSnapshot) domain.GatewayAppliedSnapshot
}
// Service consumes pending package change events: it runs each through an
// applier and acknowledges the outcome in the repository.
type Service struct {
repo PackageChangeRepository
// now supplies the acknowledgement timestamp.
now func() time.Time
// applier decides the ack result and detail text for one event.
applier func(context.Context, domain.PackageChangeEvent) (domain.GatewayAckResult, string)
// consumer is the default consumer name used when the input leaves it blank.
consumer string
}
// ConsumeOnceInput is the request for Service.ConsumeOnce. Both fields are
// trimmed; a blank Consumer falls back to the service default.
type ConsumeOnceInput struct {
Consumer string
Cursor string
}
// ConsumeOnceOutput is the result of Service.ConsumeOnce.
type ConsumeOnceOutput struct {
Consumer string `json:"consumer"`
NextCursor string `json:"next_cursor"`
// Items holds one entry per event that was acknowledged in this call.
Items []ConsumedPackageChangeItem `json:"items"`
}
// ConsumedPackageChangeItem describes one acknowledged event.
type ConsumedPackageChangeItem struct {
EventID string `json:"event_id"`
PackageID int64 `json:"package_id"`
GatewaySyncStatus domain.GatewaySyncStatus `json:"gateway_sync_status"`
Result domain.GatewayAckResult `json:"result"`
Detail string `json:"detail,omitempty"`
}
// NewService returns a Service backed by repo with the default consumer name
// "gateway", a UTC clock, and a built-in applier. The built-in applier reports
// a failure for any event whose model name contains "fail" (case-insensitive)
// and reports applied for everything else.
func NewService(repo PackageChangeRepository) *Service {
	svc := &Service{repo: repo, consumer: "gateway"}
	svc.now = func() time.Time { return time.Now().UTC() }
	svc.applier = func(_ context.Context, event domain.PackageChangeEvent) (domain.GatewayAckResult, string) {
		simulateFailure := strings.Contains(strings.ToLower(event.Model), "fail")
		if !simulateFailure {
			return domain.GatewayAckResultApplied, "applied to gateway snapshot"
		}
		return domain.GatewayAckResultFailed, "simulated apply failure"
	}
	return svc
}
// SetApplier replaces the function used to apply each event. The assignment is
// not synchronised. Passing nil is allowed; ConsumeOnce then returns
// ErrInvalidConsumeInput until a non-nil applier is set.
func (s *Service) SetApplier(applier func(context.Context, domain.PackageChangeEvent) (domain.GatewayAckResult, string)) {
s.applier = applier
}
// ConsumeOnce lists the events after input.Cursor and processes every event
// whose sync status is still pending; events in other states are skipped.
//
// For each pending event the applier is called. When it reports applied, the
// consumer's applied snapshot is upserted first, and then the event is
// acknowledged in the repository with the applier's result and detail.
//
// ErrInvalidConsumeInput is returned when the service, repository, or applier
// is nil, or when the applier returns a result other than applied or failed.
// An acknowledgement error is returned as-is. In every error case the output
// is empty, even if earlier events in the batch were already acknowledged.
func (s *Service) ConsumeOnce(ctx context.Context, input ConsumeOnceInput) (ConsumeOnceOutput, error) {
	if s == nil || s.repo == nil || s.applier == nil {
		return ConsumeOnceOutput{}, ErrInvalidConsumeInput
	}

	// Fall back to the service's default consumer name when none is given.
	consumer := s.consumer
	if requested := strings.TrimSpace(input.Consumer); requested != "" {
		consumer = requested
	}

	events, nextCursor := s.repo.ListPackageEventsAfter(strings.TrimSpace(input.Cursor))
	out := ConsumeOnceOutput{
		Consumer:   consumer,
		NextCursor: nextCursor,
		Items:      make([]ConsumedPackageChangeItem, 0, len(events)),
	}

	for _, event := range events {
		if event.GatewaySyncStatus != domain.GatewaySyncStatusPending {
			continue
		}

		outcome, detail := s.applier(ctx, event)
		switch outcome {
		case domain.GatewayAckResultApplied, domain.GatewayAckResultFailed:
			// recognised result; carry on
		default:
			return ConsumeOnceOutput{}, ErrInvalidConsumeInput
		}

		ackedAt := s.now()
		if outcome == domain.GatewayAckResultApplied {
			snapshot := domain.GatewayAppliedSnapshot{
				Consumer:           consumer,
				LastEventID:        event.EventID,
				LastPackageID:      event.PackageID,
				LastPlatform:       event.Platform,
				LastModel:          event.Model,
				LastAppliedVersion: event.Version,
				LastResult:         string(outcome),
				UpdatedAt:          ackedAt,
			}
			s.repo.UpsertGatewayAppliedSnapshot(snapshot)
		}

		acked, err := s.repo.AckPackageEvent(event.EventID, consumer, outcome, detail, ackedAt)
		if err != nil {
			return ConsumeOnceOutput{}, err
		}
		out.Items = append(out.Items, ConsumedPackageChangeItem{
			EventID:           acked.EventID,
			PackageID:         acked.PackageID,
			GatewaySyncStatus: acked.GatewaySyncStatus,
			Result:            outcome,
			Detail:            detail,
		})
	}
	return out, nil
}

View File

@@ -0,0 +1,89 @@
package gatewayconsumer
import (
"context"
"testing"
"time"
"supply-intelligence/internal/domain"
"supply-intelligence/internal/repository"
)
// TestServiceConsumeOnceAppliedAndFailed seeds one event the default applier
// accepts and one it rejects, runs ConsumeOnce, and checks the returned items,
// the stored events, and the applied snapshot.
func TestServiceConsumeOnceAppliedAndFailed(t *testing.T) {
	repo := repository.NewMemoryRepository()
	seeds := []domain.PackageChangeEvent{
		{
			EventID:           "evt-applied",
			EventType:         "supply_package_published",
			PackageID:         101,
			Platform:          "openai",
			Model:             "gpt-4.1-mini",
			Version:           3,
			OccurredAt:        time.Unix(10, 0).UTC(),
			GatewaySyncStatus: domain.GatewaySyncStatusPending,
		},
		{
			EventID:           "evt-failed",
			EventType:         "supply_package_published",
			PackageID:         102,
			Platform:          "openai",
			Model:             "gpt-fail-model",
			Version:           4,
			OccurredAt:        time.Unix(20, 0).UTC(),
			GatewaySyncStatus: domain.GatewaySyncStatusPending,
		},
	}
	for _, seed := range seeds {
		repo.AppendPackageEvent(seed)
	}

	service := NewService(repo)
	service.now = func() time.Time { return time.Unix(30, 0).UTC() }

	out, err := service.ConsumeOnce(context.Background(), ConsumeOnceInput{Consumer: "gateway"})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if count := len(out.Items); count != 2 {
		t.Fatalf("unexpected item count: %d", count)
	}
	if first := out.Items[0]; first.GatewaySyncStatus != domain.GatewaySyncStatusApplied {
		t.Fatalf("unexpected first status: %+v", first)
	}
	if second := out.Items[1]; second.GatewaySyncStatus != domain.GatewaySyncStatusFailed {
		t.Fatalf("unexpected second status: %+v", second)
	}

	stored := repo.ListPackageEvents()
	if stored[0].GatewaySyncStatus != domain.GatewaySyncStatusApplied {
		t.Fatalf("expected applied event, got %+v", stored[0])
	}
	if stored[1].GatewaySyncStatus != domain.GatewaySyncStatusFailed {
		t.Fatalf("expected failed event, got %+v", stored[1])
	}

	snapshot, found := repo.GetGatewayAppliedSnapshot("gateway")
	if !found {
		t.Fatal("expected applied snapshot")
	}
	if snapshot.LastEventID != "evt-applied" || snapshot.LastPackageID != 101 {
		t.Fatalf("unexpected snapshot: %+v", snapshot)
	}
}
// TestServiceConsumeOnceRejectsInvalidApplierResult installs an applier that
// returns an unrecognised result and checks that ConsumeOnce fails with
// ErrInvalidConsumeInput.
func TestServiceConsumeOnceRejectsInvalidApplierResult(t *testing.T) {
	repo := repository.NewMemoryRepository()
	pending := domain.PackageChangeEvent{
		EventID:           "evt-1",
		EventType:         "supply_package_published",
		PackageID:         101,
		Platform:          "openai",
		Model:             "gpt-4.1-mini",
		Version:           3,
		OccurredAt:        time.Unix(10, 0).UTC(),
		GatewaySyncStatus: domain.GatewaySyncStatusPending,
	}
	repo.AppendPackageEvent(pending)

	service := NewService(repo)
	badApplier := func(context.Context, domain.PackageChangeEvent) (domain.GatewayAckResult, string) {
		return domain.GatewayAckResult("unknown"), "bad"
	}
	service.SetApplier(badApplier)

	if _, err := service.ConsumeOnce(context.Background(), ConsumeOnceInput{}); err != ErrInvalidConsumeInput {
		t.Fatalf("unexpected error: %v", err)
	}
}

12
internal/httpapi/parse.go Normal file
View File

@@ -0,0 +1,12 @@
package httpapi
import "strconv"
// parseInt64 parses input as a base-10 signed 64-bit integer.
//
// On success the value is returned and, when target is non-nil, also stored in
// *target. On failure it returns 0 and the strconv error, and *target is left
// untouched. A nil target is allowed: the value is then only returned.
func parseInt64(input string, target *int64) (int64, error) {
	value, err := strconv.ParseInt(input, 10, 64)
	if err != nil {
		return 0, err
	}
	// The value is returned as well, so a caller may pass nil when it has no
	// variable to fill; writing through nil would panic.
	if target != nil {
		*target = value
	}
	return value, nil
}

415
internal/httpapi/server.go Normal file
View File

@@ -0,0 +1,415 @@
package httpapi
import (
"context"
"encoding/json"
"errors"
"net/http"
"strings"
"time"
"supply-intelligence/internal/admission"
"supply-intelligence/internal/discovery"
"supply-intelligence/internal/domain"
"supply-intelligence/internal/gatewayconsumer"
"supply-intelligence/internal/probe"
"supply-intelligence/internal/publish"
"supply-intelligence/internal/repository"
)
// Server holds the in-memory repository and the services behind the internal
// supply-intelligence HTTP API.
type Server struct {
// repo is the concrete in-memory repository, not an interface.
repo *repository.MemoryRepository
probeService *probe.Service
publishService *publish.Service
gatewayConsumerService *gatewayconsumer.Service
discoveryService *discovery.Service
admissionService *admission.Service
}
// packageChangesResponse is a JSON body carrying a list of package change
// events and the cursor to continue from.
type packageChangesResponse struct {
Items []domain.PackageChangeEvent `json:"items"`
NextCursor string `json:"next_cursor"`
}
// discoveryCandidatesResponse is a JSON body carrying a list of discovery
// candidates.
type discoveryCandidatesResponse struct {
Items []domain.DiscoveryCandidate `json:"items"`
}
// NewServer builds a Server from the repository and the five services. The
// arguments are stored as given; none of them is checked for nil here.
func NewServer(repo *repository.MemoryRepository, probeService *probe.Service, publishService *publish.Service, gatewayConsumerService *gatewayconsumer.Service, discoveryService *discovery.Service, admissionService *admission.Service) *Server {
	server := new(Server)
	server.repo = repo
	server.probeService = probeService
	server.publishService = publishService
	server.gatewayConsumerService = gatewayConsumerService
	server.discoveryService = discoveryService
	server.admissionService = admissionService
	return server
}
// Routes returns a new ServeMux with every endpoint of the server registered.
// Patterns ending in "/" match the whole subtree below them.
func (s *Server) Routes() http.Handler {
	endpoints := []struct {
		pattern string
		handler func(http.ResponseWriter, *http.Request)
	}{
		{"/healthz", s.handleHealth},
		{"/internal/supply-intelligence/accounts/", s.handleGetRoutingState},
		{"/internal/supply-intelligence/probe/evaluate", s.handleEvaluateProbe},
		{"/internal/supply-intelligence/publish/package-event", s.handlePublishPackageEvent},
		{"/internal/supply-intelligence/discovery/candidates", s.handleDiscoveryCandidates},
		{"/internal/supply-intelligence/gateway/package-changes", s.handleListPackageChanges},
		{"/internal/supply-intelligence/gateway/package-changes/", s.handleAckPackageChange},
		{"/internal/supply-intelligence/gateway/consume-once", s.handleConsumeOnce},
		{"/internal/supply-intelligence/admission/run", s.handleAdmissionRun},
		{"/internal/supply-intelligence/admission/candidates", s.handleAdmissionCandidates},
	}

	mux := http.NewServeMux()
	for _, endpoint := range endpoints {
		mux.HandleFunc(endpoint.pattern, endpoint.handler)
	}
	return mux
}
// handleHealth answers every request, whatever the method, with 200 and the
// JSON body {"status":"ok"}.
func (s *Server) handleHealth(w http.ResponseWriter, _ *http.Request) {
	body := map[string]string{"status": "ok"}
	writeJSON(w, http.StatusOK, body)
}
// handleGetRoutingState serves
// GET /internal/supply-intelligence/accounts/{id}/routing-state.
//
// Responses: 405 for a non-GET method, 404 when the path does not end in
// "/routing-state" or the account has no routing state, 400 when {id} is not
// a base-10 int64, and 200 with the routing state otherwise.
func (s *Server) handleGetRoutingState(w http.ResponseWriter, r *http.Request) {
	fail := func(status int, code string) {
		writeJSON(w, status, map[string]string{"error": code})
	}

	if r.Method != http.MethodGet {
		fail(http.StatusMethodNotAllowed, "method_not_allowed")
		return
	}

	const suffix = "/routing-state"
	rest := strings.TrimPrefix(r.URL.Path, "/internal/supply-intelligence/accounts/")
	if !strings.HasSuffix(rest, suffix) {
		fail(http.StatusNotFound, "not_found")
		return
	}

	var accountID int64
	if _, err := parseInt64(strings.TrimSuffix(rest, suffix), &accountID); err != nil {
		fail(http.StatusBadRequest, "invalid_account_id")
		return
	}

	state, found := s.repo.GetRoutingState(accountID)
	if !found {
		fail(http.StatusNotFound, "not_found")
		return
	}
	writeJSON(w, http.StatusOK, state)
}
// handleEvaluateProbe serves POST /internal/supply-intelligence/probe/evaluate.
//
// The JSON body carries account_id, platform, current_status, status_code, and
// an optional transport_error string. The handler validates the required
// fields, converts a non-empty transport_error into an error value, and passes
// everything to the probe service.
//
// Responses: 405 for a non-POST method, 500 when no probe service is
// configured, 400 for malformed JSON, a missing or invalid field, or an error
// from the probe service, and 200 with the evaluation result otherwise.
func (s *Server) handleEvaluateProbe(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		writeJSON(w, http.StatusMethodNotAllowed, map[string]string{"error": "method_not_allowed"})
		return
	}
	if s.probeService == nil {
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "probe_service_unavailable"})
		return
	}
	var payload struct {
		AccountID      int64  `json:"account_id"`
		Platform       string `json:"platform"`
		CurrentStatus  string `json:"current_status"`
		StatusCode     int    `json:"status_code"`
		TransportError string `json:"transport_error"`
	}
	if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid_json"})
		return
	}
	if payload.AccountID <= 0 {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid_account_id"})
		return
	}
	if payload.Platform == "" {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "missing_platform"})
		return
	}
	if payload.CurrentStatus == "" {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "missing_current_status"})
		return
	}
	var transportErr error
	if payload.TransportError != "" {
		transportErr = errors.New(payload.TransportError)
	}
	// Use the request's context so the evaluation is cancelled when the
	// client disconnects or the server shuts the request down.
	var ctx context.Context = r.Context()
	result, err := s.probeService.EvaluateHTTPResult(ctx, probe.EvaluateInput{
		AccountID:      payload.AccountID,
		Platform:       payload.Platform,
		CurrentStatus:  domainAccountStatus(payload.CurrentStatus),
		StatusCode:     payload.StatusCode,
		TransportError: transportErr,
	})
	if err != nil {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": err.Error()})
		return
	}
	writeJSON(w, http.StatusOK, result)
}
// handlePublishPackageEvent accepts a POST describing a published package and
// records it through the publish service.
//
// occurred_at is optional; when present it must be RFC 3339, otherwise the
// zero time is forwarded to the service.
//
// Responses:
//   - 405 method_not_allowed for any method other than POST
//   - 500 publish_service_unavailable when no publish service is configured
//   - 400 invalid_json / invalid_occurred_at for malformed input
//   - 400 invalid_publish_input when the service reports ErrInvalidPublishInput
//   - 500 internal_error for any other service failure
//   - 200 with the recorded event otherwise
func (s *Server) handlePublishPackageEvent(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		writeJSON(w, http.StatusMethodNotAllowed, map[string]string{"error": "method_not_allowed"})
		return
	}
	if s.publishService == nil {
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "publish_service_unavailable"})
		return
	}

	var req struct {
		EventID    string `json:"event_id"`
		PackageID  int64  `json:"package_id"`
		Platform   string `json:"platform"`
		Model      string `json:"model"`
		Version    int64  `json:"version"`
		OccurredAt string `json:"occurred_at"`
	}
	if decodeErr := json.NewDecoder(r.Body).Decode(&req); decodeErr != nil {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid_json"})
		return
	}

	occurredAt := time.Time{}
	if raw := req.OccurredAt; raw != "" {
		ts, parseErr := time.Parse(time.RFC3339, raw)
		if parseErr != nil {
			writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid_occurred_at"})
			return
		}
		occurredAt = ts
	}

	input := publish.RecordPackagePublishedInput{
		EventID:    req.EventID,
		PackageID:  req.PackageID,
		Platform:   req.Platform,
		Model:      req.Model,
		Version:    req.Version,
		OccurredAt: occurredAt,
	}
	event, err := s.publishService.RecordPackagePublished(r.Context(), input)
	switch {
	case err == nil:
		writeJSON(w, http.StatusOK, event)
	case errors.Is(err, publish.ErrInvalidPublishInput):
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid_publish_input"})
	default:
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "internal_error"})
	}
}
// handleDiscoveryCandidates dispatches the discovery-candidates route by HTTP
// method: POST creates a candidate, GET lists candidates, and anything else
// is answered with 405 method_not_allowed.
func (s *Server) handleDiscoveryCandidates(w http.ResponseWriter, r *http.Request) {
	if r.Method == http.MethodPost {
		s.handleCreateDiscoveryCandidate(w, r)
		return
	}
	if r.Method == http.MethodGet {
		s.handleListDiscoveryCandidates(w, r)
		return
	}
	writeJSON(w, http.StatusMethodNotAllowed, map[string]string{"error": "method_not_allowed"})
}
// handleCreateDiscoveryCandidate decodes a candidate description from the
// request body and records it through the discovery service. The caller
// (handleDiscoveryCandidates) has already restricted the method to POST.
//
// discovered_at is optional. Surrounding whitespace is ignored; a blank value
// forwards the zero time to the service, and a non-blank value must be
// RFC 3339.
//
// Responses:
//   - 500 discovery_service_unavailable when no discovery service is configured
//   - 400 invalid_json / invalid_discovered_at for malformed input
//   - 400 invalid_candidate_input when the service reports
//     ErrInvalidCandidateInput
//   - 500 internal_error for any other service failure
//   - 200 with the recorded candidate otherwise
func (s *Server) handleCreateDiscoveryCandidate(w http.ResponseWriter, r *http.Request) {
	if s.discoveryService == nil {
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "discovery_service_unavailable"})
		return
	}
	var payload struct {
		CandidateID  string `json:"candidate_id"`
		AccountID    int64  `json:"account_id"`
		Platform     string `json:"platform"`
		Model        string `json:"model"`
		Source       string `json:"source"`
		ReasonCode   string `json:"reason_code"`
		DiscoveredAt string `json:"discovered_at"`
	}
	if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid_json"})
		return
	}

	var discoveredAt time.Time
	// Parse the same trimmed value that the emptiness check looks at, so a
	// valid timestamp padded with whitespace is not rejected while a
	// whitespace-only value is accepted.
	if raw := strings.TrimSpace(payload.DiscoveredAt); raw != "" {
		parsed, err := time.Parse(time.RFC3339, raw)
		if err != nil {
			writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid_discovered_at"})
			return
		}
		discoveredAt = parsed
	}

	out, err := s.discoveryService.RecordCandidate(r.Context(), discovery.RecordCandidateInput{
		CandidateID:  payload.CandidateID,
		AccountID:    payload.AccountID,
		Platform:     payload.Platform,
		Model:        payload.Model,
		Source:       payload.Source,
		ReasonCode:   payload.ReasonCode,
		DiscoveredAt: discoveredAt,
	})
	if err != nil {
		if errors.Is(err, discovery.ErrInvalidCandidateInput) {
			writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid_candidate_input"})
			return
		}
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "internal_error"})
		return
	}
	writeJSON(w, http.StatusOK, out)
}
// handleListDiscoveryCandidates returns the discovery candidates as JSON,
// filtered by the optional "status" query parameter.
//
// Responses:
//   - 500 discovery_service_unavailable when no discovery service is configured
//   - 400 invalid_status when the status filter is not a recognised value
//   - 200 with the matching candidates otherwise
func (s *Server) handleListDiscoveryCandidates(w http.ResponseWriter, r *http.Request) {
	if s.discoveryService == nil {
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "discovery_service_unavailable"})
		return
	}

	rawStatus := strings.TrimSpace(r.URL.Query().Get("status"))
	filter, valid := parseDiscoveryCandidateStatus(rawStatus)
	if !valid {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid_status"})
		return
	}

	candidates := s.discoveryService.ListCandidates(r.Context(), filter)
	writeJSON(w, http.StatusOK, discoveryCandidatesResponse{Items: candidates})
}
// parseDiscoveryCandidateStatus converts a raw status filter into a
// domain.DiscoveryCandidateStatus.
//
// An empty string is valid and yields the empty status. Otherwise the value
// must equal one of the pending-admission, admitted or rejected constants;
// any other input returns ("", false).
func parseDiscoveryCandidateStatus(raw string) (domain.DiscoveryCandidateStatus, bool) {
	if raw == "" {
		return "", true
	}
	candidate := domain.DiscoveryCandidateStatus(raw)
	known := candidate == domain.DiscoveryCandidateStatusPendingAdmission ||
		candidate == domain.DiscoveryCandidateStatusAdmitted ||
		candidate == domain.DiscoveryCandidateStatusRejected
	if !known {
		return "", false
	}
	return candidate, true
}
// handleListPackageChanges serves GET requests for package change events.
// The optional "cursor" query parameter is whitespace-trimmed and passed to
// the repository; the returned items and next cursor are written as JSON.
// Any other method is answered with 405 method_not_allowed.
func (s *Server) handleListPackageChanges(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		writeJSON(w, http.StatusMethodNotAllowed, map[string]string{"error": "method_not_allowed"})
		return
	}
	cursor := strings.TrimSpace(r.URL.Query().Get("cursor"))
	events, next := s.repo.ListPackageEventsAfter(cursor)
	writeJSON(w, http.StatusOK, packageChangesResponse{Items: events, NextCursor: next})
}
// handleAckPackageChange serves POST
// /internal/supply-intelligence/gateway/package-changes/{eventID}/ack and
// records a consumer's acknowledgement of a package event.
//
// A blank consumer defaults to "gateway". The acknowledgement time is the
// current UTC time at the moment of the repository call.
//
// Responses:
//   - 405 method_not_allowed for any method other than POST
//   - 404 not_found when the path does not end in /ack or the repository
//     reports ErrEventNotFound
//   - 400 invalid_json / invalid_result for malformed input
//   - 500 internal_error for any other repository failure
//   - 204 with no body on success
func (s *Server) handleAckPackageChange(w http.ResponseWriter, r *http.Request) {
	const (
		routePrefix = "/internal/supply-intelligence/gateway/package-changes/"
		ackSuffix   = "/ack"
	)
	if r.Method != http.MethodPost {
		writeJSON(w, http.StatusMethodNotAllowed, map[string]string{"error": "method_not_allowed"})
		return
	}

	rest := strings.TrimPrefix(r.URL.Path, routePrefix)
	if !strings.HasSuffix(rest, ackSuffix) {
		writeJSON(w, http.StatusNotFound, map[string]string{"error": "not_found"})
		return
	}
	eventID := strings.TrimSuffix(rest, ackSuffix)

	var req struct {
		Consumer string `json:"consumer"`
		Result   string `json:"result"`
		Detail   string `json:"detail"`
	}
	if decodeErr := json.NewDecoder(r.Body).Decode(&req); decodeErr != nil {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid_json"})
		return
	}

	result := domain.GatewayAckResult(req.Result)
	if !repository.IsGatewayAckResult(result) {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid_result"})
		return
	}

	consumer := "gateway"
	if trimmed := strings.TrimSpace(req.Consumer); trimmed != "" {
		consumer = trimmed
	}

	_, err := s.repo.AckPackageEvent(eventID, consumer, result, req.Detail, time.Now().UTC())
	switch {
	case err == nil:
		w.WriteHeader(http.StatusNoContent)
	case errors.Is(err, repository.ErrEventNotFound):
		writeJSON(w, http.StatusNotFound, map[string]string{"error": "not_found"})
	default:
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "internal_error"})
	}
}
// handleConsumeOnce serves POST requests that trigger a single pass of the
// gateway consumer service. The JSON body is optional and may carry
// "consumer" and "cursor".
//
// Responses:
//   - 405 method_not_allowed for any method other than POST
//   - 500 gateway_consumer_unavailable when no consumer service is configured
//   - 400 invalid_json when the body fails to decode for a reason other
//     than being empty
//   - 500 consume_failed when the service returns an error
//   - 200 with the service output otherwise
func (s *Server) handleConsumeOnce(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		writeJSON(w, http.StatusMethodNotAllowed, map[string]string{"error": "method_not_allowed"})
		return
	}
	if s.gatewayConsumerService == nil {
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "gateway_consumer_unavailable"})
		return
	}

	var req struct {
		Consumer string `json:"consumer"`
		Cursor   string `json:"cursor"`
	}
	if body := r.Body; body != nil {
		if decodeErr := json.NewDecoder(body).Decode(&req); decodeErr != nil {
			// A decode error whose message is exactly "EOF" means the body
			// was empty; that is tolerated and the zero-value request is used.
			if decodeErr.Error() != "EOF" {
				writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid_json"})
				return
			}
		}
	}

	input := gatewayconsumer.ConsumeOnceInput{Consumer: req.Consumer, Cursor: req.Cursor}
	out, err := s.gatewayConsumerService.ConsumeOnce(r.Context(), input)
	if err != nil {
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "consume_failed"})
		return
	}
	writeJSON(w, http.StatusOK, out)
}
// writeJSON sets the Content-Type header to application/json, writes the
// given status code, and then encodes body as JSON onto w.
//
// Encoding is best-effort: the status line has already been sent by the time
// encoding runs, so an encode error is deliberately discarded.
func writeJSON(w http.ResponseWriter, status int, body any) {
	headers := w.Header()
	headers.Set("Content-Type", "application/json")
	w.WriteHeader(status)

	encoder := json.NewEncoder(w)
	_ = encoder.Encode(body)
}
// handleAdmissionRun serves POST requests that run the admission test for
// the candidate named by "candidate_id" in the JSON body.
//
// Responses:
//   - 405 method_not_allowed for any method other than POST
//   - 500 admission_service_unavailable when no admission service is configured
//   - 400 invalid_json / missing_candidate_id for malformed input
//   - 404 candidate_not_found when the service reports ErrCandidateNotFound
//   - 409 candidate_not_runnable when it reports ErrCandidateNotRunnable
//   - 500 admission_run_failed for any other service failure
//   - 200 with the admission result otherwise
func (s *Server) handleAdmissionRun(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		writeJSON(w, http.StatusMethodNotAllowed, map[string]string{"error": "method_not_allowed"})
		return
	}
	if s.admissionService == nil {
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "admission_service_unavailable"})
		return
	}

	var req struct {
		CandidateID string `json:"candidate_id"`
	}
	if decodeErr := json.NewDecoder(r.Body).Decode(&req); decodeErr != nil {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid_json"})
		return
	}
	// Only the emptiness check trims; the id is forwarded exactly as received.
	if strings.TrimSpace(req.CandidateID) == "" {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "missing_candidate_id"})
		return
	}

	result, err := s.admissionService.RunAdmission(r.Context(), req.CandidateID)
	if err == nil {
		writeJSON(w, http.StatusOK, result)
		return
	}

	status, code := http.StatusInternalServerError, "admission_run_failed"
	if errors.Is(err, admission.ErrCandidateNotFound) {
		status, code = http.StatusNotFound, "candidate_not_found"
	} else if errors.Is(err, admission.ErrCandidateNotRunnable) {
		status, code = http.StatusConflict, "candidate_not_runnable"
	}
	writeJSON(w, status, map[string]string{"error": code})
}
// handleAdmissionCandidates serves GET requests and returns, under the
// "items" key, whatever the admission service reports as runnable candidates.
//
// Responses:
//   - 405 method_not_allowed for any method other than GET
//   - 500 admission_service_unavailable when no admission service is configured
//   - 200 with {"items": [...]} otherwise
func (s *Server) handleAdmissionCandidates(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		writeJSON(w, http.StatusMethodNotAllowed, map[string]string{"error": "method_not_allowed"})
		return
	}
	if s.admissionService == nil {
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "admission_service_unavailable"})
		return
	}
	runnable := s.admissionService.GetRunnableCandidates(r.Context())
	response := map[string]any{"items": runnable}
	writeJSON(w, http.StatusOK, response)
}
// domainAccountStatus converts a raw string into a domain.AccountStatus.
// It is a plain type conversion: the value is not validated.
func domainAccountStatus(raw string) domain.AccountStatus {
	status := domain.AccountStatus(raw)
	return status
}

View File

@@ -0,0 +1,149 @@
package httpapi_test
import (
"bytes"
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"supply-intelligence/internal/app"
"supply-intelligence/internal/domain"
"supply-intelligence/internal/probe"
)
// TestApplicationServerRoutes drives the fully wired application: a probe
// evaluation reporting HTTP 401 for account 7 must come back as a suspended
// routing state, and that account's routing state must then be readable
// through the routing-state route.
func TestApplicationServerRoutes(t *testing.T) {
	const (
		evaluatePath = "/internal/supply-intelligence/probe/evaluate"
		statePath    = "/internal/supply-intelligence/accounts/7/routing-state"
		probeJSON    = `{"account_id":7,"platform":"openai","current_status":"active","status_code":401}`
	)
	application := app.New()

	evalReq := httptest.NewRequest(http.MethodPost, evaluatePath, bytes.NewBufferString(probeJSON))
	evalRec := httptest.NewRecorder()
	application.Server.Routes().ServeHTTP(evalRec, evalReq)
	if evalRec.Code != http.StatusOK {
		t.Fatalf("unexpected status: %d body=%s", evalRec.Code, evalRec.Body.String())
	}

	var result probe.EvaluateOutput
	if err := json.NewDecoder(evalRec.Body).Decode(&result); err != nil {
		t.Fatalf("decode error: %v", err)
	}
	if result.RoutingState.AccountID != 7 || result.RoutingState.AccountStatus != "suspended" {
		t.Fatalf("unexpected state: %+v", result.RoutingState)
	}

	stateReq := httptest.NewRequest(http.MethodGet, statePath, nil)
	stateRec := httptest.NewRecorder()
	application.Server.Routes().ServeHTTP(stateRec, stateReq)
	if stateRec.Code != http.StatusOK {
		t.Fatalf("unexpected get status: %d body=%s", stateRec.Code, stateRec.Body.String())
	}
}
// TestPublishConsumeOnceListAppliedIntegration publishes one package event,
// runs a single consume pass, and then lists package changes, expecting
// exactly that event with next cursor "1" and sync status applied.
func TestPublishConsumeOnceListAppliedIntegration(t *testing.T) {
	const (
		publishPath = "/internal/supply-intelligence/publish/package-event"
		consumePath = "/internal/supply-intelligence/gateway/consume-once"
		changesPath = "/internal/supply-intelligence/gateway/package-changes"
		eventJSON   = `{"event_id":"evt-integration-1","package_id":501,"platform":"openai","model":"gpt-4.1-mini","version":9,"occurred_at":"2026-05-06T20:30:00Z"}`
		consumeJSON = `{"consumer":"gateway"}`
	)
	application := app.New()

	// Step 1: publish.
	publishReq := httptest.NewRequest(http.MethodPost, publishPath, bytes.NewBufferString(eventJSON))
	publishRec := httptest.NewRecorder()
	application.Server.Routes().ServeHTTP(publishRec, publishReq)
	if publishRec.Code != http.StatusOK {
		t.Fatalf("unexpected publish status: %d body=%s", publishRec.Code, publishRec.Body.String())
	}

	// Step 2: consume once.
	consumeReq := httptest.NewRequest(http.MethodPost, consumePath, bytes.NewBufferString(consumeJSON))
	consumeRec := httptest.NewRecorder()
	application.Server.Routes().ServeHTTP(consumeRec, consumeReq)
	if consumeRec.Code != http.StatusOK {
		t.Fatalf("unexpected consume status: %d body=%s", consumeRec.Code, consumeRec.Body.String())
	}

	// Step 3: list and verify.
	listReq := httptest.NewRequest(http.MethodGet, changesPath, nil)
	listRec := httptest.NewRecorder()
	application.Server.Routes().ServeHTTP(listRec, listReq)
	if listRec.Code != http.StatusOK {
		t.Fatalf("unexpected list status: %d body=%s", listRec.Code, listRec.Body.String())
	}

	var listed struct {
		Items      []domain.PackageChangeEvent `json:"items"`
		NextCursor string                      `json:"next_cursor"`
	}
	if err := json.NewDecoder(listRec.Body).Decode(&listed); err != nil {
		t.Fatalf("decode list error: %v", err)
	}
	if len(listed.Items) != 1 || listed.Items[0].EventID != "evt-integration-1" {
		t.Fatalf("unexpected list items: %+v", listed.Items)
	}
	if listed.NextCursor != "1" {
		t.Fatalf("unexpected next cursor: %+v", listed)
	}
	if first := listed.Items[0]; first.GatewaySyncStatus != domain.GatewaySyncStatusApplied {
		t.Fatalf("unexpected sync status: %+v", first)
	}
}
// TestPublishConsumeOnceListFailedIntegration publishes an event for model
// "gpt-fail-model", runs a single consume pass, and then lists package
// changes, expecting exactly that event with next cursor "1" and sync status
// failed.
func TestPublishConsumeOnceListFailedIntegration(t *testing.T) {
	const (
		publishPath = "/internal/supply-intelligence/publish/package-event"
		consumePath = "/internal/supply-intelligence/gateway/consume-once"
		changesPath = "/internal/supply-intelligence/gateway/package-changes"
		eventJSON   = `{"event_id":"evt-integration-failed","package_id":502,"platform":"openai","model":"gpt-fail-model","version":10,"occurred_at":"2026-05-06T20:31:00Z"}`
		consumeJSON = `{"consumer":"gateway"}`
	)
	application := app.New()

	// Step 1: publish.
	publishReq := httptest.NewRequest(http.MethodPost, publishPath, bytes.NewBufferString(eventJSON))
	publishRec := httptest.NewRecorder()
	application.Server.Routes().ServeHTTP(publishRec, publishReq)
	if publishRec.Code != http.StatusOK {
		t.Fatalf("unexpected publish status: %d body=%s", publishRec.Code, publishRec.Body.String())
	}

	// Step 2: consume once.
	consumeReq := httptest.NewRequest(http.MethodPost, consumePath, bytes.NewBufferString(consumeJSON))
	consumeRec := httptest.NewRecorder()
	application.Server.Routes().ServeHTTP(consumeRec, consumeReq)
	if consumeRec.Code != http.StatusOK {
		t.Fatalf("unexpected consume status: %d body=%s", consumeRec.Code, consumeRec.Body.String())
	}

	// Step 3: list and verify.
	listReq := httptest.NewRequest(http.MethodGet, changesPath, nil)
	listRec := httptest.NewRecorder()
	application.Server.Routes().ServeHTTP(listRec, listReq)
	if listRec.Code != http.StatusOK {
		t.Fatalf("unexpected list status: %d body=%s", listRec.Code, listRec.Body.String())
	}

	var listed struct {
		Items      []domain.PackageChangeEvent `json:"items"`
		NextCursor string                      `json:"next_cursor"`
	}
	if err := json.NewDecoder(listRec.Body).Decode(&listed); err != nil {
		t.Fatalf("decode list error: %v", err)
	}
	if len(listed.Items) != 1 || listed.Items[0].EventID != "evt-integration-failed" {
		t.Fatalf("unexpected list items: %+v", listed.Items)
	}
	if listed.NextCursor != "1" {
		t.Fatalf("unexpected next cursor: %+v", listed)
	}
	if first := listed.Items[0]; first.GatewaySyncStatus != domain.GatewaySyncStatusFailed {
		t.Fatalf("unexpected sync status: %+v", first)
	}
}
// TestDiscoveryCandidateCreateAndListIntegration creates one discovery
// candidate through the wired application and expects it to be the only
// entry returned when listing with status=pending_admission.
func TestDiscoveryCandidateCreateAndListIntegration(t *testing.T) {
	const (
		candidatesPath = "/internal/supply-intelligence/discovery/candidates"
		candidateJSON  = `{"candidate_id":"cand-int-1","account_id":701,"platform":"openai","model":"gpt-4.1-mini","source":"manual_seed","reason_code":"new_model","discovered_at":"2026-05-06T20:30:00Z"}`
	)
	application := app.New()

	createReq := httptest.NewRequest(http.MethodPost, candidatesPath, bytes.NewBufferString(candidateJSON))
	createRec := httptest.NewRecorder()
	application.Server.Routes().ServeHTTP(createRec, createReq)
	if createRec.Code != http.StatusOK {
		t.Fatalf("unexpected create status: %d body=%s", createRec.Code, createRec.Body.String())
	}

	listReq := httptest.NewRequest(http.MethodGet, candidatesPath+"?status=pending_admission", nil)
	listRec := httptest.NewRecorder()
	application.Server.Routes().ServeHTTP(listRec, listReq)
	if listRec.Code != http.StatusOK {
		t.Fatalf("unexpected list status: %d body=%s", listRec.Code, listRec.Body.String())
	}

	var listed struct {
		Items []domain.DiscoveryCandidate `json:"items"`
	}
	if err := json.NewDecoder(listRec.Body).Decode(&listed); err != nil {
		t.Fatalf("decode list error: %v", err)
	}
	if len(listed.Items) != 1 || listed.Items[0].CandidateID != "cand-int-1" {
		t.Fatalf("unexpected discovery list items: %+v", listed.Items)
	}
}

View File

@@ -0,0 +1,266 @@
package httpapi
import (
"bytes"
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"time"
"supply-intelligence/internal/discovery"
"supply-intelligence/internal/domain"
"supply-intelligence/internal/gatewayconsumer"
"supply-intelligence/internal/probe"
"supply-intelligence/internal/publish"
"supply-intelligence/internal/repository"
)
func TestServerRoutingStateEndpoint(t *testing.T) {
repo := repository.NewMemoryRepository()
repo.UpsertRoutingState(domain.AccountRoutingState{
AccountID: 101,
Platform: "openai",
AccountStatus: domain.AccountStatusActive,
RoutingEnabled: true,
RiskScore: 10,
ReasonCode: "ok",
LastProbeAt: time.Unix(100, 0).UTC(),
Version: 3,
})
server := NewServer(repo, probe.NewService(repo), publish.NewService(repo), gatewayconsumer.NewService(repo), discovery.NewService(repo), nil)
req := httptest.NewRequest(http.MethodGet, "/internal/supply-intelligence/accounts/101/routing-state", nil)
rr := httptest.NewRecorder()
server.Routes().ServeHTTP(rr, req)
if rr.Code != http.StatusOK {
t.Fatalf("unexpected status: %d body=%s", rr.Code, rr.Body.String())
}
var got domain.AccountRoutingState
if err := json.NewDecoder(rr.Body).Decode(&got); err != nil {
t.Fatalf("decode error: %v", err)
}
if got.AccountID != 101 || got.AccountStatus != domain.AccountStatusActive {
t.Fatalf("unexpected payload: %+v", got)
}
}
// TestServerProbeEvaluateEndpointPaths posts three probe observations to the
// evaluate route, each against a fresh in-memory repository, and checks the
// returned classification and routing state:
//   - status 200 on a suspended account: success, active, routing enabled
//   - status 401 on an active account: explicit failure, suspended
//   - transport error on a suspended account: inconclusive, stays suspended
func TestServerProbeEvaluateEndpointPaths(t *testing.T) {
	const evaluatePath = "/internal/supply-intelligence/probe/evaluate"

	type probeCase struct {
		name               string
		body               string
		wantStatus         int
		wantClassification domain.ProbeClassification
		wantAccountStatus  domain.AccountStatus
		wantReasonCode     string
		wantRoutingEnabled bool
	}
	cases := []probeCase{
		{
			name:               "success",
			body:               `{"account_id":201,"platform":"openai","current_status":"suspended","status_code":200}`,
			wantStatus:         http.StatusOK,
			wantClassification: domain.ProbeClassificationSuccess,
			wantAccountStatus:  domain.AccountStatusActive,
			wantReasonCode:     "ok",
			wantRoutingEnabled: true,
		},
		{
			name:               "explicit_failure",
			body:               `{"account_id":202,"platform":"openai","current_status":"active","status_code":401}`,
			wantStatus:         http.StatusOK,
			wantClassification: domain.ProbeClassificationExplicitFailure,
			wantAccountStatus:  domain.AccountStatusSuspended,
			wantReasonCode:     "auth_rejected",
			wantRoutingEnabled: false,
		},
		{
			name:               "inconclusive",
			body:               `{"account_id":203,"platform":"openai","current_status":"suspended","transport_error":"dial tcp timeout"}`,
			wantStatus:         http.StatusOK,
			wantClassification: domain.ProbeClassificationInconclusive,
			wantAccountStatus:  domain.AccountStatusSuspended,
			wantReasonCode:     "transport_error",
			wantRoutingEnabled: false,
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			repo := repository.NewMemoryRepository()
			server := NewServer(repo, probe.NewService(repo), publish.NewService(repo), gatewayconsumer.NewService(repo), discovery.NewService(repo), nil)

			rec := httptest.NewRecorder()
			server.Routes().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, evaluatePath, bytes.NewBufferString(tc.body)))
			if rec.Code != tc.wantStatus {
				t.Fatalf("unexpected status: %d body=%s", rec.Code, rec.Body.String())
			}

			var got probe.EvaluateOutput
			if err := json.NewDecoder(rec.Body).Decode(&got); err != nil {
				t.Fatalf("decode error: %v", err)
			}
			if got.Classification != tc.wantClassification {
				t.Fatalf("unexpected classification: %q", got.Classification)
			}
			state := got.RoutingState
			if state.AccountStatus != tc.wantAccountStatus {
				t.Fatalf("unexpected account status: %q", state.AccountStatus)
			}
			if state.ReasonCode != tc.wantReasonCode {
				t.Fatalf("unexpected reason code: %q", state.ReasonCode)
			}
			if state.RoutingEnabled != tc.wantRoutingEnabled {
				t.Fatalf("unexpected routing enabled: %v", state.RoutingEnabled)
			}
		})
	}
}
// TestServerPublishPackageEventEndpoint posts one package event and expects
// the response to echo its id, carry the package-published event type, and
// start with a pending gateway sync status.
func TestServerPublishPackageEventEndpoint(t *testing.T) {
	const (
		publishPath = "/internal/supply-intelligence/publish/package-event"
		eventJSON   = `{"event_id":"evt-1","package_id":1001,"platform":"openai","model":"gpt-4.1-mini","version":7,"occurred_at":"2026-05-06T20:30:00Z"}`
	)
	repo := repository.NewMemoryRepository()
	server := NewServer(repo, probe.NewService(repo), publish.NewService(repo), gatewayconsumer.NewService(repo), discovery.NewService(repo), nil)

	rec := httptest.NewRecorder()
	server.Routes().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, publishPath, bytes.NewBufferString(eventJSON)))
	if rec.Code != http.StatusOK {
		t.Fatalf("unexpected publish status: %d body=%s", rec.Code, rec.Body.String())
	}

	var event domain.PackageChangeEvent
	if err := json.NewDecoder(rec.Body).Decode(&event); err != nil {
		t.Fatalf("decode error: %v", err)
	}
	if event.EventID != "evt-1" || event.EventType != publish.PackagePublishedEventType {
		t.Fatalf("unexpected event: %+v", event)
	}
	if status := event.GatewaySyncStatus; status != domain.GatewaySyncStatusPending {
		t.Fatalf("unexpected sync status: %q", status)
	}
}
// TestServerPackageChangeListAndAck seeds one pending event, checks that the
// list route returns it with next cursor "1", acknowledges it as applied
// (expecting 204), and then confirms in the repository that the event's sync
// status became applied.
func TestServerPackageChangeListAndAck(t *testing.T) {
	const changesPath = "/internal/supply-intelligence/gateway/package-changes"

	repo := repository.NewMemoryRepository()
	repo.AppendPackageEvent(domain.PackageChangeEvent{
		EventID:           "evt-1",
		EventType:         publish.PackagePublishedEventType,
		PackageID:         1001,
		Platform:          "openai",
		Model:             "gpt-4.1-mini",
		OccurredAt:        time.Unix(5, 0).UTC(),
		Version:           7,
		GatewaySyncStatus: domain.GatewaySyncStatusPending,
	})
	server := NewServer(repo, probe.NewService(repo), publish.NewService(repo), gatewayconsumer.NewService(repo), discovery.NewService(repo), nil)

	listRec := httptest.NewRecorder()
	server.Routes().ServeHTTP(listRec, httptest.NewRequest(http.MethodGet, changesPath, nil))
	if listRec.Code != http.StatusOK {
		t.Fatalf("unexpected list status: %d body=%s", listRec.Code, listRec.Body.String())
	}
	var listed struct {
		Items      []domain.PackageChangeEvent `json:"items"`
		NextCursor string                      `json:"next_cursor"`
	}
	if err := json.NewDecoder(listRec.Body).Decode(&listed); err != nil {
		t.Fatalf("decode list error: %v", err)
	}
	if len(listed.Items) != 1 || listed.NextCursor != "1" {
		t.Fatalf("unexpected list response: %+v", listed)
	}

	ackBody := bytes.NewBufferString(`{"consumer":"gateway","result":"applied","detail":"ok"}`)
	ackRec := httptest.NewRecorder()
	server.Routes().ServeHTTP(ackRec, httptest.NewRequest(http.MethodPost, changesPath+"/evt-1/ack", ackBody))
	if ackRec.Code != http.StatusNoContent {
		t.Fatalf("unexpected ack status: %d body=%s", ackRec.Code, ackRec.Body.String())
	}

	stored, _ := repo.ListPackageEventsAfter("")
	if len(stored) != 1 || stored[0].GatewaySyncStatus != domain.GatewaySyncStatusApplied {
		t.Fatalf("unexpected ack state: %+v", stored)
	}
}
// TestServerPackageChangeListWithCursor seeds two pending events and lists
// with cursor=1, expecting only the second event ("evt-2") and next cursor "2".
func TestServerPackageChangeListWithCursor(t *testing.T) {
	repo := repository.NewMemoryRepository()
	seeds := []domain.PackageChangeEvent{
		{EventID: "evt-1", EventType: publish.PackagePublishedEventType, PackageID: 1001, Platform: "openai", Model: "gpt-4.1-mini", OccurredAt: time.Unix(5, 0).UTC(), Version: 7, GatewaySyncStatus: domain.GatewaySyncStatusPending},
		{EventID: "evt-2", EventType: publish.PackagePublishedEventType, PackageID: 1002, Platform: "openai", Model: "gpt-4.1", OccurredAt: time.Unix(6, 0).UTC(), Version: 8, GatewaySyncStatus: domain.GatewaySyncStatusPending},
	}
	for i := range seeds {
		repo.AppendPackageEvent(seeds[i])
	}
	server := NewServer(repo, probe.NewService(repo), publish.NewService(repo), gatewayconsumer.NewService(repo), discovery.NewService(repo), nil)

	rec := httptest.NewRecorder()
	server.Routes().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/internal/supply-intelligence/gateway/package-changes?cursor=1", nil))
	if rec.Code != http.StatusOK {
		t.Fatalf("unexpected status: %d body=%s", rec.Code, rec.Body.String())
	}

	var page struct {
		Items      []domain.PackageChangeEvent `json:"items"`
		NextCursor string                      `json:"next_cursor"`
	}
	if err := json.NewDecoder(rec.Body).Decode(&page); err != nil {
		t.Fatalf("decode error: %v", err)
	}
	if len(page.Items) != 1 || page.Items[0].EventID != "evt-2" || page.NextCursor != "2" {
		t.Fatalf("unexpected cursor response: %+v", page)
	}
}
// TestServerConsumeOnceEndpoint seeds two pending events and triggers one
// consume pass. The output must contain two items in order: the first
// applied, the second (model "gpt-fail-model") failed, each with a non-empty
// detail.
func TestServerConsumeOnceEndpoint(t *testing.T) {
	repo := repository.NewMemoryRepository()
	seeds := []domain.PackageChangeEvent{
		{EventID: "evt-apply", EventType: publish.PackagePublishedEventType, PackageID: 1001, Platform: "openai", Model: "gpt-4.1-mini", OccurredAt: time.Unix(5, 0).UTC(), Version: 7, GatewaySyncStatus: domain.GatewaySyncStatusPending},
		{EventID: "evt-fail", EventType: publish.PackagePublishedEventType, PackageID: 1002, Platform: "openai", Model: "gpt-fail-model", OccurredAt: time.Unix(6, 0).UTC(), Version: 8, GatewaySyncStatus: domain.GatewaySyncStatusPending},
	}
	for i := range seeds {
		repo.AppendPackageEvent(seeds[i])
	}
	server := NewServer(repo, probe.NewService(repo), publish.NewService(repo), gatewayconsumer.NewService(repo), discovery.NewService(repo), nil)

	rec := httptest.NewRecorder()
	server.Routes().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/internal/supply-intelligence/gateway/consume-once", bytes.NewBufferString(`{"consumer":"gateway"}`)))
	if rec.Code != http.StatusOK {
		t.Fatalf("unexpected consume status: %d body=%s", rec.Code, rec.Body.String())
	}

	var out gatewayconsumer.ConsumeOnceOutput
	if err := json.NewDecoder(rec.Body).Decode(&out); err != nil {
		t.Fatalf("decode error: %v", err)
	}
	if len(out.Items) != 2 {
		t.Fatalf("unexpected consume output length: %+v", out)
	}
	if first := out.Items[0]; first.Result != domain.GatewayAckResultApplied || first.GatewaySyncStatus != domain.GatewaySyncStatusApplied || first.Detail == "" {
		t.Fatalf("unexpected first consume item: %+v", first)
	}
	if second := out.Items[1]; second.Result != domain.GatewayAckResultFailed || second.GatewaySyncStatus != domain.GatewaySyncStatusFailed || second.Detail == "" {
		t.Fatalf("unexpected second consume item: %+v", second)
	}
}
// TestServerDiscoveryCandidateCreateAndList creates one candidate and expects
// listing with status=pending_admission to return exactly that candidate in
// the pending-admission status.
func TestServerDiscoveryCandidateCreateAndList(t *testing.T) {
	const (
		candidatesPath = "/internal/supply-intelligence/discovery/candidates"
		candidateJSON  = `{"candidate_id":"cand-1","account_id":301,"platform":"openai","model":"gpt-4.1-mini","source":"manual_seed","reason_code":"new_model","discovered_at":"2026-05-06T20:30:00Z"}`
	)
	repo := repository.NewMemoryRepository()
	server := NewServer(repo, probe.NewService(repo), publish.NewService(repo), gatewayconsumer.NewService(repo), discovery.NewService(repo), nil)

	createRec := httptest.NewRecorder()
	server.Routes().ServeHTTP(createRec, httptest.NewRequest(http.MethodPost, candidatesPath, bytes.NewBufferString(candidateJSON)))
	if createRec.Code != http.StatusOK {
		t.Fatalf("unexpected create status: %d body=%s", createRec.Code, createRec.Body.String())
	}

	listRec := httptest.NewRecorder()
	server.Routes().ServeHTTP(listRec, httptest.NewRequest(http.MethodGet, candidatesPath+"?status=pending_admission", nil))
	if listRec.Code != http.StatusOK {
		t.Fatalf("unexpected list status: %d body=%s", listRec.Code, listRec.Body.String())
	}

	var listed struct {
		Items []domain.DiscoveryCandidate `json:"items"`
	}
	if err := json.NewDecoder(listRec.Body).Decode(&listed); err != nil {
		t.Fatalf("decode list error: %v", err)
	}
	if len(listed.Items) != 1 || listed.Items[0].CandidateID != "cand-1" || listed.Items[0].Status != domain.DiscoveryCandidateStatusPendingAdmission {
		t.Fatalf("unexpected discovery list response: %+v", listed.Items)
	}
}
// TestServerDiscoveryCandidateRejectsInvalidInput posts a candidate with an
// empty id and a zero account id and expects a 400 response.
func TestServerDiscoveryCandidateRejectsInvalidInput(t *testing.T) {
	const invalidJSON = `{"candidate_id":"","account_id":0}`

	repo := repository.NewMemoryRepository()
	server := NewServer(repo, probe.NewService(repo), publish.NewService(repo), gatewayconsumer.NewService(repo), discovery.NewService(repo), nil)

	rec := httptest.NewRecorder()
	server.Routes().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/internal/supply-intelligence/discovery/candidates", bytes.NewBufferString(invalidJSON)))
	if got := rec.Code; got != http.StatusBadRequest {
		t.Fatalf("unexpected status: %d body=%s", got, rec.Body.String())
	}
}

View File

@@ -0,0 +1,67 @@
package integration
import (
"context"
"supply-intelligence/internal/domain"
)
// AccountStateReader is the read-side contract for account routing state
// sourced from the supply-api repository layer.
type AccountStateReader interface {
	// GetRoutingStateContext looks up the routing state for accountID.
	// found is false when no state is available.
	GetRoutingStateContext(ctx context.Context, accountID int64) (state domain.AccountRoutingState, found bool)
}
// CandidateStore is the persistence contract for discovered model candidates.
type CandidateStore interface {
	// GetDiscoveryCandidateByIDContext looks a candidate up by its id.
	GetDiscoveryCandidateByIDContext(ctx context.Context, candidateID string) (candidate domain.DiscoveryCandidate, found bool)
	// FindDiscoveryCandidateContext looks a candidate up by account,
	// platform and model.
	FindDiscoveryCandidateContext(ctx context.Context, accountID int64, platform, model string) (candidate domain.DiscoveryCandidate, found bool)
	// UpsertDiscoveryCandidateContext stores candidate and returns the
	// stored value.
	UpsertDiscoveryCandidateContext(ctx context.Context, candidate domain.DiscoveryCandidate) domain.DiscoveryCandidate
	// ListDiscoveryCandidatesContext returns candidates for the given status.
	// NOTE(review): the HTTP list handler passes an empty status when no
	// filter is supplied; presumably that means "all" — confirm against the
	// implementation.
	ListDiscoveryCandidatesContext(ctx context.Context, status domain.DiscoveryCandidateStatus) []domain.DiscoveryCandidate
}
// PackageEventStore is the persistence contract for package change events
// and their gateway acknowledgements.
type PackageEventStore interface {
	// AppendPackageEventContext stores evt and returns the stored event.
	AppendPackageEventContext(ctx context.Context, evt domain.PackageChangeEvent) (domain.PackageChangeEvent, error)
	// ListPackageEventsAfter returns the events following cursor together
	// with the cursor to use for the next call.
	ListPackageEventsAfter(cursor string) (events []domain.PackageChangeEvent, nextCursor string)
	// AckPackageEvent records a consumer's result for eventID and returns
	// the updated event.
	// NOTE(review): ackedAt is untyped here, while the HTTP ack handler passes
	// a time.Time to its repository. An implementation that declares the
	// parameter as time.Time would not satisfy this interface — confirm the
	// intended type.
	AckPackageEvent(eventID, consumer string, result domain.GatewayAckResult, detail string, ackedAt any) (domain.PackageChangeEvent, error)
}
// ProbeLogStore is the persistence contract for probe execution logs.
type ProbeLogStore interface {
	// AppendProbeLog stores a single probe execution record.
	AppendProbeLog(ctx context.Context, log ProbeExecutionLog) error
	// ListProbeLogsByAccount returns up to limit records for accountID.
	// NOTE(review): ordering of the returned records is not specified
	// here — confirm with the implementation.
	ListProbeLogsByAccount(ctx context.Context, accountID int64, limit int) (logs []ProbeExecutionLog, err error)
}
// ProbeExecutionLog is one recorded probe execution for an account.
type ProbeExecutionLog struct {
	LogID               int64
	AccountID           int64
	Platform            string
	ProbeResult         domain.ProbeClassification
	FailureClass        string
	HTTPStatus          int
	LatencyMs           int // presumably milliseconds, per the name — TODO confirm
	RiskScore           int
	EvaluatedTransition string
	// ExecutedAt holds either a time.Time or a string.
	ExecutedAt any
	RequestID  string
	Version    int64
}
// NewAccountStateAdapter wraps repo in an AccountStateAdapter and always
// returns a non-nil adapter. The repo value is only stored; the real
// supply-api integration is not implemented yet.
func NewAccountStateAdapter(repo any) *AccountStateAdapter {
	adapter := AccountStateAdapter{repo: repo}
	return &adapter
}
// AccountStateAdapter is the AccountStateReader implementation intended to
// sit on top of the supply-api repository.
type AccountStateAdapter struct {
	// repo is the untyped supply-api repository handle supplied at
	// construction time.
	repo any
}
// GetRoutingStateContext is a placeholder: it ignores its arguments and
// always reports "not found" with a zero-value routing state.
func (a *AccountStateAdapter) GetRoutingStateContext(ctx context.Context, accountID int64) (domain.AccountRoutingState, bool) {
	// TODO: implement when supply-api integration is ready.
	// This will call into supply-api's account repository.
	var none domain.AccountRoutingState
	return none, false
}

View File

@@ -0,0 +1,242 @@
package integration
import (
"context"
"encoding/json"
"net/http"
)
// SupplierAdapter is the contract for talking to a single supplier platform.
type SupplierAdapter interface {
	// Platform returns the platform name (e.g., "openai", "anthropic").
	Platform() string
	// ProbeAccount sends a health-check request to the supplier API and
	// returns the raw response details needed for probe classification.
	ProbeAccount(ctx context.Context, account SupplierAccount) ProbeResult
	// GetModels fetches the list of models available to the account.
	GetModels(ctx context.Context, account SupplierAccount) (models []ModelInfo, err error)
	// HealthCheck verifies connectivity to the supplier API.
	HealthCheck(ctx context.Context, account SupplierAccount) error
}
// SupplierAccount holds credentials and configuration for a supplier account.
type SupplierAccount struct {
AccountID int64
Platform string // platform name, e.g. "openai" or "anthropic"
APIKey string // secret sent in the platform's auth header; avoid logging it
BaseURL string // defaults to supplier's public endpoint if empty
Endpoint string // custom endpoint override (honoured by OpenAIAdapter.ProbeAccount)
}
// ProbeResult holds the raw result of a probe request.
// Either TransportError is set (no HTTP response was obtained) or StatusCode
// and ResponseBody describe the response.
type ProbeResult struct {
StatusCode int // HTTP status code; 0 when TransportError is set
TransportError error // request construction or transport failure
ResponseBody string // leading part of the response body (at most 1024 bytes)
}
// ModelInfo describes a model available from a supplier.
type ModelInfo struct {
ModelID string // supplier's model identifier
ModelName string // display name
ContextLength int // max context length in tokens (0 when the supplier does not report it)
IsActive bool // whether the model is currently available
}
// OpenAIAdapter is the SupplierAdapter implementation for OpenAI and
// OpenAI-compatible APIs. All requests go through the injected HTTPClient.
type OpenAIAdapter struct {
	httpClient HTTPClient
}

// NewOpenAIAdapter builds an OpenAIAdapter that sends its requests via httpClient.
func NewOpenAIAdapter(httpClient HTTPClient) SupplierAdapter {
	adapter := new(OpenAIAdapter)
	adapter.httpClient = httpClient
	return adapter
}

// Platform reports the platform name handled by this adapter.
func (a *OpenAIAdapter) Platform() string {
	const platform = "openai"
	return platform
}
// ProbeAccount issues an authenticated GET used as a health probe.
//
// The target is account.Endpoint when set; otherwise it is "/v1/models" under
// account.BaseURL (default "https://api.openai.com"). Request construction and
// transport failures are returned in ProbeResult.TransportError. On a response,
// the result carries the status code and at most the first 1024 bytes of the body.
func (a *OpenAIAdapter) ProbeAccount(ctx context.Context, account SupplierAccount) ProbeResult {
	baseURL := account.BaseURL
	if baseURL == "" {
		baseURL = "https://api.openai.com"
	}
	endpoint := account.Endpoint
	if endpoint == "" {
		endpoint = baseURL + "/v1/models"
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return ProbeResult{TransportError: err}
	}
	req.Header.Set("Authorization", "Bearer "+account.APIKey)
	req.Header.Set("User-Agent", "supply-intelligence-probe/1.0")

	resp, err := a.httpClient.Do(req)
	if err != nil {
		return ProbeResult{TransportError: err}
	}
	defer resp.Body.Close()

	// A single Read may legally return fewer bytes than are available, so read
	// through a LimitReader to reliably capture up to 1 KiB. The read error is
	// deliberately ignored: the status code is the probe signal and the body is
	// only a debugging aid.
	snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
	return ProbeResult{
		StatusCode:   resp.StatusCode,
		ResponseBody: string(snippet),
	}
}
// modelsListStatusError reports a non-2xx response from the models list endpoint.
type modelsListStatusError struct {
	status string // HTTP status line, e.g. "401 Unauthorized"
}

// Error implements the error interface.
func (e *modelsListStatusError) Error() string {
	return "list models: unexpected status " + e.status
}

// GetModels fetches the model list from "/v1/models" under account.BaseURL
// (default "https://api.openai.com") and converts it to ModelInfo values.
//
// Only entries whose "object" field is "model" are returned; ModelName mirrors
// the model ID and ContextLength comes from the optional "context_window" field.
// It returns an error when the request cannot be built or sent, when the
// response status is not 2xx, or when the body is not valid JSON.
func (a *OpenAIAdapter) GetModels(ctx context.Context, account SupplierAccount) ([]ModelInfo, error) {
	baseURL := account.BaseURL
	if baseURL == "" {
		baseURL = "https://api.openai.com"
	}
	endpoint := baseURL + "/v1/models"

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Authorization", "Bearer "+account.APIKey)

	resp, err := a.httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// An error payload (401, 429, 5xx, ...) is still valid JSON without a "data"
	// array; without this check it would be reported as "no models available".
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		status := resp.Status
		if status == "" {
			// Hand-built responses (e.g. from test doubles) may leave Status empty.
			status = http.StatusText(resp.StatusCode)
		}
		return nil, &modelsListStatusError{status: status}
	}

	// Parse the OpenAI models list response
	// {"object": "list", "data": [{"id": "gpt-4", "object": "model", ...}, ...]}
	var raw struct {
		Data []struct {
			ID      string `json:"id"`
			Object  string `json:"object"`
			Context int    `json:"context_window,omitempty"`
		} `json:"data"`
	}
	if err := decodeJSON(resp, &raw); err != nil {
		return nil, err
	}

	models := make([]ModelInfo, 0, len(raw.Data))
	for _, m := range raw.Data {
		if m.Object == "model" {
			models = append(models, ModelInfo{
				ModelID:       m.ID,
				ModelName:     m.ID,
				ContextLength: m.Context,
				IsActive:      true,
			})
		}
	}
	return models, nil
}
// HealthCheck probes the account and reports whether the API is reachable.
// A 200 or a 401 both count as healthy (the service answered); a transport
// error is returned as-is, and any other status yields ErrHealthCheckFailed.
func (a *OpenAIAdapter) HealthCheck(ctx context.Context, account SupplierAccount) error {
	probe := a.ProbeAccount(ctx, account)
	if probe.TransportError != nil {
		return probe.TransportError
	}
	switch probe.StatusCode {
	case http.StatusOK, http.StatusUnauthorized:
		return nil
	default:
		return ErrHealthCheckFailed
	}
}
// AnthropicAdapter is the SupplierAdapter implementation for the Anthropic
// Claude API. All requests go through the injected HTTPClient.
type AnthropicAdapter struct {
	httpClient HTTPClient
}

// NewAnthropicAdapter builds an AnthropicAdapter that sends its requests via httpClient.
func NewAnthropicAdapter(httpClient HTTPClient) SupplierAdapter {
	adapter := new(AnthropicAdapter)
	adapter.httpClient = httpClient
	return adapter
}

// Platform reports the platform name handled by this adapter.
func (a *AnthropicAdapter) Platform() string {
	const platform = "anthropic"
	return platform
}
// ProbeAccount issues an authenticated GET to "/v1/models" under
// account.BaseURL (default "https://api.anthropic.com") as a health probe.
//
// Request construction and transport failures are returned in
// ProbeResult.TransportError. On a response, the result carries the status
// code and at most the first 1024 bytes of the body.
func (a *AnthropicAdapter) ProbeAccount(ctx context.Context, account SupplierAccount) ProbeResult {
	baseURL := account.BaseURL
	if baseURL == "" {
		baseURL = "https://api.anthropic.com"
	}
	endpoint := baseURL + "/v1/models"

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return ProbeResult{TransportError: err}
	}
	req.Header.Set("x-api-key", account.APIKey)
	req.Header.Set("User-Agent", "supply-intelligence-probe/1.0")
	req.Header.Set("anthropic-version", "2023-06-01")

	resp, err := a.httpClient.Do(req)
	if err != nil {
		return ProbeResult{TransportError: err}
	}
	defer resp.Body.Close()

	// A single Read may legally return fewer bytes than are available, so read
	// through a LimitReader to reliably capture up to 1 KiB. The read error is
	// deliberately ignored: the status code is the probe signal and the body is
	// only a debugging aid.
	snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
	return ProbeResult{
		StatusCode:   resp.StatusCode,
		ResponseBody: string(snippet),
	}
}
// GetModels returns a hard-coded list of Claude models; it performs no network
// request and never returns an error. ctx and account are unused.
func (a *AnthropicAdapter) GetModels(ctx context.Context, account SupplierAccount) ([]ModelInfo, error) {
// The list below is static rather than fetched from the supplier.
// In production this would be fetched from configuration or a dynamic source.
// NOTE(review): ProbeAccount on this adapter already calls GET /v1/models, so
// the original premise that Anthropic has no comparable models list endpoint
// may be outdated — confirm and consider fetching the list dynamically.
return []ModelInfo{
{ModelID: "claude-3-5-sonnet-20241022", ModelName: "Claude 3.5 Sonnet", ContextLength: 200000, IsActive: true},
{ModelID: "claude-3-5-haiku-20241022", ModelName: "Claude 3.5 Haiku", ContextLength: 200000, IsActive: true},
{ModelID: "claude-3-opus-20240229", ModelName: "Claude 3 Opus", ContextLength: 200000, IsActive: true},
{ModelID: "claude-3-sonnet-20240229", ModelName: "Claude 3 Sonnet", ContextLength: 200000, IsActive: true},
{ModelID: "claude-3-haiku-20240307", ModelName: "Claude 3 Haiku", ContextLength: 200000, IsActive: true},
}, nil
}
// HealthCheck probes the account and reports whether the API is reachable.
// Anthropic answers 200 on success and 401 on auth failure; both prove the
// service is up, so both count as healthy. A transport error is returned
// as-is, and any other status yields ErrHealthCheckFailed.
func (a *AnthropicAdapter) HealthCheck(ctx context.Context, account SupplierAccount) error {
	probe := a.ProbeAccount(ctx, account)
	if probe.TransportError != nil {
		return probe.TransportError
	}
	switch probe.StatusCode {
	case http.StatusOK, http.StatusUnauthorized:
		return nil
	default:
		return ErrHealthCheckFailed
	}
}
// HTTPClient is the minimal HTTP transport the adapters depend on.
// It exists for testability; *http.Client satisfies it.
type HTTPClient interface {
// Do sends req and returns the response, mirroring (*http.Client).Do.
Do(req *http.Request) (*http.Response, error)
}
// DefaultHTTPClient is the standard HTTP client used for platform adapters.
// It is stateless and forwards every request to http.DefaultClient.
type DefaultHTTPClient struct{}
// Do sends req via http.DefaultClient.
// NOTE(review): http.DefaultClient has no Timeout configured, so a request is
// bounded only by the deadline/cancellation of its context — confirm that
// callers always pass a context with a deadline.
func (c *DefaultHTTPClient) Do(req *http.Request) (*http.Response, error) {
return http.DefaultClient.Do(req)
}
// NewDefaultHTTPClient returns a fresh DefaultHTTPClient as an HTTPClient.
func NewDefaultHTTPClient() HTTPClient {
	return new(DefaultHTTPClient)
}
// HealthCheckError is the error type reported when a health check receives an
// unexpected HTTP status.
type HealthCheckError struct{}

// Error implements the error interface.
func (*HealthCheckError) Error() string {
	const msg = "health check failed"
	return msg
}

// ErrHealthCheckFailed is the shared HealthCheckError value returned by the
// adapters' HealthCheck methods.
var ErrHealthCheckFailed = new(HealthCheckError)
// decodeJSON decodes the first JSON value in resp.Body into v.
// It does not close the body; that stays the caller's responsibility.
func decodeJSON(resp *http.Response, v interface{}) error {
	dec := json.NewDecoder(resp.Body)
	return dec.Decode(v)
}

View File

@@ -0,0 +1,38 @@
package poller
import (
"context"
"supply-intelligence/internal/gatewayconsumer"
)
// GatewayPackagePoller pulls package change events through a gateway consumer
// service and remembers the cursor returned by the last successful poll.
type GatewayPackagePoller struct {
	consumer *gatewayconsumer.Service
	cursor   string
}

// NewGatewayPackagePoller returns a poller bound to consumer, starting from an
// empty cursor.
func NewGatewayPackagePoller(consumer *gatewayconsumer.Service) *GatewayPackagePoller {
	p := new(GatewayPackagePoller)
	p.consumer = consumer
	return p
}
// PollOnce runs one consume cycle as consumer "gateway" from the stored cursor.
// On success the cursor advances to the returned NextCursor. On failure the
// cursor is left untouched and a zero output is returned with the error.
// A nil poller or nil consumer yields gatewayconsumer.ErrInvalidConsumeInput.
func (p *GatewayPackagePoller) PollOnce(ctx context.Context) (gatewayconsumer.ConsumeOnceOutput, error) {
	var none gatewayconsumer.ConsumeOnceOutput
	if p == nil || p.consumer == nil {
		return none, gatewayconsumer.ErrInvalidConsumeInput
	}

	request := gatewayconsumer.ConsumeOnceInput{
		Consumer: "gateway",
		Cursor:   p.cursor,
	}
	result, err := p.consumer.ConsumeOnce(ctx, request)
	if err != nil {
		return none, err
	}

	p.cursor = result.NextCursor
	return result, nil
}
// Cursor returns the cursor recorded by the last successful poll, or "" for a
// nil poller.
func (p *GatewayPackagePoller) Cursor() string {
	if p != nil {
		return p.cursor
	}
	return ""
}

View File

@@ -0,0 +1,28 @@
package poller
import (
"context"
"testing"
"time"
"supply-intelligence/internal/domain"
"supply-intelligence/internal/gatewayconsumer"
"supply-intelligence/internal/repository"
)
func TestGatewayPackagePollerPollOnce(t *testing.T) {
repo := repository.NewMemoryRepository()
repo.AppendPackageEvent(domain.PackageChangeEvent{EventID: "evt-1", EventType: "supply_package_published", PackageID: 1, Platform: "openai", Model: "gpt-4.1-mini", OccurredAt: time.Unix(1, 0).UTC(), Version: 1, GatewaySyncStatus: domain.GatewaySyncStatusPending})
poller := NewGatewayPackagePoller(gatewayconsumer.NewService(repo))
out, err := poller.PollOnce(context.Background())
if err != nil {
t.Fatalf("unexpected poll error: %v", err)
}
if len(out.Items) != 1 || out.Items[0].EventID != "evt-1" {
t.Fatalf("unexpected output: %+v", out)
}
if poller.Cursor() != out.NextCursor {
t.Fatalf("expected cursor to advance: poller=%q out=%q", poller.Cursor(), out.NextCursor)
}
}

View File

@@ -0,0 +1,53 @@
package poller
import (
"context"
"sync"
"time"
)
// Runtime runs a GatewayPackagePoller periodically on a background goroutine.
// NOTE(review): cancel is read and written by Start and Stop without any
// locking, so they appear intended to be called from a single goroutine — confirm.
type Runtime struct {
poller *GatewayPackagePoller // poller invoked on every tick
interval time.Duration // delay between polls
cancel context.CancelFunc // non-nil while started; stops the polling goroutine
wg sync.WaitGroup // tracks the polling goroutine so Stop can wait for it
}
// NewRuntime returns a Runtime that polls with poller every interval.
// A non-positive interval falls back to one second.
func NewRuntime(poller *GatewayPackagePoller, interval time.Duration) *Runtime {
	rt := &Runtime{poller: poller, interval: time.Second}
	if interval > 0 {
		rt.interval = interval
	}
	return rt
}
// Start launches the background polling goroutine and reports whether it did.
// It refuses (returns false) for a nil runtime, a runtime without a poller, or
// one that is already started. The goroutine polls immediately and then once
// per interval until parent is cancelled or Stop is called. Poll results and
// errors are intentionally discarded (best effort).
func (r *Runtime) Start(parent context.Context) bool {
	if r == nil {
		return false
	}
	if r.poller == nil || r.cancel != nil {
		return false
	}

	loopCtx, stop := context.WithCancel(parent)
	r.cancel = stop
	r.wg.Add(1)

	go func() {
		defer r.wg.Done()

		tick := time.NewTicker(r.interval)
		defer tick.Stop()

		for {
			_, _ = r.poller.PollOnce(loopCtx)

			select {
			case <-tick.C:
				continue
			case <-loopCtx.Done():
				return
			}
		}
	}()

	return true
}
// Stop cancels the polling goroutine, waits for it to exit, and marks the
// runtime as stopped so it can be started again. It is a no-op on a nil or
// not-started runtime.
func (r *Runtime) Stop() {
	if r == nil {
		return
	}
	if r.cancel == nil {
		return
	}
	r.cancel()
	r.wg.Wait()
	r.cancel = nil
}

View File

@@ -0,0 +1,54 @@
package poller
import (
"context"
"testing"
"time"
"supply-intelligence/internal/domain"
"supply-intelligence/internal/gatewayconsumer"
"supply-intelligence/internal/repository"
)
func TestRuntimeStartsBackgroundPolling(t *testing.T) {
repo := repository.NewMemoryRepository()
repo.AppendPackageEvent(domain.PackageChangeEvent{
EventID: "evt-runtime-1",
EventType: "supply_package_published",
PackageID: 1,
Platform: "openai",
Model: "gpt-4.1-mini",
OccurredAt: time.Unix(1, 0).UTC(),
Version: 1,
GatewaySyncStatus: domain.GatewaySyncStatusPending,
})
service := gatewayconsumer.NewService(repo)
poller := NewGatewayPackagePoller(service)
runtime := NewRuntime(poller, 10*time.Millisecond)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
if !runtime.Start(ctx) {
t.Fatalf("expected runtime to start")
}
defer runtime.Stop()
deadline := time.Now().Add(500 * time.Millisecond)
for time.Now().Before(deadline) {
items, _ := repo.ListPackageEventsAfter("")
if len(items) == 1 && items[0].GatewaySyncStatus == domain.GatewaySyncStatusApplied {
return
}
time.Sleep(10 * time.Millisecond)
}
items, _ := repo.ListPackageEventsAfter("")
t.Fatalf("expected background polling to apply event, got %+v", items)
}
// TestRuntimeStartRequiresPoller verifies that a zero-value Runtime (no poller)
// refuses to start.
func TestRuntimeStartRequiresPoller(t *testing.T) {
	empty := new(Runtime)
	if started := empty.Start(context.Background()); started {
		t.Fatalf("expected runtime without poller to refuse start")
	}
}

138
internal/probe/driver.go Normal file
View File

@@ -0,0 +1,138 @@
package probe
import (
"context"
"log"
"time"
"github.com/google/uuid"
"supply-intelligence/internal/domain"
"supply-intelligence/internal/integration"
)
// ProbeLogRepository defines where probe execution logs are persisted.
type ProbeLogRepository interface {
// AppendProbeLog records the outcome of one probe execution.
AppendProbeLog(ctx context.Context, outcome ProbeOutcome) error
}
// Driver orchestrates a full probe run: load targets → execute → evaluate → persist state
type Driver struct {
executor *ProbeExecutor // generic HTTP probe used when no platform adapter matches
evaluator *Service // reuse the existing probe.Service as evaluator
logRepo ProbeLogRepository // optional; probe logs are skipped when nil
adapters map[string]integration.SupplierAdapter // platform-specific probes keyed by platform name
now func() time.Time // clock used for ExecutedAt on adapter probes
}
// NewDriver creates a probe driver with all dependencies wired together.
//
// repo backs the evaluator that persists routing state, logRepo (may be nil)
// receives probe execution logs, and adapters maps a platform name to its
// platform-specific probe. Accounts whose platform has no adapter are probed
// with a generic HTTP executor.
func NewDriver(
	repo RoutingStateRepository,
	logRepo ProbeLogRepository,
	adapters map[string]integration.SupplierAdapter,
) *Driver {
	return &Driver{
		// Use this package's client, which carries a 30s timeout. The
		// integration package's default client delegates to http.DefaultClient,
		// which has no timeout and could let a stuck supplier endpoint block a
		// probe run indefinitely.
		executor:  NewProbeExecutor(NewDefaultHTTPClient()),
		evaluator: NewService(repo),
		logRepo:   logRepo,
		adapters:  adapters,
		now:       func() time.Time { return time.Now().UTC() },
	}
}
// RunProbeForAccount probes a single account and persists the result through the full chain.
//
// When an adapter is registered for account.Platform it performs the probe;
// otherwise a generic GET is sent to account.Endpoint (falling back to
// account.BaseURL) with a Bearer authorization header. The outcome is then
// evaluated, persisted and logged by persistOutcome.
//
// It returns an error when the generic probe target is invalid or when
// evaluating the outcome fails. Transport failures are not errors here: they
// are carried inside the outcome and classified downstream.
func (d *Driver) RunProbeForAccount(ctx context.Context, account integration.SupplierAccount) error {
	var outcome ProbeOutcome

	// A nil adapter registered in the map is treated like a missing one rather
	// than dereferenced.
	if adapter, ok := d.adapters[account.Platform]; ok && adapter != nil {
		// Use platform-specific adapter. The timestamp is taken before the
		// request and the round trip is timed, mirroring what the generic
		// executor records, so adapter outcomes no longer report a zero latency.
		executedAt := d.now()
		start := time.Now()
		result := adapter.ProbeAccount(ctx, account)
		latencyMs := int(time.Since(start).Milliseconds())

		outcome = ProbeOutcome{
			AccountID:      account.AccountID,
			Platform:       account.Platform,
			StatusCode:     result.StatusCode,
			TransportError: result.TransportError,
			LatencyMs:      latencyMs,
			ResponseBody:   result.ResponseBody,
			RequestID:      "prb-" + uuid.New().String(),
			ExecutedAt:     executedAt,
		}
	} else {
		// Fall back to generic HTTP probe
		target := ProbeTarget{
			AccountID:  account.AccountID,
			Platform:   account.Platform,
			Endpoint:   account.Endpoint,
			AuthHeader: "Bearer " + account.APIKey,
		}
		if target.Endpoint == "" {
			target.Endpoint = account.BaseURL
		}
		var err error
		outcome, err = d.executor.ExecuteProbe(ctx, target)
		if err != nil {
			return err
		}
	}

	return d.persistOutcome(ctx, account.AccountID, account.Platform, outcome)
}
// persistOutcome drives the outcome through: load current state → evaluate → state machine → persist
//
// The evaluator classifies the HTTP result, derives the next account status
// from the current one and upserts the routing state. The probe execution is
// then appended to the log repository (when configured) and the transition is
// logged. Only an evaluation failure is returned as an error; a failure to
// append the probe log is reported in the process log but does not fail the
// probe, because the routing state has already been persisted at that point.
func (d *Driver) persistOutcome(ctx context.Context, accountID int64, platform string, outcome ProbeOutcome) error {
	// 1. Load current routing state. A missing state yields the zero value,
	// i.e. an empty current status.
	currentState, _ := d.evaluator.repo.GetRoutingStateContext(ctx, accountID)

	// 2. Build evaluate input
	input := EvaluateInput{
		AccountID:      accountID,
		Platform:       platform,
		CurrentStatus:  currentState.AccountStatus,
		StatusCode:     outcome.StatusCode,
		TransportError: outcome.TransportError,
	}

	// 3. Evaluate (uses the existing Service.EvaluateHTTPResult)
	evalOutput, err := d.evaluator.EvaluateHTTPResult(ctx, input)
	if err != nil {
		log.Printf("[probe] failed to evaluate outcome for account %d: %v", accountID, err)
		return err
	}

	// 4. Log the probe execution. The response body is not copied into the
	// log entry.
	if d.logRepo != nil {
		logEntry := ProbeOutcome{
			AccountID:      accountID,
			Platform:       platform,
			StatusCode:     outcome.StatusCode,
			TransportError: outcome.TransportError,
			LatencyMs:      outcome.LatencyMs,
			RequestID:      outcome.RequestID,
			ExecutedAt:     outcome.ExecutedAt,
		}
		// Best effort: surface the failure instead of dropping it silently.
		if appendErr := d.logRepo.AppendProbeLog(ctx, logEntry); appendErr != nil {
			log.Printf("[probe] failed to append probe log for account %d request=%s: %v",
				accountID, outcome.RequestID, appendErr)
		}
	}

	// 5. Log state transition
	transition := describeTransition(currentState.AccountStatus, evalOutput.RoutingState.AccountStatus)
	log.Printf("[probe] account=%d platform=%s %s->%s classification=%s risk=%d transition=%s",
		accountID, platform,
		currentState.AccountStatus, evalOutput.RoutingState.AccountStatus,
		evalOutput.Classification, evalOutput.RoutingState.RiskScore,
		transition)

	return nil
}
// describeTransition renders a status change as "<from>_to_<to>", or
// "no_change" when both statuses are equal.
func describeTransition(from, to domain.AccountStatus) string {
	if from != to {
		return string(from) + "_to_" + string(to)
	}
	return "no_change"
}

View File

@@ -0,0 +1,44 @@
package probe
import (
"errors"
"fmt"
"net/http"
"supply-intelligence/internal/domain"
)
// ErrUnknownStatusCode is wrapped by ClassifyHTTPResult when a status code
// fits none of the known classes.
var ErrUnknownStatusCode = errors.New("unknown probe status code")

// ClassifyHTTPResult maps a probe's HTTP status (or transport failure) to a
// classification and a reason code.
//
//   - any transport error          → inconclusive, "transport_error"
//   - 200                          → success, "ok"
//   - 401, 403                     → explicit failure, "auth_rejected"
//   - 429 and every status >= 500  → inconclusive, "upstream_unstable"
//   - any other status >= 400      → inconclusive, "unexpected_client_error"
//
// Every remaining status (including non-200 2xx and 3xx) returns an error
// wrapping ErrUnknownStatusCode.
func ClassifyHTTPResult(statusCode int, transportErr error) (domain.ProbeClassification, string, error) {
	switch {
	case transportErr != nil:
		return domain.ProbeClassificationInconclusive, "transport_error", nil
	case statusCode == http.StatusOK:
		return domain.ProbeClassificationSuccess, "ok", nil
	case statusCode == http.StatusUnauthorized || statusCode == http.StatusForbidden:
		return domain.ProbeClassificationExplicitFailure, "auth_rejected", nil
	case statusCode == http.StatusTooManyRequests || statusCode >= http.StatusInternalServerError:
		return domain.ProbeClassificationInconclusive, "upstream_unstable", nil
	case statusCode >= http.StatusBadRequest:
		return domain.ProbeClassificationInconclusive, "unexpected_client_error", nil
	default:
		return "", "", fmt.Errorf("%w: %d", ErrUnknownStatusCode, statusCode)
	}
}

View File

@@ -0,0 +1,47 @@
package probe
import (
"errors"
"testing"
"supply-intelligence/internal/domain"
)
// TestClassifyHTTPResult is a table test over the main status classes and the
// transport-error path of ClassifyHTTPResult.
func TestClassifyHTTPResult(t *testing.T) {
	type testCase struct {
		name       string
		statusCode int
		err        error
		wantClass  domain.ProbeClassification
		wantReason string
		wantErr    bool
	}
	cases := []testCase{
		{name: "200 success", statusCode: 200, wantClass: domain.ProbeClassificationSuccess, wantReason: "ok"},
		{name: "401 explicit failure", statusCode: 401, wantClass: domain.ProbeClassificationExplicitFailure, wantReason: "auth_rejected"},
		{name: "403 explicit failure", statusCode: 403, wantClass: domain.ProbeClassificationExplicitFailure, wantReason: "auth_rejected"},
		{name: "429 inconclusive", statusCode: 429, wantClass: domain.ProbeClassificationInconclusive, wantReason: "upstream_unstable"},
		{name: "503 inconclusive", statusCode: 503, wantClass: domain.ProbeClassificationInconclusive, wantReason: "upstream_unstable"},
		{name: "transport error inconclusive", err: errors.New("timeout"), wantClass: domain.ProbeClassificationInconclusive, wantReason: "transport_error"},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			class, reason, err := ClassifyHTTPResult(tc.statusCode, tc.err)

			if tc.wantErr {
				if err == nil {
					t.Fatalf("expected error, got nil")
				}
				return
			}
			if err != nil {
				t.Fatalf("unexpected error: %v", err)
			}
			if class != tc.wantClass {
				t.Fatalf("classification mismatch: got %q want %q", class, tc.wantClass)
			}
			if reason != tc.wantReason {
				t.Fatalf("reason mismatch: got %q want %q", reason, tc.wantReason)
			}
		})
	}
}

125
internal/probe/executor.go Normal file
View File

@@ -0,0 +1,125 @@
package probe
import (
"context"
"errors"
"fmt"
"io"
"net/http"
"time"
"github.com/google/uuid"
)
// HTTPClient defines the interface for making HTTP requests during probing.
// *http.Client and *DefaultHTTPClient both satisfy it.
type HTTPClient interface {
// Do sends req and returns the response, mirroring (*http.Client).Do.
Do(req *http.Request) (*http.Response, error)
}
// DefaultHTTPClient wraps a standard *http.Client. Build it with
// NewDefaultHTTPClient; the zero value has no underlying client.
type DefaultHTTPClient struct {
	client *http.Client
}

// NewDefaultHTTPClient returns a client whose requests time out after 30 seconds.
func NewDefaultHTTPClient() *DefaultHTTPClient {
	const probeTimeout = 30 * time.Second

	inner := new(http.Client)
	inner.Timeout = probeTimeout
	return &DefaultHTTPClient{client: inner}
}

// Do forwards req to the wrapped *http.Client.
func (c *DefaultHTTPClient) Do(req *http.Request) (*http.Response, error) {
	resp, err := c.client.Do(req)
	return resp, err
}
// ProbeTarget represents an account to be probed by the generic HTTP executor.
type ProbeTarget struct {
AccountID int64
Platform string
Endpoint string // URL requested with GET; must not be empty
AuthHeader string // full Authorization header value (e.g. "Bearer <key>"); omitted when empty
}
// ProbeOutcome is the result of executing a probe against a target.
// Either TransportError is set (no response was obtained) or StatusCode and
// ResponseBody describe the HTTP response.
type ProbeOutcome struct {
AccountID int64
Platform string
StatusCode int // HTTP status; 0 when the request failed at transport level
TransportError error // non-nil when the request could not be completed
LatencyMs int // round-trip time of the request in milliseconds
ResponseBody string // truncated, for debugging
RequestID string // identifier generated per probe execution
ExecutedAt time.Time // when the probe was started (UTC with the default clock)
}
// ProbeExecutor sends HTTP requests to supplier endpoints and records the raw
// outcome (status, latency, truncated body). It does not classify results
// itself; classification is done separately by ClassifyHTTPResult.
type ProbeExecutor struct {
httpClient HTTPClient // transport used for probe requests
now func() time.Time // clock used for ProbeOutcome.ExecutedAt
}
// NewProbeExecutor creates a probe executor with the given HTTP client.
// If client is nil, it falls back to NewDefaultHTTPClient, whose requests time
// out after 30 seconds. The previous fallback, http.DefaultClient, has no
// timeout, so a stalled endpoint could block a probe indefinitely when the
// caller's context carries no deadline.
func NewProbeExecutor(client HTTPClient) *ProbeExecutor {
	if client == nil {
		client = NewDefaultHTTPClient()
	}
	return &ProbeExecutor{
		httpClient: client,
		now:        func() time.Time { return time.Now().UTC() },
	}
}
// ExecuteProbe runs a single probe against the target account by sending an
// HTTP GET to target.Endpoint.
//
// An empty or unparsable endpoint returns ErrInvalidProbeTarget (wrapped in the
// latter case) with a zero outcome. Transport failures are NOT returned as
// errors: they are stored in the outcome's TransportError together with the
// measured latency. On a response, the outcome carries the status code and at
// most the first 1024 bytes of the body.
func (e *ProbeExecutor) ExecuteProbe(ctx context.Context, target ProbeTarget) (ProbeOutcome, error) {
	id := uuid.New().String()
	startedAt := e.now()

	if target.Endpoint == "" {
		return ProbeOutcome{}, ErrInvalidProbeTarget
	}

	req, buildErr := http.NewRequestWithContext(ctx, http.MethodGet, target.Endpoint, nil)
	if buildErr != nil {
		return ProbeOutcome{}, fmt.Errorf("%w: %v", ErrInvalidProbeTarget, buildErr)
	}

	headers := req.Header
	headers.Set("User-Agent", "supply-intelligence-probe/1.0")
	headers.Set("Accept", "application/json")
	if target.AuthHeader != "" {
		headers.Set("Authorization", target.AuthHeader)
	}

	began := time.Now()
	resp, doErr := e.httpClient.Do(req)
	elapsed := time.Since(began)

	result := ProbeOutcome{
		AccountID:  target.AccountID,
		Platform:   target.Platform,
		LatencyMs:  int(elapsed.Milliseconds()),
		RequestID:  id,
		ExecutedAt: startedAt,
	}

	switch {
	case doErr != nil:
		// Report the transport failure inside the outcome, not as an error.
		result.TransportError = doErr
	case resp != nil:
		defer resp.Body.Close()
		result.StatusCode = resp.StatusCode
		// Read truncated body for debugging (max 1KB)
		raw, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
		result.ResponseBody = string(raw)
	}
	return result, nil
}

// ErrInvalidProbeTarget is returned when a probe target has no usable endpoint.
var ErrInvalidProbeTarget = errors.New("invalid probe target")

View File

@@ -0,0 +1,219 @@
package probe
import (
"context"
"errors"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
)
// mockHTTPClient is an HTTPClient test double that returns the configured
// response and error without touching the network.
type mockHTTPClient struct {
	Resp *http.Response
	Err  error
}

// Do returns the request context's error when that context is already done
// (simulating cancellation); otherwise it returns the canned Resp and Err.
func (m *mockHTTPClient) Do(req *http.Request) (*http.Response, error) {
	if ctxErr := req.Context().Err(); ctxErr != nil {
		return nil, ctxErr
	}
	return m.Resp, m.Err
}
func TestProbeExecutor_ExecuteProbe_Success(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write([]byte(`{"status":"ok"}`))
}))
defer server.Close()
executor := NewProbeExecutor(nil) // nil → uses real http.Client
outcome, err := executor.ExecuteProbe(context.Background(), ProbeTarget{
AccountID: 1,
Platform: "openai",
Endpoint: server.URL,
AuthHeader: "Bearer test-key",
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if outcome.StatusCode != http.StatusOK {
t.Fatalf("expected 200, got: %d", outcome.StatusCode)
}
if outcome.LatencyMs < 0 {
t.Fatalf("expected latency >= 0, got: %d", outcome.LatencyMs)
}
if outcome.RequestID == "" {
t.Fatal("expected request_id to be set")
}
}
func TestProbeExecutor_ExecuteProbe_ExplicitFailure(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusUnauthorized)
}))
defer server.Close()
executor := NewProbeExecutor(nil)
outcome, err := executor.ExecuteProbe(context.Background(), ProbeTarget{
AccountID: 2,
Platform: "openai",
Endpoint: server.URL,
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if outcome.StatusCode != http.StatusUnauthorized {
t.Fatalf("expected 401, got: %d", outcome.StatusCode)
}
}
func TestProbeExecutor_ExecuteProbe_Inconclusive_429(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusTooManyRequests)
}))
defer server.Close()
executor := NewProbeExecutor(nil)
outcome, err := executor.ExecuteProbe(context.Background(), ProbeTarget{
AccountID: 3,
Platform: "openai",
Endpoint: server.URL,
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if outcome.StatusCode != http.StatusTooManyRequests {
t.Fatalf("expected 429, got: %d", outcome.StatusCode)
}
}
// TestProbeExecutor_ExecuteProbe_TransportError checks that a client-level
// failure lands in the outcome's TransportError with a zero status code.
func TestProbeExecutor_ExecuteProbe_TransportError(t *testing.T) {
	failing := &mockHTTPClient{Err: errors.New("connection refused")}
	target := ProbeTarget{
		AccountID: 4,
		Platform:  "openai",
		Endpoint:  "http://localhost:9999",
	}

	got, err := NewProbeExecutor(failing).ExecuteProbe(context.Background(), target)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if got.TransportError == nil {
		t.Fatal("expected transport error to be set")
	}
	if got.StatusCode != 0 {
		t.Fatalf("expected status 0 on transport error, got: %d", got.StatusCode)
	}
}
func TestProbeExecutor_ExecuteProbe_InvalidTarget(t *testing.T) {
executor := NewProbeExecutor(nil)
_, err := executor.ExecuteProbe(context.Background(), ProbeTarget{
AccountID: 5,
Platform: "openai",
Endpoint: "", // empty endpoint
})
if err == nil {
t.Fatal("expected error for empty endpoint")
}
}
// TestProbeExecutor_ExecuteProbe_ContextCanceled checks that a request whose
// context expires before the server answers is reported as a transport error
// in the outcome rather than as a returned error.
func TestProbeExecutor_ExecuteProbe_ContextCanceled(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Stall far longer than the client's deadline, but stop as soon as the
		// client disconnects. httptest.Server.Close waits for in-flight
		// handlers, so an unconditional 5s sleep made this test always take
		// about five seconds.
		select {
		case <-r.Context().Done():
		case <-time.After(5 * time.Second):
			w.WriteHeader(http.StatusOK)
		}
	}))
	defer server.Close()

	executor := NewProbeExecutor(nil)
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
	defer cancel()

	outcome, err := executor.ExecuteProbe(ctx, ProbeTarget{
		AccountID: 6,
		Platform:  "openai",
		Endpoint:  server.URL,
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if outcome.TransportError == nil {
		t.Fatal("expected context deadline exceeded transport error")
	}
}
func TestProbeExecutor_ExecuteProbe_ResponseBodyTruncated(t *testing.T) {
largeBody := strings.Repeat("x", 10*1024) // 10KB
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write([]byte(largeBody))
}))
defer server.Close()
executor := NewProbeExecutor(nil)
outcome, err := executor.ExecuteProbe(context.Background(), ProbeTarget{
AccountID: 7,
Platform: "openai",
Endpoint: server.URL,
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(outcome.ResponseBody) > 1024 {
t.Fatalf("expected body truncated to <=1024, got: %d", len(outcome.ResponseBody))
}
}
func TestProbeExecutor_SetsUserAgentAndAcceptHeader(t *testing.T) {
var receivedHeaders http.Header
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
receivedHeaders = r.Header.Clone()
w.WriteHeader(http.StatusOK)
}))
defer server.Close()
executor := NewProbeExecutor(nil)
_, _ = executor.ExecuteProbe(context.Background(), ProbeTarget{
AccountID: 8,
Platform: "openai",
Endpoint: server.URL,
AuthHeader: "Bearer my-key",
})
if receivedHeaders == nil {
t.Fatal("server handler was not called — check test setup")
}
if receivedHeaders.Get("User-Agent") == "" {
t.Fatal("expected User-Agent header to be set")
}
if receivedHeaders.Get("Accept") != "application/json" {
t.Fatalf("expected Accept: application/json, got: %s", receivedHeaders.Get("Accept"))
}
if receivedHeaders.Get("Authorization") != "Bearer my-key" {
t.Fatalf("expected Authorization header to be set")
}
}

95
internal/probe/service.go Normal file
View File

@@ -0,0 +1,95 @@
package probe
import (
"context"
"time"
"supply-intelligence/internal/domain"
)
// RoutingStateRepository is the storage the probe service needs for account
// routing state.
type RoutingStateRepository interface {
// GetRoutingStateContext returns the stored state for accountID; the bool
// reports whether a state exists.
GetRoutingStateContext(ctx context.Context, accountID int64) (domain.AccountRoutingState, bool)
// UpsertRoutingStateContext stores state and returns the value as persisted.
UpsertRoutingStateContext(ctx context.Context, state domain.AccountRoutingState) domain.AccountRoutingState
}
// Service evaluates probe results and persists the resulting routing state.
type Service struct {
repo RoutingStateRepository // routing state storage
now func() time.Time // clock used for LastProbeAt; replaceable in tests
}
// EvaluateInput is the raw probe result to evaluate for one account.
type EvaluateInput struct {
AccountID int64
Platform string // falls back to the previously stored platform when empty
CurrentStatus domain.AccountStatus // status the state machine transitions from
StatusCode int // HTTP status of the probe response
TransportError error // non-nil when the probe got no response
}
// EvaluateOutput is the result of evaluating one probe: the classification,
// its reason code, and the routing state as persisted.
type EvaluateOutput struct {
Classification domain.ProbeClassification `json:"classification"`
ReasonCode string `json:"reason_code"`
RoutingState domain.AccountRoutingState `json:"routing_state"`
}
// NewService returns a Service backed by repo that timestamps evaluations with
// the current UTC time.
func NewService(repo RoutingStateRepository) *Service {
	utcNow := func() time.Time { return time.Now().UTC() }
	return &Service{repo: repo, now: utcNow}
}
// EvaluateHTTPResult classifies a probe result, derives the next account status
// from input.CurrentStatus, and upserts the resulting routing state.
//
// Routing is enabled only when the next status is active. The version starts at
// 1 and becomes previous+1 when a state already exists; an empty input platform
// is filled from that previous state. A classification error is returned
// unchanged and nothing is persisted in that case.
func (s *Service) EvaluateHTTPResult(ctx context.Context, input EvaluateInput) (EvaluateOutput, error) {
	classification, reasonCode, err := ClassifyHTTPResult(input.StatusCode, input.TransportError)
	if err != nil {
		return EvaluateOutput{}, err
	}

	probedAt := s.now()
	status := NextAccountStatus(input.CurrentStatus, classification)

	next := domain.AccountRoutingState{
		AccountID:      input.AccountID,
		Platform:       input.Platform,
		AccountStatus:  status,
		RoutingEnabled: status == domain.AccountStatusActive,
		RiskScore:      riskScoreFor(status, classification),
		ReasonCode:     reasonCode,
		LastProbeAt:    probedAt,
		Version:        1,
	}

	// Continue the version sequence of an existing record and inherit its
	// platform when the caller did not supply one.
	previous, found := s.repo.GetRoutingStateContext(ctx, input.AccountID)
	if found {
		next.Version = previous.Version + 1
		if next.Platform == "" {
			next.Platform = previous.Platform
		}
	}

	var out EvaluateOutput
	out.Classification = classification
	out.ReasonCode = reasonCode
	out.RoutingState = s.repo.UpsertRoutingStateContext(ctx, next)
	return out, nil
}
// riskScoreFor maps a classification (and, for explicit failures, the resulting
// status) to a risk score: success 20, inconclusive 60, explicit failure 100
// when disabled, 90 when suspended and 80 otherwise; anything else scores 0.
func riskScoreFor(status domain.AccountStatus, classification domain.ProbeClassification) int {
	if classification == domain.ProbeClassificationSuccess {
		return 20
	}
	if classification == domain.ProbeClassificationInconclusive {
		return 60
	}
	if classification != domain.ProbeClassificationExplicitFailure {
		return 0
	}

	// Explicit failure: the score depends on how far the account has degraded.
	switch status {
	case domain.AccountStatusDisabled:
		return 100
	case domain.AccountStatusSuspended:
		return 90
	}
	return 80
}

View File

@@ -0,0 +1,115 @@
package probe
import (
"context"
"errors"
"testing"
"time"
"supply-intelligence/internal/domain"
"supply-intelligence/internal/repository"
)
func TestServiceEvaluateHTTPResultSuccess(t *testing.T) {
repo := repository.NewMemoryRepository()
service := NewService(repo)
service.now = func() time.Time { return time.Unix(1000, 0).UTC() }
result, err := service.EvaluateHTTPResult(context.Background(), EvaluateInput{
AccountID: 1,
Platform: "openai",
CurrentStatus: domain.AccountStatusSuspended,
StatusCode: 200,
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if result.Classification != domain.ProbeClassificationSuccess {
t.Fatalf("unexpected classification: %q", result.Classification)
}
if result.RoutingState.AccountStatus != domain.AccountStatusActive {
t.Fatalf("unexpected account status: %q", result.RoutingState.AccountStatus)
}
if !result.RoutingState.RoutingEnabled {
t.Fatalf("expected routing enabled")
}
if result.RoutingState.ReasonCode != "ok" {
t.Fatalf("unexpected reason code: %q", result.RoutingState.ReasonCode)
}
if result.RoutingState.Version != 1 {
t.Fatalf("unexpected version: %d", result.RoutingState.Version)
}
}
// TestServiceEvaluateHTTPResultExplicitFailure checks that a 401 probe on an
// active account at version 4 suspends it, disables routing and bumps the
// version to 5.
func TestServiceEvaluateHTTPResultExplicitFailure(t *testing.T) {
	store := repository.NewMemoryRepository()
	svc := NewService(store)
	svc.now = func() time.Time { return time.Unix(1001, 0).UTC() }

	existing := domain.AccountRoutingState{
		AccountID:      2,
		Platform:       "openai",
		AccountStatus:  domain.AccountStatusActive,
		RoutingEnabled: true,
		RiskScore:      20,
		ReasonCode:     "ok",
		LastProbeAt:    time.Unix(999, 0).UTC(),
		Version:        4,
	}
	store.UpsertRoutingState(existing)

	input := EvaluateInput{
		AccountID:     2,
		Platform:      "openai",
		CurrentStatus: domain.AccountStatusActive,
		StatusCode:    401,
	}
	out, err := svc.EvaluateHTTPResult(context.Background(), input)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if out.Classification != domain.ProbeClassificationExplicitFailure {
		t.Fatalf("unexpected classification: %q", out.Classification)
	}
	state := out.RoutingState
	if state.AccountStatus != domain.AccountStatusSuspended {
		t.Fatalf("unexpected account status: %q", state.AccountStatus)
	}
	if state.RoutingEnabled {
		t.Fatalf("expected routing disabled")
	}
	if state.ReasonCode != "auth_rejected" {
		t.Fatalf("unexpected reason code: %q", state.ReasonCode)
	}
	if state.Version != 5 {
		t.Fatalf("unexpected version: %d", state.Version)
	}
}
func TestServiceEvaluateHTTPResultInconclusive(t *testing.T) {
repo := repository.NewMemoryRepository()
service := NewService(repo)
service.now = func() time.Time { return time.Unix(1002, 0).UTC() }
result, err := service.EvaluateHTTPResult(context.Background(), EvaluateInput{
AccountID: 3,
Platform: "openai",
CurrentStatus: domain.AccountStatusSuspended,
TransportError: errors.New("dial tcp timeout"),
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if result.Classification != domain.ProbeClassificationInconclusive {
t.Fatalf("unexpected classification: %q", result.Classification)
}
if result.RoutingState.AccountStatus != domain.AccountStatusSuspended {
t.Fatalf("unexpected account status: %q", result.RoutingState.AccountStatus)
}
if result.RoutingState.RoutingEnabled {
t.Fatalf("expected routing disabled for suspended account")
}
if result.RoutingState.ReasonCode != "transport_error" {
t.Fatalf("unexpected reason code: %q", result.RoutingState.ReasonCode)
}
if result.RoutingState.RiskScore != 60 {
t.Fatalf("unexpected risk score: %d", result.RoutingState.RiskScore)
}
}

View File

@@ -0,0 +1,23 @@
package probe
import "supply-intelligence/internal/domain"
// NextAccountStatus returns the account status that follows from observing
// classification while the account is in current.
//
//   - Success always yields AccountStatusActive, whatever current is.
//   - ExplicitFailure steps one level down: Active -> Suspended and
//     Suspended -> Disabled; any other current status is returned unchanged.
//   - Inconclusive, and any unrecognised classification, leaves current
//     unchanged.
//
// NOTE(review): a success result also reactivates an account whose current
// status is disabled — confirm that automatic recovery is intended.
func NextAccountStatus(current domain.AccountStatus, classification domain.ProbeClassification) domain.AccountStatus {
	if classification == domain.ProbeClassificationSuccess {
		return domain.AccountStatusActive
	}
	if classification != domain.ProbeClassificationExplicitFailure {
		// Inconclusive or unknown classification: never move the account.
		return current
	}

	// Explicit failure: demote by exactly one step.
	if current == domain.AccountStatusActive {
		return domain.AccountStatusSuspended
	}
	if current == domain.AccountStatusSuspended {
		return domain.AccountStatusDisabled
	}
	return current
}

View File

@@ -0,0 +1,30 @@
package probe
import (
"testing"
"supply-intelligence/internal/domain"
)
// TestNextAccountStatus is a table-driven check of the status transitions
// produced by NextAccountStatus for each probe classification.
func TestNextAccountStatus(t *testing.T) {
	type transition struct {
		name           string
		current        domain.AccountStatus
		classification domain.ProbeClassification
		want           domain.AccountStatus
	}
	cases := []transition{
		{name: "success keeps active", current: domain.AccountStatusActive, classification: domain.ProbeClassificationSuccess, want: domain.AccountStatusActive},
		{name: "explicit failure active to suspended", current: domain.AccountStatusActive, classification: domain.ProbeClassificationExplicitFailure, want: domain.AccountStatusSuspended},
		{name: "explicit failure suspended to disabled", current: domain.AccountStatusSuspended, classification: domain.ProbeClassificationExplicitFailure, want: domain.AccountStatusDisabled},
		{name: "inconclusive keeps active", current: domain.AccountStatusActive, classification: domain.ProbeClassificationInconclusive, want: domain.AccountStatusActive},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := NextAccountStatus(tc.current, tc.classification); got != tc.want {
				t.Fatalf("status mismatch: got %q want %q", got, tc.want)
			}
		})
	}
}

View File

@@ -0,0 +1,16 @@
# Publish semantics boundary
This package only records package-published events and emits gateway-consumable change records.
It does not implement a full publish state machine, admission workflow, or downstream routing synchronization.
Current repository boundary:
- `published` means the upstream package event has been recorded
- `pending` means the downstream gateway consumer has not yet confirmed handling
- `applied` / `failed` means the current repository's consumer flow updated event state during the running process
- current gateway event state in this repo is in-memory only, not durable across restart
Current runtime shape:
- manual/debug entry: `POST /internal/supply-intelligence/gateway/consume-once`
- minimal background path: application startup also runs a ticker-driven gateway poller
This avoids claiming that `published = applied`, and also avoids claiming that the current in-memory repository is a durable production persistence layer.

View File

@@ -0,0 +1,59 @@
package publish
import (
"context"
"errors"
"strings"
"time"
"supply-intelligence/internal/domain"
)
// PackagePublishedEventType is the EventType stamped on every event built by
// Service.RecordPackagePublished.
const PackagePublishedEventType = "supply_package_published"

// ErrInvalidPublishInput is returned by Service.RecordPackagePublished when
// the service or its repository is nil, or when a required input field is
// blank or non-positive.
var ErrInvalidPublishInput = errors.New("invalid publish input")
// PackageEventRepository is the storage dependency of Service. Whatever event
// and error it returns are passed straight back to the caller of
// RecordPackagePublished (presumably the event as stored).
type PackageEventRepository interface {
	AppendPackageEventContext(ctx context.Context, evt domain.PackageChangeEvent) (domain.PackageChangeEvent, error)
}
// Service records package-published events through a PackageEventRepository.
type Service struct {
	repo PackageEventRepository // destination for recorded events; must be non-nil to record
}
// RecordPackagePublishedInput carries the fields of a package-published event.
// RecordPackagePublished trims EventID, Platform and Model before use.
type RecordPackagePublishedInput struct {
	EventID    string    // required; must not be blank
	PackageID  int64     // required; must be > 0
	Platform   string    // required; must not be blank
	Model      string    // required; must not be blank
	Version    int64     // required; must be > 0
	OccurredAt time.Time // optional; the zero value is replaced by the current UTC time
}
// NewService returns a Service that appends events to repo. A nil repo is
// accepted here; RecordPackagePublished then fails with ErrInvalidPublishInput.
func NewService(repo PackageEventRepository) *Service {
	return &Service{repo: repo}
}
// RecordPackagePublished validates input, builds a PackageChangeEvent of type
// PackagePublishedEventType with GatewaySyncStatus pending, and appends it to
// the repository.
//
// EventID, Platform and Model are whitespace-trimmed and must be non-empty;
// PackageID and Version must be positive. OccurredAt is converted to UTC and
// defaults to the current UTC time when zero. Any violation, or a nil
// receiver or repository, yields ErrInvalidPublishInput and a zero event.
// Otherwise the repository's event and error are returned as-is.
func (s *Service) RecordPackagePublished(ctx context.Context, input RecordPackagePublishedInput) (domain.PackageChangeEvent, error) {
	if s == nil || s.repo == nil {
		return domain.PackageChangeEvent{}, ErrInvalidPublishInput
	}

	eventID := strings.TrimSpace(input.EventID)
	platform := strings.TrimSpace(input.Platform)
	model := strings.TrimSpace(input.Model)
	switch {
	case eventID == "", platform == "", model == "":
		return domain.PackageChangeEvent{}, ErrInvalidPublishInput
	case input.PackageID <= 0, input.Version <= 0:
		return domain.PackageChangeEvent{}, ErrInvalidPublishInput
	}

	occurredAt := input.OccurredAt.UTC()
	if occurredAt.IsZero() {
		occurredAt = time.Now().UTC()
	}

	return s.repo.AppendPackageEventContext(ctx, domain.PackageChangeEvent{
		EventID:           eventID,
		EventType:         PackagePublishedEventType,
		PackageID:         input.PackageID,
		Platform:          platform,
		Model:             model,
		OccurredAt:        occurredAt,
		Version:           input.Version,
		GatewaySyncStatus: domain.GatewaySyncStatusPending,
	})
}

View File

@@ -0,0 +1,66 @@
package publish
import (
"context"
"testing"
"time"
"supply-intelligence/internal/domain"
"supply-intelligence/internal/repository"
)
// TestServiceRecordPackagePublished records one valid event and verifies both
// the returned event and the single event stored in the memory repository.
func TestServiceRecordPackagePublished(t *testing.T) {
	repo := repository.NewMemoryRepository()
	svc := NewService(repo)
	occurredAt := time.Unix(1715000000, 0)

	in := RecordPackagePublishedInput{
		EventID:    "evt-publish-1",
		PackageID:  1001,
		Platform:   "openai",
		Model:      "gpt-4.1-mini",
		Version:    3,
		OccurredAt: occurredAt,
	}
	event, err := svc.RecordPackagePublished(context.Background(), in)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Returned event.
	if event.EventID != "evt-publish-1" || event.EventType != PackagePublishedEventType {
		t.Fatalf("unexpected event: %+v", event)
	}
	if !event.OccurredAt.Equal(occurredAt.UTC()) {
		t.Fatalf("unexpected occurred_at: %s", event.OccurredAt)
	}
	if event.GatewaySyncStatus != domain.GatewaySyncStatusPending {
		t.Fatalf("unexpected sync status: %q", event.GatewaySyncStatus)
	}

	// Stored event.
	stored := repo.ListPackageEvents()
	if len(stored) != 1 {
		t.Fatalf("unexpected items length: %d", len(stored))
	}
	first := stored[0]
	if first.EventID != event.EventID || first.Version != 3 {
		t.Fatalf("unexpected stored event: %+v", first)
	}
	if first.GatewaySyncStatus != domain.GatewaySyncStatusPending {
		t.Fatalf("unexpected stored sync status: %+v", first)
	}
}
// TestServiceRecordPackagePublishedRejectsInvalidInput submits an input whose
// required fields are all blank or zero and expects ErrInvalidPublishInput.
func TestServiceRecordPackagePublishedRejectsInvalidInput(t *testing.T) {
	svc := NewService(repository.NewMemoryRepository())
	invalid := RecordPackagePublishedInput{
		EventID:   " ",
		PackageID: 0,
		Platform:  "",
		Model:     "",
		Version:   0,
	}

	_, err := svc.RecordPackagePublished(context.Background(), invalid)
	if err == nil {
		t.Fatal("expected error")
	}
	if err != ErrInvalidPublishInput {
		t.Fatalf("unexpected error: %v", err)
	}
}

View File

@@ -0,0 +1,278 @@
package repository
import (
"context"
"errors"
"sort"
"strconv"
"sync"
"time"
"supply-intelligence/internal/domain"
)
// ErrEventNotFound is returned by AckPackageEvent when no package event with
// the given ID is stored.
var ErrEventNotFound = errors.New("event not found")
// IsGatewayAckResult reports whether result is one of the two recognised ack
// outcomes: applied or failed.
func IsGatewayAckResult(result domain.GatewayAckResult) bool {
	switch result {
	case domain.GatewayAckResultApplied, domain.GatewayAckResultFailed:
		return true
	default:
		return false
	}
}
// MemoryRepository is an in-process store for routing states, package change
// events, gateway applied snapshots, discovery candidates and supply packages.
// All maps are guarded by mu. Create instances with NewMemoryRepository so the
// maps are initialised.
type MemoryRepository struct {
	mu                  sync.RWMutex
	routingStates       map[int64]domain.AccountRoutingState     // key: AccountID
	packageEvents       map[string]domain.PackageChangeEvent     // key: EventID
	appliedSnapshot     map[string]domain.GatewayAppliedSnapshot // key: Consumer
	discoveryCandidates map[string]domain.DiscoveryCandidate     // key: CandidateID
	supplyPackages      map[string]domain.SupplyPackage          // key: platform+"_"+model
}
// NewMemoryRepository returns an empty repository with every internal map
// allocated and ready for writes.
func NewMemoryRepository() *MemoryRepository {
	repo := new(MemoryRepository)
	repo.routingStates = make(map[int64]domain.AccountRoutingState)
	repo.packageEvents = make(map[string]domain.PackageChangeEvent)
	repo.appliedSnapshot = make(map[string]domain.GatewayAppliedSnapshot)
	repo.discoveryCandidates = make(map[string]domain.DiscoveryCandidate)
	repo.supplyPackages = make(map[string]domain.SupplyPackage)
	return repo
}
// UpsertRoutingState stores state under its AccountID, replacing any previous
// entry for that account.
func (r *MemoryRepository) UpsertRoutingState(state domain.AccountRoutingState) {
	r.upsertRoutingState(state)
}
// UpsertRoutingStateContext is the context-accepting variant of
// UpsertRoutingState; the context is ignored. It returns the state as stored.
func (r *MemoryRepository) UpsertRoutingStateContext(_ context.Context, state domain.AccountRoutingState) domain.AccountRoutingState {
	return r.upsertRoutingState(state)
}
// upsertRoutingState writes state into routingStates under the write lock,
// keyed by state.AccountID, and returns the value it stored.
func (r *MemoryRepository) upsertRoutingState(state domain.AccountRoutingState) domain.AccountRoutingState {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.routingStates[state.AccountID] = state
	return state
}
// GetRoutingState returns the routing state stored for accountID; the boolean
// is false when the account has no entry.
func (r *MemoryRepository) GetRoutingState(accountID int64) (domain.AccountRoutingState, bool) {
	return r.getRoutingState(accountID)
}
// GetRoutingStateContext is the context-accepting variant of GetRoutingState;
// the context is ignored.
func (r *MemoryRepository) GetRoutingStateContext(_ context.Context, accountID int64) (domain.AccountRoutingState, bool) {
	return r.getRoutingState(accountID)
}
// getRoutingState looks up accountID in routingStates under the read lock and
// returns the stored value together with whether it was present.
func (r *MemoryRepository) getRoutingState(accountID int64) (domain.AccountRoutingState, bool) {
	r.mu.RLock()
	defer r.mu.RUnlock()
	state, ok := r.routingStates[accountID]
	return state, ok
}
// AppendPackageEvent stores evt using a background context and discards the
// results of AppendPackageEventContext.
func (r *MemoryRepository) AppendPackageEvent(evt domain.PackageChangeEvent) {
	// The in-memory AppendPackageEventContext always returns a nil error, so
	// there is nothing to report here.
	_, _ = r.AppendPackageEventContext(context.Background(), evt)
}
// AppendPackageEventContext stores evt keyed by its EventID and returns the
// event as stored. A zero OccurredAt is replaced by the current UTC time and
// an empty GatewaySyncStatus by pending. An existing event with the same
// EventID is overwritten. The context is ignored and the error is always nil.
func (r *MemoryRepository) AppendPackageEventContext(_ context.Context, evt domain.PackageChangeEvent) (domain.PackageChangeEvent, error) {
	r.mu.Lock()
	defer r.mu.Unlock()

	stored := evt
	if stored.OccurredAt.IsZero() {
		stored.OccurredAt = time.Now().UTC()
	}
	if stored.GatewaySyncStatus == "" {
		stored.GatewaySyncStatus = domain.GatewaySyncStatusPending
	}
	r.packageEvents[stored.EventID] = stored
	return stored, nil
}
// ListPackageEvents returns every stored package event in the order produced
// by ListPackageEventsAfter with an empty cursor; the next cursor is dropped.
func (r *MemoryRepository) ListPackageEvents() []domain.PackageChangeEvent {
	items, _ := r.ListPackageEventsAfter("")
	return items
}
// ListPackageEventsAfter returns the stored package events that follow cursor,
// together with the cursor to use for the next call.
//
// Events are ordered by OccurredAt ascending, with EventID as the tie-breaker.
// The cursor is interpreted as follows:
//   - "" returns every event;
//   - a decimal integer is an offset into the ordered list, clamped to
//     [0, len(events)];
//   - any other string is treated as the EventID of the last event already
//     seen; events after it are returned. If no event has that ID, the start
//     offset stays 0 and every event is returned.
//
// The returned cursor is the total number of stored events in decimal form
// (not the length of the returned slice). It is "" when there are no events
// or when nothing remains after cursor.
//
// NOTE(review): an EventID consisting only of digits would be read as an
// offset rather than an ID — confirm event IDs are never purely numeric.
func (r *MemoryRepository) ListPackageEventsAfter(cursor string) ([]domain.PackageChangeEvent, string) {
	r.mu.RLock()
	defer r.mu.RUnlock()
	// Snapshot the map into a slice so it can be ordered deterministically.
	items := make([]domain.PackageChangeEvent, 0, len(r.packageEvents))
	for _, evt := range r.packageEvents {
		items = append(items, evt)
	}
	sort.Slice(items, func(i, j int) bool {
		if items[i].OccurredAt.Equal(items[j].OccurredAt) {
			return items[i].EventID < items[j].EventID
		}
		return items[i].OccurredAt.Before(items[j].OccurredAt)
	})
	if cursor == "" {
		return items, nextCursorFor(items)
	}
	start := 0
	if idx, err := strconv.Atoi(cursor); err == nil {
		// Numeric cursor: an offset, clamped into the valid range.
		if idx < 0 {
			idx = 0
		}
		if idx > len(items) {
			idx = len(items)
		}
		start = idx
	} else {
		// Non-numeric cursor: resume right after the event with this ID.
		for i, evt := range items {
			if evt.EventID == cursor {
				start = i + 1
				break
			}
		}
	}
	if start >= len(items) {
		return []domain.PackageChangeEvent{}, ""
	}
	// Copy the tail so the caller does not share the backing array of items.
	filtered := append([]domain.PackageChangeEvent(nil), items[start:]...)
	return filtered, nextCursorFor(items)
}
// nextCursorFor returns the decimal length of items as the next cursor, or ""
// when items is empty.
func nextCursorFor(items []domain.PackageChangeEvent) string {
	if n := len(items); n > 0 {
		return strconv.Itoa(n)
	}
	return ""
}
// AckPackageEvent records a consumer acknowledgement on the stored event
// eventID and returns the updated event.
//
// It sets Consumer, ConsumerDetail, GatewaySyncStatus (from
// result.SyncStatus()) and AckedAt. A zero ackedAt is replaced by the current
// UTC time. ErrEventNotFound is returned when eventID is not stored.
func (r *MemoryRepository) AckPackageEvent(eventID, consumer string, result domain.GatewayAckResult, detail string, ackedAt time.Time) (domain.PackageChangeEvent, error) {
	r.mu.Lock()
	defer r.mu.Unlock()

	stored, found := r.packageEvents[eventID]
	if !found {
		return domain.PackageChangeEvent{}, ErrEventNotFound
	}

	when := ackedAt
	if when.IsZero() {
		when = time.Now().UTC()
	}

	stored.Consumer, stored.ConsumerDetail = consumer, detail
	stored.GatewaySyncStatus = result.SyncStatus()
	stored.AckedAt = &when
	r.packageEvents[eventID] = stored
	return stored, nil
}
// UpsertGatewayAppliedSnapshot stores snapshot keyed by its Consumer,
// replacing any previous snapshot for that consumer, and returns the stored
// value. A zero UpdatedAt is replaced by the current UTC time.
func (r *MemoryRepository) UpsertGatewayAppliedSnapshot(snapshot domain.GatewayAppliedSnapshot) domain.GatewayAppliedSnapshot {
	r.mu.Lock()
	defer r.mu.Unlock()
	if snapshot.UpdatedAt.IsZero() {
		snapshot.UpdatedAt = time.Now().UTC()
	}
	r.appliedSnapshot[snapshot.Consumer] = snapshot
	return snapshot
}
// GetGatewayAppliedSnapshot returns the snapshot stored for consumer; the
// boolean is false when none exists.
func (r *MemoryRepository) GetGatewayAppliedSnapshot(consumer string) (domain.GatewayAppliedSnapshot, bool) {
	r.mu.RLock()
	defer r.mu.RUnlock()
	snapshot, ok := r.appliedSnapshot[consumer]
	return snapshot, ok
}
// GetDiscoveryCandidateByIDContext returns the discovery candidate stored
// under candidateID; the boolean is false when none exists. The context is
// ignored.
func (r *MemoryRepository) GetDiscoveryCandidateByIDContext(_ context.Context, candidateID string) (domain.DiscoveryCandidate, bool) {
	r.mu.RLock()
	defer r.mu.RUnlock()
	candidate, ok := r.discoveryCandidates[candidateID]
	return candidate, ok
}
// FindDiscoveryCandidateContext scans the stored candidates for one matching
// accountID, platform and model, and returns the first match encountered. The
// boolean is false when nothing matches. The context is ignored.
func (r *MemoryRepository) FindDiscoveryCandidateContext(_ context.Context, accountID int64, platform, model string) (domain.DiscoveryCandidate, bool) {
	r.mu.RLock()
	defer r.mu.RUnlock()

	for _, c := range r.discoveryCandidates {
		if c.AccountID != accountID {
			continue
		}
		if c.Platform != platform || c.Model != model {
			continue
		}
		return c, true
	}
	return domain.DiscoveryCandidate{}, false
}
// UpsertDiscoveryCandidateContext stores candidate keyed by its CandidateID,
// replacing any previous entry, and returns the stored value. A zero
// DiscoveredAt is replaced by the current UTC time, and a zero UpdatedAt by
// the (possibly just defaulted) DiscoveredAt. The context is ignored.
func (r *MemoryRepository) UpsertDiscoveryCandidateContext(_ context.Context, candidate domain.DiscoveryCandidate) domain.DiscoveryCandidate {
	r.mu.Lock()
	defer r.mu.Unlock()
	if candidate.DiscoveredAt.IsZero() {
		candidate.DiscoveredAt = time.Now().UTC()
	}
	if candidate.UpdatedAt.IsZero() {
		candidate.UpdatedAt = candidate.DiscoveredAt
	}
	r.discoveryCandidates[candidate.CandidateID] = candidate
	return candidate
}
// ListDiscoveryCandidatesContext returns the stored candidates whose Status
// equals status, or all candidates when status is empty. The result is
// ordered by DiscoveredAt ascending, with CandidateID as the tie-breaker. The
// context is ignored.
func (r *MemoryRepository) ListDiscoveryCandidatesContext(_ context.Context, status domain.DiscoveryCandidateStatus) []domain.DiscoveryCandidate {
	r.mu.RLock()
	defer r.mu.RUnlock()

	matched := make([]domain.DiscoveryCandidate, 0, len(r.discoveryCandidates))
	for _, c := range r.discoveryCandidates {
		if status == "" || c.Status == status {
			matched = append(matched, c)
		}
	}

	sort.Slice(matched, func(a, b int) bool {
		left, right := matched[a], matched[b]
		if !left.DiscoveredAt.Equal(right.DiscoveredAt) {
			return left.DiscoveredAt.Before(right.DiscoveredAt)
		}
		return left.CandidateID < right.CandidateID
	})
	return matched
}
// --- SupplyPackage methods ---
// UpsertSupplyPackage creates or updates the supply package stored under the
// key Platform+"_"+Model.
//
// When a package already exists for that key, the stored PackageID and
// CreatedAt are kept and Version becomes the stored version plus one; the
// corresponding fields of pkg are ignored. For a new key, PackageID and
// Version are stored as given and a zero CreatedAt is replaced by the current
// UTC time. UpdatedAt is always overwritten with the current UTC time.
//
// NOTE(review): because the key is a plain "_" join, ("a_b", "c") and
// ("a", "b_c") share one key — confirm platform/model names cannot collide.
func (r *MemoryRepository) UpsertSupplyPackage(pkg domain.SupplyPackage) {
	r.mu.Lock()
	defer r.mu.Unlock()
	key := pkg.Platform + "_" + pkg.Model
	if existing, ok := r.supplyPackages[key]; ok {
		// Update path: identity and creation time come from the stored row.
		pkg.PackageID = existing.PackageID
		pkg.Version = existing.Version + 1
		pkg.CreatedAt = existing.CreatedAt
	}
	if pkg.CreatedAt.IsZero() {
		pkg.CreatedAt = time.Now().UTC()
	}
	pkg.UpdatedAt = time.Now().UTC()
	r.supplyPackages[key] = pkg
}
// GetSupplyPackage retrieves the supply package stored under the key
// platform+"_"+model; the boolean is false when none exists.
func (r *MemoryRepository) GetSupplyPackage(platform, model string) (domain.SupplyPackage, bool) {
	r.mu.RLock()
	defer r.mu.RUnlock()
	key := platform + "_" + model
	pkg, ok := r.supplyPackages[key]
	return pkg, ok
}
// ListSupplyPackages returns all supply packages, optionally filtered by
// status; an empty status matches every package.
//
// The result is ordered by Platform and then Model. Go map iteration order is
// unspecified, so without the sort successive calls could return the same
// packages in different orders, unlike the other List* methods of this
// repository.
func (r *MemoryRepository) ListSupplyPackages(status string) []domain.SupplyPackage {
	r.mu.RLock()
	defer r.mu.RUnlock()
	items := make([]domain.SupplyPackage, 0, len(r.supplyPackages))
	for _, pkg := range r.supplyPackages {
		if status == "" || pkg.Status == status {
			items = append(items, pkg)
		}
	}
	sort.Slice(items, func(i, j int) bool {
		if items[i].Platform != items[j].Platform {
			return items[i].Platform < items[j].Platform
		}
		return items[i].Model < items[j].Model
	})
	return items
}
// UpdateCandidateStatus updates a candidate's status (used by admission
// service). It also sets ReasonCode to failureCode, stamps UpdatedAt with the
// current UTC time and increments Version.
//
// It returns an error with the text "candidate not found" when candidateID is
// not stored. ctx and failureSummary are accepted but not used by this
// in-memory implementation.
func (r *MemoryRepository) UpdateCandidateStatus(ctx context.Context, candidateID string, status domain.DiscoveryCandidateStatus, failureCode, failureSummary string) error {
	r.mu.Lock()
	defer r.mu.Unlock()

	candidate, found := r.discoveryCandidates[candidateID]
	if !found {
		return errors.New("candidate not found")
	}

	candidate.Status = status
	candidate.ReasonCode = failureCode
	candidate.UpdatedAt = time.Now().UTC()
	candidate.Version++
	r.discoveryCandidates[candidateID] = candidate
	return nil
}

View File

@@ -0,0 +1,136 @@
package repository
import (
"testing"
"time"
"supply-intelligence/internal/domain"
)
// TestMemoryRepositoryRoutingState stores one routing state and reads it back
// by account ID.
func TestMemoryRepositoryRoutingState(t *testing.T) {
	store := NewMemoryRepository()
	store.UpsertRoutingState(domain.AccountRoutingState{
		AccountID:      1,
		Platform:       "openai",
		AccountStatus:  domain.AccountStatusActive,
		RoutingEnabled: true,
		Version:        1,
	})

	stored, found := store.GetRoutingState(1)
	if !found {
		t.Fatalf("expected routing state")
	}
	if stored.AccountStatus != domain.AccountStatusActive {
		t.Fatalf("unexpected status: %q", stored.AccountStatus)
	}
}
// TestMemoryRepositoryPackageEventsAndAck appends one event, lists it, then
// acknowledges it as applied and checks the sync status, consumer metadata
// and ack time on the returned event.
func TestMemoryRepositoryPackageEventsAndAck(t *testing.T) {
	store := NewMemoryRepository()
	store.AppendPackageEvent(domain.PackageChangeEvent{
		EventID:    "evt-1",
		EventType:  "supply_package_published",
		PackageID:  1,
		Platform:   "openai",
		Model:      "gpt-4.1-mini",
		OccurredAt: time.Unix(10, 0).UTC(),
		Version:    2,
	})

	if listed := store.ListPackageEvents(); len(listed) != 1 {
		t.Fatalf("expected 1 event, got %d", len(listed))
	}

	ackedAt := time.Unix(20, 0).UTC()
	acked, err := store.AckPackageEvent("evt-1", "gateway", domain.GatewayAckResultApplied, "ok", ackedAt)
	if err != nil {
		t.Fatalf("unexpected ack error: %v", err)
	}
	if acked.GatewaySyncStatus != domain.GatewaySyncStatusApplied {
		t.Fatalf("unexpected ack status: %+v", acked)
	}
	if acked.Consumer != "gateway" || acked.ConsumerDetail != "ok" {
		t.Fatalf("unexpected consumer metadata: %+v", acked)
	}
	if acked.AckedAt == nil || !acked.AckedAt.Equal(ackedAt) {
		t.Fatalf("unexpected ack time: %+v", acked)
	}
}
// TestMemoryRepositoryListPackageEventsAfterCursor stores two events and
// checks paging: an empty cursor returns both with next cursor "2", and the
// numeric cursor "1" returns only the second event, again with next cursor "2".
func TestMemoryRepositoryListPackageEventsAfterCursor(t *testing.T) {
	store := NewMemoryRepository()
	seed := []domain.PackageChangeEvent{
		{EventID: "evt-1", EventType: "supply_package_published", PackageID: 1, Platform: "openai", Model: "a", OccurredAt: time.Unix(10, 0).UTC(), Version: 1},
		{EventID: "evt-2", EventType: "supply_package_published", PackageID: 2, Platform: "openai", Model: "b", OccurredAt: time.Unix(20, 0).UTC(), Version: 2},
	}
	for _, evt := range seed {
		store.AppendPackageEvent(evt)
	}

	page, next := store.ListPackageEventsAfter("")
	if len(page) != 2 || next != "2" {
		t.Fatalf("unexpected initial page: len=%d next=%q", len(page), next)
	}

	page, next = store.ListPackageEventsAfter("1")
	if len(page) != 1 || page[0].EventID != "evt-2" || next != "2" {
		t.Fatalf("unexpected cursor page: items=%+v next=%q", page, next)
	}
}
// TestMemoryRepositoryDiscoveryCandidateCRUD stores one candidate and fetches
// it back by candidate ID.
//
// NOTE(review): the repository calls below pass a nil context; the memory
// repository ignores it, but context.TODO() is the conventional placeholder.
func TestMemoryRepositoryDiscoveryCandidateCRUD(t *testing.T) {
	store := NewMemoryRepository()
	seededAt := time.Unix(10, 0).UTC()
	store.UpsertDiscoveryCandidateContext(nil, domain.DiscoveryCandidate{
		CandidateID:  "cand-1",
		AccountID:    1,
		Platform:     "openai",
		Model:        "gpt-4.1-mini",
		Source:       "seed",
		Status:       domain.DiscoveryCandidateStatusPendingAdmission,
		DiscoveredAt: seededAt,
		UpdatedAt:    seededAt,
		Version:      1,
	})

	fetched, found := store.GetDiscoveryCandidateByIDContext(nil, "cand-1")
	if !found || fetched.CandidateID != "cand-1" {
		t.Fatalf("expected candidate, got %+v ok=%v", fetched, found)
	}
}
// TestMemoryRepositoryFindDiscoveryCandidateByBusinessKey stores one candidate
// and looks it up by account ID, platform and model.
//
// NOTE(review): the repository calls below pass a nil context; the memory
// repository ignores it, but context.TODO() is the conventional placeholder.
func TestMemoryRepositoryFindDiscoveryCandidateByBusinessKey(t *testing.T) {
	store := NewMemoryRepository()
	seededAt := time.Unix(10, 0).UTC()
	seed := domain.DiscoveryCandidate{
		CandidateID:  "cand-1",
		AccountID:    1,
		Platform:     "openai",
		Model:        "gpt-4.1-mini",
		Source:       "seed",
		Status:       domain.DiscoveryCandidateStatusPendingAdmission,
		DiscoveredAt: seededAt,
		UpdatedAt:    seededAt,
		Version:      1,
	}
	store.UpsertDiscoveryCandidateContext(nil, seed)

	match, found := store.FindDiscoveryCandidateContext(nil, 1, "openai", "gpt-4.1-mini")
	if !found || match.CandidateID != "cand-1" {
		t.Fatalf("expected candidate by business key, got %+v ok=%v", match, found)
	}
}
// TestMemoryRepositoryListDiscoveryCandidatesByStatusAndOrder inserts the
// later-discovered candidate first, then checks that the status filter
// returns only the pending candidate and that the unfiltered list is ordered
// by discovery time.
//
// NOTE(review): the repository calls below pass a nil context; the memory
// repository ignores it, but context.TODO() is the conventional placeholder.
func TestMemoryRepositoryListDiscoveryCandidatesByStatusAndOrder(t *testing.T) {
	store := NewMemoryRepository()
	seed := []domain.DiscoveryCandidate{
		{
			CandidateID:  "cand-2",
			AccountID:    2,
			Platform:     "openai",
			Model:        "b",
			Source:       "seed",
			Status:       domain.DiscoveryCandidateStatusAdmitted,
			DiscoveredAt: time.Unix(20, 0).UTC(),
			UpdatedAt:    time.Unix(20, 0).UTC(),
			Version:      1,
		},
		{
			CandidateID:  "cand-1",
			AccountID:    1,
			Platform:     "openai",
			Model:        "a",
			Source:       "seed",
			Status:       domain.DiscoveryCandidateStatusPendingAdmission,
			DiscoveredAt: time.Unix(10, 0).UTC(),
			UpdatedAt:    time.Unix(10, 0).UTC(),
			Version:      1,
		},
	}
	for _, c := range seed {
		store.UpsertDiscoveryCandidateContext(nil, c)
	}

	pending := store.ListDiscoveryCandidatesContext(nil, domain.DiscoveryCandidateStatusPendingAdmission)
	if len(pending) != 1 || pending[0].CandidateID != "cand-1" {
		t.Fatalf("unexpected filtered items: %+v", pending)
	}

	everything := store.ListDiscoveryCandidatesContext(nil, "")
	if len(everything) != 2 || everything[0].CandidateID != "cand-1" || everything[1].CandidateID != "cand-2" {
		t.Fatalf("unexpected ordering: %+v", everything)
	}
}

21
migrations/0001_init.sql Normal file
View File

@@ -0,0 +1,21 @@
-- One row per account: the account's current status and routing decision.
-- The columns mirror the AccountRoutingState fields used by the Go code
-- (account ID, platform, status, routing flag, risk score, reason code,
-- last probe time, version).
CREATE TABLE IF NOT EXISTS supply_intelligence_account_routing_states (
    account_id BIGINT PRIMARY KEY,
    platform TEXT NOT NULL,
    account_status TEXT NOT NULL,
    routing_enabled BOOLEAN NOT NULL DEFAULT TRUE,
    risk_score INTEGER NOT NULL DEFAULT 0,
    reason_code TEXT NOT NULL DEFAULT '',
    last_probe_at TIMESTAMPTZ NOT NULL,
    version BIGINT NOT NULL DEFAULT 1
);
-- Package change events, keyed by event_id; ack_status defaults to 'pending'.
-- NOTE(review): the in-memory repository also records consumer, consumer
-- detail and ack time when an event is acknowledged; this table has no
-- columns for them — confirm whether a later migration is expected to add them.
CREATE TABLE IF NOT EXISTS supply_intelligence_package_change_events (
    event_id TEXT PRIMARY KEY,
    event_type TEXT NOT NULL,
    package_id BIGINT NOT NULL,
    platform TEXT NOT NULL,
    model TEXT NOT NULL,
    occurred_at TIMESTAMPTZ NOT NULL,
    version BIGINT NOT NULL,
    ack_status TEXT NOT NULL DEFAULT 'pending'
);

View File

@@ -0,0 +1,69 @@
-- Migration 0002: Admission Testing & Model Candidates
-- Adds model_candidates table and supply_packages draft support
-- Candidate models awaiting or having completed admission testing.
-- NOTE(review): UNIQUE(platform, model) permits a single candidate per
-- platform/model across all accounts although each row carries account_id,
-- while the in-memory repository looks candidates up by
-- (account_id, platform, model) — confirm which key is intended.
CREATE TABLE IF NOT EXISTS supply_intelligence_model_candidates (
    candidate_id TEXT PRIMARY KEY,
    account_id BIGINT NOT NULL,
    platform TEXT NOT NULL,
    model TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending_admission',
    source TEXT NOT NULL DEFAULT 'official_api',
    reason_code TEXT DEFAULT '',
    failure_summary TEXT DEFAULT '',
    discovered_at TIMESTAMPTZ NOT NULL,
    last_test_at TIMESTAMPTZ,
    updated_at TIMESTAMPTZ NOT NULL,
    version BIGINT NOT NULL DEFAULT 1,
    UNIQUE(platform, model)
);

-- IF NOT EXISTS keeps the index statements re-runnable, matching the
-- CREATE TABLE IF NOT EXISTS above; a plain CREATE INDEX fails on a second run.
CREATE INDEX IF NOT EXISTS idx_candidates_status ON supply_intelligence_model_candidates(status);
CREATE INDEX IF NOT EXISTS idx_candidates_platform ON supply_intelligence_model_candidates(platform);
CREATE INDEX IF NOT EXISTS idx_candidates_discovered ON supply_intelligence_model_candidates(discovered_at DESC);
-- The sequence is created before the table: DEFAULT nextval('...') resolves
-- the sequence name when the table is created, so the sequence must already
-- exist or CREATE TABLE fails with "relation does not exist".
CREATE SEQUENCE IF NOT EXISTS admission_test_id_seq;

-- One row per admission test run against a model candidate.
CREATE TABLE IF NOT EXISTS supply_intelligence_admission_test_logs (
    test_id BIGINT PRIMARY KEY DEFAULT nextval('admission_test_id_seq'),
    candidate_id TEXT NOT NULL REFERENCES supply_intelligence_model_candidates(candidate_id),
    status TEXT NOT NULL,
    failure_code TEXT,
    failure_summary TEXT,
    tested_at TIMESTAMPTZ NOT NULL,
    version BIGINT NOT NULL DEFAULT 1
);
-- The sequence is created before the table: DEFAULT nextval('...') resolves
-- the sequence name when the table is created, so the sequence must already
-- exist or CREATE TABLE fails with "relation does not exist".
CREATE SEQUENCE IF NOT EXISTS supply_package_id_seq;

-- Supply packages, unique per (platform, model); new rows default to 'draft'.
CREATE TABLE IF NOT EXISTS supply_intelligence_supply_packages (
    package_id BIGINT PRIMARY KEY DEFAULT nextval('supply_package_id_seq'),
    platform TEXT NOT NULL,
    model TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'draft',
    source TEXT NOT NULL DEFAULT 'si_auto',
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    version BIGINT NOT NULL DEFAULT 1,
    UNIQUE(platform, model)
);
-- New fields to extend routing states (via migration, not replacement)
-- routing_states already has account_id as PK; add probe_execution_logs
-- The sequence is created before the table: DEFAULT nextval('...') resolves
-- the sequence name when the table is created, so the sequence must already
-- exist or CREATE TABLE fails with "relation does not exist".
CREATE SEQUENCE IF NOT EXISTS probe_log_id_seq;

-- One row per probe execution against an account.
CREATE TABLE IF NOT EXISTS supply_intelligence_probe_execution_logs (
    log_id BIGINT PRIMARY KEY DEFAULT nextval('probe_log_id_seq'),
    account_id BIGINT NOT NULL,
    platform TEXT NOT NULL,
    probe_result TEXT NOT NULL,
    failure_class TEXT,
    http_status INTEGER,
    latency_ms INTEGER,
    risk_score INTEGER NOT NULL,
    evaluated_transition TEXT NOT NULL,
    executed_at TIMESTAMPTZ NOT NULL,
    request_id TEXT NOT NULL,
    version BIGINT NOT NULL DEFAULT 1
);

-- IF NOT EXISTS keeps the index statement re-runnable, matching the table above.
CREATE INDEX IF NOT EXISTS idx_probe_logs_account_time ON supply_intelligence_probe_execution_logs(account_id, executed_at DESC);

568
prd/PRD.md Normal file
View File

@@ -0,0 +1,568 @@
# 供应链智能增强系统（Supply Intelligence）PRD
> 状态说明（2026-05 收敛修订）：本文件保留为历史版本参考，已不再作为当前实现真源。
> 当前产品真源以“2026-05 新 PM 基线 + tech/BASELINE_TECHLEAD_V2.md + 已收敛的测试/部署/任务决议文档”为准。
> 若本文件与上述新真源冲突,以新真源为准,尤其是以下方面不得再按本文件旧口径执行:
> 1. pricing / prediction / 向量检索 / 广义开放平台能力
> 2. 探针 timeout / TCP / DNS 触发惩罚性降级
> 3. 自动发布 / 自动下架 / disabled 自动恢复
> 4. gateway 强耦合同步热更新主路径
> 5. 以独立平台化重部署作为默认落地方式
> 文档版本：v1.0
> 撰写日期：2026-04-27
> 撰写人：PM（产品经理）
> 评审状态:待 TechLead 评审
---
## 1. 概述
### 1.1 一句话价值
通过自动化探针、全网扫描与准入测试,让平台供应链中的供应商账号、可用模型列表始终保持最新且可路由,消除人工维护滞后导致的可用性黑洞。
### 1.2 用户问题
- 运营团队每日需要人工检查数十个供应商账号的状态额度、密钥失效、TOS 变更),遗漏率高。
- 新模型上线后,平台未能及时感知,导致流量无法路由到新模型,竞争力下降。
- 供应商账号过期或密钥失效后,网关仍在尝试路由请求,直接引发用户端报错。
- 手动在各运营商后台注册账号、申请 API Key 的周期以天计,阻碍新供应商的快速接入。
### 1.3 业务意义
- 将供应链信息的保鲜周期从“人工天级”缩短到“自动分钟级”。
- 降低因供应商/模型失效导致的用户可见错误率。
- 缩短新模型上线到平台可售卖之间的上市时间（Time-to-Market）。
- 为后续动态定价、智能路由提供实时、准确的供应链数据底座。
---
## 2. 目标
### 2.1 业务目标
| 目标编号 | 目标描述 | 度量方式 |
|---------|---------|---------|
| BG-01 | 供应商账号异常状态从发生到被标记的平均时间 ≤ 15 分钟 | 从供应商侧异常发生到本系统将其 status 改为 `suspended` 或 `disabled` 的时间差 |
| BG-02 | 全网新模型从发布到进入平台可售卖列表的平均时间 ≤ 4 小时 | 从模型在官方文档/接口中出现到本系统将其对应的 supply_package 状态置为 `active` 的时间差 |
| BG-03 | 因供应商账号失效导致的用户可见错误率下降 80% | 对比上线前 30 天与上线后 30 天,网关返回 502/503 且根因指向供应商失效的请求占比 |
| BG-04 | 人工维护供应商基础信息的工作量减少 70% | 运营团队每周在供应商信息维护上投入的小时数对比 |
### 2.2 用户目标
- **平台运营团队**:在一个界面看到所有供应商账号的健康度、模型覆盖度、待处理事项,不再需要逐家登录供应商后台确认。
- **供应链管理人员**:新供应商或新模型的接入流程从“人工申请-测试-录入”变为“自动发现-自动测试-人工确认一键上架”。
- **技术负责人**:系统具备明确的熔断、降级、审计能力,自动化操作不引入新的稳定性风险。
- **商务负责人**:新模型上架速度成为可量化指标,可用于对外商务承诺。
### 2.3 成功定义
项目被判定为成功的条件是:
1. BG-01、BG-03、BG-04 三项指标在正式上线后 30 天内全部达成。
2. 系统在连续 7 天内未出现因本系统自身故障导致的供应商状态误标记（false positive 率 ≤ 1%）。
3. 所有自动化操作(状态变更、模型录入、账号注册)具备完整审计日志,且日志保留 ≥ 90 天。
---
## 3. 范围
### 3.1 In Scope
#### 模块 A：供应商品质探针（Supply Health Probe）
- 对已录入 `supply_accounts` 的账号,按配置周期发起连通性、额度、密钥有效性探针。
- 根据探针结果，自动将账号状态在 `active`、`suspended`、`disabled` 之间迁移（需满足状态机规则，不允许直接 `active` → `disabled`，必须经过 `suspended`）。
- 对探针结果生成风险评分，写入 `supply_accounts.risk_score` 与 `risk_reason`。
- 对状态变更事件写入审计日志。
#### 模块 B：全网模型发现（Model Discovery）
- 对接各供应商官方 API / 文档 / 变更源,扫描其已发布的模型列表。
- 将扫描到的模型与平台现有 `supply_packages` 中的 `platform` + `model` 组合进行比对,识别“新增模型”。
- 对新增模型创建候选记录(`supply_intelligence.model_candidates` 表,状态为 `discovered`),等待准入测试。
- 对已从官方列表下架但平台仍有 `active` 套餐的模型,标记为 `deprecated`,触发告警通知运营团队。
#### 模块 C：模型准入测试（Model Admission Test）
- 对状态为 `discovered` 的候选模型，使用标准化测试用例集（覆盖 chat/completion/embedding 等 endpoint）进行功能验证。
- 测试维度包括:接口可用性、响应格式合规性、延迟 P50/P99、token 计数一致性、错误码映射正确性。
- 测试通过后,候选模型状态迁移为 `test_passed`,并自动生成一份 `supply_package` 草稿(`draft` 状态),等待运营团队确认后发布。
- 测试失败的模型状态迁移为 `test_failed`,记录失败原因与日志,保留 30 天后自动清理。
#### 模块 D：账号自动注册（Account Auto-Registration）
- 针对支持自动化注册流程的供应商(需配置化白名单),系统通过其公开注册接口或模拟浏览器流程完成账号注册。
- 注册成功后,自动申请 API Key将凭证加密后写入 `supply_accounts`,状态置为 `pending`
- 注册过程中涉及的手机/邮箱验证，接入平台已集成的 SMS/邮件网关；若 SMS/邮件网关未就绪，该供应商的自动注册能力必须 fail-closed（拒绝启动，不静默降级）。
- 注册行为必须写入审计日志,凭证指纹写入 `credential_fingerprint`
#### 模块 E：运营工作台（Operations Dashboard）
- 展示待处理候选模型列表、待确认供应商状态变更、自动注册任务队列。
- 提供“一键确认上架”、“忽略此模型”、“手动触发探针”三个人工干预入口。
- 展示供应链覆盖率(平台已上架模型数 / 全网可发现模型数)。
### 3.2 Out of Scope
| 编号 | 内容 | 原因 |
|-----|------|------|
| OOS-01 | 供应商侧计费系统对接与自动充值 | 属于财务结算域,不在供应链智能范围内 |
| OOS-02 | 基于发现结果的动态定价算法 | 属于 pricing-engine 项目,本系统只生成 package 草稿中的建议价 |
| OOS-03 | 供应商账号的 TOS 法律合规性自动审查 | 法律文本语义分析超出当前工程边界本系统只做“TOS 变更标记” |
| OOS-04 | 不支持公开注册接口的供应商(如需要企业资质审核、线下合同)的自动注册 | 无法工程化闭环,保留人工注册入口 |
| OOS-05 | 对供应商内部模型版本迭代（如从 gpt-4-turbo 到 gpt-4-turbo-2024-04-09）的语义级差异分析 | 成本过高，只识别模型 ID 维度的新增/下架 |
| OOS-06 | 跨供应商的模型能力等价性判定（如“模型 A 是否等价于模型 B”） | 属于模型评估平台，非供应链基础能力 |
### 3.3 假设与依赖
| 编号 | 假设/依赖 | 影响 |
|-----|----------|------|
| ASP-01 | 各供应商均提供可公开访问的模型列表接口或文档页面 | 若某供应商关闭列表接口,该供应商的模型发现能力降级为手动录入 |
| ASP-02 | 账号自动注册仅针对已签署技术合作框架协议、允许自动化注册的供应商 | 法律合规问题由商务团队前置解决 |
| ASP-03 | `supply-api` 现有的 `supply_accounts` 表结构在上线前不做破坏性变更 | 本系统的新增表需通过标准 migration 脚本创建 |
| ASP-04 | 平台已具备 SMS/邮件网关的运行时能力,或本模块的自动注册可被条件关闭 | 参照 `supply-api/CLAUDE.md` 中“条件能力必须 fail-closed”原则 |
| ASP-05 | 探针任务调度依赖平台统一的 job scheduler（如内部 cron 或 Temporal），不重新造调度器 | 若 scheduler 不可用，探针模块延迟启动 |
| ASP-06 | 测试用例集的维护由 QA 团队负责,本系统负责调度执行与结果收集 | 测试用例本身不在本系统代码库内管理 |
---
## 4. 用户场景
### 4.1 主流程
#### 场景 S1供应商账号自动探针与状态变更
```
1. 调度器按配置周期(默认 5 分钟)触发对供应商账号 A 的探针任务。
2. 探针模块调用供应商健康检查端点(或发送一条低成本测试请求)。
3. 供应商返回 401/403 或超时 > 10 秒,探针判定为“密钥失效或账号异常”。
4. 系统检查该账号当前状态:
a. 若为 active → 改为 suspended，risk_score 设为 80，risk_reason 写入“密钥失效”。
b. 若为 suspended 且连续 3 次探针失败 → 改为 disabled。
5. 状态变更事件写入审计日志（object_type=supply_account, action=auto_suspend）。
6. 向运营团队发送告警通知(钉钉/企业微信),包含账号、供应商、原因、时间。
```
#### 场景 S2全网扫描发现新模型
```
1. 调度器每 1 小时触发一次全网扫描任务。
2. 扫描模块向各供应商的模型列表接口发起请求,解析出当前所有 model_id。
3. 与 supply_packages 中 status ∈ {active, paused, draft} 的记录去重比对。
4. 发现供应商 X 新增模型 "new-model-v1",平台暂无记录。
5. 在 model_candidates 表中插入一条记录:
- platform = X, model_id = "new-model-v1"
- status = discovered
- discovered_at = NOW()
6. 触发准入测试流水线(异步任务)。
```
#### 场景 S3新模型准入测试通过并上架
```
1. 准入测试模块从 model_candidates 取出 status = discovered 的记录。
2. 使用对应供应商的测试账号,发送标准化测试请求集(≥ 5 个不同用例)。
3. 所有用例返回 HTTP 200响应体符合 OpenAI-compatible schema延迟 P99 < 30 秒。
4. 将 candidate 状态更新为 test_passed并生成 supply_package 草稿:
- platform = X, model = "new-model-v1"
- status = draft
- price_per_1m_input / price_per_1m_output 使用预设默认值(可配置)
5. 运营工作台出现“待上架新模型”卡片。
6. 运营人员点击“确认上架”package 状态改为 active进入 gateway 路由表。
```
#### 场景 S4供应商账号自动注册
```
1. 运营人员在后台勾选“启用供应商 Y 的自动注册”,并配置注册参数(如邮箱域名、账号前缀规则)。
2. 系统检测到供应商 Y 的可用账号数 < 配置阈值(如 < 2 个 active 账号)。
3. 触发自动注册任务:
a. 调用供应商 Y 的注册接口,提交随机生成的用户名、密码、企业邮箱。
b. 等待并解析注册确认邮件,点击确认链接(或输入邮件验证码)。
c. 登录账号后台,申请 API Key。
4. 将 API Key 经 KMS 加密后写入 supply_accountsstatus = pending。
5. 触发自动验证(复用现有 Verify 流程),验证通过后 status 改为 active。
```
### 4.2 异常流程
#### 场景 E1探针遭遇供应商 Rate Limit
```
1. 探针请求返回 429。
2. 该次探针标记为 inconclusive不计入连续失败次数。
3. 调度器在指数退避后1min → 2min → 4min重试最多重试 3 次。
4. 若 3 次后仍为 429本次探针周期跳过该账号记录日志不触发状态变更。
```
#### 场景 E2模型准入测试超时
```
1. 某测试用例在 60 秒内未收到响应。
2. 该用例标记为 timeout测试流水线整体标记为 test_failed。
3. candidate 状态更新为 test_failed失败原因写入 "admission_test_timeout"。
4. 运营工作台展示失败详情,运营人员可选择:
a. 手动重新触发测试;
b. 标记为 ignore该 model_id 在 7 天内不再自动扫描。
```
#### 场景 E3自动注册时 SMS/邮件网关不可用
```
1. 注册流程进行到验证码接收步骤。
2. 调用 SMS/邮件网关返回 503 或超时。
3. 该注册任务整体失败写入审计日志action=auto_register_failed
4. 依据 fail-closed 原则,不向用户或上游返回任何“注册成功”的虚假状态。
5. 任务进入死信队列24 小时后由人工或系统重试。
```
### 4.3 边缘流程
#### 场景 B1供应商模型 ID 变更(非新增/下架,而是重命名)
- 扫描模块发现旧 model_id 消失、新 model_id 出现,但模型能力描述高度相似。
- 系统无法自动判定为“重命名”还是“旧模型下架+新模型上线”。
- 生成一条运营待办事项,由运营人员人工确认关系,系统不做自动关联。
#### 场景 B2运营人员手动暂停自动探针
- 运营人员可在后台对单个供应商账号勾选“暂停自动探针”。
- 该账号的探针任务在调度器中被跳过,但运营人员仍可手动触发单次探针。
- 暂停状态写入 `supply_accounts` 的扩展字段(或通过新增 `auto_probe_enabled` 字段),探针模块读取该字段后决定是否执行。
#### 场景 B3账号处于 suspended 期间收到用户请求
- 本系统不直接处理流量路由,但需向 gateway 提供实时供应商状态查询接口。
- gateway 在路由决策时查询该接口,若账号为 suspended/disabled则将该账号从候选池移除。
- 该接口的 SLA：P99 延迟 < 50ms，可用性 ≥ 99.9%。
### 4.4 用户故事
| 编号 | 角色 | 故事 | 验收对应 |
|-----|------|------|---------|
| US-01 | 运营人员 | 我想在一个页面看到所有供应商账号最近一次探针的时间和结果,以便快速定位异常账号 | AC-01, AC-02 |
| US-02 | 供应链管理员 | 我想在新模型被系统发现后收到通知,并在工作台一键确认上架,以便缩短上市时间 | AC-03, AC-04 |
| US-03 | 技术负责人 | 我想所有自动化状态变更都有审计日志和回滚记录,以便在误操作时追溯和恢复 | AC-05, AC-09 |
| US-04 | 商务负责人 | 我想看到平台模型覆盖率与竞品对比的报表,以便对外展示平台能力 | AC-07 |
| US-05 | 运营人员 | 我想对特定账号暂停自动探针,以便在供应商维护窗口期避免误报警 | AC-08 |
| US-06 | 供应链管理员 | 我想对支持自动注册的供应商配置自动补货策略,以便在可用账号不足时自动补充 | AC-06 |
---
## 5. 验收标准AC
> 以下每条 AC 均为可测试、无模糊词的要求。QA 可直接据此编写测试用例。
### 模块 A：供应商品质探针
**AC-01 探针覆盖度**
- 给定 `supply_accounts` 中 `status` 为 `active` 或 `suspended` 的账号数量 N，系统在任意时刻 T，最近 15 分钟内被探针覆盖的账号数量 M 必须满足 M/N ≥ 99%。
- 测试方法:插入 100 条测试账号记录,观察 15 分钟窗口内探针日志条数是否 ≥ 99。
**AC-02 状态变更正确性**
- 给定一个 `status=active` 的账号,模拟其返回 401 连续 1 次,系统在 60 秒内将其 `status` 更新为 `suspended`
- 给定一个 `status=suspended` 的账号,模拟其连续 3 次探针返回 401系统在 60 秒内将其 `status` 更新为 `disabled`
- 给定一个 `status=active` 的账号,模拟其返回 429 单次,其 `status` 在 15 分钟内保持 `active` 不变。
- 测试方法Mock 供应商响应,查询数据库字段值。
**AC-03 误报率**
- 在 7 天连续运行测试中，探针将实际正常的账号标记为 `suspended` 或 `disabled` 的次数 ≤ 总探针次数的 1%。
- 测试方法:使用全部正常的测试账号运行 7 天,统计状态误变更次数。
### 模块 B：全网模型发现
**AC-04 新模型发现延迟**
- 给定一个已对接的供应商,在其模型列表中新增一个 model_id系统在 2 个扫描周期(默认 2 小时)内将该 model_id 写入 `model_candidates``status=discovered`
- 测试方法Mock 供应商模型列表接口,在 T0 新增 model_idT0+2h 查询数据库验证。
**AC-05 已下架模型告警**
- 给定一个 `supply_packages``status=active` 的 model_id在供应商侧该 model_id 消失后,系统在 2 个扫描周期内:
- 将该 package 的 `status` 保持 `active` 不变(不自动下架,避免误伤);
- 在运营工作台生成一条“模型已下架”告警待办;
- 向运营人员发送通知。
- 测试方法Mock 供应商模型列表,移除 model_id验证告警产生与 package 状态未变。
### 模块 C：模型准入测试
**AC-06 准入测试通过**
- 给定一个 `status=discovered` 的 candidate其供应商测试账号正常系统在 30 分钟内完成全部测试用例执行candidate 状态变为 `test_passed`,且自动生成一条 `supply_packages` 记录(`status=draft`)。
- 测试方法:使用真实或 Mock 供应商响应,验证数据库状态与 package 草稿字段完整性。
**AC-07 准入测试失败**
- 给定一个 `status=discovered` 的 candidate模拟其接口返回 500 或响应格式不合法,系统在 30 分钟内将 candidate 状态更新为 `test_failed``failure_reason` 字段非空,且运营工作台展示失败详情。
- 测试方法Mock 供应商返回 500验证数据库字段与前端展示。
### 模块 D：账号自动注册
**AC-08 自动注册成功**
- 给定一个已配置自动注册白名单的供应商,配置其可用账号数阈值为 2当前可用账号数为 1系统在 10 分钟内触发注册流程,在 30 分钟内完成注册、密钥申请、凭证加密存储,最终 `supply_accounts` 中新增一条 `status=active` 的记录。
- 测试方法:使用供应商沙箱环境或高保真 Mock验证端到端流程与数据库记录。
**AC-09 自动注册 fail-closed**
- 给定自动注册配置 `enabled=true`,但 SMS/邮件网关返回 503 或超时,系统在 60 秒内将注册任务标记为 `failed`,不向任何上游返回成功状态码,审计日志中包含 `action=auto_register_failed` 与错误详情。
- 测试方法Mock SMS 网关返回 503验证接口响应、数据库状态、审计日志。
### 模块 E：运营工作台与通用
**AC-10 审计日志完整性**
- 任意自动化操作状态变更、candidate 状态迁移、自动注册、手动触发探针)发生后 5 秒内,审计存储中必须存在对应记录,字段包含:
- `object_type`、`object_id`、`action`、`result_code`、`before_state`（变更前）、`after_state`（变更后）、`request_id`。
- 测试方法:触发各项操作,查询审计存储验证字段完整性。
**AC-11 运营工作台干预**
- 运营人员点击“一键确认上架”后,对应的 `supply_packages` 记录在 3 秒内从 `draft` 变为 `active`
- 运营人员点击“忽略此模型”后,该 candidate 在 7 天内不再出现在待处理列表中,且 7 天后自动恢复为 `discovered`
- 测试方法E2E 测试或 UI 自动化测试。
**AC-12 配置热更新**
- 探针周期、扫描周期、测试超时时间、自动注册阈值等配置项,在修改配置文件并下发后 60 秒内生效,不重启进程。
- 测试方法:修改配置,观察调度器行为变化时间差。
---
## 6. 边缘情况与失败路径
| 编号 | 边缘/失败场景 | 系统行为 | 验证方式 |
|-----|-------------|---------|---------|
| FP-01 | 供应商探针接口完全不可用DNS 失败、TCP 超时) | 标记为 inconclusive按 429 退避逻辑处理,不直接变更状态 | 模拟 iptables DROP验证状态不变 |
| FP-02 | 供应商返回 200 但响应体为空或格式突变 | 解析失败视为 inconclusive记录 error_log不触发状态变更 | Mock 返回空 JSON验证状态与日志 |
| FP-03 | 同一账号在探针执行期间被运营人员手动变更状态 | 乐观锁冲突:探针更新时 version 不匹配,更新失败,探针记录冲突日志,由下次探针或运营人员覆盖 | 并发测试:手动 update 同时触发探针 |
| FP-04 | 模型准入测试期间,测试账号被探针标记为 suspended | 准入测试流水线检测到测试账号不可用,任务标记为 `test_failed`,原因写为 `test_account_unavailable` | Mock 测试账号 suspended验证流水线行为 |
| FP-05 | 自动注册时供应商注册接口返回 400如邮箱已被注册 | 任务标记为 `failed`,原因写入具体错误码,同一邮箱不再重复使用,审计日志记录完整请求/响应摘要(脱敏后) | Mock 注册接口返回 400验证数据库与日志 |
| FP-06 | 自动注册成功后,验证步骤发现密钥无效 | 账号状态保持 `pending`,自动注册任务标记为 `verify_failed`,触发告警,不进入 active | Mock verify 返回失败,验证状态机 |
| FP-07 | 全网扫描时供应商模型列表分页异常(如页码越界返回 500 | 扫描任务记录分页失败,已获取的部分模型仍正常处理,失败页在下一周期重试 | Mock 分页接口第 3 页返回 500验证整体任务不中断 |
| FP-08 | 数据库在探针执行期间不可用 | 探针任务失败,记录错误,不触发状态变更;调度器按配置重试;连续失败 5 次后暂停该批次探针,触发系统级告警 | 模拟 PostgreSQL 断开,验证行为 |
| FP-09 | 运营人员同时点击“确认上架”与“忽略此模型” | 乐观锁或幂等键保证只有一个操作生效,第二个操作返回 409 Conflict界面提示“该模型已被处理” | 并发 UI 操作测试 |
| FP-10 | 凭证加密 KMS 服务在自动注册期间不可用 | 注册流程在加密步骤阻塞,等待 KMS 恢复或超时60 秒);超时后任务标记为 `failed`,明文凭证不得落盘 | Mock KMS 超时,验证明文不出现在日志/数据库 |
---
## 7. 上线与运营准备
### 7.1 发布策略
- **阶段 1灰度**:选择 2 个非核心供应商(如测试环境专用供应商)开启自动探针与模型发现,观察 7 天。
- **阶段 2扩展**:覆盖全部供应商的探针与发现能力,但自动状态变更仅对 `sandbox` 环境账号生效,生产环境账号的探针结果只生成告警,不自动改状态。
- **阶段 3全量**:生产环境账号启用自动状态变更,模型准入测试与自动注册按需逐步开启。
### 7.2 灰度/回滚
- 灰度开关通过配置中心控制,维度包括:
- `probe.enabled`:全局探针开关
- `probe.auto_transition.supplier_ids`:允许自动状态变更的供应商白名单
- `discovery.enabled`:全网扫描开关
- `admission_test.enabled`:准入测试开关
- `auto_registration.enabled`:自动注册开关
- 回滚条件(任一触发即全量关闭对应模块):
- 1 小时内探针误报率 > 5%
- 自动状态变更导致用户可见错误率上升(对比基线)> 2%
- 自动注册任务连续失败率 > 50%(持续 1 小时)
- 回滚操作:修改配置中心对应开关为 `false`60 秒内生效,已变更的状态不自动回退,由运营人员人工审核。
### 7.3 埋点/监控/告警
#### 埋点事件
| 事件名 | 触发时机 | 关键属性 |
|-------|---------|---------|
| `si_probe_executed` | 每次探针执行完成 | `platform`, `account_id`, `result`, `latency_ms` |
| `si_state_transitioned` | 账号状态自动变更 | `platform`, `account_id`, `from_status`, `to_status`, `reason` |
| `si_model_discovered` | 发现新模型 | `platform`, `model_id`, `discovery_source` |
| `si_admission_test_completed` | 准入测试完成 | `platform`, `model_id`, `result`, `duration_sec` |
| `si_auto_register_completed` | 自动注册完成 | `platform`, `result`, `duration_sec` |
#### 监控指标Prometheus
| 指标名 | 类型 | 说明 |
|-------|------|------|
| `si_probe_latency_seconds` | Histogram | 探针请求延迟 |
| `si_probe_result_total` | Counter | 探针结果分类success/failure/inconclusive |
| `si_state_transition_total` | Counter | 状态变更次数 |
| `si_discovery_models_total` | Gauge | 当前候选模型数量(按 status 分标签) |
| `si_admission_test_duration_seconds` | Histogram | 准入测试耗时 |
| `si_auto_register_result_total` | Counter | 自动注册结果分类 |
#### 告警规则
| 告警名 | 条件 | 通知对象 | 级别 |
|-------|------|---------|------|
| 探针大面积失败 | 1 小时内探针失败率 > 20% | 技术负责人 | P1 |
| 供应商账号全部失效 | 某供应商 active 账号数 = 0 持续 > 10 分钟 | 运营+技术 | P0 |
| 自动注册连续失败 | 1 小时内自动注册失败率 > 50% | 供应链管理员 | P1 |
| 新模型堆积未处理 | `status=discovered` 的候选模型数 > 20 且持续 > 24 小时 | 运营团队 | P2 |
| 系统自身健康异常 | 本服务 `/actuator/health/ready` 返回非 200 持续 > 1 分钟 | 技术负责人 | P0 |
### 7.4 FAQ预置
**Q1自动状态变更会不会把正常的供应商误杀掉**
A：探针采用“仅明确失败才降级”策略：active → suspended 需 1 次明确失败（如 401/403）；suspended → disabled 需连续 3 次明确失败；429/5xx/超时等不可判定结果不触发状态变更。运营人员可随时在后台暂停单个账号的自动探针。
**Q2模型准入测试失败了我还能手动上架吗**
A可以。运营人员可以在工作台查看失败详情选择“手动强制上架”此时系统生成 package 草稿但标记为 `manually_forced`,并强制要求运营人员填写强制上架理由,该理由写入审计日志。
**Q3自动注册生成的账号归属谁**
A自动注册账号的 `user_id` / `supplier_user_id` 关联到平台运营系统账号(可配置),收益结算走平台统一账户。
---
## 8. 商业化与价值闭环
### 8.1 收益路径
| 路径 | 描述 | 量化 |
|-----|------|------|
| 直接收益 | 新模型上架速度提升 → 平台可售模型数增加 → 订单量增长 | 每提前 1 天上架一个热点模型,预估带来 X 订单增量(需商务提供历史数据基线) |
| 成本节省 | 运营人力减少 → 供应链维护 headcount 或工时下降 | 按 BG-04 目标,每周节省 70% 工时,折算年化人力成本 |
| 质量溢价 | 供应商失效导致的客诉减少 → NPS 提升 → 客户续约率提升 | 减少的客诉数 × 单客诉处理成本 + 续约率提升带来的 LTV 增量 |
### 8.2 北极星指标
- **供应链新鲜度指数（Supply Freshness Index, SFI）**
- 定义SFI = (过去 1 小时成功探针的账号数 / 应探针账号总数) × (过去 24 小时进入 active 的新模型数 / 过去 24 小时发现的新模型总数)
- 目标值SFI ≥ 0.95
- 采集周期:每小时计算一次,写入时序数据库
### 8.3 失败判定线
项目在以下任一条件触发时,判定为失败并启动止损:
1. 上线后 30 天内因本系统导致的供应商状态误变更false positive累计 > 50 次。
2. 上线后 30 天内,因自动状态变更或自动注册导致用户可见支付/使用故障 > 3 次。
3. SFI 连续 7 天 < 0.70,且技术团队无法给出明确修复排期。
4. 自动注册模块因供应商接口变更导致连续 14 天成功率 < 30%,且无替代方案。
### 8.4 止损条件
- 触发失败判定线后PM 与 TechLead 在 24 小时内决定是否:
- **降级**:关闭自动状态变更与自动注册,仅保留探针监控与模型发现(纯观测模式)。
- **下线**:完全卸载本系统,回退至纯人工维护模式,保留审计日志备查。
- 无论降级或下线,已生成的 supply_package 草稿和已注册的账号不受影响,由运营人员人工接管。
---
## 9. 依赖与风险
### 9.1 外部依赖
| 依赖方 | 依赖内容 | 风险等级 | 缓解措施 |
|-------|---------|---------|---------|
| 各供应商 | 模型列表接口、注册接口、探针端点的稳定性与兼容性 | 高 | 接口变更监测Mock 回归测试集;供应商接口版本锁定 |
| SMS/邮件网关 | 自动注册验证码接收 | 中 | fail-closed备用邮箱池人工兜底流程 |
| KMS 服务 | 新注册账号凭证加密 | 中 | 加密失败阻塞落盘,任务进死信队列 |
| 平台 Job Scheduler | 定时任务调度 | 低 | 调度失败时探针/扫描延迟,不引入错误状态 |
| supply-api 现有服务 | 复用 Verify、AccountStore、PackageStore、AuditStore | 低 | 接口契约冻结;变更需双方 CR |
### 9.2 技术风险
| 风险编号 | 风险描述 | 概率 | 影响 | 应对 |
|---------|---------|------|------|------|
| R-01 | 探针频率过高导致供应商侧将我们视为攻击源,封禁平台 IP | 中 | 高 | 探针频率可配置;使用平台统一出口 IP 池;对每家供应商遵守其 rate limit 文档 |
| R-02 | 供应商模型列表接口返回缓存旧数据,导致“已下架模型”误判 | 中 | 中 | 列表接口响应加 TTL 校验;结合官方文档 RSS/变更日志交叉验证 |
| R-03 | 自动注册的浏览器自动化流程(如 Selenium/Playwright因供应商前端改版失效 | 高 | 中 | 优先使用官方 API 注册;浏览器自动化作为 fallback前端改版监控 |
| R-04 | 准入测试用例不足以覆盖供应商实际兼容性问题,导致 test_passed 但上线后用户报错 | 中 | 高 | 测试用例由 QA 维护并定期评审;上线后 24h 内对新模型增加采样监控 |
| R-05 | 数据库 model_candidates 表数据膨胀,影响查询性能 | 低 | 中 | 设置自动清理策略test_failed 且超过 30 天未手动处理的记录自动删除 |
### 9.3 合规与隐私风险
- 自动注册过程中收集的邮箱、手机号属于个人信息,需符合平台隐私政策与相关法律法规。
- 凭证指纹(`credential_fingerprint`)仅存储哈希值,不得存储明文 API Key。
- 审计日志中的请求/响应摘要需脱敏,不得包含完整 credential。
---
## 10. 技术栈与集成约束
### 统一技术栈
本项目必须与立交桥主项目保持一致:
- **语言**: Go 1.22+
- **HTTP框架**: 标准库 `net/http` + 自定义中间件(禁止引入 Gin/Echo 等第三方框架,保持与 gateway/ 和 supply-api/ 的一致性)
- **数据库**: PostgreSQL 15+ ,驱动 `jackc/pgx/v5`
- **缓存**: Redis客户端 `redis/go-redis/v9`
- **配置**: YAML + Viper环境变量覆盖敏感字段
- **日志/审计**: 结构化日志,审计事件模型与 supply-api/ 一致
- **错误码**: `{SOURCE}_{CATEGORY}_{CODE}` 格式,例如 `SUP_INT_4001`
- **健康检查**: `/actuator/health`、`/actuator/health/live`、`/actuator/health/ready`
- **测试**: Go testing + testify覆盖率门槛 domain ≥ 70%、service/handler ≥ 80%
### 独立运行与集成运行
本系统必须同时支持两种运行模式:
| 模式 | 特征 | 部署方式 | 适用场景 |
|------|------|---------|---------|
| **独立运行** | 自有 `cmd/supply-intelligence/main.go`,独立数据库 schema独立 docker-compose | `docker-compose up` 或单独容器 | 外部用户只需要供应链管理能力,不想接入立交桥全套 |
| **集成运行** | 作为 Go module 被 `supply-api/` 引入,共享数据库连接池和配置,通过内部接口注册 | 编译时作为子模块编译,运行时挂载到 supply-api 主进程 | 立交桥用户希望获得一体化供应链能力 |
**集成约束**:
- 独立运行时,系统必须提供完整的 HTTP API 和运营工作台。
- 集成运行时,系统必须提供 `IntegrationPlugin` 接口,允许主程序通过配置开关启用/禁用各模块。
- 数据库 schema 必须使用独立的 `supply_intelligence_` 前缀,避免与主项目表名冲突。
- 配置文件必须支持分离加载:独立运行时读取自己的 `config.yaml`,集成运行时合并到主项目配置。
### NewAPI / Sub2API 适配支持
本系统的核心能力必须能够对接 NewAPI 和 Sub2API 系统:
- **供应商状态同步**: 提供标准化的供应商健康状态接口NewAPI/Sub2API 可定期获取供应商可用性状态。
- **模型列表推送**: 提供 `/models` 接口返回平台已发现、已测试通过的模型列表NewAPI/Sub2API 可消费此数据自动补充自己的模型库。
- **账号注册适配**: 自动注册模块通过适配层支持 NewAPI/Sub2API 的账号管理 API实现跨平台账号生命周期管理。
- **独立部署时**: 通过配置文件指定 NewAPI/Sub2API 的管理端点地址和鉴权信息本系统通过适配层Adapter与之交互。
- **集成部署时**: 若立交桥 gateway/ 已接入 NewAPI/Sub2API本系统通过 supply-api/ 的内部接口操作上游状态。
### 对外接口契约
- 必须提供 OpenAPI 3.0 接口文档,确保 NewAPI/Sub2API 开发者可以独立接入。
- 接口路径前缀默认为 `/api/v1/supply-intelligence/`,集成运行时可通过配置改为 `/internal/supply-intelligence/`
---
## 11. 阶段门控结论
### 11.1 当前状态
**可进入 TechLead 评审,但需补充以下信息后方可进入开发排期:**
1. **供应商接口清单**:需由商务/技术团队提供 Phase 1 目标供应商的模型列表接口文档、注册接口文档(或明确标注哪些供应商不支持自动注册)。
2. **测试用例集范围**:需 QA 团队确认准入测试用例集的初始版本(≥ 5 个用例/模型类型)及维护 SLA。
3. **Job Scheduler 契约**:需明确平台统一调度器的接口契约(如任务提交格式、超时控制、死信策略)。
4. **KMS 与 SMS 网关就绪状态**:生产环境 KMS 与 SMS/邮件网关当前不可用需寻找合适的供应商并确认集成方案。若短期内无法就绪自动注册模块Phase 3需明确为远期交付当前 Phase 1/2 不受影响。
### 11.2 建议开发优先级
| 阶段 | 内容 | 目标 |
|-----|------|------|
| Phase 1 | 供应商品质探针（模块 A）+ 运营工作台观测视图（模块 E 只读部分） | 解决最痛的可用性黑洞问题，7 天灰度验证 |
| Phase 2 | 全网模型发现（模块 B）+ 模型准入测试（模块 C） | 解决新模型上市滞后问题 |
| Phase 3 | 账号自动注册（模块 D）+ 运营工作台完整干预能力（模块 E 读写部分） | 解决供应商账号补充效率问题 |
### 11.3 门控决策
- **不阻塞 TechLead 评审**PRD 中需求边界、验收标准、失败路径已清晰。
- **阻塞开发排期**:直到上述 4 项补充信息供应商接口清单、测试用例集、Job Scheduler 契约、KMS/SMS 就绪状态)以文档形式补充到本 PRD 附录后方可进入技术方案设计HLD阶段。
- **技术栈与集成约束已明确**:统一 Go 标准库、独立/集成双模式、NewAPI/Sub2API 适配层已纳入范围。
---
## 附录 A：新增数据表草案（供 TechLead 参考，非最终 Schema）
> 本附录仅用于需求对齐,最终 Schema 由 TechLead 设计并通过标准 SQL migration 落地。
### A.1 model_candidates
| 字段 | 类型 | 说明 |
|-----|------|------|
| id | BIGINT PK | 自增 |
| platform | VARCHAR(50) | 供应商标识,与 supply_accounts.platform 同枚举 |
| model_id | VARCHAR(100) | 模型标识 |
| model_name | VARCHAR(200) | 可读的模型名称(从供应商接口获取) |
| status | VARCHAR(20) | `discovered`, `testing`, `test_passed`, `test_failed`, `ignored`, `expired` |
| discovered_at | TIMESTAMPTZ | 首次发现时间 |
| tested_at | TIMESTAMPTZ | 最近一次测试时间 |
| failure_reason | TEXT | 测试失败原因 |
| ignored_until | TIMESTAMPTZ | 忽略有效期 |
| created_at | TIMESTAMPTZ | |
| updated_at | TIMESTAMPTZ | |
唯一约束:`(platform, model_id)`
### A.2 auto_registration_tasks
| 字段 | 类型 | 说明 |
|-----|------|------|
| id | BIGINT PK | 自增 |
| platform | VARCHAR(50) | 目标供应商 |
| task_type | VARCHAR(20) | `register`, `verify`, `rotate_key` |
| status | VARCHAR(20) | `pending`, `running`, `completed`, `failed`, `dead_letter` |
| context | JSONB | 任务上下文(如申请的邮箱、注册步骤状态机) |
| result_account_id | BIGINT | 成功后关联的 supply_accounts.id |
| failure_reason | TEXT | |
| retry_count | INT DEFAULT 0 | |
| next_retry_at | TIMESTAMPTZ | |
| created_at | TIMESTAMPTZ | |
| updated_at | TIMESTAMPTZ | |
### A.3 probe_execution_logs
| 字段 | 类型 | 说明 |
|-----|------|------|
| id | BIGINT PK | 自增 |
| account_id | BIGINT FK | supply_accounts.id |
| probe_type | VARCHAR(20) | `connectivity`, `quota`, `key_validity` |
| result | VARCHAR(20) | `success`, `failure`, `inconclusive` |
| http_status | INT | |
| latency_ms | INT | |
| error_code | VARCHAR(50) | 平台内部错误码 |
| error_message | TEXT | |
| executed_at | TIMESTAMPTZ | |
索引:`account_id + executed_at DESC`,保留策略 30 天。
---
## 自检清单
- [x] 已明确真实目标(降低供应商失效导致的错误率、缩短新模型上市时间、减少人工维护工时),不是只复述功能。
- [x] 已写清 In Scope / Out of Scope边界以模块和具体场景描述。
- [x] 每个 AC 都可被 QA 或测试用例直接验证(含具体数值、时间、状态、测试方法)。
- [x] 已覆盖异常流Rate Limit、超时、网关不可用、边缘流模型 ID 变更、手动暂停探针、并发操作)与失败路径(共 10 条)。
- [x] 已补齐上线、运营、监控、回滚要求(灰度三阶段、回滚条件、埋点、监控指标、告警规则、预置 FAQ
- [x] 已定义商业化/价值闭环(直接收益、成本节省、质量溢价三条路径)。
- [x] 已定义成功指标BG-01/03/04 + SFI与失败判定线4 条止损条件)。
- [x] 已明确当前是否可进入 TechLead 阶段:可进入 TechLead 评审,但需补充 4 项信息后方可进入开发排期。
- [x] 没有使用"优化、支持、友好、尽量、快速"等模糊词替代明确要求;所有时间、比例、次数均为具体数值或明确公式。
---

188
prd/competitor-analysis.md Normal file
View File

@@ -0,0 +1,188 @@
# Supply-Intelligence 供应链智能增强 — 竞品分析报告
## 1. 竞品范围
| 竞品 | 项目地址 | 技术栈 | 相关能力 |
|-------|---------|--------|---------|
| **LiteLLM** | berriai/litellm | Python/FastAPI | 模型定价数据库、自动路由、新模型告警、部署冷却、容灾切换 |
| **Sub2API** | Wei-Shaw/sub2api | Go/Gin/Ent | 模型定价镜像、代理管理、账号/订阅管理、用量统计、公告系统 |
| **NewAPI / OneAPI** | Calcium-Ion/new-api | Go/Gin/GORM | 渠道管理、模型配置、上游状态监控 |
---
## 2. 核心能力对标
### 2.1 模型定价与供应商数据库
#### LiteLLM Model Prices Database
LiteLLM 维护了行业内最完整的模型定价数据库 `model_prices_and_context_window_backup.json`
**关键特征**:
- 覆盖 100+ 供应商、1000+ 模型
- 每个模型包含input_cost_per_token, output_cost_per_token, context_window, max_tokens, supports_vision, supports_function_calling 等
- 支持分层定价tiered_pricing如 >128k tokens 时使用不同单价
- 支持批量定价batch pricing
- 支持音频 token 定价
- 支持自定义成本覆盖
**更新机制**:
- 主数据库内置在代码中,通过版本发布更新
- 支持远程拉取更新(可配置镜像源)
- Sub2API 就是从 LiteLLM 上游镜像此文件
#### Sub2API Pricing Service
Sub2API 的定价服务是被动消费型的(从上游获取):
**关键设计**:
- 远程拉取 LiteLLM 镜像 `model_prices_and_context_window.json`
- 本地 fallback 文件缓存
- SHA256 hash 验证更新
- 模型家族回退算法:未知模型按命名规则回退到已知模型
- 例如gpt-5.3 未知 → 回退到 gpt-5.1
- 例如claude-unknown → 回退到 claude-sonnet
- 动态价格字段优先级配置
**缺陷**:
- 被动获取,无主动发现新模型能力
- 无模型质量探针(仅依赖定价数据)
- 无自动测试和准入检查
### 2.2 供应商/渠道管理
#### Sub2API Proxy & Account Management
Sub2API 提供了完整的上游管理能力:
**代理管理** (`Proxy` schema):
```go
type Proxy struct {
name string // 代理名称
protocol string // 协议
host string // 主机
port int // 端口
username string // 用户名(可选)
password string // 密码(可选)
status string // active / inactive
}
```
**账号管理** (`Account` schema):
- 支持多个上游供应商
- 每个账号关联一个代理Proxy
- 支持账号分组AccountGroup
- 软删除机制
**用量统计** (`UsageLog`):
- 详细记录每次请求的模型、token数、成本、时间戳
- `UsageCleanupTask`: 定期清理过期用量数据
#### NewAPI/OneAPI 渠道管理
- 支持多个上游渠道配置
- 渠道状态监控(可用/不可用)
- 支持渠道优先级和权重
- 支持渠道购买次数限制
### 2.3 自动路由与容灾
#### LiteLLM Router & Auto-Router
LiteLLM 的路由系统是其核心竞争力:
**路由策略**:
- **lowest_latency**: 选择响应最快的部署
- **lowest_cost**: 选择成本最低的部署
- **lowest_tpm_rpm**: TPM/RPM 最低
- **least_busy**: 负载最低
- **auto_router**: 语义路由(基于请求内容匹配最适模型)
- **budget_limiter**: 按 key/team 限制预算
**容灾机制**:
- **Cooldown**: 连续失败的部署自动进入 cooldown暂时从路由池移除
- **Fallback**: 主模型失败时自动切换到备用模型
- **Retries**: 可配置重试次数和策略
**新模型告警** (`new_model_added`):
- 当新模型上线时发送 Slack 告警
- 但仅限于通知,无结构化的准入测试流程
### 2.4 用户与订阅管理
#### Sub2API 用户体系
- `User`: 基础用户信息
- `UserSubscription`: 订阅计划、配额、到期时间
- `UserAttributeDefinition` / `UserAttributeValue`: 用户自定义属性
- `PromoCode` / `RedeemCode`: 营销代码系统
- `SecuritySecret`: 安全凭证管理
---
## 3. 差距分析(我们的机会)
| 能力维度 | 竞品现状 | 我们的机会 |
|---------|---------|---------|
| **模型发现** | LiteLLM 被动维护定价库，Sub2API 被动镜像 | 主动全网扫描发现新模型（爬取供应商 API、HN、Twitter、官方文档） |
| **准入测试** | 竞品均不具备 | 自动化准入测试流程,含功能、性能、成本、安全等维度 |
| **质量探针** | LiteLLM 仅有基础 cooldown无深度探针 | 多维度品质探针:连通性、配额、延迟、错误率、响应质量 |
| **自动注册** | 竞品均不支持 | 自动在供应商后台注册账号、申请 API Key |
| **账号生命周期** | Sub2API 有基础账号管理,无自动更新 | 自动轮换密钥、检测过期、自动补充账号 |
| **供应商健康大盘** | Sub2API 有用量统计,无综合健康视图 | 统一供应商健康大盘,实时可视化 |
| **模型比价** | LiteLLM 有定价库,但无比价能力 | 同类模型多供应商价格对比,智能推荐最优供应商 |
| **运营工作台** | 竞品均为散点式管理 | 统一运营工作台,支持干预操作(暂停、强制切换、测试触发) |
| **模型下线预测** | LiteLLM 有新模型告警,但无下线预测 | 基于用量趋势和供应商动态预测模型下线 |
| **自动化闭环** | 竞品均为人工配置 | 发现 → 测试 → 准入 → 上线 → 监控 → 下线 全自动化 |
---
## 4. 对产品规划的影响
### 强化方向
1. **模型定价数据库参考 LiteLLM**
- 维护标准化的模型定价数据库,支持 input/output cost、context window、功能支持等字段
- 支持远程更新和本地 fallback
- 支持模型家族回退
2. **供应商账号管理参考 Sub2API**
- 代理Proxy管理协议、主机、端口、状态
- 账号分组AccountGroup
- 软删除机制
- 安全凭证管理
3. **用量统计参考 Sub2API**
- 详细 UsageLog 记录
- 定期清理机制
- 用户-订阅-用量关联
4. **路由策略参考 LiteLLM**
- 多种路由策略latency、cost、load、semantic
- 容灾切换机制
- 部署冷却
### 新增差异化能力
5. **主动全网模型发现**:竞品均为被动维护,我们应主动扫描
6. **自动准入测试**:竞品不具备,是核心差异化
7. **自动账号注册**:竞品不支持,是核心差异化
8. **智能推荐**:基于价格、质量、位置的供应商推荐
9. **预测性分析**:模型下线预测、供应商变动预测
---
## 5. 对技术规划的影响
### 应引入的设计模式
| 设计模式 | 来源 | 应用场景 |
|---------|------|---------|
| **Model Prices Database** | LiteLLM | 模型定价数据库,支持远程更新和本地 fallback |
| **SHA256 Hash 验证** | Sub2API | 定价数据更新的完整性验证 |
| **模型家族回退** | Sub2API | 未知模型的智能回退 |
| **Proxy + Account 关联** | Sub2API | 上游代理与账号的关联管理 |
| **UsageLog + CleanupTask** | Sub2API | 用量记录与定期清理 |
| **路由策略抽象** | LiteLLM | 支持多种路由策略的插件化设计 |
| **Cooldown + Fallback** | LiteLLM | 故障部署的自动处理 |
### 技术避坑
1. **不重复造轮子**: 定价数据库可以直接复用 LiteLLM 的开源数据,不需要自己维护
2. **发现与测试解耦**: 模型发现和准入测试应该解耦,支持独立触发和组合触发
3. **注册模块的可扩展性**: 每个供应商的注册流程不同,需要抽象接口 + 具体实现
4. **测试隔离**: 准入测试不得影响生产环境,必须使用独立账号或模拟环境

243
specs/功能清单.md Normal file
View File

@@ -0,0 +1,243 @@
# Supply Intelligence 功能清单(按钮级任务版)
> 状态说明2026-05 收敛修订):本文件为旧版按钮级任务清单,已不再作为当前实施真源。
> 当前实施真源以“2026-05 新 PM 基线 + tech/BASELINE_TECHLEAD_V2.md + 首期消费闭环决议”为准。
> 下列旧任务类型已明确废止或降期,不得继续直接派发给 Engineer
> - gateway 管理接口热更新主路径
> - pricing / prediction / 向量检索 / SFI 仪表盘等超范围能力
> - 自动注册深链路作为本期硬门槛
> - 以 Temporal / 独立 worker / 独立平台骨架为默认落地前提
> 版本v1.0
> 日期2026-04-27
> 说明:每个任务 5 分钟可完成,可直接安排进任务管理
---
## Phase 1：模块 A（探针）+ 模块 E（工作台只读观测）
### 模块 A1探针管理基础
#### A1.1 供应商账号列表页
- [ ] **任务**:实现供应商账号列表页路由 `/supply/dashboard/accounts`
- [ ] **任务**在账号列表渲染数据表格每行显示账号ID / 供应商名称 / 账号标识(昵称) / 当前状态(徽章) / 风险评分 / 最近探针时间 / 操作
- [ ] **任务**账号状态徽章颜色active=绿色 / suspended=黄色 / disabled=红色
- [ ] **任务**账号行风险评分显示为进度条0-100>80 显示红色
- [ ] **任务**:账号行渲染"查看详情"按钮,点击展开显示最近 5 次探针结果
- [ ] **任务**:账号列表支持分页,每页 50 条
- [ ] **任务**:账号列表支持按供应商名称筛选(下拉框)
- [ ] **任务**:账号列表支持按状态筛选(全部 / active / suspended / disabled
- [ ] **任务**:账号列表支持按风险评分范围筛选(滑块)
#### A1.2 账号详情页
- [ ] **任务**:实现账号详情页路由 `/supply/dashboard/accounts/{account_id}`
- [ ] **任务**详情页渲染账号基本信息区块账号ID / 供应商 / 状态 / 创建时间 / 最近探针时间
- [ ] **任务**:详情页渲染探针历史时间线,每条显示:探针时间 / 结果(成功/失败/不可判定) / 延迟 / HTTP状态码 / 风险评分
- [ ] **任务**:详情页渲染"手动触发探针"按钮,点击后立即执行一次探针,显示加载状态,完成后刷新时间线
- [ ] **任务**:详情页渲染"暂停此账号探针"开关按钮(默认关闭),开启后该账号不参与自动探针
- [ ] **任务**:详情页渲染"查看历史状态变更"按钮,点击展开状态变更记录(时间 / 从 → 到 / 原因)
#### A1.3 探针后端核心
- [ ] **任务**:实现探针调度器(基于主仓既有调度能力或轻量本地调度器,每 5 分钟轮询所有 active/suspended 账号)
- [ ] **任务**:实现探针执行器,对单个账号发起 HTTP GET/POST 请求,记录响应码/延迟/返回体
- [ ] **任务**实现探针结果评估逻辑HTTP 200 = 成功 / 401/403 = 明确失败 / 429/5xx/超时/格式突变 = 不可判定
- [ ] **任务**：实现状态机：`active` 收到 1 次 explicit_failure → `suspended`；`suspended` 连续 3 次 explicit_failure → `disabled`
- [ ] **任务**:实现 429/暂时性错误指数退避1min → 2min → 4min 重试,超 3 次则本次跳过并保留状态
- [ ] **任务**:实现探针结果写入 `supply_intelligence_probe_logs` 表,保留 30 天
### 模块 A2供应商适配层
#### A2.1 供应商适配器框架
- [ ] **任务**:定义 `SupplierAdapter` 接口:`(Probe(ctx context.Context, account Account) ProbeResult, GetModels(ctx context.Context, account Account) ([]Model, error))`
- [ ] **任务**:实现 `SupplierAdapterRegistry` map按供应商名称注册适配器实例
- [ ] **任务**:实现配置文件加载供应商适配器列表(`suppliers[].name` + `suppliers[].adapter`
- [ ] **任务**每个适配器实现health check端点探测发送测试请求验证连通性
#### A2.2 Phase 1 目标供应商适配2个
- [ ] **任务**:实现 OpenAI 供应商适配器Probe用 /v1/models 查询;获取模型列表:用 /v1/models
- [ ] **任务**:实现 Anthropic 供应商适配器Probe用 /v1/models 查询;获取模型列表:用 /v1/models
- [ ] **任务**适配器配置项API Base URL / API Key加密存储/ 是否允许受控自动补给 / Rate Limit 阈值
### 模块 E1运营工作台只读观测部分
#### E1.1 工作台首页
- [ ] **任务**:实现工作台首页路由 `/supply/dashboard`
- [ ] **任务**:首页渲染 4 个统计卡片:账号总数(按状态颜色分段) / 本小时新发现模型数 / 待处理候选模型数 / 受控自动补给任务队列长度
- [ ] **任务**:首页渲染候选处理与账号健康摘要(避免引入 SFI 仪表盘等超范围指标体系)
- [ ] **任务**:首页渲染"探针健康度"简表显示各供应商最后探针结果绿色OK/黄色不可判定/红色明确失败/灰色未探)
#### E1.2 待处理事项列表
- [ ] **任务**:在工作台首页渲染"待处理" Tab展示以下待办项
- 风险评分 > 70 的账号(红色高亮)
- 状态 = discovered 的候选模型(待准入测试)
- 受控自动补给失败或待验证的任务(待人工介入)
- 模型已下架告警(待确认)
- [ ] **任务**:每项待办渲染"处理"按钮,点击进入对应详情页
- [ ] **任务**:每项待办渲染"忽略"按钮点击后该项从待办列表暂时移除3小时后重现
---
## Phase 2：模块 B（模型发现）+ 模块 C（准入测试）
### 模块 B1模型发现
#### B1.1 模型列表页
- [ ] **任务**:实现模型列表页路由 `/supply/dashboard/models`
- [ ] **任务**模型列表每行显示模型ID / 所属供应商 / 当前状态(活跃/草稿/已下线/发现中/测试失败) / 发现时间 / 来源
- [ ] **任务**:状态筛选 Tab全部 / 发现中 / 待测试 / 活跃 / 已下线
- [ ] **任务**:模型列表支持按供应商筛选
- [ ] **任务**:模型列表支持按发现时间范围筛选
- [ ] **任务**:模型行点击"查看详情"进入模型详情页
#### B1.2 模型发现后端
- [ ] **任务**:实现模型发现调度任务(基于主仓既有调度能力或轻量本地调度器),每 1 小时触发一次扫描
- [ ] **任务**:实现模型列表抓取器:调用各供应商适配器的 `GetModels()` 方法
- [ ] **任务**:实现模型比对逻辑:将抓取的模型列表与 `supply_packages` 中 active/paused/draft 记录去重
- [ ] **任务**:发现新模型时,写入 `supply_intelligence_model_candidates`status = discovered
- [ ] **任务**：发现模型下架时（平台存在 active 的 package 记录，但该 model_id 已从供应商模型列表中消失），写入运营告警，不改变 package 状态
- [ ] **任务**实现模型来源记录discovery_source 字段official_api / manual_import
### 模块 C1准入测试
#### C1.1 准入测试配置
- [ ] **任务**:实现测试用例管理页路由 `/supply/dashboard/tests/cases`
- [ ] **任务**测试用例列表每行显示用例ID / 所属模型类型 / 测试目标(endpoint) / 状态(启用/禁用)
- [ ] **任务**:渲染"新增用例"按钮点击弹出用例创建表单endpoint地址 / 请求方法 / 预期响应格式 / 超时时间)
- [ ] **任务**测试用例表单支持选择模板chat/completion/embedding
- [ ] **任务**:实现每个模型类型默认测试用例集(≥ 5 个用例)
#### C1.2 准入测试执行
- [ ] **任务**:实现准入测试任务流,接收 candidate_id 参数并由主仓既有调度能力或轻量任务执行器驱动
- [ ] **任务**`AdmissionTestWorkflow``model_candidates` 加载 discovered 状态的候选模型
- [ ] **任务**:按顺序执行所有启用的测试用例,记录每条的 HTTP 状态/延迟/响应格式/Token 计数
- [ ] **任务**:所有用例返回 HTTP 200 + 格式正确 → 更新 candidate status = test_passed生成 supply_package 草稿
- [ ] **任务**:任意用例返回非 200 或格式错误 → 更新 candidate status = test_failed写入 failure_reason
- [ ] **任务**单个用例超时60 秒)→ 标记为 timeout整体判定失败
- [ ] **任务**:准入测试完成后,发送飞书通知给运营人员
#### C1.3 草稿生成
- [ ] **任务**:准入测试通过后,自动生成 `supply_packages` 草稿记录status = draft
- [ ] **任务**草稿字段platform / model_id / model_name / price_per_1m_input默认值/ price_per_1m_output默认值/ suggested_by = si_auto
- [ ] **任务**:草稿生成后,在工作台"待上架"列表中显示该草稿
---
## Phase 3：模块 D（受控自动补给）+ 模块 E（工作台完整干预）
### 模块 D1受控自动补给配置
#### D1.1 自动补给设置页
- [ ] **任务**:实现自动补给设置页路由 `/supply/dashboard/auto-supply/settings`
- [ ] **任务**:页面渲染供应商列表,每行显示:供应商名称 / 是否开启受控自动补给(开关)/ 可用账号阈值(数字输入)/ 状态
- [ ] **任务**:点击供应商行"配置"按钮,弹出自动补给配置弹窗
- [ ] **任务**:配置弹窗字段:启用自动补给(开关)/ 白名单供应商标记 / 触发阈值(账号数)/ 补给方式(任务化/人工补录入口)/ 审批要求
- [ ] **任务**:弹窗保存后,按主仓既有配置方式持久化并生效,不引入 Redis 首期前置依赖
- [ ] **任务**:配置页顶部渲染"通知/补给受理链路测试"按钮,点击后发送测试通知或验证受理接口可达
#### D1.2 自动补给执行后端
- [ ] **任务**:实现受控自动补给任务流,监控白名单供应商可用账号数 < 阈值时触发
- [ ] **任务**:按供应商配置创建补给任务或调用受控补给受理接口,禁止默认走浏览器自动化注册深链路
- [ ] **任务**:补给成功后写入待验证/待启用记录,不允许绕过验证直接进入 active
- [ ] **任务**:若涉及凭证写入,则将密钥发送至 KMS 加密,密文存入 `supply_accounts`
- [ ] **任务**:触发验证或人工审核链路,验证通过后再进入可用状态
- [ ] **任务**:补给失败时,写入 `supply_intelligence_auto_supply_tasks` 或等价任务表status = failed记录失败原因
### 模块 D2Fail-closed 安全机制
- [ ] **任务**:补给流程中,若通知网关/补给受理接口返回 503 或超时,任务立即标记为 failed不执行虚假成功写操作
- [ ] **任务**:补给流程中,若 KMS 加密超时60 秒),任务立即标记为 failed
- [ ] **任务**:明文凭证在内存中的存活时间不超过 60 秒,超时自动清除
- [ ] **任务**:审计日志中记录补给请求/响应(脱敏后:隐藏敏感标识、隐藏凭证)
### 模块 E2工作台完整干预
#### E2.1 候选模型处理
- [ ] **任务**:工作台"待上架模型"列表每行显示模型ID / 供应商 / 发现时间 / 测试结果摘要 / 来源
- [ ] **任务**:模型行渲染"查看测试详情"按钮,点击展开显示所有测试用例结果(每条:通过/失败/超时)
- [ ] **任务**:模型行渲染"确认上架"绿色按钮,点击后弹出确认框(显示将生成的 package 草稿内容)
- [ ] **任务**:模型行渲染"忽略"按钮,点击后该模型 7 天内不出现(写入 ignored_until 字段)
- [ ] **任务**:模型行渲染"手动强制上架"橙色按钮(仅测试失败时可见),点击后需填写强制上架理由(必填)
#### E2.2 草稿确认上架
- [ ] **任务**:点击"确认上架"后PUT `supply_packages/{id}` status = active
- [ ] **任务**:同时更新 `model_candidates` 对应记录 status = published
- [ ] **任务**:写入 gateway package change event等待首期消费方按决议链路拉取并 ack
- [ ] **任务**:完成后显示成功提示:"模型已上架,已生成待消费变更事件;是否进入路由以消费方 ack 为准"
#### E2.3 工单与通知
- [ ] **任务**:模型下架告警 → 自动生成运营工单(类型 = model_deprecated推送到运营工作台
- [ ] **任务**:受控自动补给失败 → 自动生成运营工单(类型 = auto_supply_failed推送飞书通知
- [ ] **任务**:连续 3 次探针失败账号 → 生成运营工单(类型 = account_risk推送飞书通知
---
## 全局模块
### 模块 G1供应商配置管理
- [ ] **任务**:实现供应商列表页路由 `/supply/dashboard/settings/suppliers`
- [ ] **任务**供应商列表每行显示供应商ID / 名称 / 适配器类型 / 账号数量 / 接口状态 / 操作
- [ ] **任务**:渲染"添加供应商"按钮,点击弹出供应商创建表单
- [ ] **任务**:供应商表单字段:名称 / 适配器类型(下拉) / API Base URL / API Key加密存储/ 探针周期(默认5min) / 是否启用
- [ ] **任务**:实现供应商"测试连通性"按钮,点击后执行一次 probe 并显示结果
- [ ] **任务**:供应商配置变更后,刷新当前集成运行实例中的适配器装配或调度配置(不得以 Temporal Worker 作为首期前置依赖)
### 模块 G2配置热更新
- [ ] **任务**:关键配置项(探针周期/扫描周期/阈值)按主仓既有配置方式存储与生效,避免把 Redis 作为首期前置依赖
- [ ] **任务**:实现 `GET /api/v1/supply-intelligence/config` 接口,返回当前生效配置
- [ ] **任务**:实现 `PUT /api/v1/supply-intelligence/config` 接口,修改配置后 60 秒内生效
- [ ] **任务**配置变更生成审计日志记录action = config_update
- [ ] **任务**:不支持的配置项修改返回 400 错误码
### 模块 G3OpenAPI + 健康检查
- [ ] **任务**:实现 `GET /actuator/health` / `/actuator/health/live` / `/actuator/health/ready`
- [ ] **任务**:实现 Swagger UI 路由 `/docs`
- [ ] **任务**:实现 OpenAPI 3.0 spec 端点 `/openapi.json`
- [ ] **任务**:实现关键后台任务执行链路健康检查,调度/执行链路不可用时 `/actuator/health/ready` 返回 503
### 模块 G4权限与认证
- [ ] **任务**：实现 JWT 认证中间件（与立交桥统一认证打通）
- [ ] **任务**:实现角色权限:运营人员(观测 + 部分操作)/ 管理员(全部操作)
- [ ] **任务**:权限不足返回 HTTP 403错误码 `SUP_INT_AUTH_1001`
---
## 技术基础设施
### T1项目骨架
- [ ] **任务**:初始化或挂载到主仓中的 Go module / 子模块边界,保持与 supply-api 一致的技术栈约束
- [ ] **任务**:创建集成运行入口;如保留独立运行,也仅作为轻量可选形态,不以双进程 `api`/`worker` 为首期强依赖
- [ ] **任务**:创建 `internal/` 目录结构domain/service/handler/infrastructure/repository
- [ ] **任务**:配置 Viper 读取 `config.yaml`,支持环境变量覆盖
- [ ] **任务**:配置 `log/slog` 结构化日志,输出 JSON 格式
- [ ] **任务**:创建 PostgreSQL schema migration使用 golang-migrate表前缀 `supply_intelligence_`
- [ ] **任务**:按主仓既有能力接入配置、调度、审计与内部路由,不额外引入 Redis 作为首期前置依赖
- [ ] **任务**:配置 Dockerfile 和最小部署说明,优先支持主仓集成部署
- [ ] **任务**:如需部署文档,仅按当前真源补充最小启动命令,不再回写旧 `DEPLOYMENT.md` 为实现依据
### T2单元测试骨架
- [ ] **任务**:为每个 domain 层函数编写单元测试,覆盖率 >= 70%
- [ ] **任务**:为每个 service 层函数编写单元测试,覆盖率 >= 80%
- [ ] **任务**:配置 CIGitHub ActionsPR 必须通过全部测试和覆盖率检查
### T3IntegrationPlugin 接口
- [ ] **任务**:实现 `IntegrationPlugin` 接口(`Init() error` / `Serve() error` / `Shutdown() error`
- [ ] **任务**:实现插件模式下各模块的开关配置(`viper` 读取 `supply_intelligence.enabled_modules`
- [ ] **任务**:实现内部/外部路径前缀可配置,并区分 `/internal/supply-intelligence/` 与对外暴露路径
- [ ] **任务**:编写集成测试:插件模式启动,关键探针/发现/发布事件接口与内部消费接口正常运作
---
## 任务估算汇总
| Phase | 模块 | 任务数 | 估计工时 |
|-------|------|--------|---------|
| Phase 1 | A1 探针管理 + A2 适配层 + E1 工作台只读 | 34 | 3 人天 |
| Phase 2 | B1 模型发现 + C1 准入测试 | 22 | 3 人天 |
| Phase 3 | D1/D2 受控自动补给 + E2 工作台干预 | 24 | 3 人天 |
| 全局 | G1 供应商配置 + G2 配置热更新 + G3 OpenAPI + G4 权限认证 | 18 | 2 人天 |
| 技术基础设施 | T1 骨架 + T2 测试 + T3 插件 | 14 | 2 人天 |
| **合计** | | **112** | **~13 人天** |

124
specs/竞品分析.md Normal file
View File

@@ -0,0 +1,124 @@
# Supply Intelligence 竞品深度分析
> 版本v1.0
> 日期2026-04-27
> 内容8 个竞品全景矩阵、功能逐项对比、技术分析、市场定位
---
## 一、市场概览
- 归并到 LLM API Gateway 市场2025 年约 **$15-25 亿**,高速增长
- 多供应商运营复杂度急剧上升:中等规模团队可能接入 10+ 供应商20+ 账号
- 供应商 API Key 失效/额度耗尽是高频线上事故根因,单次事故损失 $5000-50000
- 新模型发布速度:每月 50+ 新模型,人工录入无法跟上
- **市场空白**:供应链运营自动化(供应商账号健康、模型发现、准入测试)几乎无成熟方案
---
## 二、竞品全景矩阵（8 个）
| 竞品 | 类型 | 供应商账号健康探针 | 新模型自动发现 | 准入测试自动化 | 账号自动注册 | 运营工作台 | 定价 |
|------|------|-----------------|-------------|-------------|------------|----------|------|
| **LiteLLM** | 开源 | ❌ 手动录入 | ❌ 无 | ❌ 无 | ❌ 无 | ⚠️ 简单管理 | 免费(自部署) |
| **Helicone** | SaaS/开源 | ❌ 手动管理 | ❌ 无 | ❌ 无 | ❌ 无 | ⚠️ 简单 | 免费+$0.05/请求 |
| **Portkey** | SaaS | ❌ 手动管理 | ❌ 无 | ❌ 无 | ❌ 无 | ⚠️ 简单 | $49/月起 |
| **OpenRouter** | SaaS | ❌ 手动管理 | ⚠️ 手动 | ❌ 无 | ❌ 无 | ⚠️ 简单 | 5% 手续费 |
| **Kong AI Gateway** | 企业 | ❌ 手动管理 | ❌ 无 | ❌ 无 | ❌ 无 | ⚠️ API 管理 | 面议 |
| **One API / NewAPI** | 开源 | ❌ 手动管理 | ❌ 无 | ❌ 无 | ❌ 无 | ⚠️ 简单 | 免费 |
| **RapidAPI Enterprise Hub** | SaaS | ⚠️ 入驻流程 | ⚠️ 手动 | ❌ 无 | ⚠️ 部分 | ✅ | $2-10 万/年 |
| **内部自建(现状)** | — | ❌ 无监控 | ❌ 无 | ❌ 无 | ❌ 无 | ❌ 无 | 人力成本 |
| **立交桥 supply-intelligence** | 内部工具 | ✅ 分钟级探针 | ✅ 自动发现 | ✅ 自动化流水线 | ✅ 白名单供应商 | ✅ 完整工作台 | 内部成本 |
---
## 三、功能逐项对比（12 项）
```
功能项 LiteLLM Helicone Portkey OpenRouter NewAPI RapidAPI supply-intel
供应商账号健康探针 ❌ ❌ ❌ ❌ ❌ ⚠️ ✅
新模型自动发现 ❌ ❌ ❌ ⚠️ ❌ ⚠️ ✅
模型准入测试 ❌ ❌ ❌ ❌ ❌ ❌ ✅
supply_package 草稿生成 ❌ ❌ ❌ ❌ ❌ ❌ ✅
账号自动注册 ❌ ❌ ❌ ❌ ❌ ⚠️ ✅
运营工作台 ⚠️ ⚠️ ⚠️ ⚠️ ⚠️ ✅ ✅
KMS 凭证加密 ❌ ❌ ⚠️ ❌ ❌ ✅ ✅
审计日志 ⚠️ ✅ ✅ ⚠️ ⚠️ ✅ ✅
供应商状态 API 对外提供 ❌ ❌ ❌ ❌ ❌ ❌ ✅
Fail-closed 降级 ❌ ❌ ⚠️ ❌ ❌ ⚠️ ✅
SFI 指标追踪 ❌ ❌ ❌ ❌ ❌ ❌ ✅
独立/集成双模式 ❌ ❌ ❌ ⚠️ ⚠️ ❌ ✅
```
---
## 四、市场定位结论
### 4.1 竞品空白
**所有 LLM Gateway 竞品LiteLLM/Helicone/Portkey/OpenRouter只做**
- 统一 API 路由
- 用量计量和计费
- 基础监控和日志
**没有任何竞品提供:**
1. 供应商账号健康度的分钟级自动探针(额度/密钥/TOS
2. 新模型发布的自动发现(对接供应商模型列表 API
3. 模型准入测试的自动化(功能验证 + supply_package 草稿生成)
4. 供应商账号的自动注册(针对支持公开注册的供应商)
### 4.2 supply-intelligence 差异化定位
```
LLM GatewayLiteLLM/Helicone/Portkey/OpenRouter
└─ 能力边界:路由 + 计量 + 监控
└─ 缺失:供应商运营能力
API MarketplacesRapidAPI
└─ 能力边界:供应商入驻 + 文档 + 货币化
└─ 缺失:自动化运营工具
内部自建(现状)
└─ 能力边界:手动维护
└─ 缺失:自动化 + 监控 + 实时性
───────────────────────────────────
立交桥 supply-intelligence = 供应链运营自动化
✅ 供应商健康探针(分钟级)
✅ 新模型自动发现（对接供应商 API）
✅ 准入测试自动化(功能验证)
✅ 运营工作台(待办 + 一键上架)
✅ 账号自动注册(白名单供应商)
```
---
## 五、关键技术差异
### 5.1 探针方案对比
| 方案 | 代表竞品 | 频率 | 自动化程度 |
|------|---------|------|----------|
| 手动检查 | 内部自建 | 天级 | ❌ |
| 被动监控 | LLM Gateway 竞品 | 被动 | ⚠️ 有限 |
| 主动探针 | **supply-intelligence** | 分钟级 | ✅ 完整 |
### 5.2 模型发现方案对比
| 方案 | 代表竞品 | 延迟 | 自动化程度 |
|------|---------|------|----------|
| 人工录入 | 内部自建 | 天级 | ❌ |
| 供应商通知 | RapidAPI | 小时级 | ⚠️ 被动 |
| 自动扫描 | **supply-intelligence** | 分钟级 | ✅ 主动 |
---
## 六、技术选型建议
| 组件 | 推荐方案 | 理由 |
|------|---------|------|
| 探针调度 | Temporal | 分布式友好，exponential backoff、dead letter queue 内置 |
| 供应商 API 对接 | 配置化 adapter | 供应商数量多,接口差异大,需可扩展 |
| 凭证加密 | KMS + AES-256-GCM（兜底） | 符合安全审计要求 |
| 模型发现 | 轮询为主 | 多数供应商无 Webhook轮询更通用 |
| 准入测试 | 异步任务队列 | 测试可能耗时长,不能阻塞扫描周期 |

View File

@@ -0,0 +1,609 @@
> 真源索引:当前文档受 `/home/long/project/立交桥/projects/supply-intelligence/tech/CURRENT_SOURCE_OF_TRUTH_2026-05.md` 纳管。
> 若阅读顺序、真源优先级或跨文档冲突存在疑问,先看该索引,再回到本基线文档。
1. 设计范围:本次覆盖 / 明确不做 / 与 PRD 对应关系
1.1 本次覆盖
- 覆盖模块 A 供应商品质探针,但收敛为“账号健康探测 + 状态写回 + 审计 + gateway 可消费状态快照”。
- 覆盖模块 B 全网模型发现,但收敛为“已接入供应商的模型列表扫描 + candidate 生成 + 下架告警”,不做广义全网情报平台。
- 覆盖模块 C 模型准入测试,但收敛为“对 discovered candidate 做标准兼容性验证 + 生成 supply_package 草稿 + 发布闭环”。
- 模块 E 仅覆盖与主链路直接相关的最小运营干预:手动触发探针、忽略 candidate、确认上架、查看失败原因、查看审计。
- 覆盖与立交桥主项目的优先集成运行方案。
- 覆盖对 NewAPI / Sub2API 的最小适配边界:状态读取、模型列表消费、可选发布回调;不反向侵入其内部实现。
- 覆盖五个 QA 阻塞的显式修复:
1) 设计范围蔓延
2) 探针误判规则冲突
3) candidate 状态机不闭环
4) 模块关闭一致性缺失
5) gateway 消费链路未闭环
1.2 明确不做
- 不做独立平台化、多服务拆分、专用 API Gateway、专用消息总线、专用控制台集群。
- 不做 vector / embedding 检索 / 向量库。
- 不做 pricing 数据库、模型比价主链路、自动定价、家族回退定价。
- 不做 predictions / 预测分析 / 广义开放平台 / 社区情报源。
- 不做 WebSocket 实时推送作为本期前提;工作台可先走普通 HTTP 拉取。
- 不做 Playwright 浏览器自动化注册主路径;本期仅保留受控自动补给的最小边界:白名单供应商、阈值触发、任务化补给、待验证/待启用,不把浏览器自动化注册链路作为首期上线门槛。
- 不要求独立 Redis/Temporal/Milvus/Qdrant 等新增重基础设施;优先复用立交桥现有 DB、现有 scheduler、现有审计、现有配置热更新机制。
- 不自动直接操作 gateway 路由细节表;只提供 package 发布后的内部契约,由 gateway/supply-api 按既有主链路消费。
1.3 与 PRD 对应关系
- AC-01/02/03保留落在探针执行、判定、状态迁移、审计与降级策略。
- AC-04/05保留落在扫描、去重、新增 candidate、下架告警。
- AC-06/07保留落在 admission runner、candidate 流转、draft package 生成。
- AC-08/09本期不做深自动注册链路但保留“受控自动补给”的最小产品/技术边界:仅允许白名单供应商、仅允许阈值触发、仅允许生成待补给任务或进入待验证/待启用,不允许把注册浏览器自动化、验证码编排、自动激活作为首期硬门槛。
- AC-10/11/12保留但只保留支撑主链路的最小实现不扩展成独立大盘平台。
- PRD 中与商业化、SFI、预测分析、比价报表相关内容不作为本次技术控制面主路径。
2. 架构与模块:模块划分、文件/目录落点、关键调用链路、关键依赖与降级边界
2.1 总体架构
基线采用“立交桥主项目内集成模块”模式,而不是独立平台。推荐以 supply-api 内部模块形式落地,原因:
- 直接复用 supply_accounts / supply_packages / audit / verify / config / scheduler。
- 避免再造服务间调用、鉴权、部署、监控、迁移复杂度。
- 更符合立交桥现有 net/http + pgx + PostgreSQL 的简洁架构。
独立运行能力保留为简单可选形态:
- 仅在确有外部项目需要时,封装为同仓内单进程启动入口。
- 独立运行不得要求新增专用基础设施;仍使用 PostgreSQL + 现有 scheduler 抽象。
- 不额外设计独立控制台、独立 worker 集群、独立 API 网关。
2.2 模块划分
建议收敛为 6 个模块,均为最小必要:
A. probe
- 读取待探测账号
- 执行标准探针
- 依据统一判定规则生成 outcome
- 驱动 account 状态迁移
- 写审计与探针日志
B. discovery
- 读取已接入供应商适配器
- 拉取模型列表
- 与现有 supply_packages / candidate 去重
- 创建 candidate
- 生成模型下架告警待办
C. admission
- 消费 discovered / retry_pending candidate
- 执行标准测试集
- 更新 candidate 状态
- 生成或更新 supply_package draft
D. publish
- 运营确认 package draft
- 将 package 切到 active
- 将 candidate 切到 published
- 写入 gateway 可消费的发布事件/变更记录
E. integration
- 立交桥内部直接集成接口
- gateway / supply-api 内部契约
- NewAPI / Sub2API 适配边界
F. control
- 模块开关、停机、运行中任务收敛、配置热更新、幂等、审计
2.3 文件/目录落点
以下为建议落点,优先放入立交桥主项目既有模块内;若 supply-intelligence 仓先行设计,可按同名目录组织:
- /home/long/project/立交桥/projects/supply-intelligence/tech/BASELINE_TECHLEAD_V2.md
- 建议实现落点参考:
- supply-api/internal/supplyintelligence/module.go
- supply-api/internal/supplyintelligence/probe/service.go
- supply-api/internal/supplyintelligence/probe/evaluator.go
- supply-api/internal/supplyintelligence/probe/state_machine.go
- supply-api/internal/supplyintelligence/discovery/service.go
- supply-api/internal/supplyintelligence/discovery/adapter_registry.go
- supply-api/internal/supplyintelligence/admission/service.go
- supply-api/internal/supplyintelligence/admission/runner.go
- supply-api/internal/supplyintelligence/publish/service.go
- supply-api/internal/supplyintelligence/integration/http_internal.go
- supply-api/internal/supplyintelligence/integration/newapi_adapter.go
- supply-api/internal/supplyintelligence/integration/sub2api_adapter.go
- supply-api/internal/supplyintelligence/control/shutdown.go
- supply-api/internal/supplyintelligence/repository/*.go
- supply-api/sql/*supply_intelligence*.sql
2.4 关键调用链路
链路 1探针 -> 状态写回 -> gateway 消费闭环
1) scheduler 触发 ProbeTick(platform/account batch)
2) probe.Service.LoadProbeTargets()
3) probe.Service.RunProbe(accountID)
4) probe.Evaluator.Classify(response/error) => success / explicit_failure / inconclusive
5) probe.StateMachine.Apply(account.current_status, recent_probe_window)
6) repository.UpdateAccountHealthAndStatusTx(...)
7) repository.AppendAuditLog(...)
8) repository.UpsertGatewayAccountSnapshot(...)
9) gateway 通过内部契约读取 snapshot 或随 package/account 查询一起读取可用状态
链路 2扫描 -> candidate -> admission
1) scheduler 触发 DiscoveryTick(platform)
2) discovery.Adapter.FetchModels()
3) discovery.Service.DiffAgainstPackagesAndCandidates()
4) repository.UpsertModelCandidate(status=discovered)
5) scheduler enqueue AdmissionRun(candidateID)
6) admission.Runner.Execute(candidateID)
7) repository.UpdateCandidateStatus(...)
8) repository.UpsertDraftPackage(...)
9) repository.AppendAuditLog(...)
链路 3运营确认上架 -> gateway 消费闭环
1) ops POST confirm publish
2) publish.Service.PublishDraft(candidateID, actor)
3) tx: lock candidate + package draft
4) package draft -> active
5) candidate test_passed -> published
6) append internal event supply_package_published
7) append audit
8) gateway/supply-api 既有主链路消费 active package 或发布事件刷新内存路由
链路 4模块关闭闭环
1) operator/config 将 module.enabled=false
2) control.ModuleGate.MarkClosing(module)
3) 新任务拒绝入队/拒绝手动触发
4) 运行中任务继续到安全提交点或超时中断
5) 写 module_state=closed when inflight=0
6) 后续 scheduler tick 直接跳过
2.5 关键依赖与降级边界
- PostgreSQL：强依赖。不可用时所有自动写操作 fail-closed，不做假成功。
- scheduler：中强依赖。不可用时自动任务暂停，但手动接口可保留。记录告警。
- supplier adapter：弱依赖。单供应商异常不影响其他供应商。
- gateway：首期默认事件型消费方。发布链路不等待 gateway 成功回调才提交 package active，但必须通过 package change + ack 保留可追踪消费记录，且必须存在真实消费入口。
- NewAPI/Sub2API：可选适配依赖。未配置时不影响立交桥内部主链路。
降级原则
- 探针外部错误、429、5xx、DNS/TCP 异常：判定为 inconclusive，不推进惩罚性状态迁移。
- admission 外部超时：candidate 转 retry_pending 或 test_failed，不能生成 active package。
- gateway 消费延迟：package 可 active，但需要“未消费/待同步”状态位和审计，不可假定已生效。
- 模块关闭中:新任务一律拒绝,运行中任务只允许安全收尾。
3. 接口与数据模型API/RPC/事件、数据模型/schema、错误码、安全/鉴权契约
3.1 接口分类
3.1.1 立交桥内部直接集成接口
用途:供立交桥主项目内其他模块直接调用,优先 Go 接口,不先暴露额外网络跳。
// SupplyIntelligenceModule is the in-process contract that other modules of the
// host project call directly, without an extra network hop.
// NOTE(review): this is written as pseudo-code; compilable Go would declare it as
// `type SupplyIntelligenceModule interface { ... }` — confirm before copying.
interface SupplyIntelligenceModule {
// RunProbe probes a single account.
// NOTE(review): trigger presumably records what initiated the run
// (scheduler tick vs. manual request); it is not defined in this section — confirm.
RunProbe(ctx context.Context, accountID int64, trigger string) (*ProbeOutcome, error)
// ScanPlatform runs model discovery for one platform.
ScanPlatform(ctx context.Context, platform string, trigger string) (*ScanOutcome, error)
// RunAdmission runs the admission test for one candidate.
RunAdmission(ctx context.Context, candidateID int64, trigger string) (*AdmissionOutcome, error)
// PublishCandidate publishes a candidate on behalf of actor.
PublishCandidate(ctx context.Context, candidateID int64, actor string) (*PublishOutcome, error)
// GetAccountRoutingState returns the routing state of one account.
// NOTE(review): presumably the in-process counterpart of the internal
// routing-state HTTP endpoint — confirm.
GetAccountRoutingState(ctx context.Context, accountID int64) (*AccountRoutingState, error)
}
3.1.2 给 gateway / supply-api 使用的内部契约
用途:形成真实消费闭环,避免“文档说 gateway 会用,但无真实契约”。
HTTP internal 契约,前缀建议:/internal/supply-intelligence
1) GET /internal/supply-intelligence/accounts/{account_id}/routing-state
响应:
{
"account_id": 123,
"platform": "openai",
"account_status": "active",
"routing_enabled": true,
"risk_score": 20,
"reason_code": "ok",
"last_probe_at": "2026-05-06T15:00:00Z",
"version": 17
}
2) GET /internal/supply-intelligence/models/{platform}/{model}/admission-state
响应:
{
"platform": "openai",
"model": "gpt-4.1-mini",
"candidate_status": "published",
"package_id": 456,
"package_status": "active",
"gateway_sync_status": "pending|applied|failed|not_required",
"version": 9
}
3) GET /internal/supply-intelligence/gateway/package-changes?cursor=...
响应:
{
"items": [
{
"event_id": "evt_001",
"event_type": "supply_package_published",
"package_id": 456,
"platform": "openai",
"model": "gpt-4.1-mini",
"occurred_at": "2026-05-06T15:00:00Z",
"version": 9
}
],
"next_cursor": "..."
}
4) POST /internal/supply-intelligence/gateway/package-changes/{event_id}/ack
请求:
{
"consumer": "gateway",
"result": "applied|failed",
"detail": "optional"
}
响应:204
闭环定义
- 发布成功 != gateway 已消费。
- 只有 gateway ack event_id 后,gateway_sync_status 才能从 pending -> applied/failed。
- QA 必须验证 publish -> list changes -> ack 的真实链路。
3.1.3 面向 NewAPI/Sub2API 的适配边界
原则:只暴露最小必要只读/回调能力,不把本系统设计成它们的管理平台。
适配边界 A:状态拉取
- GET /adapter/v1/supply-status/accounts/{account_id}
- 字段与 routing-state 对齐,但去掉内部实现细节。
适配边界 B:模型拉取
- GET /adapter/v1/models?status=published
响应只返回已 published 且 package active 的模型。
适配边界 C:可选发布回调下发
- POST /adapter/v1/package-events
仅在对方需要 webhook 模式时启用;默认不要求。
适配边界约束
- 不暴露审计明细。
- 不暴露原始探针日志。
- 不暴露账号凭证、测试账号信息、内部风险算法细节。
- 仅允许配置白名单来源访问。
3.2 数据模型/schema
3.2.1 probe_execution_logs
- id bigint pk
- account_id bigint not null
- platform varchar(64) not null
- probe_result varchar(32) not null 取值: success | explicit_failure | inconclusive
- failure_class varchar(64) null 取值: auth_invalid | quota_empty | timeout | tcp_error | dns_error | rate_limited | upstream_5xx | parse_error
- http_status int null
- latency_ms int null
- risk_score int not null
- evaluated_transition varchar(64) not null 取值: no_change | active_to_suspended | suspended_to_disabled | suspended_to_active
- executed_at timestamptz not null
- request_id varchar(64) not null
- index(account_id, executed_at desc)
3.2.2 model_candidates
- id bigint pk
- platform varchar(64) not null
- model varchar(128) not null
- status varchar(32) not null
- discovery_source varchar(32) not null 取值: official_api | official_doc | manual_seed
- last_scan_at timestamptz not null
- discovered_at timestamptz not null
- last_test_at timestamptz null
- failure_reason_code varchar(64) null
- failure_summary text null
- ignored_until timestamptz null
- package_id bigint null
- version int not null default 1
- unique(platform, model)
candidate 最终闭环状态机
- discovered:扫描新发现,可入测试
- testing:测试执行中
- test_passed:测试通过,已存在 draft package
- test_failed:测试失败,允许人工重试或自动进入 retry_pending
- retry_pending:等待下次重试
- ignored:运营临时忽略,到 ignored_until 后自动回 discovered
- published:运营已确认上架,package active
- deprecated:供应商侧已消失,已产生运营待办但历史保留
- closed:不再处理的终态,仅用于模型被明确弃用/手工关闭
合法迁移
- discovered -> testing
- testing -> test_passed | test_failed | retry_pending
- test_failed -> retry_pending | closed
- retry_pending -> testing | closed
- discovered | test_failed | retry_pending -> ignored
- ignored -> discovered
- test_passed -> published | closed
- published -> deprecated | closed
- deprecated -> closed
闭环修复点
- 任何非终态都存在后继处理路径。
- ignored 有自动回流。
- published/deprecated 最终可归档到 closed。
- 不再存在“只定义中间态、无出口”的 QA 阻塞。
3.2.3 gateway_package_events
- event_id varchar(64) pk
- event_type varchar(64) not null
- package_id bigint not null
- candidate_id bigint null
- payload jsonb not null
- consumer varchar(64) null
- consumer_status varchar(32) not null default 'pending'
- consumer_detail text null
- occurred_at timestamptz not null
- acked_at timestamptz null
- retry_count int not null default 0
3.2.4 module_runtime_state
- module_name varchar(64) pk
- desired_state varchar(16) not null 取值: enabled | disabled
- runtime_state varchar(16) not null 取值: starting | running | closing | closed
- inflight_count int not null
- updated_at timestamptz not null
3.3 探针判定统一规则
这是本轮必须修的 QA 阻塞之一,统一如下:
明确失败 explicit_failure
- HTTP 401/403
- 供应商明确返回 key invalid / account suspended / quota exhausted 且可稳定识别
不可判定 inconclusive
- HTTP 429
- HTTP 5xx
- DNS 失败
- TCP 连接失败
- 超时
- 响应体为空或格式突变
成功 success
- 返回 2xx 且最小校验通过
状态迁移规则
- active + 1 次 explicit_failure -> suspended
- suspended + 最近连续 3 次 explicit_failure -> disabled
- suspended + 1 次 success -> active
- disabled 不自动恢复,只能人工恢复到 active 或 closed
- inconclusive 永不计入 explicit failure 连续次数
说明
- 将 timeout/TCP/DNS 从“失败导致降级”统一修正为 inconclusive,消除 PRD/HLD 冲突。
- 若未来某供应商能明确证明 timeout 即余额停用,也必须走供应商级覆盖配置,不改全局默认。
3.4 错误码
- SUP_INT_PROBE_NOT_FOUND 404
- SUP_INT_PROBE_MODULE_DISABLED 409
- SUP_INT_CANDIDATE_NOT_FOUND 404
- SUP_INT_CANDIDATE_STATE_INVALID 409
- SUP_INT_PUBLISH_PACKAGE_MISSING 409
- SUP_INT_GATEWAY_ACK_CONFLICT 409
- SUP_INT_ADAPTER_UNSUPPORTED 400
- SUP_INT_AUTH_FORBIDDEN 403
- SUP_INT_CONFIG_INVALID 400
- SUP_INT_UPSTREAM_TEMPORARY 503
3.5 安全/鉴权契约
- 内部接口只允许立交桥内部服务身份访问,走现有 internal auth middleware。
- NewAPI/Sub2API 适配接口必须使用独立 access key 或签名校验,按来源白名单限制。
- 审计字段必须包含 object_type/object_id/action/result_code/before_state/after_state/request_id/actor。
- 任何日志不得输出明文 API key、cookie、token、测试账号凭证。
- 手动发布、手动恢复 disabled 账号、关闭 candidate 必须要求 operator 身份并审计。
4. 任务拆解:每个任务必须有具体文件路径和函数名,粒度 2-5 分钟
说明:以下为 Engineer 最小实现任务单,按设计拆到文件级与函数级。路径以优先集成到 supply-api 为准。
4.1 模块骨架
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/module.go :: func RegisterModule(...) error
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/module.go :: func MountInternalRoutes(...) error
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/module.go :: func RegisterSchedulers(...) error
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/control/shutdown.go :: func BeginModuleClose(...) error
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/control/shutdown.go :: func FinishInflightTask(...) error
4.2 probe
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/probe/service.go :: func LoadProbeTargets(ctx context.Context, limit int) ([]Account, error)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/probe/service.go :: func RunProbe(ctx context.Context, accountID int64, trigger string) (*ProbeOutcome, error)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/probe/evaluator.go :: func ClassifyProbeResult(resp *http.Response, err error) ProbeClass
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/probe/evaluator.go :: func CalculateRiskScore(class ProbeClass) int
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/probe/state_machine.go :: func ApplyAccountTransition(current string, recent []ProbeClass) (next string, transition string)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/probe/state_machine.go :: func CountRecentExplicitFailures(recent []ProbeClass) int
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/probe/worker.go :: func HandleProbeTick(ctx context.Context) error
4.3 discovery
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/discovery/adapter_registry.go :: func ResolveModelAdapter(platform string) (ModelAdapter, error)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/discovery/service.go :: func ScanPlatform(ctx context.Context, platform string, trigger string) (*ScanOutcome, error)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/discovery/service.go :: func DiffModels(current []string, packages []string, candidates []string) DiffResult
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/discovery/service.go :: func UpsertDiscoveredCandidates(ctx context.Context, platform string, models []string) error
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/discovery/service.go :: func MarkDeprecatedAlerts(ctx context.Context, platform string, missing []string) error
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/discovery/worker.go :: func HandleDiscoveryTick(ctx context.Context) error
4.4 admission
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/admission/service.go :: func EnqueueAdmission(ctx context.Context, candidateID int64) error
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/admission/service.go :: func RunAdmission(ctx context.Context, candidateID int64, trigger string) (*AdmissionOutcome, error)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/admission/runner.go :: func LoadCandidateForTesting(ctx context.Context, candidateID int64) (*Candidate, error)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/admission/runner.go :: func ExecuteTestSuite(ctx context.Context, c *Candidate) (*SuiteResult, error)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/admission/runner.go :: func DecideCandidateNextState(result *SuiteResult) (string, string)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/admission/runner.go :: func UpsertDraftPackage(ctx context.Context, c *Candidate, result *SuiteResult) (int64, error)
4.5 publish
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/publish/service.go :: func PublishCandidate(ctx context.Context, candidateID int64, actor string) (*PublishOutcome, error)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/publish/service.go :: func ValidatePublishable(ctx context.Context, candidateID int64) error
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/publish/service.go :: func AppendGatewayPackageEvent(ctx context.Context, packageID int64, candidateID int64) error
4.6 integration
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/integration/http_internal.go :: func GetAccountRoutingState(w http.ResponseWriter, r *http.Request)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/integration/http_internal.go :: func GetModelAdmissionState(w http.ResponseWriter, r *http.Request)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/integration/http_internal.go :: func ListGatewayPackageChanges(w http.ResponseWriter, r *http.Request)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/integration/http_internal.go :: func AckGatewayPackageChange(w http.ResponseWriter, r *http.Request)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/integration/newapi_adapter.go :: func ListPublishedModels(w http.ResponseWriter, r *http.Request)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/integration/newapi_adapter.go :: func GetExternalAccountStatus(w http.ResponseWriter, r *http.Request)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/integration/sub2api_adapter.go :: func ListPublishedModels(w http.ResponseWriter, r *http.Request)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/integration/sub2api_adapter.go :: func GetExternalAccountStatus(w http.ResponseWriter, r *http.Request)
4.7 repository / sql
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/repository/probe_repo.go :: func InsertProbeExecutionLog(...) error
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/repository/probe_repo.go :: func UpdateAccountHealthAndStatusTx(...) error
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/repository/candidate_repo.go :: func UpsertModelCandidate(...) error
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/repository/candidate_repo.go :: func UpdateCandidateStateTx(...) error
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/repository/package_repo.go :: func UpsertDraftPackageTx(...) (int64, error)
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/repository/gateway_repo.go :: func InsertGatewayPackageEventTx(...) error
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/repository/gateway_repo.go :: func AckGatewayPackageEventTx(...) error
- /home/long/project/立交桥/supply-api/sql/xxxx_supply_intelligence_probe_logs.sql :: migration create table
- /home/long/project/立交桥/supply-api/sql/xxxx_supply_intelligence_candidates.sql :: migration create table
- /home/long/project/立交桥/supply-api/sql/xxxx_supply_intelligence_gateway_events.sql :: migration create table
- /home/long/project/立交桥/supply-api/sql/xxxx_supply_intelligence_module_runtime.sql :: migration create table
4.8 测试与校验
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/probe/state_machine_test.go :: func TestApplyAccountTransition()
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/probe/evaluator_test.go :: func TestClassifyProbeResult()
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/admission/runner_test.go :: func TestDecideCandidateNextState()
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/publish/service_test.go :: func TestPublishCandidate_AppendsGatewayEvent()
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/integration/http_internal_test.go :: func TestGatewayEventAckFlow()
5. 风险与保护:风险清单(概率/影响/缓解)、降级策略、威胁建模结果
5.1 风险清单
1) 探针误判导致错误下线
- 概率:中
- 影响:高
- 缓解:只允许 explicit_failure 触发惩罚状态;429/超时/网络错误全部 inconclusive;disabled 仅连续 3 次明确失败;生产初期可只告警不自动 disabled。
2) candidate 重复创建或状态乱序
- 概率:中
- 影响:中
- 缓解:unique(platform, model);version 乐观锁;状态迁移函数集中校验;测试任务拿行锁。
3) gateway 未真实消费已上架 package
- 概率:高
- 影响:高
- 缓解:新增 gateway_package_events + ack 契约;区分 published 与 gateway applied;监控 pending backlog。
4) 模块关闭时仍有脏写入
- 概率:中
- 影响:中
- 缓解:runtime_state=closing 时拒绝新任务;inflight 计数;安全提交点;超时取消 ctx。
5) 适配器变更影响扫描质量
- 概率:中
- 影响:中
- 缓解:按供应商隔离;单平台失败不扩散;保留 last_successful_scan 基线;失败仅告警不删数据。
6) NewAPI/Sub2API 适配越权暴露内部数据
- 概率:低
- 影响:高
- 缓解:适配接口单独 DTO;白名单认证;不复用内部 debug 输出。
5.2 降级策略
- probe 模块关闭:gateway 继续依赖现有 account/package 状态;新鲜度下降但主链路可运行。
- discovery 模块关闭:不再发现新模型;已上架模型不受影响。
- admission 模块关闭:candidate 可积压,但不会误上架。
- publish 后 gateway 未消费:保留 pending,运营可见;不回滚 package active,但不得宣称“已进路由”。
- NewAPI/Sub2API 未配置:直接关闭适配路由,不影响内部主链路。
5.3 威胁建模结果
输入边界
- 供应商返回体属于不可信输入必须限长、schema 校验、错误脱敏。
- 运营手动接口属于高权限输入:必须鉴权、审计、幂等。
- gateway ack 请求属于内部写接口:必须鉴权并校验 event_id/consumer 一致性。
数据流
- supplier -> adapter -> evaluator -> db
- db -> internal route -> gateway
- db -> adapter route -> NewAPI/Sub2API
主要威胁与处置
- 凭证泄漏:本期不纳入自动注册主路径;现有账号密钥仅走既有安全存储,不在本模块新增明文链路。
- 重放/重复发布:publish 接口需幂等;published candidate 再次 publish 返回 409。
- 伪造 gateway ack:只接受内部服务身份;event consumer 固定枚举。
- 大响应体压垮解析:adapter 限制 body size,超限视为 inconclusive/scan_failed。
- SQL 并发覆盖:关键状态表使用 version 或 select for update。
6. QA 交接与实施约束:编码前设计审查要点、编码后漂移检查点、必查真实调用链路、禁止偏离的边界
6.1 编码前设计审查要点
- 是否明确“集成运行优先,独立运行可选且轻量”。
- 是否删除 pricing/vector/predictions/开放平台化内容。
- 探针默认规则是否统一为 explicit_failure 才触发状态惩罚。
- candidate 状态机是否存在完整入口、出口、终态与回流。
- gateway 是否存在 list change + ack 的真实闭环,而非只有查询接口。
- 模块关闭是否存在 closing -> closed 收敛语义。
- NewAPI/Sub2API 是否仅作为适配边界,而非反向牵引主架构。
6.2 编码后漂移检查点
- 是否出现新增 Redis/Temporal/Kafka/MQ/向量库等重基础设施前置依赖。
- 是否出现额外独立服务、额外 API gateway、复杂事件总线。
- 是否把自动注册重新抬回本期主路径。
- 是否把 gateway 路由刷新实现成跨系统强耦合同步 RPC 必须成功。
- 是否新增未在本基线定义的中间状态。
- 是否把 timeout/TCP/DNS 再次当成 explicit_failure。
6.3 QA 必查真实调用链路
- probe tick -> evaluator -> state machine -> supply_accounts 写回 -> audit 写入
- discovery tick -> candidate discovered -> admission run -> draft package
- publish confirm -> package active -> candidate published -> gateway change event -> gateway ack
- module disable -> closing -> reject new task -> inflight drain -> closed
- adapter route -> NewAPI/Sub2API 只读返回,字段不泄露内部敏感信息
6.4 禁止偏离的边界
- 禁止把本期做成独立平台化部署前提。
- 禁止把比价、预测、向量检索恢复为主链路。
- 禁止未定义契约就声称“gateway 会消费”。
- 禁止 candidate 状态直接跳 published绕过 test_passed + draft package。
- 禁止 disabled 自动恢复。
- 禁止模块关闭时直接 kill 运行中事务而无收敛策略。
7. Engineer 实施说明:文件级落点、最小验证项、需 PM 澄清项
7.1 文件级落点
优先实施目录:
- /home/long/project/立交桥/supply-api/internal/supplyintelligence/
- /home/long/project/立交桥/supply-api/sql/
- /home/long/project/立交桥/supply-api/internal/http/internal/
若 supply-intelligence 项目仓仅承载设计文档,则本文件作为交付基线,后续代码并入 supply-api 主仓。
7.2 最小验证项
- 单测探针分类、账号状态迁移、candidate 状态迁移、publish 幂等。
- 集成测:publish 后产生 gateway event;gateway ack 后状态更新 applied。
- 集成测:module closing 时手动触发探针返回 409 module disabled/closing。
- E2E 最小链路:
1) 一个 active 账号 401 -> suspended
2) 一个新模型 discovered -> test_passed -> draft -> published
3) gateway 拉取 package change 并 ack
7.3 需 PM 澄清项
- 本期是否允许 production 初期仅启用 active->suspended,暂不自动 disabled。
- candidate ignored 的默认恢复期是否固定 7 天,或允许按供应商配置。
- gateway 首期默认采用 pull package-changes + ack 作为事件型消费闭环;若后续证明已有内部刷新入口可复用,也必须保留等价 ack 语义与可审计消费状态。
- NewAPI/Sub2API 本期需要只读拉取,还是还需要 webhook 模式;默认只做只读拉取。
8. 阶段门控结论:可进入 QA 设计审查 / 需返回 PM / 需继续补设计
结论:可进入 QA 设计审查
理由
- 五个 QA 阻塞已在本基线中逐项补洞并收敛。
- 架构已回到立交桥一致的简洁集成模式。
- 对立交桥 / NewAPI / Sub2API 的边界已最小化并显式分类。
- 已删除明显超范围和重基础设施设计。
附带条件
- 不代表可直接开发放行。
- 进入开发前仍需确认 PM 澄清项中的 gateway 消费方式与 production 初期自动 disabled 策略。
9. 下游执行约束摘要:
- Engineer 禁止偏离:不得新增独立平台化部署前提、不得恢复 pricing/vector/predictions 主路径、不得绕过 gateway event ack 闭环、不得新增未定义 candidate 状态。
- QA 必查调用链路:probe->状态写回;discovery->candidate->admission->draft;publish->gateway event->ack;module disable->closing->drain->closed;NewAPI/Sub2API 只读适配边界。
- XL 若继续推进需补的门控:确认 gateway 实际消费方式;确认生产首期自动 disabled 策略;确认代码最终并入 supply-api 主仓而非另起独立重部署。
自检清单
- [x] 架构设计覆盖 PRD 所有 AC
- [x] 接口定义完整(请求/响应/错误)
- [x] 每个任务 < 5分钟有明确文件路径
- [x] 依赖关系无循环
- [x] 考虑了扩展点(未来可能的变化)
- [x] 风险评估完整,有关键风险的缓解方案
- [x] 符合项目现有技术栈和编码规范
- [x] 降级策略已设计(熔断/限流/兜底)
- [x] 威胁建模已完成(输入边界/鉴权/数据流)
- [x] 实施漂移检测点已定义(可与 QA checklist 对接)
- [x] 已明确标记是否可进入 QA 设计审查
- [x] 已提供 QA 编码前审查与编码后漂移检测所需交接物
- [x] 已给出 Engineer / QA / XL 的下游执行约束摘要
- [x] 已纳入立交桥简洁架构与立交桥/NewAPI/Sub2API 集成边界

View File

@@ -0,0 +1,118 @@
# Supply-Intelligence 当前实现真源索引(2026-05)
> 状态:当前有效
> 目的:为 Engineer / QA / PM 提供单一阅读入口,避免继续误读历史草案。
> 适用范围:/home/long/project/立交桥/projects/supply-intelligence/
## 1. 当前结论
当前规划包已收敛到“可进入 Engineer 实现”状态。
当前总门控结论:APPROVED。
但 APPROVED 的前提是:实现、测试、评审都必须以本文件列出的“当前真源”解释,不得回退到旧 PRD/HLD/INTERFACE/DEPLOYMENT 的正文口径。
## 2. 当前真源文件(按优先级)
### 2.1 一级真源:必须优先遵循
1. `/home/long/project/立交桥/projects/supply-intelligence/tech/BASELINE_TECHLEAD_V2.md`
- 作用:当前技术基线、状态机、模块边界、集成约束、最小生产闭环定义
- 适用问题:实现边界、状态迁移、部署形态、首期能力范围、风险与验证要求
2. `/home/long/project/立交桥/projects/supply-intelligence/tech/GATEWAY_CONSUMER_DECISION_2026-05.md`
- 作用:首期 package/account 消费闭环决议
- 适用问题:published vs applied、gateway 是否默认消费方、package change + ack、真实调用链落点
### 2.2 二级真源:必须按一级真源解释
3. `/home/long/project/立交桥/projects/supply-intelligence/tech/TEST_DESIGN.md`
- 作用:收敛后的测试门禁文档
- 使用规则:
- 只能按一级真源解释
- 当前阶段门控结论以其中已更新的 APPROVED 段落为准
- 若正文某处仍残留旧测试假设,以一级真源覆盖
4. `/home/long/project/立交桥/projects/supply-intelligence/specs/功能清单.md`
- 作用:任务粒度参考清单
- 使用规则:
- 仅用于任务拆分、实现排程、UI/后端任务定位
- 若与一级真源冲突,一律以一级真源为准
- 不得把其中任何历史平台化/重基础设施/深自动注册项当作默认首期门槛
## 3. 历史参考文件(禁止作为当前实现真源)
以下文件只能用于理解历史上下文,不能再作为 Engineer/QA 的当前实现依据:
1. `/home/long/project/立交桥/projects/supply-intelligence/prd/PRD.md`
2. `/home/long/project/立交桥/projects/supply-intelligence/tech/HLD.md`
3. `/home/long/project/立交桥/projects/supply-intelligence/tech/INTERFACE.md`
4. `/home/long/project/立交桥/projects/supply-intelligence/tech/DEPLOYMENT.md`
原因:这些文件虽已加失效声明,但正文仍保留大量旧设计,例如:
- pricing / prediction / 向量检索 / 仪表盘扩张
- 独立 API/worker/重基础设施默认前提
- gateway 管理接口热更新主路径
- 深自动注册 / 浏览器自动化主路径
- published 与 gateway applied 语义混淆
## 4. Engineer 必须先建立的统一理解
### 4.1 首期能力边界
首期目标不是独立平台化大系统,而是“立交桥延伸项目 + 简洁集成架构 + 最小生产闭环”。
这意味着:
- 优先并入 supply-api 主仓
- 优先复用主仓已有配置、调度、审计、内部路由能力
- 不把 Redis / Temporal / 向量数据库 / WebSocket / MQ 作为首期硬前置
- 不做 pricing / prediction / recommendation / SFI 仪表盘扩张
### 4.2 探针判定边界
必须按 explicit_failure / inconclusive / success 三类解释不能回退到旧的“timeout 直接惩罚性降级”口径。
### 4.3 自动补给边界
首期不是深自动注册主路径。
首期仅保留“受控自动补给最小边界”:
- 白名单供应商
- 阈值触发
- 任务化补给或受控补给受理接口
- pending_verify / pending_enable 等受控中间态
- fail-closed
不得默认实现:
- 浏览器自动化注册
- 短信验证码编排主路径
- 无审批直接自动激活
### 4.4 gateway 消费闭环边界
首期默认采用:
- package 发布链路:event + ack
- account 状态链路:查询型消费
必须明确:
- published != applied
- active package != gateway 已消费成功
- 没有真实 poll/apply/ack 入口,不得宣称 package 发布链路已完成
## 5. QA 审查必须卡的四条红线
1. 若实现重新引入 published/applied 混淆,直接打回
2. 若把深自动注册重新膨胀成首期硬门槛,直接打回
3. 若把旧独立平台化基础设施重新作为首期依赖,直接打回
4. 若 gateway 只有接口定义、没有真实消费方入口与 ack 回写,直接打回
## 6. 推荐阅读顺序
Engineer / QA / PM 开始工作前,按以下顺序阅读:
1. `tech/CURRENT_SOURCE_OF_TRUTH_2026-05.md`
2. `tech/BASELINE_TECHLEAD_V2.md`
3. `tech/GATEWAY_CONSUMER_DECISION_2026-05.md`
4. `tech/TEST_DESIGN.md`
5. `specs/功能清单.md`
禁止跳过前 3 份文件直接依据旧 PRD/HLD/INTERFACE/DEPLOYMENT 开始实现。
## 7. 一句话执行规则
如果某个设计点在文档间出现冲突:
- 先看 `BASELINE_TECHLEAD_V2.md`
- 再看 `GATEWAY_CONSUMER_DECISION_2026-05.md`
- 然后用 `TEST_DESIGN.md` 与 `功能清单.md` 做验证与任务拆解
- 不回退到旧草案正文做判断

168
tech/DEPLOYMENT.md Normal file
View File

@@ -0,0 +1,168 @@
# Supply-Intelligence 部署设计
> 状态说明(2026-05 收敛修订):本文件保留为旧版部署草案,已不再作为当前默认部署真源。
> 当前默认部署真源应以“立交桥延伸项目 + 简洁集成架构”为准:优先并入 supply-api 主仓,独立运行仅为轻量可选形态。
> 以下旧部署假设已废止,不得再作为首期落地前提:
> - 独立 API Server + 多 Worker 集群默认部署
> - Redis / 向量数据库 / WebSocket / 独立共享层作为首期前置依赖
> - 以独立多组件容器拓扑替代主仓集成部署
> 版本:v1.0 | 状态:初稿
---
## 1. 部署架构
### 1.1 总体架构
```
├── Load Balancer (Nginx / 云 CLB)
├── Supply-Intelligence API Server x 2
│ │
│ ├── HTTP API
│ └── WebSocket (健康大盘实时推送)
├── Supply-Intelligence Worker x 3
│ │
│ ├── Probe Worker (探针任务)
│ ├── Discovery Worker (扫描任务)
│ ├── Admission Worker (准入测试任务)
│ ├── Auto-Reg Worker (自动注册任务)
│ └── Cleanup Worker (定期清理)
└── 共享层
├── PostgreSQL 15+ (与 supply-api 共存或独立)
├── Redis (缓存 + 锁 + 扫描结果缓存)
└── 向量数据库 (PGVector / Milvus / Qdrant)
```
### 1.2 容器化部署
```yaml
services:
supply-intel-api:
image: supply-intelligence:latest
command: ["./supply-intel", "api"]
replicas: 2
ports:
- "8081:8080"
supply-intel-probe:
image: supply-intelligence:latest
command: ["./supply-intel", "worker", "probe"]
replicas: 1
supply-intel-discovery:
image: supply-intelligence:latest
command: ["./supply-intel", "worker", "discovery"]
replicas: 1
supply-intel-admission:
image: supply-intelligence:latest
command: ["./supply-intel", "worker", "admission"]
replicas: 1
supply-intel-autoreg:
image: supply-intelligence:latest
command: ["./supply-intel", "worker", "autoreg"]
replicas: 1
```
---
## 2. 资源需求
### 2.1 API Server
| 资源 | 需求 | 说明 |
|------|------|------|
| CPU | 1 核 | |
| 内存 | 512 MB | |
| 存储 | 无 | |
### 2.2 Worker
| Worker 类型 | CPU | 内存 | 说明 |
|------------|-----|--------|------|
| Probe | 1 核 | 512 MB | 同时发起多个 HTTP 请求 |
| Discovery | 1 核 | 1 GB | 可能涉及 Playwright 爬取 |
| Admission | 2 核 | 2 GB | 测试流水线调用 LLM API,CPU 与内存需求较高 |
| Auto-Reg | 1 核 | 512 MB | |
### 2.3 数据库
| 资源 | 需求 | 说明 |
|------|------|------|
| CPU | 2 核 | |
| 内存 | 4 GB | |
| 存储 | 100 GB | 探针历史 + 审计日志 + 定价数据库 |
### 2.4 向量数据库
| 选型 | CPU | 内存 | 存储 | 说明 |
|------|-----|--------|------|------|
| PGVector | 与 PostgreSQL 共存 | 共存 | 共存 | 推荐,无需额外部署 |
| Milvus | 2 核 | 4 GB | 50 GB | 高性能、分布式 |
| Qdrant | 1 核 | 2 GB | 30 GB | 轻量、Cloud-native |
---
## 3. 监控与运维钩子
### 3.1 健康检查
| 端点 | 路径 | 预期响应 | 失败行为 |
|------|------|----------|---------|
| 存活检查 | `/actuator/health/live` | HTTP 200 | 容器重启 |
| 就绪检查 | `/actuator/health/ready` | HTTP 200 | 从负载均衡移除 |
| 综合检查 | `/actuator/health` | HTTP 200 + JSON | 触发告警 |
### 3.2 启动/关闭顺序
**启动顺序**:
1. PostgreSQL 启动完成
2. Redis 启动完成
3. 向量数据库启动完成
4. Worker 启动(执行 migration
5. API Server 启动
**关闭顺序**:
1. 停止接收新 HTTP 请求
2. 等待现有请求处理完成(超时 30 秒)
3. 停止各 Worker 定时器
4. 关闭数据库连接池
5. 退出进程
### 3.3 配置管理
- 配置文件 `config.yaml` + 环境变量覆盖。
- 供应商 API Key 仅通过环境变量传入。
- 探针周期、扫描周期、测试用例集路径等可热更新。
---
## 4. 灾备设计
### 4.1 数据库灾备
| 策略 | 方案 | RTO | RPO |
|------|------|-----|-----|
| 主库故障 | 自动切换至备库 | < 5 min | < 1 min |
| 逻辑损坏 | 从备库恢复 + 审计日志回放 | < 30 min | < 1 min |
### 4.2 扫描/测试任务灾备
| 场景 | 处理 |
|------|------|
| Discovery Worker 故障 | 下一周期自动恢复,扫描任务无状态,不影响生产 |
| Admission Worker 故障 | 测试任务缓存在 Redis,恢复后继续执行 |
| Probe Worker 故障 | 探针任务缓存在 Redis,恢复后继续执行 |
| 向量数据库故障 | 知识库检索降级为文本匹配,不影响核心探针功能 |
### 4.3 多中心部署
- 当前阶段为单中心部署。
- 探针任务无状态,不依赖中心化调度。
- 未来扩展至多中心时,需要解决 PostgreSQL 分布式写入和向量数据库的同步问题。

View File

@@ -0,0 +1,169 @@
# Supply-Intelligence 首期消费闭环决议(2026-05)
> 状态:当前有效决议
> 作用:消除“只有接口定义,没有首期真实消费方与调用落点”的设计歧义。
> 适用范围:/home/long/project/立交桥/projects/supply-intelligence/ 下当前收敛规划包。
> 真源索引:本决议受 `/home/long/project/立交桥/projects/supply-intelligence/tech/CURRENT_SOURCE_OF_TRUTH_2026-05.md` 纳管;若与历史草案冲突,以真源索引定义的优先级解释。
## 1. 结论
首期默认消费闭环采用:
- package 发布链路:gateway 作为首期默认消费方,使用 pull `package-changes` + `ack` 机制完成闭环
- account 状态链路:立交桥 / supply-api 内部主链路直接读取 `routing-state` 或等价 snapshot,不通过 gateway event ack 闭环
这意味着必须明确区分两类链路:
1. 账号可路由状态链路:查询型消费
2. package 发布生效链路:事件型消费
不得混用以下错误口径:
- `published = 已进入 gateway 路由`
- `active package = 下游已消费成功`
正确口径:
- `published` 仅表示 supply-intelligence 侧已完成运营确认与 package 激活
- 只有 gateway 对 package event 完成 `ack(result=applied)` 后,才能宣称“已被 gateway 消费生效”
## 2. 首期默认路径
### 2.1 账号状态链路
生产主链路:
1. probe 执行
2. evaluator 分类为 success / explicit_failure / inconclusive
3. state machine 生成状态迁移
4. 写回 supply account 健康状态与审计
5. 立交桥内部路由决策读取 `GET /internal/supply-intelligence/accounts/{account_id}/routing-state`
说明:
- 这是查询型读取,不需要 event ack。
- 若调用方读取失败,不回滚 supply-intelligence 已落库状态,只记录消费侧问题。
### 2.2 package 发布闭环
生产主链路:
1. 运营确认发布 candidate
2. package draft -> active
3. candidate `test_passed -> published`
4. 写入 `gateway_package_events`
5. gateway 拉取 `GET /internal/supply-intelligence/gateway/package-changes?cursor=...`
6. gateway 应用变更到自身路由/缓存
7. gateway 调用 `POST /internal/supply-intelligence/gateway/package-changes/{event_id}/ack`
8. `gateway_sync_status` 变为 `applied` 或 `failed`
说明:
- 这是事件型闭环。
- `pending` 表示 supply-intelligence 已发布,但 gateway 尚未确认消费。
- `failed` 表示 gateway 已消费尝试但未成功,需要运营或工程介入。
## 3. 为什么不用首期强耦合同步 RPC
首期明确不采用:
- “发布时同步调用 gateway 管理接口,成功后才算发布成功”
原因:
1. 这会把 supply-intelligence 与 gateway 强耦合在单次事务中
2. 会把下游暂时不可用放大成上游发布不可用
3. 不符合当前“立交桥延伸项目、简洁架构、最小生产闭环”的收敛目标
因此首期选择:
- 上游发布成功与下游消费成功解耦
- 用 event + ack 明确消费状态
## 4. 首期真实代码落点(实现约束)
以下是首期必须存在的真实调用落点;只有接口定义不算完成。
### 4.1 supply-intelligence / supply-api 侧
- `/home/long/project/立交桥/supply-api/internal/supplyintelligence/publish/service.go`
- `PublishCandidate(...)`
- `AppendGatewayPackageEvent(...)`
- `/home/long/project/立交桥/supply-api/internal/supplyintelligence/integration/http_internal.go`
- `GetAccountRoutingState(...)`
- `ListGatewayPackageChanges(...)`
- `AckGatewayPackageChange(...)`
- `/home/long/project/立交桥/supply-api/internal/supplyintelligence/repository/gateway_repo.go`
- `InsertGatewayPackageEventTx(...)`
- `AckGatewayPackageEventTx(...)`
### 4.2 gateway 侧(首期必须由消费方实现的真实入口)
- 必须存在一个实际消费入口,完成:
1. 周期拉取 package changes
2. 应用变更
3. 回写 ack
- 若 gateway 已有内部刷新链路,可复用,但必须补齐 ack 回写
- 若 gateway 无现成入口,则新增最小 poller;禁止为了这件事引入 MQ/Kafka/新总线
## 5. QA 必查真实调用链路
QA 编码后必须至少验证以下四层:
### 链路 A:账号状态查询型消费
- 定义:`GetAccountRoutingState`
- 装配:internal route mounted
- 调用:立交桥 / supply-api 实际路由决策点调用该接口或等价函数
- 入口:真实请求/真实调用路径可达
### 链路 B:package 事件发布
- 定义:`AppendGatewayPackageEvent`
- 装配:publish 流程内注入 repository
- 调用:`PublishCandidate` 成功路径真实调用写事件
- 入口:运营确认发布入口可真实触达该调用链
### 链路 C:gateway 拉取消费
- 定义:`ListGatewayPackageChanges`
- 装配:internal route mounted
- 调用:gateway 真实 poller / 既有刷新链调用
- 入口:消费方真实任务/刷新入口存在,不是只留 TODO
### 链路 D:gateway ack 回写
- 定义:`AckGatewayPackageChange`
- 装配:ack route mounted
- 调用:gateway 应用成功/失败后真实回写
- 入口:event 状态确实从 `pending -> applied|failed`
## 6. published / applied 语义约束
状态含义必须统一:
- candidate `published`:上游已完成运营确认
- package `active`:上游已允许被消费
- gateway sync `pending`:下游尚未确认
- gateway sync `applied`:下游已确认消费并应用
- gateway sync `failed`:下游消费尝试失败
禁止:
- UI 文案把 `published` 写成“已进路由”
- 测试把 `package active` 当成“下游已完成同步”
- QA 把 event 表存在当成“消费闭环成立”
## 7. 与 NewAPI / Sub2API 的边界
首期不要求 NewAPI / Sub2API 实现 event ack 闭环。
它们的首期边界为:
- 只读拉取账号状态
- 只读拉取已允许暴露的模型/结果
即:
- gateway 是首期必须闭环的事件型消费方
- NewAPI / Sub2API 是首期只读适配消费方
## 8. 门控要求
在下一轮 QA 设计审查或编码后审查中,若以下任一项缺失,则不得给 APPROVED
1. 没有明确的首期默认消费方
2. 没有明确区分查询型链路与事件型链路
3. 没有明确 `published != applied`
4. 没有真实代码落点要求
5. 没有 ack 回写要求
## 9. 对旧文档的覆盖关系
本决议用于覆盖旧文档中以下错误或过时口径:
- “调用 gateway 管理接口热更新即完成闭环”
- “上架成功即下游已生效”
- “gateway 会消费”但没有实际消费者与 ack 机制
如与以下文件冲突,以本决议为准:
- /home/long/project/立交桥/projects/supply-intelligence/specs/功能清单.md
- /home/long/project/立交桥/projects/supply-intelligence/tech/INTERFACE.md
- /home/long/project/立交桥/projects/supply-intelligence/tech/HLD.md
- /home/long/project/立交桥/projects/supply-intelligence/tech/BASELINE_TECHLEAD_V2.md(若后续未同步更新相应段落,应以本决议补充解释)

1013
tech/HLD.md Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,251 @@
# Supply-Intelligence 实现任务板 V1(2026-05)
> 状态:当前有效
> 目的:将当前真源收敛为可直接派工的 Engineer / QA 执行板。
> 使用前提:必须先阅读 `/home/long/project/立交桥/projects/supply-intelligence/tech/CURRENT_SOURCE_OF_TRUTH_2026-05.md`。
> 当前总门控APPROVED允许进入实现
## 0. 使用规则
1. 本任务板不是新的真源,只是执行板。
2. 若任务板与以下文件冲突,以以下文件为准:
- `/home/long/project/立交桥/projects/supply-intelligence/tech/BASELINE_TECHLEAD_V2.md`
- `/home/long/project/立交桥/projects/supply-intelligence/tech/GATEWAY_CONSUMER_DECISION_2026-05.md`
- `/home/long/project/立交桥/projects/supply-intelligence/tech/TEST_DESIGN.md`
3. 禁止 Engineer 回退到旧 PRD/HLD/INTERFACE/DEPLOYMENT 取实现口径。
4. 每个阶段完成后,必须由 QA 按“定义 → 装配 → 调用 → 入口”四层链路做复核。
## 1. 当前最短闭环路径
目标:先做出首个最小生产闭环,而不是并行铺开所有模块。
推荐顺序:
1. Phase A:探针与账号状态闭环
2. Phase B:发现与 candidate 闭环
3. Phase C:准入测试与 draft 生成闭环
4. Phase D:发布与 gateway package event + ack 闭环
5. Phase E:受控自动补给最小边界
6. Phase F:工作台、配置、权限与完善性补齐
## 2. 阶段任务板
### Phase A:探针与账号状态闭环
目标:先让 supply-intelligence 能真实地产生可消费的账号状态。
#### A-1 数据与领域骨架
- Owner:Engineer
- 交付物:账号状态、探针日志、审计写入相关 domain/model/repository 基础结构
- 完成标准:
- 存在 `supply_intelligence_` 前缀表迁移
- 探针结果、状态迁移、审计写入模型可落库
- QA 验证:检查 schema、repo、service 调用链是否闭合
#### A-2 探针执行与统一判定
- Owner:Engineer
- 交付物:probe runner + evaluator
- 完成标准:
- 200 => success
- 401/403 => explicit_failure
- 429/5xx/timeout/格式突变 => inconclusive
- QA 验证:检查 evaluator 定义、装配、调用与调度入口
#### A-3 状态机与账号状态快照接口
- Owner:Engineer
- 交付物:状态迁移逻辑 + `routing-state` 查询接口
- 完成标准:
- active -> suspended
- suspended -> disabled
- inconclusive 不触发惩罚性迁移
- 存在真实内部查询入口
- QA 验证:必须验证 `GET /internal/supply-intelligence/accounts/{account_id}/routing-state` 或等价入口
#### A-4 Phase A QA Gate
- Owner:QA
- 放行条件:
- 账号状态链路完成“定义 → 装配 → 调用 → 入口”四层验证
- 审计写入与状态写回可追踪
- 未引入 Redis / Temporal / WebSocket 作为首期硬依赖
### Phase B:发现与 candidate 闭环
目标:能够从已接入供应商拉模型,并产生 candidate。
#### B-1 供应商适配器与模型拉取
- Owner:Engineer
- 交付物:SupplierAdapter、registry、GetModels 拉取链路
- 完成标准:
- 至少支持首批目标供应商
- 具备健康探测与模型列表读取
- QA 验证:检查 registry 注册、装配、实际调用点
#### B-2 candidate 生成与去重
- Owner:Engineer
- 交付物:discovery service + candidate repository
- 完成标准:
- 能与 `supply_packages` 去重
- 新模型生成 discovered candidate
- 下架只生成告警,不自动改 package
- QA 验证:检查 candidate 创建与下架告警调用链
#### B-3 Phase B QA Gate
- Owner:QA
- 放行条件:
- 至少一条真实发现链路打通
- candidate 状态初始落点正确
- 未扩张到 pricing / prediction / 向量检索
### Phase C:准入测试与 draft 生成闭环
目标:让 discovered candidate 可变成 test_passed/test_failed,并生成 draft。
#### C-1 admission runner
- Owner:Engineer
- 交付物:标准测试执行器与结果记录
- 完成标准:
- discovered / retry_pending 可消费
- 失败与超时原因可追踪
- QA 验证:检查 admission 执行入口和结果写回
#### C-2 draft package 生成
- Owner:Engineer
- 交付物:test_passed -> draft package 生成逻辑
- 完成标准:
- 草稿字段完整
- candidate 状态流转闭环
- QA 验证:检查 candidate -> draft 的真实调用链
#### C-3 Phase C QA Gate
- Owner:QA
- 放行条件:
- 至少一条 candidate 完成 test_passed -> draft
- 至少一条 candidate 完成 test_failed -> failure_reason
### Phase D:发布与 gateway package event + ack 闭环
目标:打通首个 package 发布最小生产闭环。
#### D-1 发布服务
- Owner:Engineer
- 交付物:运营确认发布逻辑
- 完成标准:
- draft -> active
- candidate test_passed -> published
- QA 验证:published 语义不得等于 applied
#### D-2 gateway package events
- Owner:Engineer
- 交付物:`gateway_package_events` 写入、拉取、ack 回写接口
- 完成标准:
- 存在 package-changes 列表接口
- 存在 ack 接口
- ack 后状态可区分 pending/applied/failed
- QA 验证:检查 definition / assembly / call / entry 四层
#### D-3 gateway 消费方最小入口
- Owner:Engineer / 对接方
- 交付物:真实 poll/apply/ack 入口
- 完成标准:
- 不是只定义接口
- 至少有一个真实消费任务/入口
- QA 验证:没有真实入口则本阶段不通过
#### D-4 Phase D QA Gate
- Owner:QA
- 放行条件:
- published != applied 证据充分
- package event + ack 闭环真实存在
- 无“同步调用 gateway 管理接口才算发布成功”的回退实现
### Phase E:受控自动补给最小边界
目标:补齐首期最小自动补给能力,但不膨胀为深自动注册。
#### E-1 自动补给配置与白名单约束
- Owner:Engineer
- 交付物:auto-supply 配置、阈值、白名单、审批边界
- 完成标准:
- 非白名单供应商不自动补给
- 配置按主仓既有方式存储
- QA 验证:检查 guardrail 是否真实生效
#### E-2 自动补给任务流
- Owner:Engineer
- 交付物:补给任务创建 / 受理 / 待验证回写
- 完成标准:
- 低于阈值触发任务
- 成功后进入 pending_verify / pending_enable
- 不允许直接 active
- QA 验证:检查自动启用是否被阻断
#### E-3 fail-closed
- Owner:Engineer
- 交付物:通知网关/补给受理/KMS 异常阻断逻辑
- 完成标准:
- 失败不伪成功
- 明文不落日志/DB
- QA 验证:检查失败证据和审计闭环
#### E-4 Phase E QA Gate
- Owner:QA
- 放行条件:
- 未引入浏览器自动化注册主路径
- 未引入验证码编排主路径
- 未允许无审批直接自动激活
### Phase F:工作台、配置、权限与完善性补齐
目标:补足可操作性与交付完整性,但不得改变前述主链路口径。
#### F-1 工作台最小读写能力
- Owner:Engineer
- 交付物:账号页、模型页、待处理页、确认上架、忽略、手动探针
- QA 验证:检查关键操作真实连到主链路,不是空按钮
#### F-2 配置与审计
- Owner:Engineer
- 交付物:配置读取/修改、审计日志
- QA 验证:检查配置生效路径与审计记录
#### F-3 权限与内部/外部路由边界
- Owner:Engineer
- 交付物:认证、角色权限、内部接口与外部接口分离
- QA 验证:检查 `/internal/supply-intelligence/` 与外部暴露面的边界
#### F-4 Phase F QA Gate
- Owner:QA
- 放行条件:
- 权限边界清楚
- OpenAPI 与真实路由一致
- 不新增超范围平台化能力
## 3. 明确禁止的提前扩张
以下事项在前述主链路未闭环前,禁止插队进入主开发路径:
- pricing / prediction / recommendation
- 向量数据库 / 向量检索
- SFI 仪表盘
- WebSocket 实时推送
- 独立 API + worker 集群重部署
- 浏览器自动化注册主路径
- 验证码编排主路径
- 以 Redis / Temporal 为首期硬前置
## 4. QA 统一复核问题单
每个阶段 QA 都必须回答:
1. 定义是否存在?
2. 装配是否存在?
3. 调用点是否真实存在?
4. 外部/内部入口是否真实挂载?
5. 是否出现实施漂移?
6. 是否回退到了历史草案口径?
## 5. 工程启动建议阅读顺序
1. `/home/long/project/立交桥/projects/supply-intelligence/tech/CURRENT_SOURCE_OF_TRUTH_2026-05.md`
2. `/home/long/project/立交桥/projects/supply-intelligence/tech/BASELINE_TECHLEAD_V2.md`
3. `/home/long/project/立交桥/projects/supply-intelligence/tech/GATEWAY_CONSUMER_DECISION_2026-05.md`
4. `/home/long/project/立交桥/projects/supply-intelligence/tech/TEST_DESIGN.md`
5. `/home/long/project/立交桥/projects/supply-intelligence/tech/IMPLEMENTATION_TASK_BOARD_V1_2026-05.md`
6. `/home/long/project/立交桥/projects/supply-intelligence/specs/功能清单.md`

275
tech/INTERFACE.md Normal file
View File

@@ -0,0 +1,275 @@
# Supply-Intelligence 核心接口设计
> 状态说明(2026-05 收敛修订):本文件保留为旧版接口草案,已不再作为当前实现真源。
> 当前接口真源以 /home/long/project/立交桥/projects/supply-intelligence/tech/BASELINE_TECHLEAD_V2.md 为准。
> 以下旧接口定义已废止,不得继续作为实现入口:
> - pricing comparison / recommendations / predictions 相关接口
> - 与新 candidate 状态机不一致的旧状态枚举
> - 未区分 published 与 gateway applied 的旧消费口径
> 版本:v1.0 | 状态:初稿
---
## 1. 内部模块间接口
### 1.1 ProbeService
```go
type ProbeService interface {
// 执行单次探针
Probe(ctx context.Context, accountID string) (*ProbeResult, error)
// 批量探针(按供应商或全量)
ProbeBatch(ctx context.Context, filter ProbeFilter) (*BatchProbeResult, error)
// 获取探针结果历史
GetProbeHistory(ctx context.Context, accountID string, limit int) ([]ProbeResult, error)
// 手动触发探针(运营干预)
TriggerManualProbe(ctx context.Context, accountID string, actorID string) (*ProbeResult, error)
}
type ProbeResult struct {
AccountID string
Status string // active | suspended | disabled
RiskScore int // 0-100
RiskReason string
LatencyMs int
ResponseCode int
CheckedAt time.Time
NextCheckAt time.Time
}
type ProbeFilter struct {
Platform *string
Status *string
RiskScoreMin *int
RiskScoreMax *int
}
```
### 1.2 DiscoveryService
```go
type DiscoveryService interface {
// 执行单次全网扫描
Scan(ctx context.Context) (*ScanResult, error)
// 获取最近扫描结果
GetLastScan(ctx context.Context) (*ScanResult, error)
// 获取候选模型列表
ListCandidates(ctx context.Context, filter CandidateFilter) ([]ModelCandidate, error)
// 手动触发扫描
TriggerManualScan(ctx context.Context, actorID string) (*ScanResult, error)
// 忽略候选模型
IgnoreCandidate(ctx context.Context, candidateID string, reason string, actorID string) error
}
type ScanResult struct {
ScannedAt time.Time
Platforms []string
NewModels int
RemovedModels int
Errors []ScanError
}
type ModelCandidate struct {
ID string
Platform string
ModelID string
Status string // discovered | queued | testing | test_passed | test_failed | ignored
DiscoveredAt time.Time
TestedAt *time.Time
TestResult *TestResult
}
```
### 1.3 AdmissionService
```go
type AdmissionService interface {
// 执行准入测试
RunTest(ctx context.Context, candidateID string) (*TestResult, error)
// 获取测试结果
GetTestResult(ctx context.Context, candidateID string) (*TestResult, error)
// 手动确认上架(运营干预)
Publish(ctx context.Context, candidateID string, actorID string) error
// 强制上架(测试失败但运营确认)
ForcePublish(ctx context.Context, candidateID string, reason string, actorID string) error
}
type TestResult struct {
CandidateID string
Status string // passed | failed
Dimensions []TestDimension
FailedReason *string
ExecutedAt time.Time
DurationMs int
}
type TestDimension struct {
Name string
Passed bool
Detail string
}
```
### 1.4 AccountService
```go
type AccountService interface {
// 创建账号(手动或自动)
CreateAccount(ctx context.Context, req CreateAccountRequest) (*SupplyAccount, error)
// 获取账号信息
GetAccount(ctx context.Context, accountID string) (*SupplyAccount, error)
// 更新账号状态
UpdateStatus(ctx context.Context, accountID string, status string, reason string) error
// 轮换密钥
RotateKey(ctx context.Context, accountID string, actorID string) error
// 列表账号
ListAccounts(ctx context.Context, filter AccountFilter) ([]SupplyAccount, error)
}
type SupplyAccount struct {
ID string
Platform string
ProxyID string
Status string
RiskScore int
APIKeyHint string // 密钥前 4 后 4
CreatedAt time.Time
UpdatedAt time.Time
}
```
### 1.5 HealthBoardService
```go
type HealthBoardService interface {
// 获取供应商健康大盘
GetBoard(ctx context.Context, scope BoardScope) (*HealthBoard, error)
// 获取模型比价报表
GetPricingComparison(ctx context.Context, modelID string) ([]PricingComparison, error)
// 获取供应链覆盖率
GetCoverage(ctx context.Context) (*CoverageReport, error)
// 获取预测分析
GetPredictions(ctx context.Context, minConfidence float64) ([]Prediction, error)
}
type HealthBoard struct {
Accounts []AccountHealth
Candidates []CandidateSummary
Coverage float64
FreshnessIndex float64
}
```
---
## 2. 外部系统集成接口
### 2.1 与 Bridge Gateway 集成
| 方法 | 路径 | 请求 | 响应 | 说明 |
|------|------|------|------|------|
| 查询账号状态 | `GET /internal/supply-intelligence/accounts/{id}/health` | - | `ProbeResult` | Gateway 路由决策时查询 |
| 查询模型定价 | `GET /internal/supply-intelligence/pricing/{model_id}` | - | `PricingInfo` | 动态定价参考 |
| 获取推荐供应商 | `GET /internal/supply-intelligence/recommendations` | `?model={model_id}&strategy=cost` | `[]Recommendation` | 智能路由推荐 |
### 2.2 与 supply-api 集成
| 方法 | 路径 | 请求 | 响应 | 说明 |
|------|------|------|------|------|
| 读取账号列表 | `GET /internal/supply/accounts` | - | `[]SupplyAccount` | 探针器获取待检测账号 |
| 更新账号状态 | `POST /internal/supply/accounts/{id}/status` | `{"status":"suspended","reason":""}` | `{"success":true}` | 探针结果写回 |
| 读取模型列表 | `GET /internal/supply/packages` | - | `[]SupplyPackage` | 扫描比对基准 |
| 创建模型 | `POST /internal/supply/packages` | `SupplyPackage` | `{"id":""}` | 准入测试通过后上架 |
| 获取审计日志格式 | `GET /internal/supply/audit/schema` | - | `{"schema":{}}` | 审计事件格式一致 |
---
## 3. API 接口规范
### 3.1 REST API 基础
- **基础路径**: `/api/v1/supply-intelligence/`
- **内部路径** (集成模式): `/internal/supply-intelligence/`
- **内容类型**: `application/json`
- **错误响应格式**:
```json
{
"error": {
"code": "SI_PRB_4001",
"message": "供应商账号不存在",
"details": {}
}
}
```
### 3.2 核心端点
#### 探针管理
| 方法 | 路径 | 描述 |
|------|------|------|
| GET | `/api/v1/supply-intelligence/probes` | 列表探针结果 |
| POST | `/api/v1/supply-intelligence/probes/{account_id}` | 手动触发探针 |
| GET | `/api/v1/supply-intelligence/probes/{account_id}/history` | 探针历史 |
#### 扫描与发现
| 方法 | 路径 | 描述 |
|------|------|------|
| POST | `/api/v1/supply-intelligence/discovery/scan` | 手动触发全网扫描 |
| GET | `/api/v1/supply-intelligence/discovery/candidates` | 列表候选模型 |
| GET | `/api/v1/supply-intelligence/discovery/candidates/{id}` | 获取候选模型详情 |
| POST | `/api/v1/supply-intelligence/discovery/candidates/{id}/ignore` | 忽略候选模型 |
#### 准入测试
| 方法 | 路径 | 描述 |
|------|------|------|
| POST | `/api/v1/supply-intelligence/admission/{candidate_id}/test` | 手动执行准入测试 |
| GET | `/api/v1/supply-intelligence/admission/{candidate_id}/result` | 获取测试结果 |
| POST | `/api/v1/supply-intelligence/admission/{candidate_id}/publish` | 确认上架 |
| POST | `/api/v1/supply-intelligence/admission/{candidate_id}/force-publish` | 强制上架 |
#### 账号管理
| 方法 | 路径 | 描述 |
|------|------|------|
| GET | `/api/v1/supply-intelligence/accounts` | 列表账号 |
| POST | `/api/v1/supply-intelligence/accounts` | 创建账号 |
| GET | `/api/v1/supply-intelligence/accounts/{id}` | 获取账号 |
| POST | `/api/v1/supply-intelligence/accounts/{id}/rotate-key` | 轮换密钥 |
| POST | `/api/v1/supply-intelligence/accounts/{id}/status` | 更新状态 |
#### 健康大盘
| 方法 | 路径 | 描述 |
|------|------|------|
| GET | `/api/v1/supply-intelligence/health-board` | 获取健康大盘 |
| GET | `/api/v1/supply-intelligence/pricing/{model_id}/comparison` | 模型比价 |
| GET | `/api/v1/supply-intelligence/coverage` | 供应链覆盖率 |
| GET | `/api/v1/supply-intelligence/predictions` | 预测分析 |
### 3.3 错误码定义
| 错误码 | HTTP 状态 | 说明 |
|---------|-----------|------|
| `SI_PRB_4001` | 404 | 供应商账号不存在 |
| `SI_PRB_4002` | 429 | 探针频率过高,请等待 |
| `SI_DIS_4001` | 404 | 候选模型不存在 |
| `SI_DIS_4002` | 409 | 候选模型状态不允许忽略 |
| `SI_ADM_4001` | 404 | 准入测试任务不存在 |
| `SI_ADM_4002` | 409 | 准入测试正在执行中 |
| `SI_ADM_4003` | 400 | 测试未通过,无法上架 |
| `SI_ACC_4001` | 404 | 账号不存在 |
| `SI_ACC_4002` | 409 | 账号状态不允许此操作 |
| `SI_ACC_4003` | 403 | 无权执行此操作 |
| `SI_BRD_4001` | 400 | 查询参数无效 |
### 3.4 WebSocket 接口
**路径**: `/ws/v1/supply-intelligence/board`
- 运营工作台订阅后,实时推送探针结果、候选模型变更、状态变更待办。
- 心跳间隔 30 秒。

355
tech/TEST_DESIGN.md Normal file
View File

@@ -0,0 +1,355 @@
# Supply Intelligence 测试设计方案
> 状态说明(2026-05 收敛修订):本文件已转为“收敛后测试门禁文档”,必须按新基线解释。
> 若与旧 PRD/HLD/INTERFACE 的测试口径冲突,以 /home/long/project/立交桥/projects/supply-intelligence/tech/BASELINE_TECHLEAD_V2.md 与最新 PM 基线为准。
> 以下旧测试口径不得继续作为放行依据:
> - 以独立重部署、向量数据库、WebSocket、预测/比价能力为默认测试前提
> - 将自动注册深链路视为本期不可降期的默认主路径
> - 将 published 等同于 gateway 已消费生效
> 版本:v1.0
> 日期:2026-04-27
> 状态:初稿
> 覆盖:AC-01 ~ AC-12、异常/边缘流程 FP-01 ~ FP-10、场景 S1~S4
---
## 1. 测试策略
### 1.1 测试分层模型
```
┌─────────────────────────────────────────────────┐
│ E2E Tests (黑盒) │
│ 场景:从探针调度到状态变更、从发现到上架全链路 │
│ 工具Go test + httptest + 自制 E2E runner │
└─────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────┐
│ Integration Tests (灰盒) │
│ 场景Service 间协作、异步任务队列、外部 API Mock│
│ 工具Go test + testify + sqlmock + gock │
│ 覆盖率门槛service ≥ 80%, handler ≥ 80% │
└─────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────┐
│ Unit Tests (白盒) │
│ 场景:状态机逻辑、探针评估、风险评分计算 │
│ 工具Go test + testify + gomock │
│ 覆盖率门槛domain ≥ 70% │
└─────────────────────────────────────────────────┘
```
### 1.2 测试通过标准
| 维度 | 标准 |
|------|------|
| 覆盖率 | domain ≥ 70%, service/handler ≥ 80% |
| 模块 A(探针) | AC-01 ~ AC-03 全部通过 |
| 模块 B(发现) | AC-04 ~ AC-05 全部通过 |
| 模块 C(准入测试) | AC-06 ~ AC-07 全部通过 |
| 模块 D(受控自动补给) | AC-08 ~ AC-09 全部通过(按首期最小边界解释) |
| 模块 E(工作台) | AC-10 ~ AC-12 全部通过 |
| 异常/边缘流程 | FP-01 ~ FP-10 全部有验证测试 |
| 误报率 | 7 天连续运行 false positive ≤ 1% |
### 1.3 外部依赖 Mock
| 依赖 | Mock 方案 | 工具 |
|------|---------|------|
| **供应商 API探针目标** | Mock server 返回 200/401/403/429/500 | gock |
| **供应商模型列表 API** | Mock 返回 JSON 模型列表 | gock |
| **供应商补给接口 / 人工补录入口** | Mock 返回受理成功/400/500 | gock |
| **通知网关(飞书/邮件)** | Mock server 接收通知或确认消息 | httptest |
| **KMS 服务** | Mock 加密/解密逻辑 | 接口层 Mock |
| **Job Scheduler / 主仓调度器** | 使用主仓调度抽象或本地调度测试桩 | go test + test double |
| **supply-api 数据库** | sqlmock 拦截读写 | go-sqlmock |
---
## 2. 模块 A 测试用例(供应商品质探针)
### AC-01 探针覆盖度
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TA-01-01 | 15 分钟内探针覆盖率 ≥99% | Functional | Given 100 条 active/suspended 账号 When 15min 后统计 Then ≥99 条被探针 |
| TA-01-02 | suspended 账号同等探针 | Functional | Given suspended 账号 When 探针执行 Then 同样被覆盖 |
| TA-01-03 | 暂停探针账号不被覆盖 | Edge | Given 账号设置 pause_probe=true When 探针执行 Then 该账号被跳过 |
### AC-02 状态变更正确性
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TA-02-01 | active → suspended(1 次 401) | Happy Path | Given active 账号 When 连续 1 次返回 401 Then 60s 内状态变为 suspended |
| TA-02-02 | suspended → disabled(连续 3 次 401) | Happy Path | Given suspended 账号 When 连续 3 次返回 401 Then 60s 内状态变为 disabled |
| TA-02-03 | 429 单次不改变状态 | Edge | Given active 账号 When 返回 429 一次 Then 15min 内状态保持 active |
| TA-02-04 | 指数退避重试逻辑 | Functional | Given 返回 429 When 探针执行 Then 按 1→2→4min 退避重试 |
| TA-02-05 | 状态机不允许 active→disabled 直变 | Edge | Given active 账号 When 连续 3 次失败 Then 不会直接变为 disabled,必须先 suspended |
| TA-02-06 | 手动暂停账号状态不自动变更 | Edge | Given 账号 pause_probe=true When 供应商返回异常 Then 状态不变 |
### AC-03 误报率
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TA-03-01 | 7 天误报率 ≤1% | Long Run | Given 100 条正常账号 When 连续运行 7 天 Then 误变更次数 ≤7 |
| TA-03-02 | 探针与手动操作并发 | Concurrency | Given 手动修改状态的同时 When 探针执行 Then 乐观锁冲突处理正确 |
---
## 3. 模块 B 测试用例(全网模型发现)
### AC-04 新模型发现延迟
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TB-04-01 | 新模型在 2 扫描周期内被发现 | Functional | Given 供应商新增 model_id When 扫描执行 Then 2h 内 model_candidates 出现 discovered 记录 |
| TB-04-02 | 模型比对去重正确 | Functional | Given 已存在的 active model When 全网扫描 Then 不会重复创建 candidate |
| TB-04-03 | 模型下架告警触发 | Functional | Given active package 对应的 model_id 从供应商列表消失 When 2 扫描周期后 Then 运营工作台出现下架告警 |
### AC-05 已下架模型告警
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TB-05-01 | 下架模型不自动变更 package 状态 | Edge | Given model_id 消失 When 扫描执行 Then package 状态保持 active,生成告警 |
| TB-05-02 | 分页获取完整模型列表 | Functional | Given 供应商返回分页 When 扫描 Then 正确处理所有分页数据 |
---
## 4. 模块 C 测试用例(模型准入测试)
### AC-06 准入测试通过
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TC-06-01 | discovered → test_passed + 草稿生成 | Happy Path | Given discovered candidate When 测试全部通过 Then 状态 test_passed,supply_package 草稿生成 |
| TC-06-02 | 草稿字段完整性 | Functional | Given 草稿生成 When 检查字段 Then platform/model/price/suggested 正确 |
| TC-06-03 | 准入测试 30 分钟内完成 | Performance | Given discovered candidate When 测试执行 Then ≤30min 完成 |
### AC-07 准入测试失败
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TC-07-01 | discovered → test_failed | Negative | Given discovered candidate When 测试返回 500 Then 30min 内状态 test_failed,failure_reason 非空 |
| TC-07-02 | 超时视为失败 | Edge | Given 测试用例 60s 无响应 When Then 整体标记为 test_failed,reason = timeout |
| TC-07-03 | 测试账号 suspended 时任务失败 | Edge | Given 测试账号变为 suspended When 准入测试执行 Then 任务标记 test_failed,reason = test_account_unavailable |
| TC-07-04 | ignore 的 candidate 7 天内不重扫 | Edge | Given 运营标记 ignore When 7 天内扫描 Then 该 candidate 不出现 |
---
## 5. 模块 D 测试用例(受控自动补给)
### AC-08 受控自动补给触发与落单
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TD-08-01 | 可用账号数 < 阈值时触发补给任务 | Functional | Given 白名单供应商的可用账号数 < 阈值 When 系统检测 Then 10min 内生成补给任务或补给申请 |
| TD-08-02 | 非白名单供应商不自动补给 | Guardrail | Given 非白名单供应商账号不足 When 系统检测 Then 不自动触发补给,仅记录告警或人工待办 |
| TD-08-03 | 补给结果进入待验证/待启用 | Happy Path | Given 补给流程受理成功 When 补给完成 Then 新账号或候选资源进入 pending_verify / pending_enable 等受控状态,而非直接 active |
| TD-08-04 | 补给结果关联 task | Functional | Given 补给任务完成 When 检查任务记录 Then auto_supply_tasks 或等价任务状态为 completed/pending_verify |
### AC-09 受控自动补给 fail-closed
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TD-09-01 | 通知/补给网关不可用时 fail-closed | Resilience | Given 通知网关或补给受理接口返回 503 When 补给执行 Then 60s 内任务 failed,审计日志完整无虚假成功 |
| TD-09-02 | 补给接口返回 400 | Edge | Given 补给请求参数非法或资源已存在 When 补给执行 Then 任务 failed,不重复盲目重试 |
| TD-09-03 | KMS 不可用时 fail-closed | Resilience | Given KMS 超时 When 凭证加密步骤执行 Then 60s 内任务 failed,明文凭证不出现在日志/DB |
| TD-09-04 | 无审批/越权配置时阻断自动启用 | Guardrail | Given 缺少审批或超出受控边界 When 补给结果回写 Then 保持 pending_verify / pending_enable,不允许直接进入 active |
---
## 6. 模块 E 测试用例(运营工作台)
### AC-10 审计日志完整性
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TE-10-01 | 状态变更 5s 内写入审计 | Performance | Given 状态变更 When 执行完成 Then ≤5s 审计记录存在 |
| TE-10-02 | 审计字段完整性 | Functional | Given 审计记录 When 检查 Then 包含 object_type/id/action/before_state/after_state/request_id |
| TE-10-03 | 探针执行记录审计 | Functional | Given 探针执行 When 完成 Then probe_execution_logs 有记录 |
### AC-11 运营工作台干预
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TE-11-01 | 确认上架 draft → active | Happy Path | Given draft package When 点击确认 Then 3s 内变为 active |
| TE-11-02 | 忽略模型 7 天内不出现 | Edge | Given 点击忽略 When Then 7 天内 candidate 不出现在待处理列表 |
| TE-11-03 | 手动触发单账号探针 | Functional | Given 运营手动触发 When Then 立即执行探针,结果可见 |
| TE-11-04 | 并发操作冲突处理 | Concurrency | Given 同时点击确认和忽略 When Then 返回 409只一个生效 |
### AC-12 配置热更新
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TE-12-01 | 探针周期修改 60s 内生效 | Functional | Given 修改探针周期 When 下发配置 Then 60s 后新周期生效 |
---
## 7. 异常/边缘流程测试FP-01 ~ FP-10
| 用例 ID | 场景 | 验证点 | 预期行为 |
|---------|------|-------|---------|
| TFP-01 | 供应商探针 DNS/TCP 超时 | 状态不变 | 标记 inconclusive指数退避不触发状态变更 |
| TFP-02 | 供应商返回空/格式突变 | 状态不变 | 解析失败标记 inconclusive记录日志 |
| TFP-03 | 探针与手动操作并发 | 乐观锁 | 更新失败,探针记录冲突日志,下次覆盖 |
| TFP-04 | 准入测试期间测试账号 suspended | 任务标记失败 | 任务标记 test_failed,reason = test_account_unavailable |
| TFP-05 | 补给接口返回 400 或资源冲突 | 任务失败 | 任务 failed不重复盲目重试审计记录完整 |
| TFP-06 | 补给成功但验证/启用失败 | pending 不变 | 账号保持 pending_verify/pending_enable任务标记 verify_failed触发告警 |
| TFP-07 | 供应商模型列表分页 500 | 整体不中断 | 已获取部分正常处理,失败页下次重试 |
| TFP-08 | 探针期间数据库不可用 | 任务失败重试 | 探针任务失败,连续 5 次失败后暂停批次,触发系统告警 |
| TFP-09 | 确认上架与忽略并发 | 409 冲突 | 只有一个生效,返回 409 |
| TFP-10 | KMS 不可用时注册 | 明文不落盘 | 加密步骤阻塞/失败,明文凭证不出现 |
---
## 8. 灰度发布验证计划
### 8.1 各 Phase 验证内容
| Phase | 交付内容 | 通过标准 | 依赖项 |
|-------|---------|---------|--------|
| **Phase 1** | 模块 A探针+ 模块 E 只读视图 | AC-01~AC-03, AC-10~AC-11只读部分 | 主仓调度能力或本地调度测试桩 |
| **Phase 2** | 模块 B发现+ 模块 C准入测试 | AC-04~AC-07 | Phase 1 + 供应商 API 清单 |
| **Phase 3** | 模块 D受控自动补给+ 模块 E 完整 | AC-08~AC-12 | Phase 1+2 + KMS/通知与补给受理链路就绪 |
### 8.2 灰度门禁
每次 Phase 升级前:
- [ ] 全部 AC 测试用例通过
- [ ] 覆盖率达标
- [ ] 灰度开关独立验证(每个开关可单独打开/关闭)
- [ ] 回滚条件演练(误报率>5% / 状态变更导致错误率上升>2%
---
## 9. 回归测试集
### 9.1 快速回归(每次 PR~10 分钟)
```
TA-01-01, TA-02-01, TA-02-02, TA-02-05,
TB-04-01, TC-06-01, TC-07-01,
TD-08-01, TD-09-01,
TE-10-01, TE-11-01
共 11 条
```
### 9.2 完整回归Phase 升级,~45 分钟)
```
TA-01-01 ~ TA-03-02(全 11 条)
TB-04-01 ~ TB-05-02(全 5 条)
TC-06-01 ~ TC-07-04(全 7 条)
TD-08-01 ~ TD-09-04(全 8 条)
TE-10-01 ~ TE-12-01(全 8 条)
TFP-01 ~ TFP-10(全 10 条)
共 49 条
```
---
## 10. 技术栈与集成约束验证
### 10.1 统一技术栈与双运行模式验证
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TSI-RUN-01 | 独立运行模式启动 | Happy Path | Given 独立 `config.yaml` 与独立数据库/Redis When 启动 `cmd/supply-intelligence/main.go` Then `/actuator/health/ready` 返回 200,`/api/v1/supply-intelligence/*` 可访问 |
| TSI-RUN-02 | 集成运行模式挂载 | Integration | Given supply-api 主进程加载 `IntegrationPlugin` When 启动 Then `/internal/supply-intelligence/*` 路由与后台任务注册成功 |
| TSI-RUN-03 | 配置分离加载 | Functional | Given 独立模式与集成模式分别启动 When 读取配置 Then 独立模式只加载自身配置,集成模式合并主项目配置且不覆盖无关模块 |
| TSI-RUN-04 | 数据库前缀隔离 | Structural | Given 执行迁移 When 检查 schema Then 仅创建 `supply_intelligence_` 前缀表 |
### 10.2 独立运行与集成运行验证
### 10.3 IntegrationPlugin 与模块挂载验证
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TSI-PLG-01 | IntegrationPlugin 注册 HTTP 路由 | Integration | Given 集成模式 When 插件注册 Then Probe/Discovery/Admission/AutoReg/OpsWorkBench 路由挂载成功 |
| TSI-PLG-02 | 模块开关生效 | Functional | Given `enabled_modules` 关闭某模块 When 启动 Then 对应路由/worker 不注册,其他模块可用 |
| TSI-PLG-03 | 集成模式共享资源 | Integration | Given supply-api 注入共享 DB/Redis/logger When 插件启动 Then 使用共享资源且不重复初始化冲突依赖 |
### 10.4 OpenAPI 契约验证
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TSI-OAS-01 | OpenAPI 文档可访问 | Functional | Given 服务启动 When 请求 `/openapi.json`、`/docs` Then 返回 200 且包含探针、发现、准入测试、运营工作台接口 |
| TSI-OAS-02 | 路由与 OpenAPI 一致 | Contract | Given 导出的 OpenAPI 文档 When 对照 HTTP 路由 Then 请求/响应/错误码与实现一致,无缺失公开接口 |
| TSI-OAS-03 | 集成前缀可配置 | Contract | Given 集成模式配置内部前缀 When 导出文档 Then 文档反映 `/internal/supply-intelligence/` 前缀或明确区分暴露面 |
### 10.5 NewAPI / Sub2API 适配层验证
| 用例 ID | 描述 | 类型 | 验证条件 |
|---------|------|------|---------|
| TSI-ADP-01 | 供应商状态同步适配 | Contract | Given NewAPI/Sub2API 拉取供应商状态 When 调用标准化接口 Then 返回字段稳定、延迟满足约束、状态映射正确 |
| TSI-ADP-02 | 模型列表推送适配 | Contract | Given 外部系统拉取模型列表 When 调用 `/models` Then 只返回已发现且允许暴露的数据,字段与约定一致 |
| TSI-ADP-03 | 账号状态适配边界 | Contract | Given 外部系统读取账号状态 When 通过适配层执行 Then 仅返回允许暴露的状态字段,不暴露凭证/探针日志/内部风险细节 |
---
## 11. 发布门禁与阶段结论
### 11.1 发布门禁检查表
以下门禁项全部通过前,不得认定达到生产要求:
- [ ] 独立运行 / 集成运行两种模式均完成启动验证路由、worker、内部接口真实挂载
- [ ] `IntegrationPlugin`、OpenAPI、NewAPI/Sub2API 适配层合同测试全部通过
- [ ] 凭证保护经日志/DB/异常路径验证无明文,KMS 不可用时 fail-closed
- [ ] 受控自动补给链路具备白名单限制、阈值触发、审批/待验证边界、重复提交阻断与审计留痕
- [ ] 状态机迁移、审计写入、Gateway package event + ack、外部只读适配链路完成一致性验证
- [ ] 首次生产放量场景遵循“只告警不自动变更状态”,并验证撤销与人工接管流程
- [ ] 调度器失效、补给受理失败、外部适配越权、错误状态传播四类高风险回归通过
- [ ] 至少一条探针、一条模型发现、一条准入测试、一条受控自动补给链路完成端到端验证
### 11.2 阶段门控结论
**当前结论APPROVED设计已可进入 Engineer 实现)**
**结论解释:**
- 本文档首页所述“收敛后测试门禁文档”口径已生效。
- 当前放行依据不再是旧 HLD/PRD/INTERFACE/DEPLOYMENT,而是:
- `/home/long/project/立交桥/projects/supply-intelligence/tech/BASELINE_TECHLEAD_V2.md`
- `/home/long/project/立交桥/projects/supply-intelligence/tech/GATEWAY_CONSUMER_DECISION_2026-05.md`
- 因此,本节不再沿用历史性 `REQUEST_CHANGES` 作为当前总门控。
**当前仍需在实现阶段持续验证的高风险项:**
- 凭证保护必须能证明 fail-closed且日志/审计/异常路径无明文泄漏。
- 状态同步、审计写入、package event + ack 必须形成可追踪闭环。
- 关键链路必须能完成“定义 → 装配 → 调用 → 入口”四层验证,不能只停留在接口存在。
- 自动补给按首期最小边界解释:允许白名单供应商、阈值触发、任务化补给、待验证/待启用;不把浏览器自动化深链路作为首期阻断门槛。
**实现前约束:**
- 若实现与 `BASELINE_TECHLEAD_V2.md`、`GATEWAY_CONSUMER_DECISION_2026-05.md` 冲突,应以两者为准并回退旧测试假设。
- 若下游消费方未落真实 poll/apply/ack 入口,不得宣称 package 发布链路已完成。
- 若 NewAPI/Sub2API 适配超出“只读/受控暴露边界”,应判定为实施漂移。
**重新转为 REQUEST_CHANGES / BLOCKED 的条件:**
- 实现阶段发现 published/applied 再次混淆。
- gateway 消费闭环缺少真实消费方入口或 ack 回写。
- 自动补给被重新扩张为首期深自动注册硬门槛。
- 核心链路无法提供四层调用链证据。
---
## 12. 性能与安全测试
### 12.1 性能基准
| 指标 | 目标值 | 测试方法 |
|------|-------|---------|
| 探针执行(单账号) | <2s | 计时 1000 次取 P99 |
| 全网扫描10 供应商) | <5min | 从调度触发到完成计 |
| 准入测试5 用例) | <30min P99 | 从 discovered 到 test_passed/failed |
| 供应商状态查询 API | <50ms P99 | 并发 100 请求 |
| 审计日志写入 | <1s P99 | 单次变更后计时 |
### 12.2 安全测试
| 测试项 | 方法 | 验证 |
|-------|------|------|
| 凭证明文保护 | 检查日志/DB/内存 dump | 无明文凭证 |
| KMS 密钥轮换 | Mock KMS 不可用 | fail-closed不暴露明文 |
| 供应商 API 限流绕过 | 连续探针超限 | 正确触发 rate limit |
| 注册接口重复提交 | 并发同一邮箱注册 | 只有一次成功,其余 failed |

98
test/CASES.md Normal file
View File

@@ -0,0 +1,98 @@
# Supply-Intelligence 测试用例
> 版本:v1.0 | 状态:初稿
---
## AC-01 探针覆盖度
| 用例编号 | 名称 | 前置条件 | 测试步骤 | 预期结果 | 优先级 |
|---------|------|---------|---------|---------|--------|
| TC-01.1 | 99% 覆盖率达标 | 插入 100 条测试账号 | 1. 等待 15 分钟 2. 统计探针日志 | 探针覆盖率 ≥ 99% | P0 |
| TC-01.2 | 探针周期可配置 | 已配置探针任务 | 1. 修改探针周期为 3 分钟 2. 等待 60 秒 | 周期在 60 秒内生效 | P1 |
## AC-02 状态变更正确性
| 用例编号 | 名称 | 前置条件 | 测试步骤 | 预期结果 | 优先级 |
|---------|------|---------|---------|---------|--------|
| TC-02.1 | active → suspended | 账号为 active | 1. Mock 返回 401 | 60s 内状态变为 suspended | P0 |
| TC-02.2 | suspended → disabled | 账号为 suspended | 1. Mock 连续 3 次返回 401 | 60s 内状态变为 disabled | P0 |
| TC-02.3 | 429 不变更 | 账号为 active | 1. Mock 返回 429 | 15 分钟内状态保持 active | P0 |
| TC-02.4 | 状态机违规 | 账号为 active | 1. 尝试直接变更为 disabled | 被拒绝,返回错误码 | P0 |
| TC-02.5 | 状态恢复 | 账号为 suspended | 1. Mock 返回 200 | 60s 内状态变为 active | P1 |
## AC-03 误报率
| 用例编号 | 名称 | 前置条件 | 测试步骤 | 预期结果 | 优先级 |
|---------|------|---------|---------|---------|--------|
| TC-03.1 | 7 天误报率 | 全部账号正常 | 1. 运行 7 天 2. 统计状态误变更次数 | 误报率 ≤ 1% | P0 |
## AC-04 新模型发现延迟
| 用例编号 | 名称 | 前置条件 | 测试步骤 | 预期结果 | 优先级 |
|---------|------|---------|---------|---------|--------|
| TC-04.1 | 2h 内发现 | 已对接供应商 | 1. T0 在 Mock 响应中新增 model_id 2. T0+2h 查询数据库 | candidate 存在status=discovered | P0 |
## AC-05 已下架模型告警
| 用例编号 | 名称 | 前置条件 | 测试步骤 | 预期结果 | 优先级 |
|---------|------|---------|---------|---------|--------|
| TC-05.1 | 不自动下架 | package 为 active | 1. 从 Mock 中移除 model_id 2. 等待 2h | package 状态保持 active | P0 |
| TC-05.2 | 生成告警待办 | package 为 active | 1. 从 Mock 中移除 model_id 2. 等待 2h | 运营工作台出现告警 | P0 |
## AC-06 准入测试通过
| 用例编号 | 名称 | 前置条件 | 测试步骤 | 预期结果 | 优先级 |
|---------|------|---------|---------|---------|--------|
| TC-06.1 | 测试通过 | candidate 为 discovered | 1. 触发准入测试 2. 等待 30min | 状态变为 test_passed,生成 package 草稿 | P0 |
| TC-06.2 | 草稿字段完整 | 测试通过后 | 1. 查询生成的 package 草稿 | 包含 platform、model、price 字段 | P1 |
## AC-07 准入测试失败
| 用例编号 | 名称 | 前置条件 | 测试步骤 | 预期结果 | 优先级 |
|---------|------|---------|---------|---------|--------|
| TC-07.1 | 接口返回 500 | candidate 为 discovered | 1. Mock 返回 500 2. 等待测试完成 | 状态变为 test_failed,failure_reason 非空 | P0 |
| TC-07.2 | 前端展示 | candidate 为 test_failed | 1. 访问运营工作台 | 展示失败详情 | P1 |
## AC-08 自动注册成功
| 用例编号 | 名称 | 前置条件 | 测试步骤 | 预期结果 | 优先级 |
|---------|------|---------|---------|---------|--------|
| TC-08.1 | 注册流程 | 已配置白名单 | 1. 触发自动注册 2. 等待 30min | 新增 active 账号 | P0 |
| TC-08.2 | 密钥加密 | 注册完成后 | 1. 查询数据库 | API Key 已加密存储 | P1 |
## AC-09 自动注册 fail-closed
| 用例编号 | 名称 | 前置条件 | 测试步骤 | 预期结果 | 优先级 |
|---------|------|---------|---------|---------|--------|
| TC-09.1 | 网关不可用 | 配置启用 | 1. Mock 邮件网关返回 503 2. 等待 60s | 任务状态为 failed审计日志记录失败 | P0 |
| TC-09.2 | 不返回成功 | 注册失败后 | 1. 检查对上游响应 | 不返回成功状态码 | P0 |
## AC-10 审计日志完整性
| 用例编号 | 名称 | 前置条件 | 测试步骤 | 预期结果 | 优先级 |
|---------|------|---------|---------|---------|--------|
| TC-10.1 | 字段完整性 | 触发操作后 | 1. 5s 内查询审计日志 | 包含所有必要字段 | P0 |
| TC-10.2 | 自动化操作审计 | 自动化操作后 | 1. 查询审计日志 | 存在对应记录 | P0 |
## AC-11 运营工作台干预
| 用例编号 | 名称 | 前置条件 | 测试步骤 | 预期结果 | 优先级 |
|---------|------|---------|---------|---------|--------|
| TC-11.1 | 一键上架 | package 为 draft | 1. 点击确认上架 2. 等待 3s | 状态变为 active | P0 |
| TC-11.2 | 忽略模型 | candidate 为 discovered | 1. 点击忽略 | 不在待处理列表中7 天后恢复 | P0 |
## AC-12 配置热更新
| 用例编号 | 名称 | 前置条件 | 测试步骤 | 预期结果 | 优先级 |
|---------|------|---------|---------|---------|--------|
| TC-12.1 | 探针周期热更新 | 已运行 | 1. 修改配置 2. 观察调度行为 | 60s 内生效 | P1 |
## 边缘场景 / 失败路径
| 用例编号 | 名称 | 前置条件 | 测试步骤 | 预期结果 | 优先级 |
|---------|------|---------|---------|---------|--------|
| TC-E1 | DNS 失败 | 探针任务已配置 | 1. 模拟 DNS 解析失败 | 状态不变更,记录日志 | P1 |
| TC-E2 | 空响应体 | 探针任务已配置 | 1. Mock 返回空 JSON | 状态不变更,记录日志 | P1 |
| TC-E3 | 并发乐观锁 | 探针任务已配置 | 1. 同时触发手动更新和探针 | 乐观锁冲突,探针记录失败 | P1 |
| TC-E4 | 测试账号不可用 | 准入测试进行中 | 1. 将测试账号标记为 suspended | 测试标记为 failed原因为 test_account_unavailable | P1 |

75
test/STRATEGY.md Normal file
View File

@@ -0,0 +1,75 @@
# Supply-Intelligence 测试策略
> 版本:v1.0 | 状态:初稿
---
## 1. 测试目标
| 目标 | 指标 | 验证方式 |
|------|------|---------|
| 功能正确性 | 所有 AC 通过率 100% | 每个 AC 至少 1 正向 + 1 负向测试用例 |
| 状态机正确性 | 状态迁移符合状态图 | 所有状态转换路径覆盖 |
| 安全性 | 无越权、审计日志完整 | 渗透测试 + 审计追溯 |
| 性能 | 探针 P99 < 50ms,扫描完成 < 30min | 负载测试 |
## 2. 测试层级
```
├── 单元测试 (Unit Test)
│ ├── 状态机转换逻辑
│ ├── 探针策略逻辑
│ ├── 扫描比对算法
│ └── 准入测试判定逻辑
├── 集成测试 (Integration Test)
│ ├── 数据库交互(状态变更、审计日志)
│ ├── Redis 缓存交互
│ ├── 供应商 API Mock
│ ├── 邮件/短信网关 Mock
│ └── 向量数据库检索
├── E2E 测试 (End-to-End Test)
│ ├── 探针到状态变更整条链路
│ ├── 扫描到候选模型整条链路
│ ├── 准入测试到上架整条链路
│ └── 账号注册整条链路
└── 稳定性测试 (Stability Test)
├── 7 天连续探针运行
└── 高并发扫描/测试
```
## 3. 测试工具
| 层级 | 工具 | 说明 |
|------|------|------|
| 单元测试 | Go testing + testify + mockery | 覆盖率门槛 domain ≥ 70%、service ≥ 80% |
| 数据库测试 | testcontainers-go (PostgreSQL) | 每次测试启动独立容器 |
| 缓存测试 | miniredis | 轻量级 Redis Mock |
| 供应商 Mock | gock / httptest | 模拟供应商 API 响应 |
| E2E 测试 | 自定义 Go E2E 框架 | 启动完整服务 + 数据库 |
| 稳定性测试 | 自定义脚本 | 7 天连续运行监控 |
## 4. 测试环境
| 环境 | 用途 | 数据 |
|------|------|------|
| 本地开发 | 单元 + 快速集成测试 | 测试数据生成 |
| CI | 自动化单元 + 集成测试 | 测试数据生成 |
| 测试环境 | E2E + 性能基准 | 模拟生产数据 |
| 生产前 | 稳定性验证 | 生产数据副本(脱敏) |
| 生产环境 | 灰度监控 | 真实数据 |
## 5. 测试数据管理
- 供应商 API 响应使用 `test/fixtures/supplier_responses/` 下的 JSON 文件管理。
- 测试用例集使用 `test/fixtures/test_cases/` 下的 YAML 文件管理。
- 每个测试用例自洁,启动前加载固定数据集,结束后清理。
## 6. 特殊测试要求
- **探针测试**:必须覆盖 429 、 401 、 403 、 500 、 503 、超时、空响应、DNS 失败、TCP 超时等所有常见异常场景。
- **状态机测试**:必须覆盖所有状态转换路径,特别是 `active` → `disabled` 的违规路径必须被拒绝。
- **审计测试**:所有自动化操作必须在 5 秒内生成审计记录,且字段完整。
- **并发测试**:探针任务与运营人员手动操作的并发场景必须测试,验证乐观锁机制。