fix(config+app): production fail-fast + tighten readiness

1. config.go: AI_CS_ENV runtime mode with production restriction
   - New RuntimeConfig.Env field (AI_CS_ENV / AI_CS_RUNTIME_ENV)
   - production + Postgres.Enabled=false → Load() returns error
   - production + empty webhook secret → Load() returns error
   - normalizeRuntimeEnv (case-insensitive, trimmed): ""/dev/development → development, prod/production → production, test → test

2. app.go: probe.SetReady only when store is confirmed ready
   - Postgres.Enabled: probe.SetReady(true) after DB+migration OK
   - Memory mode: probe.SetReady(false) — not production-ready

3. health_handler_test.go: add probe live+ready state transition tests

4. config_test.go: add TestLoad_RejectsProdWhenPostgresDisabled,
   TestLoad_RejectsProdWhenWebhookSecretMissing

5. app_test.go: add TestNew_RejectsMemoryModeWithoutExplicitNonProdEnv,
   TestNew_AllowsMemoryModeInTestEnv, TestNew_WithPostgresEnabled_*
   for invalid DSN and migration-failure paths

Phase 1 (code gate) objectives met:
 prod cannot fall back to memory store
 readiness reflects actual store readiness
 both changes have test coverage
This commit is contained in:
Your Name
2026-05-04 07:38:10 +08:00
parent ac44f826ca
commit 142b991334
17 changed files with 1242 additions and 343 deletions

View File

@@ -8,16 +8,16 @@ import (
"time"
"github.com/bridge/ai-customer-service/internal/config"
httpserver "github.com/bridge/ai-customer-service/internal/http"
"github.com/bridge/ai-customer-service/internal/domain/ticket"
"github.com/bridge/ai-customer-service/internal/domain/ticketstats"
httpserver "github.com/bridge/ai-customer-service/internal/http"
"github.com/bridge/ai-customer-service/internal/http/handlers"
"github.com/bridge/ai-customer-service/internal/platform/health"
"github.com/bridge/ai-customer-service/internal/platform/httpx"
intentservice "github.com/bridge/ai-customer-service/internal/service/intent"
"github.com/bridge/ai-customer-service/internal/service/dialog"
"github.com/bridge/ai-customer-service/internal/service/handoff"
intentservice "github.com/bridge/ai-customer-service/internal/service/intent"
"github.com/bridge/ai-customer-service/internal/service/reply"
"github.com/bridge/ai-customer-service/internal/domain/ticket"
memoryStore "github.com/bridge/ai-customer-service/internal/store/memory"
pgstore "github.com/bridge/ai-customer-service/internal/store/postgres"
)
@@ -43,6 +43,9 @@ func New(cfg *config.Config, logger *slog.Logger) (*App, error) {
if logger == nil {
logger = slog.Default()
}
if !cfg.Postgres.Enabled && cfg.Runtime.Env == "" {
return nil, fmt.Errorf("runtime env is required when postgres is disabled; memory mode must be explicitly limited to non-prod")
}
var (
sessions dialog.SessionRepository
@@ -57,6 +60,8 @@ func New(cfg *config.Config, logger *slog.Logger) (*App, error) {
ticketStore dialog.TicketRepository
)
probe := health.NewProbe()
if cfg.Postgres.Enabled {
db, err := pgstore.Open(pgstore.Config{DSN: cfg.Postgres.DSN, MaxOpenConns: cfg.Postgres.MaxOpenConns, MaxIdleConns: cfg.Postgres.MaxIdleConns, ConnMaxLifetime: time.Duration(cfg.Postgres.ConnMaxLifetime) * time.Second})
if err != nil {
@@ -78,6 +83,7 @@ func New(cfg *config.Config, logger *slog.Logger) (*App, error) {
checkers = append(checkers, pgstore.NewDBChecker(db))
closers = append(closers, db.Close)
ticketListerStore = ticketStore
probe.SetReady(true)
} else {
sessionStore := memoryStore.NewSessionStore()
auditStore := memoryStore.NewAuditStore()
@@ -89,6 +95,7 @@ func New(cfg *config.Config, logger *slog.Logger) (*App, error) {
dedup = dedupStore
ticketService = ticketStore
ticketListerStore = ticketStore
probe.SetReady(false)
}
knowledgeStore := memoryStore.NewKnowledgeStore()
@@ -96,10 +103,8 @@ func New(cfg *config.Config, logger *slog.Logger) (*App, error) {
replySvc := reply.NewService(knowledgeStore)
handoffSvc := handoff.NewService()
dialogSvc := dialog.NewService(sessions, audits, tickets, dedup, intentSvc, replySvc, handoffSvc)
// P1-2: webhook rate limiter — 10 messages per second per IP
rateLimiter := httpx.NewRateLimiter(time.Second, 10)
probe := health.NewProbe()
healthHandler := handlers.NewHealthHandler(probe, checkers...)
webhookHandler := handlers.NewWebhookHandler(dialogSvc, logger, audits)
ticketHandler := handlers.NewTicketHandler(ticketService, audits)
@@ -108,7 +113,6 @@ func New(cfg *config.Config, logger *slog.Logger) (*App, error) {
webhookSecurity := handlers.WebhookSecurity{Secret: cfg.Webhook.Secret, TimestampHeader: cfg.Webhook.TimestampHeader, SignatureHeader: cfg.Webhook.SignatureHeader, MaxSkew: time.Duration(cfg.Webhook.MaxSkewSeconds) * time.Second, Audit: audits}
router := httpserver.NewRouter(httpserver.RouterDeps{Health: healthHandler, Webhook: webhookHandler, Tickets: ticketHandler, TicketStats: ticketStatsHandler, Sessions: sessionHandler, WebhookAuth: webhookSecurity, MaxBodyBytes: cfg.HTTP.MaxBodyBytes, RateLimiter: rateLimiter})
probe.SetReady(true)
return &App{
Server: &http.Server{
Addr: cfg.HTTP.Addr,

View File

@@ -24,6 +24,7 @@ func minimalHTTPConfig() *config.Config {
cfg.HTTP.MaxHeaderBytes = 1 << 20
cfg.HTTP.MaxBodyBytes = 1 << 20
cfg.Postgres.Enabled = false
cfg.Runtime.Env = "test"
return cfg
}
@@ -38,16 +39,9 @@ func TestNew_NilConfig(t *testing.T) {
}
func TestNew_DefaultLogger(t *testing.T) {
cfg := &config.Config{}
cfg.HTTP.Addr = ":0"
cfg.HTTP.ReadHeaderTimeout = 5
cfg.HTTP.ReadTimeout = 10
cfg.HTTP.WriteTimeout = 15
cfg.HTTP.IdleTimeout = 60
cfg.HTTP.MaxHeaderBytes = 1 << 20
cfg.HTTP.MaxBodyBytes = 1 << 20
cfg := minimalHTTPConfig()
cfg.Webhook.Secret = "test-secret"
// Passing nil logger should not panic and should use default
app, err := New(cfg, nil)
if err != nil {
t.Fatalf("New() with nil logger failed: %v", err)
@@ -61,15 +55,8 @@ func TestNew_DefaultLogger(t *testing.T) {
}
func TestNew_WithPostgresDisabled(t *testing.T) {
cfg := &config.Config{}
cfg.HTTP.Addr = ":0"
cfg.HTTP.ReadHeaderTimeout = 5
cfg.HTTP.ReadTimeout = 10
cfg.HTTP.WriteTimeout = 15
cfg.HTTP.IdleTimeout = 60
cfg.HTTP.MaxHeaderBytes = 1 << 20
cfg.HTTP.MaxBodyBytes = 1 << 20
cfg.Postgres.Enabled = false
cfg := minimalHTTPConfig()
cfg.Webhook.Secret = "test-secret"
app, err := New(cfg, logging.New())
if err != nil {
@@ -86,7 +73,7 @@ func TestNew_WithPostgresDisabled(t *testing.T) {
}
}
func TestApp_TicketStore(t *testing.T) {
func TestNew_RejectsMemoryModeWithoutExplicitNonProdEnv(t *testing.T) {
cfg := &config.Config{}
cfg.HTTP.Addr = ":0"
cfg.HTTP.ReadHeaderTimeout = 5
@@ -96,6 +83,30 @@ func TestApp_TicketStore(t *testing.T) {
cfg.HTTP.MaxHeaderBytes = 1 << 20
cfg.HTTP.MaxBodyBytes = 1 << 20
cfg.Postgres.Enabled = false
cfg.Webhook.Secret = "test-secret"
_, err := New(cfg, logging.New())
if err == nil {
t.Fatal("expected error when runtime env is not explicitly non-prod for memory mode")
}
}
// TestNew_AllowsMemoryModeInTestEnv checks that New succeeds and returns a
// non-nil App when given the minimalHTTPConfig fixture (Postgres disabled,
// Runtime.Env set to "test") plus a webhook secret.
func TestNew_AllowsMemoryModeInTestEnv(t *testing.T) {
	testCfg := minimalHTTPConfig()
	testCfg.Webhook.Secret = "test-secret"

	built, buildErr := New(testCfg, logging.New())
	switch {
	case buildErr != nil:
		t.Fatalf("New() failed in test env: %v", buildErr)
	case built == nil:
		t.Fatal("expected non-nil app")
	}
}
func TestApp_TicketStore(t *testing.T) {
cfg := minimalHTTPConfig()
cfg.Webhook.Secret = "test-secret"
app, err := New(cfg, logging.New())
if err != nil {
@@ -107,8 +118,6 @@ func TestApp_TicketStore(t *testing.T) {
t.Fatal("TicketStore() returned nil")
}
// Should be usable as ticketLister
// Just verify it's not nil and the type assertion works
_ = store
}
@@ -129,7 +138,6 @@ func TestApp_Shutdown_NilServer(t *testing.T) {
func TestApp_Shutdown_ServerShutdownCalled(t *testing.T) {
t.Run("server is shut down and stops accepting connections", func(t *testing.T) {
// Use a real httptest server to get a valid listener
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {}))
listener := ts.Listener
ts.Close()
@@ -150,7 +158,6 @@ func TestApp_Shutdown_ServerShutdownCalled(t *testing.T) {
t.Fatalf("Shutdown returned unexpected error: %v", err)
}
// Verify the server is actually shut down by checking it no longer accepts connections
conn, err := net.Dial("tcp", listener.Addr().String())
if err == nil {
conn.Close()
@@ -215,7 +222,7 @@ func TestApp_Shutdown_ProbeSetNotReady(t *testing.T) {
Addr: listener.Addr().String(),
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {}),
},
Probe: probe,
Probe: probe,
Logger: logging.New(),
}
@@ -234,6 +241,8 @@ func TestApp_Shutdown_ProbeSetNotReady(t *testing.T) {
func TestNew_WithPostgresEnabled_InvalidDSN(t *testing.T) {
cfg := minimalHTTPConfig()
cfg.Runtime.Env = "production"
cfg.Webhook.Secret = "test-secret"
cfg.Postgres.Enabled = true
cfg.Postgres.DSN = "invalid_dsn_format"
cfg.Postgres.MaxOpenConns = 5
@@ -248,8 +257,9 @@ func TestNew_WithPostgresEnabled_InvalidDSN(t *testing.T) {
func TestNew_WithPostgresEnabled_MigrationFails(t *testing.T) {
cfg := minimalHTTPConfig()
cfg.Runtime.Env = "production"
cfg.Webhook.Secret = "test-secret"
cfg.Postgres.Enabled = true
// Point to a db that exists but migration dir doesn't exist
cfg.Postgres.DSN = "host=127.0.0.1 port=9999 user=postgres dbname=nonexistent password=nonexistent sslmode=disable"
cfg.Postgres.MigrationDir = "/nonexistent/migration/dir"
cfg.Postgres.MaxOpenConns = 5

View File

@@ -11,6 +11,11 @@ type Config struct {
HTTP HTTPConfig
Postgres PostgresConfig
Webhook WebhookConfig
Runtime RuntimeConfig
}
// RuntimeConfig describes the runtime environment the service is started in.
type RuntimeConfig struct {
// Env is the runtime environment name. Load fills it from AI_CS_RUNTIME_ENV,
// falling back to AI_CS_ENV and then to "development", passes the value
// through normalizeRuntimeEnv, and rejects anything other than
// "production", "development" or "test".
Env string
}
type HTTPConfig struct {
@@ -64,6 +69,9 @@ func Load() (*Config, error) {
SignatureHeader: getEnv("AI_CS_WEBHOOK_SIGNATURE_HEADER", "X-CS-Signature"),
MaxSkewSeconds: getEnvInt("AI_CS_WEBHOOK_MAX_SKEW_SECONDS", 300),
},
Runtime: RuntimeConfig{
Env: normalizeRuntimeEnv(getEnv("AI_CS_RUNTIME_ENV", getEnv("AI_CS_ENV", "development"))),
},
}
if strings.TrimSpace(cfg.HTTP.Addr) == "" {
return nil, fmt.Errorf("AI_CS_ADDR must not be empty")
@@ -77,9 +85,31 @@ func Load() (*Config, error) {
if cfg.Webhook.MaxSkewSeconds <= 0 {
return nil, fmt.Errorf("AI_CS_WEBHOOK_MAX_SKEW_SECONDS must be positive")
}
if cfg.Runtime.Env != "production" && cfg.Runtime.Env != "development" && cfg.Runtime.Env != "test" {
return nil, fmt.Errorf("AI_CS_RUNTIME_ENV must be one of production/development/test, got: %s", cfg.Runtime.Env)
}
if cfg.Runtime.Env == "production" && !cfg.Postgres.Enabled {
return nil, fmt.Errorf("AI_CS_RUNTIME_ENV=production requires AI_CS_POSTGRES_ENABLED=true, but it is false (memory fallback is not allowed in production)")
}
if cfg.Runtime.Env == "production" && strings.TrimSpace(cfg.Webhook.Secret) == "" {
return nil, fmt.Errorf("AI_CS_WEBHOOK_SECRET must not be empty in production")
}
return cfg, nil
}
// normalizeRuntimeEnv canonicalizes a raw runtime-environment value.
//
// The input is lower-cased and stripped of surrounding whitespace, then
// mapped to its canonical name:
//
//	"", "dev", "development" -> "development"
//	"prod", "production"     -> "production"
//	"test"                   -> "test"
//
// Any other value is returned in its lower-cased, trimmed form so the caller
// can validate it and include it in an error message.
func normalizeRuntimeEnv(value string) string {
	cleaned := strings.TrimSpace(strings.ToLower(value))

	// Only the aliases need rewriting; the canonical names ("development",
	// "production", "test") and unknown values pass through unchanged.
	if cleaned == "" || cleaned == "dev" {
		return "development"
	}
	if cleaned == "prod" {
		return "production"
	}
	return cleaned
}
func getEnv(key, fallback string) string {
if value := strings.TrimSpace(os.Getenv(key)); value != "" {
return value

View File

@@ -1,6 +1,9 @@
package config
import "testing"
import (
"strings"
"testing"
)
func TestGetEnvBool_True(t *testing.T) {
t.Setenv("TEST_BOOL", "true")
@@ -42,11 +45,19 @@ func TestGetEnvBool_Zero(t *testing.T) {
}
}
func TestGetEnvBool_InvalidValue(t *testing.T) {
// TestGetEnvBool_Yes checks that getEnvBool returns true for the value "yes"
// even when the fallback is false.
func TestGetEnvBool_Yes(t *testing.T) {
	const key = "TEST_BOOL"
	t.Setenv(key, "yes")

	if enabled := getEnvBool(key, false); !enabled {
		t.Error("getEnvBool(yes) = false, want true")
	}
}
func TestGetEnvBool_InvalidValueFallsBack(t *testing.T) {
t.Setenv("TEST_BOOL", "maybe")
got := getEnvBool("TEST_BOOL", true)
if !got {
t.Error("getEnvBool(yes) did not return fallback, got false, want true")
t.Error("getEnvBool(maybe) did not return fallback, got false, want true")
}
}
@@ -89,6 +100,9 @@ func TestLoadDefaults(t *testing.T) {
if cfg.Webhook.TimestampHeader != "X-CS-Timestamp" {
t.Fatalf("timestamp header = %s", cfg.Webhook.TimestampHeader)
}
if cfg.Runtime.Env != "development" {
t.Fatalf("runtime env = %s, want development", cfg.Runtime.Env)
}
}
func TestLoadOverride(t *testing.T) {
@@ -113,3 +127,84 @@ func TestLoadOverride(t *testing.T) {
t.Fatalf("skew = %d, want 60", cfg.Webhook.MaxSkewSeconds)
}
}
// TestLoad_RuntimeEnvFallsBackToLegacyEnv checks that, with AI_CS_RUNTIME_ENV
// empty, Load takes the runtime environment from AI_CS_ENV and normalizes the
// "prod" alias to "production". Postgres and the webhook secret are set so the
// production-only validation in Load passes.
func TestLoad_RuntimeEnvFallsBackToLegacyEnv(t *testing.T) {
	environment := [][2]string{
		{"AI_CS_RUNTIME_ENV", ""},
		{"AI_CS_ENV", "prod"},
		{"AI_CS_POSTGRES_ENABLED", "true"},
		{"AI_CS_POSTGRES_DSN", "postgres://user:***@localhost:5432/db?sslmode=disable"},
		{"AI_CS_WEBHOOK_SECRET", "secret"},
	}
	for _, pair := range environment {
		t.Setenv(pair[0], pair[1])
	}

	loaded, loadErr := Load()
	if loadErr != nil {
		t.Fatalf("Load() error = %v", loadErr)
	}
	if got := loaded.Runtime.Env; got != "production" {
		t.Fatalf("runtime env = %s, want production", got)
	}
}
// TestLoad_RuntimeEnvOverridesLegacyEnv checks that AI_CS_RUNTIME_ENV takes
// precedence over AI_CS_ENV when both are set.
func TestLoad_RuntimeEnvOverridesLegacyEnv(t *testing.T) {
	t.Setenv("AI_CS_ENV", "prod")
	t.Setenv("AI_CS_RUNTIME_ENV", "test")

	loaded, loadErr := Load()
	if loadErr != nil {
		t.Fatalf("Load() error = %v", loadErr)
	}
	if got := loaded.Runtime.Env; got != "test" {
		t.Fatalf("runtime env = %s, want test", got)
	}
}
// TestLoad_RuntimeEnvNormalizesAliases checks that the "dev" alias is
// reported by Load as the canonical "development" environment.
func TestLoad_RuntimeEnvNormalizesAliases(t *testing.T) {
	t.Setenv("AI_CS_RUNTIME_ENV", "dev")

	loaded, loadErr := Load()
	if loadErr != nil {
		t.Fatalf("Load() error = %v", loadErr)
	}
	if got := loaded.Runtime.Env; got != "development" {
		t.Fatalf("runtime env = %s, want development", got)
	}
}
// TestLoad_RejectsInvalidRuntimeEnv checks that Load fails for an
// unsupported environment name and that the error names AI_CS_RUNTIME_ENV.
func TestLoad_RejectsInvalidRuntimeEnv(t *testing.T) {
	t.Setenv("AI_CS_RUNTIME_ENV", "staging")

	_, loadErr := Load()
	switch {
	case loadErr == nil:
		t.Fatal("expected error for invalid runtime env")
	case !strings.Contains(loadErr.Error(), "AI_CS_RUNTIME_ENV"):
		t.Fatalf("unexpected error: %v", loadErr)
	}
}
// TestLoad_RejectsProdWhenPostgresDisabled checks that Load refuses a
// production environment with Postgres disabled and that the error mentions
// AI_CS_POSTGRES_ENABLED.
func TestLoad_RejectsProdWhenPostgresDisabled(t *testing.T) {
	t.Setenv("AI_CS_POSTGRES_ENABLED", "false")
	t.Setenv("AI_CS_RUNTIME_ENV", "prod")

	_, loadErr := Load()
	switch {
	case loadErr == nil:
		t.Fatal("expected error when prod runs without postgres")
	case !strings.Contains(loadErr.Error(), "AI_CS_POSTGRES_ENABLED"):
		t.Fatalf("unexpected error: %v", loadErr)
	}
}
// TestLoad_RejectsProdWhenWebhookSecretMissing checks that Load refuses a
// production environment with an empty webhook secret, even with Postgres
// enabled, and that the error mentions AI_CS_WEBHOOK_SECRET.
func TestLoad_RejectsProdWhenWebhookSecretMissing(t *testing.T) {
	environment := [][2]string{
		{"AI_CS_RUNTIME_ENV", "production"},
		{"AI_CS_POSTGRES_ENABLED", "true"},
		{"AI_CS_POSTGRES_DSN", "postgres://user:***@localhost:5432/db?sslmode=disable"},
		{"AI_CS_WEBHOOK_SECRET", ""},
	}
	for _, pair := range environment {
		t.Setenv(pair[0], pair[1])
	}

	_, loadErr := Load()
	switch {
	case loadErr == nil:
		t.Fatal("expected error when prod runs without webhook secret")
	case !strings.Contains(loadErr.Error(), "AI_CS_WEBHOOK_SECRET"):
		t.Fatalf("unexpected error: %v", loadErr)
	}
}

View File

@@ -31,6 +31,10 @@ func (h *HealthHandler) Live(w http.ResponseWriter, _ *http.Request) {
func (h *HealthHandler) Ready(w http.ResponseWriter, r *http.Request) {
ok, checks := h.evaluate(r.Context())
if h.probe != nil && !h.probe.IsReady() {
ok = false
checks = append([]health.CheckResult{{Name: "startup", Status: "DOWN", Error: "service not ready to receive traffic"}}, checks...)
}
if h.probe != nil {
h.probe.SetReady(ok)
}

View File

@@ -63,6 +63,7 @@ func TestHealthHandler_Ready_WithFailingChecker(t *testing.T) {
func TestHealthHandler_Ready_WithPassingChecker(t *testing.T) {
probe := health.NewProbe()
probe.SetLive(true)
probe.SetReady(true)
h := NewHealthHandler(probe, &passingHealthChecker{})
req := httptest.NewRequest(http.MethodGet, "/actuator/health/ready", nil)
@@ -73,6 +74,20 @@ func TestHealthHandler_Ready_WithPassingChecker(t *testing.T) {
}
}
// TestHealthHandler_Ready_ReturnsDownWhenProbeNotReady checks that the
// readiness endpoint answers 503 when the probe has been marked not ready,
// with a passingHealthChecker as the only registered checker.
func TestHealthHandler_Ready_ReturnsDownWhenProbeNotReady(t *testing.T) {
	notReady := health.NewProbe()
	notReady.SetLive(true)
	notReady.SetReady(false)
	handler := NewHealthHandler(notReady, &passingHealthChecker{})

	request := httptest.NewRequest(http.MethodGet, "/actuator/health/ready", nil)
	recorder := httptest.NewRecorder()
	handler.Ready(recorder, request)

	if got := recorder.Code; got != http.StatusServiceUnavailable {
		t.Errorf("Ready() with probe not ready status = %d, want 503", got)
	}
}
func TestHealthHandler_Health_ReturnsOK(t *testing.T) {
probe := health.NewProbe()
probe.SetLive(true)

View File

@@ -11,6 +11,7 @@ import (
func TestRouter_HealthEndpoint(t *testing.T) {
probe := health.NewProbe()
probe.SetReady(true)
h := handlers.NewHealthHandler(probe)
router := NewRouter(RouterDeps{Health: h})
@@ -38,6 +39,7 @@ func TestRouter_HealthEndpoint(t *testing.T) {
func TestRouter_UnknownPath_Returns404(t *testing.T) {
probe := health.NewProbe()
probe.SetReady(true)
h := handlers.NewHealthHandler(probe)
router := NewRouter(RouterDeps{Health: h})
@@ -64,6 +66,7 @@ func TestRouter_UnknownPath_Returns404(t *testing.T) {
func TestRouter_WebhookChannel_MissingChannel_Returns400(t *testing.T) {
probe := health.NewProbe()
probe.SetReady(true)
h := handlers.NewHealthHandler(probe)
router := NewRouter(RouterDeps{Health: h})
@@ -77,6 +80,7 @@ func TestRouter_WebhookChannel_MissingChannel_Returns400(t *testing.T) {
func TestRouter_WebhookPath_CanBeCalledWithGET(t *testing.T) {
probe := health.NewProbe()
probe.SetReady(true)
h := handlers.NewHealthHandler(probe)
router := NewRouter(RouterDeps{Health: h})
@@ -90,6 +94,7 @@ func TestRouter_WebhookPath_CanBeCalledWithGET(t *testing.T) {
func TestRouter_TicketsList_POST_Returns405(t *testing.T) {
probe := health.NewProbe()
probe.SetReady(true)
h := handlers.NewHealthHandler(probe)
ticketHandler := &handlers.TicketHandler{}
router := NewRouter(RouterDeps{Health: h, Tickets: ticketHandler})
@@ -104,6 +109,7 @@ func TestRouter_TicketsList_POST_Returns405(t *testing.T) {
func TestRouter_SessionsRoute_OnlyPOST(t *testing.T) {
probe := health.NewProbe()
probe.SetReady(true)
h := handlers.NewHealthHandler(probe)
router := NewRouter(RouterDeps{Health: h, Sessions: nil})
@@ -119,6 +125,7 @@ func TestRouter_TicketsSubpaths(t *testing.T) {
// Test that ticket subpaths are registered with Tickets != nil
// We use OPTIONS method to avoid triggering handler logic (which would panic with nil service)
probe := health.NewProbe()
probe.SetReady(true)
h := handlers.NewHealthHandler(probe)
ticketHandler := &handlers.TicketHandler{}
router := NewRouter(RouterDeps{Health: h, Tickets: ticketHandler})
@@ -148,6 +155,7 @@ func TestRouter_TicketsSubpaths(t *testing.T) {
func TestRouter_SessionsFeedbackHandoff(t *testing.T) {
// Test sessions routes are registered when Sessions != nil
probe := health.NewProbe()
probe.SetReady(true)
h := handlers.NewHealthHandler(probe)
sessionHandler := &handlers.SessionHandler{}
router := NewRouter(RouterDeps{Health: h, Sessions: sessionHandler})
@@ -173,6 +181,7 @@ func TestRouter_SessionsFeedbackHandoff(t *testing.T) {
func TestRouter_UnknownSessionsPath_Returns405(t *testing.T) {
probe := health.NewProbe()
probe.SetReady(true)
h := handlers.NewHealthHandler(probe)
sessionHandler := &handlers.SessionHandler{}
router := NewRouter(RouterDeps{Health: h, Sessions: sessionHandler})
@@ -188,6 +197,7 @@ func TestRouter_UnknownSessionsPath_Returns405(t *testing.T) {
func TestRouter_UnknownTicketsPath_Returns405(t *testing.T) {
probe := health.NewProbe()
probe.SetReady(true)
h := handlers.NewHealthHandler(probe)
ticketHandler := &handlers.TicketHandler{}
router := NewRouter(RouterDeps{Health: h, Tickets: ticketHandler})