feat(outbox): implement concurrent claim mechanism with UPDATE RETURNING + SKIP LOCKED

- Add migration 0004 to introduce 'claiming' status and timeout index
- Add StatusClaiming to platformevent domain and allow it in Validate()
- Rewrite ListDue as transactional UPDATE ... RETURNING with FOR UPDATE SKIP LOCKED
- Add ReleaseStaleClaims to reset expired claiming events back to retrying
- Worker Start() now runs a 30s ticker for stale claim recovery (5m timeout)
- Update stubEventStore in tests to satisfy new EventStore interface

Refs: D-02
This commit is contained in:
Your Name
2026-05-11 13:16:28 +08:00
parent 771304eabe
commit 34b175b130
5 changed files with 86 additions and 9 deletions

View File

@@ -0,0 +1,11 @@
-- Add 'claiming' status to outbox CHECK constraint and add claim timeout index
--
-- This migration performs three steps on cs_platform_event_outbox:
--   1. drop the existing status CHECK constraint (if it exists),
--   2. recreate it with 'claiming' added to the allowed values,
--   3. create a composite index on (status, updated_at).
-- NOTE(review): between steps 1 and 2 the table has no status constraint;
-- whether these statements run inside one transaction depends on the
-- migration runner — confirm.

-- Step 1: remove the old constraint so it can be redefined. IF EXISTS keeps
-- the statement from failing when the constraint is absent.
ALTER TABLE cs_platform_event_outbox
DROP CONSTRAINT IF EXISTS chk_cs_platform_event_outbox_status;
-- Step 2: recreate the constraint under the same name, now also accepting
-- 'claiming' alongside the four previously allowed statuses.
ALTER TABLE cs_platform_event_outbox
ADD CONSTRAINT chk_cs_platform_event_outbox_status
CHECK (status IN ('pending','retrying','delivered','dead_letter','claiming'));
-- Step 3: index on (status, updated_at); IF NOT EXISTS makes re-running safe.
-- NOTE(review): the index name suggests it is intended for finding
-- timed-out 'claiming' rows by updated_at, but it covers every status value —
-- confirm whether a partial index would be sufficient.
CREATE INDEX IF NOT EXISTS idx_cs_platform_event_outbox_claiming_timeout
ON cs_platform_event_outbox(status, updated_at);

View File

@@ -13,6 +13,7 @@ const (
StatusRetrying Status = "retrying" StatusRetrying Status = "retrying"
StatusDelivered Status = "delivered" StatusDelivered Status = "delivered"
StatusDeadLetter Status = "dead_letter" StatusDeadLetter Status = "dead_letter"
StatusClaiming Status = "claiming"
) )
const ( const (
@@ -60,7 +61,7 @@ func (e Event) Validate() error {
return fmt.Errorf("event type is required") return fmt.Errorf("event type is required")
} }
switch e.Status { switch e.Status {
case StatusPending, StatusRetrying, StatusDelivered, StatusDeadLetter: case StatusPending, StatusRetrying, StatusDelivered, StatusDeadLetter, StatusClaiming:
default: default:
return fmt.Errorf("invalid status: %s", e.Status) return fmt.Errorf("invalid status: %s", e.Status)
} }

View File

@@ -20,6 +20,7 @@ type EventStore interface {
RecordDeliveryAttempt(ctx context.Context, eventID string, attemptNo int, responseStatus int, responseBody string, errorMessage string) error RecordDeliveryAttempt(ctx context.Context, eventID string, attemptNo int, responseStatus int, responseBody string, errorMessage string) error
MarkRetry(ctx context.Context, eventID string, attemptCount int, nextAttemptAt time.Time, lastError string) error MarkRetry(ctx context.Context, eventID string, attemptCount int, nextAttemptAt time.Time, lastError string) error
MarkDeadLetter(ctx context.Context, eventID string, attemptCount int, finalError string) error MarkDeadLetter(ctx context.Context, eventID string, attemptCount int, finalError string) error
ReleaseStaleClaims(ctx context.Context, timeout time.Duration) (int, error)
} }
type Worker struct { type Worker struct {
@@ -31,6 +32,7 @@ type Worker struct {
MaxRetries int MaxRetries int
BatchSize int BatchSize int
PollInterval time.Duration PollInterval time.Duration
ClaimTimeout time.Duration
RetrySchedule []time.Duration RetrySchedule []time.Duration
Now func() time.Time Now func() time.Time
Logger *slog.Logger Logger *slog.Logger
@@ -52,6 +54,7 @@ func NewWorker(platform, callbackURL string, store EventStore, client *http.Clie
MaxRetries: maxRetries, MaxRetries: maxRetries,
BatchSize: 20, BatchSize: 20,
PollInterval: 5 * time.Second, PollInterval: 5 * time.Second,
ClaimTimeout: 5 * time.Minute,
RetrySchedule: []time.Duration{10 * time.Second, 30 * time.Second, 60 * time.Second, 5 * time.Minute, 15 * time.Minute}, RetrySchedule: []time.Duration{10 * time.Second, 30 * time.Second, 60 * time.Second, 5 * time.Minute, 15 * time.Minute},
Now: time.Now, Now: time.Now,
} }
@@ -63,6 +66,8 @@ func (w *Worker) Start(ctx context.Context) {
} }
ticker := time.NewTicker(w.pollInterval()) ticker := time.NewTicker(w.pollInterval())
defer ticker.Stop() defer ticker.Stop()
claimTicker := time.NewTicker(30 * time.Second)
defer claimTicker.Stop()
for { for {
select { select {
case <-ctx.Done(): case <-ctx.Done():
@@ -77,6 +82,16 @@ func (w *Worker) Start(ctx context.Context) {
return return
case <-ticker.C: case <-ticker.C:
} }
select {
case <-ctx.Done():
return
case <-claimTicker.C:
if w.Store != nil {
if _, err := w.Store.ReleaseStaleClaims(ctx, w.claimTimeout()); err != nil && w.Logger != nil {
w.Logger.Error("release stale claims failed", "platform", w.Platform, "error", err.Error())
}
}
}
} }
} }
@@ -169,6 +184,13 @@ func (w *Worker) pollInterval() time.Duration {
return w.PollInterval return w.PollInterval
} }
// claimTimeout returns the worker's configured ClaimTimeout. When the field
// is zero or negative it substitutes a default of five minutes.
func (w *Worker) claimTimeout() time.Duration {
	if configured := w.ClaimTimeout; configured > 0 {
		return configured
	}
	return 5 * time.Minute
}
func (w *Worker) now() time.Time { func (w *Worker) now() time.Time {
if w.Now == nil { if w.Now == nil {
return time.Now() return time.Now()

View File

@@ -64,6 +64,10 @@ func (s *stubEventStore) MarkDeadLetter(_ context.Context, eventID string, attem
return nil return nil
} }
// ReleaseStaleClaims is a no-op on the stub store: both arguments are
// ignored and it always reports zero released events with a nil error.
func (s *stubEventStore) ReleaseStaleClaims(_ context.Context, _ time.Duration) (int, error) {
	const released = 0
	return released, nil
}
func TestWorker_ShouldDeliverPendingEventToCallbackServer(t *testing.T) { func TestWorker_ShouldDeliverPendingEventToCallbackServer(t *testing.T) {
now := time.Now().UTC().Truncate(time.Second) now := time.Now().UTC().Truncate(time.Second)
store := &stubEventStore{ store := &stubEventStore{

View File

@@ -75,14 +75,30 @@ func (s *PlatformEventStore) ListDue(ctx context.Context, platform string, dueBe
if platform == "" { if platform == "" {
return nil, fmt.Errorf("platform is required") return nil, fmt.Errorf("platform is required")
} }
rows, err := s.db.QueryContext(ctx, `
SELECT id, platform, event_type, COALESCE(session_id::text, ''), COALESCE(ticket_id::text, ''), COALESCE(source_message_id, ''), tx, err := s.db.BeginTx(ctx, nil)
payload, status, attempt_count, next_attempt_at, occurred_at, created_at, updated_at, if err != nil {
delivered_at, COALESCE(last_error, '') return nil, err
FROM cs_platform_event_outbox }
defer func() {
if err != nil {
_ = tx.Rollback()
}
}()
rows, err := tx.QueryContext(ctx, `
UPDATE cs_platform_event_outbox
SET status = 'claiming', updated_at = NOW()
WHERE id IN (
SELECT id FROM cs_platform_event_outbox
WHERE platform = $1 AND status IN ('pending','retrying') AND next_attempt_at <= $2 WHERE platform = $1 AND status IN ('pending','retrying') AND next_attempt_at <= $2
ORDER BY next_attempt_at ASC, occurred_at ASC, created_at ASC, id ASC ORDER BY next_attempt_at ASC, occurred_at ASC, created_at ASC, id ASC
LIMIT $3 LIMIT $3
FOR UPDATE SKIP LOCKED
)
RETURNING id, platform, event_type, COALESCE(session_id::text, ''), COALESCE(ticket_id::text, ''), COALESCE(source_message_id, ''),
payload, status, attempt_count, next_attempt_at, occurred_at, created_at, updated_at,
delivered_at, COALESCE(last_error, '')
`, platform, dueBefore, limit) `, platform, dueBefore, limit)
if err != nil { if err != nil {
return nil, err return nil, err
@@ -126,9 +142,32 @@ func (s *PlatformEventStore) ListDue(ctx context.Context, platform string, dueBe
if err := rows.Err(); err != nil { if err := rows.Err(); err != nil {
return nil, err return nil, err
} }
if err := tx.Commit(); err != nil {
return nil, err
}
return events, nil return events, nil
} }
// ReleaseStaleClaims resets outbox rows that have stayed in the 'claiming'
// status for longer than timeout back to 'retrying', refreshing updated_at,
// and returns how many rows were changed.
//
// Staleness is judged with the database clock: a row qualifies when its
// updated_at is older than NOW() minus timeout.
//
// timeout must be positive. A zero or negative value would match every
// 'claiming' row, including ones claimed a moment ago, so it is rejected
// with an error instead of being sent to the database.
//
// An error is also returned when the store has no database handle, when the
// UPDATE fails, or when the driver cannot report the affected row count.
func (s *PlatformEventStore) ReleaseStaleClaims(ctx context.Context, timeout time.Duration) (int, error) {
	if s.db == nil {
		return 0, fmt.Errorf("db is nil")
	}
	if timeout <= 0 {
		return 0, fmt.Errorf("claim timeout must be positive, got %s", timeout)
	}

	// The timeout is passed as a number of seconds and converted to an
	// interval on the server with make_interval. Binding a Go float64
	// directly to an interval-typed placeholder only works with drivers that
	// send parameters as text; an explicit double precision parameter is
	// accepted regardless of how the driver encodes it.
	res, err := s.db.ExecContext(ctx, `
		UPDATE cs_platform_event_outbox
		SET status = 'retrying', updated_at = NOW()
		WHERE status = 'claiming'
		  AND updated_at < NOW() - make_interval(secs => $1::double precision)
	`, timeout.Seconds())
	if err != nil {
		return 0, fmt.Errorf("release stale claims: %w", err)
	}

	released, err := res.RowsAffected()
	if err != nil {
		return 0, fmt.Errorf("release stale claims: rows affected: %w", err)
	}
	return int(released), nil
}
func (s *PlatformEventStore) MarkDelivered(ctx context.Context, eventID string, deliveredAt time.Time) error { func (s *PlatformEventStore) MarkDelivered(ctx context.Context, eventID string, deliveredAt time.Time) error {
if s.db == nil { if s.db == nil {
return fmt.Errorf("db is nil") return fmt.Errorf("db is nil")