feat(intraday): add discovery and verification watch pipeline
This commit is contained in:
410
scripts/discover_intraday_news_candidates.go
Normal file
410
scripts/discover_intraday_news_candidates.go
Normal file
@@ -0,0 +1,410 @@
|
||||
//go:build llm_script
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
_ "github.com/lib/pq"
|
||||
)
|
||||
|
||||
// intradayNewsCandidate is a single candidate event assembled from LLM output
// (optionally matched against search results) and written to the
// intraday_news_candidate table by upsertIntradayCandidates.
type intradayNewsCandidate struct {
	// Discovery date and event category.
	CandidateDate, EventType string

	// Provider, model and provider country the event refers to.
	ProviderName, ModelName, ProviderCountry string

	// Headline and description.
	Title, Summary string

	// Source links backing the candidate; candidates without any are dropped
	// by normalizeIntradayCandidates.
	CandidateURLs []string

	// Provenance: DiscoverySource is "llm_answer" or "web_search+llm";
	// DiscoveryQuery is filled from the title of a matching search record.
	DiscoverySource, DiscoveryQuery string

	// Raw records kept under the "llm_record" and "search_record" keys.
	DiscoveryEvidence map[string]any

	// Dedup key produced by buildIntradayNormalizedKey.
	NormalizedKey string

	// Verification state; candidateFromLLMRecord initialises Status and
	// VerificationConfidence to "candidate".
	Status, VerificationConfidence, VerificationNotes string
}
|
||||
|
||||
type intradayDiscoveryConfig struct {
|
||||
Date string
|
||||
DryRun bool
|
||||
Search intradayProviderConfig
|
||||
LLM intradayProviderConfig
|
||||
DatabaseURL string
|
||||
Timeout time.Duration
|
||||
ProviderLimit int
|
||||
}
|
||||
|
||||
// intradayDiscoverySummary is the JSON report printed at the end of a run
// (see printIntradayDiscoverySummary).
type intradayDiscoverySummary struct {
	// CandidateTotal is the number of deduplicated candidates.
	CandidateTotal int `json:"candidate_total"`
	// ProviderHitCount is the number of distinct non-empty provider names.
	ProviderHitCount int `json:"provider_hit_count"`
	// EventTypeCounts maps each event type to its candidate count.
	EventTypeCounts map[string]int `json:"event_type_counts"`
	// DiscoverySourceSet lists the distinct discovery sources, sorted.
	DiscoverySourceSet []string `json:"discovery_source_set"`
	// DryRun echoes whether the run skipped the database write.
	DryRun bool `json:"dry_run"`
}
|
||||
|
||||
// intradayDiscoveryLogger writes JSON-formatted records at Info level and
// above to stderr.
var intradayDiscoveryLogger = slog.New(
	slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo}),
)
|
||||
|
||||
func main() {
|
||||
loadIntradayEnv()
|
||||
cfg := loadIntradayDiscoveryConfig()
|
||||
if err := runIntradayCandidateDiscovery(cfg); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "discover_intraday_news_candidates: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
func loadIntradayDiscoveryConfig() intradayDiscoveryConfig {
|
||||
var cfg intradayDiscoveryConfig
|
||||
flag.StringVar(&cfg.Date, "date", intradayDateValue(), "候选发现日期,格式 YYYY-MM-DD")
|
||||
flag.BoolVar(&cfg.DryRun, "dry-run", false, "仅输出摘要,不写数据库")
|
||||
flag.IntVar(&cfg.ProviderLimit, "provider-limit", 10, "最大 provider 数")
|
||||
flag.Parse()
|
||||
|
||||
cfg.DatabaseURL = intradayDefaultDSN()
|
||||
cfg.Timeout = discoveryTimeoutFromEnv()
|
||||
cfg.Search = intradayProviderConfig{
|
||||
Mode: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_SEARCH_PROVIDER")),
|
||||
Command: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_SEARCH_COMMAND")),
|
||||
URL: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_SEARCH_URL")),
|
||||
Fixture: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_SEARCH_FIXTURE")),
|
||||
Timeout: cfg.Timeout,
|
||||
}
|
||||
cfg.LLM = intradayProviderConfig{
|
||||
Mode: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_LLM_PROVIDER")),
|
||||
Command: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_LLM_COMMAND")),
|
||||
URL: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_LLM_URL")),
|
||||
Fixture: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_LLM_FIXTURE")),
|
||||
Timeout: cfg.Timeout,
|
||||
}
|
||||
return cfg
|
||||
}
|
||||
|
||||
func runIntradayCandidateDiscovery(cfg intradayDiscoveryConfig) error {
|
||||
if strings.TrimSpace(cfg.Date) == "" {
|
||||
return fmt.Errorf("date 未设置")
|
||||
}
|
||||
if err := validateIntradayProviderConfig("search", cfg.Search); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := validateIntradayProviderConfig("llm", cfg.LLM); err != nil {
|
||||
return err
|
||||
}
|
||||
queries := buildIntradayQueries(cfg.Date, cfg.ProviderLimit)
|
||||
searchRecords, err := loadIntradaySearchRecords(cfg.Search, cfg.Date, queries)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
llmRecords, err := loadIntradayLLMRecords(cfg.LLM, cfg.Date, searchRecords)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
candidates := normalizeIntradayCandidates(cfg.Date, searchRecords, llmRecords)
|
||||
summary := summarizeIntradayCandidates(candidates, cfg.DryRun)
|
||||
if cfg.DryRun {
|
||||
return printIntradayDiscoverySummary(summary)
|
||||
}
|
||||
|
||||
db, err := sql.Open("postgres", cfg.DatabaseURL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("open db: %w", err)
|
||||
}
|
||||
defer db.Close()
|
||||
if err := upsertIntradayCandidates(context.Background(), db, candidates); err != nil {
|
||||
return err
|
||||
}
|
||||
return printIntradayDiscoverySummary(summary)
|
||||
}
|
||||
|
||||
func validateIntradayProviderConfig(name string, cfg intradayProviderConfig) error {
|
||||
if strings.TrimSpace(cfg.Mode) == "" {
|
||||
return fmt.Errorf("%s provider 未设置", name)
|
||||
}
|
||||
switch cfg.Mode {
|
||||
case "fixture":
|
||||
if strings.TrimSpace(cfg.Fixture) == "" {
|
||||
return fmt.Errorf("%s provider fixture 未设置", name)
|
||||
}
|
||||
case "command_json":
|
||||
if strings.TrimSpace(cfg.Command) == "" {
|
||||
return fmt.Errorf("%s provider command 未设置", name)
|
||||
}
|
||||
case "http_json":
|
||||
if strings.TrimSpace(cfg.URL) == "" {
|
||||
return fmt.Errorf("%s provider url 未设置", name)
|
||||
}
|
||||
default:
|
||||
return fmt.Errorf("%s provider mode 不支持: %s", name, cfg.Mode)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// buildIntradayQueries returns one search query per (provider, keyword set)
// pair, each formatted as "<date> <provider> <keywords>". A positive
// providerLimit smaller than the built-in provider list keeps only the first
// providerLimit providers; any other value keeps all of them.
func buildIntradayQueries(date string, providerLimit int) []string {
	providers := []string{
		"OpenAI", "Anthropic", "Google Gemini", "xAI", "DeepSeek",
		"DashScope", "Qwen", "智谱", "百度文心", "腾讯混元", "火山方舟", "MiniMax",
	}
	keywords := []string{"pricing release announcement", "模型 降价 发布 活动"}

	count := len(providers)
	if providerLimit > 0 && providerLimit < count {
		count = providerLimit
	}

	queries := make([]string, 0, count*len(keywords))
	for _, provider := range providers[:count] {
		for _, keyword := range keywords {
			query := date + " " + provider + " " + keyword
			queries = append(queries, strings.TrimSpace(query))
		}
	}
	return queries
}
|
||||
|
||||
func normalizeIntradayCandidates(date string, searchRecords []intradaySearchRecord, llmRecords []intradayLLMRecord) []intradayNewsCandidate {
|
||||
searchIndex := indexSearchRecordsByURL(searchRecords)
|
||||
candidatesByKey := map[string]intradayNewsCandidate{}
|
||||
for _, record := range llmRecords {
|
||||
candidate := candidateFromLLMRecord(date, record, searchIndex)
|
||||
if len(candidate.CandidateURLs) == 0 {
|
||||
continue
|
||||
}
|
||||
if candidate.ProviderName == "" {
|
||||
candidate.ProviderName = inferProviderFromTitle(candidate.Title)
|
||||
}
|
||||
candidate.EventType = normalizeIntradayEventType(candidate.EventType)
|
||||
candidate.NormalizedKey = buildIntradayNormalizedKey(candidate)
|
||||
mergeIntradayCandidate(candidatesByKey, candidate)
|
||||
}
|
||||
result := make([]intradayNewsCandidate, 0, len(candidatesByKey))
|
||||
for _, candidate := range candidatesByKey {
|
||||
result = append(result, candidate)
|
||||
}
|
||||
sort.Slice(result, func(i, j int) bool {
|
||||
if result[i].ProviderName != result[j].ProviderName {
|
||||
return result[i].ProviderName < result[j].ProviderName
|
||||
}
|
||||
if result[i].EventType != result[j].EventType {
|
||||
return result[i].EventType < result[j].EventType
|
||||
}
|
||||
return result[i].NormalizedKey < result[j].NormalizedKey
|
||||
})
|
||||
return result
|
||||
}
|
||||
|
||||
func candidateFromLLMRecord(date string, record intradayLLMRecord, searchIndex map[string]intradaySearchRecord) intradayNewsCandidate {
|
||||
candidate := intradayNewsCandidate{
|
||||
CandidateDate: date,
|
||||
EventType: record.EventType,
|
||||
ProviderName: strings.TrimSpace(record.ProviderName),
|
||||
ModelName: strings.TrimSpace(record.ModelName),
|
||||
ProviderCountry: strings.TrimSpace(record.ProviderCountry),
|
||||
Title: strings.TrimSpace(record.Title),
|
||||
Summary: strings.TrimSpace(record.Summary),
|
||||
CandidateURLs: dedupeStrings(record.CandidateURLs),
|
||||
DiscoverySource: "llm_answer",
|
||||
DiscoveryEvidence: map[string]any{"llm_record": record},
|
||||
Status: "candidate",
|
||||
VerificationConfidence: "candidate",
|
||||
}
|
||||
for _, url := range candidate.CandidateURLs {
|
||||
if searchRecord, ok := searchIndex[url]; ok {
|
||||
candidate.DiscoverySource = "web_search+llm"
|
||||
candidate.DiscoveryQuery = searchRecord.Title
|
||||
candidate.DiscoveryEvidence["search_record"] = searchRecord
|
||||
if candidate.ProviderName == "" {
|
||||
candidate.ProviderName = strings.TrimSpace(searchRecord.Provider)
|
||||
}
|
||||
if candidate.Title == "" {
|
||||
candidate.Title = strings.TrimSpace(searchRecord.Title)
|
||||
}
|
||||
if candidate.Summary == "" {
|
||||
candidate.Summary = strings.TrimSpace(searchRecord.Summary)
|
||||
}
|
||||
}
|
||||
}
|
||||
return candidate
|
||||
}
|
||||
|
||||
func indexSearchRecordsByURL(records []intradaySearchRecord) map[string]intradaySearchRecord {
|
||||
indexed := make(map[string]intradaySearchRecord, len(records))
|
||||
for _, record := range records {
|
||||
url := strings.TrimSpace(record.URL)
|
||||
if url == "" {
|
||||
continue
|
||||
}
|
||||
indexed[url] = record
|
||||
}
|
||||
return indexed
|
||||
}
|
||||
|
||||
func mergeIntradayCandidate(target map[string]intradayNewsCandidate, candidate intradayNewsCandidate) {
|
||||
if candidate.NormalizedKey == "" {
|
||||
return
|
||||
}
|
||||
existing, ok := target[candidate.NormalizedKey]
|
||||
if !ok {
|
||||
target[candidate.NormalizedKey] = candidate
|
||||
return
|
||||
}
|
||||
merged := existing
|
||||
merged.CandidateURLs = dedupeStrings(append(existing.CandidateURLs, candidate.CandidateURLs...))
|
||||
if strings.TrimSpace(merged.Summary) == "" {
|
||||
merged.Summary = candidate.Summary
|
||||
}
|
||||
if strings.TrimSpace(merged.ProviderCountry) == "" {
|
||||
merged.ProviderCountry = candidate.ProviderCountry
|
||||
}
|
||||
if merged.DiscoverySource != candidate.DiscoverySource && candidate.DiscoverySource != "" {
|
||||
merged.DiscoverySource = "web_search+llm"
|
||||
}
|
||||
if merged.DiscoveryEvidence == nil {
|
||||
merged.DiscoveryEvidence = map[string]any{}
|
||||
}
|
||||
if llmRecord, ok := candidate.DiscoveryEvidence["llm_record"]; ok {
|
||||
merged.DiscoveryEvidence["llm_record"] = llmRecord
|
||||
}
|
||||
if searchRecord, ok := candidate.DiscoveryEvidence["search_record"]; ok {
|
||||
merged.DiscoveryEvidence["search_record"] = searchRecord
|
||||
}
|
||||
target[candidate.NormalizedKey] = merged
|
||||
}
|
||||
|
||||
func buildIntradayNormalizedKey(candidate intradayNewsCandidate) string {
|
||||
provider := normalizeWord(candidate.ProviderName)
|
||||
model := normalizeWord(candidate.ModelName)
|
||||
if model == "" {
|
||||
model = normalizeWord(candidate.Title)
|
||||
}
|
||||
return strings.Join([]string{
|
||||
candidate.CandidateDate,
|
||||
normalizeWord(candidate.EventType),
|
||||
provider,
|
||||
model,
|
||||
}, "|")
|
||||
}
|
||||
|
||||
|
||||
func summarizeIntradayCandidates(candidates []intradayNewsCandidate, dryRun bool) intradayDiscoverySummary {
|
||||
eventTypeCounts := make(map[string]int)
|
||||
providerSet := map[string]struct{}{}
|
||||
sourceSet := map[string]struct{}{}
|
||||
for _, candidate := range candidates {
|
||||
eventTypeCounts[candidate.EventType]++
|
||||
if candidate.ProviderName != "" {
|
||||
providerSet[candidate.ProviderName] = struct{}{}
|
||||
}
|
||||
if candidate.DiscoverySource != "" {
|
||||
sourceSet[candidate.DiscoverySource] = struct{}{}
|
||||
}
|
||||
}
|
||||
sources := make([]string, 0, len(sourceSet))
|
||||
for source := range sourceSet {
|
||||
sources = append(sources, source)
|
||||
}
|
||||
sort.Strings(sources)
|
||||
return intradayDiscoverySummary{
|
||||
CandidateTotal: len(candidates),
|
||||
ProviderHitCount: len(providerSet),
|
||||
EventTypeCounts: eventTypeCounts,
|
||||
DiscoverySourceSet: sources,
|
||||
DryRun: dryRun,
|
||||
}
|
||||
}
|
||||
|
||||
func printIntradayDiscoverySummary(summary intradayDiscoverySummary) error {
|
||||
payload, err := json.Marshal(summary)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Println(string(payload))
|
||||
return nil
|
||||
}
|
||||
|
||||
func upsertIntradayCandidates(ctx context.Context, db *sql.DB, candidates []intradayNewsCandidate) error {
|
||||
if db == nil {
|
||||
return fmt.Errorf("db is nil")
|
||||
}
|
||||
for _, candidate := range candidates {
|
||||
urls, err := json.Marshal(candidate.CandidateURLs)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal candidate urls: %w", err)
|
||||
}
|
||||
evidence, err := json.Marshal(candidate.DiscoveryEvidence)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal discovery evidence: %w", err)
|
||||
}
|
||||
_, err = db.ExecContext(ctx, `
|
||||
INSERT INTO intraday_news_candidate (
|
||||
candidate_date, event_type, provider_name, model_name, provider_country,
|
||||
title, summary, candidate_urls, discovery_source, discovery_query,
|
||||
discovery_evidence, normalized_key, status, verification_confidence, verification_notes
|
||||
) VALUES (
|
||||
$1::date, $2, $3, NULLIF($4, ''), NULLIF($5, ''),
|
||||
$6, NULLIF($7, ''), $8::jsonb, $9, NULLIF($10, ''),
|
||||
$11::jsonb, $12, $13, $14, NULLIF($15, '')
|
||||
)
|
||||
ON CONFLICT (normalized_key) DO UPDATE SET
|
||||
title = EXCLUDED.title,
|
||||
summary = COALESCE(NULLIF(EXCLUDED.summary, ''), intraday_news_candidate.summary),
|
||||
candidate_urls = EXCLUDED.candidate_urls,
|
||||
discovery_source = EXCLUDED.discovery_source,
|
||||
discovery_query = COALESCE(NULLIF(EXCLUDED.discovery_query, ''), intraday_news_candidate.discovery_query),
|
||||
discovery_evidence = EXCLUDED.discovery_evidence,
|
||||
provider_country = COALESCE(NULLIF(EXCLUDED.provider_country, ''), intraday_news_candidate.provider_country),
|
||||
updated_at = CURRENT_TIMESTAMP`,
|
||||
candidate.CandidateDate,
|
||||
candidate.EventType,
|
||||
candidate.ProviderName,
|
||||
candidate.ModelName,
|
||||
candidate.ProviderCountry,
|
||||
candidate.Title,
|
||||
candidate.Summary,
|
||||
string(urls),
|
||||
candidate.DiscoverySource,
|
||||
candidate.DiscoveryQuery,
|
||||
string(evidence),
|
||||
candidate.NormalizedKey,
|
||||
candidate.Status,
|
||||
candidate.VerificationConfidence,
|
||||
candidate.VerificationNotes,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("upsert intraday candidate %s: %w", candidate.NormalizedKey, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// inferProviderFromTitle guesses a provider name from a headline by
// case-insensitive substring match. Entries are tried in order and the first
// match wins; an empty string means no known provider was found.
func inferProviderFromTitle(title string) string {
	// Each entry is {substring to look for, provider name to return}.
	known := [...][2]string{
		{"openai", "OpenAI"},
		{"anthropic", "Anthropic"},
		{"gemini", "Google"},
		{"deepseek", "DeepSeek"},
		{"qwen", "Qwen"},
		{"dashscope", "DashScope"},
		{"xai", "xAI"},
		{"minimax", "MiniMax"},
		{"智谱", "智谱"},
		{"百度", "百度"},
		{"腾讯", "腾讯"},
	}
	haystack := strings.ToLower(title)
	for i := range known {
		if strings.Contains(haystack, known[i][0]) {
			return known[i][1]
		}
	}
	return ""
}
|
||||
|
||||
127
scripts/discover_intraday_news_candidates_test.go
Normal file
127
scripts/discover_intraday_news_candidates_test.go
Normal file
@@ -0,0 +1,127 @@
|
||||
//go:build llm_script
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestLoadIntradaySearchRecordsFromFixture(t *testing.T) {
|
||||
cfg := intradayProviderConfig{
|
||||
Mode: "fixture",
|
||||
Fixture: filepath.Join("testdata", "intraday_discovery_search_sample.json"),
|
||||
}
|
||||
records, err := loadIntradaySearchRecords(cfg, "2026-05-25", []string{"OpenAI pricing release"})
|
||||
if err != nil {
|
||||
t.Fatalf("loadIntradaySearchRecords 返回错误: %v", err)
|
||||
}
|
||||
if len(records) != 2 {
|
||||
t.Fatalf("搜索样例条数错误: got=%d", len(records))
|
||||
}
|
||||
if records[0].URL == "" || records[0].Provider == "" {
|
||||
t.Fatalf("搜索样例未保留 URL/provider: %+v", records[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadIntradayLLMRecordsFromFixture(t *testing.T) {
|
||||
cfg := intradayProviderConfig{
|
||||
Mode: "fixture",
|
||||
Fixture: filepath.Join("testdata", "intraday_discovery_llm_sample.json"),
|
||||
}
|
||||
records, err := loadIntradayLLMRecords(cfg, "2026-05-25", nil)
|
||||
if err != nil {
|
||||
t.Fatalf("loadIntradayLLMRecords 返回错误: %v", err)
|
||||
}
|
||||
if len(records) != 2 {
|
||||
t.Fatalf("LLM 样例条数错误: got=%d", len(records))
|
||||
}
|
||||
if records[0].EventType != "official_release" {
|
||||
t.Fatalf("LLM 事件类型错误: %+v", records[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeIntradayCandidatesDedupesEquivalentEvents(t *testing.T) {
|
||||
searchRecords := []intradaySearchRecord{{
|
||||
Title: "OpenAI announces GPT-5.6 preview pricing update",
|
||||
Summary: "Search summary",
|
||||
URL: "https://openai.example.com/news/gpt-5-6-pricing",
|
||||
Provider: "OpenAI",
|
||||
}}
|
||||
llmRecords := []intradayLLMRecord{
|
||||
{
|
||||
EventType: "official_release",
|
||||
ProviderName: "OpenAI",
|
||||
ModelName: "GPT-5.6",
|
||||
ProviderCountry: "US",
|
||||
Title: "GPT-5.6 preview pricing update",
|
||||
Summary: "First summary",
|
||||
CandidateURLs: []string{"https://openai.example.com/news/gpt-5-6-pricing"},
|
||||
},
|
||||
{
|
||||
EventType: "official_release",
|
||||
ProviderName: "OpenAI",
|
||||
ModelName: "GPT 5.6",
|
||||
ProviderCountry: "US",
|
||||
Title: "OpenAI GPT 5.6 preview pricing update",
|
||||
Summary: "Second summary",
|
||||
CandidateURLs: []string{"https://openai.example.com/news/gpt-5-6-pricing"},
|
||||
},
|
||||
}
|
||||
candidates := normalizeIntradayCandidates("2026-05-25", searchRecords, llmRecords)
|
||||
if len(candidates) != 1 {
|
||||
t.Fatalf("期望去重后只剩 1 条候选, got=%d", len(candidates))
|
||||
}
|
||||
if candidates[0].DiscoverySource != "web_search+llm" {
|
||||
t.Fatalf("期望 discovery source 合并, got=%q", candidates[0].DiscoverySource)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeIntradayCandidatesDropsURLlessRecords(t *testing.T) {
|
||||
llmRecords := []intradayLLMRecord{{
|
||||
EventType: "promo_campaign",
|
||||
ProviderName: "DeepSeek",
|
||||
ModelName: "DeepSeek-V4-Flash",
|
||||
Title: "No URL candidate",
|
||||
Summary: "Should be dropped",
|
||||
}}
|
||||
candidates := normalizeIntradayCandidates("2026-05-25", nil, llmRecords)
|
||||
if len(candidates) != 0 {
|
||||
t.Fatalf("无 URL 候选应被丢弃, got=%d", len(candidates))
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateIntradayProviderConfigRequiresCommandOrURLOrFixture(t *testing.T) {
|
||||
if err := validateIntradayProviderConfig("search", intradayProviderConfig{Mode: "command_json"}); err == nil {
|
||||
t.Fatal("缺少 command 时应报错")
|
||||
}
|
||||
if err := validateIntradayProviderConfig("llm", intradayProviderConfig{Mode: "http_json"}); err == nil {
|
||||
t.Fatal("缺少 url 时应报错")
|
||||
}
|
||||
if err := validateIntradayProviderConfig("search", intradayProviderConfig{Mode: "fixture", Fixture: "fixture.json"}); err != nil {
|
||||
t.Fatalf("fixture provider 不应报错: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildIntradayNormalizedKeyUsesProviderModelAndDate(t *testing.T) {
|
||||
key := buildIntradayNormalizedKey(intradayNewsCandidate{
|
||||
CandidateDate: "2026-05-25",
|
||||
EventType: "official_release",
|
||||
ProviderName: "OpenAI",
|
||||
ModelName: "GPT-5.6",
|
||||
})
|
||||
if !strings.Contains(key, "2026-05-25") || !strings.Contains(key, "openai") || !strings.Contains(key, "gpt-5-6") {
|
||||
t.Fatalf("normalized key 不符合预期: %q", key)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpsertIntradayCandidatesRequiresDB(t *testing.T) {
|
||||
var db *sql.DB
|
||||
err := upsertIntradayCandidates(context.Background(), db, nil)
|
||||
if err == nil {
|
||||
t.Fatal("nil db 时应报错")
|
||||
}
|
||||
}
|
||||
111
scripts/intraday_discovery_common.go
Normal file
111
scripts/intraday_discovery_common.go
Normal file
@@ -0,0 +1,111 @@
|
||||
//go:build llm_script
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// loadIntradayEnv reads KEY=VALUE pairs from ".env.local" and then ".env" in
// the working directory and exports them into the process environment.
// Unreadable files, blank lines, "#" comments, lines without "=" and lines
// with an empty key are skipped. Surrounding quotes are stripped from values.
// A variable that is already set is never overwritten, so the real
// environment wins over ".env.local", which wins over ".env".
func loadIntradayEnv() {
	for _, file := range []string{".env.local", ".env"} {
		content, err := os.ReadFile(file)
		if err != nil {
			continue
		}
		for _, raw := range strings.Split(string(content), "\n") {
			entry := strings.TrimSpace(raw)
			if entry == "" || strings.HasPrefix(entry, "#") {
				continue
			}
			name, rest, found := strings.Cut(entry, "=")
			name = strings.TrimSpace(name)
			if !found || name == "" {
				continue
			}
			if _, present := os.LookupEnv(name); !present {
				// Best effort: a failing Setenv is ignored, as before.
				_ = os.Setenv(name, strings.Trim(strings.TrimSpace(rest), `"'`))
			}
		}
	}
}
|
||||
|
||||
// intradayDefaultDSN returns DATABASE_URL when it is set to a non-empty
// value, otherwise the built-in local PostgreSQL socket DSN.
func intradayDefaultDSN() string {
	const fallback = "postgres://long@/llm_intelligence?host=/var/run/postgresql"
	configured := os.Getenv("DATABASE_URL")
	if configured == "" {
		return fallback
	}
	return configured
}
|
||||
|
||||
// intradayDateValue returns the trimmed REPORT_DATE environment variable, or
// today's local date formatted as YYYY-MM-DD when it is unset or blank.
func intradayDateValue() string {
	override := strings.TrimSpace(os.Getenv("REPORT_DATE"))
	if override == "" {
		return time.Now().Format("2006-01-02")
	}
	return override
}
|
||||
|
||||
// discoveryTimeoutFromEnv reads INTRADAY_DISCOVERY_TIMEOUT_SEC as a whole
// number of seconds. It falls back to 20 seconds when the variable is unset,
// blank, not parseable as an integer, or not positive.
func discoveryTimeoutFromEnv() time.Duration {
	const fallback = 20 * time.Second

	text := strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_TIMEOUT_SEC"))
	if text != "" {
		var seconds int
		if _, err := fmt.Sscanf(text, "%d", &seconds); err == nil && seconds > 0 {
			return time.Duration(seconds) * time.Second
		}
	}
	return fallback
}
|
||||
|
||||
// normalizeIntradayEventType lower-cases and trims value and returns it when
// it is one of the recognised event types; anything else becomes "unknown".
func normalizeIntradayEventType(value string) string {
	switch kind := strings.TrimSpace(strings.ToLower(value)); kind {
	case "price_cut", "price_increase", "official_release", "promo_campaign", "leak_or_rumor":
		return kind
	}
	return "unknown"
}
|
||||
|
||||
// intradayWordSeparatorRe matches every run of characters that is not a
// letter, a digit or a hyphen. Letters and digits are matched by Unicode
// class so non-Latin names keep their identity instead of being erased.
// Compiled once at package load rather than on every normalizeWord call.
var intradayWordSeparatorRe = regexp.MustCompile(`[^\p{L}\p{N}\-]+`)

// normalizeWord turns free text into a lower-case, hyphen-separated token
// used in dedup keys: underscores and every run of other separators become a
// single "-", and leading/trailing hyphens are removed. It returns "unknown"
// when nothing is left.
//
// ASCII input normalizes exactly as before (e.g. "GPT-5.6" and "GPT 5.6" both
// give "gpt-5-6"). Non-Latin text such as "智谱" is now preserved; it used to
// collapse to "unknown", which made different providers share one key.
func normalizeWord(value string) string {
	value = strings.ToLower(strings.TrimSpace(value))
	value = strings.ReplaceAll(value, "_", "-")
	value = intradayWordSeparatorRe.ReplaceAllString(value, "-")
	value = strings.Trim(value, "-")
	if value == "" {
		return "unknown"
	}
	return value
}
|
||||
|
||||
// dedupeStrings trims every value, drops blanks, and returns the remaining
// values with duplicates removed, keeping first-occurrence order. The result
// is always a non-nil slice.
func dedupeStrings(values []string) []string {
	unique := make([]string, 0, len(values))
	visited := map[string]struct{}{}
	for i := range values {
		item := strings.TrimSpace(values[i])
		if item == "" {
			continue
		}
		if _, dup := visited[item]; !dup {
			visited[item] = struct{}{}
			unique = append(unique, item)
		}
	}
	return unique
}
|
||||
188
scripts/intraday_discovery_provider.go
Normal file
188
scripts/intraday_discovery_provider.go
Normal file
@@ -0,0 +1,188 @@
|
||||
//go:build llm_script
|
||||
|
||||
package main
|
||||
|
||||
import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"os/exec"
	"strings"
	"time"
)
|
||||
|
||||
// intradayProviderConfig selects how a provider payload is obtained.
type intradayProviderConfig struct {
	// Mode is "fixture", "command_json" or "http_json".
	Mode string

	// Command, URL and Fixture are the mode-specific settings: a shell command
	// template, an HTTP URL template, and a file path respectively.
	Command, URL, Fixture string

	// Timeout bounds the provider call.
	Timeout time.Duration
}
|
||||
|
||||
// intradaySearchRecord is one entry of the JSON array returned by the search
// provider.
type intradaySearchRecord struct {
	// Title and Summary are the headline and snippet of the hit.
	Title   string `json:"title"`
	Summary string `json:"summary"`
	// URL is the link of the hit; it is the key used to match LLM candidates.
	URL string `json:"url"`
	// Provider and ProviderURL are decoded from the "provider" and
	// "provider_url" keys.
	Provider    string `json:"provider"`
	ProviderURL string `json:"provider_url"`
	// PublishedAt is kept as the raw string from the payload.
	PublishedAt string `json:"published_at"`
}
|
||||
|
||||
// intradayLLMRecord is one entry of the JSON array returned by the LLM
// provider.
type intradayLLMRecord struct {
	// EventType is the raw category; normalizeIntradayEventType maps it to a
	// known value or "unknown".
	EventType string `json:"event_type"`
	// ProviderName, ModelName and ProviderCountry identify the subject.
	ProviderName    string `json:"provider_name"`
	ModelName       string `json:"model_name"`
	ProviderCountry string `json:"provider_country"`
	// Title and Summary are the headline and description.
	Title   string `json:"title"`
	Summary string `json:"summary"`
	// CandidateURLs are the links the LLM cites for the event.
	CandidateURLs []string `json:"candidate_urls"`
}
|
||||
|
||||
type intradayLLMRequest struct {
|
||||
Date string `json:"date"`
|
||||
SearchResults []intradaySearchRecord `json:"search_results"`
|
||||
}
|
||||
|
||||
func loadIntradaySearchRecords(cfg intradayProviderConfig, date string, queries []string) ([]intradaySearchRecord, error) {
|
||||
var all []intradaySearchRecord
|
||||
for _, query := range queries {
|
||||
payload, err := loadIntradayProviderPayload(cfg, intradayProviderPayloadInput{
|
||||
Date: date,
|
||||
Query: query,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(bytes.TrimSpace(payload)) == 0 {
|
||||
continue
|
||||
}
|
||||
var records []intradaySearchRecord
|
||||
if err := json.Unmarshal(payload, &records); err != nil {
|
||||
return nil, fmt.Errorf("unmarshal search records for query %q: %w", query, err)
|
||||
}
|
||||
all = append(all, records...)
|
||||
if cfg.Mode == "fixture" {
|
||||
break
|
||||
}
|
||||
}
|
||||
return all, nil
|
||||
}
|
||||
|
||||
func loadIntradayLLMRecords(cfg intradayProviderConfig, date string, searchResults []intradaySearchRecord) ([]intradayLLMRecord, error) {
|
||||
request := intradayLLMRequest{Date: date, SearchResults: searchResults}
|
||||
body, err := json.Marshal(request)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshal llm request: %w", err)
|
||||
}
|
||||
payload, err := loadIntradayProviderPayload(cfg, intradayProviderPayloadInput{
|
||||
Date: date,
|
||||
RequestBody: body,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(bytes.TrimSpace(payload)) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
var records []intradayLLMRecord
|
||||
if err := json.Unmarshal(payload, &records); err != nil {
|
||||
return nil, fmt.Errorf("unmarshal llm records: %w", err)
|
||||
}
|
||||
return records, nil
|
||||
}
|
||||
|
||||
// intradayProviderPayloadInput carries the per-call values substituted into a
// provider command or URL and the optional request body.
type intradayProviderPayloadInput struct {
	// Date and Query replace the {{date}} and {{query}} placeholders.
	Date, Query string

	// RequestBody, when non-empty, is sent as stdin (command mode) or as a
	// JSON POST body (HTTP mode).
	RequestBody []byte
}
|
||||
|
||||
func loadIntradayProviderPayload(cfg intradayProviderConfig, input intradayProviderPayloadInput) ([]byte, error) {
|
||||
mode := strings.TrimSpace(cfg.Mode)
|
||||
switch mode {
|
||||
case "fixture":
|
||||
if strings.TrimSpace(cfg.Fixture) == "" {
|
||||
return nil, fmt.Errorf("provider fixture 未设置")
|
||||
}
|
||||
return os.ReadFile(cfg.Fixture)
|
||||
case "command_json":
|
||||
if strings.TrimSpace(cfg.Command) == "" {
|
||||
return nil, fmt.Errorf("provider command 未设置")
|
||||
}
|
||||
return runIntradayCommand(cfg, input)
|
||||
case "http_json":
|
||||
if strings.TrimSpace(cfg.URL) == "" {
|
||||
return nil, fmt.Errorf("provider url 未设置")
|
||||
}
|
||||
return fetchIntradayHTTP(cfg, input)
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported provider mode %q", mode)
|
||||
}
|
||||
}
|
||||
|
||||
func runIntradayCommand(cfg intradayProviderConfig, input intradayProviderPayloadInput) ([]byte, error) {
|
||||
command := strings.TrimSpace(cfg.Command)
|
||||
command = strings.ReplaceAll(command, "{{date}}", input.Date)
|
||||
command = strings.ReplaceAll(command, "{{query}}", shellEscapeSingleArg(input.Query))
|
||||
cmd := exec.Command("sh", "-c", command)
|
||||
cmd.Env = append(os.Environ(),
|
||||
"INTRADAY_DISCOVERY_DATE="+input.Date,
|
||||
"INTRADAY_DISCOVERY_QUERY="+input.Query,
|
||||
)
|
||||
if len(input.RequestBody) > 0 {
|
||||
cmd.Stdin = bytes.NewReader(input.RequestBody)
|
||||
}
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
if exitErr, ok := err.(*exec.ExitError); ok {
|
||||
return nil, fmt.Errorf("run provider command: %w: %s", err, strings.TrimSpace(string(exitErr.Stderr)))
|
||||
}
|
||||
return nil, fmt.Errorf("run provider command: %w", err)
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func fetchIntradayHTTP(cfg intradayProviderConfig, input intradayProviderPayloadInput) ([]byte, error) {
|
||||
client := &http.Client{Timeout: cfg.Timeout}
|
||||
rawURL := strings.TrimSpace(cfg.URL)
|
||||
rawURL = strings.ReplaceAll(rawURL, "{{date}}", input.Date)
|
||||
rawURL = strings.ReplaceAll(rawURL, "{{query}}", input.Query)
|
||||
|
||||
method := http.MethodGet
|
||||
var body io.Reader
|
||||
if len(input.RequestBody) > 0 {
|
||||
method = http.MethodPost
|
||||
body = bytes.NewReader(input.RequestBody)
|
||||
}
|
||||
req, err := http.NewRequest(method, rawURL, body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build provider request: %w", err)
|
||||
}
|
||||
if len(input.RequestBody) > 0 {
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("call provider url: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
payload, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("call provider url: unexpected status %d: %s", resp.StatusCode, strings.TrimSpace(string(payload)))
|
||||
}
|
||||
payload, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read provider response: %w", err)
|
||||
}
|
||||
return payload, nil
|
||||
}
|
||||
|
||||
// shellEscapeSingleArg wraps value in single quotes for use as one POSIX
// shell word. Each embedded single quote is emitted as '"'"' (close the
// quoted string, add a double-quoted quote, reopen). An empty value yields
// the empty quoted word ''.
func shellEscapeSingleArg(value string) string {
	const quote = "'"
	return quote + strings.ReplaceAll(value, quote, `'"'"'`) + quote
}
|
||||
@@ -46,6 +46,7 @@ type signalModelEvent struct {
|
||||
TrustLabel string `json:"trust_label"`
|
||||
SourceKindLabel string `json:"source_kind_label"`
|
||||
PrimarySource string `json:"primary_source"`
|
||||
SourceURL string `json:"source_url"`
|
||||
UpdatedAt string `json:"updated_at"`
|
||||
EvidenceDetail string `json:"evidence_detail"`
|
||||
Baseline string `json:"baseline"`
|
||||
@@ -367,6 +368,12 @@ func loadSignalModelEvents(db *sql.DB, date string) ([]signalModelEvent, error)
|
||||
}
|
||||
events = append(events, priceEvents...)
|
||||
|
||||
discoveryEvents, err := loadVerifiedDiscoverySignalEvents(db, date)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
events = mergeVerifiedDiscoveryEvents(events, discoveryEvents)
|
||||
|
||||
sort.Slice(events, func(i, j int) bool {
|
||||
if events[i].Priority != events[j].Priority {
|
||||
return events[i].Priority > events[j].Priority
|
||||
@@ -409,6 +416,7 @@ func loadSignalPromoCampaignEvents(date string) ([]signalModelEvent, error) {
|
||||
TrustLabel: signalFirstNonEmpty(definition.TrustLabel, "官方来源 / 一级证据"),
|
||||
SourceKindLabel: signalFirstNonEmpty(definition.SourceKindLabel, "官方活动页"),
|
||||
PrimarySource: definition.PrimarySource,
|
||||
SourceURL: definition.PrimarySource,
|
||||
UpdatedAt: signalFormatEventUpdatedAt("", definition.Date),
|
||||
EvidenceDetail: definition.EvidenceDetail,
|
||||
Baseline: signalFirstNonEmpty(definition.Baseline, "活动窗口开启"),
|
||||
@@ -520,6 +528,7 @@ func loadSignalOfficialReleaseEvents(db *sql.DB, date string) ([]signalModelEven
|
||||
TrustLabel: buildSignalReleaseTrustLabel(model, dateConfidence),
|
||||
SourceKindLabel: buildSignalReleaseSourceKindLabel(dateSourceKind, dateConfidence),
|
||||
PrimarySource: sourceURL,
|
||||
SourceURL: sourceURL,
|
||||
UpdatedAt: releaseDate.Format("2006-01-02 15:04"),
|
||||
EvidenceDetail: buildSignalReleaseEvidenceDetail(dateSourceKind, dateConfidence),
|
||||
Baseline: "官方首次发布",
|
||||
@@ -610,6 +619,7 @@ func loadSignalNewModelEvents(db *sql.DB, date string) ([]signalModelEvent, erro
|
||||
TrustLabel: buildSignalTrustLabel(model),
|
||||
SourceKindLabel: "模型快照",
|
||||
PrimarySource: buildSignalPrimarySource("region_pricing", model.OperatorName),
|
||||
SourceURL: buildSignalPrimarySource("region_pricing", model.OperatorName),
|
||||
UpdatedAt: createdAt.Format("2006-01-02 15:04"),
|
||||
EvidenceDetail: "models.created_at = 今日,且已存在最新价格快照",
|
||||
Baseline: "首次出现",
|
||||
@@ -709,6 +719,7 @@ func loadSignalPriceChangeEvents(db *sql.DB, date string) ([]signalModelEvent, e
|
||||
TrustLabel: buildSignalTrustLabel(model),
|
||||
SourceKindLabel: "价格快照",
|
||||
PrimarySource: "pricing_history",
|
||||
SourceURL: buildSignalPrimarySource("region_pricing", model.OperatorName),
|
||||
UpdatedAt: changedAt.Format("2006-01-02 15:04"),
|
||||
EvidenceDetail: buildSignalPriceEvidenceDetail(changePct, oldInputPrice, newInputPrice, model.Currency),
|
||||
Baseline: fmt.Sprintf("较昨日 %+.0f%%", changePct),
|
||||
@@ -747,6 +758,241 @@ func dedupeSignalEvents(events []signalModelEvent) []signalModelEvent {
|
||||
return result
|
||||
}
|
||||
|
||||
func loadVerifiedDiscoverySignalEvents(db *sql.DB, date string) ([]signalModelEvent, error) {
|
||||
rows, err := db.Query(`
|
||||
SELECT
|
||||
event_type,
|
||||
provider_name,
|
||||
COALESCE(model_name, ''),
|
||||
COALESCE(provider_country, ''),
|
||||
title,
|
||||
COALESCE(summary, ''),
|
||||
COALESCE(candidate_urls::text, '[]'),
|
||||
COALESCE(verification_notes, ''),
|
||||
updated_at
|
||||
FROM intraday_news_candidate
|
||||
WHERE candidate_date = $1::date
|
||||
AND status = 'verified'
|
||||
AND verification_confidence = 'official_confirmed'
|
||||
ORDER BY updated_at DESC, id DESC
|
||||
`, date)
|
||||
if err != nil {
|
||||
if strings.Contains(err.Error(), `relation "intraday_news_candidate" does not exist`) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var events []signalModelEvent
|
||||
for rows.Next() {
|
||||
var (
|
||||
eventType string
|
||||
providerName string
|
||||
modelName string
|
||||
providerCountry string
|
||||
title string
|
||||
summary string
|
||||
rawURLs string
|
||||
notes string
|
||||
updatedAt time.Time
|
||||
)
|
||||
if err := rows.Scan(&eventType, &providerName, &modelName, &providerCountry, &title, &summary, &rawURLs, ¬es, &updatedAt); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var urls []string
|
||||
if err := json.Unmarshal([]byte(rawURLs), &urls); err != nil {
|
||||
return nil, fmt.Errorf("unmarshal discovery candidate urls: %w", err)
|
||||
}
|
||||
primaryURL := firstString(urls)
|
||||
if strings.TrimSpace(primaryURL) == "" {
|
||||
continue
|
||||
}
|
||||
normalizedType := signalNormalizeIntradayEventType(eventType)
|
||||
events = append(events, signalModelEvent{
|
||||
EventType: normalizedType,
|
||||
ModelName: signalFirstNonEmpty(modelName, title),
|
||||
ProviderName: providerName,
|
||||
OperatorName: providerName,
|
||||
Audience: buildDiscoveryAudience(normalizedType),
|
||||
TrustLabel: "官方来源 / discovery 验证",
|
||||
SourceKindLabel: buildDiscoverySourceKind(normalizedType),
|
||||
PrimarySource: primaryURL,
|
||||
SourceURL: primaryURL,
|
||||
UpdatedAt: updatedAt.Format("2006-01-02 15:04"),
|
||||
EvidenceDetail: signalFirstNonEmpty(notes, summary),
|
||||
Baseline: buildDiscoveryBaseline(normalizedType),
|
||||
Summary: signalFirstNonEmpty(summary, title),
|
||||
Priority: buildDiscoveryPriority(normalizedType),
|
||||
})
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return filterVerifiedDiscoverySignalEvents(events), nil
|
||||
}
|
||||
|
||||
func filterVerifiedDiscoverySignalEvents(events []signalModelEvent) []signalModelEvent {
|
||||
filtered := make([]signalModelEvent, 0, len(events))
|
||||
for _, event := range events {
|
||||
switch event.EventType {
|
||||
case "official_release", "promo_campaign", "price_cut", "price_increase":
|
||||
filtered = append(filtered, event)
|
||||
}
|
||||
}
|
||||
return filtered
|
||||
}
|
||||
|
||||
func mergeVerifiedDiscoveryEvents(nativeEvents, discoveryEvents []signalModelEvent) []signalModelEvent {
|
||||
merged := append([]signalModelEvent{}, nativeEvents...)
|
||||
index := make(map[string]int, len(merged))
|
||||
for i, event := range merged {
|
||||
index[signalEventMergeKey(event)] = i
|
||||
}
|
||||
for _, event := range filterVerifiedDiscoverySignalEvents(discoveryEvents) {
|
||||
key := signalEventMergeKey(event)
|
||||
if idx, exists := index[key]; exists {
|
||||
merged[idx] = mergeSignalEventEvidence(merged[idx], event)
|
||||
continue
|
||||
}
|
||||
index[key] = len(merged)
|
||||
merged = append(merged, event)
|
||||
}
|
||||
return merged
|
||||
}
|
||||
|
||||
func mergeSignalEventEvidence(native, discovery signalModelEvent) signalModelEvent {
|
||||
merged := native
|
||||
if strings.TrimSpace(merged.SourceKindLabel) == "" {
|
||||
merged.SourceKindLabel = discovery.SourceKindLabel
|
||||
}
|
||||
if strings.TrimSpace(merged.SourceURL) == "" {
|
||||
merged.SourceURL = discovery.SourceURL
|
||||
}
|
||||
if strings.TrimSpace(merged.PrimarySource) == "" {
|
||||
merged.PrimarySource = discovery.PrimarySource
|
||||
}
|
||||
if strings.TrimSpace(merged.EvidenceDetail) == "" {
|
||||
merged.EvidenceDetail = discovery.EvidenceDetail
|
||||
}
|
||||
if strings.TrimSpace(merged.TrustLabel) == "" {
|
||||
merged.TrustLabel = discovery.TrustLabel
|
||||
}
|
||||
return merged
|
||||
}
|
||||
|
||||
func signalEventMergeKey(event signalModelEvent) string {
|
||||
return strings.Join([]string{
|
||||
signalNormalizeIntradayEventType(event.EventType),
|
||||
signalNormalizeWord(event.ProviderName),
|
||||
signalNormalizeWord(event.ModelName),
|
||||
}, "|")
|
||||
}
|
||||
|
||||
// buildDiscoveryAudience returns the audience label for a discovery event
// type. Types other than the four known ones get a generic label.
func buildDiscoveryAudience(eventType string) string {
	audience := "适合关注日内情报变化的读者"
	switch eventType {
	case "price_increase":
		audience = "适合提前准备替代模型和预算回退方案的团队"
	case "price_cut":
		audience = "适合准备趁降价重排默认模型的团队"
	case "promo_campaign":
		audience = "适合想利用活动窗口压低成本的团队"
	case "official_release":
		audience = "适合需要尽快复查默认选型与路线图影响的团队"
	}
	return audience
}
|
||||
|
||||
// buildDiscoverySourceKind returns the source-kind label for a discovery
// event type. Both price event types share one label; unrecognized types get
// the bare "discovery 验证" label.
func buildDiscoverySourceKind(eventType string) string {
	if eventType == "official_release" {
		return "discovery 验证 / 官方发布页"
	}
	if eventType == "promo_campaign" {
		return "discovery 验证 / 官方活动页"
	}
	if eventType == "price_cut" || eventType == "price_increase" {
		return "discovery 验证 / 官方价格页"
	}
	return "discovery 验证"
}
|
||||
|
||||
// buildDiscoveryBaseline returns the baseline label for a discovery event
// type. Both price event types share one label; unrecognized types fall back
// to "discovery verified".
func buildDiscoveryBaseline(eventType string) string {
	baseline := "discovery verified"
	switch eventType {
	case "price_cut", "price_increase":
		baseline = "official_confirmed"
	case "promo_campaign":
		baseline = "活动窗口已验证"
	case "official_release":
		baseline = "discovery 验证通过"
	}
	return baseline
}
|
||||
|
||||
// buildDiscoveryPriority returns the priority assigned to a discovery event
// type: official_release 118, promo_campaign 112, price_cut 96,
// price_increase 94, anything else 80.
func buildDiscoveryPriority(eventType string) int {
	if eventType == "official_release" {
		return 118
	}
	if eventType == "promo_campaign" {
		return 112
	}
	if eventType == "price_cut" {
		return 96
	}
	if eventType == "price_increase" {
		return 94
	}
	return 80
}
|
||||
|
||||
// firstString returns the first element of values that is not blank after
// trimming whitespace. The element is returned untrimmed. It returns "" when
// no such element exists.
func firstString(values []string) string {
	for i := 0; i < len(values); i++ {
		if strings.TrimSpace(values[i]) == "" {
			continue
		}
		return values[i]
	}
	return ""
}
|
||||
|
||||
// signalNormalizeIntradayEventType lowercases and trims value and returns it
// when it is one of price_cut, price_increase, official_release or
// promo_campaign. Every other input maps to "unknown".
func signalNormalizeIntradayEventType(value string) string {
	switch kind := strings.TrimSpace(strings.ToLower(value)); kind {
	case "price_cut", "price_increase", "official_release", "promo_campaign":
		return kind
	}
	return "unknown"
}
|
||||
|
||||
// signalNormalizeWord reduces value to a lowercase slug: runs of characters
// outside [a-z0-9] (after lowercasing) collapse into a single "-", and
// leading/trailing separators are removed. Input with no [a-z0-9] characters
// yields "unknown".
func signalNormalizeWord(value string) string {
	lowered := strings.ToLower(strings.TrimSpace(value))
	// Splitting on every non-alphanumeric rune and re-joining with "-" both
	// collapses separator runs and trims separators at the edges.
	parts := strings.FieldsFunc(lowered, func(r rune) bool {
		isLetter := r >= 'a' && r <= 'z'
		isDigit := r >= '0' && r <= '9'
		return !isLetter && !isDigit
	})
	if len(parts) == 0 {
		return "unknown"
	}
	return strings.Join(parts, "-")
}
|
||||
|
||||
func classifySignalFreeSource(model signalModelInfo) string {
|
||||
switch model.OperatorType {
|
||||
case "official", "cloud":
|
||||
|
||||
@@ -31,3 +31,64 @@ func TestBuildSignalPageMode(t *testing.T) {
|
||||
t.Fatalf("官方发布日 page_mode 错误: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildSignalPageModeTreatsVerifiedDiscoveryPromoAsHot(t *testing.T) {
|
||||
got := buildSignalPageMode(signalDailySignals{}, []signalModelEvent{{EventType: "promo_campaign", ModelName: "GPT-5.6"}})
|
||||
if got != "hot" {
|
||||
t.Fatalf("已验证活动事件应触发 hot, got=%q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFilterDiscoveryEventsDropsLeakAndCandidateOnly(t *testing.T) {
|
||||
events := []signalModelEvent{
|
||||
{EventType: "official_release", ModelName: "GPT-5.6", Priority: 120},
|
||||
{EventType: "leak_or_rumor", ModelName: "GPT-5.6", Priority: 200},
|
||||
{EventType: "unknown", ModelName: "Mystery", Priority: 50},
|
||||
}
|
||||
filtered := filterVerifiedDiscoverySignalEvents(events)
|
||||
if len(filtered) != 1 {
|
||||
t.Fatalf("期望仅保留 1 条正式事实事件, got=%d", len(filtered))
|
||||
}
|
||||
if filtered[0].EventType != "official_release" {
|
||||
t.Fatalf("错误保留了非正式事件: %+v", filtered)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeVerifiedDiscoveryEventsPrefersNativeFact(t *testing.T) {
|
||||
native := []signalModelEvent{{
|
||||
EventType: "official_release",
|
||||
ModelName: "GPT-5.6",
|
||||
ProviderName: "OpenAI",
|
||||
PrimarySource: "native_release",
|
||||
EvidenceDetail: "native evidence",
|
||||
Priority: 120,
|
||||
}}
|
||||
discovery := []signalModelEvent{{
|
||||
EventType: "official_release",
|
||||
ModelName: "GPT-5.6",
|
||||
ProviderName: "OpenAI",
|
||||
PrimarySource: "discovery_release",
|
||||
EvidenceDetail: "discovery evidence",
|
||||
SourceKindLabel: "官方博客",
|
||||
Priority: 110,
|
||||
}}
|
||||
merged := mergeVerifiedDiscoveryEvents(native, discovery)
|
||||
if len(merged) != 1 {
|
||||
t.Fatalf("期望去重后只剩 1 条事件, got=%d", len(merged))
|
||||
}
|
||||
if merged[0].PrimarySource != "native_release" {
|
||||
t.Fatalf("原生事实不应被 discovery 覆盖: %+v", merged[0])
|
||||
}
|
||||
if merged[0].SourceKindLabel != "官方博客" {
|
||||
t.Fatalf("原生事实应补入 discovery 证据缺口: %+v", merged[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeVerifiedDiscoveryEventsDropsUnverifiedPriceNarrative(t *testing.T) {
|
||||
native := []signalModelEvent{{EventType: "new_model", ModelName: "DeepSeek-V4-Flash", ProviderName: "DeepSeek", Priority: 80}}
|
||||
discovery := []signalModelEvent{{EventType: "leak_or_rumor", ModelName: "DeepSeek-V4-Flash", ProviderName: "DeepSeek", Priority: 130}}
|
||||
merged := mergeVerifiedDiscoveryEvents(native, discovery)
|
||||
if len(merged) != 1 || merged[0].EventType != "new_model" {
|
||||
t.Fatalf("非正式 discovery 事件不应进入正式快照: %+v", merged)
|
||||
}
|
||||
}
|
||||
|
||||
46
scripts/run_intraday_discovery_watch.sh
Normal file
46
scripts/run_intraday_discovery_watch.sh
Normal file
@@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env bash
# Runs the intraday discovery watch pipeline for one report date, in order:
#   1. candidate discovery
#   2. candidate verification
#   3. daily signal materialization
# Passing --dry-run as the first argument forwards --dry-run to every stage.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"

# Source the optional env files. .env is read last, so its assignments
# override those from .env.local.
for env_file in ".env.local" ".env"; do
  if [[ -f "$env_file" ]]; then
    # shellcheck disable=SC1090
    source "$env_file"
  fi
done

# Abort with status 1 when the variable named by $1 is unset or empty.
require_env() {
  local var_name="$1"
  if [[ -z "${!var_name:-}" ]]; then
    echo "${var_name} 未设置" >&2
    exit 1
  fi
}

require_env DATABASE_URL
require_env INTRADAY_DISCOVERY_SEARCH_PROVIDER
require_env INTRADAY_DISCOVERY_LLM_PROVIDER

# Default the report date to today unless the caller already set REPORT_DATE.
REPORT_DATE="${REPORT_DATE:-$(date +%F)}"

DRY_RUN="false"
case "${1:-}" in
  --dry-run) DRY_RUN="true" ;;
esac

# Every stage receives the same flags.
stage_args=(--date "$REPORT_DATE")
if [[ "$DRY_RUN" == "true" ]]; then
  stage_args+=(--dry-run)
fi

go run -tags llm_script ./scripts/discover_intraday_news_candidates.go ./scripts/intraday_discovery_provider.go ./scripts/intraday_discovery_common.go "${stage_args[@]}"
go run -tags llm_script ./scripts/verify_intraday_news_candidates.go ./scripts/intraday_discovery_common.go "${stage_args[@]}"
REPORT_TRIGGER_SOURCE="intraday_discovery" go run -tags llm_script ./scripts/materialize_daily_signals.go "${stage_args[@]}"
|
||||
24
scripts/testdata/intraday_discovery_llm_sample.json
vendored
Normal file
24
scripts/testdata/intraday_discovery_llm_sample.json
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
[
|
||||
{
|
||||
"event_type": "official_release",
|
||||
"provider_name": "OpenAI",
|
||||
"model_name": "GPT-5.6",
|
||||
"provider_country": "US",
|
||||
"title": "GPT-5.6 preview pricing update",
|
||||
"summary": "OpenAI preview material indicates GPT-5.6 entered a preview pricing window.",
|
||||
"candidate_urls": [
|
||||
"https://openai.example.com/news/gpt-5-6-pricing"
|
||||
]
|
||||
},
|
||||
{
|
||||
"event_type": "promo_campaign",
|
||||
"provider_name": "DeepSeek",
|
||||
"model_name": "DeepSeek-V4-Flash",
|
||||
"provider_country": "CN",
|
||||
"title": "DeepSeek V4 Flash campaign",
|
||||
"summary": "Official campaign page shows a temporary promotional window for DeepSeek-V4-Flash.",
|
||||
"candidate_urls": [
|
||||
"https://deepseek.example.com/campaign/v4-flash"
|
||||
]
|
||||
}
|
||||
]
|
||||
18
scripts/testdata/intraday_discovery_search_sample.json
vendored
Normal file
18
scripts/testdata/intraday_discovery_search_sample.json
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
[
|
||||
{
|
||||
"title": "OpenAI announces GPT-5.6 preview pricing update",
|
||||
"summary": "OpenAI preview announcement mentions GPT-5.6 and updated API pricing references.",
|
||||
"url": "https://openai.example.com/news/gpt-5-6-pricing",
|
||||
"provider": "OpenAI",
|
||||
"provider_url": "https://openai.example.com",
|
||||
"published_at": "2026-05-25T09:00:00Z"
|
||||
},
|
||||
{
|
||||
"title": "DeepSeek launches V4 Flash campaign",
|
||||
"summary": "Campaign page suggests temporary promotional pricing for DeepSeek-V4-Flash.",
|
||||
"url": "https://deepseek.example.com/campaign/v4-flash",
|
||||
"provider": "DeepSeek",
|
||||
"provider_url": "https://deepseek.example.com",
|
||||
"published_at": "2026-05-25T10:00:00Z"
|
||||
}
|
||||
]
|
||||
7
scripts/testdata/intraday_verification_official_release.html
vendored
Normal file
7
scripts/testdata/intraday_verification_official_release.html
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
<html><body>
|
||||
<article>
|
||||
<h1>OpenAI announces GPT-5.6 preview pricing update</h1>
|
||||
<p>GPT-5.6 preview is now available in official preview channels.</p>
|
||||
<p>Published 2026-05-25.</p>
|
||||
</article>
|
||||
</body></html>
|
||||
8
scripts/testdata/intraday_verification_pricing_page.html
vendored
Normal file
8
scripts/testdata/intraday_verification_pricing_page.html
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
<html><body>
|
||||
<section>
|
||||
<h1>DeepSeek-V4-Flash pricing</h1>
|
||||
<p>Old price: $10</p>
|
||||
<p>New price: $6</p>
|
||||
<p>Campaign window active now.</p>
|
||||
</section>
|
||||
</body></html>
|
||||
6
scripts/testdata/intraday_verification_secondary_media.html
vendored
Normal file
6
scripts/testdata/intraday_verification_secondary_media.html
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
<html><body>
|
||||
<article>
|
||||
<h1>Industry blog discusses GPT-5.6 leak</h1>
|
||||
<p>Writers speculate GPT-5.6 may appear soon based on references.</p>
|
||||
</article>
|
||||
</body></html>
|
||||
501
scripts/verify_intraday_news_candidates.go
Normal file
501
scripts/verify_intraday_news_candidates.go
Normal file
@@ -0,0 +1,501 @@
|
||||
//go:build llm_script
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
_ "github.com/lib/pq"
|
||||
)
|
||||
|
||||
// verificationCandidateRow is one intraday_news_candidate row as loaded by
// loadIntradayVerificationCandidates for verification.
type verificationCandidateRow struct {
	// Identity and classification of the candidate.
	ID            int64
	CandidateDate string // candidate_date rendered as text
	EventType     string

	// Who and what the candidate is about; nullable columns are loaded as "".
	ProviderName    string
	ModelName       string
	ProviderCountry string
	Title           string
	Summary         string

	// CandidateURLs is decoded from the candidate_urls JSON column.
	CandidateURLs []string

	// Current verification state stored on the row.
	Status                 string
	VerificationConfidence string
}
|
||||
|
||||
// intradayVerificationConfig carries the settings for one verification run.
type intradayVerificationConfig struct {
	Date        string        // candidate date to verify, YYYY-MM-DD
	DryRun      bool          // when true, results are not written to the database
	DatabaseURL string        // DSN handed to sql.Open("postgres", ...)
	Timeout     time.Duration // timeout of the HTTP client used to fetch candidate URLs
}
|
||||
|
||||
// intradayVerificationResult is the outcome of verifying one candidate. It
// holds both the new candidate state (CandidateStatus,
// VerificationConfidence, Notes) and the verification record
// (Verifier* fields, ExtractedFacts) that persistIntradayVerificationResults
// writes.
type intradayVerificationResult struct {
	CandidateID int64

	// New state written back to the candidate row.
	CandidateStatus        string
	VerificationConfidence string

	// Evidence that produced this result.
	VerifierSource string // source class, e.g. as returned by classifyVerificationSource
	VerifierURL    string
	VerifierStatus string // values set in this file: "matched", "insufficient", "error"
	ExtractedFacts map[string]any

	Notes string
}
|
||||
|
||||
// intradayVerificationSummary is the JSON-tagged summary of one verification
// run. NOTE(review): the counters are presumably filled by
// summarizeIntradayVerification, whose body is not shown here — confirm their
// exact meaning there.
type intradayVerificationSummary struct {
	CandidateTotal int `json:"candidate_total"`
	VerifiedTotal  int `json:"verified_total"`

	// Breakdown by verification confidence.
	OfficialConfirmedTotal  int `json:"official_confirmed_total"`
	SecondaryConfirmedTotal int `json:"secondary_confirmed_total"`

	RejectedTotal int  `json:"rejected_total"`
	DryRun        bool `json:"dry_run"`
}
|
||||
|
||||
func main() {
|
||||
loadIntradayEnv()
|
||||
cfg := intradayVerificationConfig{}
|
||||
flag.StringVar(&cfg.Date, "date", intradayDateValue(), "验证日期,格式 YYYY-MM-DD")
|
||||
flag.BoolVar(&cfg.DryRun, "dry-run", false, "仅输出摘要,不写数据库")
|
||||
flag.Parse()
|
||||
cfg.DatabaseURL = intradayDefaultDSN()
|
||||
cfg.Timeout = discoveryTimeoutFromEnv()
|
||||
if err := runIntradayCandidateVerification(cfg); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "verify_intraday_news_candidates: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
func runIntradayCandidateVerification(cfg intradayVerificationConfig) error {
|
||||
if strings.TrimSpace(cfg.Date) == "" {
|
||||
return fmt.Errorf("date 未设置")
|
||||
}
|
||||
db, err := sql.Open("postgres", cfg.DatabaseURL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("open db: %w", err)
|
||||
}
|
||||
defer db.Close()
|
||||
candidates, err := loadIntradayVerificationCandidates(context.Background(), db, cfg.Date)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
results := make([]intradayVerificationResult, 0, len(candidates))
|
||||
for _, candidate := range candidates {
|
||||
result, err := verifyIntradayCandidate(candidate, cfg.Timeout)
|
||||
if err != nil {
|
||||
result = intradayVerificationResult{
|
||||
CandidateID: candidate.ID,
|
||||
CandidateStatus: "candidate",
|
||||
VerificationConfidence: candidate.VerificationConfidence,
|
||||
VerifierStatus: "error",
|
||||
Notes: err.Error(),
|
||||
}
|
||||
}
|
||||
results = append(results, result)
|
||||
}
|
||||
if !cfg.DryRun {
|
||||
if err := persistIntradayVerificationResults(context.Background(), db, results); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return printIntradayVerificationSummary(summarizeIntradayVerification(results, cfg.DryRun))
|
||||
}
|
||||
|
||||
func loadIntradayVerificationCandidates(ctx context.Context, db *sql.DB, date string) ([]verificationCandidateRow, error) {
|
||||
rows, err := db.QueryContext(ctx, `
|
||||
SELECT id, candidate_date::text, event_type, provider_name, COALESCE(model_name, ''), COALESCE(provider_country, ''),
|
||||
title, COALESCE(summary, ''), COALESCE(candidate_urls::text, '[]'), status, verification_confidence
|
||||
FROM intraday_news_candidate
|
||||
WHERE candidate_date = $1::date
|
||||
AND status IN ('candidate', 'verifying')
|
||||
ORDER BY discovered_at DESC, id DESC`, date)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query intraday candidates: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
var candidates []verificationCandidateRow
|
||||
for rows.Next() {
|
||||
var row verificationCandidateRow
|
||||
var rawURLs string
|
||||
if err := rows.Scan(&row.ID, &row.CandidateDate, &row.EventType, &row.ProviderName, &row.ModelName, &row.ProviderCountry, &row.Title, &row.Summary, &rawURLs, &row.Status, &row.VerificationConfidence); err != nil {
|
||||
return nil, fmt.Errorf("scan intraday candidate: %w", err)
|
||||
}
|
||||
if err := json.Unmarshal([]byte(rawURLs), &row.CandidateURLs); err != nil {
|
||||
return nil, fmt.Errorf("unmarshal candidate urls: %w", err)
|
||||
}
|
||||
candidates = append(candidates, row)
|
||||
}
|
||||
return candidates, rows.Err()
|
||||
}
|
||||
|
||||
func verifyIntradayCandidate(candidate verificationCandidateRow, timeout time.Duration) (intradayVerificationResult, error) {
|
||||
client := &http.Client{Timeout: timeout}
|
||||
best := intradayVerificationResult{
|
||||
CandidateID: candidate.ID,
|
||||
CandidateStatus: "candidate",
|
||||
VerificationConfidence: candidate.VerificationConfidence,
|
||||
VerifierStatus: "insufficient",
|
||||
Notes: "未找到足够证据",
|
||||
ExtractedFacts: map[string]any{},
|
||||
}
|
||||
for _, candidateURL := range candidate.CandidateURLs {
|
||||
body, err := fetchVerificationDocument(candidateURL, client)
|
||||
if err != nil {
|
||||
best = preferVerificationResult(best, intradayVerificationResult{
|
||||
CandidateID: candidate.ID,
|
||||
CandidateStatus: "candidate",
|
||||
VerificationConfidence: candidate.VerificationConfidence,
|
||||
VerifierURL: candidateURL,
|
||||
VerifierStatus: "error",
|
||||
Notes: err.Error(),
|
||||
ExtractedFacts: map[string]any{},
|
||||
})
|
||||
continue
|
||||
}
|
||||
result := verifyCandidateDocument(candidate, candidateURL, body)
|
||||
if result.CandidateID == 0 {
|
||||
result.CandidateID = candidate.ID
|
||||
}
|
||||
best = preferVerificationResult(best, result)
|
||||
if best.CandidateStatus == "verified" && best.VerificationConfidence == "official_confirmed" {
|
||||
return best, nil
|
||||
}
|
||||
}
|
||||
return best, nil
|
||||
}
|
||||
|
||||
// fetchVerificationDocument GETs rawURL with client and returns the response
// body as a string.
//
// A non-2xx status is an error whose message includes the status code and a
// short, UTF-8-clean snippet of the body. Reads are bounded because the URLs
// come from discovery and are not trusted: at most maxDocumentBytes of a
// successful response are returned (longer bodies are truncated), and at most
// maxErrorSnippetBytes of an error body are quoted, so that an entire error
// page does not end up in the error text callers store as notes.
func fetchVerificationDocument(rawURL string, client *http.Client) (string, error) {
	const (
		maxDocumentBytes     = 8 << 20 // 8 MiB
		maxErrorSnippetBytes = 512
	)

	req, err := http.NewRequest(http.MethodGet, rawURL, nil)
	if err != nil {
		return "", fmt.Errorf("build verification request: %w", err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; llm-intelligence intraday verifier)")

	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("fetch verification document: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Best effort: the snippet only decorates the error, so a read failure
		// here is deliberately ignored.
		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorSnippetBytes))
		// Cutting at a byte limit can split a multi-byte rune; drop invalid
		// bytes so the message stays valid UTF-8.
		detail := strings.TrimSpace(strings.ToValidUTF8(string(snippet), ""))
		return "", fmt.Errorf("fetch verification document: unexpected status %d: %s", resp.StatusCode, detail)
	}

	payload, err := io.ReadAll(io.LimitReader(resp.Body, maxDocumentBytes))
	if err != nil {
		return "", fmt.Errorf("read verification document: %w", err)
	}
	return string(payload), nil
}
|
||||
|
||||
func verifyCandidateDocument(candidate verificationCandidateRow, candidateURL, body string) intradayVerificationResult {
|
||||
source := classifyVerificationSource(candidate.ProviderName, candidateURL)
|
||||
facts := extractVerificationFacts(body)
|
||||
modelMatched := bodyMentionsModel(body, candidate.ModelName, candidate.Title)
|
||||
result := intradayVerificationResult{
|
||||
CandidateID: candidate.ID,
|
||||
CandidateStatus: "candidate",
|
||||
VerificationConfidence: "candidate",
|
||||
VerifierSource: source,
|
||||
VerifierURL: candidateURL,
|
||||
VerifierStatus: "insufficient",
|
||||
ExtractedFacts: facts,
|
||||
Notes: "证据不足",
|
||||
}
|
||||
if isOfficialVerificationSource(source) {
|
||||
switch normalizeIntradayEventType(candidate.EventType) {
|
||||
case "official_release":
|
||||
if modelMatched {
|
||||
result.CandidateStatus = "verified"
|
||||
result.VerificationConfidence = "official_confirmed"
|
||||
result.VerifierStatus = "matched"
|
||||
result.Notes = "官方页面命中模型发布线索"
|
||||
}
|
||||
case "promo_campaign":
|
||||
if modelMatched && bodyMentionsPromo(body) {
|
||||
result.CandidateStatus = "verified"
|
||||
result.VerificationConfidence = "official_confirmed"
|
||||
result.VerifierStatus = "matched"
|
||||
result.Notes = "官方页面命中活动窗口或促销语义"
|
||||
}
|
||||
case "price_cut", "price_increase":
|
||||
if priceResult, ok := deriveVerifiedPriceEvent(candidate.EventType, facts); ok {
|
||||
result.CandidateStatus = "verified"
|
||||
result.VerificationConfidence = "official_confirmed"
|
||||
result.VerifierStatus = "matched"
|
||||
result.ExtractedFacts = priceResult
|
||||
result.Notes = "官方价格页命中真实价格变化"
|
||||
} else if modelMatched {
|
||||
result.VerifierStatus = "insufficient"
|
||||
result.Notes = "命中模型但缺少可计算的价格变化事实"
|
||||
}
|
||||
case "leak_or_rumor":
|
||||
if modelMatched {
|
||||
result.CandidateStatus = "verified"
|
||||
result.VerificationConfidence = "secondary_confirmed"
|
||||
result.VerifierStatus = "matched"
|
||||
result.Notes = "保留为待确认情报,不进入正式事实层"
|
||||
}
|
||||
}
|
||||
} else if modelMatched {
|
||||
result.CandidateStatus = "verified"
|
||||
result.VerificationConfidence = "secondary_confirmed"
|
||||
result.VerifierStatus = "matched"
|
||||
result.Notes = "仅二手来源命中,不能进入正式事实层"
|
||||
}
|
||||
if result.VerifierStatus == "insufficient" && modelMatched && !isOfficialVerificationSource(source) {
|
||||
result.VerificationConfidence = "secondary_confirmed"
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// isOfficialVerificationSource reports whether source is one of the official
// source classes: official_page, official_docs, official_blog or
// pricing_page.
func isOfficialVerificationSource(source string) bool {
	return source == "official_page" ||
		source == "official_docs" ||
		source == "official_blog" ||
		source == "pricing_page"
}
|
||||
|
||||
func classifyVerificationSource(providerName, rawURL string) string {
|
||||
parsed, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return "secondary_media"
|
||||
}
|
||||
host := strings.ToLower(parsed.Host)
|
||||
path := strings.ToLower(parsed.Path)
|
||||
if isOfficialProviderHost(providerName, host) {
|
||||
switch {
|
||||
case strings.Contains(host, "docs.") || strings.Contains(path, "/docs"):
|
||||
return "official_docs"
|
||||
case strings.Contains(host, "pricing") || strings.Contains(path, "pricing") || strings.Contains(path, "price"):
|
||||
return "pricing_page"
|
||||
case strings.Contains(path, "blog") || strings.Contains(path, "news") || strings.Contains(path, "announcement"):
|
||||
return "official_blog"
|
||||
default:
|
||||
return "official_page"
|
||||
}
|
||||
}
|
||||
return "secondary_media"
|
||||
}
|
||||
|
||||
func isOfficialProviderHost(providerName, host string) bool {
|
||||
tokens := providerHostTokens(providerName)
|
||||
for _, token := range tokens {
|
||||
if token != "" && strings.Contains(host, token) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// providerHostTokens returns the host tokens used to recognize official
// pages of providerName. Matching is case-insensitive and ignores
// surrounding whitespace.
//
// Known providers map to a fixed list of domains. Any other non-blank name
// yields a single token, the normalized name itself. A blank name yields nil.
func providerHostTokens(providerName string) []string {
	key := strings.ToLower(strings.TrimSpace(providerName))
	switch key {
	case "":
		return nil
	case "openai":
		return []string{"openai.com"}
	case "anthropic":
		return []string{"anthropic.com"}
	case "google", "google gemini", "gemini":
		return []string{"google.com", "google.dev", "ai.google.dev"}
	case "deepseek":
		return []string{"deepseek.com", "deepseek.ai"}
	case "qwen", "dashscope":
		return []string{"aliyun.com", "dashscope.com"}
	case "xai":
		return []string{"x.ai"}
	case "智谱":
		return []string{"zhipuai.cn"}
	case "百度", "百度文心":
		return []string{"baidu.com", "cloud.baidu.com"}
	case "腾讯", "腾讯混元":
		return []string{"tencent.com", "cloud.tencent.com"}
	case "minimax":
		return []string{"minimax.io", "minimax.chat"}
	}
	// Unknown provider: fall back to its normalized name as the only token.
	return []string{key}
}
|
||||
|
||||
func bodyMentionsModel(body, modelName, title string) bool {
|
||||
normBody := normalizeEvidenceText(body)
|
||||
for _, candidate := range []string{modelName, title} {
|
||||
normCandidate := normalizeEvidenceText(candidate)
|
||||
if normCandidate != "" && strings.Contains(normBody, normCandidate) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// bodyMentionsPromo reports whether the lowercased body contains any of the
// English or Chinese promotion keywords listed below.
func bodyMentionsPromo(body string) bool {
	keywords := [...]string{"campaign", "promo", "promotion", "discount", "活动", "优惠", "限时", "窗口"}
	text := strings.ToLower(body)
	for i := range keywords {
		if strings.Index(text, keywords[i]) >= 0 {
			return true
		}
	}
	return false
}
|
||||
|
||||
func extractVerificationFacts(body string) map[string]any {
|
||||
facts := map[string]any{}
|
||||
oldPrice, newPrice, ok := extractPricePair(body)
|
||||
if ok {
|
||||
facts["old_input_price"] = oldPrice
|
||||
facts["new_input_price"] = newPrice
|
||||
if oldPrice != 0 {
|
||||
facts["price_change_pct"] = ((newPrice - oldPrice) / oldPrice) * 100
|
||||
}
|
||||
}
|
||||
return facts
|
||||
}
|
||||
|
||||
func deriveVerifiedPriceEvent(eventType string, facts map[string]any) (map[string]any, bool) {
|
||||
oldValue, oldOK := facts["old_input_price"].(float64)
|
||||
newValue, newOK := facts["new_input_price"].(float64)
|
||||
changePct, pctOK := facts["price_change_pct"].(float64)
|
||||
if !oldOK || !newOK || !pctOK || oldValue <= 0 || newValue <= 0 {
|
||||
return nil, false
|
||||
}
|
||||
normalized := normalizeIntradayEventType(eventType)
|
||||
if normalized == "price_cut" && changePct >= 0 {
|
||||
return nil, false
|
||||
}
|
||||
if normalized == "price_increase" && changePct <= 0 {
|
||||
return nil, false
|
||||
}
|
||||
return facts, true
|
||||
}
|
||||
|
||||
// Patterns for "old price"/"from price" and "new price"/"to price" followed
// by an amount with an optional $ or ¥ sign. They are compiled once at
// package scope instead of on every call. The leading \b keeps the keywords
// from matching inside longer words ("Gold price", "Auto price"), which
// previously produced bogus price pairs.
var (
	verificationOldPriceRe = regexp.MustCompile(`(?i)\b(old|from)\s*price[^0-9$¥]*[$¥]?([0-9]+(?:\.[0-9]+)?)`)
	verificationNewPriceRe = regexp.MustCompile(`(?i)\b(new|to)\s*price[^0-9$¥]*[$¥]?([0-9]+(?:\.[0-9]+)?)`)
)

// extractPricePair looks for an old price and a new price in body.
//
// It returns (old, new, true) when both patterns match and both amounts
// parse as numbers; only the first match of each pattern is used. Otherwise
// it returns (0, 0, false).
func extractPricePair(body string) (float64, float64, bool) {
	oldMatch := verificationOldPriceRe.FindStringSubmatch(body)
	newMatch := verificationNewPriceRe.FindStringSubmatch(body)
	// Index 2 is the numeric capture group in both patterns.
	if len(oldMatch) < 3 || len(newMatch) < 3 {
		return 0, 0, false
	}
	var oldValue, newValue float64
	if _, err := fmt.Sscanf(oldMatch[2], "%f", &oldValue); err != nil {
		return 0, 0, false
	}
	if _, err := fmt.Sscanf(newMatch[2], "%f", &newValue); err != nil {
		return 0, 0, false
	}
	return oldValue, newValue, true
}
|
||||
|
||||
// evidenceNoiseRe matches every run of characters that are not lowercase
// ASCII letters, digits or Han characters. It is compiled once at package
// scope; the function previously recompiled it on every call.
var evidenceNoiseRe = regexp.MustCompile(`[^a-z0-9\p{Han}]+`)

// normalizeEvidenceText lowercases value and strips everything except
// [a-z0-9] and Han characters, so texts can be compared regardless of case,
// spacing, punctuation or markup. The result contains no whitespace, so no
// further trimming is needed.
func normalizeEvidenceText(value string) string {
	return evidenceNoiseRe.ReplaceAllString(strings.ToLower(value), "")
}
|
||||
|
||||
func preferVerificationResult(current, next intradayVerificationResult) intradayVerificationResult {
|
||||
if verificationScore(next) > verificationScore(current) {
|
||||
return next
|
||||
}
|
||||
return current
|
||||
}
|
||||
|
||||
func verificationScore(result intradayVerificationResult) int {
|
||||
score := 0
|
||||
switch result.CandidateStatus {
|
||||
case "verified":
|
||||
score += 20
|
||||
case "rejected":
|
||||
score += 5
|
||||
}
|
||||
switch result.VerificationConfidence {
|
||||
case "official_confirmed":
|
||||
score += 10
|
||||
case "secondary_confirmed":
|
||||
score += 5
|
||||
}
|
||||
switch result.VerifierStatus {
|
||||
case "matched":
|
||||
score += 3
|
||||
case "contradicted":
|
||||
score += 1
|
||||
}
|
||||
return score
|
||||
}
|
||||
|
||||
func persistIntradayVerificationResults(ctx context.Context, db *sql.DB, results []intradayVerificationResult) error {
|
||||
for _, result := range results {
|
||||
facts, err := json.Marshal(result.ExtractedFacts)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal extracted facts: %w", err)
|
||||
}
|
||||
_, err = db.ExecContext(ctx, `
|
||||
INSERT INTO intraday_news_verification (
|
||||
candidate_id, verifier_source, verifier_url, verifier_status, extracted_facts, notes
|
||||
) VALUES ($1, NULLIF($2, ''), NULLIF($3, ''), $4, $5::jsonb, NULLIF($6, ''))`,
|
||||
result.CandidateID,
|
||||
result.VerifierSource,
|
||||
result.VerifierURL,
|
||||
result.VerifierStatus,
|
||||
string(facts),
|
||||
result.Notes,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("insert intraday verification: %w", err)
|
||||
}
|
||||
_, err = db.ExecContext(ctx, `
|
||||
UPDATE intraday_news_candidate
|
||||
SET status = $2,
|
||||
verification_confidence = $3,
|
||||
verification_notes = NULLIF($4, ''),
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = $1`,
|
||||
result.CandidateID,
|
||||
result.CandidateStatus,
|
||||
result.VerificationConfidence,
|
||||
result.Notes,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("update intraday candidate: %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func summarizeIntradayVerification(results []intradayVerificationResult, dryRun bool) intradayVerificationSummary {
|
||||
summary := intradayVerificationSummary{CandidateTotal: len(results), DryRun: dryRun}
|
||||
for _, result := range results {
|
||||
if result.CandidateStatus == "verified" {
|
||||
summary.VerifiedTotal++
|
||||
}
|
||||
switch result.VerificationConfidence {
|
||||
case "official_confirmed":
|
||||
summary.OfficialConfirmedTotal++
|
||||
case "secondary_confirmed":
|
||||
summary.SecondaryConfirmedTotal++
|
||||
}
|
||||
if result.CandidateStatus == "rejected" {
|
||||
summary.RejectedTotal++
|
||||
}
|
||||
}
|
||||
return summary
|
||||
}
|
||||
|
||||
func printIntradayVerificationSummary(summary intradayVerificationSummary) error {
|
||||
payload, err := json.Marshal(summary)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Println(string(payload))
|
||||
return nil
|
||||
}
|
||||
99
scripts/verify_intraday_news_candidates_test.go
Normal file
99
scripts/verify_intraday_news_candidates_test.go
Normal file
@@ -0,0 +1,99 @@
|
||||
//go:build llm_script
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestVerifyCandidateDocumentOfficialRelease(t *testing.T) {
|
||||
body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_official_release.html"))
|
||||
if err != nil {
|
||||
t.Fatalf("读取 official release fixture 失败: %v", err)
|
||||
}
|
||||
candidate := verificationCandidateRow{
|
||||
ID: 1,
|
||||
EventType: "official_release",
|
||||
ProviderName: "OpenAI",
|
||||
ModelName: "GPT-5.6",
|
||||
Title: "GPT-5.6 preview pricing update",
|
||||
}
|
||||
result := verifyCandidateDocument(candidate, "https://openai.com/news/gpt-5-6-preview", string(body))
|
||||
if result.CandidateStatus != "verified" || result.VerificationConfidence != "official_confirmed" {
|
||||
t.Fatalf("官方发布应被确认: %+v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyCandidateDocumentPriceCutNeedsRealPriceFacts(t *testing.T) {
|
||||
body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_pricing_page.html"))
|
||||
if err != nil {
|
||||
t.Fatalf("读取 pricing fixture 失败: %v", err)
|
||||
}
|
||||
candidate := verificationCandidateRow{
|
||||
ID: 2,
|
||||
EventType: "price_cut",
|
||||
ProviderName: "DeepSeek",
|
||||
ModelName: "DeepSeek-V4-Flash",
|
||||
Title: "DeepSeek-V4-Flash price cut",
|
||||
}
|
||||
result := verifyCandidateDocument(candidate, "https://deepseek.com/pricing/v4-flash", string(body))
|
||||
if result.CandidateStatus != "verified" || result.VerificationConfidence != "official_confirmed" {
|
||||
t.Fatalf("价格页命中真实价格变化后应确认: %+v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyCandidateDocumentPromoCampaignOfficial(t *testing.T) {
|
||||
body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_pricing_page.html"))
|
||||
if err != nil {
|
||||
t.Fatalf("读取 promo fixture 失败: %v", err)
|
||||
}
|
||||
candidate := verificationCandidateRow{
|
||||
ID: 3,
|
||||
EventType: "promo_campaign",
|
||||
ProviderName: "DeepSeek",
|
||||
ModelName: "DeepSeek-V4-Flash",
|
||||
Title: "DeepSeek V4 Flash campaign",
|
||||
}
|
||||
result := verifyCandidateDocument(candidate, "https://deepseek.com/campaign/v4-flash", string(body))
|
||||
if result.CandidateStatus != "verified" || result.VerificationConfidence != "official_confirmed" {
|
||||
t.Fatalf("官方活动页应被确认: %+v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyCandidateDocumentSecondaryMediaDowngrades(t *testing.T) {
|
||||
body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_secondary_media.html"))
|
||||
if err != nil {
|
||||
t.Fatalf("读取 secondary fixture 失败: %v", err)
|
||||
}
|
||||
candidate := verificationCandidateRow{
|
||||
ID: 4,
|
||||
EventType: "official_release",
|
||||
ProviderName: "OpenAI",
|
||||
ModelName: "GPT-5.6",
|
||||
Title: "GPT-5.6 leak discussion",
|
||||
}
|
||||
result := verifyCandidateDocument(candidate, "https://someblog.example.com/gpt-5-6-leak", string(body))
|
||||
if result.VerificationConfidence != "secondary_confirmed" {
|
||||
t.Fatalf("二手媒体应降级为 secondary_confirmed: %+v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyCandidateDocumentLeakStaysOutOfOfficialFacts(t *testing.T) {
|
||||
body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_secondary_media.html"))
|
||||
if err != nil {
|
||||
t.Fatalf("读取 leak fixture 失败: %v", err)
|
||||
}
|
||||
candidate := verificationCandidateRow{
|
||||
ID: 5,
|
||||
EventType: "leak_or_rumor",
|
||||
ProviderName: "OpenAI",
|
||||
ModelName: "GPT-5.6",
|
||||
Title: "GPT-5.6 leak discussion",
|
||||
}
|
||||
result := verifyCandidateDocument(candidate, "https://someblog.example.com/gpt-5-6-leak", string(body))
|
||||
if result.VerificationConfidence == "official_confirmed" {
|
||||
t.Fatalf("泄露类不应升级为正式事实: %+v", result)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user