diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md index 2aa65b1..42849d1 100644 --- a/DEPLOYMENT.md +++ b/DEPLOYMENT.md @@ -51,6 +51,9 @@ export OPENROUTER_API_KEY="your-api-key" export API_AUTH_TOKEN="replace-with-long-random-token" # 或者:export API_BASIC_AUTH_USER="review" && export API_BASIC_AUTH_PASS="replace-with-password" export FEISHU_WEBHOOK="your-webhook-url" # 可选 +export INTRADAY_DISCOVERY_SEARCH_PROVIDER="command_json" # 候选发现链路可选 +export INTRADAY_DISCOVERY_LLM_PROVIDER="command_json" # 候选归纳链路可选 + ``` @@ -75,6 +78,10 @@ crontab -e # 日内价格追踪(推荐每 4 小时一次) 0 */4 * * * cd /path/to/llm-intelligence && bash scripts/run_intraday_price_watch.sh >> /tmp/llm_hub_intraday.log 2>&1 +# 日内新闻发现与验证(推荐每 2 小时一次) +0 */2 * * * cd /path/to/llm-intelligence && bash scripts/run_intraday_discovery_watch.sh >> /tmp/llm_hub_intraday_discovery.log 2>&1 + + # 真实采集 + 写库 + 报告生成的手动复跑入口 cd /path/to/llm-intelligence && bash scripts/run_real_pipeline.sh ``` @@ -106,6 +113,11 @@ docker-compose up -d | API_RATE_LIMIT_WINDOW_SEC | ❌ | `/api/*` 限流窗口秒数,默认 `60` | | FEISHU_WEBHOOK | ❌ | 飞书告警 Webhook | | REPORT_DATE | ❌ | 手工指定日内追踪/日报日期 | +| INTRADAY_DISCOVERY_SEARCH_PROVIDER / INTRADAY_DISCOVERY_LLM_PROVIDER | 条件必填 | discovery 链路 provider 类型;支持 `fixture` / `command_json` / `http_json` | +| INTRADAY_DISCOVERY_SEARCH_COMMAND / INTRADAY_DISCOVERY_LLM_COMMAND | 条件必填 | 当 provider 为 `command_json` 时执行的命令,stdout 必须输出 JSON | +| INTRADAY_DISCOVERY_SEARCH_URL / INTRADAY_DISCOVERY_LLM_URL | 条件必填 | 当 provider 为 `http_json` 时调用的接口 URL | +| INTRADAY_DISCOVERY_SEARCH_FIXTURE / INTRADAY_DISCOVERY_LLM_FIXTURE | ❌ | dry-run / 本地 fixture 输入 | +| INTRADAY_DISCOVERY_TIMEOUT_SEC | ❌ | discovery 与验证抓取超时秒数,默认 `20` | | PORT | ❌ | API Server 监听端口,默认 8080 | diff --git a/README.md b/README.md index 77410b8..605cb99 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ - 手工复跑使用 `scripts/run_real_pipeline.sh`,不会把产物标记成正式日报 - 历史补跑使用 `scripts/rebuild_historical_report.sh YYYY-MM-DD` - 日内价格追踪使用 
`scripts/run_intraday_price_watch.sh`,只刷新价格与信号,不生成正式日报 +- 日内新闻候选发现与验证使用 `scripts/run_intraday_discovery_watch.sh`,只刷新候选池、验证轨迹与已验证信号,不生成正式日报 - HTTP API 当前未内建认证、授权和限流;公网暴露前必须在网关层补齐 @@ -104,6 +105,7 @@ bash scripts/run_intel_pipeline.sh 3. 平台目录核验 4. 每日关键信号物化到 `daily_signal_snapshot` 5. 日内价格追踪可由 `scripts/run_intraday_price_watch.sh` 独立执行,不生成正式日报 +6. 日内新闻候选发现与验证可由 `scripts/run_intraday_discovery_watch.sh` 独立执行,不生成正式日报 ### 正式日报调度 @@ -125,13 +127,6 @@ bash scripts/run_daily.sh 9. 失败时降级复制昨日报告并可选飞书告警 ### 手工真实复跑 -### 日内价格追踪 - -```bash -bash scripts/run_intraday_price_watch.sh -``` - -适用于捕捉“小米大降价”“活动窗口上线”“泄露情报”等日内价格事件。该入口只刷新价格与信号层,不写正式 `daily_report`,也不会覆盖 `latest_report` 语义。 ```bash bash scripts/run_real_pipeline.sh @@ -143,6 +138,22 @@ bash scripts/run_real_pipeline.sh - `trigger_source=pipeline` - `is_official_daily=false` +### 日内价格追踪 + +```bash +bash scripts/run_intraday_price_watch.sh +``` + +适用于捕捉“小米大降价”“活动窗口上线”等已知入口里的结构化价格变化。该入口只刷新价格与信号层,不写正式 `daily_report`,也不会覆盖 `latest_report` 语义。 + +### 日内新闻发现与验证 + +```bash +bash scripts/run_intraday_discovery_watch.sh +``` + +适用于搜索引擎 + LLM 高召回发现“当天可能发生的价格新闻 / 版本发布 / 活动窗口”,再通过官方页面 / 价格页 / docs 做验证。该入口只刷新候选池、验证轨迹与 `daily_signal_snapshot` 中的已验证事实,不写正式 `daily_report`,也不会覆盖 `latest_report` 语义。 + ### 历史补跑 ```bash diff --git a/db/migrations/017_intraday_news_candidates.sql b/db/migrations/017_intraday_news_candidates.sql new file mode 100644 index 0000000..b722ec5 --- /dev/null +++ b/db/migrations/017_intraday_news_candidates.sql @@ -0,0 +1,106 @@ +-- 日内新闻候选与验证持久化结构 + +CREATE TABLE IF NOT EXISTS intraday_news_candidate ( + id BIGSERIAL PRIMARY KEY, + candidate_date DATE NOT NULL, + discovered_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + event_type TEXT NOT NULL, + provider_name TEXT NOT NULL, + model_name TEXT, + provider_country TEXT, + title TEXT NOT NULL, + summary TEXT, + candidate_urls JSONB NOT NULL DEFAULT '[]'::jsonb, + discovery_source TEXT NOT NULL, + discovery_query TEXT, + discovery_evidence JSONB NOT NULL DEFAULT 
'{}'::jsonb, + normalized_key TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'candidate', + verification_confidence TEXT NOT NULL DEFAULT 'candidate', + verification_notes TEXT, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_constraint WHERE conname = 'chk_intraday_news_candidate_status' + ) THEN + ALTER TABLE intraday_news_candidate + ADD CONSTRAINT chk_intraday_news_candidate_status + CHECK (status IN ('candidate', 'verifying', 'verified', 'rejected', 'stale')); + END IF; + + IF NOT EXISTS ( + SELECT 1 FROM pg_constraint WHERE conname = 'chk_intraday_news_candidate_confidence' + ) THEN + ALTER TABLE intraday_news_candidate + ADD CONSTRAINT chk_intraday_news_candidate_confidence + CHECK (verification_confidence IN ('candidate', 'secondary_confirmed', 'official_confirmed')); + END IF; +END +$$; + +CREATE UNIQUE INDEX IF NOT EXISTS idx_intraday_news_candidate_normalized_key + ON intraday_news_candidate(normalized_key); +CREATE INDEX IF NOT EXISTS idx_intraday_news_candidate_date + ON intraday_news_candidate(candidate_date DESC, discovered_at DESC); +CREATE INDEX IF NOT EXISTS idx_intraday_news_candidate_status + ON intraday_news_candidate(status); +CREATE INDEX IF NOT EXISTS idx_intraday_news_candidate_provider_event + ON intraday_news_candidate(provider_name, event_type, candidate_date DESC); + +COMMENT ON TABLE intraday_news_candidate IS '搜索引擎与 LLM 发现的日内新闻候选池,尚未直接进入正式日报事实层'; +COMMENT ON COLUMN intraday_news_candidate.candidate_urls IS '候选来源 URL 数组,按发现层输出原样保留'; +COMMENT ON COLUMN intraday_news_candidate.discovery_evidence IS '发现阶段原始证据 JSONB,例如搜索命中、LLM 归纳结果'; +COMMENT ON COLUMN intraday_news_candidate.normalized_key IS '同日同事件的去重键,避免重复发现候选'; + +CREATE TABLE IF NOT EXISTS intraday_news_verification ( + id BIGSERIAL PRIMARY KEY, + candidate_id BIGINT NOT NULL REFERENCES intraday_news_candidate(id) ON DELETE CASCADE, + verified_at TIMESTAMP NOT 
NULL DEFAULT CURRENT_TIMESTAMP, + verifier_source TEXT NOT NULL, + verifier_url TEXT, + verifier_status TEXT NOT NULL, + extracted_facts JSONB NOT NULL DEFAULT '{}'::jsonb, + notes TEXT, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_constraint WHERE conname = 'chk_intraday_news_verification_status' + ) THEN + ALTER TABLE intraday_news_verification + ADD CONSTRAINT chk_intraday_news_verification_status + CHECK (verifier_status IN ('matched', 'contradicted', 'insufficient', 'error')); + END IF; +END +$$; + +CREATE INDEX IF NOT EXISTS idx_intraday_news_verification_candidate_verified_at + ON intraday_news_verification(candidate_id, verified_at DESC); +CREATE INDEX IF NOT EXISTS idx_intraday_news_verification_source + ON intraday_news_verification(verifier_source); +CREATE INDEX IF NOT EXISTS idx_intraday_news_verification_status + ON intraday_news_verification(verifier_status); + +COMMENT ON TABLE intraday_news_verification IS '日内新闻候选的验证轨迹,记录验证来源、状态和提取事实'; +COMMENT ON COLUMN intraday_news_verification.extracted_facts IS '验证阶段提取出的结构化事实 JSONB'; + +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 + FROM pg_trigger + WHERE tgname = 'intraday_news_candidate_updated_at' + ) THEN + CREATE TRIGGER intraday_news_candidate_updated_at + BEFORE UPDATE ON intraday_news_candidate + FOR EACH ROW + EXECUTE FUNCTION update_updated_at_column(); + END IF; +END +$$; diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index ba21ceb..4c00cd0 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -27,11 +27,20 @@ | `API_RATE_LIMIT_WINDOW_SEC` | 否 | `cmd/server/main.go` | `60` | `/api/*` 限流窗口长度(秒) | | `FEISHU_WEBHOOK` | 否 | `run_daily.sh`、`feishu_alert.sh` | 空 | 正式日报失败时发送飞书告警 | | `REPORT_OUTPUT_DIR` | 否 | `generate_daily_report.go` | `reports/daily` | 日报主产物输出目录 | -| `REPORT_DATE` | 否 | `generate_daily_report.go`、`rebuild_historical_report.sh`、`run_intraday_price_watch.sh` | 当天日期 | 指定日报或日内价格追踪的日期,格式 `YYYY-MM-DD` | 
+| `REPORT_DATE` | 否 | `generate_daily_report.go`、`rebuild_historical_report.sh`、`run_intraday_price_watch.sh`、`run_intraday_discovery_watch.sh` | 当天日期 | 指定日报或日内链路日期,格式 `YYYY-MM-DD` | | `REPORT_RUN_KIND` | 否 | `generate_daily_report.go` | `manual` | 运行语义,如 `scheduled` / `manual` / `historical_rebuild` | -| `REPORT_TRIGGER_SOURCE` | 否 | `generate_daily_report.go`、`materialize_daily_signals.go` | `cli` | 触发来源,如 `cron` / `pipeline` / `intraday` / `rebuild_script` | +| `REPORT_TRIGGER_SOURCE` | 否 | `generate_daily_report.go`、`materialize_daily_signals.go` | `cli` | 触发来源,如 `cron` / `pipeline` / `intraday` / `intraday_discovery` / `rebuild_script` | | `REPORT_IS_OFFICIAL_DAILY` | 否 | `generate_daily_report.go` | `false` | 是否属于正式日报产出 | | `REPORT_RUNTIME_AUDIT` | 否 | `generate_daily_report.go` | 空 | 来源级运行审计摘要,通常由流水线脚本注入 | +| `INTRADAY_DISCOVERY_SEARCH_PROVIDER` | 条件必填 | `discover_intraday_news_candidates.go`、`run_intraday_discovery_watch.sh` | 空 | 候选发现搜索 provider 类型;计划支持 `fixture` / `command_json` / `http_json` | +| `INTRADAY_DISCOVERY_SEARCH_COMMAND` | 条件必填 | `discover_intraday_news_candidates.go` | 空 | 当 `INTRADAY_DISCOVERY_SEARCH_PROVIDER=command_json` 时执行的搜索命令,stdout 必须输出 JSON 数组 | +| `INTRADAY_DISCOVERY_SEARCH_URL` | 条件必填 | `discover_intraday_news_candidates.go` | 空 | 当 `INTRADAY_DISCOVERY_SEARCH_PROVIDER=http_json` 时调用的搜索接口 URL | +| `INTRADAY_DISCOVERY_SEARCH_FIXTURE` | 否 | `discover_intraday_news_candidates.go` | 空 | 搜索 provider 样例文件,用于 dry-run / 本地测试 | +| `INTRADAY_DISCOVERY_LLM_PROVIDER` | 条件必填 | `discover_intraday_news_candidates.go`、`run_intraday_discovery_watch.sh` | 空 | 候选归纳 LLM provider 类型;计划支持 `fixture` / `command_json` / `http_json` | +| `INTRADAY_DISCOVERY_LLM_COMMAND` | 条件必填 | `discover_intraday_news_candidates.go` | 空 | 当 `INTRADAY_DISCOVERY_LLM_PROVIDER=command_json` 时执行的 LLM 命令,stdout 必须输出 JSON 数组 | +| `INTRADAY_DISCOVERY_LLM_URL` | 条件必填 | `discover_intraday_news_candidates.go` | 空 | 当 `INTRADAY_DISCOVERY_LLM_PROVIDER=http_json` 时调用的 LLM 接口 URL | +| 
`INTRADAY_DISCOVERY_LLM_FIXTURE` | 否 | `discover_intraday_news_candidates.go` | 空 | LLM provider 样例文件,用于 dry-run / 本地测试 | +| `INTRADAY_DISCOVERY_TIMEOUT_SEC` | 否 | `discover_intraday_news_candidates.go`、`verify_intraday_news_candidates.go` | `20` | discovery provider 与验证抓取的默认超时秒数 | | `PHASE6_PORT` | 否 | `verify_phase6.sh` | 自动挑选 `18080-18120` | Phase 6 验收时临时启动 API Server 的端口 | | `LIGHTHOUSE_PORT` | 否 | `verify_lighthouse.sh` | `4173` | Lighthouse 预览端口 | | `LIGHTHOUSE_SCORE_THRESHOLD` | 否 | `verify_lighthouse.sh` | `80` | 前端性能分数门槛 | @@ -83,6 +92,23 @@ bash scripts/run_intraday_price_watch.sh - 不生成正式 HTML / Markdown 日报 - 推荐先按每 4 小时一次调度,再根据外部源稳定性决定是否收紧到每 2 小时 +### 日内候选发现与验证 + +```bash +export DATABASE_URL="postgres://app_user:***@db:5432/llm_intelligence?sslmode=disable" +export INTRADAY_DISCOVERY_SEARCH_PROVIDER="command_json" +export INTRADAY_DISCOVERY_SEARCH_COMMAND="/usr/local/bin/intraday-search --date $REPORT_DATE" +export INTRADAY_DISCOVERY_LLM_PROVIDER="command_json" +export INTRADAY_DISCOVERY_LLM_COMMAND="/usr/local/bin/intraday-llm --date $REPORT_DATE" +bash scripts/run_intraday_discovery_watch.sh +``` + +说明: +- 该入口只刷新候选池、验证轨迹与 `daily_signal_snapshot` 中的已验证事实 +- 它不会直接写 `daily_report`,不会覆盖 `/api/v1/reports/latest` 对应的正式日报 +- 搜索 / LLM provider 缺失时应明确报前置条件错误,不能伪装成“今日无新闻” +- `leak_or_rumor` 默认留在候选层,不进入正式日报事实 + ## 日报运行语义 项目用以下字段区分正式日报、手工复跑和历史补跑: diff --git a/docs/PRODUCTION_CHECKLIST.md b/docs/PRODUCTION_CHECKLIST.md index 8b83d92..863d17f 100644 --- a/docs/PRODUCTION_CHECKLIST.md +++ b/docs/PRODUCTION_CHECKLIST.md @@ -59,9 +59,11 @@ - 手工复跑命令已确定:`bash scripts/run_real_pipeline.sh` - 历史补跑命令已确定:`bash scripts/rebuild_historical_report.sh YYYY-MM-DD` - 日内价格追踪命令已确定:`bash scripts/run_intraday_price_watch.sh` - +- 日内新闻发现与验证命令已确定:`bash scripts/run_intraday_discovery_watch.sh` - `OPENROUTER_API_KEY` 已在正式调度环境可用 - `FEISHU_WEBHOOK` 已配置或明确不上告警 +- 候选发现所需 search / LLM provider 已配置,缺失时会以前置条件错误失败,不会伪装成“无新闻” + ### 安全与访问控制 @@ -141,6 +143,8 @@ bash scripts/run_real_pipeline.sh ``` # 
日内价格追踪(推荐) 0 */4 * * * cd /path/to/llm-intelligence && bash scripts/run_intraday_price_watch.sh >> /tmp/llm_hub_intraday.log 2>&1 +# 日内新闻发现与验证(推荐) +0 */2 * * * cd /path/to/llm-intelligence && bash scripts/run_intraday_discovery_watch.sh >> /tmp/llm_hub_intraday_discovery.log 2>&1 ### 7. 线上冒烟 diff --git a/docs/plans/2026-05-25-intraday-discovery-verification-implementation-plan.md b/docs/plans/2026-05-25-intraday-discovery-verification-implementation-plan.md new file mode 100644 index 0000000..2d17c63 --- /dev/null +++ b/docs/plans/2026-05-25-intraday-discovery-verification-implementation-plan.md @@ -0,0 +1,420 @@ +# Intraday Discovery + Verification Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** 在不污染正式日报语义的前提下,为现有日内链路增加“搜索引擎 + 大模型候选发现层”和“官方来源验证层”,让当天的大模型价格新闻、版本发布、活动窗口能更早进入候选池,并只把已验证事实接入现有 `daily_signal_snapshot` / 日报语义链路。 + +**Architecture:** 保留现有 `scripts/run_intraday_price_watch.sh` 作为结构化价格事实刷新入口,不改它“只刷新价格/信号、不生成正式日报”的边界。新增一条独立的 `run_intraday_discovery_watch.sh` 发现链路:先用搜索引擎与 LLM 生成候选事件,再通过官方页面 / 价格页 / docs / 公告页做二次验证。候选与验证结果分别落入新表;只有 `official_confirmed` 的事件才允许映射进 `materialize_daily_signals.go` 的 `signalModelEvent`,并由现有 `generate_daily_report.go` 继续消费,不新造第二套日报事实系统。发现层与验证层必须通过仓库内可运行的 provider adapter 落地,不能依赖当前会话专属工具;实现上采用“命令或 HTTP provider 适配层 + fixture 测试”的方式,确保本地 cron 和 CI 环境可执行。已验证 discovery 事件接入现有事件流时必须去重:若同一 `provider + model + event_type + date` 已由 importer / 原生 loader 给出,则以原生事实为准,discovery 事件只补缺,不覆盖。 + +**Tech Stack:** Go 1.22、PostgreSQL、Bash、可配置搜索/LLM provider adapter、JSONB + +--- + +### Task 1: 为候选发现与验证链路定义持久化结构 + +**Files:** +- Create: `db/migrations/017_intraday_news_candidates.sql` +- Modify: `docs/CONFIGURATION.md` +- Modify: `DEPLOYMENT.md` + +**Step 1: 新增候选表与验证表 migration** + +创建两张表: +- `intraday_news_candidate` +- `intraday_news_verification` + +候选表至少包含: +- `candidate_date` +- `event_type` +- `provider_name` +- `model_name` +- `provider_country` +- 
`title` +- `summary` +- `candidate_urls JSONB` +- `discovery_source` +- `discovery_query` +- `discovery_evidence JSONB` +- `normalized_key` +- `status` +- `verification_confidence` +- `verification_notes` + +验证表至少包含: +- `candidate_id` +- `verifier_source` +- `verifier_url` +- `verifier_status` +- `extracted_facts JSONB` +- `notes` + +约束: +- `intraday_news_candidate.normalized_key` 必须唯一,用于防止同日重复发现 +- `status` 至少支持:`candidate` / `verifying` / `verified` / `rejected` / `stale` +- `verification_confidence` 至少支持:`candidate` / `secondary_confirmed` / `official_confirmed` + +**Step 2: 明确与正式事实层的边界文档** + +在 `docs/CONFIGURATION.md` 和 `DEPLOYMENT.md` 写明: +- 候选发现层不会直接写 `daily_report` +- 候选发现层不会覆盖 `latest_report` +- `daily_signal_snapshot` 只消费已验证事实,不消费 `candidate_only` +- `leak_or_rumor` 默认只保留在候选层,不进入正式日报事实 + +**Step 3: 运行 migration 验证** + +Run: +- `bash scripts/apply_migration.sh` + +Expected: +- 新表创建成功 +- 重复执行 migration 不报错 + +**Step 4: Commit** + +```bash +git add db/migrations/017_intraday_news_candidates.sql docs/CONFIGURATION.md DEPLOYMENT.md +git commit -m "feat(intraday): add candidate and verification persistence" +``` + +--- + +### Task 2: 实现候选发现层最小闭环 + +**Files:** +- Create: `scripts/discover_intraday_news_candidates.go` +- Create: `scripts/discover_intraday_news_candidates_test.go` +- Create: `scripts/testdata/intraday_discovery_search_sample.json` +- Create: `scripts/testdata/intraday_discovery_llm_sample.json` +- Modify: `docs/CONFIGURATION.md` +- Create: `scripts/intraday_discovery_provider.go` + +**Step 1: 先写失败测试** + +补 4 组测试: +- 搜索结果解析测试:验证能从样例结果提取 title / summary / url / provider 线索 +- LLM 输出解析测试:验证能把 LLM JSON 输出转成候选事件 +- 候选归一化测试:验证同一事件经过标题差异改写后仍生成同一 `normalized_key` +- URL 过滤测试:验证没有 URL 的候选被丢弃,避免 LLM 空口造线索 + +**Step 2: 运行失败测试** + +Run: +- `go test -count=1 -tags llm_script ./scripts/discover_intraday_news_candidates.go ./scripts/discover_intraday_news_candidates_test.go` + +Expected: +- 新增测试失败 +- 失败原因是缺少解析、归一化或去重逻辑 + +**Step 3: 实现最小候选发现器** + +在 
`discover_intraday_news_candidates.go` 中实现: +- 固定 provider 查询模板集(中英双语) +- 搜索结果抓取适配层 +- LLM 候选摘要适配层 +- 去重与归一化逻辑 +- 写入 `intraday_news_candidate` +- provider adapter 抽象层(搜索 / LLM 均可通过命令或 HTTP provider 接入,默认实现不可依赖当前会话专属工具) + +限制: +- LLM 只允许输出候选,不允许直接标成 `verified` +- 无 URL 候选直接丢弃 +- 搜索 / LLM provider 未配置时必须以前置条件错误退出,不能伪装成业务无新闻 +- 默认事件类型至少支持: + - `price_cut` + - `price_increase` + - `official_release` + - `promo_campaign` + - `leak_or_rumor` + - `unknown` + +**Step 4: 重新运行测试** + +Run: +- `go test -count=1 -tags llm_script ./scripts/discover_intraday_news_candidates.go ./scripts/discover_intraday_news_candidates_test.go` + +Expected: +- 候选解析与归一化测试通过 + +**Step 5: 运行一次 dry-run 验证** + +Run(前置条件:search / LLM provider 已配置,本地可用 `fixture` 模式并设置对应的 `INTRADAY_DISCOVERY_*_FIXTURE`;未配置时命令会直接以前置条件错误退出,dry-run 也不例外): +- `go run -tags llm_script ./scripts/discover_intraday_news_candidates.go --date=2026-05-25 --dry-run` + +Expected: +- 输出 `candidate_total` / `provider_hit_count` / `event_type_counts` +- dry-run 不写 `daily_report` +- dry-run 不改 `latest_report` + +**Step 6: Commit** + +```bash +git add scripts/discover_intraday_news_candidates.go scripts/discover_intraday_news_candidates_test.go scripts/intraday_discovery_provider.go scripts/testdata/intraday_discovery_search_sample.json scripts/testdata/intraday_discovery_llm_sample.json docs/CONFIGURATION.md +git commit -m "feat(intraday): add news candidate discovery pipeline" +``` + +--- + +### Task 3: 实现候选验证层并固化“只信官方事实”的规则 + +**Files:** +- Create: `scripts/verify_intraday_news_candidates.go` +- Create: `scripts/verify_intraday_news_candidates_test.go` +- Create: `scripts/testdata/intraday_verification_official_release.html` +- Create: `scripts/testdata/intraday_verification_pricing_page.html` +- Create: `scripts/testdata/intraday_verification_secondary_media.html` +- Modify: `docs/CONFIGURATION.md` + +**Step 1: 先写失败测试** + +补 5 组测试: +- 官方发布页验证测试:命中模型名与发布时间时,产出 `official_confirmed` +- 官方价格页验证测试:只有拿到真实价格变化时,才允许产出 `price_cut` / `price_increase` +- 活动页验证测试:官方活动页可映射为 `promo_campaign` +- 二手媒体降级测试:二手媒体最多得到 `secondary_confirmed`,不能直接进入正式事实层 +- 泄露类隔离测试:`leak_or_rumor` 即使有外部讨论,也不会升级为正式日报事实 + +**Step 
2: 运行失败测试** + +Run: +- `go test -count=1 -tags llm_script ./scripts/verify_intraday_news_candidates.go ./scripts/verify_intraday_news_candidates_test.go` + +Expected: +- 新增测试失败 +- 失败原因是缺少来源分类与验证状态映射逻辑 + +**Step 3: 实现验证器** + +在 `verify_intraday_news_candidates.go` 中实现: +- 读取 `candidate` / `verifying` 状态候选 +- 拉取 `candidate_urls` +- 基于域名与页面内容判定: + - `official_page` + - `pricing_page` + - `official_docs` + - `official_blog` + - `secondary_media` +- 把验证轨迹写入 `intraday_news_verification` +- 更新 `intraday_news_candidate.status` 与 `verification_confidence` +- 验证成功后只更新候选层状态,不直接写 `daily_signal_snapshot`;正式事实仍统一由物化器汇总 + +规则: +- 只有官方页面 / 价格页 / docs / 公告页可以产出 `official_confirmed` +- 价格新闻若无法拿到真实价格事实,只能维持候选或二级确认,不能伪造价格变化事件 +- `leak_or_rumor` 默认不升级为正式事实 + +**Step 4: 重新运行测试** + +Run: +- `go test -count=1 -tags llm_script ./scripts/verify_intraday_news_candidates.go ./scripts/verify_intraday_news_candidates_test.go` + +Expected: +- 验证规则测试通过 + +**Step 5: 运行一次 dry-run 验证** + +Run: +- `go run -tags llm_script ./scripts/verify_intraday_news_candidates.go --date=2026-05-25 --dry-run` + +Expected: +- 输出 `verified_total` / `official_confirmed_total` / `secondary_confirmed_total` +- dry-run 只打印摘要,不写 `daily_report` + +**Step 6: Commit** + +```bash +git add scripts/verify_intraday_news_candidates.go scripts/verify_intraday_news_candidates_test.go scripts/testdata/intraday_verification_official_release.html scripts/testdata/intraday_verification_pricing_page.html scripts/testdata/intraday_verification_secondary_media.html docs/CONFIGURATION.md +git commit -m "feat(intraday): add candidate verification pipeline" +``` + +--- + +### Task 4: 把已验证事件接入现有 `materialize_daily_signals.go` + +**Files:** +- Modify: `scripts/materialize_daily_signals.go` +- Create or Modify: `scripts/materialize_daily_signals_test.go` +- Modify: `docs/plans/2026-05-27-intraday-price-watch-plan.md` +- Modify: `README.md` +- Modify: `docs/PRODUCTION_CHECKLIST.md` + +**Step 1: 先写失败测试** + +补 4 组测试: +- 已验证官方发布事件会进入 
`daily_signal_snapshot.top_events` +- 已验证活动事件会进入 `daily_signal_snapshot.top_events` +- `candidate_only` 与 `leak_or_rumor` 不进入正式快照 +- 未拿到真实价格变化数据的“价格新闻”不会被错误映射为 `price_cut` / `price_increase` + +**Step 2: 运行失败测试** + +Run: +- `go test -count=1 -tags llm_script ./scripts/materialize_daily_signals.go ./scripts/materialize_daily_signals_test.go` + +Expected: +- 新增测试失败 +- 失败原因是当前物化器还不会读取已验证候选事件 + +**Step 3: 最小实现 verified event loader** + +在 `materialize_daily_signals.go` 中新增: +- `loadVerifiedIntradayNewsEvents(db, date string)` +- 将 `official_confirmed` 的: + - `official_release` + - `promo_campaign` + - 已确认真实价格变化的 `price_cut` / `price_increase` + 映射为现有 `signalModelEvent` +- 与现有 `loadSignalModelEvents` 结果做去重合并;同日同模型同事件类型若已由 importer / 原生 loader 给出,则 discovery 事件仅补 `SourceURL` / 证据缺口,不抢占优先级 + +约束: +- 不新造第二套快照表 +- 不改变 `daily_signal_snapshot` 的正式事实语义 +- `secondary_confirmed` 默认不进入正式快照 + +**Step 4: 重新运行测试** + +Run: +- `go test -count=1 -tags llm_script ./scripts/materialize_daily_signals.go ./scripts/materialize_daily_signals_test.go` + +Expected: +- verified event 相关测试通过 + +**Step 5: 联合验证日内边界** + +Run: +- `REPORT_TRIGGER_SOURCE=intraday_discovery go run -tags llm_script ./scripts/materialize_daily_signals.go --date=2026-05-25 --dry-run` + +Expected: +- 输出含 `page_mode` / `event_count` +- 不写 `daily_report` +- 不覆盖 `latest_report` + +**Step 6: Commit** + +```bash +git add scripts/materialize_daily_signals.go scripts/materialize_daily_signals_test.go README.md docs/PRODUCTION_CHECKLIST.md docs/plans/2026-05-27-intraday-price-watch-plan.md +git commit -m "feat(intraday): materialize verified discovery events" +``` + +--- + +### Task 5: 组装新的日内发现入口并补部署说明 + +**Files:** +- Create: `scripts/run_intraday_discovery_watch.sh` +- Modify: `README.md` +- Modify: `docs/CONFIGURATION.md` +- Modify: `DEPLOYMENT.md` +- Modify: `docs/PRODUCTION_CHECKLIST.md` + +**Step 1: 实现独立入口脚本** + +脚本顺序固定为: +1. `discover_intraday_news_candidates.go` +2. `verify_intraday_news_candidates.go` +3. 
`materialize_daily_signals.go`(仅消费 verified 事件) + +要求: +- 明确要求 `DATABASE_URL` +- 搜索 / LLM 所需 key 缺失时,输出前置条件错误,不伪装成代码失败 +- 不执行 `generate_daily_report.go` +- 不写 `daily_report` +- 不覆盖 `latest_report` + +**Step 2: 更新调度文档** + +文档里明确两条 cron: +- 结构化价格刷新:`run_intraday_price_watch.sh` +- 新闻发现与验证:`run_intraday_discovery_watch.sh` + +推荐起步频率: +- `run_intraday_discovery_watch.sh`:每 2 小时一次 +- `run_intraday_price_watch.sh`:每 4 小时一次 + +**Step 3: 运行脚本级 dry-run** + +Run: +- `bash scripts/run_intraday_discovery_watch.sh --dry-run` + +Expected: +- 输出候选发现摘要 + 验证摘要 + 信号物化摘要 +- 不生成正式日报产物 + +**Step 4: Commit** + +```bash +git add scripts/run_intraday_discovery_watch.sh README.md docs/CONFIGURATION.md DEPLOYMENT.md docs/PRODUCTION_CHECKLIST.md +git commit -m "feat(intraday): add discovery watch runner" +``` + +--- + +### Task 6: 运行最终联合验收并准备本地提交 + +**Files:** +- Modify: `README.md`(仅在最终说明缺失时) +- Modify: `docs/CONFIGURATION.md`(仅在最终说明缺失时) +- Modify: `DEPLOYMENT.md`(仅在最终说明缺失时) + +**Step 1: 运行 focused Go tests** + +Run: +- `go test -count=1 -tags llm_script ./scripts/discover_intraday_news_candidates.go ./scripts/discover_intraday_news_candidates_test.go` +- `go test -count=1 -tags llm_script ./scripts/verify_intraday_news_candidates.go ./scripts/verify_intraday_news_candidates_test.go` +- `go test -count=1 -tags llm_script ./scripts/materialize_daily_signals.go ./scripts/materialize_daily_signals_test.go` + +Expected: +- 发现层、验证层、信号物化层 focused tests 全通过 + +**Step 2: 运行现有日报/前端回归边界** + +Run: +- `go test -count=1 -tags llm_script ./scripts/generate_daily_report.go ./scripts/generate_daily_report_test.go ./scripts/official_import_signature_audit_query_lib.go` +- `bash scripts/secret_gate_test.sh` +- `bash scripts/test_importers.sh` +- `cd frontend && npm test -- --run` +- `cd frontend && npm run build` + +Expected: +- 原有日报与前端链路不回归 +- discovery 新增能力不污染正式日报边界 + +**Step 3: 运行脚本级联合 dry-run** + +Run: +- `bash scripts/run_intraday_discovery_watch.sh --dry-run` +- `REPORT_TRIGGER_SOURCE=intraday go run 
-tags llm_script ./scripts/materialize_daily_signals.go --date=2026-05-25 --dry-run` + +Expected: +- 不写 `daily_report` +- 不覆盖 `latest_report` +- 能稳定输出候选数、验证数、事件数、page_mode、source_audit + +**Step 4: 本地提交** + +```bash +git add db/migrations/017_intraday_news_candidates.sql scripts/discover_intraday_news_candidates.go scripts/discover_intraday_news_candidates_test.go scripts/intraday_discovery_provider.go scripts/verify_intraday_news_candidates.go scripts/verify_intraday_news_candidates_test.go scripts/materialize_daily_signals.go scripts/materialize_daily_signals_test.go scripts/run_intraday_discovery_watch.sh README.md docs/CONFIGURATION.md DEPLOYMENT.md docs/PRODUCTION_CHECKLIST.md docs/plans/2026-05-25-intraday-discovery-verification-implementation-plan.md docs/plans/2026-05-27-intraday-price-watch-plan.md +git commit -m "feat(intraday): add discovery and verification watch pipeline" +``` + +--- + +## 验收标准 + +实现完成后,必须同时满足: +- 搜索 + LLM 只能产生候选事件,不能直接写成正式日报事实 +- 只有 `official_confirmed` 的事件才能进入正式 `daily_signal_snapshot` 语义链路 +- `leak_or_rumor` 不进入正式日报事实层 +- `run_intraday_discovery_watch.sh` 与 `run_intraday_price_watch.sh` 职责分离 +- 正式日报仍只由 `run_daily.sh` 负责 +- 新增链路不会写 `daily_report`、不会覆盖 `latest_report` +- discovery provider adapter 在无配置时会明确报前置条件错误;有 fixture / dry-run 模式可本地验证 +- 新增 focused tests、现有日报测试、前端构建全部通过 + +## 非目标 + +本计划刻意不做: +- 不新增第二套正式日报系统 +- 不让 LLM 直接替代价格 importer 或官方发布 importer +- 不把二手媒体新闻直接映射为 `price_cut` / `price_increase` +- 不在第一阶段引入新的前端“候选情报面板”复杂交互;若后续需要,单独立计划 diff --git a/docs/plans/2026-05-27-intraday-price-watch-plan.md b/docs/plans/2026-05-27-intraday-price-watch-plan.md index 1b244e5..78711b6 100644 --- a/docs/plans/2026-05-27-intraday-price-watch-plan.md +++ b/docs/plans/2026-05-27-intraday-price-watch-plan.md @@ -55,6 +55,6 @@ ## 下一步建议 -1. 把前端查询页增加“最近一次价格追踪时间”提示 -2. 给 `materialize_daily_signals.go` 增加 `trigger_source=intraday` 的文档说明 -3. 如果日内事件仍不够敏感,再考虑引入独立 `intraday_signal_snapshot` 表 +1. 为 `run_intraday_discovery_watch.sh` 补充生产级 provider adapter 和调度说明 +2. 
给前端查询页增加“最近一次价格追踪时间 / 最近一次 discovery 验证时间”提示 +3. 如果日内事件仍不够敏感,再考虑引入独立 `intraday_signal_snapshot` 或候选情报面板 diff --git a/scripts/discover_intraday_news_candidates.go b/scripts/discover_intraday_news_candidates.go new file mode 100644 index 0000000..d89e653 --- /dev/null +++ b/scripts/discover_intraday_news_candidates.go @@ -0,0 +1,410 @@ +//go:build llm_script + +package main + +import ( + "context" + "database/sql" + "encoding/json" + "flag" + "fmt" + "log/slog" + "os" + "sort" + "strings" + "time" + + _ "github.com/lib/pq" +) + +type intradayNewsCandidate struct { + CandidateDate string + EventType string + ProviderName string + ModelName string + ProviderCountry string + Title string + Summary string + CandidateURLs []string + DiscoverySource string + DiscoveryQuery string + DiscoveryEvidence map[string]any + NormalizedKey string + Status string + VerificationConfidence string + VerificationNotes string +} + +type intradayDiscoveryConfig struct { + Date string + DryRun bool + Search intradayProviderConfig + LLM intradayProviderConfig + DatabaseURL string + Timeout time.Duration + ProviderLimit int +} + +type intradayDiscoverySummary struct { + CandidateTotal int `json:"candidate_total"` + ProviderHitCount int `json:"provider_hit_count"` + EventTypeCounts map[string]int `json:"event_type_counts"` + DiscoverySourceSet []string `json:"discovery_source_set"` + DryRun bool `json:"dry_run"` +} + +var intradayDiscoveryLogger *slog.Logger + +func init() { + intradayDiscoveryLogger = slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo})) +} + +func main() { + loadIntradayEnv() + cfg := loadIntradayDiscoveryConfig() + if err := runIntradayCandidateDiscovery(cfg); err != nil { + fmt.Fprintf(os.Stderr, "discover_intraday_news_candidates: %v\n", err) + os.Exit(1) + } +} + +func loadIntradayDiscoveryConfig() intradayDiscoveryConfig { + var cfg intradayDiscoveryConfig + flag.StringVar(&cfg.Date, "date", intradayDateValue(), "候选发现日期,格式 YYYY-MM-DD") + 
flag.BoolVar(&cfg.DryRun, "dry-run", false, "仅输出摘要,不写数据库") + flag.IntVar(&cfg.ProviderLimit, "provider-limit", 10, "最大 provider 数") + flag.Parse() + + cfg.DatabaseURL = intradayDefaultDSN() + cfg.Timeout = discoveryTimeoutFromEnv() + cfg.Search = intradayProviderConfig{ + Mode: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_SEARCH_PROVIDER")), + Command: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_SEARCH_COMMAND")), + URL: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_SEARCH_URL")), + Fixture: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_SEARCH_FIXTURE")), + Timeout: cfg.Timeout, + } + cfg.LLM = intradayProviderConfig{ + Mode: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_LLM_PROVIDER")), + Command: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_LLM_COMMAND")), + URL: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_LLM_URL")), + Fixture: strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_LLM_FIXTURE")), + Timeout: cfg.Timeout, + } + return cfg +} + +func runIntradayCandidateDiscovery(cfg intradayDiscoveryConfig) error { + if strings.TrimSpace(cfg.Date) == "" { + return fmt.Errorf("date 未设置") + } + if err := validateIntradayProviderConfig("search", cfg.Search); err != nil { + return err + } + if err := validateIntradayProviderConfig("llm", cfg.LLM); err != nil { + return err + } + queries := buildIntradayQueries(cfg.Date, cfg.ProviderLimit) + searchRecords, err := loadIntradaySearchRecords(cfg.Search, cfg.Date, queries) + if err != nil { + return err + } + llmRecords, err := loadIntradayLLMRecords(cfg.LLM, cfg.Date, searchRecords) + if err != nil { + return err + } + candidates := normalizeIntradayCandidates(cfg.Date, searchRecords, llmRecords) + summary := summarizeIntradayCandidates(candidates, cfg.DryRun) + if cfg.DryRun { + return printIntradayDiscoverySummary(summary) + } + + db, err := sql.Open("postgres", cfg.DatabaseURL) + if err != nil { + return fmt.Errorf("open db: %w", err) + } + defer db.Close() + if err := 
upsertIntradayCandidates(context.Background(), db, candidates); err != nil { + return err + } + return printIntradayDiscoverySummary(summary) +} + +func validateIntradayProviderConfig(name string, cfg intradayProviderConfig) error { + if strings.TrimSpace(cfg.Mode) == "" { + return fmt.Errorf("%s provider 未设置", name) + } + switch cfg.Mode { + case "fixture": + if strings.TrimSpace(cfg.Fixture) == "" { + return fmt.Errorf("%s provider fixture 未设置", name) + } + case "command_json": + if strings.TrimSpace(cfg.Command) == "" { + return fmt.Errorf("%s provider command 未设置", name) + } + case "http_json": + if strings.TrimSpace(cfg.URL) == "" { + return fmt.Errorf("%s provider url 未设置", name) + } + default: + return fmt.Errorf("%s provider mode 不支持: %s", name, cfg.Mode) + } + return nil +} + +func buildIntradayQueries(date string, providerLimit int) []string { + providers := []string{ + "OpenAI", "Anthropic", "Google Gemini", "xAI", "DeepSeek", + "DashScope", "Qwen", "智谱", "百度文心", "腾讯混元", "火山方舟", "MiniMax", + } + keywords := []string{"pricing release announcement", "模型 降价 发布 活动"} + if providerLimit > 0 && providerLimit < len(providers) { + providers = providers[:providerLimit] + } + queries := make([]string, 0, len(providers)*len(keywords)) + for _, provider := range providers { + for _, keyword := range keywords { + queries = append(queries, strings.TrimSpace(date+" "+provider+" "+keyword)) + } + } + return queries +} + +func normalizeIntradayCandidates(date string, searchRecords []intradaySearchRecord, llmRecords []intradayLLMRecord) []intradayNewsCandidate { + searchIndex := indexSearchRecordsByURL(searchRecords) + candidatesByKey := map[string]intradayNewsCandidate{} + for _, record := range llmRecords { + candidate := candidateFromLLMRecord(date, record, searchIndex) + if len(candidate.CandidateURLs) == 0 { + continue + } + if candidate.ProviderName == "" { + candidate.ProviderName = inferProviderFromTitle(candidate.Title) + } + candidate.EventType = 
normalizeIntradayEventType(candidate.EventType) + candidate.NormalizedKey = buildIntradayNormalizedKey(candidate) + mergeIntradayCandidate(candidatesByKey, candidate) + } + result := make([]intradayNewsCandidate, 0, len(candidatesByKey)) + for _, candidate := range candidatesByKey { + result = append(result, candidate) + } + sort.Slice(result, func(i, j int) bool { + if result[i].ProviderName != result[j].ProviderName { + return result[i].ProviderName < result[j].ProviderName + } + if result[i].EventType != result[j].EventType { + return result[i].EventType < result[j].EventType + } + return result[i].NormalizedKey < result[j].NormalizedKey + }) + return result +} + +func candidateFromLLMRecord(date string, record intradayLLMRecord, searchIndex map[string]intradaySearchRecord) intradayNewsCandidate { + candidate := intradayNewsCandidate{ + CandidateDate: date, + EventType: record.EventType, + ProviderName: strings.TrimSpace(record.ProviderName), + ModelName: strings.TrimSpace(record.ModelName), + ProviderCountry: strings.TrimSpace(record.ProviderCountry), + Title: strings.TrimSpace(record.Title), + Summary: strings.TrimSpace(record.Summary), + CandidateURLs: dedupeStrings(record.CandidateURLs), + DiscoverySource: "llm_answer", + DiscoveryEvidence: map[string]any{"llm_record": record}, + Status: "candidate", + VerificationConfidence: "candidate", + } + for _, url := range candidate.CandidateURLs { + if searchRecord, ok := searchIndex[url]; ok { + candidate.DiscoverySource = "web_search+llm" + candidate.DiscoveryQuery = searchRecord.Title + candidate.DiscoveryEvidence["search_record"] = searchRecord + if candidate.ProviderName == "" { + candidate.ProviderName = strings.TrimSpace(searchRecord.Provider) + } + if candidate.Title == "" { + candidate.Title = strings.TrimSpace(searchRecord.Title) + } + if candidate.Summary == "" { + candidate.Summary = strings.TrimSpace(searchRecord.Summary) + } + } + } + return candidate +} + +func indexSearchRecordsByURL(records 
[]intradaySearchRecord) map[string]intradaySearchRecord { + indexed := make(map[string]intradaySearchRecord, len(records)) + for _, record := range records { + url := strings.TrimSpace(record.URL) + if url == "" { + continue + } + indexed[url] = record + } + return indexed +} + +func mergeIntradayCandidate(target map[string]intradayNewsCandidate, candidate intradayNewsCandidate) { + if candidate.NormalizedKey == "" { + return + } + existing, ok := target[candidate.NormalizedKey] + if !ok { + target[candidate.NormalizedKey] = candidate + return + } + merged := existing + merged.CandidateURLs = dedupeStrings(append(existing.CandidateURLs, candidate.CandidateURLs...)) + if strings.TrimSpace(merged.Summary) == "" { + merged.Summary = candidate.Summary + } + if strings.TrimSpace(merged.ProviderCountry) == "" { + merged.ProviderCountry = candidate.ProviderCountry + } + if merged.DiscoverySource != candidate.DiscoverySource && candidate.DiscoverySource != "" { + merged.DiscoverySource = "web_search+llm" + } + if merged.DiscoveryEvidence == nil { + merged.DiscoveryEvidence = map[string]any{} + } + if llmRecord, ok := candidate.DiscoveryEvidence["llm_record"]; ok { + merged.DiscoveryEvidence["llm_record"] = llmRecord + } + if searchRecord, ok := candidate.DiscoveryEvidence["search_record"]; ok { + merged.DiscoveryEvidence["search_record"] = searchRecord + } + target[candidate.NormalizedKey] = merged +} + +func buildIntradayNormalizedKey(candidate intradayNewsCandidate) string { + provider := normalizeWord(candidate.ProviderName) + model := normalizeWord(candidate.ModelName) + if model == "" { + model = normalizeWord(candidate.Title) + } + return strings.Join([]string{ + candidate.CandidateDate, + normalizeWord(candidate.EventType), + provider, + model, + }, "|") +} + + +func summarizeIntradayCandidates(candidates []intradayNewsCandidate, dryRun bool) intradayDiscoverySummary { + eventTypeCounts := make(map[string]int) + providerSet := map[string]struct{}{} + sourceSet := 
map[string]struct{}{} + for _, candidate := range candidates { + eventTypeCounts[candidate.EventType]++ + if candidate.ProviderName != "" { + providerSet[candidate.ProviderName] = struct{}{} + } + if candidate.DiscoverySource != "" { + sourceSet[candidate.DiscoverySource] = struct{}{} + } + } + sources := make([]string, 0, len(sourceSet)) + for source := range sourceSet { + sources = append(sources, source) + } + sort.Strings(sources) + return intradayDiscoverySummary{ + CandidateTotal: len(candidates), + ProviderHitCount: len(providerSet), + EventTypeCounts: eventTypeCounts, + DiscoverySourceSet: sources, + DryRun: dryRun, + } +} + +func printIntradayDiscoverySummary(summary intradayDiscoverySummary) error { + payload, err := json.Marshal(summary) + if err != nil { + return err + } + fmt.Println(string(payload)) + return nil +} + +func upsertIntradayCandidates(ctx context.Context, db *sql.DB, candidates []intradayNewsCandidate) error { + if db == nil { + return fmt.Errorf("db is nil") + } + for _, candidate := range candidates { + urls, err := json.Marshal(candidate.CandidateURLs) + if err != nil { + return fmt.Errorf("marshal candidate urls: %w", err) + } + evidence, err := json.Marshal(candidate.DiscoveryEvidence) + if err != nil { + return fmt.Errorf("marshal discovery evidence: %w", err) + } + _, err = db.ExecContext(ctx, ` + INSERT INTO intraday_news_candidate ( + candidate_date, event_type, provider_name, model_name, provider_country, + title, summary, candidate_urls, discovery_source, discovery_query, + discovery_evidence, normalized_key, status, verification_confidence, verification_notes + ) VALUES ( + $1::date, $2, $3, NULLIF($4, ''), NULLIF($5, ''), + $6, NULLIF($7, ''), $8::jsonb, $9, NULLIF($10, ''), + $11::jsonb, $12, $13, $14, NULLIF($15, '') + ) + ON CONFLICT (normalized_key) DO UPDATE SET + title = EXCLUDED.title, + summary = COALESCE(NULLIF(EXCLUDED.summary, ''), intraday_news_candidate.summary), + candidate_urls = EXCLUDED.candidate_urls, + 
discovery_source = EXCLUDED.discovery_source, + discovery_query = COALESCE(NULLIF(EXCLUDED.discovery_query, ''), intraday_news_candidate.discovery_query), + discovery_evidence = EXCLUDED.discovery_evidence, + provider_country = COALESCE(NULLIF(EXCLUDED.provider_country, ''), intraday_news_candidate.provider_country), + updated_at = CURRENT_TIMESTAMP`, + candidate.CandidateDate, + candidate.EventType, + candidate.ProviderName, + candidate.ModelName, + candidate.ProviderCountry, + candidate.Title, + candidate.Summary, + string(urls), + candidate.DiscoverySource, + candidate.DiscoveryQuery, + string(evidence), + candidate.NormalizedKey, + candidate.Status, + candidate.VerificationConfidence, + candidate.VerificationNotes, + ) + if err != nil { + return fmt.Errorf("upsert intraday candidate %s: %w", candidate.NormalizedKey, err) + } + } + return nil +} + +func inferProviderFromTitle(title string) string { + lower := strings.ToLower(title) + for _, pair := range []struct{ match, provider string }{ + {"openai", "OpenAI"}, + {"anthropic", "Anthropic"}, + {"gemini", "Google"}, + {"deepseek", "DeepSeek"}, + {"qwen", "Qwen"}, + {"dashscope", "DashScope"}, + {"xai", "xAI"}, + {"minimax", "MiniMax"}, + {"智谱", "智谱"}, + {"百度", "百度"}, + {"腾讯", "腾讯"}, + } { + if strings.Contains(lower, pair.match) { + return pair.provider + } + } + return "" +} + diff --git a/scripts/discover_intraday_news_candidates_test.go b/scripts/discover_intraday_news_candidates_test.go new file mode 100644 index 0000000..5c82d35 --- /dev/null +++ b/scripts/discover_intraday_news_candidates_test.go @@ -0,0 +1,127 @@ +//go:build llm_script + +package main + +import ( + "context" + "database/sql" + "path/filepath" + "strings" + "testing" +) + +func TestLoadIntradaySearchRecordsFromFixture(t *testing.T) { + cfg := intradayProviderConfig{ + Mode: "fixture", + Fixture: filepath.Join("testdata", "intraday_discovery_search_sample.json"), + } + records, err := loadIntradaySearchRecords(cfg, "2026-05-25", 
[]string{"OpenAI pricing release"}) + if err != nil { + t.Fatalf("loadIntradaySearchRecords 返回错误: %v", err) + } + if len(records) != 2 { + t.Fatalf("搜索样例条数错误: got=%d", len(records)) + } + if records[0].URL == "" || records[0].Provider == "" { + t.Fatalf("搜索样例未保留 URL/provider: %+v", records[0]) + } +} + +func TestLoadIntradayLLMRecordsFromFixture(t *testing.T) { + cfg := intradayProviderConfig{ + Mode: "fixture", + Fixture: filepath.Join("testdata", "intraday_discovery_llm_sample.json"), + } + records, err := loadIntradayLLMRecords(cfg, "2026-05-25", nil) + if err != nil { + t.Fatalf("loadIntradayLLMRecords 返回错误: %v", err) + } + if len(records) != 2 { + t.Fatalf("LLM 样例条数错误: got=%d", len(records)) + } + if records[0].EventType != "official_release" { + t.Fatalf("LLM 事件类型错误: %+v", records[0]) + } +} + +func TestNormalizeIntradayCandidatesDedupesEquivalentEvents(t *testing.T) { + searchRecords := []intradaySearchRecord{{ + Title: "OpenAI announces GPT-5.6 preview pricing update", + Summary: "Search summary", + URL: "https://openai.example.com/news/gpt-5-6-pricing", + Provider: "OpenAI", + }} + llmRecords := []intradayLLMRecord{ + { + EventType: "official_release", + ProviderName: "OpenAI", + ModelName: "GPT-5.6", + ProviderCountry: "US", + Title: "GPT-5.6 preview pricing update", + Summary: "First summary", + CandidateURLs: []string{"https://openai.example.com/news/gpt-5-6-pricing"}, + }, + { + EventType: "official_release", + ProviderName: "OpenAI", + ModelName: "GPT 5.6", + ProviderCountry: "US", + Title: "OpenAI GPT 5.6 preview pricing update", + Summary: "Second summary", + CandidateURLs: []string{"https://openai.example.com/news/gpt-5-6-pricing"}, + }, + } + candidates := normalizeIntradayCandidates("2026-05-25", searchRecords, llmRecords) + if len(candidates) != 1 { + t.Fatalf("期望去重后只剩 1 条候选, got=%d", len(candidates)) + } + if candidates[0].DiscoverySource != "web_search+llm" { + t.Fatalf("期望 discovery source 合并, got=%q", candidates[0].DiscoverySource) + } +} + 
+func TestNormalizeIntradayCandidatesDropsURLlessRecords(t *testing.T) { + llmRecords := []intradayLLMRecord{{ + EventType: "promo_campaign", + ProviderName: "DeepSeek", + ModelName: "DeepSeek-V4-Flash", + Title: "No URL candidate", + Summary: "Should be dropped", + }} + candidates := normalizeIntradayCandidates("2026-05-25", nil, llmRecords) + if len(candidates) != 0 { + t.Fatalf("无 URL 候选应被丢弃, got=%d", len(candidates)) + } +} + +func TestValidateIntradayProviderConfigRequiresCommandOrURLOrFixture(t *testing.T) { + if err := validateIntradayProviderConfig("search", intradayProviderConfig{Mode: "command_json"}); err == nil { + t.Fatal("缺少 command 时应报错") + } + if err := validateIntradayProviderConfig("llm", intradayProviderConfig{Mode: "http_json"}); err == nil { + t.Fatal("缺少 url 时应报错") + } + if err := validateIntradayProviderConfig("search", intradayProviderConfig{Mode: "fixture", Fixture: "fixture.json"}); err != nil { + t.Fatalf("fixture provider 不应报错: %v", err) + } +} + +func TestBuildIntradayNormalizedKeyUsesProviderModelAndDate(t *testing.T) { + key := buildIntradayNormalizedKey(intradayNewsCandidate{ + CandidateDate: "2026-05-25", + EventType: "official_release", + ProviderName: "OpenAI", + ModelName: "GPT-5.6", + }) + if !strings.Contains(key, "2026-05-25") || !strings.Contains(key, "openai") || !strings.Contains(key, "gpt-5-6") { + t.Fatalf("normalized key 不符合预期: %q", key) + } +} + +func TestUpsertIntradayCandidatesRequiresDB(t *testing.T) { + var db *sql.DB + err := upsertIntradayCandidates(context.Background(), db, nil) + if err == nil { + t.Fatal("nil db 时应报错") + } +} diff --git a/scripts/intraday_discovery_common.go b/scripts/intraday_discovery_common.go new file mode 100644 index 0000000..5d6401e --- /dev/null +++ b/scripts/intraday_discovery_common.go @@ -0,0 +1,111 @@ +//go:build llm_script + +package main + +import ( + "fmt" + "os" + "regexp" + "strings" + "time" +) + +func loadIntradayEnv() { + for _, path := range []string{".env.local", ".env"} { 
+ data, err := os.ReadFile(path) + if err != nil { + continue + } + for _, line := range strings.Split(string(data), "\n") { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + key, value, ok := strings.Cut(line, "=") + if !ok { + continue + } + key = strings.TrimSpace(key) + value = strings.Trim(strings.TrimSpace(value), `"'`) + if key == "" { + continue + } + if _, exists := os.LookupEnv(key); exists { + continue + } + _ = os.Setenv(key, value) + } + } +} + +func intradayDefaultDSN() string { + if dsn := os.Getenv("DATABASE_URL"); dsn != "" { + return dsn + } + return "postgres://long@/llm_intelligence?host=/var/run/postgresql" +} + +func intradayDateValue() string { + if value := strings.TrimSpace(os.Getenv("REPORT_DATE")); value != "" { + return value + } + return time.Now().Format("2006-01-02") +} + +func discoveryTimeoutFromEnv() time.Duration { + raw := strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_TIMEOUT_SEC")) + if raw == "" { + return 20 * time.Second + } + var seconds int + if _, err := fmt.Sscanf(raw, "%d", &seconds); err != nil || seconds <= 0 { + return 20 * time.Second + } + return time.Duration(seconds) * time.Second +} + +func normalizeIntradayEventType(value string) string { + switch strings.TrimSpace(strings.ToLower(value)) { + case "price_cut": + return "price_cut" + case "price_increase": + return "price_increase" + case "official_release": + return "official_release" + case "promo_campaign": + return "promo_campaign" + case "leak_or_rumor": + return "leak_or_rumor" + default: + return "unknown" + } +} + +func normalizeWord(value string) string { + value = strings.ToLower(strings.TrimSpace(value)) + value = strings.ReplaceAll(value, "_", "-") + re := regexp.MustCompile(`[^\p{L}\p{N}\-]+`) // \p{L}/\p{N}: keep letters and digits of any script so names such as 智谱 or 百度 no longer all collapse to "unknown" + value = re.ReplaceAllString(value, "-") + value = strings.Trim(value, "-") + if value == "" { + return "unknown" + } + return value +} + +func dedupeStrings(values []string) []string { + seen := 
map[string]struct{}{} + result := make([]string, 0, len(values)) + for _, value := range values { + trimmed := strings.TrimSpace(value) + if trimmed == "" { + continue + } + if _, exists := seen[trimmed]; exists { + continue + } + seen[trimmed] = struct{}{} + result = append(result, trimmed) + } + return result +} diff --git a/scripts/intraday_discovery_provider.go b/scripts/intraday_discovery_provider.go new file mode 100644 index 0000000..f38e316 --- /dev/null +++ b/scripts/intraday_discovery_provider.go @@ -0,0 +1,188 @@ +//go:build llm_script + +package main + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "strings" + "time" +) + +type intradayProviderConfig struct { + Mode string + Command string + URL string + Fixture string + Timeout time.Duration +} + +type intradaySearchRecord struct { + Title string `json:"title"` + Summary string `json:"summary"` + URL string `json:"url"` + Provider string `json:"provider"` + ProviderURL string `json:"provider_url"` + PublishedAt string `json:"published_at"` +} + +type intradayLLMRecord struct { + EventType string `json:"event_type"` + ProviderName string `json:"provider_name"` + ModelName string `json:"model_name"` + ProviderCountry string `json:"provider_country"` + Title string `json:"title"` + Summary string `json:"summary"` + CandidateURLs []string `json:"candidate_urls"` +} + +type intradayLLMRequest struct { + Date string `json:"date"` + SearchResults []intradaySearchRecord `json:"search_results"` +} + +func loadIntradaySearchRecords(cfg intradayProviderConfig, date string, queries []string) ([]intradaySearchRecord, error) { + var all []intradaySearchRecord + for _, query := range queries { + payload, err := loadIntradayProviderPayload(cfg, intradayProviderPayloadInput{ + Date: date, + Query: query, + }) + if err != nil { + return nil, err + } + if len(bytes.TrimSpace(payload)) == 0 { + continue + } + var records []intradaySearchRecord + if err := json.Unmarshal(payload, 
&records); err != nil { + return nil, fmt.Errorf("unmarshal search records for query %q: %w", query, err) + } + all = append(all, records...) + if cfg.Mode == "fixture" { + break + } + } + return all, nil +} + +func loadIntradayLLMRecords(cfg intradayProviderConfig, date string, searchResults []intradaySearchRecord) ([]intradayLLMRecord, error) { + request := intradayLLMRequest{Date: date, SearchResults: searchResults} + body, err := json.Marshal(request) + if err != nil { + return nil, fmt.Errorf("marshal llm request: %w", err) + } + payload, err := loadIntradayProviderPayload(cfg, intradayProviderPayloadInput{ + Date: date, + RequestBody: body, + }) + if err != nil { + return nil, err + } + if len(bytes.TrimSpace(payload)) == 0 { + return nil, nil + } + var records []intradayLLMRecord + if err := json.Unmarshal(payload, &records); err != nil { + return nil, fmt.Errorf("unmarshal llm records: %w", err) + } + return records, nil +} + +type intradayProviderPayloadInput struct { + Date string + Query string + RequestBody []byte +} + +func loadIntradayProviderPayload(cfg intradayProviderConfig, input intradayProviderPayloadInput) ([]byte, error) { + mode := strings.TrimSpace(cfg.Mode) + switch mode { + case "fixture": + if strings.TrimSpace(cfg.Fixture) == "" { + return nil, fmt.Errorf("provider fixture 未设置") + } + return os.ReadFile(cfg.Fixture) + case "command_json": + if strings.TrimSpace(cfg.Command) == "" { + return nil, fmt.Errorf("provider command 未设置") + } + return runIntradayCommand(cfg, input) + case "http_json": + if strings.TrimSpace(cfg.URL) == "" { + return nil, fmt.Errorf("provider url 未设置") + } + return fetchIntradayHTTP(cfg, input) + default: + return nil, fmt.Errorf("unsupported provider mode %q", mode) + } +} + +func runIntradayCommand(cfg intradayProviderConfig, input intradayProviderPayloadInput) ([]byte, error) { + command := strings.TrimSpace(cfg.Command) + command = strings.ReplaceAll(command, "{{date}}", input.Date) + command = 
strings.ReplaceAll(command, "{{query}}", shellEscapeSingleArg(input.Query)) + cmd := exec.Command("sh", "-c", command) + cmd.Env = append(os.Environ(), + "INTRADAY_DISCOVERY_DATE="+input.Date, + "INTRADAY_DISCOVERY_QUERY="+input.Query, + ) + if len(input.RequestBody) > 0 { + cmd.Stdin = bytes.NewReader(input.RequestBody) + } + out, err := cmd.Output() + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + return nil, fmt.Errorf("run provider command: %w: %s", err, strings.TrimSpace(string(exitErr.Stderr))) + } + return nil, fmt.Errorf("run provider command: %w", err) + } + return out, nil +} + +func fetchIntradayHTTP(cfg intradayProviderConfig, input intradayProviderPayloadInput) ([]byte, error) { + client := &http.Client{Timeout: cfg.Timeout} + rawURL := strings.TrimSpace(cfg.URL) + rawURL = strings.ReplaceAll(rawURL, "{{date}}", input.Date) + rawURL = strings.ReplaceAll(rawURL, "{{query}}", input.Query) + + method := http.MethodGet + var body io.Reader + if len(input.RequestBody) > 0 { + method = http.MethodPost + body = bytes.NewReader(input.RequestBody) + } + req, err := http.NewRequest(method, rawURL, body) + if err != nil { + return nil, fmt.Errorf("build provider request: %w", err) + } + if len(input.RequestBody) > 0 { + req.Header.Set("Content-Type", "application/json") + } + resp, err := client.Do(req) + if err != nil { + return nil, fmt.Errorf("call provider url: %w", err) + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + payload, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("call provider url: unexpected status %d: %s", resp.StatusCode, strings.TrimSpace(string(payload))) + } + payload, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("read provider response: %w", err) + } + return payload, nil +} + +func shellEscapeSingleArg(value string) string { + if value == "" { + return "''" + } + return "'" + strings.ReplaceAll(value, "'", "'\"'\"'") + "'" +} diff --git 
a/scripts/materialize_daily_signals.go b/scripts/materialize_daily_signals.go index 813ffc9..963cad7 100644 --- a/scripts/materialize_daily_signals.go +++ b/scripts/materialize_daily_signals.go @@ -46,6 +46,7 @@ type signalModelEvent struct { TrustLabel string `json:"trust_label"` SourceKindLabel string `json:"source_kind_label"` PrimarySource string `json:"primary_source"` + SourceURL string `json:"source_url"` UpdatedAt string `json:"updated_at"` EvidenceDetail string `json:"evidence_detail"` Baseline string `json:"baseline"` @@ -367,6 +368,12 @@ func loadSignalModelEvents(db *sql.DB, date string) ([]signalModelEvent, error) } events = append(events, priceEvents...) + discoveryEvents, err := loadVerifiedDiscoverySignalEvents(db, date) + if err != nil { + return nil, err + } + events = mergeVerifiedDiscoveryEvents(events, discoveryEvents) + sort.Slice(events, func(i, j int) bool { if events[i].Priority != events[j].Priority { return events[i].Priority > events[j].Priority @@ -409,6 +416,7 @@ func loadSignalPromoCampaignEvents(date string) ([]signalModelEvent, error) { TrustLabel: signalFirstNonEmpty(definition.TrustLabel, "官方来源 / 一级证据"), SourceKindLabel: signalFirstNonEmpty(definition.SourceKindLabel, "官方活动页"), PrimarySource: definition.PrimarySource, + SourceURL: definition.PrimarySource, UpdatedAt: signalFormatEventUpdatedAt("", definition.Date), EvidenceDetail: definition.EvidenceDetail, Baseline: signalFirstNonEmpty(definition.Baseline, "活动窗口开启"), @@ -520,6 +528,7 @@ func loadSignalOfficialReleaseEvents(db *sql.DB, date string) ([]signalModelEven TrustLabel: buildSignalReleaseTrustLabel(model, dateConfidence), SourceKindLabel: buildSignalReleaseSourceKindLabel(dateSourceKind, dateConfidence), PrimarySource: sourceURL, + SourceURL: sourceURL, UpdatedAt: releaseDate.Format("2006-01-02 15:04"), EvidenceDetail: buildSignalReleaseEvidenceDetail(dateSourceKind, dateConfidence), Baseline: "官方首次发布", @@ -610,6 +619,7 @@ func loadSignalNewModelEvents(db *sql.DB, date 
string) ([]signalModelEvent, erro TrustLabel: buildSignalTrustLabel(model), SourceKindLabel: "模型快照", PrimarySource: buildSignalPrimarySource("region_pricing", model.OperatorName), + SourceURL: buildSignalPrimarySource("region_pricing", model.OperatorName), UpdatedAt: createdAt.Format("2006-01-02 15:04"), EvidenceDetail: "models.created_at = 今日,且已存在最新价格快照", Baseline: "首次出现", @@ -709,6 +719,7 @@ func loadSignalPriceChangeEvents(db *sql.DB, date string) ([]signalModelEvent, e TrustLabel: buildSignalTrustLabel(model), SourceKindLabel: "价格快照", PrimarySource: "pricing_history", + SourceURL: buildSignalPrimarySource("region_pricing", model.OperatorName), UpdatedAt: changedAt.Format("2006-01-02 15:04"), EvidenceDetail: buildSignalPriceEvidenceDetail(changePct, oldInputPrice, newInputPrice, model.Currency), Baseline: fmt.Sprintf("较昨日 %+.0f%%", changePct), @@ -747,6 +758,241 @@ func dedupeSignalEvents(events []signalModelEvent) []signalModelEvent { return result } +func loadVerifiedDiscoverySignalEvents(db *sql.DB, date string) ([]signalModelEvent, error) { + rows, err := db.Query(` + SELECT + event_type, + provider_name, + COALESCE(model_name, ''), + COALESCE(provider_country, ''), + title, + COALESCE(summary, ''), + COALESCE(candidate_urls::text, '[]'), + COALESCE(verification_notes, ''), + updated_at + FROM intraday_news_candidate + WHERE candidate_date = $1::date + AND status = 'verified' + AND verification_confidence = 'official_confirmed' + ORDER BY updated_at DESC, id DESC + `, date) + if err != nil { + if strings.Contains(err.Error(), `relation "intraday_news_candidate" does not exist`) { + return nil, nil + } + return nil, err + } + defer rows.Close() + + var events []signalModelEvent + for rows.Next() { + var ( + eventType string + providerName string + modelName string + providerCountry string + title string + summary string + rawURLs string + notes string + updatedAt time.Time + ) + if err := rows.Scan(&eventType, &providerName, &modelName, &providerCountry, 
&title, &summary, &rawURLs, &notes, &updatedAt); err != nil { + return nil, err + } + var urls []string + if err := json.Unmarshal([]byte(rawURLs), &urls); err != nil { + return nil, fmt.Errorf("unmarshal discovery candidate urls: %w", err) + } + primaryURL := firstString(urls) + if strings.TrimSpace(primaryURL) == "" { + continue + } + normalizedType := signalNormalizeIntradayEventType(eventType) + events = append(events, signalModelEvent{ + EventType: normalizedType, + ModelName: signalFirstNonEmpty(modelName, title), + ProviderName: providerName, + OperatorName: providerName, + Audience: buildDiscoveryAudience(normalizedType), + TrustLabel: "官方来源 / discovery 验证", + SourceKindLabel: buildDiscoverySourceKind(normalizedType), + PrimarySource: primaryURL, + SourceURL: primaryURL, + UpdatedAt: updatedAt.Format("2006-01-02 15:04"), + EvidenceDetail: signalFirstNonEmpty(notes, summary), + Baseline: buildDiscoveryBaseline(normalizedType), + Summary: signalFirstNonEmpty(summary, title), + Priority: buildDiscoveryPriority(normalizedType), + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + return filterVerifiedDiscoverySignalEvents(events), nil +} + +func filterVerifiedDiscoverySignalEvents(events []signalModelEvent) []signalModelEvent { + filtered := make([]signalModelEvent, 0, len(events)) + for _, event := range events { + switch event.EventType { + case "official_release", "promo_campaign", "price_cut", "price_increase": + filtered = append(filtered, event) + } + } + return filtered +} + +func mergeVerifiedDiscoveryEvents(nativeEvents, discoveryEvents []signalModelEvent) []signalModelEvent { + merged := append([]signalModelEvent{}, nativeEvents...) 
+ index := make(map[string]int, len(merged)) + for i, event := range merged { + index[signalEventMergeKey(event)] = i + } + for _, event := range filterVerifiedDiscoverySignalEvents(discoveryEvents) { + key := signalEventMergeKey(event) + if idx, exists := index[key]; exists { + merged[idx] = mergeSignalEventEvidence(merged[idx], event) + continue + } + index[key] = len(merged) + merged = append(merged, event) + } + return merged +} + +func mergeSignalEventEvidence(native, discovery signalModelEvent) signalModelEvent { + merged := native + if strings.TrimSpace(merged.SourceKindLabel) == "" { + merged.SourceKindLabel = discovery.SourceKindLabel + } + if strings.TrimSpace(merged.SourceURL) == "" { + merged.SourceURL = discovery.SourceURL + } + if strings.TrimSpace(merged.PrimarySource) == "" { + merged.PrimarySource = discovery.PrimarySource + } + if strings.TrimSpace(merged.EvidenceDetail) == "" { + merged.EvidenceDetail = discovery.EvidenceDetail + } + if strings.TrimSpace(merged.TrustLabel) == "" { + merged.TrustLabel = discovery.TrustLabel + } + return merged +} + +func signalEventMergeKey(event signalModelEvent) string { + return strings.Join([]string{ + signalNormalizeIntradayEventType(event.EventType), + signalNormalizeWord(event.ProviderName), + signalNormalizeWord(event.ModelName), + }, "|") +} + +func buildDiscoveryAudience(eventType string) string { + switch eventType { + case "official_release": + return "适合需要尽快复查默认选型与路线图影响的团队" + case "promo_campaign": + return "适合想利用活动窗口压低成本的团队" + case "price_cut": + return "适合准备趁降价重排默认模型的团队" + case "price_increase": + return "适合提前准备替代模型和预算回退方案的团队" + default: + return "适合关注日内情报变化的读者" + } +} + +func buildDiscoverySourceKind(eventType string) string { + switch eventType { + case "official_release": + return "discovery 验证 / 官方发布页" + case "promo_campaign": + return "discovery 验证 / 官方活动页" + case "price_cut", "price_increase": + return "discovery 验证 / 官方价格页" + default: + return "discovery 验证" + } +} + +func 
buildDiscoveryBaseline(eventType string) string { + switch eventType { + case "official_release": + return "discovery 验证通过" + case "promo_campaign": + return "活动窗口已验证" + case "price_cut", "price_increase": + return "official_confirmed" + default: + return "discovery verified" + } +} + +func buildDiscoveryPriority(eventType string) int { + switch eventType { + case "official_release": + return 118 + case "promo_campaign": + return 112 + case "price_cut": + return 96 + case "price_increase": + return 94 + default: + return 80 + } +} + +func firstString(values []string) string { + for _, value := range values { + if strings.TrimSpace(value) != "" { + return value + } + } + return "" +} + +func signalNormalizeIntradayEventType(value string) string { + switch strings.TrimSpace(strings.ToLower(value)) { + case "price_cut": + return "price_cut" + case "price_increase": + return "price_increase" + case "official_release": + return "official_release" + case "promo_campaign": + return "promo_campaign" + default: + return "unknown" + } +} + +func signalNormalizeWord(value string) string { + value = strings.ToLower(strings.TrimSpace(value)) + value = strings.ReplaceAll(value, "_", "-") + var b strings.Builder + lastDash := false + for _, r := range value { + isAlphaNum := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') + if isAlphaNum { + b.WriteRune(r) + lastDash = false + continue + } + if !lastDash { + b.WriteByte('-') + lastDash = true + } + } + result := strings.Trim(b.String(), "-") + if result == "" { + return "unknown" + } + return result +} + func classifySignalFreeSource(model signalModelInfo) string { switch model.OperatorType { case "official", "cloud": diff --git a/scripts/materialize_daily_signals_test.go b/scripts/materialize_daily_signals_test.go index 9b2f45b..a57c49e 100644 --- a/scripts/materialize_daily_signals_test.go +++ b/scripts/materialize_daily_signals_test.go @@ -31,3 +31,64 @@ func TestBuildSignalPageMode(t *testing.T) { t.Fatalf("官方发布日 page_mode 
错误: %q", got) } } + +func TestBuildSignalPageModeTreatsVerifiedDiscoveryPromoAsHot(t *testing.T) { + got := buildSignalPageMode(signalDailySignals{}, []signalModelEvent{{EventType: "promo_campaign", ModelName: "GPT-5.6"}}) + if got != "hot" { + t.Fatalf("已验证活动事件应触发 hot, got=%q", got) + } +} + +func TestFilterDiscoveryEventsDropsLeakAndCandidateOnly(t *testing.T) { + events := []signalModelEvent{ + {EventType: "official_release", ModelName: "GPT-5.6", Priority: 120}, + {EventType: "leak_or_rumor", ModelName: "GPT-5.6", Priority: 200}, + {EventType: "unknown", ModelName: "Mystery", Priority: 50}, + } + filtered := filterVerifiedDiscoverySignalEvents(events) + if len(filtered) != 1 { + t.Fatalf("期望仅保留 1 条正式事实事件, got=%d", len(filtered)) + } + if filtered[0].EventType != "official_release" { + t.Fatalf("错误保留了非正式事件: %+v", filtered) + } +} + +func TestMergeVerifiedDiscoveryEventsPrefersNativeFact(t *testing.T) { + native := []signalModelEvent{{ + EventType: "official_release", + ModelName: "GPT-5.6", + ProviderName: "OpenAI", + PrimarySource: "native_release", + EvidenceDetail: "native evidence", + Priority: 120, + }} + discovery := []signalModelEvent{{ + EventType: "official_release", + ModelName: "GPT-5.6", + ProviderName: "OpenAI", + PrimarySource: "discovery_release", + EvidenceDetail: "discovery evidence", + SourceKindLabel: "官方博客", + Priority: 110, + }} + merged := mergeVerifiedDiscoveryEvents(native, discovery) + if len(merged) != 1 { + t.Fatalf("期望去重后只剩 1 条事件, got=%d", len(merged)) + } + if merged[0].PrimarySource != "native_release" { + t.Fatalf("原生事实不应被 discovery 覆盖: %+v", merged[0]) + } + if merged[0].SourceKindLabel != "官方博客" { + t.Fatalf("原生事实应补入 discovery 证据缺口: %+v", merged[0]) + } +} + +func TestMergeVerifiedDiscoveryEventsDropsUnverifiedPriceNarrative(t *testing.T) { + native := []signalModelEvent{{EventType: "new_model", ModelName: "DeepSeek-V4-Flash", ProviderName: "DeepSeek", Priority: 80}} + discovery := []signalModelEvent{{EventType: "leak_or_rumor", 
ModelName: "DeepSeek-V4-Flash", ProviderName: "DeepSeek", Priority: 130}} + merged := mergeVerifiedDiscoveryEvents(native, discovery) + if len(merged) != 1 || merged[0].EventType != "new_model" { + t.Fatalf("非正式 discovery 事件不应进入正式快照: %+v", merged) + } +} diff --git a/scripts/run_intraday_discovery_watch.sh b/scripts/run_intraday_discovery_watch.sh new file mode 100644 index 0000000..d92b047 --- /dev/null +++ b/scripts/run_intraday_discovery_watch.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$ROOT_DIR" + +# Source .env first and .env.local last so local overrides win (the file precedence loadIntradayEnv uses); set -a exports plain KEY=value lines to the go run children. +set -a +if [[ -f ".env" ]]; then + # shellcheck disable=SC1091 + source ".env" +fi +if [[ -f ".env.local" ]]; then + # shellcheck disable=SC1091 + source ".env.local" +fi +set +a + +if [[ -z "${DATABASE_URL:-}" ]]; then + echo "DATABASE_URL 未设置" >&2 + exit 1 +fi +if [[ -z "${INTRADAY_DISCOVERY_SEARCH_PROVIDER:-}" ]]; then + echo "INTRADAY_DISCOVERY_SEARCH_PROVIDER 未设置" >&2 + exit 1 +fi +if [[ -z "${INTRADAY_DISCOVERY_LLM_PROVIDER:-}" ]]; then + echo "INTRADAY_DISCOVERY_LLM_PROVIDER 未设置" >&2 + exit 1 +fi + +REPORT_DATE="${REPORT_DATE:-$(date +%F)}" +DRY_RUN="false" +if [[ "${1:-}" == "--dry-run" ]]; then + DRY_RUN="true" +fi + +discovery_args=(--date "$REPORT_DATE") +verification_args=(--date "$REPORT_DATE") +materialize_args=(--date "$REPORT_DATE") +if [[ "$DRY_RUN" == "true" ]]; then + discovery_args+=(--dry-run) + verification_args+=(--dry-run) + materialize_args+=(--dry-run) +fi + +go run -tags llm_script ./scripts/discover_intraday_news_candidates.go ./scripts/intraday_discovery_provider.go ./scripts/intraday_discovery_common.go "${discovery_args[@]}" +go run -tags llm_script ./scripts/verify_intraday_news_candidates.go ./scripts/intraday_discovery_common.go "${verification_args[@]}" +REPORT_TRIGGER_SOURCE="intraday_discovery" go run -tags llm_script ./scripts/materialize_daily_signals.go "${materialize_args[@]}" diff --git a/scripts/testdata/intraday_discovery_llm_sample.json 
b/scripts/testdata/intraday_discovery_llm_sample.json new file mode 100644 index 0000000..ab033d5 --- /dev/null +++ b/scripts/testdata/intraday_discovery_llm_sample.json @@ -0,0 +1,24 @@ +[ + { + "event_type": "official_release", + "provider_name": "OpenAI", + "model_name": "GPT-5.6", + "provider_country": "US", + "title": "GPT-5.6 preview pricing update", + "summary": "OpenAI preview material indicates GPT-5.6 entered a preview pricing window.", + "candidate_urls": [ + "https://openai.example.com/news/gpt-5-6-pricing" + ] + }, + { + "event_type": "promo_campaign", + "provider_name": "DeepSeek", + "model_name": "DeepSeek-V4-Flash", + "provider_country": "CN", + "title": "DeepSeek V4 Flash campaign", + "summary": "Official campaign page shows a temporary promotional window for DeepSeek-V4-Flash.", + "candidate_urls": [ + "https://deepseek.example.com/campaign/v4-flash" + ] + } +] diff --git a/scripts/testdata/intraday_discovery_search_sample.json b/scripts/testdata/intraday_discovery_search_sample.json new file mode 100644 index 0000000..18f5357 --- /dev/null +++ b/scripts/testdata/intraday_discovery_search_sample.json @@ -0,0 +1,18 @@ +[ + { + "title": "OpenAI announces GPT-5.6 preview pricing update", + "summary": "OpenAI preview announcement mentions GPT-5.6 and updated API pricing references.", + "url": "https://openai.example.com/news/gpt-5-6-pricing", + "provider": "OpenAI", + "provider_url": "https://openai.example.com", + "published_at": "2026-05-25T09:00:00Z" + }, + { + "title": "DeepSeek launches V4 Flash campaign", + "summary": "Campaign page suggests temporary promotional pricing for DeepSeek-V4-Flash.", + "url": "https://deepseek.example.com/campaign/v4-flash", + "provider": "DeepSeek", + "provider_url": "https://deepseek.example.com", + "published_at": "2026-05-25T10:00:00Z" + } +] diff --git a/scripts/testdata/intraday_verification_official_release.html b/scripts/testdata/intraday_verification_official_release.html new file mode 100644 index 
0000000..4d707fa --- /dev/null +++ b/scripts/testdata/intraday_verification_official_release.html @@ -0,0 +1,7 @@ + +
+

OpenAI announces GPT-5.6 preview pricing update

+

GPT-5.6 preview is now available in official preview channels.

+

Published 2026-05-25.

+
+ diff --git a/scripts/testdata/intraday_verification_pricing_page.html b/scripts/testdata/intraday_verification_pricing_page.html new file mode 100644 index 0000000..63c572d --- /dev/null +++ b/scripts/testdata/intraday_verification_pricing_page.html @@ -0,0 +1,8 @@ + +
+

DeepSeek-V4-Flash pricing

+

Old price: $10

+

New price: $6

+

Campaign window active now.

+
+ diff --git a/scripts/testdata/intraday_verification_secondary_media.html b/scripts/testdata/intraday_verification_secondary_media.html new file mode 100644 index 0000000..900a436 --- /dev/null +++ b/scripts/testdata/intraday_verification_secondary_media.html @@ -0,0 +1,6 @@ + +
+

Industry blog discusses GPT-5.6 leak

+

Writers speculate GPT-5.6 may appear soon based on references.

+
+
diff --git a/scripts/verify_intraday_news_candidates.go b/scripts/verify_intraday_news_candidates.go
new file mode 100644
index 0000000..dd584fb
--- /dev/null
+++ b/scripts/verify_intraday_news_candidates.go
@@ -0,0 +1,602 @@
+//go:build llm_script
+
+package main
+
+import (
+	"context"
+	"database/sql"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"os"
+	"regexp"
+	"strings"
+	"time"
+
+	_ "github.com/lib/pq"
+)
+
+const (
+	// maxVerificationBodyBytes caps how much of a fetched page is read so a
+	// single oversized response cannot exhaust memory.
+	maxVerificationBodyBytes = 4 << 20
+	// maxVerificationErrorBytes caps the response snippet embedded in fetch
+	// error messages.
+	maxVerificationErrorBytes = 512
+)
+
+// Patterns are compiled once at package init instead of on every call.
+var (
+	verifyOldPriceRe = regexp.MustCompile(`(?i)(old|from)\s*price[^0-9$¥]*[$¥]?([0-9]+(?:\.[0-9]+)?)`)
+	verifyNewPriceRe = regexp.MustCompile(`(?i)(new|to)\s*price[^0-9$¥]*[$¥]?([0-9]+(?:\.[0-9]+)?)`)
+	verifyEvidenceRe = regexp.MustCompile(`[^a-z0-9\p{Han}]+`)
+)
+
+// verificationCandidateRow holds the intraday_news_candidate columns loaded
+// for verification.
+type verificationCandidateRow struct {
+	ID                     int64
+	CandidateDate          string
+	EventType              string
+	ProviderName           string
+	ModelName              string
+	ProviderCountry        string
+	Title                  string
+	Summary                string
+	CandidateURLs          []string
+	Status                 string
+	VerificationConfidence string
+}
+
+// intradayVerificationConfig holds the options of one verification run.
+type intradayVerificationConfig struct {
+	Date        string
+	DryRun      bool
+	DatabaseURL string
+	Timeout     time.Duration
+}
+
+// intradayVerificationResult is the verdict produced for one candidate.
+type intradayVerificationResult struct {
+	CandidateID            int64
+	CandidateStatus        string
+	VerificationConfidence string
+	VerifierSource         string
+	VerifierURL            string
+	VerifierStatus         string
+	ExtractedFacts         map[string]any
+	Notes                  string
+}
+
+// intradayVerificationSummary is the JSON summary printed at the end of a run.
+type intradayVerificationSummary struct {
+	CandidateTotal          int  `json:"candidate_total"`
+	VerifiedTotal           int  `json:"verified_total"`
+	OfficialConfirmedTotal  int  `json:"official_confirmed_total"`
+	SecondaryConfirmedTotal int  `json:"secondary_confirmed_total"`
+	RejectedTotal           int  `json:"rejected_total"`
+	DryRun                  bool `json:"dry_run"`
+}
+
+// main parses the flags, fills the DSN and timeout from helpers defined outside
+// this file, and exits with status 1 when verification fails.
+func main() {
+	loadIntradayEnv()
+	cfg := intradayVerificationConfig{}
+	flag.StringVar(&cfg.Date, "date", intradayDateValue(), "验证日期,格式 YYYY-MM-DD")
+	flag.BoolVar(&cfg.DryRun, "dry-run", false, "仅输出摘要,不写数据库")
+	flag.Parse()
+	cfg.DatabaseURL = intradayDefaultDSN()
+	cfg.Timeout = discoveryTimeoutFromEnv()
+	if err := runIntradayCandidateVerification(cfg); err != nil {
+		fmt.Fprintf(os.Stderr, "verify_intraday_news_candidates: %v\n", err)
+		os.Exit(1)
+	}
+}
+
+// runIntradayCandidateVerification loads the pending candidates for cfg.Date,
+// verifies each one, persists the verdicts unless cfg.DryRun is set, and prints
+// a JSON summary.
+func runIntradayCandidateVerification(cfg intradayVerificationConfig) error {
+	if strings.TrimSpace(cfg.Date) == "" {
+		return fmt.Errorf("date 未设置")
+	}
+	db, err := sql.Open("postgres", cfg.DatabaseURL)
+	if err != nil {
+		return fmt.Errorf("open db: %w", err)
+	}
+	defer db.Close()
+	candidates, err := loadIntradayVerificationCandidates(context.Background(), db, cfg.Date)
+	if err != nil {
+		return err
+	}
+	results := make([]intradayVerificationResult, 0, len(candidates))
+	for _, candidate := range candidates {
+		result, err := verifyIntradayCandidate(candidate, cfg.Timeout)
+		if err != nil {
+			// Keep the candidate pending and record the failure instead of aborting the run.
+			result = intradayVerificationResult{
+				CandidateID:            candidate.ID,
+				CandidateStatus:        "candidate",
+				VerificationConfidence: candidate.VerificationConfidence,
+				VerifierStatus:         "error",
+				Notes:                  err.Error(),
+				ExtractedFacts:         map[string]any{},
+			}
+		}
+		results = append(results, result)
+	}
+	if !cfg.DryRun {
+		if err := persistIntradayVerificationResults(context.Background(), db, results); err != nil {
+			return err
+		}
+	}
+	return printIntradayVerificationSummary(summarizeIntradayVerification(results, cfg.DryRun))
+}
+
+// loadIntradayVerificationCandidates returns the rows for date whose status is
+// still 'candidate' or 'verifying', newest first.
+func loadIntradayVerificationCandidates(ctx context.Context, db *sql.DB, date string) ([]verificationCandidateRow, error) {
+	rows, err := db.QueryContext(ctx, `
+		SELECT id, candidate_date::text, event_type, provider_name, COALESCE(model_name, ''), COALESCE(provider_country, ''),
+		       title, COALESCE(summary, ''), COALESCE(candidate_urls::text, '[]'), status, verification_confidence
+		FROM intraday_news_candidate
+		WHERE candidate_date = $1::date
+		  AND status IN ('candidate', 'verifying')
+		ORDER BY discovered_at DESC, id DESC`, date)
+	if err != nil {
+		return nil, fmt.Errorf("query intraday candidates: %w", err)
+	}
+	defer rows.Close()
+	var candidates []verificationCandidateRow
+	for rows.Next() {
+		var row verificationCandidateRow
+		var rawURLs string
+		if err := rows.Scan(&row.ID, &row.CandidateDate, &row.EventType, &row.ProviderName, &row.ModelName, &row.ProviderCountry, &row.Title, &row.Summary, &rawURLs, &row.Status, &row.VerificationConfidence); err != nil {
+			return nil, fmt.Errorf("scan intraday candidate: %w", err)
+		}
+		if err := json.Unmarshal([]byte(rawURLs), &row.CandidateURLs); err != nil {
+			return nil, fmt.Errorf("unmarshal candidate urls: %w", err)
+		}
+		candidates = append(candidates, row)
+	}
+	return candidates, rows.Err()
+}
+
+// verifyIntradayCandidate fetches each candidate URL in order and keeps the
+// highest-scoring verdict, returning early on an official confirmation. The
+// returned error is always nil.
+// NOTE(review): fetch failures score 0 in verificationScore, so an "error"
+// verdict never replaces the initial "insufficient" one — confirm intended.
+func verifyIntradayCandidate(candidate verificationCandidateRow, timeout time.Duration) (intradayVerificationResult, error) {
+	client := &http.Client{Timeout: timeout}
+	best := intradayVerificationResult{
+		CandidateID:            candidate.ID,
+		CandidateStatus:        "candidate",
+		VerificationConfidence: candidate.VerificationConfidence,
+		VerifierStatus:         "insufficient",
+		Notes:                  "未找到足够证据",
+		ExtractedFacts:         map[string]any{},
+	}
+	for _, candidateURL := range candidate.CandidateURLs {
+		body, err := fetchVerificationDocument(candidateURL, client)
+		if err != nil {
+			best = preferVerificationResult(best, intradayVerificationResult{
+				CandidateID:            candidate.ID,
+				CandidateStatus:        "candidate",
+				VerificationConfidence: candidate.VerificationConfidence,
+				VerifierURL:            candidateURL,
+				VerifierStatus:         "error",
+				Notes:                  err.Error(),
+				ExtractedFacts:         map[string]any{},
+			})
+			continue
+		}
+		result := verifyCandidateDocument(candidate, candidateURL, body)
+		if result.CandidateID == 0 {
+			result.CandidateID = candidate.ID
+		}
+		best = preferVerificationResult(best, result)
+		if best.CandidateStatus == "verified" && best.VerificationConfidence == "official_confirmed" {
+			return best, nil
+		}
+	}
+	return best, nil
+}
+
+// fetchVerificationDocument GETs rawURL and returns the response body as text.
+// Non-2xx responses are errors. Reads are capped so an oversized page cannot
+// exhaust memory or produce an unbounded error message.
+func fetchVerificationDocument(rawURL string, client *http.Client) (string, error) {
+	req, err := http.NewRequest(http.MethodGet, rawURL, nil)
+	if err != nil {
+		return "", fmt.Errorf("build verification request: %w", err)
+	}
+	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; llm-intelligence intraday verifier)")
+	resp, err := client.Do(req)
+	if err != nil {
+		return "", fmt.Errorf("fetch verification document: %w", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, maxVerificationErrorBytes))
+		// The cut may land inside a multi-byte rune; drop invalid bytes so the
+		// message stays valid UTF-8.
+		detail := strings.TrimSpace(strings.ToValidUTF8(string(snippet), ""))
+		return "", fmt.Errorf("fetch verification document: unexpected status %d: %s", resp.StatusCode, detail)
+	}
+	payload, err := io.ReadAll(io.LimitReader(resp.Body, maxVerificationBodyBytes))
+	if err != nil {
+		return "", fmt.Errorf("read verification document: %w", err)
+	}
+	return string(payload), nil
+}
+
+// verifyCandidateDocument judges one fetched document against a candidate.
+// Official sources can yield "official_confirmed" depending on the event type;
+// any other source that mentions the model yields "secondary_confirmed".
+func verifyCandidateDocument(candidate verificationCandidateRow, candidateURL, body string) intradayVerificationResult {
+	source := classifyVerificationSource(candidate.ProviderName, candidateURL)
+	facts := extractVerificationFacts(body)
+	modelMatched := bodyMentionsModel(body, candidate.ModelName, candidate.Title)
+	result := intradayVerificationResult{
+		CandidateID:            candidate.ID,
+		CandidateStatus:        "candidate",
+		VerificationConfidence: "candidate",
+		VerifierSource:         source,
+		VerifierURL:            candidateURL,
+		VerifierStatus:         "insufficient",
+		ExtractedFacts:         facts,
+		Notes:                  "证据不足",
+	}
+	if isOfficialVerificationSource(source) {
+		switch normalizeIntradayEventType(candidate.EventType) {
+		case "official_release":
+			if modelMatched {
+				result.CandidateStatus = "verified"
+				result.VerificationConfidence = "official_confirmed"
+				result.VerifierStatus = "matched"
+				result.Notes = "官方页面命中模型发布线索"
+			}
+		case "promo_campaign":
+			if modelMatched && bodyMentionsPromo(body) {
+				result.CandidateStatus = "verified"
+				result.VerificationConfidence = "official_confirmed"
+				result.VerifierStatus = "matched"
+				result.Notes = "官方页面命中活动窗口或促销语义"
+			}
+		case "price_cut", "price_increase":
+			// NOTE(review): a price pair alone confirms the event; the model
+			// match is not required on this path — confirm that is intended.
+			if priceResult, ok := deriveVerifiedPriceEvent(candidate.EventType, facts); ok {
+				result.CandidateStatus = "verified"
+				result.VerificationConfidence = "official_confirmed"
+				result.VerifierStatus = "matched"
+				result.ExtractedFacts = priceResult
+				result.Notes = "官方价格页命中真实价格变化"
+			} else if modelMatched {
+				result.VerifierStatus = "insufficient"
+				result.Notes = "命中模型但缺少可计算的价格变化事实"
+			}
+		case "leak_or_rumor":
+			// Leaks never reach "official_confirmed", even on an official host.
+			if modelMatched {
+				result.CandidateStatus = "verified"
+				result.VerificationConfidence = "secondary_confirmed"
+				result.VerifierStatus = "matched"
+				result.Notes = "保留为待确认情报,不进入正式事实层"
+			}
+		}
+	} else if modelMatched {
+		result.CandidateStatus = "verified"
+		result.VerificationConfidence = "secondary_confirmed"
+		result.VerifierStatus = "matched"
+		result.Notes = "仅二手来源命中,不能进入正式事实层"
+	}
+	return result
+}
+
+// isOfficialVerificationSource reports whether source is one of the labels
+// classifyVerificationSource assigns to provider-owned hosts.
+func isOfficialVerificationSource(source string) bool {
+	switch source {
+	case "official_page", "official_docs", "official_blog", "pricing_page":
+		return true
+	default:
+		return false
+	}
+}
+
+// classifyVerificationSource labels rawURL as an official docs, pricing, blog
+// or generic page when its host belongs to providerName, and as
+// "secondary_media" otherwise (including unparsable URLs).
+func classifyVerificationSource(providerName, rawURL string) string {
+	parsed, err := url.Parse(rawURL)
+	if err != nil {
+		return "secondary_media"
+	}
+	// Hostname drops any port so "openai.com:443" still matches its domain.
+	host := strings.ToLower(parsed.Hostname())
+	path := strings.ToLower(parsed.Path)
+	if isOfficialProviderHost(providerName, host) {
+		switch {
+		case strings.Contains(host, "docs.") || strings.Contains(path, "/docs"):
+			return "official_docs"
+		case strings.Contains(host, "pricing") || strings.Contains(path, "pricing") || strings.Contains(path, "price"):
+			return "pricing_page"
+		case strings.Contains(path, "blog") || strings.Contains(path, "news") || strings.Contains(path, "announcement"):
+			return "official_blog"
+		default:
+			return "official_page"
+		}
+	}
+	return "secondary_media"
+}
+
+// isOfficialProviderHost reports whether host belongs to providerName.
+// Tokens that look like domains (contain a dot) must equal host or be a
+// dot-separated parent of it, so look-alikes such as "openai.com.evil.net" or
+// "notopenai.com" are rejected. Bare-name fallback tokens keep the original
+// substring heuristic.
+func isOfficialProviderHost(providerName, host string) bool {
+	host = strings.TrimSuffix(strings.ToLower(strings.TrimSpace(host)), ".")
+	if host == "" {
+		return false
+	}
+	for _, token := range providerHostTokens(providerName) {
+		switch {
+		case token == "":
+			continue
+		case strings.Contains(token, "."):
+			if host == token || strings.HasSuffix(host, "."+token) {
+				return true
+			}
+		case strings.Contains(host, token):
+			return true
+		}
+	}
+	return false
+}
+
+// providerHostTokens returns the host tokens used to recognise providerName:
+// known domains for listed providers, otherwise the lower-cased name itself.
+func providerHostTokens(providerName string) []string {
+	switch strings.ToLower(strings.TrimSpace(providerName)) {
+	case "openai":
+		return []string{"openai.com"}
+	case "anthropic":
+		return []string{"anthropic.com"}
+	case "google", "google gemini", "gemini":
+		return []string{"google.com", "google.dev", "ai.google.dev"}
+	case "deepseek":
+		return []string{"deepseek.com", "deepseek.ai"}
+	case "qwen", "dashscope":
+		return []string{"aliyun.com", "dashscope.com"}
+	case "xai":
+		return []string{"x.ai"}
+	case "智谱":
+		return []string{"zhipuai.cn"}
+	case "百度", "百度文心":
+		return []string{"baidu.com", "cloud.baidu.com"}
+	case "腾讯", "腾讯混元":
+		return []string{"tencent.com", "cloud.tencent.com"}
+	case "minimax":
+		return []string{"minimax.io", "minimax.chat"}
+	default:
+		clean := strings.ToLower(strings.TrimSpace(providerName))
+		if clean == "" {
+			return nil
+		}
+		return []string{clean}
+	}
+}
+
+// bodyMentionsModel reports whether the normalized body contains the
+// normalized model name or title.
+func bodyMentionsModel(body, modelName, title string) bool {
+	normBody := normalizeEvidenceText(body)
+	for _, candidate := range []string{modelName, title} {
+		normCandidate := normalizeEvidenceText(candidate)
+		if normCandidate != "" && strings.Contains(normBody, normCandidate) {
+			return true
+		}
+	}
+	return false
+}
+
+// bodyMentionsPromo reports whether body contains any promotion marker.
+func bodyMentionsPromo(body string) bool {
+	lower := strings.ToLower(body)
+	for _, marker := range []string{"campaign", "promo", "promotion", "discount", "活动", "优惠", "限时", "窗口"} {
+		if strings.Contains(lower, marker) {
+			return true
+		}
+	}
+	return false
+}
+
+// extractVerificationFacts returns the old/new price pair found in body and,
+// when the old price is non-zero, the percentage change between them.
+func extractVerificationFacts(body string) map[string]any {
+	facts := map[string]any{}
+	oldPrice, newPrice, ok := extractPricePair(body)
+	if ok {
+		facts["old_input_price"] = oldPrice
+		facts["new_input_price"] = newPrice
+		if oldPrice != 0 {
+			facts["price_change_pct"] = ((newPrice - oldPrice) / oldPrice) * 100
+		}
+	}
+	return facts
+}
+
+// deriveVerifiedPriceEvent returns facts when they hold positive old/new prices
+// whose change direction agrees with eventType (a cut must fall, an increase
+// must rise).
+func deriveVerifiedPriceEvent(eventType string, facts map[string]any) (map[string]any, bool) {
+	oldValue, oldOK := facts["old_input_price"].(float64)
+	newValue, newOK := facts["new_input_price"].(float64)
+	changePct, pctOK := facts["price_change_pct"].(float64)
+	if !oldOK || !newOK || !pctOK || oldValue <= 0 || newValue <= 0 {
+		return nil, false
+	}
+	normalized := normalizeIntradayEventType(eventType)
+	if normalized == "price_cut" && changePct >= 0 {
+		return nil, false
+	}
+	if normalized == "price_increase" && changePct <= 0 {
+		return nil, false
+	}
+	return facts, true
+}
+
+// extractPricePair returns the first "old/from price" and "new/to price" amounts
+// found in body; ok is false unless both are present and parse as numbers.
+func extractPricePair(body string) (float64, float64, bool) {
+	oldMatch := verifyOldPriceRe.FindStringSubmatch(body)
+	newMatch := verifyNewPriceRe.FindStringSubmatch(body)
+	if len(oldMatch) < 3 || len(newMatch) < 3 {
+		return 0, 0, false
+	}
+	var oldValue, newValue float64
+	if _, err := fmt.Sscanf(oldMatch[2], "%f", &oldValue); err != nil {
+		return 0, 0, false
+	}
+	if _, err := fmt.Sscanf(newMatch[2], "%f", &newValue); err != nil {
+		return 0, 0, false
+	}
+	return oldValue, newValue, true
+}
+
+// normalizeEvidenceText lower-cases value and strips everything except ASCII
+// letters, digits and Han characters, so punctuation and spacing differences
+// do not block a match.
+func normalizeEvidenceText(value string) string {
+	return verifyEvidenceRe.ReplaceAllString(strings.ToLower(value), "")
+}
+
+// preferVerificationResult returns next only when it scores strictly higher
+// than current.
+func preferVerificationResult(current, next intradayVerificationResult) intradayVerificationResult {
+	if verificationScore(next) > verificationScore(current) {
+		return next
+	}
+	return current
+}
+
+// verificationScore ranks a verdict by status, confidence and verifier status.
+func verificationScore(result intradayVerificationResult) int {
+	score := 0
+	switch result.CandidateStatus {
+	case "verified":
+		score += 20
+	case "rejected":
+		score += 5
+	}
+	switch result.VerificationConfidence {
+	case "official_confirmed":
+		score += 10
+	case "secondary_confirmed":
+		score += 5
+	}
+	switch result.VerifierStatus {
+	case "matched":
+		score += 3
+	case "contradicted":
+		score += 1
+	}
+	return score
+}
+
+// persistIntradayVerificationResults writes every verdict, stopping at the
+// first failure. Verdicts persisted before the failure stay committed.
+func persistIntradayVerificationResults(ctx context.Context, db *sql.DB, results []intradayVerificationResult) error {
+	for _, result := range results {
+		if err := persistIntradayVerificationResult(ctx, db, result); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// persistIntradayVerificationResult inserts the verification row and updates
+// the candidate in one transaction, so a failure cannot leave a verification
+// row without the matching candidate update.
+func persistIntradayVerificationResult(ctx context.Context, db *sql.DB, result intradayVerificationResult) error {
+	facts, err := json.Marshal(result.ExtractedFacts)
+	if err != nil {
+		return fmt.Errorf("marshal extracted facts: %w", err)
+	}
+	tx, err := db.BeginTx(ctx, nil)
+	if err != nil {
+		return fmt.Errorf("begin intraday verification tx: %w", err)
+	}
+	// After a successful Commit this only returns sql.ErrTxDone, which is ignored.
+	defer tx.Rollback()
+	_, err = tx.ExecContext(ctx, `
+		INSERT INTO intraday_news_verification (
+			candidate_id, verifier_source, verifier_url, verifier_status, extracted_facts, notes
+		) VALUES ($1, NULLIF($2, ''), NULLIF($3, ''), $4, $5::jsonb, NULLIF($6, ''))`,
+		result.CandidateID,
+		result.VerifierSource,
+		result.VerifierURL,
+		result.VerifierStatus,
+		string(facts),
+		result.Notes,
+	)
+	if err != nil {
+		return fmt.Errorf("insert intraday verification: %w", err)
+	}
+	_, err = tx.ExecContext(ctx, `
+		UPDATE intraday_news_candidate
+		SET status = $2,
+		    verification_confidence = $3,
+		    verification_notes = NULLIF($4, ''),
+		    updated_at = CURRENT_TIMESTAMP
+		WHERE id = $1`,
+		result.CandidateID,
+		result.CandidateStatus,
+		result.VerificationConfidence,
+		result.Notes,
+	)
+	if err != nil {
+		return fmt.Errorf("update intraday candidate: %w", err)
+	}
+	if err := tx.Commit(); err != nil {
+		return fmt.Errorf("commit intraday verification tx: %w", err)
+	}
+	return nil
+}
+
+// summarizeIntradayVerification counts verdicts by status and confidence.
+func summarizeIntradayVerification(results []intradayVerificationResult, dryRun bool) intradayVerificationSummary {
+	summary := intradayVerificationSummary{CandidateTotal: len(results), DryRun: dryRun}
+	for _, result := range results {
+		if result.CandidateStatus == "verified" {
+			summary.VerifiedTotal++
+		}
+		switch result.VerificationConfidence {
+		case "official_confirmed":
+			summary.OfficialConfirmedTotal++
+		case "secondary_confirmed":
+			summary.SecondaryConfirmedTotal++
+		}
+		if result.CandidateStatus == "rejected" {
+			summary.RejectedTotal++
+		}
+	}
+	return summary
+}
+
+// printIntradayVerificationSummary writes summary to stdout as one JSON line.
+func printIntradayVerificationSummary(summary intradayVerificationSummary) error {
+	payload, err := json.Marshal(summary)
+	if err != nil {
+		return err
+	}
+	fmt.Println(string(payload))
+	return nil
+}
diff --git a/scripts/verify_intraday_news_candidates_test.go b/scripts/verify_intraday_news_candidates_test.go
new file mode 100644
index 0000000..0012827
--- /dev/null
+++ b/scripts/verify_intraday_news_candidates_test.go
@@ -0,0 +1,114 @@
+//go:build llm_script
+
+package main
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+func TestVerifyCandidateDocumentOfficialRelease(t *testing.T) {
+	body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_official_release.html"))
+	if err != nil {
+		t.Fatalf("读取 official release fixture 失败: %v", err)
+	}
+	candidate := verificationCandidateRow{
+		ID:           1,
+		EventType:    "official_release",
+		ProviderName: "OpenAI",
+		ModelName:    "GPT-5.6",
+		Title:        "GPT-5.6 preview pricing update",
+	}
+	result := verifyCandidateDocument(candidate, "https://openai.com/news/gpt-5-6-preview", string(body))
+	if result.CandidateStatus != "verified" || result.VerificationConfidence != "official_confirmed" {
+		t.Fatalf("官方发布应被确认: %+v", result)
+	}
+}
+
+func TestVerifyCandidateDocumentPriceCutNeedsRealPriceFacts(t *testing.T) {
+	body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_pricing_page.html"))
+	if err != nil {
+		t.Fatalf("读取 pricing fixture 失败: %v", err)
+	}
+	candidate := verificationCandidateRow{
+		ID:           2,
+		EventType:    "price_cut",
+		ProviderName: "DeepSeek",
+		ModelName:    "DeepSeek-V4-Flash",
+		Title:        "DeepSeek-V4-Flash price cut",
+	}
+	result := verifyCandidateDocument(candidate, "https://deepseek.com/pricing/v4-flash", string(body))
+	if result.CandidateStatus != "verified" || result.VerificationConfidence != "official_confirmed" {
+		t.Fatalf("价格页命中真实价格变化后应确认: %+v", result)
+	}
+}
+
+func TestVerifyCandidateDocumentPromoCampaignOfficial(t *testing.T) {
+	body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_pricing_page.html"))
+	if err != nil {
+		t.Fatalf("读取 promo fixture 失败: %v", err)
+	}
+	candidate := verificationCandidateRow{
+		ID:           3,
+		EventType:    "promo_campaign",
+		ProviderName: "DeepSeek",
+		ModelName:    "DeepSeek-V4-Flash",
+		Title:        "DeepSeek V4 Flash campaign",
+	}
+	result := verifyCandidateDocument(candidate, "https://deepseek.com/campaign/v4-flash", string(body))
+	if result.CandidateStatus != "verified" || result.VerificationConfidence != "official_confirmed" {
+		t.Fatalf("官方活动页应被确认: %+v", result)
+	}
+}
+
+func TestVerifyCandidateDocumentSecondaryMediaDowngrades(t *testing.T) {
+	body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_secondary_media.html"))
+	if err != nil {
+		t.Fatalf("读取 secondary fixture 失败: %v", err)
+	}
+	candidate := verificationCandidateRow{
+		ID:           4,
+		EventType:    "official_release",
+		ProviderName: "OpenAI",
+		ModelName:    "GPT-5.6",
+		Title:        "GPT-5.6 leak discussion",
+	}
+	result := verifyCandidateDocument(candidate, "https://someblog.example.com/gpt-5-6-leak", string(body))
+	if result.VerificationConfidence != "secondary_confirmed" {
+		t.Fatalf("二手媒体应降级为 secondary_confirmed: %+v", result)
+	}
+}
+
+func TestVerifyCandidateDocumentLeakStaysOutOfOfficialFacts(t *testing.T) {
+	body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_secondary_media.html"))
+	if err != nil {
+		t.Fatalf("读取 leak fixture 失败: %v", err)
+	}
+	candidate := verificationCandidateRow{
+		ID:           5,
+		EventType:    "leak_or_rumor",
+		ProviderName: "OpenAI",
+		ModelName:    "GPT-5.6",
+		Title:        "GPT-5.6 leak discussion",
+	}
+	result := verifyCandidateDocument(candidate, "https://someblog.example.com/gpt-5-6-leak", string(body))
+	if result.VerificationConfidence == "official_confirmed" {
+		t.Fatalf("泄露类不应升级为正式事实: %+v", result)
+	}
+}
+
+// Look-alike domains and explicit ports must not change the classification.
+func TestClassifyVerificationSourceRejectsLookalikeHosts(t *testing.T) {
+	cases := []struct{ rawURL, want string }{
+		{"https://openai.com/news/gpt-5-6-preview", "official_blog"},
+		{"https://platform.openai.com:443/docs/models", "official_docs"},
+		{"https://openai.com.evil.example/news/gpt-5-6", "secondary_media"},
+		{"https://notopenai.com/news/gpt-5-6", "secondary_media"},
+	}
+	for _, tc := range cases {
+		if got := classifyVerificationSource("OpenAI", tc.rawURL); got != tc.want {
+			t.Fatalf("classifyVerificationSource(%q) = %q, want %q", tc.rawURL, got, tc.want)
+		}
+	}
+}