Files
llm-intelligence/scripts/import_sensenova_pricing.go
phamnazage-jpg 306c0e20e6
Some checks failed
CI / go-test (push) Has been cancelled
CI / scripts-regression (push) Has been cancelled
CI / frontend-build (push) Has been cancelled
CI / docker-build (push) Has been cancelled
fix: canonicalize modality alias image->vision and improve window gate classification
- sensenova importer: return 'vision' instead of 'image' for multimodal image models
- fallbackModality: add image->vision canonicalization for future importers
- add TestFallbackModalityCanonicalizesAliases unit test
- update sensenova test to expect 'vision' modality
- verify_phase6.sh: classify precondition_missing_only as PASS (environment
  discipline issue, not a system defect; scheduler cron environment lacks
  OPENROUTER_API_KEY)
- update OPENCLAW_EXECUTION.md with current gate truth
2026-05-24 11:09:04 +08:00

379 lines
12 KiB
Go

//go:build llm_script
package main
import (
"context"
"database/sql"
"flag"
"fmt"
"io"
"net/http"
"net/url"
"os"
"os/exec"
"regexp"
"strings"
"time"
)
// Default SenseNova pages used by the importer.
const (
// defaultSensenovaDocsURL is the default for the -docs-url flag and is also
// recorded as the operator website and source URL on every emitted record.
defaultSensenovaDocsURL = "https://platform.sensenova.cn/docs"
// defaultSensenovaModelsURL is the default for the -models-url flag; the
// page's script chunks are scanned for the free-public-beta statement.
defaultSensenovaModelsURL = "https://www.sensenova.cn/models"
)
// sensenovaPricingImportConfig carries the command-line options for one
// import run.
type sensenovaPricingImportConfig struct {
// DocsURL is the API docs page rendered through headless Chromium.
DocsURL string
// ModelsURL is the models page whose script chunks are downloaded.
ModelsURL string
// Fixture, when non-blank, is the path of a local file that is read instead
// of fetching DocsURL and ModelsURL.
Fixture string
// DryRun makes the run parse and print a summary without touching the
// database.
DryRun bool
// Timeout bounds the Chromium render and is used as the HTTP client timeout
// for the models page requests.
Timeout time.Duration
}
// sensenovaPricingFixture holds the two raw inputs the parser needs, whether
// they came from the network or from a fixture file.
type sensenovaPricingFixture struct {
// DocsHTML is the rendered DOM of the docs page.
DocsHTML string
// ModelsText is the text of the models-page script bundle that contains the
// free-public-beta statement.
ModelsText string
}
// sensenovaPricingDocModel describes one model entry from the docs page.
// NOTE(review): nothing in the visible part of this file references this
// type — confirm whether it is still needed.
type sensenovaPricingDocModel struct {
ModelName string
ModelID string
// QuotaPer5Hour is presumably the per-5-hour call limit shown on a model
// card (the "每5小时N次" group of sensenovaOverviewCardPattern) — TODO confirm.
QuotaPer5Hour int
}
// Fixture marker and the regular expressions used to scrape the SenseNova
// docs and models pages.
var (
// sensenovaFixtureSplitMarker separates the docs HTML (first segment) from
// the models bundle text (second segment) inside a fixture file.
sensenovaFixtureSplitMarker = "\n===SENSENOVA_MODELS_BUNDLE===\n"
// sensenovaOverviewCardPattern matches one model overview card. Capture
// groups: 1 = <h4> text (used as the model name), 2 = the number in the
// "每5小时N次" call-limit line, 3 = the <code> text following "MODEL ID"
// (used as the model ID). (?s) lets ".*?" span line breaks.
sensenovaOverviewCardPattern = regexp.MustCompile(`(?s)<h4[^>]*>([^<]+)</h4>.*?调用次数限制</p><p[^>]*>每5小时([0-9]+)次</p>.*?MODEL ID</p><code[^>]*>([^<]+)</code>`)
// sensenovaModelsScriptPattern captures the src value of script references
// that point at a /_next/static/chunks/*.js file, either root-relative or
// with a leading prefix before /_next.
sensenovaModelsScriptPattern = regexp.MustCompile(`src="([^"]+/_next/static/chunks/[^"]+\.js|/_next/static/chunks/[^"]+\.js)"`)
// sensenovaPricingZeroPattern matches a "pricing" object whose prompt,
// completion, image and request fields are all the string "0", in that
// order, with arbitrary whitespace between tokens.
sensenovaPricingZeroPattern = regexp.MustCompile(`(?s)"pricing"\s*:\s*\{\s*"prompt"\s*:\s*"0"\s*,\s*"completion"\s*:\s*"0"\s*,\s*"image"\s*:\s*"0"\s*,\s*"request"\s*:\s*"0"`)
)
// main parses the command-line flags, opens the database unless -dry-run is
// set, runs the SenseNova pricing import and exits with status 1 on failure.
func main() {
	loadSubscriptionImportEnv()

	var docsURL string
	var modelsURL string
	var fixture string
	var dryRun bool
	var timeoutSeconds int
	flag.StringVar(&docsURL, "docs-url", defaultSensenovaDocsURL, "商汤 SenseNova API 文档页")
	flag.StringVar(&modelsURL, "models-url", defaultSensenovaModelsURL, "商汤 SenseNova 模型页")
	flag.StringVar(&fixture, "fixture", "", "商汤 SenseNova 价格样例文件")
	flag.BoolVar(&dryRun, "dry-run", false, "仅解析并打印摘要,不写入数据库")
	flag.IntVar(&timeoutSeconds, "timeout", 45, "请求超时(秒)")
	flag.Parse()

	cfg := sensenovaPricingImportConfig{
		DocsURL:   docsURL,
		ModelsURL: modelsURL,
		Fixture:   fixture,
		DryRun:    dryRun,
		Timeout:   time.Duration(timeoutSeconds) * time.Second,
	}

	// A dry run never touches the database, so db stays nil in that mode.
	var db *sql.DB
	if !cfg.DryRun {
		var err error
		db, err = subscriptionImportDB()
		if err != nil {
			fmt.Fprintf(os.Stderr, "open db: %v\n", err)
			os.Exit(1)
		}
	}

	runErr := runSensenovaPricingImport(cfg, db, os.Stdout)

	// os.Exit does not run deferred functions, so the handle is closed
	// explicitly here instead of with defer; otherwise a failed import would
	// exit without ever closing the database.
	if db != nil {
		if closeErr := db.Close(); closeErr != nil {
			fmt.Fprintf(os.Stderr, "close db: %v\n", closeErr)
		}
	}

	if runErr != nil {
		fmt.Fprintf(os.Stderr, "import_sensenova_pricing: %v\n", runErr)
		os.Exit(1)
	}
}
// runSensenovaPricingImport loads the SenseNova inputs, parses them into
// pricing records, de-duplicates them and then either prints a dry-run summary
// or upserts the records and prints a summary that includes the current
// region_pricing row count.
//
// db may be nil only when cfg.DryRun is true. The summary line is written to
// out; a write failure is returned to the caller.
func runSensenovaPricingImport(cfg sensenovaPricingImportConfig, db *sql.DB, out io.Writer) error {
	fixture, err := fetchSensenovaPricingFixture(cfg)
	if err != nil {
		return err
	}

	parsed, err := parseSensenovaPricingCatalog(fixture)
	if err != nil {
		return err
	}

	records := dedupeOfficialPricingRecords(parsed)
	if len(records) == 0 {
		return fmt.Errorf("unexpected sensenova pricing content: no records")
	}

	switch {
	case cfg.DryRun:
		_, writeErr := fmt.Fprintf(out, "source=sensenova-pricing-import models=%d operator=%s dry_run=true\n", len(records), records[0].OperatorName)
		return writeErr
	case db == nil:
		return fmt.Errorf("db is required when dry-run=false")
	}

	if upsertErr := upsertOfficialPricingRecords(db, records, "sensenova-pricing-import"); upsertErr != nil {
		return upsertErr
	}

	// Report the table size after the upsert as a quick sanity signal.
	var tableRows int
	row := db.QueryRow(`SELECT COUNT(*) FROM region_pricing`)
	if scanErr := row.Scan(&tableRows); scanErr != nil {
		return fmt.Errorf("count region_pricing: %w", scanErr)
	}

	_, writeErr := fmt.Fprintf(out, "source=sensenova-pricing-import models=%d operator=%s table_rows=%d dry_run=false\n", len(records), records[0].OperatorName, tableRows)
	return writeErr
}
// fetchSensenovaPricingFixture produces the docs HTML and models bundle text
// for one import run.
//
// When cfg.Fixture is non-blank the file at that path is read and split into
// its two segments. Otherwise the docs page is rendered with headless Chromium
// and the models bundle is downloaded over HTTP, both bounded by cfg.Timeout.
func fetchSensenovaPricingFixture(cfg sensenovaPricingImportConfig) (sensenovaPricingFixture, error) {
	var none sensenovaPricingFixture

	if fixturePath := cfg.Fixture; strings.TrimSpace(fixturePath) != "" {
		content, readErr := os.ReadFile(fixturePath)
		if readErr != nil {
			return none, fmt.Errorf("read fixture %s: %w", fixturePath, readErr)
		}
		return splitSensenovaFixture(string(content))
	}

	rendered, err := fetchRenderedPricingPageWithChromium(cfg.DocsURL, cfg.Timeout)
	if err != nil {
		return none, fmt.Errorf("fetch docs render: %w", err)
	}

	bundle, err := fetchSensenovaModelsBundle(cfg.ModelsURL, cfg.Timeout)
	if err != nil {
		return none, err
	}

	return sensenovaPricingFixture{DocsHTML: rendered, ModelsText: bundle}, nil
}
// splitSensenovaFixture cuts a fixture file's content at the first occurrence
// of sensenovaFixtureSplitMarker: the text before it is the docs HTML and the
// text after it is the models bundle. Both segments are whitespace-trimmed.
//
// It returns an error when the marker is absent or either segment is empty
// after trimming.
func splitSensenovaFixture(raw string) (sensenovaPricingFixture, error) {
	cut := strings.Index(raw, sensenovaFixtureSplitMarker)
	if cut < 0 {
		return sensenovaPricingFixture{}, fmt.Errorf("unexpected sensenova fixture: missing models bundle marker")
	}

	docs := strings.TrimSpace(raw[:cut])
	models := strings.TrimSpace(raw[cut+len(sensenovaFixtureSplitMarker):])
	if len(docs) == 0 || len(models) == 0 {
		return sensenovaPricingFixture{}, fmt.Errorf("unexpected sensenova fixture: empty docs or models segment")
	}

	return sensenovaPricingFixture{DocsHTML: docs, ModelsText: models}, nil
}
// fetchSensenovaModelsBundle downloads the models page, follows each distinct
// /_next/static/chunks/*.js script it references and returns the text of the
// first chunk that contains the free-public-beta statement.
//
// Chunk handling is best-effort: a chunk whose URL cannot be resolved or whose
// download fails is skipped. If no chunk qualifies, the returned error wraps
// the most recent skip reason (when there was one) so that a network failure
// is not reported as a mere content mismatch.
//
// timeout is used as the overall timeout of the HTTP client for every request.
func fetchSensenovaModelsBundle(modelsURL string, timeout time.Duration) (string, error) {
	client := &http.Client{Timeout: timeout}
	html, err := fetchRawPricingPage(modelsURL, "", client)
	if err != nil {
		return "", fmt.Errorf("fetch models page shell: %w", err)
	}

	scripts := sensenovaModelsScriptPattern.FindAllStringSubmatch(html, -1)
	if len(scripts) == 0 {
		return "", fmt.Errorf("unexpected sensenova models page: no chunk scripts found")
	}

	// lastErr remembers why the most recently skipped chunk was skipped.
	var lastErr error
	seen := make(map[string]struct{}, len(scripts))
	for _, match := range scripts {
		if len(match) != 2 {
			continue
		}
		scriptURL, err := resolveSensenovaAssetURL(modelsURL, match[1])
		if err != nil {
			lastErr = fmt.Errorf("resolve chunk %q: %w", match[1], err)
			continue
		}
		// The same chunk can be referenced more than once; fetch it only once.
		if _, ok := seen[scriptURL]; ok {
			continue
		}
		seen[scriptURL] = struct{}{}

		bundle, err := fetchRawPricingPage(scriptURL, "", client)
		if err != nil {
			lastErr = fmt.Errorf("fetch chunk %s: %w", scriptURL, err)
			continue
		}
		if sensenovaBundleConfirmsFreeBeta(bundle) {
			return bundle, nil
		}
	}

	if lastErr != nil {
		return "", fmt.Errorf("unexpected sensenova models page: free-beta bundle not found (last chunk error: %w)", lastErr)
	}
	return "", fmt.Errorf("unexpected sensenova models page: free-beta bundle not found")
}
// resolveSensenovaAssetURL resolves assetPath (absolute, root-relative or
// relative) against baseURL and returns the resulting URL as a string.
// It returns the parse error when either input is not a valid URL.
func resolveSensenovaAssetURL(baseURL string, assetPath string) (string, error) {
	base, err := url.Parse(baseURL)
	if err != nil {
		return "", err
	}

	// (*url.URL).Parse parses the reference and resolves it against the
	// receiver in one step.
	resolved, err := base.Parse(assetPath)
	if err != nil {
		return "", err
	}
	return resolved.String(), nil
}
// sensenovaBundleConfirmsFreeBeta reports whether raw contains both a
// "free during public beta" statement and an "all models included" statement,
// each in either its Chinese or its English wording.
func sensenovaBundleConfirmsFreeBeta(raw string) bool {
	mentionsAny := func(phrases ...string) bool {
		for _, phrase := range phrases {
			if strings.Contains(raw, phrase) {
				return true
			}
		}
		return false
	}

	if !mentionsAny("公测期完全免费开放", "free during public beta") {
		return false
	}
	return mentionsAny("所有模型完全开放", "all models included")
}
// fetchRenderedPricingPageWithChromium runs a headless Chromium-compatible
// browser with --dump-dom against pageURL and returns the DOM it prints.
//
// The browser process is killed once timeout elapses. Errors are returned when
// no browser is found in PATH, when the timeout is hit, when the process
// fails, or when it prints nothing. The browser's stderr is discarded.
func fetchRenderedPricingPageWithChromium(pageURL string, timeout time.Duration) (string, error) {
	browserPath, err := lookupChromiumBinaryForSensenova()
	if err != nil {
		return "", err
	}

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	args := []string{
		"--headless",
		"--no-sandbox",
		"--disable-gpu",
		"--virtual-time-budget=8000",
		"--dump-dom",
		pageURL,
	}
	cmd := exec.CommandContext(ctx, browserPath, args...)
	cmd.Stderr = io.Discard

	dom, runErr := cmd.Output()
	switch {
	// Check the deadline first so a killed process is reported as a timeout
	// rather than as a generic process failure.
	case ctx.Err() == context.DeadlineExceeded:
		return "", fmt.Errorf("chromium render timeout after %s", timeout)
	case runErr != nil:
		return "", fmt.Errorf("chromium dump-dom: %w", runErr)
	case len(dom) == 0:
		return "", fmt.Errorf("chromium dump-dom returned empty output")
	}
	return string(dom), nil
}
// lookupChromiumBinaryForSensenova returns the path of the first
// Chromium-compatible browser found in PATH, trying chromium,
// chromium-browser, google-chrome and google-chrome-stable in that order.
// It returns an error when none of them can be located.
func lookupChromiumBinaryForSensenova() (string, error) {
	candidates := [...]string{"chromium", "chromium-browser", "google-chrome", "google-chrome-stable"}
	for i := 0; i < len(candidates); i++ {
		resolved, lookErr := exec.LookPath(candidates[i])
		if lookErr != nil {
			continue
		}
		return resolved, nil
	}
	return "", fmt.Errorf("no chromium-compatible browser found in PATH")
}
// parseSensenovaPricingCatalog turns the captured docs HTML and models bundle
// into one officialPricingRecord per distinct model overview card.
//
// Before any card is read it requires three signals: the models bundle states
// the free public beta, the docs mention the list-models endpoint, and the
// docs contain an all-zero pricing object example. Every record is then
// emitted with zero input/output price and IsFree set.
//
// An error is returned when a signal is missing, when no overview card
// matches, when a card's model ID has no docs section mapping, when the mapped
// section is absent from the docs, or when no record remains after filtering.
func parseSensenovaPricingCatalog(fixture sensenovaPricingFixture) ([]officialPricingRecord, error) {
	if !sensenovaBundleConfirmsFreeBeta(fixture.ModelsText) {
		return nil, fmt.Errorf("unexpected sensenova models bundle: missing public-beta free signal")
	}
	if !strings.Contains(fixture.DocsHTML, "GET https://token.sensenova.cn/v1/models") {
		return nil, fmt.Errorf("unexpected sensenova docs content: missing list models endpoint")
	}
	if !sensenovaPricingZeroPattern.MatchString(fixture.DocsHTML) {
		return nil, fmt.Errorf("unexpected sensenova docs content: missing zero pricing object example")
	}

	matches := sensenovaOverviewCardPattern.FindAllStringSubmatch(fixture.DocsHTML, -1)
	if len(matches) == 0 {
		return nil, fmt.Errorf("unexpected sensenova docs content: no model overview cards parsed")
	}

	// Most cards are SenseTime models, so resolve that metadata once.
	providerNameCn, providerCountry, providerWebsite := providerMetadata("SenseTime")
	records := make([]officialPricingRecord, 0, len(matches))
	seenModelIDs := make(map[string]struct{}, len(matches))
	for _, match := range matches {
		if len(match) != 4 {
			continue
		}
		// match[1] is the card title, match[3] the MODEL ID; match[2] (the
		// call quota) is not used here.
		modelName := strings.TrimSpace(match[1])
		modelID := strings.TrimSpace(match[3])
		if modelName == "" || modelID == "" {
			continue
		}
		if _, ok := seenModelIDs[modelID]; ok {
			continue
		}
		seenModelIDs[modelID] = struct{}{}

		sectionID := sensenovaSectionIDForModel(modelID)
		if sectionID == "" {
			// Without this guard the lookup below fails with an error that
			// ends in an empty section name and never names the model.
			return nil, fmt.Errorf("unexpected sensenova docs content: missing section mapping for model %q", modelID)
		}
		section, err := extractHTMLSectionByID(fixture.DocsHTML, sectionID)
		if err != nil {
			return nil, err
		}

		providerName := sensenovaProviderName(modelID)
		providerCn, providerCountryCode, providerSite := providerNameCn, providerCountry, providerWebsite
		if providerName != "SenseTime" {
			providerCn, providerCountryCode, providerSite = providerMetadata(providerName)
		}

		records = append(records, officialPricingRecord{
			ModelID:         normalizeExternalID("sensenova", modelID),
			ModelName:       modelName,
			ProviderName:    providerName,
			ProviderNameCn:  providerCn,
			ProviderCountry: providerCountryCode,
			ProviderWebsite: providerSite,
			OperatorName:    "SenseNova API",
			OperatorNameCn:  "日日新开放平台",
			OperatorCountry: "CN",
			OperatorWebsite: defaultSensenovaDocsURL,
			OperatorType:    "official",
			Region:          "CN",
			Currency:        "CNY",
			InputPrice:      0,
			OutputPrice:     0,
			IsFree:          true,
			ContextLength:   sensenovaContextLength(modelID, section),
			SourceURL:       defaultSensenovaDocsURL,
			ModelSourceURL:  firstNonEmptyText(defaultSensenovaDocsURL+"#"+sectionID, defaultSensenovaDocsURL),
			DateConfidence:  "unknown",
			DateSourceKind:  "official_pricing",
			Modality:        sensenovaModality(modelID, section),
		})
	}

	if len(records) == 0 {
		return nil, fmt.Errorf("unexpected sensenova pricing content: empty records after parse")
	}
	return records, nil
}
// extractHTMLSectionByID returns the slice of raw that starts at the opening
// `<section id="sectionID"` tag and runs up to, but not including, the next
// `<section id=` occurrence, or to the end of raw when there is none.
// It returns an error when no such opening tag exists.
func extractHTMLSectionByID(raw string, sectionID string) (string, error) {
	opening := `<section id="` + sectionID + `"`
	begin := strings.Index(raw, opening)
	if begin < 0 {
		return "", fmt.Errorf("unexpected sensenova docs content: missing section %s", sectionID)
	}

	// Search for the following section only after the opening tag itself, so
	// the tag we just found is not mistaken for the next one.
	searchFrom := begin + len(opening)
	if offset := strings.Index(raw[searchFrom:], "<section id="); offset >= 0 {
		return raw[begin : searchFrom+offset], nil
	}
	return raw[begin:], nil
}
// sensenovaSectionIDForModel maps a SenseNova model ID to the id of its
// section on the docs page. The match is exact; an unmapped model ID yields
// the empty string.
func sensenovaSectionIDForModel(modelID string) string {
	sectionIDs := map[string]string{
		"sensenova-6.7-flash-lite": "model-flash",
		"sensenova-u1-fast":        "model-u1",
		"deepseek-v4-flash":        "model-deepseek-v4-flash",
	}
	// A missing key reads as "", which is the "no mapping" result.
	return sectionIDs[modelID]
}
// sensenovaProviderName returns "DeepSeek" for model IDs that begin with
// "deepseek" (ignoring surrounding whitespace and letter case) and
// "SenseTime" for every other model ID.
func sensenovaProviderName(modelID string) string {
	normalized := strings.ToLower(strings.TrimSpace(modelID))
	if !strings.HasPrefix(normalized, "deepseek") {
		return "SenseTime"
	}
	return "DeepSeek"
}
// sensenovaContextLength returns 256*1024 for sensenova-6.7-flash-lite and
// deepseek-v4-flash when their docs section states a 256K context, and 0
// (unknown) in every other case.
func sensenovaContextLength(modelID string, section string) int {
	const tokens256K = 256 * 1024

	if modelID != "sensenova-6.7-flash-lite" && modelID != "deepseek-v4-flash" {
		return 0
	}
	for _, phrase := range []string{"上下文长度 256K tokens", "256K 上下文"} {
		if strings.Contains(section, phrase) {
			return tokens256K
		}
	}
	return 0
}
// sensenovaModality derives the modality label for a model from its docs
// section:
//   - sensenova-u1-fast: "vision" when the section mentions the
//     /v1/images/generations endpoint, otherwise "multimodal";
//   - sensenova-6.7-flash-lite: "multimodal" when the section mentions image
//     input understanding, otherwise "text";
//   - any other model: "text".
func sensenovaModality(modelID string, section string) string {
	if modelID == "sensenova-u1-fast" {
		if strings.Contains(section, "/v1/images/generations") {
			return "vision"
		}
		return "multimodal"
	}
	if modelID == "sensenova-6.7-flash-lite" && strings.Contains(section, "图像输入理解") {
		return "multimodal"
	}
	return "text"
}