Files
llm-intelligence/scripts/import_hunyuan_pricing.go
2026-05-29 18:48:48 +08:00

169 lines
5.5 KiB
Go

//go:build llm_script && !scripts_pkg
package main
import (
"database/sql"
"flag"
"fmt"
"html"
"io"
"net/http"
"os"
"regexp"
"strings"
"time"
)
// defaultHunyuanPricingURL is the default value of the -url flag. The parser
// also records it as SourceURL and ModelSourceURL on every record it emits,
// regardless of which URL was actually fetched.
const defaultHunyuanPricingURL = "https://cloud.tencent.com/document/product/1729/97731"
// hunyuanModelLinePattern matches a line made up solely of ASCII letters,
// digits, spaces, '.', '_' and '-'. The parser uses it to recognize lines that
// carry a model name.
var hunyuanModelLinePattern = regexp.MustCompile(`^[A-Za-z0-9 ._-]+$`)
// hunyuanPricingImportConfig bundles the command-line options of the Hunyuan
// pricing import so they can be passed to runHunyuanPricingImport as one value.
type hunyuanPricingImportConfig struct {
// URL is the pricing page address handed to fetchRawPricingPage (-url flag).
URL string
// Fixture is the value of the -fixture flag, handed to fetchRawPricingPage.
// NOTE(review): presumably a local sample file used instead of the network
// fetch when non-empty; confirm against fetchRawPricingPage.
Fixture string
// DryRun makes the import parse and print a summary without touching the
// database (-dry-run flag).
DryRun bool
// Timeout is the overall timeout set on the HTTP client used for the fetch.
Timeout time.Duration
}
// main parses the command-line flags, opens the database unless -dry-run is
// set, runs the Hunyuan pricing import and exits with status 1 on failure.
//
// Flags:
//
//	-url      pricing page to fetch (defaults to defaultHunyuanPricingURL)
//	-fixture  pricing sample file
//	-dry-run  parse and print a summary only, without writing to the database
//	-timeout  request timeout in seconds
func main() {
	loadSubscriptionImportEnv()

	var (
		url            string
		fixture        string
		dryRun         bool
		timeoutSeconds int
	)
	flag.StringVar(&url, "url", defaultHunyuanPricingURL, "腾讯混元官方价格页")
	flag.StringVar(&fixture, "fixture", "", "腾讯混元价格样例文件")
	flag.BoolVar(&dryRun, "dry-run", false, "仅解析并打印摘要,不写入数据库")
	flag.IntVar(&timeoutSeconds, "timeout", 20, "请求超时(秒)")
	flag.Parse()

	cfg := hunyuanPricingImportConfig{
		URL:     url,
		Fixture: fixture,
		DryRun:  dryRun,
		Timeout: time.Duration(timeoutSeconds) * time.Second,
	}

	// The database is only needed when results are written, so a dry run
	// never opens it and passes a nil handle to the import.
	var db *sql.DB
	if !cfg.DryRun {
		var err error
		db, err = subscriptionImportDB()
		if err != nil {
			fmt.Fprintf(os.Stderr, "open db: %v\n", err)
			os.Exit(1)
		}
	}

	runErr := runHunyuanPricingImport(cfg, db, os.Stdout)

	// Close explicitly instead of deferring: os.Exit terminates the process
	// without running deferred calls, so a deferred Close would be skipped on
	// the failure path below.
	if db != nil {
		if closeErr := db.Close(); closeErr != nil {
			fmt.Fprintf(os.Stderr, "close db: %v\n", closeErr)
		}
	}

	if runErr != nil {
		fmt.Fprintf(os.Stderr, "import_hunyuan_pricing: %v\n", runErr)
		os.Exit(1)
	}
}
// runHunyuanPricingImport fetches the pricing page, parses it into pricing
// records, de-duplicates them and either prints a dry-run summary or upserts
// the records and prints a summary that includes the region_pricing row count.
//
// cfg supplies the URL, fixture, dry-run switch and HTTP timeout. db may be nil
// only when cfg.DryRun is true. The one-line summary is written to out.
//
// It returns any error from fetching, parsing, upserting, counting rows or
// writing the summary, and an error when db is nil for a non-dry run or when
// no records remain after de-duplication.
func runHunyuanPricingImport(cfg hunyuanPricingImportConfig, db *sql.DB, out io.Writer) error {
	client := &http.Client{Timeout: cfg.Timeout}
	raw, err := fetchRawPricingPage(cfg.URL, cfg.Fixture, client)
	if err != nil {
		return err
	}

	records, err := parseHunyuanPricingCatalog(raw)
	if err != nil {
		return err
	}

	records = dedupeOfficialPricingRecords(records)
	// The summaries below report the operator of the first record; guard the
	// index so an unexpectedly empty result surfaces as an error, not a panic.
	if len(records) == 0 {
		return fmt.Errorf("no hunyuan pricing records after dedupe")
	}
	operator := records[0].OperatorName

	if cfg.DryRun {
		_, err = fmt.Fprintf(out, "source=hunyuan-pricing-import models=%d operator=%s dry_run=true\n", len(records), operator)
		return err
	}

	if db == nil {
		return fmt.Errorf("db is required when dry-run=false")
	}
	if err := upsertOfficialPricingRecords(db, records, "hunyuan-pricing-import"); err != nil {
		return err
	}

	// Report the total size of the table after the upsert, not only the
	// number of records handled in this run.
	var tableRows int
	if err := db.QueryRow(`SELECT COUNT(*) FROM region_pricing`).Scan(&tableRows); err != nil {
		return fmt.Errorf("count region_pricing: %w", err)
	}

	_, err = fmt.Fprintf(out, "source=hunyuan-pricing-import models=%d operator=%s table_rows=%d dry_run=false\n", len(records), operator, tableRows)
	return err
}
// parseHunyuanPricingCatalog turns the raw pricing page content into pricing
// records, one per model line that is followed by an input price line and an
// output price line.
//
// The page is flattened into text lines by hunyuanPricingLines and scanned in
// order with a small amount of state (the pending model name and input price):
//
//   - heading and boilerplate lines are skipped;
//   - a line starting with "输入:" sets the pending input price;
//   - a line starting with "输出:" emits a record when a model name and a
//     non-zero input price are pending, then clears both;
//   - a line matching hunyuanModelLinePattern starts a new pending model.
//
// A pending input price of 0 is treated as "no input price seen", so an output
// line that follows it is ignored. An error is returned when no record at all
// could be extracted.
func parseHunyuanPricingCatalog(raw string) ([]officialPricingRecord, error) {
	// Substrings that mark headings and boilerplate rows of the pricing page.
	boilerplate := []string{"混元生文价格说明", "token 后付费", "产品名", "输入长度", "免费额度"}
	skippable := func(text string) bool {
		if text == "" {
			return true
		}
		for _, marker := range boilerplate {
			if strings.Contains(text, marker) {
				return true
			}
		}
		return false
	}
	// priceAfter strips the label prefix and the trailing currency unit
	// before handing the remainder to the shared price parser.
	priceAfter := func(text, label string) float64 {
		amount := strings.TrimPrefix(text, label)
		amount = strings.TrimSuffix(amount, "元")
		return mustParseSubscriptionPrice(amount)
	}

	var (
		catalog      []officialPricingRecord
		pendingModel string
		pendingInput float64
	)

	for _, entry := range hunyuanPricingLines(raw) {
		text := strings.TrimSpace(entry)

		if skippable(text) {
			continue
		}

		if strings.HasPrefix(text, "输入:") {
			pendingInput = priceAfter(text, "输入:")
			continue
		}

		if strings.HasPrefix(text, "输出:") {
			// Without a pending model and input price the output line
			// cannot be attributed to anything; leave the state untouched.
			if pendingModel == "" || pendingInput == 0 {
				continue
			}
			outputPrice := priceAfter(text, "输出:")
			nameCn, country, website := providerMetadata("Tencent")
			record := officialPricingRecord{
				ModelID:         normalizeExternalID("hunyuan", pendingModel),
				ModelName:       pendingModel,
				ProviderName:    "Tencent",
				ProviderNameCn:  nameCn,
				ProviderCountry: country,
				ProviderWebsite: website,
				OperatorName:    "Tencent Hunyuan",
				OperatorNameCn:  "腾讯混元",
				OperatorCountry: "CN",
				OperatorWebsite: "https://cloud.tencent.com/product/hunyuan",
				OperatorType:    "official",
				Region:          "CN",
				Currency:        "CNY",
				InputPrice:      pendingInput,
				OutputPrice:     outputPrice,
				SourceURL:       defaultHunyuanPricingURL,
				ModelSourceURL:  defaultHunyuanPricingURL,
				DateConfidence:  "unknown",
				DateSourceKind:  "official_pricing",
				Modality:        detectModality(pendingModel),
			}
			catalog = append(catalog, record)
			// A record consumes the pending state; the next model starts clean.
			pendingModel, pendingInput = "", 0
			continue
		}

		looksLikeModel := hunyuanModelLinePattern.MatchString(text) &&
			!strings.Contains(text, "元") &&
			!strings.Contains(text, "tokens") &&
			text != "-"
		if looksLikeModel {
			pendingModel = text
			pendingInput = 0
		}
	}

	if len(catalog) == 0 {
		return nil, fmt.Errorf("unexpected hunyuan pricing content")
	}
	return catalog, nil
}
// hunyuanHTMLTagPattern matches any remaining HTML tag, including tags that
// span several lines.
var hunyuanHTMLTagPattern = regexp.MustCompile(`(?is)<[^>]+>`)

// hunyuanWhitespaceRunPattern matches a run of whitespace to be collapsed into
// a single space.
var hunyuanWhitespaceRunPattern = regexp.MustCompile(`\s+`)

// hunyuanLineBreakReplacer turns <br> variants and the closing tags of
// block-level elements into newlines so each becomes a line boundary.
var hunyuanLineBreakReplacer = strings.NewReplacer(
	"<br>", "\n", "<br/>", "\n", "<br />", "\n",
	"</p>", "\n", "</div>", "\n", "</section>", "\n", "</tr>", "\n",
	"</td>", "\n", "</th>", "\n", "</li>", "\n", "</h1>", "\n",
	"</h2>", "\n", "</h3>", "\n", "</h4>", "\n", "</h5>", "\n", "</h6>", "\n",
)

// hunyuanPricingLines flattens raw page content into non-empty text lines with
// whitespace normalized.
//
// Processing order:
//  1. the literal escape sequences \u003c, \u003e, \n and \t are decoded
//     (to '<', '>', a newline and a space respectively);
//  2. HTML entities are unescaped;
//  3. <br> variants and block-level closing tags become newlines;
//  4. every remaining tag is replaced by a space;
//  5. each line has its whitespace runs collapsed and is trimmed; empty lines
//     are dropped.
//
// The result is never nil; it is empty when raw holds no visible text.
func hunyuanPricingLines(raw string) []string {
	raw = strings.ReplaceAll(raw, `\u003c`, "<")
	raw = strings.ReplaceAll(raw, `\u003e`, ">")
	raw = strings.ReplaceAll(raw, `\n`, "\n")
	raw = strings.ReplaceAll(raw, `\t`, " ")
	raw = html.UnescapeString(raw)

	withBreaks := hunyuanLineBreakReplacer.Replace(raw)
	withBreaks = hunyuanHTMLTagPattern.ReplaceAllString(withBreaks, " ")

	parts := strings.Split(withBreaks, "\n")
	lines := make([]string, 0, len(parts))
	for _, part := range parts {
		line := strings.TrimSpace(hunyuanWhitespaceRunPattern.ReplaceAllString(part, " "))
		if line != "" {
			lines = append(lines, line)
		}
	}
	return lines
}