Files
llm-intelligence/scripts/import_bytedance_pricing.go
2026-05-29 18:48:48 +08:00

285 lines
7.9 KiB
Go

//go:build llm_script && !scripts_pkg
package main
import (
"database/sql"
"encoding/json"
"flag"
"fmt"
"html"
"io"
"net/http"
"os"
"regexp"
"strings"
"time"
)
// defaultBytedanceArkPricingURL is the default value of the -url flag. It is
// also the value stored in SourceURL and ModelSourceURL of every record
// produced by parseBytedanceArkPricingCatalog.
const defaultBytedanceArkPricingURL = "https://www.volcengine.com/docs/82379/1544106"
// bytedanceArkPricingImportConfig carries the command-line options for one
// run of the Volcano Ark pricing import.
type bytedanceArkPricingImportConfig struct {
// URL is the pricing page address handed to fetchRawPricingPage.
URL string
// Fixture is the sample-file path handed to fetchRawPricingPage
// (presumably read instead of URL when non-empty; confirm in that helper).
Fixture string
// DryRun makes the import print a summary only; no database is opened or written.
DryRun bool
// Timeout is used as the http.Client timeout for the page request.
Timeout time.Duration
}
// main parses the command-line flags, opens the database unless -dry-run is
// set, and runs the Volcano Ark pricing import. Any failure is reported on
// stderr and the process exits with status 1.
func main() {
	loadSubscriptionImportEnv()

	var url string
	var fixture string
	var dryRun bool
	var timeoutSeconds int
	flag.StringVar(&url, "url", defaultBytedanceArkPricingURL, "火山方舟官方模型价格页")
	flag.StringVar(&fixture, "fixture", "", "火山方舟价格样例文件")
	flag.BoolVar(&dryRun, "dry-run", false, "仅解析并打印摘要,不写入数据库")
	flag.IntVar(&timeoutSeconds, "timeout", 20, "请求超时(秒)")
	flag.Parse()

	cfg := bytedanceArkPricingImportConfig{
		URL:     url,
		Fixture: fixture,
		DryRun:  dryRun,
		Timeout: time.Duration(timeoutSeconds) * time.Second,
	}

	// db stays nil in dry-run mode; runBytedanceArkPricingImport only needs
	// it when it actually writes.
	var db *sql.DB
	if !cfg.DryRun {
		var err error
		db, err = subscriptionImportDB()
		if err != nil {
			fmt.Fprintf(os.Stderr, "open db: %v\n", err)
			os.Exit(1)
		}
	}

	err := runBytedanceArkPricingImport(cfg, db, os.Stdout)

	// Close explicitly instead of deferring: os.Exit below terminates the
	// process without running deferred calls, which would leave the handle
	// unclosed on the failure path.
	if db != nil {
		// The close error is intentionally ignored; the process is about to
		// end and the import result has already been determined.
		_ = db.Close()
	}

	if err != nil {
		fmt.Fprintf(os.Stderr, "import_bytedance_pricing: %v\n", err)
		os.Exit(1)
	}
}
// runBytedanceArkPricingImport obtains the raw pricing page through
// fetchRawPricingPage, parses it into pricing records, de-duplicates them and
// then either prints a dry-run summary or upserts the records and prints a
// summary that includes the region_pricing row count.
//
// cfg supplies the URL, fixture path, dry-run switch and HTTP timeout. db may
// be nil only when cfg.DryRun is true. The one-line summary is written to out.
// The first error from fetching, parsing, writing or counting is returned.
func runBytedanceArkPricingImport(cfg bytedanceArkPricingImportConfig, db *sql.DB, out io.Writer) error {
	httpClient := &http.Client{Timeout: cfg.Timeout}

	page, err := fetchRawPricingPage(cfg.URL, cfg.Fixture, httpClient)
	if err != nil {
		return err
	}

	parsed, err := parseBytedanceArkPricingCatalog(page)
	if err != nil {
		return err
	}
	records := dedupeOfficialPricingRecords(parsed)

	// Dry run: report what would have been imported and stop.
	if cfg.DryRun {
		_, writeErr := fmt.Fprintf(out, "source=bytedance-pricing-import models=%d operator=%s dry_run=true\n", len(records), records[0].OperatorName)
		return writeErr
	}

	if db == nil {
		return fmt.Errorf("db is required when dry-run=false")
	}

	upsertErr := upsertOfficialPricingRecords(db, records, "bytedance-pricing-import")
	if upsertErr != nil {
		return upsertErr
	}

	// Report the table size after the write so the operator can sanity-check it.
	var totalRows int
	countRow := db.QueryRow(`SELECT COUNT(*) FROM region_pricing`)
	if scanErr := countRow.Scan(&totalRows); scanErr != nil {
		return fmt.Errorf("count region_pricing: %w", scanErr)
	}

	_, writeErr := fmt.Fprintf(out, "source=bytedance-pricing-import models=%d operator=%s table_rows=%d dry_run=false\n", len(records), records[0].OperatorName, totalRows)
	return writeErr
}
// parseBytedanceArkPricingCatalog turns the raw pricing page (or plain
// markdown) into pricing records for the "## 在线推理(常规)" table.
//
// The first table row is treated as the header. A data row is skipped when it
// has fewer than six cells, when its first cell is empty or a condition row,
// or when either the input price (cell 2) or the output price (cell 5) is not
// a positive number. An error is returned when the markdown or the table
// cannot be located, when the table has fewer than two rows, or when no row
// survives the filtering.
func parseBytedanceArkPricingCatalog(raw string) ([]officialPricingRecord, error) {
	const (
		nameColumn   = 0
		inputColumn  = 2
		outputColumn = 5
		minColumns   = 6
	)

	markdown, err := extractBytedanceArkPricingMarkdown(raw)
	if err != nil {
		return nil, err
	}
	table, err := extractMarkdownTableRowsForHeading(markdown, "## 在线推理(常规)")
	if err != nil {
		return nil, err
	}
	if len(table) < 2 {
		return nil, fmt.Errorf("unexpected bytedance ark pricing table")
	}

	dataRows := table[1:] // table[0] is the header row
	catalog := make([]officialPricingRecord, 0, len(dataRows))
	for idx := range dataRows {
		cells := dataRows[idx]
		if len(cells) < minColumns {
			continue
		}

		name := cleanBytedanceArkCell(cells[nameColumn])
		if name == "" || isBytedanceArkConditionRow(name) {
			continue
		}

		priceIn := bytedanceArkPriceValue(cells[inputColumn])
		priceOut := bytedanceArkPriceValue(cells[outputColumn])
		if priceIn <= 0 || priceOut <= 0 {
			continue
		}

		provider := bytedanceArkProviderName(name)
		providerCn, providerCountry, providerSite := providerMetadata(provider)

		var record officialPricingRecord
		record.ModelID = normalizeExternalID("bytedance", name)
		record.ModelName = name
		record.ProviderName = provider
		record.ProviderNameCn = providerCn
		record.ProviderCountry = providerCountry
		record.ProviderWebsite = providerSite
		record.OperatorName = "ByteDance Volcano"
		record.OperatorNameCn = "火山引擎"
		record.OperatorCountry = "CN"
		record.OperatorWebsite = "https://www.volcengine.com/product/ark"
		record.OperatorType = "official"
		record.Region = "CN"
		record.Currency = "CNY"
		record.InputPrice = priceIn
		record.OutputPrice = priceOut
		record.SourceURL = defaultBytedanceArkPricingURL
		record.ModelSourceURL = defaultBytedanceArkPricingURL
		record.DateConfidence = "unknown"
		record.DateSourceKind = "official_pricing"
		record.Modality = detectModality(name)
		catalog = append(catalog, record)
	}

	if len(catalog) == 0 {
		return nil, fmt.Errorf("no bytedance ark input/output pricing rows found")
	}
	return catalog, nil
}
// extractBytedanceArkPricingMarkdown returns the markdown body of the pricing
// document.
//
// Input that does not contain the "window._ROUTER_DATA = " marker is returned
// unchanged. Otherwise the JSON object following the marker is decoded and the
// string at loaderData -> "docs/(libid)/(docid$)/page" -> curDoc -> MDContent
// is returned. An error is returned when the JSON cannot be isolated or
// decoded, or when that string is missing or blank.
func extractBytedanceArkPricingMarkdown(raw string) (string, error) {
	const routerMarker = "window._ROUTER_DATA = "

	if !strings.Contains(raw, routerMarker) {
		return raw, nil
	}

	payload, err := extractJSONAfterMarker(raw, routerMarker)
	if err != nil {
		return "", err
	}

	var root map[string]any
	if decodeErr := json.Unmarshal([]byte(payload), &root); decodeErr != nil {
		return "", fmt.Errorf("parse bytedance router json: %w", decodeErr)
	}

	// Walk down the nested objects. A missing or mistyped level yields a nil
	// map, and indexing a nil map is safe, so the walk simply ends up empty.
	node := root
	for _, key := range []string{"loaderData", "docs/(libid)/(docid$)/page", "curDoc"} {
		node, _ = node[key].(map[string]any)
	}

	content, _ := node["MDContent"].(string)
	if strings.TrimSpace(content) == "" {
		return "", fmt.Errorf("missing bytedance pricing markdown content")
	}
	return content, nil
}
// extractJSONAfterMarker returns the text that starts immediately after the
// first occurrence of marker in raw and ends at the closing brace that brings
// the brace depth back to zero.
//
// Braces inside double-quoted strings are ignored, and a backslash inside a
// string escapes the following byte. An error is returned when the marker is
// absent or when no balancing closing brace is found.
func extractJSONAfterMarker(raw string, marker string) (string, error) {
	markerAt := strings.Index(raw, marker)
	if markerAt < 0 {
		return "", fmt.Errorf("marker %q not found", marker)
	}
	begin := markerAt + len(marker)

	depth := 0
	quoted := false   // currently inside a double-quoted string
	skipNext := false // previous byte was a backslash inside a string
	for pos := begin; pos < len(raw); pos++ {
		c := raw[pos]
		switch {
		case quoted && skipNext:
			skipNext = false
		case quoted && c == '\\':
			skipNext = true
		case quoted && c == '"':
			quoted = false
		case quoted:
			// Ordinary string content; braces here do not count.
		case c == '"':
			quoted = true
		case c == '{':
			depth++
		case c == '}':
			depth--
			if depth == 0 {
				return raw[begin : pos+1], nil
			}
		}
	}
	return "", fmt.Errorf("unable to locate router json boundary")
}
// extractMarkdownTableRowsForHeading collects the table rows that appear
// under the given heading line of a markdown document.
//
// Capturing starts at the line that equals heading (after trimming) and ends
// at the next line starting with "#" once at least one row has been
// collected. If such a line appears before any row, capturing is switched off
// until the heading is seen again. Each returned row holds the trimmed cell
// texts of one table line. Empty first or last cells are preserved so column
// positions stay stable. Delimiter rows (for example "|---|---|" or
// "| --- | :---: |") are dropped. An error is returned when no row is found.
func extractMarkdownTableRowsForHeading(markdown string, heading string) ([][]string, error) {
	var rows [][]string
	capturing := false
	for _, line := range strings.Split(markdown, "\n") {
		trimmed := strings.TrimSpace(line)
		switch {
		case trimmed == heading:
			capturing = true
		case capturing && strings.HasPrefix(trimmed, "#"):
			// A different heading closes the section.
			if len(rows) > 0 {
				return rows, nil
			}
			capturing = false
		}
		if !capturing || !strings.HasPrefix(trimmed, "|") || strings.Contains(trimmed, "|---") {
			continue
		}

		// Remove only the single outer pipe on each side. Trimming every
		// edge pipe would swallow empty first/last cells and shift columns.
		inner := strings.TrimSuffix(strings.TrimPrefix(trimmed, "|"), "|")
		cells := strings.Split(inner, "|")
		for i := range cells {
			cells[i] = strings.TrimSpace(cells[i])
		}
		if isMarkdownTableDelimiterRow(cells) {
			continue
		}
		rows = append(rows, cells)
	}
	if len(rows) == 0 {
		return nil, fmt.Errorf("missing markdown table for heading %s", heading)
	}
	return rows, nil
}

// isMarkdownTableDelimiterRow reports whether every cell consists solely of
// hyphens with an optional leading and/or trailing alignment colon.
func isMarkdownTableDelimiterRow(cells []string) bool {
	if len(cells) == 0 {
		return false
	}
	for _, cell := range cells {
		dashes := strings.TrimSuffix(strings.TrimPrefix(cell, ":"), ":")
		if dashes == "" || strings.Trim(dashes, "-") != "" {
			return false
		}
	}
	return true
}
// Patterns used by cleanBytedanceArkCell. They are compiled once at package
// initialisation instead of on every call, since the cleaner runs for every
// table cell.
var (
	bytedanceArkHTMLTagPattern    = regexp.MustCompile(`(?is)<[^>]+>`)
	bytedanceArkWhitespacePattern = regexp.MustCompile(`\s+`)
)

// cleanBytedanceArkCell normalises one markdown table cell to plain text.
//
// It trims the cell, decodes HTML entities, turns the markdown escape `\-`
// into "-", removes remaining backslashes, replaces <br> variants and any
// other HTML tag with a space, collapses runs of whitespace into one space and
// trims the result.
func cleanBytedanceArkCell(raw string) string {
	cleaned := html.UnescapeString(strings.TrimSpace(raw))

	// Markdown escapes: keep the hyphen of `\-`, drop every other backslash.
	cleaned = strings.ReplaceAll(cleaned, `\-`, "-")
	cleaned = strings.ReplaceAll(cleaned, `\`, "")

	// Line breaks inside a cell become word separators.
	for _, lineBreak := range []string{"<br><br>", "<br />", "<br/>", "<br>"} {
		cleaned = strings.ReplaceAll(cleaned, lineBreak, " ")
	}

	cleaned = bytedanceArkHTMLTagPattern.ReplaceAllString(cleaned, " ")
	cleaned = bytedanceArkWhitespacePattern.ReplaceAllString(cleaned, " ")
	return strings.TrimSpace(cleaned)
}
// bytedanceArkPriceNumberPattern matches the first unsigned integer or decimal
// number in a price cell. It is compiled once at package initialisation rather
// than on every call.
var bytedanceArkPriceNumberPattern = regexp.MustCompile(`([0-9]+(?:\.[0-9]+)?)`)

// bytedanceArkPriceValue extracts the price from a table cell.
//
// The cell is cleaned first. It returns 0 when the cleaned cell is empty,
// contains "不支持" (not supported), or holds no number. Otherwise the first
// number found is converted with mustParseSubscriptionPrice.
func bytedanceArkPriceValue(raw string) float64 {
	cleaned := cleanBytedanceArkCell(raw)
	if cleaned == "" || strings.Contains(cleaned, "不支持") {
		return 0
	}
	match := bytedanceArkPriceNumberPattern.FindStringSubmatch(cleaned)
	if len(match) != 2 {
		return 0
	}
	return mustParseSubscriptionPrice(match[1])
}
// isBytedanceArkConditionRow reports whether value, after trimming and
// lower-casing, is empty or starts with "输入长度" (input length). The parser
// skips such rows instead of treating them as model names.
func isBytedanceArkConditionRow(value string) bool {
	normalized := strings.ToLower(strings.TrimSpace(value))
	if len(normalized) == 0 {
		return true
	}
	return strings.HasPrefix(normalized, "输入长度")
}
// bytedanceArkProviderName maps a model name to its provider by prefix,
// ignoring case and surrounding whitespace: "deepseek" gives "DeepSeek", "glm"
// gives "Zhipu AI", and everything else is attributed to "ByteDance".
func bytedanceArkProviderName(modelName string) string {
	normalized := strings.ToLower(strings.TrimSpace(modelName))

	thirdParty := []struct {
		prefix   string
		provider string
	}{
		{"deepseek", "DeepSeek"},
		{"glm", "Zhipu AI"},
	}
	for _, entry := range thirdParty {
		if strings.HasPrefix(normalized, entry.prefix) {
			return entry.provider
		}
	}
	return "ByteDance"
}