From 92c9a40f4bc740d9d3884a63ace5d102dbab124d Mon Sep 17 00:00:00 2001 From: phamnazage-jpg Date: Wed, 13 May 2026 22:37:37 +0800 Subject: [PATCH] feat(import): enrich baidu and bytedance release metadata --- scripts/import_bytedance_data.go | 68 ++++++++++++++++++++--- scripts/import_bytedance_data_test.go | 66 ++++++++++++++++++++++ scripts/import_phase2_data.go | 80 +++++++++++++++++++++++---- scripts/import_phase2_data_test.go | 71 ++++++++++++++++++++++++ scripts/import_zhipu_data.go | 6 +- scripts/import_zhipu_data_test.go | 9 +++ 6 files changed, 279 insertions(+), 21 deletions(-) create mode 100644 scripts/import_bytedance_data_test.go create mode 100644 scripts/import_phase2_data_test.go diff --git a/scripts/import_bytedance_data.go b/scripts/import_bytedance_data.go index 3e21838..2d8b117 100644 --- a/scripts/import_bytedance_data.go +++ b/scripts/import_bytedance_data.go @@ -7,6 +7,7 @@ import ( "encoding/json" "log" "os" + "strings" "time" _ "github.com/lib/pq" @@ -26,21 +27,65 @@ type ModelPricing struct { ContextLength int IsFree bool SourceURL string + ModelSourceURL string ReleaseDate string Modality string } -func releaseDateValue(raw string) time.Time { +func releaseDateValue(raw string) any { if raw == "" { - return time.Now() + return nil } parsed, err := time.Parse("2006-01-02", raw) if err != nil { - return time.Now() + return nil } return parsed } +type bytedanceModelMetadata struct { + Prefix string + ReleaseDate string + ModelSourceURL string +} + +var bytedanceModelMetadataRules = []bytedanceModelMetadata{ + { + Prefix: "bytedance-doubao-1.5-thinking", + ReleaseDate: "2025-04-17", + ModelSourceURL: "https://developer.volcengine.com/articles/7496718897794039827", + }, + { + Prefix: "bytedance-doubao-seed-1.6", + ReleaseDate: "2025-06-11", + ModelSourceURL: "https://developer.volcengine.com/articles/7517188354606104612", + }, + { + Prefix: "bytedance-seedance-1.0-lite", + ReleaseDate: "2025-05-13", + ModelSourceURL: "https://developer.volcengine.com/articles/7504284064976502823", + }, +} + +func enrichBytedanceModelMetadata(model ModelPricing) ModelPricing { + normalizedID := strings.ToLower(model.ModelID) + for _, metadata := range bytedanceModelMetadataRules { + if strings.HasPrefix(normalizedID, metadata.Prefix) { + if metadata.ReleaseDate != "" { + model.ReleaseDate = metadata.ReleaseDate + } + if metadata.ModelSourceURL != "" { + model.ModelSourceURL = metadata.ModelSourceURL + } + return model + } + } + if model.ModelSourceURL == "" { + model.ModelSourceURL = model.SourceURL + } + return model +} + func main() { dsn := os.Getenv("DATABASE_URL") if dsn == "" { @@ -79,7 +124,7 @@ func main() { batchID := "manual-seed" for _, b := range raw.Bytedance { - p := ModelPricing{ + p := enrichBytedanceModelMetadata(ModelPricing{ ModelID: "bytedance-" + b.Model, ModelName: b.Model, ProviderName: "ByteDance", @@ -94,7 +139,7 @@ func main() { IsFree: b.InputPrice == 0, SourceURL: "https://www.volcengine.com/docs/82379/1099320", Modality: "text", - } + }) // Find or create provider var providerID int64 @@ -131,7 +176,7 @@ func main() { err = db.QueryRow( `INSERT INTO models (external_id, name, provider_id, modality, context_length, status, source, batch_id, source_url, release_date) VALUES ($1, $2, $3, $4, $5, 'active', $6, $7, $8, $9) RETURNING id`, - p.ModelID, p.ModelName, providerID, p.Modality, p.ContextLength, p.OperatorName, batchID, p.SourceURL, releaseDateValue(p.ReleaseDate), + p.ModelID, p.ModelName, providerID, p.Modality, p.ContextLength, p.OperatorName, batchID, firstNonEmpty(p.ModelSourceURL, p.SourceURL), releaseDateValue(p.ReleaseDate), ).Scan(&modelID) } if err != nil { @@ -144,7 +189,7 @@ func main() { release_date = COALESCE(release_date, $3), updated_at = CURRENT_TIMESTAMP WHERE id = $1`, - modelID, p.SourceURL, releaseDateValue(p.ReleaseDate), + modelID, firstNonEmpty(p.ModelSourceURL, p.SourceURL), releaseDateValue(p.ReleaseDate), ) // Insert pricing @@ -181,3 +226,12 @@ func main() { log.Printf("Successfully imported %d ByteDance models", len(raw.Bytedance)) } + +func firstNonEmpty(values ...string) string { + for _, value := range values { + if value != "" { + return value + } + } + return "" +} diff --git a/scripts/import_bytedance_data_test.go b/scripts/import_bytedance_data_test.go new file mode 100644 index 0000000..7018dd4 --- /dev/null +++ b/scripts/import_bytedance_data_test.go @@ -0,0 +1,66 @@ +//go:build llm_script + +package main + +import "testing" + +func TestEnrichBytedanceModelMetadataUsesSpecificFamilyRules(t *testing.T) { + cases := []struct { + modelID string + wantReleaseDate string + wantSourceURL string + }{ + { + modelID: "bytedance-doubao-seed-1.6-thinking", + wantReleaseDate: "2025-06-11", + wantSourceURL: "https://developer.volcengine.com/articles/7517188354606104612", + }, + { + modelID: "bytedance-doubao-1.5-thinking-pro", + wantReleaseDate: "2025-04-17", + wantSourceURL: "https://developer.volcengine.com/articles/7496718897794039827", + }, + { + modelID: "bytedance-seedance-1.0-lite", + wantReleaseDate: "2025-05-13", + wantSourceURL: "https://developer.volcengine.com/articles/7504284064976502823", + }, + } + + for _, tc := range cases { + enriched := enrichBytedanceModelMetadata(ModelPricing{ + ModelID: tc.modelID, + SourceURL: "https://www.volcengine.com/docs/82379/1099320", + }) + + if enriched.ReleaseDate != tc.wantReleaseDate { + t.Fatalf("%s release date = %q, want %q", tc.modelID, enriched.ReleaseDate, tc.wantReleaseDate) + } + if enriched.ModelSourceURL != tc.wantSourceURL { + t.Fatalf("%s source url = %q, want %q", tc.modelID, enriched.ModelSourceURL, tc.wantSourceURL) + } + } +} + +func TestEnrichBytedanceModelMetadataFallsBackToPricingSource(t *testing.T) { + enriched := enrichBytedanceModelMetadata(ModelPricing{ + ModelID: "bytedance-deepseek-r1", + SourceURL: "https://www.volcengine.com/docs/82379/1099320", + }) + + if enriched.ReleaseDate != "" { + t.Fatalf("unexpected release date: %q", enriched.ReleaseDate) + } + if enriched.ModelSourceURL != "https://www.volcengine.com/docs/82379/1099320" { + t.Fatalf("model source url = %q, want pricing source fallback", enriched.ModelSourceURL) + } +} + +func TestBytedanceReleaseDateValueReturnsNilForUnknownDate(t *testing.T) { + if value := releaseDateValue(""); value != nil { + t.Fatalf("blank release date should stay nil, got %v", value) + } + if value := releaseDateValue("invalid"); value != nil { + t.Fatalf("invalid release date should stay nil, got %v", value) + } +} diff --git a/scripts/import_phase2_data.go b/scripts/import_phase2_data.go index 5622324..6807793 100644 --- a/scripts/import_phase2_data.go +++ b/scripts/import_phase2_data.go @@ -49,22 +49,71 @@ type ModelPricing struct { ContextLength int IsFree bool SourceURL string + ModelSourceURL string ReleaseDate string Modality string SceneTags []string } -func releaseDateValue(raw string) time.Time { +func releaseDateValue(raw string) any { if strings.TrimSpace(raw) == "" { - return time.Now() + return nil } parsed, err := time.Parse("2006-01-02", raw) if err != nil { - return time.Now() + return nil } return parsed } +type baiduModelMetadata struct { + Prefix string + ReleaseDate string + ModelSourceURL string +} + +var baiduModelMetadataRules = []baiduModelMetadata{ + { + Prefix: "baidu-ernie-4.5-turbo", + ReleaseDate: "2025-04-25", + ModelSourceURL: "https://cloud.baidu.com/article/3887765", + }, + { + Prefix: "baidu-ernie-x1-turbo", + ReleaseDate: "2025-04-25", + ModelSourceURL: "https://cloud.baidu.com/article/3887765", + }, + { + Prefix: "baidu-ernie-4.5", + ReleaseDate: "2025-03-16", + ModelSourceURL: "https://cloud.baidu.com/article/3835921", + }, + { + Prefix: "baidu-ernie-x1", + ReleaseDate: "2025-03-16", + ModelSourceURL: "https://cloud.baidu.com/article/3835921", + }, +} + +func enrichBaiduModelMetadata(model ModelPricing) ModelPricing { + normalizedID := strings.ToLower(model.ModelID) + for _, metadata := range baiduModelMetadataRules { + if strings.HasPrefix(normalizedID, metadata.Prefix) { + if metadata.ReleaseDate != "" { + model.ReleaseDate = metadata.ReleaseDate + } + if metadata.ModelSourceURL != "" { + model.ModelSourceURL = metadata.ModelSourceURL + } + return model + } + } + if model.ModelSourceURL == "" { + model.ModelSourceURL = model.SourceURL + } + return model +} + func parseZhipuPrice(s string) float64 { // Extract price from strings like "6元", "免费", "限时免费" if strings.Contains(s, "免费") { @@ -148,7 +197,7 @@ func main() { } for model, pricesMap := range modelPrices { - prices = append(prices, ModelPricing{ + prices = append(prices, enrichBaiduModelMetadata(ModelPricing{ ModelID: "baidu-" + strings.ToLower(strings.ReplaceAll(model, " ", "-")), ModelName: model, ProviderName: "Baidu", @@ -162,7 +211,7 @@ func main() { IsFree: pricesMap["input"] == 0 && pricesMap["output"] == 0, SourceURL: "https://cloud.baidu.com/doc/qianfan/s/wmh4sv6ya", Modality: "text", - }) + })) } log.Printf("Parsed %d unique models from Baidu", len(prices)) @@ -201,11 +250,11 @@ func main() { var modelID int64 err = db.QueryRow("SELECT id FROM models WHERE external_id = $1", p.ModelID).Scan(&modelID) if err == sql.ErrNoRows { - err = db.QueryRow( - `INSERT INTO models (external_id, name, provider_id, modality, context_length, status, source, batch_id, source_url, release_date) - VALUES ($1, $2, $3, $4, $5, 'active', $6, $7, $8, $9) RETURNING id`, - p.ModelID, p.ModelName, providerID, p.Modality, p.ContextLength, p.OperatorName, batchID, p.SourceURL, releaseDateValue(p.ReleaseDate), - ).Scan(&modelID) + err = db.QueryRow( + `INSERT INTO models (external_id, name, provider_id, modality, context_length, status, source, batch_id, source_url, release_date) + VALUES ($1, $2, $3, $4, $5, 'active', $6, $7, $8, $9) RETURNING id`, + p.ModelID, p.ModelName, providerID, p.Modality, p.ContextLength, p.OperatorName, batchID, firstNonEmpty(p.ModelSourceURL, p.SourceURL), releaseDateValue(p.ReleaseDate), + ).Scan(&modelID) } if err != nil { log.Printf("Model error: %v", err) @@ -217,7 +266,7 @@ func main() { release_date = COALESCE(release_date, $3), updated_at = CURRENT_TIMESTAMP WHERE id = $1`, - modelID, p.SourceURL, releaseDateValue(p.ReleaseDate), + modelID, firstNonEmpty(p.ModelSourceURL, p.SourceURL), releaseDateValue(p.ReleaseDate), ) // Insert pricing @@ -254,3 +303,12 @@ func main() { log.Printf("Successfully imported %d models into database", len(prices)) } + +func firstNonEmpty(values ...string) string { + for _, value := range values { + if value != "" { + return value + } + } + return "" +} diff --git a/scripts/import_phase2_data_test.go b/scripts/import_phase2_data_test.go new file mode 100644 index 0000000..2c828ab --- /dev/null +++ b/scripts/import_phase2_data_test.go @@ -0,0 +1,71 @@ +//go:build llm_script + +package main + +import "testing" + +func TestEnrichBaiduModelMetadataUsesSpecificFamilyRules(t *testing.T) { + cases := []struct { + modelID string + wantReleaseDate string + wantSourceURL string + }{ + { + modelID: "baidu-ernie-4.5-turbo-128k", + wantReleaseDate: "2025-04-25", + wantSourceURL: "https://cloud.baidu.com/article/3887765", + }, + { + modelID: "baidu-ernie-x1-turbo-32k", + wantReleaseDate: "2025-04-25", + wantSourceURL: "https://cloud.baidu.com/article/3887765", + }, + { + modelID: "baidu-ernie-4.5-8k", + wantReleaseDate: "2025-03-16", + wantSourceURL: "https://cloud.baidu.com/article/3835921", + }, + { + modelID: "baidu-ernie-x1-8k", + wantReleaseDate: "2025-03-16", + wantSourceURL: "https://cloud.baidu.com/article/3835921", + }, + } + + for _, tc := range cases { + enriched := enrichBaiduModelMetadata(ModelPricing{ + ModelID: tc.modelID, + SourceURL: "https://cloud.baidu.com/doc/qianfan/s/wmh4sv6ya", + }) + + if enriched.ReleaseDate != tc.wantReleaseDate { + t.Fatalf("%s release date = %q, want %q", tc.modelID, enriched.ReleaseDate, tc.wantReleaseDate) + } + if enriched.ModelSourceURL != tc.wantSourceURL { + t.Fatalf("%s source url = %q, want %q", tc.modelID, enriched.ModelSourceURL, tc.wantSourceURL) + } + } +} + +func TestEnrichBaiduModelMetadataFallsBackToPricingSource(t *testing.T) { + enriched := enrichBaiduModelMetadata(ModelPricing{ + ModelID: "baidu-deepseek-r1", + SourceURL: "https://cloud.baidu.com/doc/qianfan/s/wmh4sv6ya", + }) + + if enriched.ReleaseDate != "" { + t.Fatalf("unexpected release date: %q", enriched.ReleaseDate) + } + if enriched.ModelSourceURL != "https://cloud.baidu.com/doc/qianfan/s/wmh4sv6ya" { + t.Fatalf("model source url = %q, want pricing source fallback", enriched.ModelSourceURL) + } +} + +func TestBaiduReleaseDateValueReturnsNilForUnknownDate(t *testing.T) { + if value := releaseDateValue(""); value != nil { + t.Fatalf("blank release date should stay nil, got %v", value) + } + if value := releaseDateValue("invalid"); value != nil { + t.Fatalf("invalid release date should stay nil, got %v", value) + } +} diff --git a/scripts/import_zhipu_data.go b/scripts/import_zhipu_data.go index 042e55b..23dd875 100644 --- a/scripts/import_zhipu_data.go +++ b/scripts/import_zhipu_data.go @@ -32,13 +32,13 @@ type ModelPricing struct { SceneTags []string } -func releaseDateValue(raw string) time.Time { +func releaseDateValue(raw string) any { if raw == "" { - return time.Now() + return nil } parsed, err := time.Parse("2006-01-02", raw) if err != nil { - return time.Now() + return nil } return parsed } diff --git a/scripts/import_zhipu_data_test.go b/scripts/import_zhipu_data_test.go index cccb6b0..94f548a 100644 --- a/scripts/import_zhipu_data_test.go +++ b/scripts/import_zhipu_data_test.go @@ -70,3 +70,12 @@ func TestEnrichZhipuModelMetadataFallsBackToPricingSource(t *testing.T) { t.Fatalf("model source url = %q, want pricing source fallback", enriched.ModelSourceURL) } } + +func TestZhipuReleaseDateValueReturnsNilForUnknownDate(t *testing.T) { + if value := releaseDateValue(""); value != nil { + t.Fatalf("blank release date should stay nil, got %v", value) + } + if value := releaseDateValue("invalid"); value != nil { + t.Fatalf("invalid release date should stay nil, got %v", value) + } +}