feat(import): enrich baidu and bytedance release metadata
This commit is contained in:
@@ -7,6 +7,7 @@ import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
_ "github.com/lib/pq"
|
||||
@@ -26,21 +27,65 @@ type ModelPricing struct {
|
||||
ContextLength int
|
||||
IsFree bool
|
||||
SourceURL string
|
||||
ModelSourceURL string
|
||||
ReleaseDate string
|
||||
Modality string
|
||||
}
|
||||
|
||||
// releaseDateValue converts a "YYYY-MM-DD" release date string into a value
// that can be passed as a SQL query argument.
//
// It returns nil when raw is blank or is not a valid date, so an unknown
// release date is never replaced by a made-up one. Otherwise it returns the
// parsed time.Time. Leading and trailing whitespace around the date is
// ignored, so a padded but otherwise valid date is still accepted.
func releaseDateValue(raw string) any {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return nil
	}
	parsed, err := time.Parse("2006-01-02", trimmed)
	if err != nil {
		// Unparseable input is treated the same as a missing date.
		return nil
	}
	return parsed
}
|
||||
|
||||
// bytedanceModelMetadata associates a model-ID prefix with the release
// metadata recorded for the model family that prefix identifies.
type bytedanceModelMetadata struct {
	// Prefix is matched against the lower-cased model ID.
	Prefix string
	// ReleaseDate is the family's release date in YYYY-MM-DD form.
	ReleaseDate string
	// ModelSourceURL is the page used as the model's source when a rule matches.
	ModelSourceURL string
}

// bytedanceModelMetadataRules lists the known ByteDance model families.
// enrichBytedanceModelMetadata applies the first entry whose Prefix matches.
var bytedanceModelMetadataRules = []bytedanceModelMetadata{
	{Prefix: "bytedance-doubao-1.5-thinking", ReleaseDate: "2025-04-17", ModelSourceURL: "https://developer.volcengine.com/articles/7496718897794039827"},
	{Prefix: "bytedance-doubao-seed-1.6", ReleaseDate: "2025-06-11", ModelSourceURL: "https://developer.volcengine.com/articles/7517188354606104612"},
	{Prefix: "bytedance-seedance-1.0-lite", ReleaseDate: "2025-05-13", ModelSourceURL: "https://developer.volcengine.com/articles/7504284064976502823"},
}
|
||||
|
||||
func enrichBytedanceModelMetadata(model ModelPricing) ModelPricing {
|
||||
normalizedID := strings.ToLower(model.ModelID)
|
||||
for _, metadata := range bytedanceModelMetadataRules {
|
||||
if strings.HasPrefix(normalizedID, metadata.Prefix) {
|
||||
if metadata.ReleaseDate != "" {
|
||||
model.ReleaseDate = metadata.ReleaseDate
|
||||
}
|
||||
if metadata.ModelSourceURL != "" {
|
||||
model.ModelSourceURL = metadata.ModelSourceURL
|
||||
}
|
||||
return model
|
||||
}
|
||||
}
|
||||
if model.ModelSourceURL == "" {
|
||||
model.ModelSourceURL = model.SourceURL
|
||||
}
|
||||
return model
|
||||
}
|
||||
|
||||
func main() {
|
||||
dsn := os.Getenv("DATABASE_URL")
|
||||
if dsn == "" {
|
||||
@@ -79,7 +124,7 @@ func main() {
|
||||
batchID := "manual-seed"
|
||||
|
||||
for _, b := range raw.Bytedance {
|
||||
p := ModelPricing{
|
||||
p := enrichBytedanceModelMetadata(ModelPricing{
|
||||
ModelID: "bytedance-" + b.Model,
|
||||
ModelName: b.Model,
|
||||
ProviderName: "ByteDance",
|
||||
@@ -94,7 +139,7 @@ func main() {
|
||||
IsFree: b.InputPrice == 0,
|
||||
SourceURL: "https://www.volcengine.com/docs/82379/1099320",
|
||||
Modality: "text",
|
||||
}
|
||||
})
|
||||
|
||||
// Find or create provider
|
||||
var providerID int64
|
||||
@@ -131,7 +176,7 @@ func main() {
|
||||
err = db.QueryRow(
|
||||
`INSERT INTO models (external_id, name, provider_id, modality, context_length, status, source, batch_id, source_url, release_date)
|
||||
VALUES ($1, $2, $3, $4, $5, 'active', $6, $7, $8, $9) RETURNING id`,
|
||||
p.ModelID, p.ModelName, providerID, p.Modality, p.ContextLength, p.OperatorName, batchID, p.SourceURL, releaseDateValue(p.ReleaseDate),
|
||||
p.ModelID, p.ModelName, providerID, p.Modality, p.ContextLength, p.OperatorName, batchID, firstNonEmpty(p.ModelSourceURL, p.SourceURL), releaseDateValue(p.ReleaseDate),
|
||||
).Scan(&modelID)
|
||||
}
|
||||
if err != nil {
|
||||
@@ -144,7 +189,7 @@ func main() {
|
||||
release_date = COALESCE(release_date, $3),
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = $1`,
|
||||
modelID, p.SourceURL, releaseDateValue(p.ReleaseDate),
|
||||
modelID, firstNonEmpty(p.ModelSourceURL, p.SourceURL), releaseDateValue(p.ReleaseDate),
|
||||
)
|
||||
|
||||
// Insert pricing
|
||||
@@ -181,3 +226,12 @@ func main() {
|
||||
|
||||
log.Printf("Successfully imported %d ByteDance models", len(raw.Bytedance))
|
||||
}
|
||||
|
||||
// firstNonEmpty returns the first argument that is not the empty string.
// It returns "" when every argument is empty or when no arguments are given.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if candidate := values[i]; candidate != "" {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
66
scripts/import_bytedance_data_test.go
Normal file
66
scripts/import_bytedance_data_test.go
Normal file
@@ -0,0 +1,66 @@
|
||||
//go:build llm_script
|
||||
|
||||
package main
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestEnrichBytedanceModelMetadataUsesSpecificFamilyRules(t *testing.T) {
|
||||
cases := []struct {
|
||||
modelID string
|
||||
wantReleaseDate string
|
||||
wantSourceURL string
|
||||
}{
|
||||
{
|
||||
modelID: "bytedance-doubao-seed-1.6-thinking",
|
||||
wantReleaseDate: "2025-06-11",
|
||||
wantSourceURL: "https://developer.volcengine.com/articles/7517188354606104612",
|
||||
},
|
||||
{
|
||||
modelID: "bytedance-doubao-1.5-thinking-pro",
|
||||
wantReleaseDate: "2025-04-17",
|
||||
wantSourceURL: "https://developer.volcengine.com/articles/7496718897794039827",
|
||||
},
|
||||
{
|
||||
modelID: "bytedance-seedance-1.0-lite",
|
||||
wantReleaseDate: "2025-05-13",
|
||||
wantSourceURL: "https://developer.volcengine.com/articles/7504284064976502823",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
enriched := enrichBytedanceModelMetadata(ModelPricing{
|
||||
ModelID: tc.modelID,
|
||||
SourceURL: "https://www.volcengine.com/docs/82379/1099320",
|
||||
})
|
||||
|
||||
if enriched.ReleaseDate != tc.wantReleaseDate {
|
||||
t.Fatalf("%s release date = %q, want %q", tc.modelID, enriched.ReleaseDate, tc.wantReleaseDate)
|
||||
}
|
||||
if enriched.ModelSourceURL != tc.wantSourceURL {
|
||||
t.Fatalf("%s source url = %q, want %q", tc.modelID, enriched.ModelSourceURL, tc.wantSourceURL)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichBytedanceModelMetadataFallsBackToPricingSource(t *testing.T) {
|
||||
enriched := enrichBytedanceModelMetadata(ModelPricing{
|
||||
ModelID: "bytedance-deepseek-r1",
|
||||
SourceURL: "https://www.volcengine.com/docs/82379/1099320",
|
||||
})
|
||||
|
||||
if enriched.ReleaseDate != "" {
|
||||
t.Fatalf("unexpected release date: %q", enriched.ReleaseDate)
|
||||
}
|
||||
if enriched.ModelSourceURL != "https://www.volcengine.com/docs/82379/1099320" {
|
||||
t.Fatalf("model source url = %q, want pricing source fallback", enriched.ModelSourceURL)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBytedanceReleaseDateValueReturnsNilForUnknownDate(t *testing.T) {
|
||||
if value := releaseDateValue(""); value != nil {
|
||||
t.Fatalf("blank release date should stay nil, got %v", value)
|
||||
}
|
||||
if value := releaseDateValue("invalid"); value != nil {
|
||||
t.Fatalf("invalid release date should stay nil, got %v", value)
|
||||
}
|
||||
}
|
||||
@@ -49,22 +49,71 @@ type ModelPricing struct {
|
||||
ContextLength int
|
||||
IsFree bool
|
||||
SourceURL string
|
||||
ModelSourceURL string
|
||||
ReleaseDate string
|
||||
Modality string
|
||||
SceneTags []string
|
||||
}
|
||||
|
||||
// releaseDateValue converts a "YYYY-MM-DD" release date string into a value
// that can be passed as a SQL query argument.
//
// It returns nil when raw is blank or is not a valid date, so an unknown
// release date is never replaced by a made-up one. Otherwise it returns the
// parsed time.Time. The same trimmed string is used for both the blank check
// and the parse, so a padded but otherwise valid date is accepted.
func releaseDateValue(raw string) any {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return nil
	}
	parsed, err := time.Parse("2006-01-02", trimmed)
	if err != nil {
		// Unparseable input is treated the same as a missing date.
		return nil
	}
	return parsed
}
|
||||
|
||||
// baiduModelMetadata associates a model-ID prefix with the release metadata
// recorded for the model family that prefix identifies.
type baiduModelMetadata struct {
	// Prefix is matched against the lower-cased model ID.
	Prefix string
	// ReleaseDate is the family's release date in YYYY-MM-DD form.
	ReleaseDate string
	// ModelSourceURL is the page used as the model's source when a rule matches.
	ModelSourceURL string
}

// baiduModelMetadataRules lists the known Baidu model families.
//
// Order matters: enrichBaiduModelMetadata stops at the first matching prefix,
// so each "-turbo" prefix must stay ahead of the shorter family prefix it
// extends ("baidu-ernie-4.5", "baidu-ernie-x1").
var baiduModelMetadataRules = []baiduModelMetadata{
	{Prefix: "baidu-ernie-4.5-turbo", ReleaseDate: "2025-04-25", ModelSourceURL: "https://cloud.baidu.com/article/3887765"},
	{Prefix: "baidu-ernie-x1-turbo", ReleaseDate: "2025-04-25", ModelSourceURL: "https://cloud.baidu.com/article/3887765"},
	{Prefix: "baidu-ernie-4.5", ReleaseDate: "2025-03-16", ModelSourceURL: "https://cloud.baidu.com/article/3835921"},
	{Prefix: "baidu-ernie-x1", ReleaseDate: "2025-03-16", ModelSourceURL: "https://cloud.baidu.com/article/3835921"},
}
|
||||
|
||||
func enrichBaiduModelMetadata(model ModelPricing) ModelPricing {
|
||||
normalizedID := strings.ToLower(model.ModelID)
|
||||
for _, metadata := range baiduModelMetadataRules {
|
||||
if strings.HasPrefix(normalizedID, metadata.Prefix) {
|
||||
if metadata.ReleaseDate != "" {
|
||||
model.ReleaseDate = metadata.ReleaseDate
|
||||
}
|
||||
if metadata.ModelSourceURL != "" {
|
||||
model.ModelSourceURL = metadata.ModelSourceURL
|
||||
}
|
||||
return model
|
||||
}
|
||||
}
|
||||
if model.ModelSourceURL == "" {
|
||||
model.ModelSourceURL = model.SourceURL
|
||||
}
|
||||
return model
|
||||
}
|
||||
|
||||
func parseZhipuPrice(s string) float64 {
|
||||
// Extract price from strings like "6元", "免费", "限时免费"
|
||||
if strings.Contains(s, "免费") {
|
||||
@@ -148,7 +197,7 @@ func main() {
|
||||
}
|
||||
|
||||
for model, pricesMap := range modelPrices {
|
||||
prices = append(prices, ModelPricing{
|
||||
prices = append(prices, enrichBaiduModelMetadata(ModelPricing{
|
||||
ModelID: "baidu-" + strings.ToLower(strings.ReplaceAll(model, " ", "-")),
|
||||
ModelName: model,
|
||||
ProviderName: "Baidu",
|
||||
@@ -162,7 +211,7 @@ func main() {
|
||||
IsFree: pricesMap["input"] == 0 && pricesMap["output"] == 0,
|
||||
SourceURL: "https://cloud.baidu.com/doc/qianfan/s/wmh4sv6ya",
|
||||
Modality: "text",
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
log.Printf("Parsed %d unique models from Baidu", len(prices))
|
||||
@@ -204,7 +253,7 @@ func main() {
|
||||
err = db.QueryRow(
|
||||
`INSERT INTO models (external_id, name, provider_id, modality, context_length, status, source, batch_id, source_url, release_date)
|
||||
VALUES ($1, $2, $3, $4, $5, 'active', $6, $7, $8, $9) RETURNING id`,
|
||||
p.ModelID, p.ModelName, providerID, p.Modality, p.ContextLength, p.OperatorName, batchID, p.SourceURL, releaseDateValue(p.ReleaseDate),
|
||||
p.ModelID, p.ModelName, providerID, p.Modality, p.ContextLength, p.OperatorName, batchID, firstNonEmpty(p.ModelSourceURL, p.SourceURL), releaseDateValue(p.ReleaseDate),
|
||||
).Scan(&modelID)
|
||||
}
|
||||
if err != nil {
|
||||
@@ -217,7 +266,7 @@ func main() {
|
||||
release_date = COALESCE(release_date, $3),
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = $1`,
|
||||
modelID, p.SourceURL, releaseDateValue(p.ReleaseDate),
|
||||
modelID, firstNonEmpty(p.ModelSourceURL, p.SourceURL), releaseDateValue(p.ReleaseDate),
|
||||
)
|
||||
|
||||
// Insert pricing
|
||||
@@ -254,3 +303,12 @@ func main() {
|
||||
|
||||
log.Printf("Successfully imported %d models into database", len(prices))
|
||||
}
|
||||
|
||||
// firstNonEmpty scans its arguments in order and returns the first one that
// is not the empty string; the result is "" if there is no such argument.
func firstNonEmpty(values ...string) string {
	result := ""
	for _, v := range values {
		if v == "" {
			continue
		}
		result = v
		break
	}
	return result
}
|
||||
|
||||
71
scripts/import_phase2_data_test.go
Normal file
71
scripts/import_phase2_data_test.go
Normal file
@@ -0,0 +1,71 @@
|
||||
//go:build llm_script
|
||||
|
||||
package main
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestEnrichBaiduModelMetadataUsesSpecificFamilyRules(t *testing.T) {
|
||||
cases := []struct {
|
||||
modelID string
|
||||
wantReleaseDate string
|
||||
wantSourceURL string
|
||||
}{
|
||||
{
|
||||
modelID: "baidu-ernie-4.5-turbo-128k",
|
||||
wantReleaseDate: "2025-04-25",
|
||||
wantSourceURL: "https://cloud.baidu.com/article/3887765",
|
||||
},
|
||||
{
|
||||
modelID: "baidu-ernie-x1-turbo-32k",
|
||||
wantReleaseDate: "2025-04-25",
|
||||
wantSourceURL: "https://cloud.baidu.com/article/3887765",
|
||||
},
|
||||
{
|
||||
modelID: "baidu-ernie-4.5-8k",
|
||||
wantReleaseDate: "2025-03-16",
|
||||
wantSourceURL: "https://cloud.baidu.com/article/3835921",
|
||||
},
|
||||
{
|
||||
modelID: "baidu-ernie-x1-8k",
|
||||
wantReleaseDate: "2025-03-16",
|
||||
wantSourceURL: "https://cloud.baidu.com/article/3835921",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
enriched := enrichBaiduModelMetadata(ModelPricing{
|
||||
ModelID: tc.modelID,
|
||||
SourceURL: "https://cloud.baidu.com/doc/qianfan/s/wmh4sv6ya",
|
||||
})
|
||||
|
||||
if enriched.ReleaseDate != tc.wantReleaseDate {
|
||||
t.Fatalf("%s release date = %q, want %q", tc.modelID, enriched.ReleaseDate, tc.wantReleaseDate)
|
||||
}
|
||||
if enriched.ModelSourceURL != tc.wantSourceURL {
|
||||
t.Fatalf("%s source url = %q, want %q", tc.modelID, enriched.ModelSourceURL, tc.wantSourceURL)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichBaiduModelMetadataFallsBackToPricingSource(t *testing.T) {
|
||||
enriched := enrichBaiduModelMetadata(ModelPricing{
|
||||
ModelID: "baidu-deepseek-r1",
|
||||
SourceURL: "https://cloud.baidu.com/doc/qianfan/s/wmh4sv6ya",
|
||||
})
|
||||
|
||||
if enriched.ReleaseDate != "" {
|
||||
t.Fatalf("unexpected release date: %q", enriched.ReleaseDate)
|
||||
}
|
||||
if enriched.ModelSourceURL != "https://cloud.baidu.com/doc/qianfan/s/wmh4sv6ya" {
|
||||
t.Fatalf("model source url = %q, want pricing source fallback", enriched.ModelSourceURL)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBaiduReleaseDateValueReturnsNilForUnknownDate(t *testing.T) {
|
||||
if value := releaseDateValue(""); value != nil {
|
||||
t.Fatalf("blank release date should stay nil, got %v", value)
|
||||
}
|
||||
if value := releaseDateValue("invalid"); value != nil {
|
||||
t.Fatalf("invalid release date should stay nil, got %v", value)
|
||||
}
|
||||
}
|
||||
@@ -32,13 +32,13 @@ type ModelPricing struct {
|
||||
SceneTags []string
|
||||
}
|
||||
|
||||
// releaseDateValue converts a "YYYY-MM-DD" release date string into a value
// that can be passed as a SQL query argument.
//
// It returns nil when raw is empty or is not a valid date, so an unknown
// release date is never replaced by a made-up one. Otherwise it returns the
// parsed time.Time.
func releaseDateValue(raw string) any {
	if raw == "" {
		return nil
	}
	parsed, err := time.Parse("2006-01-02", raw)
	if err != nil {
		// Unparseable input is treated the same as a missing date.
		return nil
	}
	return parsed
}
|
||||
|
||||
@@ -70,3 +70,12 @@ func TestEnrichZhipuModelMetadataFallsBackToPricingSource(t *testing.T) {
|
||||
t.Fatalf("model source url = %q, want pricing source fallback", enriched.ModelSourceURL)
|
||||
}
|
||||
}
|
||||
|
||||
func TestZhipuReleaseDateValueReturnsNilForUnknownDate(t *testing.T) {
|
||||
if value := releaseDateValue(""); value != nil {
|
||||
t.Fatalf("blank release date should stay nil, got %v", value)
|
||||
}
|
||||
if value := releaseDateValue("invalid"); value != nil {
|
||||
t.Fatalf("invalid release date should stay nil, got %v", value)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user