Files
llm-intelligence/scripts/deepseek_news_snapshot_lib.go
phamnazage-jpg 88833fac8b
Some checks failed
CI / go-test (push) Has been cancelled
CI / scripts-regression (push) Has been cancelled
CI / frontend-build (push) Has been cancelled
CI / docker-build (push) Has been cancelled
feat(intraday): monitor DeepSeek official page drift
2026-05-27 22:01:20 +08:00

197 lines
6.6 KiB
Go

//go:build llm_script
package main
import (
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
"time"
)
// deepseekNewsStructureSignature is the JSON-serializable fingerprint of one
// fetched page. The content fields are filled by
// buildDeepSeekNewsStructureSignature; the omitempty fields are filled by
// writeDeepSeekNewsSnapshotArtifacts when the signature is persisted.
type deepseekNewsStructureSignature struct {
// ByteSize is the length of the raw page in bytes.
ByteSize int `json:"byte_size"`
// SHA256 is the hex-encoded SHA-256 of the raw page.
SHA256 string `json:"sha256"`
// StructureSHA256 is the hex-encoded SHA-256 of the digest payload built
// from Title, MetaDescription, Headings and Contains.
StructureSHA256 string `json:"structure_sha256"`
// Title is the cleaned text of the first <title> element.
Title string `json:"title"`
// MetaDescription is the cleaned content of the first description <meta> tag.
MetaDescription string `json:"meta_description"`
// Headings holds the unique, non-empty <h1> texts in document order.
Headings []string `json:"headings"`
// Contains records, per key of deepseekNewsContainsNeedles, whether the
// needle occurs in the lowercased raw page.
Contains map[string]bool `json:"contains"`
// GeneratedAt is the snapshot time formatted as RFC 3339.
GeneratedAt string `json:"generated_at,omitempty"`
// SourceURL is the URL passed in by the caller when the snapshot was written.
SourceURL string `json:"source_url,omitempty"`
// SnapshotPath is the path the raw page was written to.
SnapshotPath string `json:"snapshot_path,omitempty"`
}
// deepseekNewsContainsNeedles maps the key reported in a signature's Contains
// field to the substring that is searched for in the lowercased raw page.
// Needles are lowercased again at lookup time, so matching is
// case-insensitive.
var deepseekNewsContainsNeedles = map[string]string{
"deepseek": "deepseek",
"release": "release",
"news": "news",
"api_docs": "api docs",
}
// Patterns used to pull structural fields out of raw HTML. They are compiled
// once at package scope and are case-insensitive where tag names are matched.
var (
	// htmlTagRe matches any single tag, including tags that span lines.
	htmlTagRe = regexp.MustCompile(`(?s)<[^>]+>`)

	// titleRe captures the inner content of a <title> element (non-greedy).
	titleRe = regexp.MustCompile(`(?is)<title[^>]*>(.*?)</title>`)

	// metaDescRe captures the content attribute of a description <meta> tag.
	// It only matches when name= appears before content=, and the captured
	// value stops at the first quote character of either kind.
	metaDescRe = regexp.MustCompile(`(?is)<meta[^>]+name=["']description["'][^>]+content=["']([^"']+)["']`)

	// h1Re captures the inner content of an <h1> element (non-greedy).
	h1Re = regexp.MustCompile(`(?is)<h1[^>]*>(.*?)</h1>`)
)
// buildDeepSeekNewsStructureSignature derives a structural fingerprint from
// the raw HTML of a fetched page.
//
// The result records the byte size and SHA-256 of raw, the cleaned <title> and
// meta description, the unique non-empty <h1> texts in document order, and a
// presence flag for every entry of deepseekNewsContainsNeedles (matched
// case-insensitively against the whole raw input). StructureSHA256 is the hash
// of the digest payload built from those extracted fields. GeneratedAt,
// SourceURL and SnapshotPath are left empty.
func buildDeepSeekNewsStructureSignature(raw string) deepseekNewsStructureSignature {
	// Gather <h1> texts, skipping blanks and repeats while keeping first-seen order.
	h1Groups := h1Re.FindAllStringSubmatch(raw, -1)
	uniqueHeadings := make([]string, 0, len(h1Groups))
	alreadySeen := make(map[string]struct{}, len(h1Groups))
	for _, groups := range h1Groups {
		if len(groups) < 2 {
			continue
		}
		text := cleanHTMLText(groups[1])
		_, dup := alreadySeen[text]
		if text == "" || dup {
			continue
		}
		alreadySeen[text] = struct{}{}
		uniqueHeadings = append(uniqueHeadings, text)
	}

	// Probe the lowercased page once per needle.
	lowered := strings.ToLower(raw)
	presence := make(map[string]bool, len(deepseekNewsContainsNeedles))
	for name, needle := range deepseekNewsContainsNeedles {
		presence[name] = strings.Contains(lowered, strings.ToLower(needle))
	}

	sig := deepseekNewsStructureSignature{
		ByteSize:        len(raw),
		SHA256:          deepseekNewsSHA256Hex(raw),
		Title:           firstHTMLMatch(titleRe, raw),
		MetaDescription: firstHTMLMatch(metaDescRe, raw),
		Headings:        uniqueHeadings,
		Contains:        presence,
	}
	// The structure hash depends only on the extracted fields set above.
	sig.StructureSHA256 = deepseekNewsSHA256Hex(deepseekNewsStructureDigestPayload(sig))
	return sig
}
// writeDeepSeekNewsSnapshotArtifacts persists a raw page and its signature.
//
// It rejects blank snapshotPath or signaturePath, creates both parent
// directories (mode 0o755), writes raw to snapshotPath (mode 0o644), builds
// the signature from raw, stamps it with now (RFC 3339), sourceURL and
// snapshotPath, and writes it as indented JSON to signaturePath (mode 0o644).
//
// On any failure the zero signature and a wrapped error are returned. The
// snapshot file is written before the signature, so it may already exist when
// a later step fails.
func writeDeepSeekNewsSnapshotArtifacts(raw string, sourceURL string, snapshotPath string, signaturePath string, now time.Time) (deepseekNewsStructureSignature, error) {
	var none deepseekNewsStructureSignature

	switch {
	case strings.TrimSpace(snapshotPath) == "":
		return none, fmt.Errorf("snapshot path is required")
	case strings.TrimSpace(signaturePath) == "":
		return none, fmt.Errorf("signature path is required")
	}

	snapshotParent := filepath.Dir(snapshotPath)
	if err := os.MkdirAll(snapshotParent, 0o755); err != nil {
		return none, fmt.Errorf("mkdir snapshot dir: %w", err)
	}
	signatureParent := filepath.Dir(signaturePath)
	if err := os.MkdirAll(signatureParent, 0o755); err != nil {
		return none, fmt.Errorf("mkdir signature dir: %w", err)
	}

	if err := os.WriteFile(snapshotPath, []byte(raw), 0o644); err != nil {
		return none, fmt.Errorf("write snapshot: %w", err)
	}

	sig := buildDeepSeekNewsStructureSignature(raw)
	sig.SourceURL = sourceURL
	sig.SnapshotPath = snapshotPath
	sig.GeneratedAt = now.Format(time.RFC3339)

	encoded, err := json.MarshalIndent(sig, "", " ")
	if err != nil {
		return none, fmt.Errorf("marshal signature: %w", err)
	}
	if err := os.WriteFile(signaturePath, encoded, 0o644); err != nil {
		return none, fmt.Errorf("write signature: %w", err)
	}
	return sig, nil
}
// resolveDeepSeekNewsSnapshotPaths fills in whichever output paths the caller
// left blank and returns (snapshotPath, signaturePath).
//
// A blank snapshotPath becomes
// <snapshotDir>/deepseek-news-<YYYYMMDD-HHMMSS>.html, where a blank
// snapshotDir defaults to logs/deepseek-news-snapshots. A blank signaturePath
// becomes the snapshot path with its extension replaced by ".signature.json".
// Non-blank inputs are returned unchanged; snapshotDir is only consulted when
// snapshotPath is blank.
func resolveDeepSeekNewsSnapshotPaths(snapshotPath string, signaturePath string, snapshotDir string, now time.Time) (string, string) {
	needSnapshot := strings.TrimSpace(snapshotPath) == ""
	needSignature := strings.TrimSpace(signaturePath) == ""

	if needSnapshot {
		dir := snapshotDir
		if strings.TrimSpace(dir) == "" {
			dir = filepath.Join("logs", "deepseek-news-snapshots")
		}
		stamp := now.Format("20060102-150405")
		snapshotPath = filepath.Join(dir, fmt.Sprintf("deepseek-news-%s", stamp)) + ".html"
	}

	if needSignature {
		// Works for generated paths too: stripping ".html" yields the shared base.
		stem := strings.TrimSuffix(snapshotPath, filepath.Ext(snapshotPath))
		signaturePath = stem + ".signature.json"
	}

	return snapshotPath, signaturePath
}
// readDeepSeekNewsStructureSignature loads a signature from the JSON file at
// path.
//
// A read failure is returned as-is (unwrapped); a decode failure is wrapped
// with the offending path. In both cases the returned signature is the zero
// value.
func readDeepSeekNewsStructureSignature(path string) (deepseekNewsStructureSignature, error) {
	contents, readErr := os.ReadFile(path)
	if readErr != nil {
		return deepseekNewsStructureSignature{}, readErr
	}

	var parsed deepseekNewsStructureSignature
	if decodeErr := json.Unmarshal(contents, &parsed); decodeErr != nil {
		// Discard any partially decoded fields.
		return deepseekNewsStructureSignature{}, fmt.Errorf("unmarshal signature %s: %w", path, decodeErr)
	}
	return parsed, nil
}
// hasDeepSeekNewsStructureSignature reports whether signature carries any
// content: a positive byte size, a non-blank structure hash, content hash or
// title, or at least one heading or contains entry. MetaDescription and the
// provenance fields are not considered.
func hasDeepSeekNewsStructureSignature(signature deepseekNewsStructureSignature) bool {
	if signature.ByteSize > 0 || len(signature.Headings) > 0 || len(signature.Contains) > 0 {
		return true
	}
	for _, field := range []string{signature.StructureSHA256, signature.SHA256, signature.Title} {
		if strings.TrimSpace(field) != "" {
			return true
		}
	}
	return false
}
// deepseekNewsStructureDigestPayload returns the canonical JSON string that is
// hashed to produce StructureSHA256.
//
// Only Title, MetaDescription, Headings and Contains take part. Contains is
// flattened into a list of {name, value} entries sorted by name so that map
// iteration order cannot influence the result. Empty collections always
// encode as [] regardless of whether they are nil, so a signature decoded from
// JSON without a headings field hashes the same as one built with an empty
// heading list.
func deepseekNewsStructureDigestPayload(signature deepseekNewsStructureSignature) string {
	type containsEntry struct {
		Name  string `json:"name"`
		Value bool   `json:"value"`
	}

	names := make([]string, 0, len(signature.Contains))
	for name := range signature.Contains {
		names = append(names, name)
	}
	sort.Strings(names)

	entries := make([]containsEntry, 0, len(names))
	for _, name := range names {
		entries = append(entries, containsEntry{Name: name, Value: signature.Contains[name]})
	}

	// encoding/json writes a nil slice as null but an empty one as []. The
	// contains list above is never nil, so normalize headings the same way
	// to keep the digest independent of how the signature was constructed.
	headings := signature.Headings
	if headings == nil {
		headings = []string{}
	}

	payload := struct {
		Title           string          `json:"title"`
		MetaDescription string          `json:"meta_description"`
		Headings        []string        `json:"headings"`
		Contains        []containsEntry `json:"contains"`
	}{
		Title:           signature.Title,
		MetaDescription: signature.MetaDescription,
		Headings:        headings,
		Contains:        entries,
	}

	// The error is ignored deliberately: the payload holds only strings,
	// bools and slices of them, none of which json.Marshal can reject.
	encoded, _ := json.Marshal(payload)
	return string(encoded)
}
// deepseekNewsSHA256Hex returns the lowercase hex encoding of the SHA-256
// digest of raw.
func deepseekNewsSHA256Hex(raw string) string {
	hasher := sha256.New()
	// hash.Hash documents that Write never returns an error.
	hasher.Write([]byte(raw))
	return hex.EncodeToString(hasher.Sum(nil))
}
// firstHTMLMatch returns the cleaned text of the first capture group of the
// first match of re in raw, or "" when re does not match or has no capture
// group.
func firstHTMLMatch(re *regexp.Regexp, raw string) string {
	if groups := re.FindStringSubmatch(raw); len(groups) >= 2 {
		return cleanHTMLText(groups[1])
	}
	return ""
}
// deepseekNewsEntityReplacer decodes the two HTML entities this script
// recognizes. A Replacer scans the input once, so text produced by one
// replacement is never decoded a second time.
var deepseekNewsEntityReplacer = strings.NewReplacer(
	"&amp;", "&",
	"&nbsp;", " ",
)

// cleanHTMLText reduces an HTML fragment to plain text: every tag is replaced
// by a space, &amp; and &nbsp; are decoded, and runs of whitespace are
// collapsed to single spaces with no leading or trailing space.
//
// Entities are decoded in a single pass so that an escaped entity such as
// "&amp;nbsp;" yields the literal text "&nbsp;" rather than being decoded
// twice into a space.
func cleanHTMLText(raw string) string {
	text := htmlTagRe.ReplaceAllString(raw, " ")
	text = deepseekNewsEntityReplacer.Replace(text)
	// Fields drops all leading/trailing whitespace, so no extra trim is needed.
	return strings.Join(strings.Fields(text), " ")
}