//go:build llm_script

package main

import (
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"time"
)

// deepseekNewsStructureSignature is the JSON record produced for one captured
// page: measurements of the raw content, the structural fields extracted from
// it, and optional provenance metadata filled in when a snapshot is written.
type deepseekNewsStructureSignature struct {
	// ByteSize is the length of the raw page in bytes.
	ByteSize int `json:"byte_size"`
	// SHA256 is the hex-encoded SHA-256 digest of the raw page.
	SHA256 string `json:"sha256"`
	// StructureSHA256 is the hex-encoded SHA-256 digest of the structural
	// payload (title, meta description, headings and contains flags).
	StructureSHA256 string `json:"structure_sha256"`
	// Title is the cleaned text of the first <title> element.
	Title string `json:"title"`
	// MetaDescription is the cleaned content of the description <meta> tag.
	MetaDescription string `json:"meta_description"`
	// Headings lists the distinct, non-empty <h1> texts in document order.
	Headings []string `json:"headings"`
	// Contains records, per needle key, whether the needle occurs in the page.
	Contains map[string]bool `json:"contains"`
	// GeneratedAt is an RFC 3339 timestamp; omitted from JSON when empty.
	GeneratedAt string `json:"generated_at,omitempty"`
	// SourceURL is the URL the page came from; omitted from JSON when empty.
	SourceURL string `json:"source_url,omitempty"`
	// SnapshotPath is where the raw page was stored; omitted from JSON when empty.
	SnapshotPath string `json:"snapshot_path,omitempty"`
}

// deepseekNewsContainsNeedles maps each key reported in a signature's Contains
// field to the substring that is searched for in the lower-cased page.
var deepseekNewsContainsNeedles = map[string]string{
	"deepseek": "deepseek",
	"release":  "release",
	"news":     "news",
	"api_docs": "api docs",
}

// Regular expressions used to extract structural fields from raw HTML. They
// are compiled once at package scope.
var (
	// htmlTagRe matches a single HTML tag, including tags that span lines.
	htmlTagRe = regexp.MustCompile(`(?s)<[^>]+>`)

	// titleRe captures the inner content of a <title> element,
	// case-insensitively and across newlines.
	titleRe = regexp.MustCompile(`(?is)<title[^>]*>(.*?)</title>`)

	// metaDescRe captures the content attribute of a description <meta> tag.
	// It only matches when the name attribute precedes content, and the
	// capture stops at the first quote character of either kind.
	metaDescRe = regexp.MustCompile(`(?is)<meta[^>]+name=["']description["'][^>]+content=["']([^"']+)["']`)

	// h1Re captures the inner content of each <h1> element.
	h1Re = regexp.MustCompile(`(?is)<h1[^>]*>(.*?)</h1>`)
)

func buildDeepSeekNewsStructureSignature(raw string) deepseekNewsStructureSignature {
|
|
title := firstHTMLMatch(titleRe, raw)
|
|
meta := firstHTMLMatch(metaDescRe, raw)
|
|
h1Matches := h1Re.FindAllStringSubmatch(raw, -1)
|
|
headings := make([]string, 0, len(h1Matches))
|
|
seen := make(map[string]struct{})
|
|
for _, match := range h1Matches {
|
|
if len(match) < 2 {
|
|
continue
|
|
}
|
|
clean := cleanHTMLText(match[1])
|
|
if clean == "" {
|
|
continue
|
|
}
|
|
if _, exists := seen[clean]; exists {
|
|
continue
|
|
}
|
|
seen[clean] = struct{}{}
|
|
headings = append(headings, clean)
|
|
}
|
|
contains := make(map[string]bool, len(deepseekNewsContainsNeedles))
|
|
lower := strings.ToLower(raw)
|
|
for key, needle := range deepseekNewsContainsNeedles {
|
|
contains[key] = strings.Contains(lower, strings.ToLower(needle))
|
|
}
|
|
signature := deepseekNewsStructureSignature{
|
|
ByteSize: len([]byte(raw)),
|
|
SHA256: deepseekNewsSHA256Hex(raw),
|
|
Title: title,
|
|
MetaDescription: meta,
|
|
Headings: headings,
|
|
Contains: contains,
|
|
}
|
|
signature.StructureSHA256 = deepseekNewsSHA256Hex(deepseekNewsStructureDigestPayload(signature))
|
|
return signature
|
|
}
|
|
|
|
func writeDeepSeekNewsSnapshotArtifacts(raw string, sourceURL string, snapshotPath string, signaturePath string, now time.Time) (deepseekNewsStructureSignature, error) {
|
|
if strings.TrimSpace(snapshotPath) == "" {
|
|
return deepseekNewsStructureSignature{}, fmt.Errorf("snapshot path is required")
|
|
}
|
|
if strings.TrimSpace(signaturePath) == "" {
|
|
return deepseekNewsStructureSignature{}, fmt.Errorf("signature path is required")
|
|
}
|
|
if err := os.MkdirAll(filepath.Dir(snapshotPath), 0o755); err != nil {
|
|
return deepseekNewsStructureSignature{}, fmt.Errorf("mkdir snapshot dir: %w", err)
|
|
}
|
|
if err := os.MkdirAll(filepath.Dir(signaturePath), 0o755); err != nil {
|
|
return deepseekNewsStructureSignature{}, fmt.Errorf("mkdir signature dir: %w", err)
|
|
}
|
|
if err := os.WriteFile(snapshotPath, []byte(raw), 0o644); err != nil {
|
|
return deepseekNewsStructureSignature{}, fmt.Errorf("write snapshot: %w", err)
|
|
}
|
|
signature := buildDeepSeekNewsStructureSignature(raw)
|
|
signature.GeneratedAt = now.Format(time.RFC3339)
|
|
signature.SourceURL = sourceURL
|
|
signature.SnapshotPath = snapshotPath
|
|
payload, err := json.MarshalIndent(signature, "", " ")
|
|
if err != nil {
|
|
return deepseekNewsStructureSignature{}, fmt.Errorf("marshal signature: %w", err)
|
|
}
|
|
if err := os.WriteFile(signaturePath, payload, 0o644); err != nil {
|
|
return deepseekNewsStructureSignature{}, fmt.Errorf("write signature: %w", err)
|
|
}
|
|
return signature, nil
|
|
}
|
|
|
|
// resolveDeepSeekNewsSnapshotPaths fills in whichever of the snapshot and
// signature paths is blank and returns them as (snapshotPath, signaturePath).
//
//   - A blank snapshotDir defaults to logs/deepseek-news-snapshots.
//   - A blank snapshotPath becomes
//     <snapshotDir>/deepseek-news-<YYYYMMDD-HHMMSS>.html, stamped from now.
//   - A blank signaturePath becomes the snapshot path with its extension
//     replaced by ".signature.json".
//
// Values that are not blank are returned unchanged; snapshotDir is only
// consulted when snapshotPath is blank.
func resolveDeepSeekNewsSnapshotPaths(snapshotPath string, signaturePath string, snapshotDir string, now time.Time) (string, string) {
	if strings.TrimSpace(snapshotDir) == "" {
		snapshotDir = filepath.Join("logs", "deepseek-news-snapshots")
	}
	if strings.TrimSpace(snapshotPath) == "" {
		base := filepath.Join(snapshotDir, "deepseek-news-"+now.Format("20060102-150405"))
		snapshotPath = base + ".html"
	}
	// One derivation covers both the generated and the caller-supplied
	// snapshot path: stripping ".html" from a generated path yields its base.
	if strings.TrimSpace(signaturePath) == "" {
		signaturePath = strings.TrimSuffix(snapshotPath, filepath.Ext(snapshotPath)) + ".signature.json"
	}
	return snapshotPath, signaturePath
}

func readDeepSeekNewsStructureSignature(path string) (deepseekNewsStructureSignature, error) {
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return deepseekNewsStructureSignature{}, err
|
|
}
|
|
var signature deepseekNewsStructureSignature
|
|
if err := json.Unmarshal(data, &signature); err != nil {
|
|
return deepseekNewsStructureSignature{}, fmt.Errorf("unmarshal signature %s: %w", path, err)
|
|
}
|
|
return signature, nil
|
|
}
|
|
|
|
func hasDeepSeekNewsStructureSignature(signature deepseekNewsStructureSignature) bool {
|
|
return signature.ByteSize > 0 ||
|
|
strings.TrimSpace(signature.StructureSHA256) != "" ||
|
|
strings.TrimSpace(signature.SHA256) != "" ||
|
|
strings.TrimSpace(signature.Title) != "" ||
|
|
len(signature.Headings) > 0 ||
|
|
len(signature.Contains) > 0
|
|
}
|
|
|
|
func deepseekNewsStructureDigestPayload(signature deepseekNewsStructureSignature) string {
|
|
type containsEntry struct {
|
|
Name string `json:"name"`
|
|
Value bool `json:"value"`
|
|
}
|
|
keys := make([]string, 0, len(signature.Contains))
|
|
for key := range signature.Contains {
|
|
keys = append(keys, key)
|
|
}
|
|
sort.Strings(keys)
|
|
entries := make([]containsEntry, 0, len(keys))
|
|
for _, key := range keys {
|
|
entries = append(entries, containsEntry{Name: key, Value: signature.Contains[key]})
|
|
}
|
|
payload := struct {
|
|
Title string `json:"title"`
|
|
MetaDescription string `json:"meta_description"`
|
|
Headings []string `json:"headings"`
|
|
Contains []containsEntry `json:"contains"`
|
|
}{
|
|
Title: signature.Title,
|
|
MetaDescription: signature.MetaDescription,
|
|
Headings: signature.Headings,
|
|
Contains: entries,
|
|
}
|
|
bytes, _ := json.Marshal(payload)
|
|
return string(bytes)
|
|
}
|
|
|
|
// deepseekNewsSHA256Hex returns the SHA-256 digest of raw as a lower-case
// hexadecimal string.
func deepseekNewsSHA256Hex(raw string) string {
	sum := sha256.Sum256([]byte(raw))
	return hex.EncodeToString(sum[:])
}

func firstHTMLMatch(re *regexp.Regexp, raw string) string {
|
|
match := re.FindStringSubmatch(raw)
|
|
if len(match) < 2 {
|
|
return ""
|
|
}
|
|
return cleanHTMLText(match[1])
|
|
}
|
|
|
|
func cleanHTMLText(raw string) string {
|
|
text := htmlTagRe.ReplaceAllString(raw, " ")
|
|
text = strings.ReplaceAll(text, "&", "&")
|
|
text = strings.ReplaceAll(text, " ", " ")
|
|
text = strings.Join(strings.Fields(text), " ")
|
|
return strings.TrimSpace(text)
|
|
}
|