Tighten real-ingestion success rules, separate scheduled reports from historical rebuilds, and persist source-level runtime audit across daily pipeline runs. Also add the Phase 5 CI workflow contract plus verification updates and supporting docs so the full uncommitted change set can be validated together.
225 lines
8.1 KiB
Bash
Executable File
225 lines
8.1 KiB
Bash
Executable File
#!/bin/bash
# run_daily.sh - daily data collection and report generation pipeline.
# Sprint 3: full scheduling script
# (collect -> quality check -> report generation -> archive -> notify).
set -euo pipefail

PROJECT_DIR="/home/long/project/llm-intelligence"

# Shared helpers. The report_* / track_report_state / archive_report_artifacts
# functions called in this script are not defined in this file; presumably
# they come from this helper file - verify against report_utils.sh.
. "$PROJECT_DIR/scripts/report_utils.sh"

# Optional environment files. .env is sourced after .env.local, so a variable
# assigned unconditionally in both files ends up with the .env value.
# NOTE(review): many dotenv setups give .env.local the higher precedence -
# confirm that this order is intended.
if [[ -f "$PROJECT_DIR/.env.local" ]]; then
  # shellcheck disable=SC1091
  source "$PROJECT_DIR/.env.local"
fi
if [[ -f "$PROJECT_DIR/.env" ]]; then
  # shellcheck disable=SC1091
  source "$PROJECT_DIR/.env"
fi

# Connection string handed to psql; DATABASE_URL wins when set and non-empty.
DB_URL="${DATABASE_URL:-host=/var/run/postgresql dbname=llm_intelligence user=long sslmode=disable}"
REPORT_DATE="$(report_date_value)"
LOG_FILE="/tmp/llm_hub_daily_${REPORT_DATE}.log"
# Alerting is skipped when the webhook is empty.
FEISHU_WEBHOOK="${FEISHU_WEBHOOK:-}"
MODEL_COUNT=""
# File passed to fetch_openrouter.go via -out; its "total" field is read back.
FETCH_OUT="${PROJECT_DIR}/models.json"
FETCH_TOTAL="0"

# Fields that refresh_pipeline_audit folds into PIPELINE_AUDIT_SUMMARY.
PIPELINE_STAGE_SET="openrouter,multi_source,official_imports,daily_report"
PIPELINE_SOURCE_SET="openrouter,moonshot,deepseek,openai,zhipu,baidu,bytedance"
PIPELINE_FAILED_SOURCE_SET="none"
MULTI_SOURCE_AUDIT="multi_source_audit=unavailable"
PIPELINE_AUDIT_SUMMARY=""

#######################################
# Write a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in LOG_FILE
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}

#######################################
# Print a file's content as a single line: newlines become spaces, runs of
# whitespace collapse to one space, and leading/trailing spaces are removed.
# Arguments: $1 - path of the file to flatten
# Outputs:   the flattened text on stdout; nothing when the file is missing
# Returns:   0 when the file does not exist
#######################################
normalize_summary_file() {
  local path="$1"

  [[ -f "$path" ]] || return 0

  # "[[:space:]][[:space:]]*" is the portable spelling of the GNU-only "\+".
  tr '\n' ' ' < "$path" \
    | sed -e 's/[[:space:]][[:space:]]*/ /g' -e 's/^ //' -e 's/ $//'
}

#######################################
# Pull the value of the "failed_source_keys=" field out of a summary string.
# The value runs up to the next space. When the field occurs several times on
# a line, the greedy leading ".*" selects the last occurrence.
# Arguments: $1 - summary text
# Outputs:   the field value on stdout; nothing when the field is absent
#######################################
extract_failed_source_keys() {
  local summary="$1"

  sed -n 's/.*failed_source_keys=\([^ ]*\).*/\1/p' <<< "$summary"
}

#######################################
# Add failed source keys to PIPELINE_FAILED_SOURCE_SET.
# The set is a comma-separated list, with the literal "none" meaning empty.
# Keys already present are skipped, so merging the same key twice no longer
# produces entries such as "openrouter,openrouter".
# Globals:   PIPELINE_FAILED_SOURCE_SET (read, written)
# Arguments: $1 - comma-separated keys; empty or "none" leaves the set as is
#######################################
merge_failed_source_keys() {
  local keys="$1"
  local merged key remaining

  if [[ -z "$keys" || "$keys" == "none" ]]; then
    return 0
  fi

  merged="$PIPELINE_FAILED_SOURCE_SET"
  if [[ "$merged" == "none" ]]; then
    merged=""
  fi

  # The trailing comma lets the loop peel one key per iteration until empty.
  remaining="${keys},"
  while [[ -n "$remaining" ]]; do
    key="${remaining%%,*}"
    remaining="${remaining#*,}"
    if [[ -z "$key" || "$key" == "none" ]]; then
      continue
    fi
    # Wrapping both sides in commas makes this a whole-key comparison.
    if [[ ",${merged}," == *",${key},"* ]]; then
      continue
    fi
    merged="${merged:+${merged},}${key}"
  done

  PIPELINE_FAILED_SOURCE_SET="${merged:-none}"
}

#######################################
# Rebuild the one-line runtime audit summary from the current pipeline state.
# Globals:   PIPELINE_STAGE_SET, PIPELINE_SOURCE_SET,
#            PIPELINE_FAILED_SOURCE_SET, FETCH_TOTAL, MULTI_SOURCE_AUDIT (read)
#            PIPELINE_AUDIT_SUMMARY (written)
# An unset or empty FETCH_TOTAL is reported as 0.
#######################################
refresh_pipeline_audit() {
  printf -v PIPELINE_AUDIT_SUMMARY \
    'runtime_audit stage_set=%s selected_source_keys=%s failed_source_keys=%s openrouter_total=%s %s' \
    "$PIPELINE_STAGE_SET" \
    "$PIPELINE_SOURCE_SET" \
    "$PIPELINE_FAILED_SOURCE_SET" \
    "${FETCH_TOTAL:-0}" \
    "$MULTI_SOURCE_AUDIT"
}

#######################################
# Error handler: log the failure, try the fallback report, record the failed
# run, optionally alert, then terminate the script.
# Globals:   REPORT_DATE, DB_URL, MODEL_COUNT, PIPELINE_AUDIT_SUMMARY,
#            FEISHU_WEBHOOK, LOG_FILE (read)
# Arguments: $1 - error message
# Returns:   never returns; exits with status 1
#######################################
error_exit() {
  local message="$1"
  local output_path=""
  local candidate=""

  log "❌ 错误: ${message}"
  refresh_pipeline_audit

  # Fallback: copy yesterday's report. Running it on the left of "||" keeps a
  # failure inside it from tripping `set -e`, which would otherwise end the
  # script before the failed run is recorded and the alert is sent.
  fallback_report || log "⚠️ 降级报告生成失败" || true

  # Record the markdown report only when it actually exists on disk.
  candidate="$(report_markdown_path "$REPORT_DATE")" || candidate=""
  if [[ -f "$candidate" ]]; then
    output_path="$candidate"
  fi

  # Best-effort bookkeeping: its output goes to the log and a failure is
  # deliberately ignored so that the alert below is still sent.
  track_report_state "$DB_URL" "$REPORT_DATE" "failed" "${MODEL_COUNT:-}" \
    "$PIPELINE_AUDIT_SUMMARY" "$output_path" "$message" \
    "scheduled" "cron" "true" >> "$LOG_FILE" 2>&1 || true

  # Send the alert only when a webhook is configured.
  if [[ -n "$FEISHU_WEBHOOK" ]]; then
    send_alert "$message"
  fi

  exit 1
}

# Build PIPELINE_AUDIT_SUMMARY once from the initial variable values, before
# any pipeline stage has run.
refresh_pipeline_audit

#######################################
# Fallback: reuse the previous day's report as today's, marked as delayed.
# Copies the markdown (and the HTML when present) from the day before
# REPORT_DATE, rewrites that date to REPORT_DATE inside the copies, prefixes
# the first markdown line with the delay marker, and archives the result when
# both files exist.
# Globals:   PROJECT_DIR, REPORT_DATE, LOG_FILE (read)
# Returns:   0 when the copy was made or there was nothing to copy;
#            1 when copying or rewriting the markdown report failed
#######################################
fallback_report() {
  local yesterday yesterday_md today_md yesterday_html today_html

  # Derive the previous day from REPORT_DATE rather than from the wall clock,
  # so the right file is picked even when REPORT_DATE is not today. UTC keeps
  # daylight-saving shifts out of the date arithmetic. If REPORT_DATE cannot
  # be parsed, fall back to the clock-based value.
  if ! yesterday="$(TZ=UTC date -d "${REPORT_DATE} 1 day ago" +%Y-%m-%d 2> /dev/null)"; then
    yesterday="$(date -d "yesterday" +%Y-%m-%d)"
  fi

  yesterday_md="${PROJECT_DIR}/$(report_markdown_path "$yesterday")"
  today_md="${PROJECT_DIR}/$(report_markdown_path "$REPORT_DATE")"
  yesterday_html="${PROJECT_DIR}/$(report_html_path "$yesterday")"
  today_html="${PROJECT_DIR}/$(report_html_path "$REPORT_DATE")"

  if [[ ! -f "$yesterday_md" ]]; then
    log "⚠️ 无昨日报告可供复制"
    return 0
  fi

  # Handle failures explicitly instead of leaving them to `set -e`, which
  # would end the script in the middle of error handling.
  if ! {
    cp -- "$yesterday_md" "$today_md" \
      && sed -i "s/${yesterday}/${REPORT_DATE}/g" "$today_md" \
      && sed -i "1s/^/# [数据延迟] /" "$today_md"
  }; then
    log "⚠️ 昨日报告复制失败"
    return 1
  fi

  # The HTML copy is optional; a failure here does not undo the markdown copy.
  if [[ -f "$yesterday_html" ]]; then
    if ! {
      cp -- "$yesterday_html" "$today_html" \
        && sed -i "s/${yesterday}/${REPORT_DATE}/g" "$today_html"
    }; then
      log "⚠️ 昨日 HTML 报告复制失败"
    fi
  fi

  # Archive only when both artifacts exist; archiving is best-effort.
  if [[ -f "$today_md" && -f "$today_html" ]]; then
    archive_report_artifacts "$REPORT_DATE" >> "$LOG_FILE" 2>&1 || true
  fi

  log "⚠️ 已复制昨日报告并标记[数据延迟]"
}

#######################################
# Post a failure alert to the Feishu webhook.
# The error text is JSON-escaped before being embedded in the payload, so a
# message containing quotes, backslashes or line breaks still produces valid
# JSON. Delivery is best-effort: a failed request is logged, not fatal.
# Globals:   REPORT_DATE, LOG_FILE, FEISHU_WEBHOOK (read)
# Arguments: $1 - error message to include in the alert
# Returns:   0 whether or not the request succeeded
#######################################
send_alert() {
  local msg="$1"
  local escaped payload

  # Escape the backslash first so that the escapes added afterwards are not
  # themselves doubled.
  escaped=${msg//\\/\\\\}
  escaped=${escaped//\"/\\\"}
  escaped=${escaped//$'\n'/\\n}
  escaped=${escaped//$'\r'/\\r}
  escaped=${escaped//$'\t'/\\t}

  payload="{\"msg_type\":\"text\",\"content\":{\"text\":\"🚨 LLM Hub 日报失败\\n日期: ${REPORT_DATE}\\n错误: ${escaped}\\n请检查日志: ${LOG_FILE}\"}}"

  # --fail turns HTTP error responses into a non-zero exit status, and
  # --max-time keeps an unreachable webhook from blocking the script.
  if curl -s --fail --max-time 15 -X POST \
    -H "Content-Type: application/json" \
    -d "$payload" \
    "$FEISHU_WEBHOOK" > /dev/null; then
    log "📢 飞书告警已发送"
  else
    log "⚠️ 飞书告警发送失败"
  fi
}

# Main flow.
log "🚀 开始每日流水线: ${REPORT_DATE}"

# The stages below use repository-relative paths (scripts/...). Route a failed
# cd through error_exit instead of letting `set -e` end the run silently.
cd "$PROJECT_DIR" || error_exit "无法进入项目目录: ${PROJECT_DIR}"

# 1. Data collection.
log "1️⃣ 数据采集..."
if ! go run scripts/fetch_openrouter.go -strict-real -out "$FETCH_OUT" >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "openrouter"
  error_exit "数据采集失败"
fi

# Read the "total" field from the fetch output. The assignment sits inside
# `if !` so that a missing file, invalid JSON or an unexpected structure goes
# through error_exit rather than aborting via `set -e`. A null or absent total
# counts as 0.
if ! FETCH_TOTAL=$(python3 - "$FETCH_OUT" 2>> "$LOG_FILE" <<'PY'
import json, sys

path = sys.argv[1]
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)
print(int(data.get("total") or 0))
PY
); then
  FETCH_TOTAL="0"
  merge_failed_source_keys "openrouter"
  error_exit "采集结果解析失败: ${FETCH_OUT}"
fi

# Fewer than 10 fetched entries is treated as a failed collection.
if [ "${FETCH_TOTAL:-0}" -lt 10 ]; then
  merge_failed_source_keys "openrouter"
  error_exit "本次采集结果异常: total=${FETCH_TOTAL:-0} < 10"
fi
refresh_pipeline_audit
log "✅ 数据采集完成"

# 1.5 Supplementary multi-source sync.
log "1️⃣➕ 多源补充同步..."

# The fetcher's stdout is captured in a temp file so it can be summarised for
# the audit and appended to the log; its stderr goes straight to the log.
MULTI_SOURCE_OUTPUT="$(mktemp)" || error_exit "无法创建临时文件"

multi_source_status=0
go run scripts/fetch_multi_source.go --sources moonshot,deepseek,openai \
  > "$MULTI_SOURCE_OUTPUT" 2>> "$LOG_FILE" || multi_source_status=$?

# Summarise, log and clean up once, whatever the outcome.
MULTI_SOURCE_SUMMARY="$(normalize_summary_file "$MULTI_SOURCE_OUTPUT")"
cat "$MULTI_SOURCE_OUTPUT" >> "$LOG_FILE"
rm -f "$MULTI_SOURCE_OUTPUT"

if [[ "$multi_source_status" -ne 0 ]]; then
  if [[ -n "$MULTI_SOURCE_SUMMARY" ]]; then
    # The fetcher reported which sources failed; record exactly those.
    MULTI_SOURCE_AUDIT="multi_source_audit=${MULTI_SOURCE_SUMMARY}"
    merge_failed_source_keys "$(extract_failed_source_keys "$MULTI_SOURCE_SUMMARY")"
  else
    # No summary at all: mark every requested source as failed.
    MULTI_SOURCE_AUDIT="multi_source_audit=stage_failed"
    merge_failed_source_keys "moonshot,deepseek,openai"
  fi
  error_exit "多源补充同步失败"
fi

# On success, any failed_source_keys in the summary are still recorded, but
# the pipeline continues.
MULTI_SOURCE_AUDIT="multi_source_audit=${MULTI_SOURCE_SUMMARY:-none}"
merge_failed_source_keys "$(extract_failed_source_keys "$MULTI_SOURCE_SUMMARY")"
refresh_pipeline_audit

# Official imports. Each failure records its source key and stops the run.
if ! go run -tags llm_script scripts/import_zhipu_data.go >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "zhipu"
  error_exit "智谱官方导入失败"
fi
if ! go run -tags llm_script scripts/export_official_seed_json.go >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "official_seed_export"
  error_exit "官方种子导出失败"
fi
if ! go run -tags llm_script scripts/import_phase2_data.go >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "baidu"
  error_exit "百度官方导入失败"
fi
if ! go run -tags llm_script scripts/import_bytedance_data.go >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "bytedance"
  error_exit "字节官方导入失败"
fi
refresh_pipeline_audit
log "✅ 多源补充同步完成"

# 2. Data quality check.
log "2️⃣ 数据质量检查..."

# Count the models that are not soft-deleted. The assignment sits inside
# `if !` because, with pipefail, a psql failure would otherwise end the script
# through `set -e` without a fallback report or an alert.
if ! MODEL_COUNT="$(psql "$DB_URL" -t -c "SELECT COUNT(*) FROM models WHERE deleted_at IS NULL" 2>> "$LOG_FILE" | tr -d '[:space:]')"; then
  MODEL_COUNT=""
  error_exit "模型数量查询失败"
fi

# A non-numeric value would make `[ ... -lt 10 ]` error out, which `if` treats
# as false, so the check would pass. Reject such values explicitly.
if ! [[ "$MODEL_COUNT" =~ ^[0-9]+$ ]]; then
  invalid_model_count="$MODEL_COUNT"
  MODEL_COUNT=""
  error_exit "模型数量查询结果无效: ${invalid_model_count:-空}"
fi

if [ "$MODEL_COUNT" -lt 10 ]; then
  error_exit "模型数量不足: ${MODEL_COUNT} < 10"
fi
log "✅ 数据质量检查通过 (模型数: ${MODEL_COUNT})"

# 3. Generate the daily report.
log "3️⃣ 生成日报..."
export DATABASE_URL="$DB_URL"

# The REPORT_* variables are set for the generator process only. They carry
# the same run labels that error_exit records for a failed run, plus the
# current audit summary.
if ! REPORT_RUN_KIND="scheduled" \
  REPORT_TRIGGER_SOURCE="cron" \
  REPORT_IS_OFFICIAL_DAILY="true" \
  REPORT_RUNTIME_AUDIT="$PIPELINE_AUDIT_SUMMARY" \
  go run scripts/generate_daily_report.go >> "$LOG_FILE" 2>&1; then
  error_exit "日报生成失败"
fi
log "✅ 日报生成完成"

# 4. Verify the archive: both archived artifacts must exist.
log "4️⃣ 校验归档..."
if [[ ! -f "$(report_archive_markdown_path "$REPORT_DATE")" \
  || ! -f "$(report_archive_html_path "$REPORT_DATE")" ]]; then
  error_exit "日报归档失败"
fi
log "✅ 归档完成"

# 5. Verify the run records: each table must hold at least one row for
# REPORT_DATE with status 'generated'.
log "5️⃣ 校验运行记录..."
for record_table in daily_report report_runs; do
  # The awk END rule makes empty output count as a failure; without it, awk
  # exits 0 when it receives no input lines.
  if ! psql "$DB_URL" -Atqc "select count(*) from ${record_table} where report_date = DATE '${REPORT_DATE}' and status = 'generated';" \
    | awk 'NR == 1 { ok = ($1 >= 1) } END { exit !ok }'; then
    error_exit "${record_table} 未写入 generated 记录"
  fi
done
log "✅ 日报记录更新完成"

log "🎉 每日流水线全部完成!"
log "📄 Markdown: $(report_markdown_path "$REPORT_DATE")"
log "🌐 HTML: $(report_html_path "$REPORT_DATE")"

exit 0