#!/bin/bash
# run_daily.sh - daily data collection and report generation pipeline.
# Sprint 3: full scheduling script
# (fetch -> quality check -> report generation -> archive -> notification).
#
# Helpers that are called but not defined in this file (report_date_value,
# report_markdown_path, report_html_path, report_archive_markdown_path,
# report_archive_html_path, track_report_state, archive_report_artifacts) are
# presumably provided by scripts/report_utils.sh, which is sourced below.
#
# Environment:
#   DATABASE_URL    optional; libpq connection string (a local default is used)
#   FEISHU_WEBHOOK  optional; when non-empty, failures are posted to this URL

set -euo pipefail

PROJECT_DIR="/home/long/project/llm-intelligence"
. "$PROJECT_DIR/scripts/report_utils.sh"

# Optional environment files. .env is sourced after .env.local, so a plain
# assignment in .env overrides the same variable set in .env.local.
if [[ -f "$PROJECT_DIR/.env.local" ]]; then
  # shellcheck disable=SC1091
  source "$PROJECT_DIR/.env.local"
fi
if [[ -f "$PROJECT_DIR/.env" ]]; then
  # shellcheck disable=SC1091
  source "$PROJECT_DIR/.env"
fi

DB_URL="${DATABASE_URL:-host=/var/run/postgresql dbname=llm_intelligence user=long sslmode=disable}"
REPORT_DATE="$(report_date_value)"
LOG_FILE="/tmp/llm_hub_daily_${REPORT_DATE}.log"
FEISHU_WEBHOOK="${FEISHU_WEBHOOK:-}"
MODEL_COUNT=""
FETCH_OUT="${PROJECT_DIR}/models.json"
FETCH_TOTAL="0"
PIPELINE_STAGE_SET="openrouter,multi_source,official_imports,daily_report"
PIPELINE_SOURCE_SET="openrouter,moonshot,deepseek,openai,zhipu,baidu,bytedance"
PIPELINE_FAILED_SOURCE_SET="none"
MULTI_SOURCE_AUDIT="multi_source_audit=unavailable"
PIPELINE_AUDIT_SUMMARY=""

#######################################
# Write a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
#######################################
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

#######################################
# Print the contents of a file as a single line: newlines become spaces,
# whitespace runs are squeezed, leading/trailing space is trimmed.
# Prints nothing when the file does not exist.
# Arguments: $1 - path of the file to flatten
#######################################
normalize_summary_file() {
  local path="$1"
  if [ ! -f "$path" ]; then
    return
  fi
  tr '\n' ' ' < "$path" | sed 's/[[:space:]]\+/ /g; s/^ //; s/ $//'
}

#######################################
# Print the value of the "failed_source_keys=<value>" token found in a
# summary string; prints nothing when the token is absent.
# Arguments: $1 - summary text
#######################################
extract_failed_source_keys() {
  local summary="$1"
  printf '%s\n' "$summary" | sed -n 's/.*failed_source_keys=\([^ ]*\).*/\1/p'
}

#######################################
# Append comma-separated source keys to PIPELINE_FAILED_SOURCE_SET.
# An empty argument or the literal "none" is ignored.
# Globals:   PIPELINE_FAILED_SOURCE_SET (read, written)
# Arguments: $1 - comma-separated keys
#######################################
merge_failed_source_keys() {
  local keys="$1"
  if [ -z "$keys" ] || [ "$keys" = "none" ]; then
    return
  fi
  if [ "$PIPELINE_FAILED_SOURCE_SET" = "none" ]; then
    PIPELINE_FAILED_SOURCE_SET="$keys"
    return
  fi
  PIPELINE_FAILED_SOURCE_SET="${PIPELINE_FAILED_SOURCE_SET},${keys}"
}

#######################################
# Rebuild PIPELINE_AUDIT_SUMMARY from the current pipeline state variables.
# Globals:   PIPELINE_STAGE_SET, PIPELINE_SOURCE_SET,
#            PIPELINE_FAILED_SOURCE_SET, FETCH_TOTAL, MULTI_SOURCE_AUDIT (read)
#            PIPELINE_AUDIT_SUMMARY (written)
#######################################
refresh_pipeline_audit() {
  PIPELINE_AUDIT_SUMMARY="runtime_audit stage_set=${PIPELINE_STAGE_SET} selected_source_keys=${PIPELINE_SOURCE_SET} failed_source_keys=${PIPELINE_FAILED_SOURCE_SET} openrouter_total=${FETCH_TOTAL:-0} ${MULTI_SOURCE_AUDIT}"
}

#######################################
# Failure path: log the error, produce the fallback report, record the failed
# run via track_report_state, send an alert if a webhook is configured, then
# terminate the script with status 1.
# Globals:   DB_URL, REPORT_DATE, MODEL_COUNT, LOG_FILE, FEISHU_WEBHOOK (read)
# Arguments: $1 - error message
#######################################
error_exit() {
  local output_path=""
  log "❌ 错误: $1"
  refresh_pipeline_audit

  # Degrade: copy yesterday's report. This is best-effort; a failure here must
  # not prevent the run from being recorded and the alert from being sent.
  fallback_report || log "⚠️ 降级报告生成失败"

  if [ -f "$(report_markdown_path "$REPORT_DATE")" ]; then
    output_path="$(report_markdown_path "$REPORT_DATE")"
  fi
  track_report_state "$DB_URL" "$REPORT_DATE" "failed" "${MODEL_COUNT:-}" "$PIPELINE_AUDIT_SUMMARY" "$output_path" "$1" "scheduled" "cron" "true" >> "$LOG_FILE" 2>&1 || true

  # Send the alert.
  if [ -n "$FEISHU_WEBHOOK" ]; then
    send_alert "$1"
  fi
  exit 1
}

#######################################
# Degrade: copy yesterday's Markdown (and HTML, if present) report to today's
# paths, rewrite the date, and prefix the Markdown's first line with a
# data-delay marker. Archives the copies when both files exist.
# Globals:   PROJECT_DIR, REPORT_DATE, LOG_FILE (read)
# Returns:   0 when the copy succeeded or there was nothing to copy,
#            1 when a path lookup, copy or rewrite step failed.
#######################################
fallback_report() {
  local yesterday yesterday_md today_md yesterday_html today_html
  # Every step is checked explicitly because this function is invoked from a
  # `||` context, where `set -e` does not apply.
  yesterday=$(date -d "yesterday" +%Y-%m-%d) || return 1
  yesterday_md="${PROJECT_DIR}/$(report_markdown_path "$yesterday")" || return 1
  today_md="${PROJECT_DIR}/$(report_markdown_path "$REPORT_DATE")" || return 1
  yesterday_html="${PROJECT_DIR}/$(report_html_path "$yesterday")" || return 1
  today_html="${PROJECT_DIR}/$(report_html_path "$REPORT_DATE")" || return 1

  if [ ! -f "$yesterday_md" ]; then
    log "⚠️ 无昨日报告可供复制"
    return 0
  fi

  cp "$yesterday_md" "$today_md" || return 1
  sed -i "s/${yesterday}/${REPORT_DATE}/g" "$today_md" || return 1
  sed -i "1s/^/# [数据延迟] /" "$today_md" || return 1
  if [ -f "$yesterday_html" ]; then
    cp "$yesterday_html" "$today_html" || return 1
    sed -i "s/${yesterday}/${REPORT_DATE}/g" "$today_html" || return 1
  fi
  if [ -f "$today_md" ] && [ -f "$today_html" ]; then
    archive_report_artifacts "$REPORT_DATE" >> "$LOG_FILE" 2>&1 || true
  fi
  log "⚠️ 已复制昨日报告并标记[数据延迟]"
}

#######################################
# Post a failure alert as a JSON text message to the Feishu webhook.
# Never fails the caller; the outcome is only logged.
# Globals:   FEISHU_WEBHOOK, REPORT_DATE, LOG_FILE (read)
# Arguments: $1 - error message
#######################################
send_alert() {
  local msg="$1"
  local payload
  # Escape characters that would otherwise break the hand-built JSON string.
  msg=${msg//\\/\\\\}
  msg=${msg//\"/\\\"}
  msg=${msg//$'\n'/\\n}
  payload="{\"msg_type\":\"text\",\"content\":{\"text\":\"🚨 LLM Hub 日报失败\\n日期: ${REPORT_DATE}\\n错误: ${msg}\\n请检查日志: ${LOG_FILE}\"}}"
  if curl -s -X POST -H "Content-Type: application/json" \
    -d "$payload" \
    "$FEISHU_WEBHOOK" > /dev/null; then
    log "📢 飞书告警已发送"
  else
    log "⚠️ 飞书告警发送失败"
  fi
}

# Initialise the audit summary so it is populated even on an early failure.
refresh_pipeline_audit

# Main flow
log "🚀 开始每日流水线: ${REPORT_DATE}"
cd "$PROJECT_DIR"

# 1. Data collection
log "1️⃣ 数据采集..."
if ! go run scripts/fetch_openrouter.go -strict-real -out "$FETCH_OUT" >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "openrouter"
  error_exit "数据采集失败"
fi

# Read the "total" field from the fetch output. The substitution is guarded so
# that an unreadable or malformed file goes through error_exit instead of
# aborting silently under `set -e`.
if ! FETCH_TOTAL=$(python3 - <<'PY' "$FETCH_OUT"
import json, sys
path = sys.argv[1]
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)
print(int(data.get("total", 0)))
PY
); then
  FETCH_TOTAL="0"
  merge_failed_source_keys "openrouter"
  error_exit "采集结果解析失败"
fi

if [ "${FETCH_TOTAL:-0}" -lt 10 ]; then
  merge_failed_source_keys "openrouter"
  error_exit "本次采集结果异常: total=${FETCH_TOTAL:-0} < 10"
fi
refresh_pipeline_audit
log "✅ 数据采集完成"

# 1.5 Supplementary multi-source sync
log "1️⃣➕ 多源补充同步..."
MULTI_SOURCE_OUTPUT="$(mktemp)"
if ! go run scripts/fetch_multi_source.go --sources moonshot,deepseek,openai > "$MULTI_SOURCE_OUTPUT" 2>> "$LOG_FILE"; then
  MULTI_SOURCE_SUMMARY="$(normalize_summary_file "$MULTI_SOURCE_OUTPUT")"
  if [ -n "$MULTI_SOURCE_SUMMARY" ]; then
    MULTI_SOURCE_AUDIT="multi_source_audit=${MULTI_SOURCE_SUMMARY}"
    merge_failed_source_keys "$(extract_failed_source_keys "$MULTI_SOURCE_SUMMARY")"
  else
    # No summary was produced, so mark every requested source as failed.
    MULTI_SOURCE_AUDIT="multi_source_audit=stage_failed"
    merge_failed_source_keys "moonshot,deepseek,openai"
  fi
  cat "$MULTI_SOURCE_OUTPUT" >> "$LOG_FILE"
  rm -f "$MULTI_SOURCE_OUTPUT"
  error_exit "多源补充同步失败"
fi
MULTI_SOURCE_SUMMARY="$(normalize_summary_file "$MULTI_SOURCE_OUTPUT")"
MULTI_SOURCE_AUDIT="multi_source_audit=${MULTI_SOURCE_SUMMARY:-none}"
merge_failed_source_keys "$(extract_failed_source_keys "$MULTI_SOURCE_SUMMARY")"
refresh_pipeline_audit
cat "$MULTI_SOURCE_OUTPUT" >> "$LOG_FILE"
rm -f "$MULTI_SOURCE_OUTPUT"

if ! go run -tags llm_script scripts/import_zhipu_data.go >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "zhipu"
  error_exit "智谱官方导入失败"
fi

if ! go run -tags llm_script scripts/export_official_seed_json.go >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "official_seed_export"
  error_exit "官方种子导出失败"
fi

if ! go run -tags llm_script scripts/import_phase2_data.go >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "baidu"
  error_exit "百度官方导入失败"
fi

if ! go run -tags llm_script scripts/import_bytedance_data.go >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "bytedance"
  error_exit "字节官方导入失败"
fi
refresh_pipeline_audit
log "✅ 多源补充同步完成"

# 2. Data quality check
log "2️⃣ 数据质量检查..."
if ! MODEL_COUNT=$(psql "$DB_URL" -t -c "SELECT COUNT(*) FROM models WHERE deleted_at IS NULL" 2>/dev/null | tr -d ' '); then
  MODEL_COUNT=""
  error_exit "模型数量查询失败"
fi
# `[ "" -lt 10 ]` exits with status 2, which `if` treats as false; without this
# validation an empty or non-numeric result would pass the gate below.
if [[ ! "$MODEL_COUNT" =~ ^[0-9]+$ ]]; then
  log "⚠️ 模型数量查询返回非数字: ${MODEL_COUNT}"
  MODEL_COUNT=""
  error_exit "模型数量查询结果无效"
fi
if [ "$MODEL_COUNT" -lt 10 ]; then
  error_exit "模型数量不足: ${MODEL_COUNT} < 10"
fi
log "✅ 数据质量检查通过 (模型数: ${MODEL_COUNT})"

# 3. Generate the daily report
log "3️⃣ 生成日报..."
export DATABASE_URL="$DB_URL"
if ! REPORT_RUN_KIND="scheduled" \
  REPORT_TRIGGER_SOURCE="cron" \
  REPORT_IS_OFFICIAL_DAILY="true" \
  REPORT_RUNTIME_AUDIT="$PIPELINE_AUDIT_SUMMARY" \
  go run scripts/generate_daily_report.go >> "$LOG_FILE" 2>&1; then
  error_exit "日报生成失败"
fi
log "✅ 日报生成完成"

# 4. Verify the archive
log "4️⃣ 校验归档..."
if [ ! -f "$(report_archive_markdown_path "$REPORT_DATE")" ] || [ ! -f "$(report_archive_html_path "$REPORT_DATE")" ]; then
  error_exit "日报归档失败"
fi
log "✅ 归档完成"

# 5. Verify the run records
log "5️⃣ 校验运行记录..."
# awk exits 0 only when the count is >= 1; with pipefail a psql failure also
# makes the pipeline fail.
if ! psql "$DB_URL" -Atqc "select count(*) from daily_report where report_date = DATE '${REPORT_DATE}' and status = 'generated';" | awk '{ exit !($1 >= 1) }'; then
  error_exit "daily_report 未写入 generated 记录"
fi
if ! psql "$DB_URL" -Atqc "select count(*) from report_runs where report_date = DATE '${REPORT_DATE}' and status = 'generated';" | awk '{ exit !($1 >= 1) }'; then
  error_exit "report_runs 未写入 generated 记录"
fi
log "✅ 日报记录更新完成"

log "🎉 每日流水线全部完成!"
log "📄 Markdown: $(report_markdown_path "$REPORT_DATE")"
log "🌐 HTML: $(report_html_path "$REPORT_DATE")"

exit 0