Files
llm-intelligence/scripts/collector_stats_window_audit.sh
2026-05-29 18:48:48 +08:00

193 lines
6.3 KiB
Bash

#!/usr/bin/env bash
# Audit a window of collector_stats rows and report the success rate.
# Environment: DATABASE_URL (default for --db), LLM_NOW (optional fixed "now").
set -o errexit -o nounset -o pipefail

# --- Settings that command-line flags may override -------------------------
DB_URL="${DATABASE_URL:-}" # --db; falls back to $DATABASE_URL
INPUT_PATH=""              # --input; TSV file used instead of the database
LIMIT=7                    # --limit; number of rows in the audited window
THRESHOLD=""               # --assert-success-rate; empty disables the gate

# --- Fixed settings and running state --------------------------------------
# ASCII unit separator (0x1f), used as the column delimiter for parsed rows.
FIELD_SEP=$'\x1f'
# Optional "current time" override; empty means use the real clock.
NOW_RAW="${LLM_NOW:-}"
# Precondition failures older than this many minutes (1440 = 24 h) are
# counted as "aged" rather than as current precondition failures.
AGED_PRECONDITION_MINUTES=1440
AGED_PRECONDITION_COUNT=0
#######################################
# Print the command-line synopsis and the expected TSV column order.
# Outputs: five lines of help text on stdout.
#######################################
usage() {
  printf '%s\n' \
    '用法:' \
    'bash scripts/collector_stats_window_audit.sh --db <DATABASE_URL> [--limit N] [--assert-success-rate PCT]' \
    'bash scripts/collector_stats_window_audit.sh --input <tsv-file> [--limit N] [--assert-success-rate PCT]' \
    '输入 TSV 列顺序:' \
    'source<TAB>success<TAB>error_message<TAB>created_at'
}
#######################################
# Map a collector error message to a failure category.
# Matching is case-insensitive substring matching; the precondition list is
# consulted first, so a message hitting both lists is a precondition failure.
# Arguments:
#   $1 - raw error message (optional; may be empty)
# Outputs:
#   One of precondition_missing, external_provider_failure or
#   collector_runtime_failure on stdout. Empty, space-only and unrecognised
#   messages are reported as collector_runtime_failure.
#######################################
classify_failure() {
  local message normalized
  message="${1:-}"
  normalized="$(printf '%s' "$message" | tr '[:upper:]' '[:lower:]')"

  if [[ -z "${normalized// }" ]]; then
    printf '%s\n' "collector_runtime_failure"
    return
  fi

  case "$normalized" in
    *"api key"* | \
    *"openrouter_api_key"* | \
    *"database_url"* | \
    *"strict real mode"* | \
    *"password authentication failed"* | \
    *"permission denied"* | \
    *"role does not exist"* | \
    *"relation does not exist"* | \
    *"must provide"* | \
    *"未设置"*)
      printf '%s\n' "precondition_missing"
      ;;
    *"429"* | \
    *"rate limit"* | \
    *"too many requests"* | \
    *"timeout"* | \
    *"temporarily unavailable"* | \
    *"transport closed"* | \
    *"connection reset"* | \
    *"connection refused"* | \
    *"eof"* | \
    *"tls handshake timeout"* | \
    *"no such host"* | \
    *"i/o timeout"* | \
    *"unexpected status 403"* | \
    *"unexpected status 502"* | \
    *"unexpected status 503"* | \
    *"unexpected status 504"* | \
    *"signature drift"* | \
    *"no pricing cards found"* | \
    *"no model rows parsed"* | \
    *"no model overview cards parsed"* | \
    *"unexpected "*" pricing content"*)
      # The wildcard between "unexpected " and " pricing content" must sit
      # OUTSIDE the quotes: a quoted '*' only matches a literal asterisk.
      printf '%s\n' "external_provider_failure"
      ;;
    *)
      printf '%s\n' "collector_runtime_failure"
      ;;
  esac
}
#######################################
# Print how many whole minutes lie between a row's creation time and "now".
# Globals:
#   NOW_RAW (read) - optional fixed "now"; when empty the local clock is used.
# Arguments:
#   $1 - creation timestamp.
# Both timestamps are accepted as 'YYYY-MM-DD HH:MM:SS' or 'YYYY-MM-DD HH:MM'
# (surrounding whitespace is ignored) and are treated as naive local times.
# Outputs:
#   The floored minute difference on stdout (negative if $1 is in the future).
# Returns:
#   Non-zero, with a one-line message on stderr, when a timestamp cannot be
#   parsed.
#######################################
minutes_since_created() {
  local created_at="${1:-}"
  python3 - "$created_at" "${NOW_RAW:-}" <<'PY'
from datetime import datetime
import sys

FORMATS = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M')


def parse(label, text):
    # Try each accepted layout in turn; the first one that fits wins.
    for fmt in FORMATS:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    sys.exit('minutes_since_created: cannot parse %s timestamp: %r' % (label, text))


created = parse('created_at', sys.argv[1].strip())
raw_now = sys.argv[2].strip()
now = parse('LLM_NOW', raw_now) if raw_now else datetime.now()
print(int((now - created).total_seconds() // 60))
PY
}
#######################################
# Fetch the newest collector_stats rows from PostgreSQL.
# Globals:
#   DB_URL    (read) - connection string handed to psql
#   FIELD_SEP (read) - column separator for psql's unaligned output
#   LIMIT     (read) - number of rows to fetch; must be digits only
# Outputs:
#   One row per line, newest first:
#   source, 't'/'f', error_message, 'YYYY-MM-DD HH:MM:SS' joined by FIELD_SEP.
# Returns:
#   1 (message on stderr) when DB_URL is empty or LIMIT is not a
#   non-negative integer; otherwise psql's exit status.
#######################################
fetch_rows_from_db() {
  if [[ -z "${DB_URL:-}" ]]; then
    echo "missing --db / DATABASE_URL" >&2
    return 1
  fi
  # LIMIT is interpolated into the SQL text below, so anything other than
  # plain digits is refused to rule out SQL injection through --limit.
  if [[ ! "${LIMIT:-}" =~ ^[0-9]+$ ]]; then
    echo "invalid --limit (expected a non-negative integer): ${LIMIT:-}" >&2
    return 1
  fi
  # -A unaligned, -t tuples only, -q quiet, -c run the given command.
  psql "$DB_URL" -F "$FIELD_SEP" -Atqc "
    SELECT
      COALESCE(source, ''),
      CASE WHEN success THEN 't' ELSE 'f' END,
      COALESCE(error_message, ''),
      TO_CHAR(created_at, 'YYYY-MM-DD HH24:MI:SS')
    FROM collector_stats
    ORDER BY created_at DESC
    LIMIT ${LIMIT};
  "
}
#######################################
# Emit the first LIMIT lines of the TSV file named by INPUT_PATH.
# Globals:
#   INPUT_PATH (read) - file to read
#   LIMIT      (read) - line count passed to head -n
# Outputs:
#   The selected lines on stdout.
# Returns:
#   1 (message on stderr) when INPUT_PATH is empty; otherwise head's status.
#######################################
fetch_rows_from_file() {
  [[ -n "${INPUT_PATH:-}" ]] || {
    printf '%s\n' "missing --input" >&2
    return 1
  }
  head -n "${LIMIT}" "${INPUT_PATH}"
}
#######################################
# Print an error message followed by the usage text on stderr, then exit 1.
# Arguments:
#   $1 - error message
#######################################
die_usage() {
  echo "$1" >&2
  usage >&2
  exit 1
}

#######################################
# Parse command-line flags into the script's global settings.
# Globals written: DB_URL, INPUT_PATH, LIMIT, THRESHOLD
# Arguments:
#   The script's command-line arguments.
# Exits:
#   0 after printing usage for --help / -h.
#   1 (error + usage on stderr) for an unknown flag, a flag given without a
#   value, a --limit that is not a non-negative integer, or an
#   --assert-success-rate that is not a plain decimal number.
#######################################
parse_args() {
  local -r limit_re='^[0-9]+$'
  local -r rate_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'

  while [[ $# -gt 0 ]]; do
    # Every value-taking flag needs a following argument. Checking it here
    # gives a readable error instead of an "unbound variable" abort on $2.
    case "$1" in
      --db | --input | --limit | --assert-success-rate)
        if [[ $# -lt 2 ]]; then
          die_usage "missing value for $1"
        fi
        ;;
    esac

    case "$1" in
      --db)
        DB_URL="$2"
        shift 2
        ;;
      --input)
        INPUT_PATH="$2"
        shift 2
        ;;
      --limit)
        # The limit ends up in `head -n` and in SQL text; digits only.
        if [[ ! "$2" =~ $limit_re ]]; then
          die_usage "invalid --limit (expected a non-negative integer): $2"
        fi
        LIMIT="$2"
        shift 2
        ;;
      --assert-success-rate)
        # A non-numeric threshold would make awk compare strings instead of
        # numbers, giving an arbitrary pass/fail result.
        if [[ ! "$2" =~ $rate_re ]]; then
          die_usage "invalid --assert-success-rate (expected a number): $2"
        fi
        THRESHOLD="$2"
        shift 2
        ;;
      --help | -h)
        usage
        exit 0
        ;;
      *)
        die_usage "unknown arg: $1"
        ;;
    esac
  done
}

parse_args "$@"
# The original inline loop consumed every positional parameter; keep that.
set --
# Choose the row source: a TSV file when --input was given, otherwise the
# database. The selected rows are kept as newline-separated text in ROWS.
case "$INPUT_PATH" in
  "")
    ROWS="$(fetch_rows_from_db)"
    ;;
  *)
    ROWS="$(fetch_rows_from_file)"
    ;;
esac
#######################################
# Classify every sampled row in ROWS and accumulate the window statistics.
# Globals read:
#   ROWS, FIELD_SEP, AGED_PRECONDITION_MINUTES
# Globals written:
#   SUCCESS_COUNT, FAILURE_COUNT, PRECONDITION_COUNT, EXTERNAL_COUNT,
#   RUNTIME_COUNT, UNKNOWN_COUNT, ROW_COUNT (all reset to 0 first),
#   DETAIL_LINES (one "sample_N ..." line per row),
#   AGED_PRECONDITION_COUNT (incremented, not reset here)
# Row format: source, success, error_message, created_at separated by TAB or
# FIELD_SEP. A success value of "t" or "true" counts as success; anything
# else is a failure. Blank lines are skipped.
#######################################
tally_rows() {
  local raw_line normalized_line source success error_message created_at
  local category outcome rendered_error age_minutes detail_line

  SUCCESS_COUNT=0
  FAILURE_COUNT=0
  PRECONDITION_COUNT=0
  EXTERNAL_COUNT=0
  RUNTIME_COUNT=0
  UNKNOWN_COUNT=0
  ROW_COUNT=0
  DETAIL_LINES=""

  while IFS= read -r raw_line; do
    # Tolerate CRLF input: a trailing carriage return would otherwise stick
    # to created_at, break the age calculation and leak into the report.
    raw_line=${raw_line%$'\r'}
    if [[ -z "$raw_line" ]]; then
      continue
    fi

    # Tabs are whitespace to `read` and would collapse empty columns, so
    # convert them to the non-whitespace separator before splitting.
    normalized_line="${raw_line//$'\t'/$FIELD_SEP}"
    IFS="$FIELD_SEP" read -r source success error_message created_at <<< "$normalized_line"
    ROW_COUNT=$((ROW_COUNT + 1))

    if [[ "$success" == "t" || "$success" == "true" ]]; then
      SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
      outcome="success"
      category="success"
      rendered_error="-"
    else
      FAILURE_COUNT=$((FAILURE_COUNT + 1))
      outcome="failure"
      category="$(classify_failure "$error_message")"
      rendered_error="${error_message:-unknown}"

      # A precondition failure older than the cut-off is tracked separately
      # under its own category and counter.
      if [[ "$category" == "precondition_missing" ]]; then
        age_minutes="$(minutes_since_created "${created_at:-1970-01-01 00:00:00}")"
        if [[ "$age_minutes" -gt "$AGED_PRECONDITION_MINUTES" ]]; then
          category="aged_precondition_missing"
          AGED_PRECONDITION_COUNT=$((AGED_PRECONDITION_COUNT + 1))
        fi
      fi

      case "$category" in
        precondition_missing)
          PRECONDITION_COUNT=$((PRECONDITION_COUNT + 1))
          ;;
        aged_precondition_missing)
          # Already counted above.
          ;;
        external_provider_failure)
          EXTERNAL_COUNT=$((EXTERNAL_COUNT + 1))
          ;;
        collector_runtime_failure)
          RUNTIME_COUNT=$((RUNTIME_COUNT + 1))
          ;;
        *)
          UNKNOWN_COUNT=$((UNKNOWN_COUNT + 1))
          ;;
      esac
    fi

    printf -v detail_line \
      'sample_%s created_at=%s source=%s outcome=%s category=%s error=%s\n' \
      "$ROW_COUNT" "${created_at:-unknown}" "${source:-unknown}" \
      "$outcome" "$category" "$rendered_error"
    DETAIL_LINES+="$detail_line"
  done <<< "${ROWS:-}"
}

tally_rows
# An empty window has nothing to rate: print an all-zero summary and fail
# only when a success-rate threshold was requested.
if (( ROW_COUNT == 0 )); then
  printf '%s\n' "window_size=0 success_count=0 failure_count=0 success_rate=0.00 threshold=${THRESHOLD:-n/a} precondition_missing=0 aged_precondition_missing=0 external_provider_failure=0 collector_runtime_failure=0 unknown_failure=0"
  printf '%s\n' "sample_window=empty"
  if [[ -z "$THRESHOLD" ]]; then
    exit 0
  fi
  exit 1
fi

# Success rate in percent with two decimals. Aged precondition failures are
# removed from the denominator; if nothing remains the rate is 0.00.
SUCCESS_RATE="$(
  awk -v success="$SUCCESS_COUNT" \
    -v aged="$AGED_PRECONDITION_COUNT" \
    -v total="$ROW_COUNT" '
    BEGIN {
      effective_total = total - aged
      if (effective_total <= 0) {
        printf "0.00"
      } else {
        printf "%.2f", (success * 100) / effective_total
      }
    }'
)"

# Summary line first, then one detail line per sampled row.
printf '%s\n' "window_size=${ROW_COUNT} success_count=${SUCCESS_COUNT} failure_count=${FAILURE_COUNT} success_rate=${SUCCESS_RATE} threshold=${THRESHOLD:-n/a} precondition_missing=${PRECONDITION_COUNT} aged_precondition_missing=${AGED_PRECONDITION_COUNT} external_provider_failure=${EXTERNAL_COUNT} collector_runtime_failure=${RUNTIME_COUNT} unknown_failure=${UNKNOWN_COUNT}"
printf '%s' "$DETAIL_LINES"

# Optional gate: exit 0 when the rate reaches the threshold, 1 otherwise.
if [[ -n "$THRESHOLD" ]]; then
  awk -v actual="$SUCCESS_RATE" -v threshold="$THRESHOLD" \
    'BEGIN { exit !(actual >= threshold) }' || exit 1
  exit 0
fi