Files
user-system/scripts/ops/sre-daily-healthcheck.ps1
long-agent 5b6bd93179 refactor: 整理项目根目录结构
整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过
2026-04-07 18:10:36 +08:00

185 lines
7.2 KiB
PowerShell
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# SRE daily health-check script.
# Intended to run automatically every day; prints a system health status report.
#
# Parameters:
#   -BaseURL         Base URL of the service under test; each probed endpoint path is appended to it.
#   -ReportDir       Directory that receives the Markdown report (created if it does not exist).
#   -AlertOnFailure  When set, the script exits with code 1 if at least one critical check failed;
#                    otherwise the script always exits with code 0.
param(
[string]$BaseURL = "http://localhost:8080",
[string]$ReportDir = "docs\evidence\daily-health",
[switch]$AlertOnFailure
)
# Keep going after non-terminating errors so one failing probe does not stop the run.
$ErrorActionPreference = "Continue"

# Timestamp used to give every run its own report file name.
$date = Get-Date -Format "yyyyMMdd-HHmmss"
$reportFile = '{0}\HEALTH_CHECK_{1}.md' -f $ReportDir, $date

# Make sure the report directory exists (-Force: no error when it already does).
$null = New-Item -ItemType Directory -Force -Path $ReportDir

# Accumulators updated by Add-Check: report table rows and the check counters.
$report = @()
$totalChecks = $passedChecks = $criticalFailures = 0
function Add-Check {
    <#
    .SYNOPSIS
    Records the outcome of one health check and echoes it to the console.

    .DESCRIPTION
    Increments the script-scoped counters ($script:totalChecks, $script:passedChecks,
    $script:criticalFailures) and appends one Markdown table row to $script:report.

    .PARAMETER name
    Human-readable name of the check.

    .PARAMETER status
    "PASS", "WARN", or anything else (treated as a failure).

    .PARAMETER detail
    Free-form detail text shown next to the status.

    .PARAMETER isCritical
    When true and the check failed, the failure is counted as critical.
    A WARN result never counts as critical.
    #>
    param($name, $status, $detail, $isCritical = $false)

    $script:totalChecks++
    if ($status -eq "PASS") {
        $script:passedChecks++
        $icon = "✅"
    } elseif ($status -eq "WARN") {
        $icon = "⚠️"
    } else {
        $icon = "❌"
        if ($isCritical) { $script:criticalFailures++ }
    }

    # Detail text may be an exception message: flatten line breaks and escape
    # pipes so it cannot break the Markdown table row.
    $cell = ([string]$detail) -replace '\r?\n', ' ' -replace '\|', '\|'
    $script:report += "| $icon | $name | $status | $cell |"

    Write-Host " $icon $name : $status — $detail"
}
# Console banner: title, current timestamp and the target base URL.
$startedAt = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'
Write-Host "=== UMS SRE 日常健康巡检 ===" -ForegroundColor Cyan
Write-Host "时间: $startedAt"
Write-Host "目标: $BaseURL"
Write-Host ""
# 1. Health-check endpoints
Write-Host "[1/6] 健康检查端点" -ForegroundColor Yellow
try {
    # Invoke-RestMethod throws on a non-success HTTP status, so reaching the
    # next line means /health/ready answered successfully.
    $health = Invoke-RestMethod -Uri "$BaseURL/health/ready" -TimeoutSec 10
    $dbStatus = $health.checks.database.status
    Add-Check "服务就绪检查 /health/ready" "PASS" "状态: $($health.status)" $true
    # A missing database entry leaves $dbStatus empty and is reported as a critical failure.
    Add-Check "数据库连接" $(if ($dbStatus -eq "UP") {"PASS"} else {"FAIL"}) "状态: $dbStatus" $true

    # Redis and uptime are only reported when the payload carries them.
    if ($health.checks.redis) {
        $redisStatus = $health.checks.redis.status
        if ($redisStatus -eq "UP") {
            $redisResult = "PASS"
        } elseif ($redisStatus -eq "UNKNOWN") {
            $redisResult = "WARN"
        } else {
            $redisResult = "FAIL"
        }
        Add-Check "Redis 连接" $redisResult "状态: $redisStatus"
    }
    if ($health.uptime) {
        Add-Check "服务运行时间" "PASS" $health.uptime
    }
} catch {
    Add-Check "服务就绪检查 /health/ready" "FAIL" $_.Exception.Message $true
}
try {
    # -UseBasicParsing: on Windows PowerShell 5.1 the default parser depends on
    # the Internet Explorer engine, which can be unavailable in unattended runs.
    $live = Invoke-WebRequest -Uri "$BaseURL/health/live" -TimeoutSec 5 -UseBasicParsing
    Add-Check "存活检查 /health/live" $(if ($live.StatusCode -lt 300) {"PASS"} else {"FAIL"}) "HTTP $($live.StatusCode)" $true
} catch {
    Add-Check "存活检查 /health/live" "FAIL" $_.Exception.Message $true
}
# 2. Response time of key APIs
Write-Host "`n[2/6] 关键 API 响应时间" -ForegroundColor Yellow

# One entry per endpoint: path, display name, latency threshold in milliseconds.
$criticalPaths = @(
    @{path="/api/v1/auth/capabilities"; desc="认证能力接口"; threshold=500},
    @{path="/health"; desc="健康检查接口"; threshold=100}
)

foreach ($ep in $criticalPaths) {
    $label = "$($ep.desc) $($ep.path)"
    try {
        $timer = [System.Diagnostics.Stopwatch]::StartNew()
        $null = Invoke-RestMethod -Uri "$BaseURL$($ep.path)" -TimeoutSec 5
        $timer.Stop()
        $ms = $timer.ElapsedMilliseconds

        # Within threshold: PASS; up to twice the threshold: WARN; slower: FAIL.
        if ($ms -le $ep.threshold) {
            $status = "PASS"
        } elseif ($ms -le $ep.threshold * 2) {
            $status = "WARN"
        } else {
            $status = "FAIL"
        }
        Add-Check $label $status "${ms}ms (阈值: $($ep.threshold)ms)"
    } catch {
        # Request failed or timed out.
        Add-Check $label "FAIL" $_.Exception.Message
    }
}
# 3. Prometheus metrics endpoint
Write-Host "`n[3/6] Prometheus 指标端点" -ForegroundColor Yellow
try {
    # -UseBasicParsing: avoid the Internet Explorer engine dependency of
    # Windows PowerShell 5.1 so the probe also works in unattended runs.
    $metrics = Invoke-WebRequest -Uri "$BaseURL/metrics" -TimeoutSec 5 -UseBasicParsing
    if ($metrics.StatusCode -eq 200) {
        $content = $metrics.Content
        $hasHTTPMetrics = $content -match "http_requests_total"
        $hasDBMetrics = $content -match "db_query"
        Add-Check "指标端点 /metrics" "PASS" "HTTP $($metrics.StatusCode)"
        Add-Check "HTTP 请求指标" $(if ($hasHTTPMetrics) {"PASS"} else {"FAIL"}) $(if ($hasHTTPMetrics) {"存在 http_requests_total"} else {"缺少 http_requests_total — 需要接入 PrometheusMiddleware"})
        Add-Check "数据库指标" $(if ($hasDBMetrics) {"PASS"} else {"WARN"}) $(if ($hasDBMetrics) {"存在 db_query"} else {"缺少 db_query 指标"})
    } else {
        # A success status other than 200 used to be dropped silently; surface it.
        Add-Check "指标端点 /metrics" "WARN" "HTTP $($metrics.StatusCode) (期望 200)"
    }
} catch {
    # Keep the original hint but append the real error, so a timeout or
    # connection failure is not mistaken for a missing route.
    Add-Check "指标端点 /metrics" "FAIL" "端点不可用 — P0 问题:需要在 router.go 注册 /metrics ($($_.Exception.Message))" $true
}
# 4. Rate-limit verification
# Sends 10 login attempts with deliberately invalid credentials and counts how
# many are answered with HTTP 429.
Write-Host "`n[4/6] 速率限制功能验证" -ForegroundColor Yellow
$rateLimitTriggered = $false
$rlTotal = 0; $rl429 = 0
$body = '{"account":"sre_healthcheck","password":"invalid_test_pwd"}'
foreach ($attempt in 1..10) {
    $rlTotal++
    $statusCode = $null
    try {
        # -ErrorAction Stop makes every failure land in the catch block below.
        $resp = Invoke-WebRequest -Uri "$BaseURL/api/v1/auth/login" -Method POST -Body $body -ContentType "application/json" -UseBasicParsing -ErrorAction Stop -TimeoutSec 3
        $statusCode = [int]$resp.StatusCode
    } catch {
        # Invoke-WebRequest throws on non-success statuses (including 429), so
        # the status code has to be read from the exception's response.
        # Transport errors carry no response and leave $statusCode empty.
        $errResponse = $_.Exception.Response
        if ($errResponse) { $statusCode = [int]$errResponse.StatusCode }
    }
    if ($statusCode -eq 429) { $rl429++; $rateLimitTriggered = $true }
}
Add-Check "速率限制功能" $(if ($rateLimitTriggered) {"PASS"} else {"WARN"}) "${rlTotal} 次请求中触发 ${rl429} 次 429$(if (-not $rateLimitTriggered) {' (10次内未触发可能需要更多请求)'})"
# 5. Swagger documentation (non-blocking: never worse than WARN)
Write-Host "`n[5/6] API 文档" -ForegroundColor Yellow
try {
    # The target is an HTML page: -UseBasicParsing skips the Internet Explorer
    # based DOM parsing of Windows PowerShell 5.1, which can fail in unattended runs.
    $swagger = Invoke-WebRequest -Uri "$BaseURL/swagger/index.html" -TimeoutSec 5 -UseBasicParsing
    Add-Check "Swagger 文档" $(if ($swagger.StatusCode -eq 200) {"PASS"} else {"WARN"}) "HTTP $($swagger.StatusCode)"
} catch {
    Add-Check "Swagger 文档" "WARN" "不可访问(非阻塞)"
}
# 6. Configuration sanity checks
Write-Host "`n[6/6] 配置健全性" -ForegroundColor Yellow
$configFile = "config\config.yaml"
if (-not (Test-Path $configFile)) {
    # No config file: only a warning.
    Add-Check "配置文件" "WARN" "config.yaml 不存在,可能使用环境变量配置"
} else {
    $config = Get-Content $configFile -Raw
    $hasDefaultJWT = $config -match "change-me-in-production"
    $isSQLite = $config -match "type: sqlite"

    # A placeholder JWT secret is a critical failure.
    if ($hasDefaultJWT) {
        Add-Check "JWT Secret 配置" "FAIL" "使用默认 Secret — 生产环境必须替换!" $true
    } else {
        Add-Check "JWT Secret 配置" "PASS" "已自定义" $false
    }

    # SQLite is reported as a warning only.
    if ($isSQLite) {
        Add-Check "数据库类型" "WARN" "SQLite — 生产环境应迁移至 PostgreSQL"
    } else {
        Add-Check "数据库类型" "PASS" "PostgreSQL/MySQL"
    }
}
# Build the report
# Max(..., 1) keeps the division safe when no check was recorded.
$passRate = [math]::Round($passedChecks / [math]::Max($totalChecks, 1) * 100, 1)
# Any critical failure: CRITICAL; any other non-PASS result: DEGRADED; else HEALTHY.
$overallStatus = if ($criticalFailures -gt 0) {"🔴 CRITICAL"} elseif ($passedChecks -lt $totalChecks) {"🟡 DEGRADED"} else {"🟢 HEALTHY"}
# Markdown template. The heading, field and column labels were empty in the
# original template and are restored here so the report is readable.
$mdReport = @"
# UMS 日常健康巡检报告
- **巡检时间**: $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')
- **巡检目标**: $BaseURL
- **总体状态**: $overallStatus
- **通过率**: ${passedChecks}/${totalChecks} ($passRate%)
- **严重失败数**: $criticalFailures
## 检查结果
| 状态 | 检查项 | 结果 | 详情 |
|------|--------|------|------|
$($report -join "`n")
## 结论
$(if ($criticalFailures -gt 0) {
"⚠️ **存在 $criticalFailures 个严重问题,需立即处理!**"
} elseif ($passedChecks -lt $totalChecks) {
"📋 存在非严重警告,请在工作时间内跟进。"
} else {
"✅ 所有检查通过,系统健康。"
})
---
*由 scripts/ops/sre-daily-healthcheck.ps1 自动生成*
"@
# Write the report to disk as UTF-8.
Set-Content -Path $reportFile -Value $mdReport -Encoding UTF8

# Console summary.
Write-Host "`n=== 巡检汇总 ===" -ForegroundColor Cyan
Write-Host "总体状态: $overallStatus"
Write-Host "通过率: ${passedChecks}/${totalChecks} ($passRate%)"
Write-Host "报告已保存至: $reportFile"

# Exit code 1 only when alerting was requested AND a critical check failed.
$shouldAlert = ($criticalFailures -gt 0) -and $AlertOnFailure
if (-not $shouldAlert) {
    exit 0
}
Write-Host "`n⚠️ 存在严重问题,应触发告警通知!" -ForegroundColor Red
exit 1