整理内容: - 删除 60+ 临时测试输出文件 (*.txt) - 移动二进制文件到 bin/ 目录 - 移动 Shell 脚本到 scripts/ 目录 - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh - scripts/deploy/: deploy_*.sh, simple_deploy.sh - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh - scripts/test/: test_*.sh, test_*.bat - 移动批处理文件到 scripts/ - 移动 Python 脚本到 tools/ - 清理临时日志文件 保留根目录必要文件: - go.mod, go.sum, go.work - Makefile, docker-compose.yml - .env.example, .gitignore - README.md, AGENTS.md, DEPLOY_GUIDE.md 验证: go build ./... && go test ./... 通过
185 lines
7.2 KiB
PowerShell
185 lines
7.2 KiB
PowerShell
# SRE daily health inspection script.
# Runs automatically every day and outputs a system health status report.

param(
    [string]$BaseURL = "http://localhost:8080",
    [string]$ReportDir = "docs\evidence\daily-health",
    [switch]$AlertOnFailure
)

# Non-terminating errors are reported but do not stop the script.
$ErrorActionPreference = "Continue"

# The timestamp gives every run its own report file name.
$date = Get-Date -Format "yyyyMMdd-HHmmss"
$reportFile = '{0}\HEALTH_CHECK_{1}.md' -f $ReportDir, $date

# Make sure the report directory exists (-Force: no error when it already does).
$null = New-Item -ItemType Directory -Force -Path $ReportDir

# Run-wide accumulators: report rows plus check counters.
$report = @()
$totalChecks, $passedChecks, $criticalFailures = 0, 0, 0
|
||
|
||
# Records the outcome of a single check.
#
# Parameters:
#   $name       - check name shown in the report and on the console.
#   $status     - "PASS", "WARN", or any other value (handled as a failure).
#   $detail     - free-form detail text.
#   $isCritical - when true and the check failed, the critical-failure
#                 counter is incremented. Defaults to $false.
#
# Side effects: increments $script:totalChecks (plus $script:passedChecks or
# $script:criticalFailures as applicable), appends one Markdown table row to
# $script:report, and writes the result to the console.
function Add-Check {
    param($name, $status, $detail, $isCritical = $false)

    $script:totalChecks++

    if ($status -eq "PASS") {
        $script:passedChecks++
        $icon = "✅"
    } elseif ($status -eq "WARN") {
        $icon = "⚠️"
    } else {
        $icon = "❌"
        if ($isCritical) { $script:criticalFailures++ }
    }

    # Details are frequently raw exception messages. A literal '|' or a line
    # break would split the Markdown table row, so both are neutralised in the
    # report row; the console output below keeps the raw text.
    $safeName = ([string]$name) -replace '\r?\n', ' ' -replace '\|', '\|'
    $safeDetail = ([string]$detail) -replace '\r?\n', ' ' -replace '\|', '\|'
    $script:report += "| $icon | $safeName | $status | $safeDetail |"

    Write-Host " $icon $name : $status — $detail"
}
|
||
|
||
# Console banner for this run.
Write-Host "=== UMS SRE 日常健康巡检 ===" -ForegroundColor Cyan
Write-Host "时间: $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')"
Write-Host "目标: $BaseURL"
Write-Host ""

# 1. Health-check endpoints
Write-Host "[1/6] 健康检查端点" -ForegroundColor Yellow

# Readiness probe: a successful response is recorded as PASS, then the
# per-dependency statuses in the payload are evaluated individually.
try {
    $health = Invoke-RestMethod -Uri "$BaseURL/health/ready" -TimeoutSec 10
    $dbStatus = $health.checks.database.status
    Add-Check "服务就绪检查 /health/ready" "PASS" "状态: $($health.status)" $true

    if ($dbStatus -eq "UP") { $dbVerdict = "PASS" } else { $dbVerdict = "FAIL" }
    Add-Check "数据库连接" $dbVerdict "状态: $dbStatus" $true

    # Redis is only checked when the payload carries a redis entry; it is
    # recorded as a non-critical check.
    if ($health.checks.redis) {
        if ($health.checks.redis.status -eq "UP") {
            $redisVerdict = "PASS"
        } elseif ($health.checks.redis.status -eq "UNKNOWN") {
            $redisVerdict = "WARN"
        } else {
            $redisVerdict = "FAIL"
        }
        Add-Check "Redis 连接" $redisVerdict "状态: $($health.checks.redis.status)"
    }

    if ($health.uptime) {
        Add-Check "服务运行时间" "PASS" $health.uptime
    }
} catch {
    Add-Check "服务就绪检查 /health/ready" "FAIL" $_.Exception.Message $true
}

# Liveness probe: any status code below 300 counts as alive.
try {
    $live = Invoke-WebRequest -Uri "$BaseURL/health/live" -TimeoutSec 5
    if ($live.StatusCode -lt 300) { $liveVerdict = "PASS" } else { $liveVerdict = "FAIL" }
    Add-Check "存活检查 /health/live" $liveVerdict "HTTP $($live.StatusCode)" $true
} catch {
    Add-Check "存活检查 /health/live" "FAIL" $_.Exception.Message $true
}
|
||
|
||
# 2. Response time of key APIs
Write-Host "`n[2/6] 关键 API 响应时间" -ForegroundColor Yellow

# Each entry: request path, display label, latency threshold in milliseconds.
$criticalPaths = @(
    @{ path = "/api/v1/auth/capabilities"; desc = "认证能力接口"; threshold = 500 },
    @{ path = "/health"; desc = "健康检查接口"; threshold = 100 }
)

foreach ($ep in $criticalPaths) {
    $checkLabel = "$($ep.desc) $($ep.path)"
    try {
        $sw = [System.Diagnostics.Stopwatch]::StartNew()
        $null = Invoke-RestMethod -Uri "$BaseURL$($ep.path)" -TimeoutSec 5
        $sw.Stop()
        $ms = $sw.ElapsedMilliseconds

        # PASS within the threshold, WARN up to twice the threshold, FAIL beyond.
        if ($ms -le $ep.threshold) {
            $status = "PASS"
        } elseif ($ms -le $ep.threshold * 2) {
            $status = "WARN"
        } else {
            $status = "FAIL"
        }

        Add-Check $checkLabel $status "${ms}ms (阈值: $($ep.threshold)ms)"
    } catch {
        Add-Check $checkLabel "FAIL" $_.Exception.Message
    }
}
|
||
|
||
# 3. Prometheus metrics endpoint
Write-Host "`n[3/6] Prometheus 指标端点" -ForegroundColor Yellow
try {
    $metrics = Invoke-WebRequest -Uri "$BaseURL/metrics" -TimeoutSec 5
    if ($metrics.StatusCode -eq 200) {
        $content = $metrics.Content
        $hasHTTPMetrics = $content -match "http_requests_total"
        $hasDBMetrics = $content -match "db_query"

        Add-Check "指标端点 /metrics" "PASS" "HTTP $($metrics.StatusCode)"

        # Missing HTTP request metrics is a failure; missing DB metrics only a warning.
        if ($hasHTTPMetrics) {
            Add-Check "HTTP 请求指标" "PASS" "存在 http_requests_total"
        } else {
            Add-Check "HTTP 请求指标" "FAIL" "缺少 http_requests_total — 需要接入 PrometheusMiddleware"
        }

        if ($hasDBMetrics) {
            Add-Check "数据库指标" "PASS" "存在 db_query"
        } else {
            Add-Check "数据库指标" "WARN" "缺少 db_query 指标"
        }
    } else {
        # A response that did not throw but is not 200 used to be dropped
        # silently, leaving the report without any /metrics row.
        Add-Check "指标端点 /metrics" "FAIL" "HTTP $($metrics.StatusCode)" $true
    }
} catch {
    # Keep the original hint, but append the underlying error so timeouts or
    # connection failures are not mistaken for an unregistered route.
    Add-Check "指标端点 /metrics" "FAIL" "端点不可用 — P0 问题:需要在 router.go 注册 /metrics ($($_.Exception.Message))" $true
}
|
||
|
||
# 4. Rate-limit verification
Write-Host "`n[4/6] 速率限制功能验证" -ForegroundColor Yellow
$rateLimitTriggered = $false
$rlTotal = 0; $rl429 = 0

# Send a burst of deliberately invalid logins and count HTTP 429 responses.
$body = '{"account":"sre_healthcheck","password":"invalid_test_pwd"}'
foreach ($attempt in 1..10) {
    $rlTotal++
    $statusCode = $null
    try {
        $resp = Invoke-WebRequest -Uri "$BaseURL/api/v1/auth/login" -Method POST -Body $body -ContentType "application/json" -ErrorAction Stop -TimeoutSec 3
        $statusCode = [int]$resp.StatusCode
    } catch {
        # Invoke-WebRequest raises an error for non-success status codes, so a
        # 429 arrives here rather than in $resp. Read the status from the
        # response attached to the exception; it is absent for timeouts and
        # connection failures.
        $errorResponse = $_.Exception.Response
        if ($errorResponse) { $statusCode = [int]$errorResponse.StatusCode }
    }
    if ($statusCode -eq 429) { $rl429++; $rateLimitTriggered = $true }
}

Add-Check "速率限制功能" $(if ($rateLimitTriggered) {"PASS"} else {"WARN"}) "$rlTotal 次请求中触发 ${rl429} 次 429$(if (-not $rateLimitTriggered) {' (10次内未触发,可能需要更多请求)'})"
|
||
|
||
# 5. Swagger documentation
Write-Host "`n[5/6] API 文档" -ForegroundColor Yellow
try {
    $swagger = Invoke-WebRequest -Uri "$BaseURL/swagger/index.html" -TimeoutSec 5
    if ($swagger.StatusCode -eq 200) {
        $swaggerVerdict = "PASS"
    } else {
        $swaggerVerdict = "WARN"
    }
    Add-Check "Swagger 文档" $swaggerVerdict "HTTP $($swagger.StatusCode)"
} catch {
    # Unreachable docs are recorded as a warning, never as a failure.
    Add-Check "Swagger 文档" "WARN" "不可访问(非阻塞)"
}
|
||
|
||
# 6. Configuration sanity checks
Write-Host "`n[6/6] 配置健全性" -ForegroundColor Yellow
$configFile = "config\config.yaml"
if (Test-Path $configFile) {
    $config = Get-Content $configFile -Raw
    $hasDefaultJWT = $config -match "change-me-in-production"
    $isSQLite = $config -match "type: sqlite"

    # A placeholder JWT secret is the only critical finding in this section.
    if ($hasDefaultJWT) {
        Add-Check "JWT Secret 配置" "FAIL" "使用默认 Secret — 生产环境必须替换!" $true
    } else {
        Add-Check "JWT Secret 配置" "PASS" "已自定义"
    }

    # The original one-liner lacked the closing brace of its else block, which
    # is a parse error; written as a plain if/else here.
    if ($isSQLite) {
        Add-Check "数据库类型" "WARN" "SQLite — 生产环境应迁移至 PostgreSQL"
    } else {
        Add-Check "数据库类型" "PASS" "PostgreSQL/MySQL"
    }
} else {
    Add-Check "配置文件" "WARN" "config.yaml 不存在,可能使用环境变量配置"
}
|
||
|
||
# Build the report
# Max(..., 1) guards the division when no check was recorded.
$passRate = [math]::Round($passedChecks / [math]::Max($totalChecks, 1) * 100, 1)

# Any critical failure wins; otherwise anything short of all-PASS is degraded.
if ($criticalFailures -gt 0) {
    $overallStatus = "🔴 CRITICAL"
} elseif ($passedChecks -lt $totalChecks) {
    $overallStatus = "🟡 DEGRADED"
} else {
    $overallStatus = "🟢 HEALTHY"
}

# Pre-render the two computed fragments embedded in the Markdown template.
$tableRows = $report -join "`n"
$followUp = if ($criticalFailures -gt 0) {
    "⚠️ **存在 $criticalFailures 个严重问题,需立即处理!**"
} elseif ($passedChecks -lt $totalChecks) {
    "📋 存在非严重警告,请在工作时间内跟进。"
} else {
    "✅ 所有检查通过,系统健康。"
}

$mdReport = @"
# UMS 日常健康巡检报告

- **检查时间**: $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')
- **服务地址**: $BaseURL
- **总体状态**: $overallStatus
- **通过率**: ${passedChecks}/${totalChecks} ($passRate%)
- **严重失败**: $criticalFailures

## 检查详情

| 状态 | 检查项 | 结果 | 说明 |
|------|--------|------|------|
$tableRows

## 后续行动

$followUp

---
*由 scripts/ops/sre-daily-healthcheck.ps1 自动生成*
"@

Set-Content -Path $reportFile -Value $mdReport -Encoding UTF8

# Console summary
Write-Host "`n=== 巡检汇总 ===" -ForegroundColor Cyan
Write-Host "总体状态: $overallStatus"
Write-Host "通过率: ${passedChecks}/${totalChecks} ($passRate%)"
Write-Host "报告已保存至: $reportFile"

# A non-zero exit code is returned only when -AlertOnFailure was passed and
# at least one critical check failed.
if ($AlertOnFailure -and $criticalFailures -gt 0) {
    Write-Host "`n⚠️ 存在严重问题,应触发告警通知!" -ForegroundColor Red
    exit 1
}

exit 0
|