From 54a73e66f49d161d6ddfa528a7d45b7267760a2c Mon Sep 17 00:00:00 2001 From: long-agent Date: Sat, 11 Apr 2026 22:57:31 +0800 Subject: [PATCH] docs: add runbooks and Kubernetes Helm Chart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 6 runbook documents: - 服务启动 (Service Startup) - 服务停止 (Service Shutdown) - 配置更新 (Configuration Update) - 日志分析 (Log Analysis) - 备份恢复 (Backup & Recovery) - 安全事件 (Security Incident) Add Kubernetes Helm Chart: - Chart.yaml, values.yaml - Deployment with health checks - Ingress with TLS support - PVC for data persistence - PDB for high availability - HPA for autoscaling - ServiceAccount configuration Add cron-backup.conf for automated backup scheduling. --- docs/runbooks/01-服务启动.md | 152 +++++++++++ docs/runbooks/02-服务停止.md | 99 +++++++ docs/runbooks/03-配置更新.md | 173 ++++++++++++ docs/runbooks/04-日志分析.md | 213 +++++++++++++++ docs/runbooks/05-备份恢复.md | 237 +++++++++++++++++ docs/runbooks/06-安全事件.md | 249 ++++++++++++++++++ kubernetes/cron-backup.conf | 54 ++++ kubernetes/user-management/Chart.yaml | 13 + kubernetes/user-management/README.md | 172 ++++++++++++ .../user-management/templates/_helpers.tpl | 60 +++++ .../user-management/templates/configmap.yaml | 27 ++ .../user-management/templates/deployment.yaml | 112 ++++++++ kubernetes/user-management/templates/hpa.yaml | 32 +++ .../user-management/templates/ingress.yaml | 46 ++++ kubernetes/user-management/templates/pdb.yaml | 17 ++ kubernetes/user-management/templates/pvc.yaml | 15 ++ .../templates/serviceaccount.yaml | 6 + kubernetes/user-management/values.yaml | 90 +++++++ 18 files changed, 1767 insertions(+) create mode 100644 docs/runbooks/01-服务启动.md create mode 100644 docs/runbooks/02-服务停止.md create mode 100644 docs/runbooks/03-配置更新.md create mode 100644 docs/runbooks/04-日志分析.md create mode 100644 docs/runbooks/05-备份恢复.md create mode 100644 docs/runbooks/06-安全事件.md create mode 100644 kubernetes/cron-backup.conf create mode 100644 
kubernetes/user-management/Chart.yaml create mode 100644 kubernetes/user-management/README.md create mode 100644 kubernetes/user-management/templates/_helpers.tpl create mode 100644 kubernetes/user-management/templates/configmap.yaml create mode 100644 kubernetes/user-management/templates/deployment.yaml create mode 100644 kubernetes/user-management/templates/hpa.yaml create mode 100644 kubernetes/user-management/templates/ingress.yaml create mode 100644 kubernetes/user-management/templates/pdb.yaml create mode 100644 kubernetes/user-management/templates/pvc.yaml create mode 100644 kubernetes/user-management/templates/serviceaccount.yaml create mode 100644 kubernetes/user-management/values.yaml diff --git a/docs/runbooks/01-服务启动.md b/docs/runbooks/01-服务启动.md new file mode 100644 index 0000000..fcbda2c --- /dev/null +++ b/docs/runbooks/01-服务启动.md @@ -0,0 +1,152 @@ +# 服务启动 Runbook + +**用途**: 新服务器部署或服务重启后启动用户管理系统 + +**适用场景**: 首次部署、服务器重启、故障恢复后 + +--- + +## 前提条件 + +- [ ] 服务器系统已安装 Docker 和 Docker Compose +- [ ] 已配置防火墙开放 8080 端口 +- [ ] 已准备好配置文件 `configs/config.yaml` +- [ ] 已设置必要的环境变量(参考 `.env.example`) + +--- + +## 启动步骤 + +### 1. 检查系统环境 + +```bash +# 检查 Docker 版本 +docker --version +docker-compose --version + +# 检查端口占用 +netstat -tlnp | grep 8080 +# 或在 Windows 上 +Get-NetTCPConnection -LocalPort 8080 +``` + +### 2. 准备配置文件 + +```bash +# 复制配置模板 +cp .env.example .env + +# 编辑配置(重点关注以下项) +vi .env +``` + +**必须配置项**: +- `JWT_SECRET` - JWT 签名密钥(生产环境必须使用强密钥) +- `ADMIN_EMAIL` - 初始管理员邮箱 +- `ADMIN_PASSWORD` - 初始管理员密码 + +### 3. 启动服务 + +```bash +# 使用 Docker Compose 启动 +docker-compose up -d + +# 查看服务状态 +docker-compose ps + +# 查看日志确认启动成功 +docker-compose logs -f +``` + +### 4. 验证服务 + +```bash +# 健康检查 +curl http://localhost:8080/api/v1/health + +# 预期响应: {"status":"ok"} + +# 检查所有端口 +curl http://localhost:8080/api/v1/health/ready +``` + +### 5. 
初始化数据库 + +首次启动时,系统会自动创建 SQLite 数据库文件 (`data/user_management.db`)。 + +```bash +# 确认数据目录存在 +ls -la data/ + +# 确认数据库文件已创建 +ls -la data/*.db +``` + +--- + +## 故障排查 + +### 服务启动失败 + +```bash +# 查看详细日志 +docker-compose logs app + +# 常见问题: +# 1. 端口被占用 -> 改端口或停止占用进程 +# 2. 配置文件错误 -> 检查 config.yaml 语法 +# 3. 权限问题 -> 检查目录权限 +``` + +### 数据库初始化失败 + +```bash +# 检查数据目录 +ls -la data/ + +# 手动初始化数据库 +mkdir -p data +chmod 755 data +``` + +### 网络/防火墙问题 + +```bash +# Linux 检查防火墙 +sudo firewall-cmd --list-ports +sudo iptables -L -n | grep 8080 + +# 测试本地连接 +curl http://127.0.0.1:8080/api/v1/health +``` + +--- + +## 回滚操作 + +如果启动失败且无法修复: + +```bash +# 停止服务 +docker-compose down + +# 查看之前运行的容器 +docker ps -a | grep user-management + +# 从备份恢复(参考 备份恢复 Runbook) +./scripts/backup/backup.sh --restore +``` + +--- + +## 验证检查清单 + +- [ ] `docker-compose ps` 显示 app 服务状态为 Up +- [ ] `curl http://localhost:8080/api/v1/health` 返回 `{"status":"ok"}` +- [ ] 可以访问管理后台 `http://localhost:8080/admin` +- [ ] 可以使用初始管理员账号登录 + +--- + +**维护日期**: 2026-04-11 +**下次审查**: 每月检查一次 diff --git a/docs/runbooks/02-服务停止.md b/docs/runbooks/02-服务停止.md new file mode 100644 index 0000000..0e26a42 --- /dev/null +++ b/docs/runbooks/02-服务停止.md @@ -0,0 +1,99 @@ +# 服务停止 Runbook + +**用途**: 正常维护停止服务或紧急停止服务 + +**适用场景**: 系统维护、配置更新、紧急故障处理 + +--- + +## 正常停止(维护场景) + +### 1. 通知用户(可选) + +如果需要停机维护,提前通知: + +```bash +# 检查当前在线用户数(通过日志估算) +docker-compose logs --since=5m app | grep -c "POST /api/v1/auth/login" +``` + +### 2. 优雅停止服务 + +```bash +# 发送停止信号(会等待现有请求处理完成) +docker-compose stop + +# 或直接 down(不会等待) +docker-compose down +``` + +### 3. 确认停止 + +```bash +# 确认没有运行的容器 +docker-compose ps + +# 确认端口已释放 +netstat -tlnp | grep 8080 +``` + +--- + +## 紧急停止(故障场景) + +当服务出现严重问题时,需要紧急停止: + +### 1. 立即停止 + +```bash +# 强制停止所有容器 +docker-compose kill + +# 如果 docker-compose 无响应,直接 kill +docker kill $(docker ps -q -f name=user-management) +``` + +### 2. 确认资源释放 + +```bash +# 确认容器已停止 +docker ps -a | grep user-management + +# 确认端口已释放 +netstat -tlnp | grep 8080 +``` + +### 3. 
记录故障现场 + +```bash +# 保存故障时的日志 +docker-compose logs > logs/emergency_$(date +%Y%m%d_%H%M%S).log + +# 保存当前数据库状态 +cp data/user_management.db data/user_management_emergency_$(date +%Y%m%d_%H%M%S).db +``` + +--- + +## 停止后的检查 + +停止服务后,确认以下内容: + +- [ ] 所有容器已停止 +- [ ] 端口 8080 已释放 +- [ ] 日志已保存 +- [ ] 数据库文件完整 +- [ ] 无残留进程 + +--- + +## 相关文档 + +- [服务启动](./01-服务启动.md) - 如何启动服务 +- [日志分析](./04-日志分析.md) - 如何分析日志排查问题 +- [备份恢复](./05-备份恢复.md) - 如何恢复数据 + +--- + +**维护日期**: 2026-04-11 +**下次审查**: 每月检查一次 diff --git a/docs/runbooks/03-配置更新.md b/docs/runbooks/03-配置更新.md new file mode 100644 index 0000000..c47281d --- /dev/null +++ b/docs/runbooks/03-配置更新.md @@ -0,0 +1,173 @@ +# 配置更新 Runbook + +**用途**: 安全地更新系统配置 + +**适用场景**: 修改系统参数、调整安全设置、更新外部服务配置 + +--- + +## 风险等级评估 + +| 风险等级 | 配置类型 | 需要审批 | 需要备份 | +|---------|---------|---------|---------| +| 低 | 日志级别、超时设置 | 否 | 否 | +| 中 | 端口、缓存设置 | 是 | 是 | +| 高 | JWT密钥、数据库路径 | 是 | 是 | + +--- + +## 配置更新步骤 + +### 1. 备份当前配置 + +```bash +# 备份当前配置文件 +cp configs/config.yaml configs/config.yaml.bak.$(date +%Y%m%d_%H%M%S) + +# 如果是 Docker 环境,备份环境变量 +docker inspect user-management-app | grep -A 50 "Env" > configs/env_backup_$(date +%Y%m%d_%H%M%S).txt +``` + +### 2. 审查变更内容 + +```bash +# 查看当前配置(生产环境慎用 cat) +cat configs/config.yaml + +# 或使用 diff 对比 +diff configs/config.yaml configs/config.yaml.bak.* +``` + +### 3. 应用配置更新 + +**方式 A: 通过环境变量更新(推荐)** + +```bash +# 设置环境变量后重启 +export JWT_SECRET="your-new-secret-here" +docker-compose up -d +``` + +**方式 B: 直接编辑配置文件** + +```bash +vi configs/config.yaml + +# 验证 YAML 语法 +python3 -c "import yaml; yaml.safe_load(open('configs/config.yaml'))" +``` + +### 4. 验证配置生效 + +```bash +# 重启服务 +docker-compose restart + +# 检查日志确认启动正常 +docker-compose logs --tail=50 | grep -i "config\|start\|error" +``` + +### 5. 
测试关键功能 + +```bash +# 测试认证功能 +curl -X POST http://localhost:8080/api/v1/auth/login \ + -H "Content-Type: application/json" \ + -d '{"username":"admin","password":"your-password"}' + +# 测试 API 调用 +curl http://localhost:8080/api/v1/health +``` + +--- + +## 高风险配置更新 + +### JWT 密钥更新 + +> **警告**: 更新 JWT 密钥会导致所有现有登录会话失效 + +```bash +# 1. 通知所有用户将断开连接 + +# 2. 备份当前配置 +cp configs/config.yaml configs/config.yaml.jwt_backup.$(date +%Y%m%d) + +# 3. 更新配置 +vi configs/config.yaml +# 修改 jwt.secret + +# 4. 重启服务 +docker-compose restart + +# 5. 确认服务正常 +curl http://localhost:8080/api/v1/health +``` + +### 数据库路径变更 + +```bash +# 1. 停止服务 +docker-compose stop + +# 2. 备份数据库 +./scripts/backup/backup.sh + +# 3. 更新配置 +vi configs/config.yaml +# 修改 database.path + +# 4. 移动数据库文件(先创建目标目录) +mkdir -p data/new_path && mv data/user_management.db data/new_path/ + +# 5. 启动服务 +docker-compose up -d + +# 6. 验证数据完整性 +sqlite3 data/new_path/user_management.db "PRAGMA integrity_check;" +``` + +--- + +## 回滚配置 + +如果配置更新后出现问题: + +```bash +# 1. 停止服务 +docker-compose stop + +# 2. 恢复备份的配置(存在多个备份时取最新的一个) +cp "$(ls -t configs/config.yaml.bak.* | head -1)" configs/config.yaml + +# 3. 如果需要,恢复数据库 +./scripts/backup/backup.sh --restore + +# 4. 重启服务 +docker-compose up -d + +# 5. 
验证 +curl http://localhost:8080/api/v1/health +``` + +--- + +## 配置变更记录 + +所有生产配置变更必须记录: + +| 日期 | 变更内容 | 变更人 | 审批人 | 回滚方案 | +|-----|---------|-------|-------|---------| +| YYYY-MM-DD | 描述变更内容 | 姓名 | 姓名 | 如需要 | + +--- + +## 相关文档 + +- [服务启动](./01-服务启动.md) - 初始配置指导 +- [备份恢复](./05-备份恢复.md) - 数据备份与恢复 + +--- + +**维护日期**: 2026-04-11 +**下次审查**: 每月检查一次 diff --git a/docs/runbooks/04-日志分析.md b/docs/runbooks/04-日志分析.md new file mode 100644 index 0000000..1a92508 --- /dev/null +++ b/docs/runbooks/04-日志分析.md @@ -0,0 +1,213 @@ +# 日志分析 Runbook + +**用途**: 排查系统问题、分析故障原因 + +**适用场景**: 服务异常、用户投诉、安全审计 + +--- + +## 日志位置 + +``` +# Docker 环境 +docker-compose logs -f app # 实时查看 +docker-compose logs app > app.log # 导出日志 + +# 本地环境 +./logs/app.log # 本地日志文件 +./logs/access.log # 访问日志 +``` + +--- + +## 日志格式 + +系统使用结构化日志格式: + +``` +2026-04-11 10:30:45 [API] 2026-04-11 10:30:45 POST /api/v1/auth/login | status: 200 | latency: 45.2ms | ip: 192.168.1.100 | user_id: 123 | trace_id: abc123 +``` + +**字段说明**: +- `timestamp` - 请求时间 +- `method` - HTTP 方法 +- `path` - 请求路径 +- `status` - HTTP 状态码 +- `latency` - 响应延迟 +- `ip` - 客户端 IP +- `user_id` - 用户 ID(未登录为 ``) +- `trace_id` - 请求追踪 ID + +--- + +## 常见问题排查 + +### 1. 服务无法访问 + +```bash +# 检查服务状态 +docker-compose ps + +# 查看最近错误日志 +docker-compose logs --tail=100 app | grep -i error + +# 检查端口监听 +netstat -tlnp | grep 8080 +``` + +### 2. 登录失败 + +```bash +# 搜索登录相关日志 +docker-compose logs --tail=500 app | grep -i "login\|auth" + +# 检查具体错误 +docker-compose logs --tail=500 app | grep "status: 401\|status: 403" + +# 检查密码验证日志 +docker-compose logs --tail=500 app | grep -i "password\|verify" +``` + +### 3. API 响应慢 + +```bash +# 搜索慢请求(latency >= 1s:以秒为单位,或毫秒数达到 4 位及以上) +docker-compose logs --tail=1000 app | grep -E "latency: [0-9]+(\.[0-9]+)?s|latency: [0-9]{4,}(\.[0-9]+)?ms" + +# 分析慢请求模式 +docker-compose logs app | grep "latency" | awk -F'latency: ' '{print $2}' | awk '{sum+=$1; count++} END {print "平均延迟:", sum/count "ms"}' +``` + +### 4. 
数据库错误 + +```bash +# 搜索数据库相关错误 +docker-compose logs --tail=500 app | grep -i "sql\|database\|sqlite" + +# 检查数据库文件 +ls -la data/*.db +sqlite3 data/user_management.db "PRAGMA integrity_check;" +``` + +### 5. 内存/资源问题 + +```bash +# 检查容器资源使用 +docker stats --no-stream + +# 查看内存相关日志 +docker-compose logs --tail=500 app | grep -i "memory\|oom\|alloc" + +# 检查 goroutine 数量 +docker-compose logs --tail=500 app | grep -i "goroutine" +``` + +--- + +## 日志分析命令 + +### 常用 grep 命令 + +```bash +# 搜索错误日志 +docker-compose logs app | grep -i error + +# 搜索特定用户的操作 +docker-compose logs app | grep "user_id: 123" + +# 搜索特定时间段的日志 +docker-compose logs --since="2026-04-11T10:00:00" app + +# 搜索特定 trace_id +docker-compose logs app | grep "trace_id: abc123" + +# 统计各状态码出现次数 +docker-compose logs app | grep -oE "status: [0-9]+" | sort | uniq -c +``` + +### 日志统计脚本 + +```bash +#!/bin/bash +# 日志统计脚本 + +echo "=== 请求统计 ===" +docker-compose logs app | grep -c "POST\|GET\|PUT\|DELETE" + +echo "=== 状态码分布 ===" +docker-compose logs app | grep -oE "status: [0-9]+" | sort | uniq -c + +echo "=== 慢请求 (>1s) ===" +docker-compose logs app | grep -E "latency: [2-9][0-9]+ms|latency: [0-9]+\.[0-9]+s" | wc -l + +echo "=== 错误请求 ===" +docker-compose logs app | grep -i "error\|fail\|panic" | wc -l +``` + +--- + +## 日志级别 + +| 级别 | 关键词 | 含义 | +|-----|-------|-----| +| DEBUG | `DEBUG` | 调试信息 | +| INFO | `INFO` | 正常信息 | +| WARN | `WARN` | 警告信息 | +| ERROR | `ERROR` | 错误信息 | + +```bash +# 设置日志级别(通过配置或环境变量) +# 生产环境建议: INFO 或 WARN +# 开发环境: DEBUG + +docker-compose logs --tail=100 app | grep -E "DEBUG|INFO|WARN|ERROR" +``` + +--- + +## 安全审计 + +### 1. 查找异常登录尝试 + +```bash +# 查找失败的登录 +docker-compose logs app | grep "status: 401" + +# 查找异地登录(同一用户不同 IP) +docker-compose logs app | grep "user_id: " | awk '{print $NF}' | sort | uniq -c | sort -rn | head -10 +``` + +### 2. 
查找敏感操作 + +```bash +# 查找密码修改 +docker-compose logs app | grep -i "password\|change" + +# 查找权限变更 +docker-compose logs app | grep -i "role\|permission\|admin" + +# 查找数据导出 +docker-compose logs app | grep -i "export\|download" +``` + +### 3. 查找恶意请求 + +```bash +# 查找 SQL 注入尝试 +docker-compose logs app | grep -i "sql\|union\|select\|drop" + +# 查找 XSS 尝试 +docker-compose logs app | grep -i "> /var/log/backup.log 2>&1 + +# 验证 crontab +crontab -l +``` + +### 设置定时任务 (Docker 环境) + +```bash +# 创建定时任务容器或使用宿主机的 cron +# 在 docker-compose.yml 中添加 cron 服务,或使用宿主机 crontab +``` + +### Windows 任务计划 + +```powershell +# 使用 PowerShell 创建计划任务 +$action = New-ScheduledTaskAction -Execute "C:\path\to\scripts\backup\backup.sh" +$trigger = New-ScheduledTaskTrigger -Daily -At "2:00AM" +Register-ScheduledTask -Action $action -Trigger $trigger -TaskName "UserManagementBackup" +``` + +--- + +## 手动备份 + +### 执行备份 + +```bash +# 基本备份 +./scripts/backup/backup.sh + +# 指定备份目录 +BACKUP_DIR=/mnt/backups ./scripts/backup/backup.sh + +# 指定数据库路径 +DB_PATH=/custom/path/user_management.db ./scripts/backup/backup.sh +``` + +### 备份输出 + +``` +[INFO] Starting backup... +[INFO] Backing up database: ./data/user_management.db +[SUCCESS] Database backed up to: /backups/user-management_20260411_020000/database.db +[INFO] Backing up config: ./configs/config.yaml +[SUCCESS] Config backed up to: /backups/user-management_20260411_020000/config.yaml +[SUCCESS] Backup completed: /backups/user-management_20260411_020000.tar.gz +[SUCCESS] Checksum: abc123... user-management_20260411_020000.tar.gz +``` + +--- + +## 备份恢复 + +### 1. 确认恢复需求 + +> **警告**: 恢复操作会覆盖当前数据! + +- [ ] 确认需要恢复的原因 +- [ ] 确认备份文件完整 +- [ ] 通知相关用户 + +### 2. 检查备份完整性 + +```bash +# 列出可用备份 +./scripts/backup/backup.sh --list + +# 验证备份 +./scripts/backup/backup.sh --verify +``` + +### 3. 
执行恢复 + +```bash +# 恢复前先停止服务 +docker-compose stop + +# 执行恢复(会提示确认) +./scripts/backup/backup.sh --restore + +# 如果需要恢复特定备份 +LATEST_BACKUP=/path/to/specific/backup.tar.gz ./scripts/backup/backup.sh --restore +``` + +### 4. 验证恢复 + +```bash +# 启动服务 +docker-compose up -d + +# 验证数据库 +sqlite3 data/user_management.db "PRAGMA integrity_check;" + +# 验证数据 +curl http://localhost:8080/api/v1/health +``` + +--- + +## 增量备份策略 + +对于数据量大的场景,可以实现增量备份: + +### 方案 A: 文件级增量 + +```bash +#!/bin/bash +# 增量备份脚本 +# 只备份自上次备份以来修改的文件 + +LAST_BACKUP=$(ls -t backups/*.tar.gz | head -1) +BACKUP_DIR="./incremental_backups" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) + +mkdir -p $BACKUP_DIR + +# 使用 rsync 进行增量备份 +rsync -av --compare-dest=$LAST_BACKUP data/ $BACKUP_DIR/incremental_$TIMESTAMP/ +``` + +### 方案 B: SQLite 在线备份 + +```bash +#!/bin/bash +# SQLite 在线备份(不需要停止服务) + +DB_PATH="./data/user_management.db" +BACKUP_PATH="./backups/incremental_$(date +%Y%m%d_%H%M%S).db" + +# 使用 SQLite 的 .backup 命令(事务一致) +sqlite3 $DB_PATH "VACUUM INTO '$BACKUP_PATH';" + +echo "增量备份完成: $BACKUP_PATH" +``` + +--- + +## 异地备份 + +### 方案 A: SCP 到远程服务器 + +```bash +#!/bin/bash +# 备份到远程服务器 + +BACKUP_FILE=$(ls -t backups/*.tar.gz | head -1) +REMOTE_USER="backup" +REMOTE_HOST="backup-server.example.com" +REMOTE_PATH="/backups/user-management" + +scp $BACKUP_FILE $REMOTE_USER@$REMOTE_HOST:$REMOTE_PATH/ +``` + +### 方案 B: 云存储 + +```bash +#!/bin/bash +# 备份到 S3 兼容存储 + +BACKUP_FILE=$(ls -t backups/*.tar.gz | head -1) + +# 使用 s3cmd +s3cmd put $BACKUP_FILE s3://my-bucket/user-management-backups/ + +# 或使用 aws cli +aws s3 cp $BACKUP_FILE s3://my-bucket/user-management-backups/ +``` + +--- + +## 灾难恢复计划 (DRP) + +### RTO (恢复时间目标): 4 小时 +### RPO (恢复点目标): 24 小时 + +### 灾难恢复步骤 + +1. **宣布灾难** - 联系运维团队和相关负责人 +2. **评估损失** - 确定数据丢失范围和时间点 +3. **启动恢复** - 按以下顺序恢复: + - 基础设施(服务器、网络) + - 最新稳定备份 + - 增量备份(如有) +4. **验证服务** - 确认所有核心功能正常 +5. 
**通知用户** - 告知恢复完成和服务可用 + +### 恢复检查清单 + +- [ ] 数据库完整恢复 +- [ ] 配置文件正确 +- [ ] 服务正常启动 +- [ ] 用户认证正常 +- [ ] 核心 API 可用 +- [ ] 数据完整性验证 + +--- + +## 相关文档 + +- [服务启动](./01-服务启动.md) - 恢复后启动服务 +- [服务停止](./02-服务停止.md) - 备份前停止服务 +- [配置更新](./03-配置更新.md) - 配置文件备份 + +--- + +**维护日期**: 2026-04-11 +**下次审查**: 每季度检查一次 +**测试频率**: 每季度执行一次恢复演练 diff --git a/docs/runbooks/06-安全事件.md b/docs/runbooks/06-安全事件.md new file mode 100644 index 0000000..564f7dd --- /dev/null +++ b/docs/runbooks/06-安全事件.md @@ -0,0 +1,249 @@ +# 安全事件 Runbook + +**用途**: 处理安全事件和漏洞响应 + +**适用场景**: 账户被盗、数据泄露、恶意攻击、权限异常 + +--- + +## 安全事件分级 + +| 级别 | 名称 | 描述 | 响应时间 | +|-----|------|------|---------| +| P0 | 严重 | 数据泄露、系统入侵、权限被完全绕过 | 立即 | +| P1 | 高危 | 账户被盗、密码泄露、疑似入侵 | 1小时内 | +| P2 | 中危 | 异常登录、权限提升尝试、API滥用 | 4小时内 | +| P3 | 低危 | 可疑行为、配置弱点、潜在风险 | 24小时内 | + +--- + +## 事件响应流程 + +``` +发现事件 → 评估确认 → 遏制影响 → 调查取证 → 修复漏洞 → 恢复服务 → 事后复盘 +``` + +--- + +## 1. 发现与评估 + +### 识别安全事件 + +**异常迹象**: +- 大量失败登录尝试 +- 异常用户活动(异地登录、时间异常) +- 未经授权的配置变更 +- 服务性能异常下降 +- 用户报告账户异常 + +### 初步评估 + +```bash +# 检查最近登录失败 +docker-compose logs --since=1h app | grep "status: 401" + +# 检查异常 IP 访问 +docker-compose logs --since=1h app | awk '{print $NF}' | grep -v "user_id" | sort | uniq -c | sort -rn + +# 检查用户权限异常 +docker-compose logs --since=1h app | grep -i "admin\|permission\|role" + +# 检查配置文件变更 +stat configs/config.yaml +ls -la configs/config.yaml.* +``` + +--- + +## 2. 遏制影响 + +### P0 严重事件 - 立即行动 + +```bash +# 1. 隔离受影响系统 +docker-compose kill + +# 2. 保存现场 +docker-compose logs > logs/security_$(date +%Y%m%d_%H%M%S).log +cp -r data data_backup_$(date +%Y%m%d_%H%M%S) + +# 3. 撤销会话 +# 如果使用 Redis,清除所有会话 +docker exec user-management-app redis-cli FLUSHALL + +# 4. 重置所有密码(紧急情况) +# 参考下面的密码重置流程 +``` + +### P1 高危事件 + +```bash +# 1. 禁用受影响账户 +docker-compose logs app | grep "user_id: XXX" # 找出受影响用户 + +# 2. 撤销可疑会话 +# 检查并清除可疑 token + +# 3. 加强监控 +# 增加日志详细程度 +``` + +--- + +## 3. 
调查取证 + +### 日志分析 + +```bash +# 导出相关日志 +docker-compose logs --since="2026-04-11T00:00:00" > logs/investigation_$(date +%Y%m%d).log + +# 分析攻击痕迹 +grep -E "error|warning|fail|invalid" logs/investigation_*.log + +# 分析攻击者行为 +docker-compose logs | grep "attacker_ip" -A 5 -B 5 + +# 检查数据库异常 +sqlite3 data/user_management.db "SELECT * FROM users WHERE updated_at > '2026-04-11';" +``` + +### 常见攻击特征 + +| 攻击类型 | 日志特征 | 检查命令 | +|---------|---------|---------| +| 暴力破解 | 大量 401 状态码 | `grep status: 401` | +| SQL 注入 | SQL 关键字在请求中 | `grep -i sql\|union\|select` | +| XSS | 脚本标签在请求中 | `grep -i > /var/log/backup.log 2>&1 + +# 每周日凌晨 3:00 执行完整备份(包含上传到远程存储) +0 3 * * 0 /opt/user-management/scripts/backup/backup.sh && \ + scp /opt/user-management/backups/latest.tar.gz backup@remote-server:/backups/ + +# 每天下午 6:00 检查备份状态并发送报告 +0 18 * * * /opt/user-management/scripts/backup/backup.sh --verify || \ + echo "Backup verification failed" | mail -s "Backup Alert" admin@example.com + +# ============================================ +# 清理任务 +# ============================================ + +# 每月 1 日凌晨 4:00 清理超过 90 天的备份 +0 4 1 * * find /opt/user-management/backups -name "*.tar.gz" -mtime +90 -delete + +# ============================================ +# 监控任务 +# ============================================ + +# 每 15 分钟检查服务健康状态 +*/15 * * * * curl -sf http://localhost:8080/api/v1/health || \ + echo "Service down at $(date)" | mail -s "Service Alert" admin@example.com + +# ============================================ +# 日志轮转配置 (/etc/logrotate.d/user-management) +# ============================================ + +/var/log/backup.log { + daily + rotate 7 + compress + delaycompress + missingok + notifempty + create 644 root root +} diff --git a/kubernetes/user-management/Chart.yaml b/kubernetes/user-management/Chart.yaml new file mode 100644 index 0000000..ae144fb --- /dev/null +++ b/kubernetes/user-management/Chart.yaml @@ -0,0 +1,13 @@ +apiVersion: v2 +name: user-management +description: A Helm chart for User 
Management System +type: application +version: 1.0.0 +appVersion: "1.0.0" +keywords: + - user-management + - authentication + - rbac +maintainers: + - name: DevOps Team + email: devops@example.com diff --git a/kubernetes/user-management/README.md b/kubernetes/user-management/README.md new file mode 100644 index 0000000..54cdf0d --- /dev/null +++ b/kubernetes/user-management/README.md @@ -0,0 +1,172 @@ +# User Management System - Helm Chart + +Kubernetes Helm Chart for deploying the User Management System. + +## Prerequisites + +- Kubernetes 1.19+ +- Helm 3.2.0+ +- ingress-nginx controller (for Ingress) +- cert-manager (for TLS, optional) + +## Installation + +```bash +# Add the repository +helm repo add user-management https://charts.example.com +helm repo update + +# Install the chart +helm install user-management user-management/user-management \ + --set config.jwtSecret="your-secret-key" \ + --set config.adminEmail="admin@example.com" +``` + +## Using with Custom Values + +```bash +# Create a values file +cat > values.yaml << EOF +replicaCount: 2 + +config: + jwtSecret: "your-production-secret-key" + adminEmail: "admin@example.com" + logLevel: "warn" + +ingress: + enabled: true + hosts: + - host: ums.example.com + paths: + - path: / + tls: + - secretName: ums-tls + hosts: + - ums.example.com + +resources: + limits: + cpu: 1000m + memory: 1Gi +EOF + +# Install with custom values +helm install user-management user-management/user-management -f values.yaml +``` + +## Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `replicaCount` | Number of replicas | `1` | +| `image.repository` | Docker image repository | `user-management` | +| `image.tag` | Docker image tag | `latest` | +| `service.type` | Service type | `ClusterIP` | +| `service.port` | Service port | `8080` | +| `ingress.enabled` | Enable Ingress | `true` | +| `ingress.className` | Ingress class | `nginx` | +| `config.jwtSecret` | JWT signing secret (required) | 
`""` | +| `config.adminEmail` | Admin email | `admin@example.com` | +| `config.logLevel` | Log level | `info` | +| `resources.limits.cpu` | CPU limit | `500m` | +| `resources.limits.memory` | Memory limit | `512Mi` | +| `persistence.enabled` | Enable PVC | `true` | +| `persistence.size` | PVC size | `5Gi` | +| `autoscaling.enabled` | Enable HPA | `false` | +| `autoscaling.minReplicas` | Min replicas | `1` | +| `autoscaling.maxReplicas` | Max replicas | `3` | + +## Production Best Practices + +### 1. Use TLS + +```bash +helm install user-management user-management/user-management \ + --set config.jwtSecret="$(openssl rand -base64 32)" \ + --set ingress.enabled=true \ + --set ingress.tls[0].secretName=ums-tls \ + --set ingress.tls[0].hosts[0]=ums.example.com +``` + +### 2. Set Resource Limits + +```bash +helm install user-management user-management/user-management \ + --set resources.limits.cpu="1000m" \ + --set resources.limits.memory="1Gi" \ + --set resources.requests.cpu="250m" \ + --set resources.requests.memory="512Mi" +``` + +### 3. Enable Autoscaling + +```bash +helm install user-management user-management/user-management \ + --set autoscaling.enabled=true \ + --set autoscaling.minReplicas=2 \ + --set autoscaling.maxReplicas=10 \ + --set autoscaling.targetCPUUtilizationPercentage=70 +``` + +### 4. Use a Strong JWT Secret + +```bash +# Generate a secure random secret +JWT_SECRET=$(openssl rand -base64 32 | tr -d '\n') + +helm install user-management user-management/user-management \ + --set config.jwtSecret="$JWT_SECRET" +``` + +## Upgrading + +```bash +# Upgrade to a new version +helm upgrade user-management user-management/user-management + +# Upgrade with new values +helm upgrade user-management user-management/user-management \ + --set config.logLevel="debug" +``` + +## Uninstall + +```bash +helm uninstall user-management + +# Note: PVC data persists by default. 
To delete all data:
+kubectl delete pvc -l app.kubernetes.io/name=user-management
+```
+
+## Troubleshooting
+
+### Pod not starting
+
+```bash
+# Check pod status
+kubectl get pods -l app.kubernetes.io/name=user-management
+
+# View pod logs
+kubectl logs -l app.kubernetes.io/name=user-management
+
+# Describe pod for events
+kubectl describe pod -l app.kubernetes.io/name=user-management
+```
+
+### Ingress not working
+
+```bash
+# Check ingress controller
+kubectl get pods -n ingress-nginx
+
+# Check ingress resource
+kubectl get ingress -l app.kubernetes.io/name=user-management
+
+# Check certificate
+kubectl get certificate -l app.kubernetes.io/name=user-management
+```
+
+## License
+
+Internal use only.
diff --git a/kubernetes/user-management/templates/_helpers.tpl b/kubernetes/user-management/templates/_helpers.tpl
new file mode 100644
index 0000000..9f9a3b3
--- /dev/null
+++ b/kubernetes/user-management/templates/_helpers.tpl
@@ -0,0 +1,61 @@
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "user-management.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully qualified app name.
+*/}}
+{{- define "user-management.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+SemVer build metadata uses "+", which is not valid in a label value, so it is replaced with "_".
+*/}}
+{{- define "user-management.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "user-management.labels" -}}
+helm.sh/chart: {{ include "user-management.chart" . }}
+{{ include "user-management.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Selector labels
+*/}}
+{{- define "user-management.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "user-management.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Create the name of the service account to use
+*/}}
+{{- define "user-management.serviceAccountName" -}}
+{{- if .Values.serviceAccount.create }}
+{{- default (include "user-management.fullname" .) .Values.serviceAccount.name }}
+{{- else }}
+{{- default "default" .Values.serviceAccount.name }}
+{{- end }}
+{{- end }}
diff --git a/kubernetes/user-management/templates/configmap.yaml b/kubernetes/user-management/templates/configmap.yaml
new file mode 100644
index 0000000..a3992cc
--- /dev/null
+++ b/kubernetes/user-management/templates/configmap.yaml
@@ -0,0 +1,28 @@
+{{- /*
+ConfigMap template - stores non-sensitive configuration
+*/ -}}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "user-management.fullname" . }}-config
+  labels:
+    {{- include "user-management.labels" . | nindent 4 }}
+data:
+  GIN_MODE: "release"
+  TZ: "Asia/Shanghai"
+  LOG_LEVEL: {{ .Values.config.logLevel | quote }}
+  ADMIN_EMAIL: {{ .Values.config.adminEmail | quote }}
+---
+{{/*
+Secret template - stores sensitive configuration.
+stringData takes the plain-text value (the API server encodes it), so it must not go through b64enc.
+*/}}
+apiVersion: v1
+kind: Secret
+metadata:
+  name: {{ include "user-management.fullname" . }}-config
+  labels:
+    {{- include "user-management.labels" . | nindent 4 }}
+type: Opaque
+stringData:
+  JWT_SECRET: {{ required "config.jwtSecret is required" .Values.config.jwtSecret | quote }}
diff --git a/kubernetes/user-management/templates/deployment.yaml b/kubernetes/user-management/templates/deployment.yaml
new file mode 100644
index 0000000..c7ff2c9
--- /dev/null
+++ b/kubernetes/user-management/templates/deployment.yaml
@@ -0,0 +1,118 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "user-management.fullname" . }}
+  labels:
+    {{- include "user-management.labels" . | nindent 4 }}
+spec:
+  # Leave the replica count to the HPA when autoscaling is enabled.
+  {{- if not .Values.autoscaling.enabled }}
+  replicas: {{ .Values.replicaCount }}
+  {{- end }}
+  selector:
+    matchLabels:
+      {{- include "user-management.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels:
+        {{- include "user-management.selectorLabels" . | nindent 8 }}
+      annotations:
+        checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
+    spec:
+      {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      serviceAccountName: {{ include "user-management.serviceAccountName" . }}
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1000
+        fsGroup: 1000
+      {{- if .Values.podAntiAffinity.enabled }}
+      affinity:
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchLabels:
+                  {{- include "user-management.selectorLabels" . | nindent 18 }}
+              topologyKey: {{ .Values.podAntiAffinity.topologyKey }}
+      {{- end }}
+      containers:
+        - name: {{ .Chart.Name }}
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop:
+                - ALL
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          ports:
+            - name: http
+              containerPort: 8080
+              protocol: TCP
+          envFrom:
+            - configMapRef:
+                name: {{ include "user-management.fullname" . }}-config
+            # JWT_SECRET lives in the Secret of the same name; expose it as an env var too.
+            - secretRef:
+                name: {{ include "user-management.fullname" . }}-config
+          {{- if .Values.livenessProbe.enabled }}
+          livenessProbe:
+            httpGet:
+              path: {{ .Values.livenessProbe.path }}
+              port: http
+            initialDelaySeconds: {{ .Values.livenessProbe.initialDelaySeconds }}
+            periodSeconds: {{ .Values.livenessProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.livenessProbe.timeoutSeconds }}
+            failureThreshold: {{ .Values.livenessProbe.failureThreshold }}
+          {{- end }}
+          {{- if .Values.readinessProbe.enabled }}
+          readinessProbe:
+            httpGet:
+              path: {{ .Values.readinessProbe.path }}
+              port: http
+            initialDelaySeconds: {{ .Values.readinessProbe.initialDelaySeconds }}
+            periodSeconds: {{ .Values.readinessProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.readinessProbe.timeoutSeconds }}
+            failureThreshold: {{ .Values.readinessProbe.failureThreshold }}
+          {{- end }}
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
+          volumeMounts:
+            - name: data
+              mountPath: /app/data
+            - name: config
+              mountPath: /app/configs
+              readOnly: true
+            - name: tmp
+              mountPath: /tmp
+      volumes:
+        - name: data
+          {{- if .Values.persistence.enabled }}
+          persistentVolumeClaim:
+            claimName: {{ include "user-management.fullname" . }}-data
+          {{- else }}
+          emptyDir: {}
+          {{- end }}
+        - name: config
+          secret:
+            secretName: {{ include "user-management.fullname" . }}-config
+        - name: tmp
+          emptyDir: {}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "user-management.fullname" . }}
+  labels:
+    {{- include "user-management.labels" . | nindent 4 }}
+spec:
+  type: {{ .Values.service.type }}
+  ports:
+    - port: {{ .Values.service.port }}
+      targetPort: http
+      protocol: TCP
+      name: http
+  selector:
+    {{- include "user-management.selectorLabels" .
| nindent 4 }}
diff --git a/kubernetes/user-management/templates/hpa.yaml b/kubernetes/user-management/templates/hpa.yaml
new file mode 100644
index 0000000..7255b7f
--- /dev/null
+++ b/kubernetes/user-management/templates/hpa.yaml
@@ -0,0 +1,32 @@
+{{- if .Values.autoscaling.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "user-management.fullname" . }}
+  labels:
+    {{- include "user-management.labels" . | nindent 4 }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "user-management.fullname" . }}
+  minReplicas: {{ .Values.autoscaling.minReplicas }}
+  maxReplicas: {{ .Values.autoscaling.maxReplicas }}
+  metrics:
+    {{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
+    {{- end }}
+    {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: memory
+        target:
+          type: Utilization
+          averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
+    {{- end }}
+{{- end }}
diff --git a/kubernetes/user-management/templates/ingress.yaml b/kubernetes/user-management/templates/ingress.yaml
new file mode 100644
index 0000000..19228b6
--- /dev/null
+++ b/kubernetes/user-management/templates/ingress.yaml
@@ -0,0 +1,50 @@
+{{- if .Values.ingress.enabled -}}
+{{- $fullName := include "user-management.fullname" . -}}
+{{- $svcPort := .Values.service.port -}}
+{{- /* The fixed annotations below are nginx-specific, so any other class is rejected at render time. */ -}}
+{{- if and .Values.ingress.className (not (eq .Values.ingress.className "nginx")) }}
+{{- fail "ERROR: ingress.className must be 'nginx' for this chart compatibility" }}
+{{- end }}
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: {{ $fullName }}
+  labels:
+    {{- include "user-management.labels" . | nindent 4 }}
+  annotations:
+    {{- with .Values.ingress.annotations }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+    nginx.ingress.kubernetes.io/proxy-body-size: "10m"
+    nginx.ingress.kubernetes.io/proxy-read-timeout: "300"
+    nginx.ingress.kubernetes.io/proxy-send-timeout: "300"
+spec:
+  {{- if .Values.ingress.className }}
+  ingressClassName: {{ .Values.ingress.className }}
+  {{- end }}
+  {{- if .Values.ingress.tls }}
+  tls:
+    {{- range .Values.ingress.tls }}
+    - hosts:
+        {{- range .hosts }}
+        - {{ . | quote }}
+        {{- end }}
+      secretName: {{ .secretName }}
+    {{- end }}
+  {{- end }}
+  rules:
+    {{- range .Values.ingress.hosts }}
+    - host: {{ .host | quote }}
+      http:
+        paths:
+          {{- range .paths }}
+          - path: {{ .path }}
+            pathType: {{ .pathType | default "Prefix" }}
+            backend:
+              service:
+                name: {{ $fullName }}
+                port:
+                  number: {{ $svcPort }}
+          {{- end }}
+    {{- end }}
+{{- end }}
diff --git a/kubernetes/user-management/templates/pdb.yaml b/kubernetes/user-management/templates/pdb.yaml
new file mode 100644
index 0000000..46eb183
--- /dev/null
+++ b/kubernetes/user-management/templates/pdb.yaml
@@ -0,0 +1,17 @@
+{{- if .Values.podDisruptionBudget.enabled }}
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: {{ include "user-management.fullname" . }}
+  labels:
+    {{- include "user-management.labels" . | nindent 4 }}
+spec:
+  {{- if .Values.podDisruptionBudget.minAvailable }}
+  minAvailable: {{ .Values.podDisruptionBudget.minAvailable }}
+  {{- else }}
+  maxUnavailable: 1
+  {{- end }}
+  selector:
+    matchLabels:
+      {{- include "user-management.selectorLabels" . | nindent 6 }}
+{{- end }}
diff --git a/kubernetes/user-management/templates/pvc.yaml b/kubernetes/user-management/templates/pvc.yaml
new file mode 100644
index 0000000..5083964
--- /dev/null
+++ b/kubernetes/user-management/templates/pvc.yaml
@@ -0,0 +1,15 @@
+{{- if .Values.persistence.enabled -}}
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ include "user-management.fullname" . }}-data
+  labels:
+    {{- include "user-management.labels" .
| nindent 4 }}
+spec:
+  accessModes:
+    - {{ .Values.persistence.accessMode | quote }}
+  resources:
+    requests:
+      storage: {{ .Values.persistence.size | quote }}
+  storageClassName: {{ .Values.persistence.storageClass | quote }}
+{{- end }}
diff --git a/kubernetes/user-management/templates/serviceaccount.yaml b/kubernetes/user-management/templates/serviceaccount.yaml
new file mode 100644
index 0000000..7757370
--- /dev/null
+++ b/kubernetes/user-management/templates/serviceaccount.yaml
@@ -0,0 +1,7 @@
+{{- /* ServiceAccount named by the serviceAccountName helper; always rendered. NOTE(review): values.yaml has no serviceAccount section - confirm the helper does not read .Values.serviceAccount. */ -}}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "user-management.serviceAccountName" . }}
+  labels:
+    {{- include "user-management.labels" . | nindent 4 }}
diff --git a/kubernetes/user-management/values.yaml b/kubernetes/user-management/values.yaml
new file mode 100644
index 0000000..b66f9fc
--- /dev/null
+++ b/kubernetes/user-management/values.yaml
@@ -0,0 +1,98 @@
+# Default values for the user-management chart.
+
+replicaCount: 1
+
+image:
+  repository: user-management
+  # NOTE(review): `latest` is a floating tag; with IfNotPresent a node may keep
+  # running a stale cached image. Consider pinning a release tag.
+  tag: latest
+  pullPolicy: IfNotPresent
+
+imagePullSecrets: []
+nameOverride: ""
+fullnameOverride: ""
+
+service:
+  type: ClusterIP
+  port: 8080
+
+# Ingress; templates/ingress.yaml only accepts className "nginx" (or empty).
+ingress:
+  enabled: true
+  className: nginx
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    nginx.ingress.kubernetes.io/ssl-redirect: "true"
+  hosts:
+    - host: ums.example.com
+      paths:
+        - path: /
+          pathType: Prefix
+  tls:
+    - secretName: ums-tls
+      hosts:
+        - ums.example.com
+
+# Container resource requests and limits.
+resources:
+  limits:
+    cpu: 500m
+    memory: 512Mi
+  requests:
+    cpu: 100m
+    memory: 256Mi
+
+# Claim backing the /app/data volume; when disabled the pod uses an emptyDir.
+persistence:
+  enabled: true
+  storageClass: standard
+  accessMode: ReadWriteOnce
+  size: 5Gi
+
+# Pod anti-affinity settings
+podAntiAffinity:
+  enabled: true
+  topologyKey: kubernetes.io/hostname
+
+# Readiness and liveness probes (HTTP GET against the container's `http` port)
+readinessProbe:
+  enabled: true
+  path: /api/v1/health/ready
+  initialDelaySeconds: 10
+  periodSeconds: 5
+  timeoutSeconds: 3
+  failureThreshold: 3
+
+livenessProbe:
+  enabled: true
+  path: /api/v1/health
+  initialDelaySeconds: 30
+  periodSeconds: 10
+  timeoutSeconds: 5
+  failureThreshold: 3
+
+# Pod Disruption Budget
+# NOTE(review): with replicaCount 1, minAvailable 1 leaves no pod that may be
+# evicted, so voluntary disruptions such as node drains will be blocked.
+podDisruptionBudget:
+  enabled: true
+  minAvailable: 1
+
+# Horizontal Pod Autoscaler (autoscaling/v2); targets are utilization percentages
+autoscaling:
+  enabled: false
+  minReplicas: 1
+  maxReplicas: 3
+  targetCPUUtilizationPercentage: 70
+  targetMemoryUtilizationPercentage: 80
+
+# Application configuration
+# jwtSecret is empty by default; supply it at install time, do not commit it.
+config:
+  jwtSecret: ""
+  adminEmail: "admin@example.com"
+  logLevel: "info"
+
+# Ingress controller version (for annotation compatibility)
+ingressControllerVersion: "1.0"