23 KiB
23 KiB
部署和运维指南
概述
本文档描述用户管理系统的部署方案和运维规范,包括容器化部署、集群部署、监控告警、日志管理等。
1. 部署方案
1.1 系统架构
┌─────────────────────────────────────────────────────────┐
│ 负载均衡 (Nginx) │
└────────────────────┬────────────────────────────────────┘
│
┌────────────┴────────────┐
│ │
┌───────▼────────┐ ┌────────▼────────┐
│ 应用实例 1 │ │ 应用实例 N │
│ (Port 8080) │ │ (Port 8080) │
└───────┬────────┘ └────────┬────────┘
│ │
└───────────┬───────────┘
│
┌───────────┴───────────┐
│ │
┌───────▼────────┐ ┌────────▼────────┐
│ MySQL │ │ Redis │
│ (主从复制) │ │ (哨兵模式) │
└────────────────┘ └─────────────────┘
1.2 Docker 部署
单机 Docker 部署
docker-compose.yml(单机版)
# Single-node stack: one application container using a SQLite database file.
version: '3.8'

services:
  user-management:
    image: user-management-system:1.0.0
    container_name: user-ms
    ports:
      - "8080:8080"
    # Bind mounts: data, configuration and logs live in host directories
    # next to this compose file.
    volumes:
      - ./data:/app/data
      - ./config:/app/config
      - ./logs:/app/logs
    environment:
      - SPRING_PROFILES_ACTIVE=docker
      - DATABASE_TYPE=sqlite
      - DATABASE_PATH=/app/data/user_management.db
    restart: unless-stopped
    # The container is reported healthy once the readiness endpoint answers.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health/ready"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
启动命令
# Start the stack in the background
docker-compose up -d
# Follow the logs
docker-compose logs -f
# Stop and remove the containers
docker-compose down
# Stop and also remove volumes declared by the compose file. The single-node
# stack uses bind mounts only, so ./data, ./config and ./logs on the host are
# NOT deleted by this command.
docker-compose down -v
集群 Docker 部署
目录结构
deployment/
├── docker/
│ ├── auth-service/
│ │ └── Dockerfile
│ ├── user-service/
│ │ └── Dockerfile
│ ├── permission-service/
│ │ └── Dockerfile
│ └── gateway/
│ └── Dockerfile
├── docker-compose.yml
├── docker-compose.prod.yml
└── init/
└── init.sql
Dockerfile 示例(Go)
# ---- Build stage: compile a static Linux binary ----
FROM golang:1.21-alpine AS builder
WORKDIR /app

# Copy the module files first so the dependency layer is cached until
# go.mod / go.sum change
COPY go.mod go.sum ./
RUN go mod download

# Copy the source tree
COPY . .

# Build without cgo; -w -s strips debug information to shrink the binary
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \
    -ldflags="-w -s" \
    -o user-service \
    ./cmd/user-service

# ---- Runtime stage ----
# Pin the base image so rebuilds are reproducible (was alpine:latest)
FROM alpine:3.19

# CA certificates and time zone data for the service, plus an unprivileged
# account so the service does not run as root
RUN apk --no-cache add ca-certificates tzdata \
    && addgroup -S app \
    && adduser -S -G app app

WORKDIR /app
COPY --from=builder --chown=app:app /app/user-service .

USER app
EXPOSE 8080
CMD ["./user-service"]
docker-compose.yml
# Cluster stack: MySQL, Redis, three backend services, the gateway and the
# monitoring pair (Prometheus + Grafana), all attached to one bridge network.
# Relative paths are resolved against the directory containing this file
# (deployment/).
# NOTE(review): credentials below are example values committed in plain text;
# replace them before using this file outside a local environment.
version: '3.8'

services:
  mysql:
    image: mysql:8.0
    container_name: user-ms-mysql
    environment:
      MYSQL_ROOT_PASSWORD: root_password
      MYSQL_DATABASE: user_management
      MYSQL_USER: app_user
      MYSQL_PASSWORD: app_password
    ports:
      - "3306:3306"
    volumes:
      - mysql-data:/var/lib/mysql
      # Mounted into the image's init directory
      - ./init/init.sql:/docker-entrypoint-initdb.d/init.sql
    networks:
      - user-ms-network

  redis:
    image: redis:7-alpine
    container_name: user-ms-redis
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    networks:
      - user-ms-network

  auth-service:
    build:
      context: ./docker/auth-service
    container_name: user-ms-auth-service
    environment:
      - SPRING_PROFILES_ACTIVE=prod
      - DB_HOST=mysql
      - DB_PORT=3306
      - DB_NAME=user_management
      - DB_USER=app_user
      - DB_PASSWORD=app_password
      - REDIS_HOST=redis
      - REDIS_PORT=6379
    ports:
      - "8081:8080"
    depends_on:
      - mysql
      - redis
    networks:
      - user-ms-network

  user-service:
    build:
      context: ./docker/user-service
    container_name: user-ms-user-service
    environment:
      - SPRING_PROFILES_ACTIVE=prod
      - DB_HOST=mysql
      - DB_PORT=3306
      - DB_NAME=user_management
      - DB_USER=app_user
      - DB_PASSWORD=app_password
      - REDIS_HOST=redis
      - REDIS_PORT=6379
    ports:
      - "8082:8080"
    depends_on:
      - mysql
      - redis
    networks:
      - user-ms-network

  permission-service:
    build:
      context: ./docker/permission-service
    container_name: user-ms-permission-service
    environment:
      - SPRING_PROFILES_ACTIVE=prod
      - DB_HOST=mysql
      - DB_PORT=3306
      - DB_NAME=user_management
      - DB_USER=app_user
      - DB_PASSWORD=app_password
      - REDIS_HOST=redis
      - REDIS_PORT=6379
    ports:
      - "8083:8080"
    depends_on:
      - mysql
      - redis
    networks:
      - user-ms-network

  gateway:
    build:
      context: ./docker/gateway
    container_name: user-ms-gateway
    # Backends are addressed by service name on the shared network
    environment:
      - AUTH_SERVICE_URL=http://auth-service:8080
      - USER_SERVICE_URL=http://user-service:8080
      - PERMISSION_SERVICE_URL=http://permission-service:8080
    ports:
      - "8080:8080"
    depends_on:
      - auth-service
      - user-service
      - permission-service
    networks:
      - user-ms-network

  prometheus:
    image: prom/prometheus:latest
    container_name: user-ms-prometheus
    ports:
      - "9090:9090"
    volumes:
      # Was ./deployment/prometheus/...; this file already lives in
      # deployment/, so that prefix pointed at deployment/deployment/...
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus-data:/prometheus
    networks:
      - user-ms-network

  grafana:
    image: grafana/grafana:latest
    container_name: user-ms-grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana-data:/var/lib/grafana
      # Same path correction as for Prometheus above
      - ./grafana/provisioning:/etc/grafana/provisioning
    networks:
      - user-ms-network

volumes:
  mysql-data:
  redis-data:
  prometheus-data:
  grafana-data:

networks:
  user-ms-network:
    driver: bridge
启动命令
# Development environment
docker-compose up -d
# Production environment (uses the production compose file)
docker-compose -f docker-compose.prod.yml up -d
# Follow the logs
docker-compose logs -f
# Stop the services
docker-compose down
# Stop the services and remove the volumes declared by the compose file
docker-compose down -v
1.3 Kubernetes 部署
Helm Charts 结构
deployment/kubernetes/helm/user-management-system/
├── Chart.yaml
├── values.yaml
├── values-prod.yaml
└── templates/
├── _helpers.tpl
├── deployment.yaml
├── service.yaml
├── ingress.yaml
├── configmap.yaml
├── secret.yaml
├── hpa.yaml
└── pdb.yaml
values.yaml
# Default chart values
replicaCount: 2

image:
  repository: example.com/user-management-system
  pullPolicy: IfNotPresent
  tag: "1.0.0"

imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""

serviceAccount:
  create: true
  annotations: {}
  name: ""

podAnnotations: {}

podSecurityContext: {}
  # fsGroup: 2000

securityContext: {}
  # capabilities:
  #   drop:
  #     - ALL
  # readOnlyRootFilesystem: true
  # runAsNonRoot: true
  # runAsUser: 1000

service:
  type: ClusterIP
  port: 8080

ingress:
  enabled: true
  className: "nginx"
  annotations: {}
    # kubernetes.io/ingress.class: nginx
    # cert-manager.io/cluster-issuer: letsencrypt-prod
  hosts:
    - host: api.example.com
      paths:
        - path: /
          pathType: Prefix
  tls: []
  #  - secretName: user-ms-tls
  #    hosts:
  #      - api.example.com

resources:
  limits:
    cpu: 1000m
    memory: 512Mi
  requests:
    cpu: 500m
    memory: 256Mi

autoscaling:
  enabled: true
  minReplicas: 2
  maxReplicas: 10
  targetCPUUtilizationPercentage: 70
  targetMemoryUtilizationPercentage: 80

nodeSelector: {}
tolerations: []
affinity: {}

# Database connection settings
database:
  host: mysql-service
  port: 3306
  name: user_management
  username: app_user
  password: app_password

# Redis connection settings
redis:
  host: redis-service
  port: 6379
  password: ""
  database: 0

# Extra environment variables for the application container
env:
  - name: SPRING_PROFILES_ACTIVE
    value: "prod"
  - name: LOG_LEVEL
    value: "info"

# Health checks
livenessProbe:
  httpGet:
    path: /health/live
    port: 8080
  initialDelaySeconds: 30
  periodSeconds: 10
  timeoutSeconds: 5
  failureThreshold: 3

readinessProbe:
  httpGet:
    path: /health/ready
    port: 8080
  initialDelaySeconds: 10
  periodSeconds: 5
  timeoutSeconds: 3
  failureThreshold: 3
部署命令
# Install the Helm chart into the production namespace (created if missing)
helm install user-ms ./user-management-system \
-f values-prod.yaml \
--namespace production \
--create-namespace
# Upgrade the existing release
helm upgrade user-ms ./user-management-system \
-f values-prod.yaml \
--namespace production
# Roll back to revision 1
helm rollback user-ms 1 --namespace production
# Uninstall the release
helm uninstall user-ms --namespace production
1.4 传统安装包部署
目录结构
user-management-system-1.0.0/
├── bin/
│ ├── auth-service
│ ├── user-service
│ ├── permission-service
│ └── gateway
├── config/
│ ├── application.yml
│ └── application-prod.yml
├── lib/
│ ├── *.jar
│ └── *.so
├── scripts/
│ ├── install.sh
│ ├── start.sh
│ ├── stop.sh
│ └── restart.sh
└── README.md
安装脚本(install.sh)
#!/bin/bash
# Installs the user management system under /opt/user-management-system and
# registers a systemd unit for it.
# Run from the unpacked release directory: it copies ./bin, ./config, ./lib
# and ./scripts. It creates a system user and writes to /etc/systemd/system.
set -e

echo "开始安装用户管理系统..."

# Require a Java runtime on PATH
if ! command -v java &> /dev/null; then
    echo "错误: 未检测到 Java 环境"
    exit 1
fi

# Require the mysql client on PATH
if ! command -v mysql &> /dev/null; then
    echo "错误: 未检测到 MySQL"
    exit 1
fi

# Create the service account (system user, no login shell) if it is missing
if ! id -u userms &> /dev/null; then
    echo "创建系统用户 userms..."
    useradd -r -s /bin/false userms
fi

# Create the directory layout. scripts/ is included because the systemd unit
# below runs start.sh / stop.sh from the install directory.
INSTALL_DIR="/opt/user-management-system"
echo "安装目录: $INSTALL_DIR"
mkdir -p "$INSTALL_DIR"/{bin,config,lib,logs,scripts}

# Copy the release payload
echo "复制文件..."
cp -r bin/* "$INSTALL_DIR/bin/"
cp -r config/* "$INSTALL_DIR/config/"
cp -r lib/* "$INSTALL_DIR/lib/"
cp -r scripts/* "$INSTALL_DIR/scripts/"

# Hand ownership to the service account and mark binaries/scripts executable
chown -R userms:userms "$INSTALL_DIR"
chmod +x "$INSTALL_DIR"/bin/*
chmod +x "$INSTALL_DIR"/scripts/*.sh

# Write the systemd unit. The heredoc delimiter is unquoted on purpose so
# that $INSTALL_DIR is expanded into the generated file.
cat > /etc/systemd/system/user-ms.service <<EOF
[Unit]
Description=User Management System
After=network.target mysql.service

[Service]
Type=forking
User=userms
WorkingDirectory=$INSTALL_DIR
ExecStart=$INSTALL_DIR/scripts/start.sh
ExecStop=$INSTALL_DIR/scripts/stop.sh
Restart=on-failure
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF

# Make systemd pick up the new unit
systemctl daemon-reload

echo "安装完成!"
echo "请修改配置文件 $INSTALL_DIR/config/application-prod.yml"
echo "启动服务: systemctl start user-ms"
echo "设置开机启动: systemctl enable user-ms"
启动脚本(start.sh)
#!/bin/bash
# Starts the four services (auth, user, permission, gateway) in the
# background. Each service gets its own log file and PID file under
# $INSTALL_DIR/logs.
INSTALL_DIR="/opt/user-management-system"
LOG_DIR="$INSTALL_DIR/logs"
CONFIG_FILE="$INSTALL_DIR/config/application-prod.yml"

# Abort rather than launch from an unexpected working directory
cd "$INSTALL_DIR" || { echo "错误: 无法进入目录 $INSTALL_DIR" >&2; exit 1; }

# The output redirections below fail if the log directory is missing
mkdir -p "$LOG_DIR"

echo "启动用户管理系统..."

# start_service BINARY LABEL
#   Launches bin/BINARY detached from the terminal, prints LABEL with the new
#   PID and records the PID in logs/BINARY.pid.
start_service() {
    local binary="$1"
    local label="$2"
    local pid

    nohup "$INSTALL_DIR/bin/$binary" \
        --spring.config.location="$CONFIG_FILE" \
        > "$LOG_DIR/$binary.log" 2>&1 &
    pid=$!

    echo "${label}启动 (PID: $pid)"
    echo "$pid" > "$LOG_DIR/$binary.pid"
}

# Backends first, gateway last (same order as before)
start_service auth-service "认证服务"
start_service user-service "用户服务"
start_service permission-service "权限服务"
start_service gateway "网关"

echo "启动完成!"
2. 监控与告警
2.1 Prometheus 配置
prometheus.yml
# Scrape and rule-evaluation cadence shared by all jobs
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Alerts are forwarded to this Alertmanager instance
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# Alert rule files are loaded from the alerts/ directory
rule_files:
  - "alerts/*.yml"

scrape_configs:
  # Application services
  - job_name: 'user-ms-auth'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['auth-service:8080']

  - job_name: 'user-ms-user'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['user-service:8080']

  - job_name: 'user-ms-permission'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['permission-service:8080']

  # Infrastructure exporters
  - job_name: 'mysql'
    static_configs:
      - targets: ['mysql-exporter:9104']

  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']
告警规则(alerts.yml)
groups:
  - name: user-ms-alerts
    interval: 30s
    rules:
      # High error rate: share of 5xx responses among all requests, per
      # instance. The previous expression compared the absolute 5xx request
      # rate with 0.05, which is not the 5% ratio the description states.
      - alert: HighErrorRate
        expr: |
          sum by (instance) (rate(http_requests_total{status=~"5.."}[5m]))
            / sum by (instance) (rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "高错误率告警"
          description: "{{ $labels.instance }} 的错误率超过 5%"

      # High latency: P99 request duration above 0.5 seconds
      - alert: HighResponseTime
        expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "高响应时间告警"
          description: "{{ $labels.instance }} 的 P99 响应时间超过 500ms"

      # CPU usage: CPU seconds consumed per second by the process.
      # NOTE(review): 0.7 equals 70% only relative to a single core; confirm
      # the threshold for multi-core hosts.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) > 0.7
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "高 CPU 使用率"
          description: "{{ $labels.instance }} 的 CPU 使用率超过 70%"

      # Heap usage: used heap divided by maximum heap
      - alert: HighMemoryUsage
        expr: (jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"}) > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "高内存使用率"
          description: "{{ $labels.instance }} 的堆内存使用率超过 80%"

      # Connection pool: active connections divided by pool maximum
      - alert: DatabaseConnectionPoolExhausted
        expr: hikaricp_connections_active / hikaricp_connections_max > 0.9
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "数据库连接池耗尽"
          description: "{{ $labels.instance }} 的数据库连接池使用率超过 90%"

      # Unusually low number of online users
      - alert: LowOnlineUsers
        expr: system_online_users < 10
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "在线用户数异常"
          description: "在线用户数低于 10,可能存在服务异常"
2.2 Grafana 仪表盘
核心指标面板
| 面板名称 | 指标 | 说明 |
|---|---|---|
| 总用户数 | system_total_users | 系统总用户数 |
| 在线用户数 | system_online_users | 当前在线用户数 |
| 今日注册数 | increase(user_register_total[1d]) | 今日注册用户数 |
| 今日登录数 | increase(user_login_total[1d]) | 今日登录次数 |
| QPS | rate(http_requests_total[1m]) | 每秒请求数 |
| 响应时间 (P99) | histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) | P99 响应时间 |
| 错误率 | rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) | 错误率 |
| CPU 使用率 | rate(process_cpu_seconds_total[5m]) | CPU 使用率 |
| 内存使用率 | jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"} | 内存使用率 |
2.3 日志管理
日志配置(Logback)
<configuration>
    <!-- One layout shared by the console and file appenders -->
    <property name="LOG_PATTERN"
              value="%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n" />

    <!-- Console output -->
    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>${LOG_PATTERN}</pattern>
        </encoder>
    </appender>

    <!-- Daily-rolled file: at most 30 days of history and 10GB in total -->
    <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
        <file>logs/application.log</file>
        <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
            <fileNamePattern>logs/application.%d{yyyy-MM-dd}.log</fileNamePattern>
            <maxHistory>30</maxHistory>
            <totalSizeCap>10GB</totalSizeCap>
        </rollingPolicy>
        <encoder>
            <pattern>${LOG_PATTERN}</pattern>
        </encoder>
    </appender>

    <!-- Everything at INFO and above goes to both appenders -->
    <root level="INFO">
        <appender-ref ref="CONSOLE" />
        <appender-ref ref="FILE" />
    </root>
</configuration>
ELK 集成
# Filebeat configuration
filebeat.inputs:
  - type: log
    enabled: true
    paths:
      - /opt/user-management-system/logs/*.log
    # Custom fields; without fields_under_root they are stored under the
    # "fields" key of each event (fields.app, fields.env)
    fields:
      app: user-management-system
      env: production

output.elasticsearch:
  hosts: ["elasticsearch:9200"]
  indices:
    - index: "user-ms-%{+yyyy.MM.dd}"
      # The condition must use the nested name fields.app; a bare "app"
      # field does not exist on the event, so the rule would never match
      when.contains:
        fields.app: "user-management-system"

setup.template.name: "user-ms"
setup.template.pattern: "user-ms-*"
3. 运维操作
3.1 日常巡检
巡检清单
| 检查项 | 检查方法 | 正常值 | 异常处理 |
|---|---|---|---|
| 服务状态 | systemctl status | Active | 重启服务 |
| 磁盘空间 | df -h | 使用率 < 80% | 清理日志 |
| 内存使用 | free -h | 使用率 < 80% | 扩容或优化 |
| CPU 使用 | top | 使用率 < 70% | 扩容或优化 |
| 数据库连接 | SHOW PROCESSLIST | 连接数 < 100 | 优化连接池 |
| Redis 连接 | redis-cli info clients | 连接数正常 | 扩容 Redis |
| API 响应时间 | curl -w @curl-format.txt | < 500ms | 优化代码 |
| 错误日志 | tail -f error.log | 无新错误 | 排查问题 |
3.2 备份与恢复
数据库备份
#!/bin/bash
# backup-mysql.sh
# Writes a gzip-compressed full dump of the database to
# $BACKUP_DIR/<db>_<timestamp>.sql.gz and prunes dumps older than 7 days.
#
# pipefail makes a mysqldump failure fail the whole pipeline; without it only
# gzip's exit status is seen and a broken dump would go unnoticed.
set -euo pipefail

BACKUP_DIR="/backup/mysql"
DATE=$(date +%Y%m%d_%H%M%S)
DB_NAME="user_management"
DB_USER="root"
DB_PASSWORD="your_password"

# Braces are required: "$DB_NAME_$DATE" expands the (unset) variable DB_NAME_
# and silently drops the database name from the file name.
BACKUP_FILE="$BACKUP_DIR/${DB_NAME}_${DATE}.sql.gz"

mkdir -p "$BACKUP_DIR"

# Full dump. On failure remove the partial file and stop before pruning, so
# older good backups are never deleted after a failed run.
if ! mysqldump -u"$DB_USER" -p"$DB_PASSWORD" "$DB_NAME" | gzip > "$BACKUP_FILE"; then
    rm -f "$BACKUP_FILE"
    echo "备份失败: $BACKUP_FILE" >&2
    exit 1
fi

# Delete backups older than 7 days
find "$BACKUP_DIR" -name "*.sql.gz" -mtime +7 -delete

echo "备份完成: $BACKUP_FILE"
数据恢复
# Decompress the backup file (gunzip replaces the .gz file with the .sql file)
gunzip user_management_20260310_120000.sql.gz
# Load the dump into the database (prompts for the root password)
mysql -u root -p user_management < user_management_20260310_120000.sql
3.3 版本升级
升级流程
# 1. Back up the database
./scripts/backup-mysql.sh
# 2. Stop the services
./scripts/stop.sh
# 3. Keep a copy of the current installation for rollback
# NOTE(review): if /opt/user-management-system.bak already exists, cp -r copies
# into it as a subdirectory — remove a stale .bak first.
cp -r /opt/user-management-system /opt/user-management-system.bak
# 4. Deploy the new version
# NOTE(review): assumes the archive unpacks into /opt/user-management-system;
# the release directory shown in this guide is versioned — confirm the layout.
unzip user-management-system-1.1.0.zip -d /opt/
# 5. Run the database migration
mysql -u root -p user_management < migration/1.1.0.sql
# 6. Start the services
./scripts/start.sh
# 7. Verify the health endpoints
curl http://localhost:8080/health
curl http://localhost:8080/health/live
curl http://localhost:8080/health/ready
回滚流程
# 1. Stop the services
./scripts/stop.sh
# 2. Remove the new version
rm -rf /opt/user-management-system
# 3. Restore the previous version
mv /opt/user-management-system.bak /opt/user-management-system
# 4. Restore the database. Backups in /backup/mysql are gzip-compressed
#    (*.sql.gz), so decompress on the fly instead of reading a .sql file
#    that does not exist there.
gunzip -c /backup/mysql/user_management_20260310_120000.sql.gz | mysql -u root -p user_management
# 5. Start the services
./scripts/start.sh
3.4 故障排查
常见问题
| 问题 | 可能原因 | 排查方法 | 解决方案 |
|---|---|---|---|
| 服务启动失败 | 端口被占用 | netstat -tunlp | 修改端口或停止占用进程 |
| 数据库连接失败 | 网络问题 | ping、telnet | 检查网络和防火墙 |
| 响应慢 | 数据库查询慢 | 慢查询日志 | 优化 SQL、加索引 |
| 内存溢出 | 内存泄漏 | jmap -heap | 优化代码、扩容 |
| 登录失败 | 验证码过期 | 检查 Redis | 调整验证码有效期 |
4. 性能优化
4.1 数据库优化
索引优化
-- Inspect the slow-query settings
SHOW VARIABLES LIKE 'slow_query%';
SHOW VARIABLES LIKE 'long_query_time';
-- Analyse the execution plan of a query
EXPLAIN SELECT * FROM users WHERE username = 'john_doe';
-- Add indexes on the lookup columns
CREATE INDEX idx_username ON users(username);
CREATE INDEX idx_email ON users(email);
CREATE INDEX idx_phone ON users(phone);
查询优化
-- Use a covering index
-- NOTE(review): this query is only covered if an index containing
-- (status, username, email) exists; no such index is created in this guide.
SELECT id, username, email FROM users WHERE status = 1;
-- Avoid SELECT *: name only the columns that are needed
SELECT id, username FROM users WHERE id = ?;
-- Paginate with LIMIT (explicit columns here too, per the advice above)
SELECT id, username, email FROM users ORDER BY id LIMIT 20 OFFSET 0;
4.2 Redis 优化
缓存策略
# Per-cache expiry (ttl, in seconds) and entry limits
cache:
  # User profile cache
  user_info:
    ttl: 3600  # 1 hour
    max_size: 10000

  # Permission cache
  user_permissions:
    ttl: 1800  # 30 minutes
    max_size: 5000

  # Token blacklist
  token_blacklist:
    ttl: 2592000  # 30 days
    max_size: 50000
Redis 配置
# redis.conf
# Cap memory at 2 GB; once the cap is reached, evict the least recently used
# keys across the whole keyspace.
maxmemory 2gb
maxmemory-policy allkeys-lru
# RDB snapshots: save after 900 s if >= 1 key changed, after 300 s if >= 10
# keys changed, after 60 s if >= 10000 keys changed.
save 900 1
save 300 10
save 60 10000
4.3 应用优化
JVM 参数优化
# JVM options must come before -jar: everything after the jar path is passed
# to the application as program arguments and is not applied by the JVM.
java \
  -Xms512m \
  -Xmx2g \
  -XX:+UseG1GC \
  -XX:MaxGCPauseMillis=200 \
  -XX:+HeapDumpOnOutOfMemoryError \
  -XX:HeapDumpPath=/opt/logs/heap_dump.hprof \
  -jar app.jar
连接池优化
# HikariCP pool sizing and timeouts (timeouts in milliseconds)
datasource:
  hikari:
    maximum-pool-size: 50
    minimum-idle: 10
    connection-timeout: 30000  # 30 seconds
    idle-timeout: 600000  # 10 minutes
    max-lifetime: 1800000  # 30 minutes
5. 安全加固
5.1 防火墙配置
# Open only the required ports: HTTP, HTTPS and SSH
firewall-cmd --permanent --add-port=80/tcp
firewall-cmd --permanent --add-port=443/tcp
firewall-cmd --permanent --add-port=22/tcp
# --permanent rules only take effect after a reload
firewall-cmd --reload
# Restrict database access: accept port 3306/tcp only from 10.0.0.0/8
firewall-cmd --permanent --add-rich-rule='rule family="ipv4" source address="10.0.0.0/8" port port="3306" protocol="tcp" accept'
firewall-cmd --reload
5.2 SSL/TLS 配置
# HTTPS virtual host for the API
server {
    listen 443 ssl http2;
    server_name api.example.com;

    # Certificate and private key
    ssl_certificate /path/to/cert.pem;
    ssl_certificate_key /path/to/key.pem;

    # Protocol versions and cipher selection
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;
    ssl_prefer_server_ciphers on;

    # TLS session reuse
    ssl_session_cache shared:SSL:10m;
    ssl_session_timeout 10m;

    # HSTS for one year, including subdomains
    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
}
6. 监控告警联系人
| 级别 | 联系人 | 通知方式 |
|---|---|---|
| Critical | 运维团队 | 电话 + 短信 + 邮件 |
| Warning | 开发团队 | 邮件 + 钉钉/企业微信 |
| Info | 项目经理 | 邮件 |
本文档持续更新中,如有疑问请联系运维团队。