## 📋 Overview

This document describes the deployment and operations plan for the Atlas Mapper enterprise application, covering environment preparation, deployment strategy, monitoring configuration, incident handling, and performance tuning.
## 🏗️ Deployment Architecture

### Production Environment Architecture
```mermaid
graph TB
    subgraph "Production deployment architecture"
        subgraph "Load balancing tier"
            LB1[Primary load balancer]
            LB2[Standby load balancer]
        end
        subgraph "Application tier"
            subgraph "Application cluster A"
                APP1[App server 1]
                APP2[App server 2]
            end
            subgraph "Application cluster B"
                APP3[App server 3]
                APP4[App server 4]
            end
        end
        subgraph "Data storage tier"
            subgraph "Database cluster"
                DB1[Primary database]
                DB2[Replica database 1]
                DB3[Replica database 2]
            end
            subgraph "Cache cluster"
                REDIS1[Redis primary]
                REDIS2[Redis replica 1]
                REDIS3[Redis replica 2]
            end
        end
        subgraph "Monitoring and operations tier"
            PROM[Prometheus]
            GRAF[Grafana]
            ELK[ELK Stack]
            ZIPKIN[Zipkin]
        end
        subgraph "Infrastructure tier"
            K8S[Kubernetes cluster]
            DOCKER[Docker containers]
            STORAGE[Shared storage]
        end
    end

    LB1 --> APP1
    LB1 --> APP2
    LB2 --> APP3
    LB2 --> APP4
    APP1 --> DB1
    APP2 --> DB2
    APP3 --> DB3
    APP4 --> REDIS1
    APP1 --> PROM
    APP2 --> GRAF
    APP3 --> ELK
    APP4 --> ZIPKIN
    K8S --> DOCKER
    DOCKER --> STORAGE
```
### Deployment Workflow
```mermaid
sequenceDiagram
    participant Dev as Development team
    participant CI as CI/CD system
    participant Registry as Image registry
    participant K8s as Kubernetes
    participant Monitor as Monitoring system
    participant Ops as Operations team

    Dev->>CI: Commit code
    CI->>CI: Code checks and tests
    CI->>Registry: Build and push image
    CI->>K8s: Trigger deployment
    K8s->>K8s: Rolling update
    K8s->>Monitor: Health checks
    Monitor->>Monitor: Performance monitoring
    alt Deployment succeeds
        Monitor->>Ops: Send success notification
        K8s->>K8s: Clean up old versions
    else Deployment fails
        Monitor->>Ops: Send alert
        K8s->>K8s: Automatic rollback
        Ops->>Dev: Report the problem
    end
    Note over Dev,Ops: Continuous deployment and monitoring flow
```
## 🚀 Environment Preparation

### 1. Base Environment Requirements

#### Hardware Configuration
```yaml
# Minimum production configuration
production:
  application_servers:
    cpu: 8 cores
    memory: 16GB
    disk: 200GB SSD
    network: 1Gbps
    count: 4
  database_servers:
    cpu: 16 cores
    memory: 32GB
    disk: 1TB SSD (RAID 10)
    network: 10Gbps
    count: 3 (1 primary, 2 replicas)
  cache_servers:
    cpu: 4 cores
    memory: 8GB
    disk: 100GB SSD
    network: 1Gbps
    count: 3
  monitoring_servers:
    cpu: 8 cores
    memory: 16GB
    disk: 500GB SSD
    network: 1Gbps
    count: 2

# Test environment configuration
testing:
  application_servers:
    cpu: 4 cores
    memory: 8GB
    disk: 100GB SSD
    count: 2
  database_servers:
    cpu: 8 cores
    memory: 16GB
    disk: 500GB SSD
    count: 1
```
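As a quick sanity check before provisioning, a host can be compared against the application-server minimums in the table above. The following is an illustrative sketch (not part of the official tooling); the 8-core / 16 GB figures are taken from the production `application_servers` entry:

```bash
#!/bin/bash
# check-hardware.sh - illustrative pre-flight check against the minimum app-server spec above
MIN_CPU=8        # cores (production application_servers)
MIN_MEM_GB=16    # GB (production application_servers)

cpu_cores=$(nproc)
mem_kb=$(grep MemTotal /proc/meminfo | awk '{print $2}')
# Allow ~5% slack because MemTotal reports slightly less than the installed RAM
min_mem_kb=$(( MIN_MEM_GB * 1024 * 1024 * 95 / 100 ))

echo "CPU cores: ${cpu_cores} (minimum ${MIN_CPU})"
echo "Memory:    $(( mem_kb / 1024 / 1024 )) GB (minimum ${MIN_MEM_GB} GB)"

[ "${cpu_cores}" -ge "${MIN_CPU}" ] || { echo "FAIL: not enough CPU cores"; exit 1; }
[ "${mem_kb}" -ge "${min_mem_kb}" ] || { echo "FAIL: not enough memory"; exit 1; }
echo "Host meets the minimum application-server specification"
```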
#### Software Environment
```bash
# Operating system
OS: Ubuntu 20.04 LTS / CentOS 8

# Container runtime
Docker: 20.10+
Kubernetes: 1.21+

# Java runtime
JDK: OpenJDK 8 (Zulu 8.62+)
JVM options: -Xms2g -Xmx8g -XX:+UseG1GC

# Databases
MySQL: 8.0+
Redis: 6.2+

# Monitoring components
Prometheus: 2.30+
Grafana: 8.0+
Elasticsearch: 7.15+
Zipkin: 2.23+
```
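Before initialization it helps to confirm that the versions actually installed match the list above. A minimal check, assuming the usual CLIs are on the PATH:

```bash
# Illustrative version check for the components listed above
docker --version               # expect 20.10 or newer
kubectl version --client       # expect 1.21 or newer
java -version 2>&1 | head -1   # expect an OpenJDK 8 build
mysql --version                # expect 8.0 or newer
redis-server --version         # expect 6.2 or newer
```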
### 2. Environment Initialization Script
```bash
#!/bin/bash
# Environment initialization script - init-environment.sh

set -e

echo "Initializing the Atlas Mapper deployment environment..."

# 1. System update and base packages
update_system() {
    echo "Updating system packages..."
    if command -v apt-get &> /dev/null; then
        apt-get update && apt-get upgrade -y
        apt-get install -y curl wget git vim htop iotop net-tools
    elif command -v yum &> /dev/null; then
        yum update -y
        yum install -y curl wget git vim htop iotop net-tools
    fi
}

# 2. Install Docker
install_docker() {
    echo "Installing Docker..."
    curl -fsSL https://get.docker.com -o get-docker.sh
    sh get-docker.sh

    # Configure Docker
    mkdir -p /etc/docker
    cat > /etc/docker/daemon.json << EOF
{
    "registry-mirrors": ["https://mirror.ccs.tencentyun.com"],
    "log-driver": "json-file",
    "log-opts": {
        "max-size": "100m",
        "max-file": "3"
    },
    "storage-driver": "overlay2"
}
EOF

    systemctl enable docker
    systemctl start docker

    # Add the current user to the docker group
    usermod -aG docker $USER
}

# 3. Install Kubernetes
install_kubernetes() {
    echo "Installing Kubernetes..."

    # Install kubeadm, kubelet and kubectl
    # NOTE: the legacy apt.kubernetes.io repository shown here has been deprecated;
    # new installations should use the community-owned pkgs.k8s.io repositories instead.
    curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
    cat <<EOF >/etc/apt/sources.list.d/kubernetes.list
deb https://apt.kubernetes.io/ kubernetes-xenial main
EOF
    apt-get update
    apt-get install -y kubelet kubeadm kubectl
    apt-mark hold kubelet kubeadm kubectl

    # Configure kubelet
    echo 'KUBELET_EXTRA_ARGS="--cgroup-driver=systemd"' > /etc/default/kubelet
    systemctl enable kubelet
}

# 4. Tune system parameters
configure_system() {
    echo "Configuring system parameters..."

    # Kernel parameter tuning
    cat >> /etc/sysctl.conf << EOF
# Atlas Mapper tuning parameters
net.core.somaxconn = 65535
net.core.netdev_max_backlog = 5000
net.ipv4.tcp_max_syn_backlog = 65535
net.ipv4.tcp_fin_timeout = 10
net.ipv4.tcp_tw_reuse = 1
net.ipv4.tcp_keepalive_time = 600
vm.swappiness = 1
vm.max_map_count = 262144
fs.file-max = 1000000
EOF
    sysctl -p

    # File descriptor limits
    cat >> /etc/security/limits.conf << EOF
* soft nofile 65535
* hard nofile 65535
* soft nproc 65535
* hard nproc 65535
EOF

    # Create application directories
    mkdir -p /opt/atlas-mapper/{config,logs,data,backup}
    chown -R 1000:1000 /opt/atlas-mapper
}

# 5. Install monitoring agents
install_monitoring() {
    echo "Installing monitoring components..."

    # Install Node Exporter
    wget https://github.com/prometheus/node_exporter/releases/download/v1.3.1/node_exporter-1.3.1.linux-amd64.tar.gz
    tar xvf node_exporter-1.3.1.linux-amd64.tar.gz
    cp node_exporter-1.3.1.linux-amd64/node_exporter /usr/local/bin/

    # Create the systemd service
    cat > /etc/systemd/system/node_exporter.service << EOF
[Unit]
Description=Node Exporter
After=network.target

[Service]
Type=simple
ExecStart=/usr/local/bin/node_exporter
Restart=always

[Install]
WantedBy=multi-user.target
EOF

    systemctl enable node_exporter
    systemctl start node_exporter
}

# Run initialization
main() {
    update_system
    install_docker
    install_kubernetes
    configure_system
    install_monitoring

    echo "Environment initialization complete!"
    echo "Log in again for the docker group membership to take effect."
}

main "$@"
```
## 📦 Application Deployment

### 1. Docker Image Build
```dockerfile
# Dockerfile
FROM openjdk:8-jre-alpine

LABEL maintainer="杨杨杨大侠 <mastery@example.com>"
LABEL version="1.0.0"
LABEL description="Atlas Mapper Enterprise Application"

# Set the timezone; curl is needed by HEALTHCHECK and gettext provides envsubst for entrypoint.sh
RUN apk add --no-cache tzdata curl gettext && \
    cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo "Asia/Shanghai" > /etc/timezone

# Create the application user
RUN addgroup -g 1000 atlas && \
    adduser -D -s /bin/sh -u 1000 -G atlas atlas

# Create application directories
WORKDIR /opt/atlas-mapper
RUN mkdir -p logs config data temp && \
    chown -R atlas:atlas /opt/atlas-mapper

# Copy application files
COPY --chown=atlas:atlas target/atlas-mapper-*.jar app.jar
COPY --chown=atlas:atlas docker/entrypoint.sh entrypoint.sh
COPY --chown=atlas:atlas config/ config/

# Make the entrypoint executable
RUN chmod +x entrypoint.sh

# JVM tuning defaults
ENV JAVA_OPTS="-Xms2g -Xmx4g -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -Xloggc:logs/gc.log"
ENV SPRING_PROFILES_ACTIVE="production"

# Health check (path includes the /atlas-mapper context path configured in application.yml)
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:8080/atlas-mapper/actuator/health || exit 1

# Switch to the application user
USER atlas

# Expose ports
EXPOSE 8080 8081

# Start the application
ENTRYPOINT ["./entrypoint.sh"]
```
```bash
#!/bin/sh
# docker/entrypoint.sh - container startup script

set -e

# Wait for the database to become available
wait_for_database() {
    echo "Waiting for the database..."
    while ! nc -z ${DB_HOST:-localhost} ${DB_PORT:-3306}; do
        echo "Database not ready, retrying in 5 seconds..."
        sleep 5
    done
    echo "Database connection OK"
}

# Wait for Redis to become available
wait_for_redis() {
    echo "Waiting for Redis..."
    while ! nc -z ${REDIS_HOST:-localhost} ${REDIS_PORT:-6379}; do
        echo "Redis not ready, retrying in 5 seconds..."
        sleep 5
    done
    echo "Redis connection OK"
}

# Initialize application configuration
init_config() {
    echo "Initializing application configuration..."

    # Substitute environment variables into the configuration template
    envsubst < config/application-template.yml > config/application.yml

    # JVM options
    export JAVA_OPTS="${JAVA_OPTS} -Dspring.profiles.active=${SPRING_PROFILES_ACTIVE}"
    export JAVA_OPTS="${JAVA_OPTS} -Dfile.encoding=UTF-8"
    export JAVA_OPTS="${JAVA_OPTS} -Djava.security.egd=file:/dev/./urandom"

    # Optional JMX monitoring
    if [ "${ENABLE_JMX:-false}" = "true" ]; then
        export JAVA_OPTS="${JAVA_OPTS} -Dcom.sun.management.jmxremote"
        export JAVA_OPTS="${JAVA_OPTS} -Dcom.sun.management.jmxremote.port=9999"
        export JAVA_OPTS="${JAVA_OPTS} -Dcom.sun.management.jmxremote.authenticate=false"
        export JAVA_OPTS="${JAVA_OPTS} -Dcom.sun.management.jmxremote.ssl=false"
    fi
}

# Start the application
start_application() {
    echo "Starting the Atlas Mapper application..."
    echo "JVM options: ${JAVA_OPTS}"
    exec java ${JAVA_OPTS} -jar app.jar
}

# Main flow
main() {
    wait_for_database
    wait_for_redis
    init_config
    start_application
}

main "$@"
```
### 2. Kubernetes Deployment Configuration
```yaml
# k8s/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: atlas-mapper
  labels:
    name: atlas-mapper
    environment: production
---
# k8s/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: atlas-mapper-config
  namespace: atlas-mapper
data:
  application.yml: |
    server:
      port: 8080
      servlet:
        context-path: /atlas-mapper
    spring:
      application:
        name: atlas-mapper
      profiles:
        active: production
      datasource:
        url: jdbc:mysql://${DB_HOST}:${DB_PORT}/${DB_NAME}?useSSL=true&serverTimezone=Asia/Shanghai
        username: ${DB_USERNAME}
        password: ${DB_PASSWORD}
        driver-class-name: com.mysql.cj.jdbc.Driver
        hikari:
          maximum-pool-size: 20
          minimum-idle: 5
          connection-timeout: 30000
          idle-timeout: 600000
          max-lifetime: 1800000
      redis:
        host: ${REDIS_HOST}
        port: ${REDIS_PORT}
        password: ${REDIS_PASSWORD}
        timeout: 5000
        lettuce:
          pool:
            max-active: 20
            max-idle: 10
            min-idle: 5
    management:
      endpoints:
        web:
          exposure:
            include: health,info,metrics,prometheus
      endpoint:
        health:
          show-details: always
      metrics:
        export:
          prometheus:
            enabled: true
---
# k8s/secret.yaml
apiVersion: v1
kind: Secret
metadata:
  name: atlas-mapper-secret
  namespace: atlas-mapper
type: Opaque
data:
  db-username: YXRsYXM=                 # base64 encoded 'atlas'
  db-password: YXRsYXNfcGFzc3dvcmQ=     # base64 encoded 'atlas_password'
  redis-password: cmVkaXNfcGFzc3dvcmQ=  # base64 encoded 'redis_password'
---
# k8s/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: atlas-mapper
  namespace: atlas-mapper
  labels:
    app: atlas-mapper
    version: v1.0.0
spec:
  replicas: 4
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 1
  selector:
    matchLabels:
      app: atlas-mapper
  template:
    metadata:
      labels:
        app: atlas-mapper
        version: v1.0.0
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/atlas-mapper/actuator/prometheus"
    spec:
      containers:
        - name: atlas-mapper
          image: atlas-mapper:1.0.0
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 8080
              name: http
            - containerPort: 9999
              name: jmx
          env:
            - name: SPRING_PROFILES_ACTIVE
              value: "production"
            - name: DB_HOST
              value: "mysql-service"
            - name: DB_PORT
              value: "3306"
            - name: DB_NAME
              value: "atlas_mapper"
            - name: DB_USERNAME
              valueFrom:
                secretKeyRef:
                  name: atlas-mapper-secret
                  key: db-username
            - name: DB_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: atlas-mapper-secret
                  key: db-password
            - name: REDIS_HOST
              value: "redis-service"
            - name: REDIS_PORT
              value: "6379"
            - name: REDIS_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: atlas-mapper-secret
                  key: redis-password
            - name: ENABLE_JMX
              value: "true"
            - name: JAVA_OPTS
              value: "-Xms2g -Xmx4g -XX:+UseG1GC -XX:MaxGCPauseMillis=200"
          resources:
            requests:
              memory: "2Gi"
              cpu: "1000m"
            limits:
              memory: "4Gi"
              cpu: "2000m"
          livenessProbe:
            httpGet:
              path: /atlas-mapper/actuator/health
              port: 8080
            initialDelaySeconds: 120
            periodSeconds: 30
            timeoutSeconds: 10
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /atlas-mapper/actuator/health
              port: 8080
            initialDelaySeconds: 60
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          volumeMounts:
            - name: config-volume
              mountPath: /opt/atlas-mapper/config
            - name: logs-volume
              mountPath: /opt/atlas-mapper/logs
            - name: temp-volume
              mountPath: /opt/atlas-mapper/temp
      volumes:
        - name: config-volume
          configMap:
            name: atlas-mapper-config
        - name: logs-volume
          emptyDir: {}
        - name: temp-volume
          emptyDir: {}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - atlas-mapper
                topologyKey: kubernetes.io/hostname
---
# k8s/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: atlas-mapper-service
  namespace: atlas-mapper
  labels:
    app: atlas-mapper
spec:
  type: ClusterIP
  ports:
    - port: 8080
      targetPort: 8080
      protocol: TCP
      name: http
    - port: 9999
      targetPort: 9999
      protocol: TCP
      name: jmx
  selector:
    app: atlas-mapper
---
# k8s/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: atlas-mapper-ingress
  namespace: atlas-mapper
  annotations:
    kubernetes.io/ingress.class: nginx
    nginx.ingress.kubernetes.io/rewrite-target: /
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    nginx.ingress.kubernetes.io/proxy-body-size: "50m"
    nginx.ingress.kubernetes.io/proxy-connect-timeout: "600"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
    nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
spec:
  tls:
    - hosts:
        - atlas-mapper.example.com
      secretName: atlas-mapper-tls
  rules:
    - host: atlas-mapper.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: atlas-mapper-service
                port:
                  number: 8080
---
# k8s/hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: atlas-mapper-hpa
  namespace: atlas-mapper
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: atlas-mapper
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Percent
          value: 50
          periodSeconds: 60
```
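Rather than hand-maintaining base64 values in `k8s/secret.yaml`, the same Secret can be regenerated from literals. The credentials below are the placeholder values decoded from the manifest above:

```bash
# Generate the Secret manifest from literal values (placeholders from the example above)
kubectl create secret generic atlas-mapper-secret \
  --namespace atlas-mapper \
  --from-literal=db-username=atlas \
  --from-literal=db-password=atlas_password \
  --from-literal=redis-password=redis_password \
  --dry-run=client -o yaml > k8s/secret.yaml
```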
### 3. Deployment Script
```bash
#!/bin/bash
# deploy.sh - automated deployment script

set -e

# Configuration
NAMESPACE="atlas-mapper"
IMAGE_TAG="${1:-latest}"
REGISTRY="registry.example.com"
IMAGE_NAME="${REGISTRY}/atlas-mapper:${IMAGE_TAG}"

# Colored output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

log_info() {
    echo -e "${GREEN}[INFO]${NC} $1"
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

# Check prerequisites
check_prerequisites() {
    log_info "Checking deployment prerequisites..."

    # kubectl available?
    if ! command -v kubectl &> /dev/null; then
        log_error "kubectl is not installed"
        exit 1
    fi

    # Cluster reachable?
    if ! kubectl cluster-info &> /dev/null; then
        log_error "Unable to connect to the Kubernetes cluster"
        exit 1
    fi

    # Image available?
    if ! docker pull ${IMAGE_NAME} &> /dev/null; then
        log_error "Image ${IMAGE_NAME} does not exist"
        exit 1
    fi

    log_info "Prerequisite checks passed"
}

# Create the namespace
create_namespace() {
    log_info "Creating namespace..."
    kubectl apply -f k8s/namespace.yaml
}

# Deploy configuration
deploy_config() {
    log_info "Applying configuration..."
    kubectl apply -f k8s/configmap.yaml
    kubectl apply -f k8s/secret.yaml
}

# Deploy the application
deploy_application() {
    log_info "Deploying the application..."

    # Update the image tag
    sed -i "s|image: atlas-mapper:.*|image: ${IMAGE_NAME}|g" k8s/deployment.yaml

    kubectl apply -f k8s/deployment.yaml
    kubectl apply -f k8s/service.yaml
    kubectl apply -f k8s/ingress.yaml
    kubectl apply -f k8s/hpa.yaml
}

# Wait for the rollout to finish
wait_for_deployment() {
    log_info "Waiting for the rollout to complete..."

    if kubectl rollout status deployment/atlas-mapper -n ${NAMESPACE} --timeout=600s; then
        log_info "Rollout completed successfully"
    else
        log_error "Rollout timed out or failed"
        exit 1
    fi
}

# Health check
health_check() {
    log_info "Running health checks..."

    # Wait for the pods to become ready
    kubectl wait --for=condition=ready pod -l app=atlas-mapper -n ${NAMESPACE} --timeout=300s

    # Check the public endpoint
    ENDPOINT=$(kubectl get ingress atlas-mapper-ingress -n ${NAMESPACE} -o jsonpath='{.spec.rules[0].host}')

    for i in {1..10}; do
        if curl -f -s "https://${ENDPOINT}/atlas-mapper/actuator/health" > /dev/null; then
            log_info "Health check passed"
            return 0
        fi
        log_warn "Health check failed, retry ${i}/10..."
        sleep 30
    done

    log_error "Health check failed"
    return 1
}

# Verify deployment status
verify_deployment() {
    log_info "Verifying deployment status..."

    # Pod status
    kubectl get pods -n ${NAMESPACE} -l app=atlas-mapper

    # Service status
    kubectl get svc -n ${NAMESPACE}

    # Ingress status
    kubectl get ingress -n ${NAMESPACE}

    # HPA status
    kubectl get hpa -n ${NAMESPACE}
}

# Roll back the deployment
rollback_deployment() {
    log_warn "Rolling back the deployment..."
    kubectl rollout undo deployment/atlas-mapper -n ${NAMESPACE}
    kubectl rollout status deployment/atlas-mapper -n ${NAMESPACE} --timeout=300s
    log_info "Rollback complete"
}

# Clean up all deployed resources
cleanup_deployment() {
    log_warn "Cleaning up deployment resources..."
    kubectl delete -f k8s/ --ignore-not-found=true
    kubectl delete namespace ${NAMESPACE} --ignore-not-found=true
    log_info "Cleanup complete"
}

# Main entry point
main() {
    case "${2:-deploy}" in
        "deploy")
            check_prerequisites
            create_namespace
            deploy_config
            deploy_application
            wait_for_deployment
            if ! health_check; then
                log_error "Health check failed, rolling back..."
                rollback_deployment
                exit 1
            fi
            verify_deployment
            log_info "Deployment completed successfully!"
            ;;
        "rollback")
            rollback_deployment
            ;;
        "cleanup")
            cleanup_deployment
            ;;
        "status")
            verify_deployment
            ;;
        *)
            echo "Usage: $0 <image_tag> [deploy|rollback|cleanup|status]"
            echo "Example: $0 v1.0.0 deploy"
            exit 1
            ;;
    esac
}

main "$@"
```
## 📊 Monitoring Configuration

### 1. Prometheus Configuration
```yaml
# monitoring/prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "atlas-mapper-rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Atlas Mapper application metrics
  - job_name: 'atlas-mapper'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - atlas-mapper
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name

  # JVM metrics
  - job_name: 'atlas-mapper-jmx'
    static_configs:
      - targets: ['atlas-mapper-service:9999']
    metrics_path: /metrics
    scrape_interval: 30s

  # Node metrics
  - job_name: 'node-exporter'
    kubernetes_sd_configs:
      - role: node
    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics
---
# monitoring/atlas-mapper-rules.yml
groups:
  - name: atlas-mapper-alerts
    rules:
      # Availability
      - alert: AtlasMapperDown
        expr: up{job="atlas-mapper"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Atlas Mapper instance down"
          description: "Atlas Mapper instance {{ $labels.instance }} has been down for more than 1 minute"

      # High error rate
      - alert: AtlasMapperHighErrorRate
        expr: rate(atlas_mapper_errors_total[5m]) / rate(atlas_mapper_mappings_total[5m]) > 0.05
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Atlas Mapper error rate is too high"
          description: "Atlas Mapper error rate is {{ $value | humanizePercentage }}, above the 5% threshold"

      # Latency
      - alert: AtlasMapperHighLatency
        expr: histogram_quantile(0.95, rate(atlas_mapper_mapping_duration_seconds_bucket[5m])) > 1
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Atlas Mapper latency is too high"
          description: "Atlas Mapper 95th percentile latency is {{ $value }}s, above the 1 second threshold"

      # Memory usage
      - alert: AtlasMapperHighMemoryUsage
        expr: atlas_mapper_memory_usage / atlas_mapper_memory_total > 0.85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Atlas Mapper memory usage is too high"
          description: "Atlas Mapper memory usage is {{ $value | humanizePercentage }}, above the 85% threshold"

      # JVM GC
      - alert: AtlasMapperHighGCTime
        expr: rate(jvm_gc_collection_seconds_sum[5m]) > 0.1
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Atlas Mapper GC time is too high"
          description: "Atlas Mapper is spending {{ $value | humanizePercentage }} of its time in GC, above the 10% threshold"

      # Cache hit rate
      - alert: AtlasMapperLowCacheHitRate
        expr: atlas_mapper_cache_hit_rate < 0.7
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "Atlas Mapper cache hit rate is too low"
          description: "Atlas Mapper cache hit rate is {{ $value | humanizePercentage }}, below the 70% threshold"
```
### 2. Grafana Dashboard
```json
{
  "dashboard": {
    "id": null,
    "title": "Atlas Mapper Monitoring Dashboard",
    "tags": ["atlas-mapper", "monitoring"],
    "timezone": "Asia/Shanghai",
    "panels": [
      {
        "id": 1,
        "title": "System Overview",
        "type": "stat",
        "targets": [
          {
            "expr": "up{job=\"atlas-mapper\"}",
            "legendFormat": "Instance status"
          },
          {
            "expr": "sum(rate(atlas_mapper_mappings_total[5m]))",
            "legendFormat": "Mapping TPS"
          },
          {
            "expr": "sum(rate(atlas_mapper_errors_total[5m])) / sum(rate(atlas_mapper_mappings_total[5m]))",
            "legendFormat": "Error rate"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "color": {
              "mode": "thresholds"
            },
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 0.01},
                {"color": "red", "value": 0.05}
              ]
            }
          }
        }
      },
      {
        "id": 2,
        "title": "Mapping Operations Trend",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(atlas_mapper_mappings_total[1m])) by (type)",
            "legendFormat": "{{type}}"
          }
        ],
        "yAxes": [
          {
            "label": "ops/sec",
            "min": 0
          }
        ]
      },
      {
        "id": 3,
        "title": "Latency Distribution",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.50, sum(rate(atlas_mapper_mapping_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "P50"
          },
          {
            "expr": "histogram_quantile(0.95, sum(rate(atlas_mapper_mapping_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "P95"
          },
          {
            "expr": "histogram_quantile(0.99, sum(rate(atlas_mapper_mapping_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "P99"
          }
        ]
      },
      {
        "id": 4,
        "title": "Memory Usage",
        "type": "graph",
        "targets": [
          {
            "expr": "atlas_mapper_memory_used",
            "legendFormat": "Used memory"
          },
          {
            "expr": "atlas_mapper_memory_total",
            "legendFormat": "Total memory"
          }
        ]
      },
      {
        "id": 5,
        "title": "Cache Performance",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(atlas_mapper_cache_hits[5m])",
            "legendFormat": "Cache hits"
          },
          {
            "expr": "rate(atlas_mapper_cache_misses[5m])",
            "legendFormat": "Cache misses"
          }
        ]
      },
      {
        "id": 6,
        "title": "JVM Metrics",
        "type": "graph",
        "targets": [
          {
            "expr": "jvm_memory_used_bytes{area=\"heap\"}",
            "legendFormat": "Heap used"
          },
          {
            "expr": "jvm_memory_max_bytes{area=\"heap\"}",
            "legendFormat": "Heap max"
          },
          {
            "expr": "rate(jvm_gc_collection_seconds_sum[5m])",
            "legendFormat": "GC time"
          }
        ]
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "30s"
  }
}
```
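The dashboard can be imported through the Grafana UI, or pushed with the Grafana HTTP API. A sketch assuming the JSON above is saved as `atlas-mapper-dashboard.json` and an API token is available (URL, token variable and file name are illustrative):

```bash
# Push the dashboard to Grafana via the dashboards API
curl -X POST "https://grafana.example.com/api/dashboards/db" \
  -H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
  -H "Content-Type: application/json" \
  -d @atlas-mapper-dashboard.json
```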
## 🚨 Incident Handling

### 1. Common Fault Diagnosis
```bash
#!/bin/bash
# troubleshoot.sh - fault diagnosis script

NAMESPACE="atlas-mapper"

# Check pod status
check_pods() {
    echo "=== Pod status ==="
    kubectl get pods -n ${NAMESPACE} -o wide

    echo -e "\n=== Pods not in Running state ==="
    kubectl get pods -n ${NAMESPACE} --field-selector=status.phase!=Running

    # Collect pod logs
    for pod in $(kubectl get pods -n ${NAMESPACE} -o jsonpath='{.items[*].metadata.name}'); do
        echo -e "\n=== Logs for pod ${pod} ==="
        kubectl logs ${pod} -n ${NAMESPACE} --tail=50
    done
}

# Check services
check_services() {
    echo "=== Service status ==="
    kubectl get svc -n ${NAMESPACE}

    echo -e "\n=== Ingress status ==="
    kubectl get ingress -n ${NAMESPACE}

    echo -e "\n=== Endpoints status ==="
    kubectl get endpoints -n ${NAMESPACE}
}

# Check resource usage
check_resources() {
    echo "=== Resource usage ==="
    kubectl top pods -n ${NAMESPACE}

    echo -e "\n=== Node resources ==="
    kubectl top nodes
}

# Check cluster events
check_events() {
    echo "=== Cluster events ==="
    kubectl get events -n ${NAMESPACE} --sort-by='.lastTimestamp'
}

# Network connectivity tests
test_connectivity() {
    echo "=== Network connectivity tests ==="

    # Test the database connection
    kubectl run mysql-test --image=mysql:8.0 --rm -it --restart=Never -n ${NAMESPACE} -- \
        mysql -h mysql-service -u atlas -p atlas_mapper -e "SELECT 1"

    # Test the Redis connection
    kubectl run redis-test --image=redis:6.2 --rm -it --restart=Never -n ${NAMESPACE} -- \
        redis-cli -h redis-service ping
}

# Performance analysis
performance_analysis() {
    echo "=== Performance analysis ==="

    # JVM heap dumps
    for pod in $(kubectl get pods -n ${NAMESPACE} -l app=atlas-mapper -o jsonpath='{.items[*].metadata.name}'); do
        echo "Generating heap dump for pod ${pod}..."
        kubectl exec ${pod} -n ${NAMESPACE} -- jcmd 1 GC.run_finalization
        kubectl exec ${pod} -n ${NAMESPACE} -- jcmd 1 GC.run
        kubectl exec ${pod} -n ${NAMESPACE} -- jcmd 1 GC.heap_dump /tmp/heapdump.hprof
    done

    # Thread dumps
    for pod in $(kubectl get pods -n ${NAMESPACE} -l app=atlas-mapper -o jsonpath='{.items[*].metadata.name}'); do
        echo "Generating thread dump for pod ${pod}..."
        kubectl exec ${pod} -n ${NAMESPACE} -- jcmd 1 Thread.print > ${pod}-threaddump.txt
    done
}

# Main entry point
main() {
    case "${1:-all}" in
        "pods")
            check_pods
            ;;
        "services")
            check_services
            ;;
        "resources")
            check_resources
            ;;
        "events")
            check_events
            ;;
        "network")
            test_connectivity
            ;;
        "performance")
            performance_analysis
            ;;
        "all")
            check_pods
            check_services
            check_resources
            check_events
            ;;
        *)
            echo "Usage: $0 [pods|services|resources|events|network|performance|all]"
            exit 1
            ;;
    esac
}

main "$@"
```
### 2. Incident Handling Playbook
```markdown
# Atlas Mapper Incident Handling Playbook

## Application fails to start

### Symptoms
- Pod status is CrashLoopBackOff
- Application logs show startup exceptions

### Possible causes
1. Database connection failure
2. Redis connection failure
3. Configuration errors
4. Insufficient resources

### Steps
1. Check the pod logs: `kubectl logs <pod-name> -n atlas-mapper`
2. Check the configuration: `kubectl get configmap atlas-mapper-config -o yaml`
3. Verify database connectivity: run the connectivity tests
4. Check resource limits: `kubectl describe pod <pod-name>`

## Performance problems

### Symptoms
- Response times are too long
- Memory usage is too high
- CPU usage is too high

### Steps
1. Confirm the problem in the monitoring dashboards
2. Take a JVM heap dump to analyse memory leaks
3. Take a thread dump to analyse deadlocks
4. Tune JVM parameters or scale out

## Network problems

### Symptoms
- Service is unreachable
- Intermittent connection failures

### Steps
1. Check the Service and its Endpoints
2. Verify the Ingress configuration
3. Test pod-to-pod connectivity
4. Check network policies

## Database problems

### Symptoms
- Database connection timeouts
- Query performance degradation

### Steps
1. Check the database service status
2. Verify the connection pool configuration
3. Analyse the slow query log
4. Optimise database indexes
```
## 🔧 Performance Tuning

### 1. JVM Tuning Parameters
```bash
# Production JVM options (JDK 8 style GC logging flags)
JAVA_OPTS="
-Xms4g
-Xmx8g
-XX:NewRatio=1
-XX:SurvivorRatio=8
-XX:+UseG1GC
-XX:MaxGCPauseMillis=200
-XX:G1HeapRegionSize=16m
-XX:+G1UseAdaptiveIHOP
-XX:G1MixedGCCountTarget=8
-XX:G1MixedGCLiveThresholdPercent=85
-XX:+PrintGC
-XX:+PrintGCDetails
-XX:+PrintGCTimeStamps
-XX:+PrintGCApplicationStoppedTime
-Xloggc:/opt/atlas-mapper/logs/gc.log
-XX:+UseGCLogFileRotation
-XX:NumberOfGCLogFiles=5
-XX:GCLogFileSize=100M
-XX:+HeapDumpOnOutOfMemoryError
-XX:HeapDumpPath=/opt/atlas-mapper/logs/
-XX:+ExitOnOutOfMemoryError
-Djava.security.egd=file:/dev/./urandom
-Dfile.encoding=UTF-8
-Duser.timezone=Asia/Shanghai
"
### 2. Application Configuration Tuning
```yaml
# application-production.yml
server:
  tomcat:
    threads:
      max: 200
      min-spare: 20
    connection-timeout: 20000
    max-connections: 8192
    accept-count: 100
    max-http-post-size: 50MB

spring:
  datasource:
    hikari:
      maximum-pool-size: 50
      minimum-idle: 10
      connection-timeout: 30000
      idle-timeout: 600000
      max-lifetime: 1800000
      leak-detection-threshold: 60000
  redis:
    lettuce:
      pool:
        max-active: 50
        max-idle: 20
        min-idle: 10
        max-wait: 5000
      shutdown-timeout: 100ms
  cache:
    caffeine:
      spec: maximumSize=10000,expireAfterWrite=300s

atlas:
  mapper:
    performance:
      batch-size: 1000
      thread-pool-size: 20
      cache-enabled: true
      async-enabled: true
    monitoring:
      metrics-enabled: true
      tracing-enabled: true
      sampling-rate: 0.1
```
### 3. Database Tuning
```sql
-- Index tuning
CREATE INDEX idx_order_customer_id ON orders(customer_id);
CREATE INDEX idx_order_date ON orders(order_date);
CREATE INDEX idx_order_status ON orders(status);
CREATE INDEX idx_order_tenant ON orders(tenant_id);

-- Composite indexes
CREATE INDEX idx_order_customer_date ON orders(customer_id, order_date);
CREATE INDEX idx_order_tenant_status ON orders(tenant_id, status);

-- Range partitioning (for large tables)
ALTER TABLE orders PARTITION BY RANGE (YEAR(order_date)) (
    PARTITION p2023 VALUES LESS THAN (2024),
    PARTITION p2024 VALUES LESS THAN (2025),
    PARTITION p2025 VALUES LESS THAN (2026)
);

-- Server tuning (dynamic variables only; the query cache was removed in MySQL 8.0 and is no longer configured)
SET GLOBAL innodb_buffer_pool_size = 8 * 1024 * 1024 * 1024;  -- 8G
SET GLOBAL innodb_flush_log_at_trx_commit = 2;
SET GLOBAL sync_binlog = 0;

-- innodb_log_file_size is not dynamic; set it in my.cnf (e.g. innodb_log_file_size = 512M) and restart
```
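After creating the indexes it is worth confirming that the hot queries actually use them. A quick check via the mysql client; the table and column names follow the examples above, and the literal values are illustrative:

```bash
# Verify that a typical lookup uses the new composite index
mysql -h mysql-service -u atlas -p atlas_mapper \
  -e "EXPLAIN SELECT * FROM orders WHERE customer_id = 42 AND order_date >= '2024-01-01'\G"
```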
## 📈 Capacity Planning

### 1. Performance Benchmarking
```bash
#!/bin/bash
# benchmark.sh - performance benchmark script

# Test configuration
ENDPOINT="https://atlas-mapper.example.com"
CONCURRENT_USERS=100
TEST_DURATION=300   # 5 minutes
RAMP_UP_TIME=60     # 1 minute

# Single-mapping test
test_single_mapping() {
    echo "Running the single-mapping benchmark..."
    ab -n 10000 -c ${CONCURRENT_USERS} \
        -H "Content-Type: application/json" \
        -p test-data/single-mapping.json \
        ${ENDPOINT}/atlas-mapper/api/v1/mapping/single
}

# Batch-mapping test
test_batch_mapping() {
    echo "Running the batch-mapping benchmark..."
    ab -n 1000 -c 50 \
        -H "Content-Type: application/json" \
        -p test-data/batch-mapping.json \
        ${ENDPOINT}/atlas-mapper/api/v1/mapping/batch
}

# Stress test
stress_test() {
    echo "Running the stress test..."

    # Use JMeter for the stress test
    jmeter -n -t test-plans/atlas-mapper-stress-test.jmx \
        -l results/stress-test-results.jtl \
        -e -o results/stress-test-report/
}

# Generate the test report
generate_report() {
    echo "Generating the test report..."

    cat > performance-report.md << EOF
# Atlas Mapper Performance Test Report

## Test environment
- Concurrent users: ${CONCURRENT_USERS}
- Test duration: ${TEST_DURATION} seconds
- Test date: $(date)

## Test results
$(cat results/test-summary.txt)

## Performance indicators
- Average response time: $(grep "mean" results/stress-test-results.jtl | awk '{print $2}') ms
- 95th percentile response time: $(grep "95%" results/stress-test-results.jtl | awk '{print $2}') ms
- Throughput: $(grep "throughput" results/stress-test-results.jtl | awk '{print $2}') TPS
- Error rate: $(grep "error" results/stress-test-results.jtl | awk '{print $2}') %

## Recommendations
Based on the results, consider the following optimisations:
1. Adjust the JVM heap size
2. Tune the database connection pool configuration
3. Add application instances
EOF
}

main() {
    mkdir -p results test-data

    test_single_mapping
    test_batch_mapping
    stress_test
    generate_report

    echo "Performance testing complete; report written to performance-report.md"
}

main "$@"
```
### 2. Capacity Planning Guidelines
```yaml
# Capacity planning reference
capacity_planning:
  # Basic profile (up to 1,000 TPS)
  basic:
    application_instances: 2
    cpu_per_instance: "2 cores"
    memory_per_instance: "4GB"
    database_connections: 20

  # Standard profile (up to 5,000 TPS)
  standard:
    application_instances: 4
    cpu_per_instance: "4 cores"
    memory_per_instance: "8GB"
    database_connections: 50

  # High-performance profile (10,000+ TPS)
  high_performance:
    application_instances: 8
    cpu_per_instance: "8 cores"
    memory_per_instance: "16GB"
    database_connections: 100

# Scaling strategy
scaling_strategy:
  cpu_threshold: 70%
  memory_threshold: 80%
  response_time_threshold: 1000ms
  error_rate_threshold: 5%

# Alert thresholds
alert_thresholds:
  cpu_warning: 60%
  cpu_critical: 80%
  memory_warning: 70%
  memory_critical: 85%
  disk_warning: 80%
  disk_critical: 90%
```
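For load targets that fall between these profiles, the instance count can be estimated from a per-instance throughput figure derived from the table above (roughly 500 to 1,250 TPS per instance depending on the profile). A rough, back-of-the-envelope sketch; the 1,000 TPS per instance figure is an assumption, not a measured value:

```bash
# Rough sizing helper: instances needed for a target TPS
# (assumption: ~1000 TPS per instance, a conservative figure between the profiles above)
TARGET_TPS=${1:-3000}
TPS_PER_INSTANCE=1000
INSTANCES=$(( (TARGET_TPS + TPS_PER_INSTANCE - 1) / TPS_PER_INSTANCE ))
echo "Target ${TARGET_TPS} TPS -> at least ${INSTANCES} application instances (before headroom)"
```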
## 🎯 Operations Summary

### Key Operational Metrics

- **Availability**
  - System availability: 99.9%
  - Mean time to recovery: < 5 minutes
  - Mean time between failures: > 720 hours

- **Performance** (see the query sketch after this list)
  - Average response time: < 100 ms
  - 95th percentile response time: < 500 ms
  - System throughput: > 5,000 TPS
  - Error rate: < 0.1%

- **Resources**
  - CPU utilisation: < 70%
  - Memory utilisation: < 80%
  - Disk utilisation: < 80%
  - Network bandwidth utilisation: < 60%
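The performance targets can be spot-checked directly against live metrics. A minimal sketch assuming the metric names used in the alert rules above and an illustrative Prometheus URL:

```bash
# Spot-check the error-rate and latency targets against live metrics
PROM="http://prometheus.example.com"
curl -s "${PROM}/api/v1/query" --data-urlencode \
  'query=sum(rate(atlas_mapper_errors_total[5m])) / sum(rate(atlas_mapper_mappings_total[5m]))'
curl -s "${PROM}/api/v1/query" --data-urlencode \
  'query=histogram_quantile(0.95, sum(rate(atlas_mapper_mapping_duration_seconds_bucket[5m])) by (le))'
```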
### Operations Best Practices

- **Automation**
  - Automated deployment and rollback
  - Automated monitoring and alerting
  - Automated scaling
  - Automated backup and recovery

- **Preventive maintenance**
  - Regular performance reviews
  - Regular security scans
  - Regular capacity planning
  - Regular disaster-recovery drills

- **Incident handling**
  - Establish an incident response process
  - Maintain the incident handling playbook
  - Run post-incident reviews
  - Improve continuously

- **Team collaboration**
  - Establish an on-call rotation
  - Define emergency response plans
  - Run regular technical training
  - Maintain the knowledge base