RocketMQ 运维自动化是提升运维效率的关键。本文将深入探讨 Ansible、Kubernetes、CI/CD、监控告警等自动化运维实践。
一、Ansible 自动化
1.1 Inventory 配置
# inventory.yml
# Ansible inventory: two NameServer hosts and two Broker hosts.
---
all:
  children:
    # NameServer nodes.
    nameserver:
      hosts:
        ns1:
          ansible_host: 192.168.1.10
        ns2:
          ansible_host: 192.168.1.11

    # Broker nodes. broker_id and broker_role are per-host variables that
    # the broker.conf template renders into brokerId / brokerRole.
    broker:
      hosts:
        broker-a:
          ansible_host: 192.168.1.20
          broker_role: ASYNC_MASTER
          broker_id: 0
        broker-b:
          ansible_host: 192.168.1.21
          broker_role: SLAVE
          broker_id: 1
1.2 部署 Playbook
# deploy-rocketmq.yml
# Installs RocketMQ on every inventory host, renders the role-specific
# configuration and starts the NameServer / Broker systemd units.
---
- name: Deploy RocketMQ Cluster
  hosts: all
  become: true
  vars:
    rocketmq_version: "5.0.0"
    rocketmq_install_dir: /opt/rocketmq
    rocketmq_data_dir: /data/rocketmq/store
    # Directory produced by unpacking the release zip under /opt.
    # NOTE(review): assumes the archive's top-level directory is named after
    # the zip file - confirm for the version being deployed.
    rocketmq_extract_dir: "/opt/rocketmq-all-{{ rocketmq_version }}-bin-release"
  tasks:
    # unzip is required on the target by the unarchive module for .zip files.
    - name: Install Java and unzip
      yum:
        name:
          - java-11-openjdk
          - unzip
        state: present

    # The owner/group used below must exist before any file task refers to it.
    - name: Create rocketmq group
      group:
        name: rocketmq
        system: true

    - name: Create rocketmq user
      user:
        name: rocketmq
        group: rocketmq
        system: true

    - name: Download RocketMQ
      get_url:
        url: "https://archive.apache.org/dist/rocketmq/{{ rocketmq_version }}/rocketmq-all-{{ rocketmq_version }}-bin-release.zip"
        dest: /tmp/rocketmq.zip

    - name: Extract RocketMQ
      unarchive:
        src: /tmp/rocketmq.zip
        dest: /opt/
        remote_src: true
        # Skip re-extraction on later runs.
        creates: "{{ rocketmq_extract_dir }}"

    - name: Set ownership of the unpacked release
      file:
        path: "{{ rocketmq_extract_dir }}"
        state: directory
        owner: rocketmq
        group: rocketmq
        recurse: true

    # Point the install dir at the unpacked release so that conf/ and bin/
    # exist under it. Creating an empty directory here instead would leave
    # the template tasks below without a conf/ directory to write into.
    - name: Link install directory to the unpacked release
      file:
        src: "{{ rocketmq_extract_dir }}"
        dest: "{{ rocketmq_install_dir }}"
        state: link

    - name: Create data directory
      file:
        path: "{{ rocketmq_data_dir }}"
        state: directory
        owner: rocketmq
        group: rocketmq

    - name: Configure NameServer
      template:
        src: namesrv.conf.j2
        dest: "{{ rocketmq_install_dir }}/conf/namesrv.conf"
      when: inventory_hostname in groups['nameserver']

    - name: Configure Broker
      template:
        src: broker.conf.j2
        dest: "{{ rocketmq_install_dir }}/conf/broker.conf"
      when: inventory_hostname in groups['broker']
      vars:
        # broker_id and broker_role are already available as inventory host
        # vars. Re-declaring them here as "{{ broker_id }}" is
        # self-referential and makes Ansible abort with a recursive-template
        # error, so only broker_name is defined at task level.
        # NOTE(review): RocketMQ pairs a slave with its master through an
        # identical brokerName; using the hostname gives each host its own.
        broker_name: "{{ inventory_hostname }}"

    - name: Create systemd service
      template:
        src: "rocketmq-{{ item }}.service.j2"
        dest: "/etc/systemd/system/rocketmq-{{ item }}.service"
      loop:
        - namesrv
        - broker

    - name: Start NameServer
      systemd:
        name: rocketmq-namesrv
        enabled: true
        state: started
        # Make systemd pick up the unit file written above.
        daemon_reload: true
      when: inventory_hostname in groups['nameserver']

    - name: Start Broker
      systemd:
        name: rocketmq-broker
        enabled: true
        state: started
        daemon_reload: true
      when: inventory_hostname in groups['broker']
1.3 配置模板
# broker.conf.j2
# Broker configuration template; the {{ ... }} placeholders are filled in by
# the deploy playbook.

# --- Identity ---
# NOTE(review): RocketMQ pairs a slave with its master through an identical
# brokerName - confirm the value rendered here is shared by both hosts.
brokerClusterName=DefaultCluster
brokerName={{ broker_name }}
brokerId={{ broker_id }}
brokerRole={{ broker_role }}

# --- Network ---
namesrvAddr=ns1:9876;ns2:9876
listenPort=10911

# --- Storage ---
flushDiskType=ASYNC_FLUSH
storePathRootDir={{ rocketmq_data_dir }}
storePathCommitLog={{ rocketmq_data_dir }}/commitlog
二、Kubernetes 部署
2.1 Helm Chart
# Chart.yaml
# Helm chart metadata for the RocketMQ chart.
apiVersion: v2
type: application
name: rocketmq
description: A Helm chart for RocketMQ
# Chart version.
version: 1.0.0
# Application version; quoted so it stays a string.
appVersion: "5.0.0"
2.2 NameServer StatefulSet
# templates/nameserver-statefulset.yaml
# Two-replica NameServer StatefulSet.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: rocketmq-nameserver
spec:
  # NOTE(review): serviceName must reference a headless Service of the same
  # name for the per-pod DNS names to resolve; that Service is not defined
  # in this snippet.
  serviceName: rocketmq-nameserver
  replicas: 2
  selector:
    matchLabels:
      app: rocketmq-nameserver
  template:
    metadata:
      labels:
        app: rocketmq-nameserver
    spec:
      containers:
        - name: nameserver
          image: apache/rocketmq:5.0.0
          # Run the launcher script through sh, relative to the image's
          # working directory (the documented invocation is `sh mqnamesrv`).
          # `sh -c "mqnamesrv"` would instead depend on the script being
          # found on PATH.
          command: ["sh", "mqnamesrv"]
          ports:
            - containerPort: 9876
          # The pod is marked ready once the NameServer port accepts TCP
          # connections.
          readinessProbe:
            tcpSocket:
              port: 9876
            initialDelaySeconds: 30
            periodSeconds: 10
2.3 Broker StatefulSet
# templates/broker-statefulset.yaml
# Broker StatefulSet with one persistent volume per replica.
# NOTE(review): every replica starts from the same broker.conf, so all pods
# would register with the same brokerName / brokerId unless per-pod settings
# are injected - confirm before running more than one replica.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: rocketmq-broker
spec:
  serviceName: rocketmq-broker
  replicas: 2
  selector:
    matchLabels:
      app: rocketmq-broker
  template:
    metadata:
      labels:
        app: rocketmq-broker
    spec:
      containers:
        - name: broker
          image: apache/rocketmq:5.0.0
          # Run the launcher through sh from the image's working directory
          # and point -c at the broker.conf shipped inside the image. The
          # previous /opt/rocketmq/conf path is not mounted into the
          # container by anything in this manifest.
          # Keep the version in this path in sync with the image tag.
          command:
            - sh
            - mqbroker
            - -c
            - /home/rocketmq/rocketmq-5.0.0/conf/broker.conf
          ports:
            - containerPort: 10911
            - containerPort: 10909
          env:
            - name: NAMESRV_ADDR
              value: "rocketmq-nameserver-0.rocketmq-nameserver:9876;rocketmq-nameserver-1.rocketmq-nameserver:9876"
          volumeMounts:
            - name: data
              # The image runs as user "rocketmq" and keeps its store under
              # that user's home; mounting at /root/store would leave the
              # message data outside the persistent volume.
              mountPath: /home/rocketmq/store
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 100Gi
2.4 自动扩缩容
# hpa.yaml
# Scales the rocketmq-broker StatefulSet between 2 and 10 replicas based on
# average CPU and memory utilization.
# NOTE(review): Utilization targets are computed against the containers'
# resource requests, so the target pods need cpu/memory requests declared.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: rocketmq-broker-hpa
spec:
  minReplicas: 2
  maxReplicas: 10
  scaleTargetRef:
    apiVersion: apps/v1
    kind: StatefulSet
    name: rocketmq-broker
  metrics:
    # Target average CPU utilization: 70 %.
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    # Target average memory utilization: 80 %.
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
三、CI/CD 流水线
3.1 GitLab CI
# .gitlab-ci.yml
# Three-stage pipeline: integration tests against a local RocketMQ, image
# build and push, then deployment from the main branch.
stages:
  - test
  - build
  - deploy

test:
  stage: test
  script:
    - docker-compose up -d nameserver broker
    # Give the NameServer and Broker time to start before the tests run.
    - sleep 30
    - ./run-tests.sh
  # after_script runs even when the script above fails, so the compose
  # services are always torn down.
  after_script:
    - docker-compose down
  artifacts:
    # Upload the JUnit report for failed runs too; the default uploads
    # artifacts on success only.
    when: always
    reports:
      junit: test-results.xml

build:
  stage: build
  script:
    - docker build -t myapp:rocketmq-$CI_COMMIT_SHA .
    - docker push myapp:rocketmq-$CI_COMMIT_SHA

deploy:
  stage: deploy
  script:
    - kubectl set image deployment/myapp myapp=myapp:rocketmq-$CI_COMMIT_SHA
  only:
    - main
3.2 Jenkins Pipeline
// Jenkinsfile
// Declarative pipeline: test against a local RocketMQ, build and push the
// image, then roll it out with kubectl.
pipeline {
    agent any

    stages {
        stage('Test') {
            steps {
                // Start the RocketMQ services, wait, then run the tests.
                sh(script: 'docker-compose up -d nameserver broker')
                sh(script: 'sleep 30')
                sh(script: './run-tests.sh')
            }
        }

        stage('Build') {
            steps {
                // Single-quoted Groovy strings: ${BUILD_NUMBER} is expanded
                // by the shell, not by Groovy.
                sh(script: 'docker build -t myapp:rocketmq-${BUILD_NUMBER} .')
                sh(script: 'docker push myapp:rocketmq-${BUILD_NUMBER}')
            }
        }

        stage('Deploy') {
            steps {
                sh(script: 'kubectl set image deployment/myapp myapp=myapp:rocketmq-${BUILD_NUMBER}')
            }
        }
    }

    post {
        // Tear down the compose services regardless of the build result.
        always {
            sh(script: 'docker-compose down')
        }
    }
}
四、监控告警
4.1 Prometheus 配置
# prometheus.yml
# Scrapes RocketMQ broker and NameServer metrics endpoints every 15 seconds.
# NOTE(review): the targets are presumably metrics exporters listening on
# 5557 / 5558 - confirm what actually serves these ports.
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'rocketmq-broker'
    metrics_path: '/metrics'
    static_configs:
      - targets:
          - 'broker-1:5557'
          - 'broker-2:5557'

  - job_name: 'rocketmq-nameserver'
    static_configs:
      - targets:
          - 'ns1:5558'
          - 'ns2:5558'
4.2 告警规则
# alerting_rules.yml
# Prometheus alerting rules for RocketMQ: broker availability, consumer lag
# and commit-log disk usage.
groups:
  - name: rocketmq
    rules:
      # Fires when a broker scrape target has been down for one minute.
      - alert: RocketMQBrokerDown
        expr: up{job="rocketmq-broker"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Broker 宕机:{{ $labels.instance }}"

      # Fires when the group diff stays above 10000 for five minutes.
      - alert: RocketMQConsumerLag
        expr: rocketmq_group_diff > 10000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "消费滞后:{{ $labels.group }}"

      # Fires when the commit-log disk ratio stays above 80 for five minutes.
      # NOTE(review): the threshold assumes the metric is reported on a
      # 0-100 scale; if it is a 0-1 ratio this rule can never fire - confirm.
      - alert: RocketMQDiskHigh
        expr: rocketmq_commitlog_disk_ratio > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘使用率过高:{{ $value }}%"
4.3 Grafana 仪表盘
{
  "dashboard": {
    "title": "RocketMQ 监控",
    "panels": [
      {
        "title": "生产/消费 TPS",
        "targets": [
          {
            "expr": "sum(rocketmq_broker_tps)"
          }
        ]
      },
      {
        "title": "消费堆积",
        "targets": [
          {
            "expr": "sum(rocketmq_group_diff) by (group)"
          }
        ]
      },
      {
        "title": "磁盘使用率",
        "targets": [
          {
            "expr": "rocketmq_commitlog_disk_ratio"
          }
        ]
      }
    ]
  }
}
五、日志管理
5.1 ELK 配置
# filebeat.yml
# Ships RocketMQ log files to Elasticsearch using a date-based index name.
filebeat.inputs:
  - type: log
    enabled: true
    paths:
      - /var/log/rocketmq/*.log
    # Custom field attached to events from this input.
    fields:
      service: rocketmq

output.elasticsearch:
  hosts:
    - "elasticsearch:9200"
  indices:
    - index: "rocketmq-%{+yyyy.MM.dd}"
5.2 日志分析
#!/bin/bash
# Log analysis script: summarises error and warning lines in the broker log.
LOG_FILE="/var/log/rocketmq/broker.log"

# Fail with a clear message instead of printing three empty sections when
# the log is missing or unreadable.
if [ ! -r "$LOG_FILE" ]; then
  echo "无法读取日志文件:$LOG_FILE" >&2
  exit 1
fi

# Counts occurrences of each distinct last field among the lines matching
# $1 (case-insensitive), most frequent first.
count_by_last_field() {
  grep -i -- "$1" "$LOG_FILE" | awk '{print $NF}' | sort | uniq -c | sort -rn
}

echo "=== 错误统计 ==="
count_by_last_field "error"

echo -e "\n=== 警告统计 ==="
count_by_last_field "warn"

echo -e "\n=== 最近错误 ==="
grep -i "error" "$LOG_FILE" | tail -20
六、备份恢复
6.1 配置备份
#!/bin/bash
# Configuration backup script: copies each broker's broker.conf, dumps the
# topic list and consumer progress, and prunes backups older than 30 days.
BACKUP_DIR="/backup/rocketmq"
DATE=$(date +%Y%m%d)

# Tracks whether any step failed so the script can exit non-zero while still
# attempting the remaining backups.
status=0

# The redirections below fail if the target directory does not exist.
mkdir -p "$BACKUP_DIR" || exit 1

# Back up broker configuration files.
for broker in broker-a broker-b; do
  if ! scp "$broker:/opt/rocketmq/conf/broker.conf" "$BACKUP_DIR/${broker}_$DATE.conf"; then
    echo "backup of $broker broker.conf failed" >&2
    status=1
  fi
done

# Back up the topic list. topicList only reads from the cluster; updateTopic
# would instead create or modify a topic named "all".
if ! mqadmin topicList -n ns1:9876 -c > "$BACKUP_DIR/topics_$DATE.txt"; then
  echo "topic list backup failed" >&2
  status=1
fi

# Back up consumer group progress.
if ! mqadmin consumerProgress -n ns1:9876 > "$BACKUP_DIR/consumers_$DATE.txt"; then
  echo "consumer progress backup failed" >&2
  status=1
fi

# Keep 30 days of backups.
find "$BACKUP_DIR" -name "*.txt" -mtime +30 -delete
find "$BACKUP_DIR" -name "*.conf" -mtime +30 -delete

exit "$status"
6.2 数据恢复(恢复配置并重启 Broker)
#!/bin/bash
# Restore script: copies the broker.conf backups taken on the given date
# back to each broker and restarts the broker service.
# Usage: <script> <backup_date>, where the date matches the suffix of the
# backup file names (e.g. 20240131).

# Abort on the first failure so brokers are never restarted after a failed
# or partial copy.
set -euo pipefail

# Directory holding the backups. It was previously never set in this script,
# which made the source paths resolve to /broker-a_<date>.conf.
BACKUP_DIR="${BACKUP_DIR:-/backup/rocketmq}"
BACKUP_DATE="${1:-}"

if [ -z "$BACKUP_DATE" ]; then
  echo "用法:$0 <backup_date>"
  exit 1
fi

# Verify every backup file exists before touching any broker.
for broker in broker-a broker-b; do
  if [ ! -f "$BACKUP_DIR/${broker}_$BACKUP_DATE.conf" ]; then
    echo "备份文件不存在:$BACKUP_DIR/${broker}_$BACKUP_DATE.conf" >&2
    exit 1
  fi
done

# Restore configuration.
for broker in broker-a broker-b; do
  scp "$BACKUP_DIR/${broker}_$BACKUP_DATE.conf" "$broker:/opt/rocketmq/conf/broker.conf"
done

# Restart brokers.
for broker in broker-a broker-b; do
  ssh "$broker" "systemctl restart rocketmq-broker"
done

echo "恢复完成"
七、最佳实践
7.1 自动化建议
自动化建议:
1. 使用 Ansible 部署集群
2. 使用 Kubernetes 编排
3. 配置 CI/CD 流水线
4. 建立监控告警体系
5. 定期备份配置
7.2 运维检查清单
日常检查:
- [ ] 检查 NameServer 状态
- [ ] 检查 Broker 状态
- [ ] 检查 Consumer Lag
- [ ] 检查磁盘使用率
- [ ] 检查错误日志
- [ ] 检查监控指标
定期检查:
- [ ] 备份配置
- [ ] 性能基准测试
- [ ] 故障演练
- [ ] 容量评估
总结
RocketMQ 运维自动化的核心要点:
- Ansible:自动化部署、配置管理
- Kubernetes:容器编排、自动扩缩容
- CI/CD:自动化测试、持续部署
- 监控告警:Prometheus、Grafana、告警规则
- 日志管理:ELK、日志分析
- 备份恢复:配置备份、数据恢复
实践建议:
- 使用自动化工具提升效率
- 建立完善的监控体系
- 定期备份配置和数据
- 实现 CI/CD 流水线
- 建立运维检查清单