一、电商核心场景延迟需求分析
1.1 电商延迟场景分类表
| 业务场景 | 典型延迟时间 | 精度要求 | 消息量级 | 关键特性 | 推荐方案 |
|---|---|---|---|---|---|
| 订单超时取消 | 15-30分钟 | 高(±10秒) | 大 | 高并发、强一致性 | 插件延迟 |
| 库存预占释放 | 10-30分钟 | 中(±1分钟) | 中 | 幂等性、可靠性 | TTL+DLX |
| 促销定时开始 | 固定时间点 | 高(秒级) | 小 | 定时精确、广播 | 插件延迟 |
| 物流状态更新 | 1-24小时 | 低(±5分钟) | 大 | 批量处理、容错 | TTL+DLX |
| 用户消息推送 | 用户指定时间 | 高(±30秒) | 中 | 个性化、动态延迟 | 插件延迟 |
| 评价提醒 | 7天 | 低(±1小时) | 大 | 长延迟、可靠 | 外部调度 |
| 优惠券过期 | 1-30天 | 中(±10分钟) | 中 | 批量处理、通知 | TTL+DLX |
1.2 业务需求与技术指标映射
电商延迟队列SLA指标:

订单取消场景:
- 最大延迟偏差: ≤10秒
- 处理成功率: ≥99.99%
- 最大积压: ≤1000单
- 故障恢复时间: ≤30秒

库存预占场景:
- 最大延迟偏差: ≤60秒
- 处理成功率: ≥99.95%
- 消息丢失率: ≤0.001%
- 幂等性保证: 必须

促销定时场景:
- 时间同步精度: ≤1秒
- 同时触发达标率: ≥99.9%
- 广播可靠性: 100%
- 预热时间: 提前5分钟
二、订单超时取消方案(核心场景)

2.2 订单取消延迟队列实现
order_cancel_delay.py
import pika
import json
import time
import uuid
from datetime import datetime, timedelta
from typing import Dict, Any, Optional
from enum import Enum
import redis
import hashlib
class OrderCancelDelaySystem:
    """Order-cancellation delay system.

    Publishes delayed "cancel" messages through a RabbitMQ delayed-message
    exchange (x-delayed-message plugin) and consumes them when the delay
    fires.  Redis is used for idempotency control and state tracking.
    """

    def __init__(self, rabbitmq_hosts: list, redis_host: str):
        """Initialize the order-cancellation delay system.

        Args:
            rabbitmq_hosts: RabbitMQ cluster nodes, each ``{'host', 'port'}``.
            redis_host: Redis address, used for idempotency control.
        """
        self.rabbitmq_hosts = rabbitmq_hosts
        self.redis_client = redis.Redis(
            host=redis_host,
            port=6379,
            db=0,
            decode_responses=True
        )
        # Business configuration
        self.cancel_config = {
            'timeout_seconds': {
                'normal': 15 * 60,       # normal orders: 15 minutes
                'presale': 30 * 60,      # presale orders: 30 minutes
                'group': 2 * 60 * 60,    # group-buy orders: 2 hours
                'custom': 24 * 60 * 60   # custom orders: 24 hours
            },
            'retry_policy': {
                'max_retries': 3,
                'retry_delay': [30, 60, 120],  # retry delays (seconds)
                'dead_letter_queue': 'order.cancel.dlq'
            },
            'monitor': {
                'cancel_rate_window': 300,  # 5-minute window
                'max_cancel_rate': 0.1      # maximum cancel rate: 10%
            }
        }
        # Establish the RabbitMQ connection and declare topology
        self.init_rabbitmq_connection()

    def init_rabbitmq_connection(self):
        """Connect to the first reachable cluster node, then declare topology."""
        credentials = pika.PlainCredentials('order_user', 'OrderPass@2024')
        # Try each node in turn until one accepts the connection
        for host_info in self.rabbitmq_hosts:
            try:
                parameters = pika.ConnectionParameters(
                    host=host_info['host'],
                    port=host_info['port'],
                    credentials=credentials,
                    heartbeat=600,
                    connection_attempts=2,
                    retry_delay=3,
                    socket_timeout=30
                )
                self.connection = pika.BlockingConnection(parameters)
                self.channel = self.connection.channel()
                print(f"✅ 连接到RabbitMQ: {host_info['host']}:{host_info['port']}")
                break
            except Exception as e:
                print(f"❌ 连接失败 {host_info['host']}: {e}")
                continue
        # Declare exchanges/queues used by the cancellation flow
        self.setup_order_cancel_infrastructure()

    def setup_order_cancel_infrastructure(self):
        """Declare the exchanges, queues and bindings for order cancellation."""
        # 1. Delayed exchange (requires the rabbitmq-delayed-message-exchange
        #    plugin; messages are held until their x-delay header expires)
        exchange_args = {
            'x-delayed-type': 'direct',
            'x-max-in-memory-length': 10000,
            'x-max-in-memory-bytes': 1073741824  # 1GB
        }
        self.channel.exchange_declare(
            exchange='order.cancel.delayed',
            exchange_type='x-delayed-message',
            durable=True,
            arguments=exchange_args
        )
        # 2. Main cancellation queue
        queue_args = {
            'x-max-length': 50000,
            'x-overflow': 'reject-publish',
            'x-message-ttl': 86400000,  # 24 hours
            'x-dead-letter-exchange': 'order.cancel.dlx'
        }
        self.channel.queue_declare(
            queue='order.cancel.queue',
            durable=True,
            arguments=queue_args
        )
        # Bind the main queue to the delayed exchange
        self.channel.queue_bind(
            exchange='order.cancel.delayed',
            queue='order.cancel.queue',
            routing_key='order.cancel'
        )
        # 3. Dead-letter exchange and queue
        self.channel.exchange_declare(
            exchange='order.cancel.dlx',
            exchange_type='topic',
            durable=True
        )
        self.channel.queue_declare(
            queue='order.cancel.dlq',
            durable=True,
            arguments={
                'x-message-ttl': 7 * 24 * 60 * 60 * 1000,  # keep for 7 days
                'x-max-length': 10000
            }
        )
        self.channel.queue_bind(
            exchange='order.cancel.dlx',
            queue='order.cancel.dlq',
            routing_key='#'
        )
        # 4. Processing-result queue
        self.channel.queue_declare(
            queue='order.cancel.result',
            durable=True
        )
        print("✅ 订单取消基础设施设置完成")

    def generate_order_id(self, user_id: str, product_id: str) -> str:
        """Generate an order ID that embeds business context (time, user, product)."""
        timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
        random_suffix = str(uuid.uuid4())[:8]
        return f"ORD{timestamp}{user_id[:6]}{product_id[:6]}_{random_suffix}"

    def calculate_order_timeout(self, order_type: str, order_data: Dict) -> int:
        """Compute the cancellation timeout for an order.

        Args:
            order_type: one of 'normal', 'presale', 'group', 'custom'
                (unknown types fall back to 15 minutes).
            order_data: order payload; 'user_level' and 'amount' affect the result.
        Returns:
            Timeout in milliseconds.
        """
        base_timeout = self.cancel_config['timeout_seconds'].get(
            order_type, 15 * 60
        )
        # Dynamic adjustments on top of the base timeout
        adjustments = 0
        # 1. VIP users get extra time
        if order_data.get('user_level') == 'vip':
            adjustments += 5 * 60  # +5 minutes for VIP users
        # 2. High-value orders get extra time
        if order_data.get('amount', 0) > 1000:
            adjustments += 10 * 60  # +10 minutes for high-value orders
        # 3. Extra time during promotions
        if self.is_promotion_period():
            adjustments += 5 * 60  # +5 minutes during promotions
        return (base_timeout + adjustments) * 1000  # convert to milliseconds

    def is_promotion_period(self) -> bool:
        """Return True while a promotion is running.

        A real implementation would read this from a config center or a
        database; simplified here to a fixed evening window.
        """
        current_hour = datetime.now().hour
        return 20 <= current_hour <= 22  # promo window: 8pm-10pm

    def create_order_cancel_message(self, order_id: str, order_data: Dict) -> Dict:
        """Build the cancellation message payload.

        Args:
            order_id: order ID.
            order_data: order payload.
        Returns:
            The message dict to publish.
        """
        message_id = f"cancel_{order_id}_{int(time.time()*1000)}"
        # Message fingerprint (order ID + creation time), used for idempotency
        message_fingerprint = hashlib.md5(
            f"{order_id}_{order_data.get('create_time')}".encode()
        ).hexdigest()
        return {
            'message_id': message_id,
            'order_id': order_id,
            'order_type': order_data.get('order_type', 'normal'),
            'user_id': order_data.get('user_id'),
            'amount': order_data.get('amount', 0),
            'products': order_data.get('products', []),
            'create_time': order_data.get('create_time'),
            'expected_cancel_time': order_data.get('expected_cancel_time'),
            'metadata': {
                'fingerprint': message_fingerprint,
                'retry_count': 0,
                'source_service': 'order-service',
                'business_scene': 'order_timeout_cancel'
            }
        }

    def send_order_cancel_delay(self, order_id: str, order_data: Dict) -> bool:
        """Publish a delayed cancellation message for an order.

        Args:
            order_id: order ID.
            order_data: order payload.
        Returns:
            True when the message was published (or already exists).
        """
        try:
            # 1. Idempotency check: skip if a cancel message already exists
            redis_key = f"order_cancel:{order_id}"
            if self.redis_client.exists(redis_key):
                print(f"⚠️ 订单 {order_id} 已存在取消消息,跳过")
                return True
            # 2. Compute the delay
            order_type = order_data.get('order_type', 'normal')
            delay_ms = self.calculate_order_timeout(order_type, order_data)
            # 3. Build the message
            cancel_message = self.create_order_cancel_message(order_id, order_data)
            # 4. Message properties; x-delay drives the delayed exchange
            properties = pika.BasicProperties(
                headers={
                    'x-delay': delay_ms,
                    'x-order-id': order_id,
                    'x-order-type': order_type,
                    'x-fingerprint': cancel_message['metadata']['fingerprint']
                },
                delivery_mode=2,  # persistent
                content_type='application/json',
                timestamp=int(time.time()),
                message_id=cancel_message['message_id'],
                correlation_id=order_id,
                # Per-message TTL: delay + 1 minute.  NOTE(review): this TTL
                # presumably starts once the delayed exchange routes the
                # message into the queue — confirm against plugin behavior.
                expiration=str(delay_ms + 60000)
            )
            # 5. Publish
            self.channel.basic_publish(
                exchange='order.cancel.delayed',
                routing_key='order.cancel',
                body=json.dumps(cancel_message, ensure_ascii=False),
                properties=properties,
                mandatory=True  # ensure the message is routed
            )
            # 6. Record pending state in Redis
            self.redis_client.setex(
                redis_key,
                2 * delay_ms // 1000,  # expire at twice the delay
                json.dumps({
                    'status': 'pending',
                    'send_time': datetime.now().isoformat(),
                    'expected_cancel': datetime.now() + timedelta(milliseconds=delay_ms)
                })
            )
            # 7. Emit a monitoring event
            self.send_monitoring_event('order_cancel_scheduled', {
                'order_id': order_id,
                'delay_ms': delay_ms,
                'order_type': order_type,
                'timestamp': datetime.now().isoformat()
            })
            print(f"✅ 订单取消延迟消息已发送: {order_id}, 延迟: {delay_ms}ms")
            return True
        except Exception as e:
            print(f"❌ 发送订单取消延迟消息失败: {e}")
            self.send_monitoring_event('order_cancel_schedule_failed', {
                'order_id': order_id,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            })
            return False

    def cancel_order_timeout(self, order_id: str) -> bool:
        """Cancel the pending timeout (called after the user pays).

        Args:
            order_id: order ID.
        Returns:
            True when the cancellation command was recorded.
        """
        try:
            # 1. Look up the pending message record
            redis_key = f"order_cancel:{order_id}"
            message_info = self.redis_client.get(redis_key)
            if not message_info:
                print(f"⚠️ 订单 {order_id} 未找到取消消息记录")
                return False
            # 2. Send a cancel command to the result queue
            cancel_cmd = {
                'action': 'cancel_timeout',
                'order_id': order_id,
                'timestamp': datetime.now().isoformat(),
                'reason': 'user_paid'
            }
            self.channel.basic_publish(
                exchange='',
                routing_key='order.cancel.result',
                body=json.dumps(cancel_cmd),
                properties=pika.BasicProperties(
                    delivery_mode=2,
                    content_type='application/json'
                )
            )
            # 3. Mark cancelled in Redis (kept briefly for auditing)
            self.redis_client.setex(
                redis_key,
                300,  # keep for 5 minutes
                json.dumps({
                    'status': 'cancelled',
                    'cancel_time': datetime.now().isoformat(),
                    'cancel_reason': 'user_paid'
                })
            )
            # 4. Emit a monitoring event
            self.send_monitoring_event('order_cancel_cancelled', {
                'order_id': order_id,
                'reason': 'user_paid',
                'timestamp': datetime.now().isoformat()
            })
            print(f"✅ 订单超时已取消: {order_id}")
            return True
        except Exception as e:
            print(f"❌ 取消订单超时失败: {e}")
            return False

    def process_order_cancel(self, ch, method, properties, body):
        """Consumer callback: process one delayed cancellation message.

        The message is settled exactly once: failure branches nack it and
        mark it settled so the finally-block ack is skipped (the original
        both nacked and acked the same delivery tag, a channel error).
        """
        try:
            message = json.loads(body.decode('utf-8'))
            order_id = message['order_id']
            fingerprint = message['metadata']['fingerprint']
            print(f"处理订单取消: {order_id}")
            # 1. Idempotency via an atomic Redis lock.  SET NX EX replaces
            #    the original setnx-then-expire pair, which could leave a
            #    never-expiring lock if the process died between the calls.
            lock_key = f"order_cancel_lock:{order_id}"
            lock_acquired = self.redis_client.set(lock_key, "processing", nx=True, ex=30)
            if not lock_acquired:
                print(f"订单 {order_id} 正在被其他进程处理,跳过")
                ch.basic_ack(delivery_tag=method.delivery_tag)
                return
            settled = False   # True once the message was nacked below
            order_status = None
            try:
                # 2. Check the order status (calls the order service)
                order_status = self.check_order_status(order_id)
                if order_status == 'paid':
                    print(f"订单 {order_id} 已支付,跳过取消")
                    # Record the skip for auditing
                    self.redis_client.setex(
                        f"order_cancel:{order_id}",
                        600,  # keep for 10 minutes
                        json.dumps({
                            'status': 'skipped',
                            'reason': 'already_paid',
                            'check_time': datetime.now().isoformat()
                        })
                    )
                elif order_status == 'cancelled':
                    print(f"订单 {order_id} 已取消,跳过")
                elif order_status == 'pending':
                    # 3. Execute the cancellation
                    success = self.execute_order_cancel(order_id, message)
                    if success:
                        # 4. Record completion in Redis
                        self.redis_client.setex(
                            f"order_cancel:{order_id}",
                            24 * 60 * 60,  # keep for 24 hours
                            json.dumps({
                                'status': 'completed',
                                'cancel_time': datetime.now().isoformat(),
                                'message_id': message['message_id']
                            })
                        )
                        # 5. Notify the user
                        self.send_cancel_notification(order_id, message)
                        print(f"✅ 订单取消完成: {order_id}")
                    else:
                        # 6. Failure: retry with backoff, or dead-letter
                        retry_count = message['metadata'].get('retry_count', 0)
                        if retry_count < self.cancel_config['retry_policy']['max_retries']:
                            # Republish a delayed copy; the original message
                            # is acked in the finally block below.
                            self.retry_order_cancel(message, method.delivery_tag)
                        else:
                            # Retries exhausted: dead-letter the message
                            ch.basic_nack(
                                delivery_tag=method.delivery_tag,
                                requeue=False
                            )
                            settled = True
                            print(f"订单 {order_id} 取消失败,进入死信队列")
                else:
                    print(f"未知订单状态: {order_status}")
                    ch.basic_nack(
                        delivery_tag=method.delivery_tag,
                        requeue=False
                    )
                    settled = True
            finally:
                # Release the lock
                self.redis_client.delete(lock_key)
                # Ack unless a failure branch already nacked this tag
                if not settled:
                    ch.basic_ack(delivery_tag=method.delivery_tag)
            # 7. Emit a monitoring event
            self.send_monitoring_event('order_cancel_processed', {
                'order_id': order_id,
                'status': order_status,
                'timestamp': datetime.now().isoformat()
            })
        except Exception as e:
            print(f"处理订单取消消息异常: {e}")
            ch.basic_nack(
                delivery_tag=method.delivery_tag,
                requeue=False
            )

    def check_order_status(self, order_id: str) -> str:
        """Check the order status (mock implementation).

        A real implementation would call the order service API; this
        returns a random status for demonstration.
        """
        import random
        statuses = ['pending', 'paid', 'cancelled']
        return random.choice(statuses)

    def execute_order_cancel(self, order_id: str, message: Dict) -> bool:
        """Execute the order cancellation (mock implementation).

        A real implementation would call the order service's cancel API;
        this simulates a 90% success rate.
        """
        import random
        return random.random() > 0.1  # 90% success rate

    def send_cancel_notification(self, order_id: str, message: Dict):
        """Send the cancellation notification (mock: real code calls a notify service)."""
        print(f"发送订单取消通知: {order_id}")

    def retry_order_cancel(self, message: Dict, delivery_tag: int):
        """Republish a failed cancellation with an increasing delay."""
        retry_count = message['metadata'].get('retry_count', 0) + 1
        message['metadata']['retry_count'] = retry_count
        # Pick the delay for this attempt (clamped to the last entry)
        retry_delay = self.cancel_config['retry_policy']['retry_delay'][
            min(retry_count - 1, len(self.cancel_config['retry_policy']['retry_delay']) - 1)
        ]
        # Publish the retry through the delayed exchange
        properties = pika.BasicProperties(
            headers={'x-delay': retry_delay * 1000},
            delivery_mode=2,
            content_type='application/json'
        )
        self.channel.basic_publish(
            exchange='order.cancel.delayed',
            # Must match the binding key: the exchange is direct-typed and
            # only 'order.cancel' is bound — the original 'order.cancel.retry'
            # key would make every retry unroutable.
            routing_key='order.cancel',
            body=json.dumps(message),
            properties=properties
        )
        print(f"订单 {message['order_id']} 重试 {retry_count},延迟 {retry_delay}秒")

    def send_monitoring_event(self, event_type: str, data: Dict):
        """Emit a monitoring event (mock: real code ships this to a monitoring system)."""
        print(f"[监控] {event_type}: {json.dumps(data)}")

    def start_order_cancel_consumer(self):
        """Start consuming the cancellation queue (blocks the calling thread)."""
        # Bound unacked messages per consumer
        self.channel.basic_qos(prefetch_count=10)
        # Register the callback and start the consume loop
        self.channel.basic_consume(
            queue='order.cancel.queue',
            on_message_callback=self.process_order_cancel,
            auto_ack=False
        )
        print("订单取消消费者启动...")
        self.channel.start_consuming()
# Usage example
if __name__ == "__main__":
    # Cluster configuration
    rabbitmq_hosts = [
        {'host': '192.168.5.101', 'port': 5672},
        {'host': '192.168.5.102', 'port': 5672},
        {'host': '192.168.5.103', 'port': 5672}
    ]
    redis_host = '192.168.5.100'
    # Create the order-cancellation system
    order_system = OrderCancelDelaySystem(rabbitmq_hosts, redis_host)
    try:
        # Simulate creating an order and scheduling its cancellation
        order_data = {
            'order_type': 'normal',
            'user_id': 'user_123456',
            'user_level': 'vip',
            'amount': 1500.00,
            'products': [
                {'id': 'prod_001', 'name': 'iPhone 15', 'price': 1299.00},
                {'id': 'prod_002', 'name': 'AirPods Pro', 'price': 201.00}
            ],
            'create_time': datetime.now().isoformat(),
            'expected_cancel_time': None
        }
        # Generate the order ID
        order_id = order_system.generate_order_id(
            order_data['user_id'],
            order_data['products'][0]['id']
        )
        print(f"创建订单: {order_id}")
        # Schedule the delayed cancellation
        success = order_system.send_order_cancel_delay(order_id, order_data)
        if success:
            print(f"订单 {order_id} 取消延迟设置成功")
        # Simulate the user paying (cancels the pending timeout)
        import time
        time.sleep(2)
        print("用户完成支付...")
        order_system.cancel_order_timeout(order_id)
        # Run the consumer on a background thread
        import threading
        consumer_thread = threading.Thread(
            target=order_system.start_order_cancel_consumer,
            daemon=True
        )
        consumer_thread.start()
        # Keep the main thread alive long enough to observe processing
        time.sleep(30)
    except KeyboardInterrupt:
        print("程序退出")
    finally:
        if hasattr(order_system, 'connection'):
            order_system.connection.close()
2.3 订单取消监控仪表板
order_cancel_monitor.py
import streamlit as st
import pandas as pd
import plotly.graph_objects as go
from datetime import datetime, timedelta
import json
import redis
import pika
class OrderCancelMonitor:
    """Streamlit dashboard for monitoring order-cancellation delays."""

    def __init__(self):
        """Initialize connections and the (mock) data source."""
        # Redis connection for live state lookups
        self.redis_client = redis.Redis(
            host='192.168.5.100',
            port=6379,
            db=0,
            decode_responses=True
        )
        # Mock data generator standing in for the real monitoring backend
        self.data_generator = OrderCancelDataGenerator()

    def get_dashboard_data(self):
        """Return the data set rendered by the dashboard.

        Production code should pull this from the monitoring system; here
        it is simulated.
        """
        return self.data_generator.generate_monitor_data()

    def run_dashboard(self):
        """Render the full dashboard page with Streamlit."""
        st.set_page_config(
            page_title="订单取消延迟监控",
            page_icon="⏰",
            layout="wide"
        )
        st.title("📊 订单取消延迟监控仪表板")
        # Fetch the data once per page render
        data = self.get_dashboard_data()
        # 1. Overview metrics
        st.subheader("📈 实时概览")
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric(
                label="今日取消订单",
                value=data['today_cancels'],
                delta=data['today_cancels_trend']
            )
        with col2:
            st.metric(
                label="取消成功率",
                value=f"{data['success_rate']:.2f}%",
                delta=f"{data['success_rate_trend']:.2f}%"
            )
        with col3:
            st.metric(
                label="平均延迟偏差",
                value=f"{data['avg_delay_deviation']:.2f}s",
                delta=f"{data['delay_deviation_trend']:.2f}s"
            )
        with col4:
            st.metric(
                label="当前积压",
                value=data['current_backlog'],
                delta=data['backlog_trend']
            )
        # 2. Delay-precision analysis
        st.subheader("⏱️ 延迟精度分析")
        fig1 = go.Figure()
        # Histogram of delay deviations
        fig1.add_trace(go.Histogram(
            x=data['delay_samples'],
            name='延迟分布',
            nbinsx=20,
            marker_color='skyblue'
        ))
        fig1.add_vline(
            x=0,
            line_dash="dash",
            line_color="red",
            annotation_text="目标延迟"
        )
        fig1.update_layout(
            title='订单取消延迟分布',
            xaxis_title='延迟偏差(秒)',
            yaxis_title='数量',
            showlegend=True
        )
        st.plotly_chart(fig1, use_container_width=True)
        # 3. Cancellation-reason analysis
        st.subheader("🔍 取消原因分析")
        col1, col2 = st.columns(2)
        with col1:
            # Pie chart of cancellation reasons
            fig2 = go.Figure(data=[go.Pie(
                labels=list(data['cancel_reasons'].keys()),
                values=list(data['cancel_reasons'].values()),
                hole=.3
            )])
            fig2.update_layout(title='取消原因分布')
            st.plotly_chart(fig2, use_container_width=True)
        with col2:
            # Bar chart of order-type distribution
            fig3 = go.Figure(data=[go.Bar(
                x=list(data['order_types'].keys()),
                y=list(data['order_types'].values()),
                marker_color='lightcoral'
            )])
            fig3.update_layout(
                title='订单类型分布',
                xaxis_title='订单类型',
                yaxis_title='数量'
            )
            st.plotly_chart(fig3, use_container_width=True)
        # 4. Time-trend chart
        st.subheader("📅 时间趋势")
        # Time-series data: cancellations + success rate on a second axis
        time_data = data['time_series']
        fig4 = go.Figure()
        fig4.add_trace(go.Scatter(
            x=time_data['time'],
            y=time_data['cancels'],
            mode='lines+markers',
            name='取消数量',
            line=dict(color='royalblue', width=2)
        ))
        fig4.add_trace(go.Scatter(
            x=time_data['time'],
            y=time_data['success_rate'],
            mode='lines',
            name='成功率',
            yaxis='y2',
            line=dict(color='firebrick', width=2, dash='dash')
        ))
        fig4.update_layout(
            title='取消趋势与成功率',
            xaxis_title='时间',
            yaxis_title='取消数量',
            yaxis2=dict(
                title='成功率(%)',
                overlaying='y',
                side='right'
            ),
            showlegend=True
        )
        st.plotly_chart(fig4, use_container_width=True)
        # 5. Live alerts
        st.subheader("🚨 实时告警")
        if data['alerts']:
            for alert in data['alerts']:
                if alert['level'] == 'critical':
                    st.error(f"🔴 {alert['message']} - {alert['time']}")
                elif alert['level'] == 'warning':
                    st.warning(f"🟡 {alert['message']} - {alert['time']}")
                else:
                    st.info(f"🔵 {alert['message']} - {alert['time']}")
        else:
            st.success("✅ 当前无告警")
        # 6. Detail table
        st.subheader("📋 详细数据")
        df = pd.DataFrame(data['recent_cancels'])
        st.dataframe(df, use_container_width=True)
        # 7. Manual operations panel
        st.subheader("⚙️ 操作面板")
        with st.expander("手动操作"):
            col1, col2 = st.columns(2)
            with col1:
                order_id = st.text_input("订单ID", "ORD20240101000001")
                if st.button("取消超时设置"):
                    # A real implementation would call the cancel-timeout API
                    st.success(f"订单 {order_id} 超时已取消")
            with col2:
                delay_seconds = st.slider("延迟时间(秒)", 60, 3600, 900)
                if st.button("批量测试"):
                    # A real implementation would publish test messages
                    st.info(f"已发送测试消息,延迟 {delay_seconds}秒")
            # System configuration controls
            st.text("系统配置")
            config_col1, config_col2, config_col3 = st.columns(3)
            with config_col1:
                max_backlog = st.number_input("最大积压", 100, 10000, 1000)
            with config_col2:
                target_success_rate = st.number_input(
                    "目标成功率(%)", 90.0, 100.0, 99.5
                )
            with config_col3:
                if st.button("保存配置"):
                    st.success("配置已保存")
class OrderCancelDataGenerator:
    """Mock data generator for the order-cancellation dashboard."""

    def generate_monitor_data(self):
        """Generate one snapshot of simulated monitoring data.

        Returns:
            Dict with overview metrics, delay samples, reason/type
            distributions, a 24h time series, alerts, and recent records.
        """
        import random
        from datetime import datetime, timedelta
        # Today's cancellation count
        today_cancels = random.randint(1000, 5000)
        # Success rate around 99.5%
        success_rate = 99.5 + random.uniform(-0.5, 0.5)
        # Delay-deviation samples (seconds, normal around 0)
        delay_samples = [random.gauss(0, 5) for _ in range(100)]
        # Cancellation-reason distribution
        cancel_reasons = {
            '超时未支付': random.randint(400, 600),
            '用户主动取消': random.randint(100, 300),
            '库存不足': random.randint(50, 150),
            '价格变化': random.randint(20, 80),
            '其他': random.randint(10, 50)
        }
        # Order-type distribution
        order_types = {
            '普通订单': random.randint(300, 500),
            '预售订单': random.randint(100, 200),
            '拼团订单': random.randint(50, 150),
            '秒杀订单': random.randint(20, 80),
            '定制订单': random.randint(10, 30)
        }
        # 24-hour time series
        time_points = []
        cancels_data = []
        success_data = []
        base_time = datetime.now() - timedelta(hours=24)
        for i in range(24):
            time_point = base_time + timedelta(hours=i)
            time_points.append(time_point.strftime('%H:%M'))
            cancels_data.append(random.randint(20, 100))
            success_data.append(99.0 + random.uniform(0, 1))
        # Randomly injected alerts
        alerts = []
        if random.random() > 0.7:
            alerts.append({
                'level': 'warning',
                'message': '延迟精度偏差增大,当前标准差5.2s',
                'time': datetime.now().strftime('%H:%M:%S')
            })
        if random.random() > 0.9:
            alerts.append({
                'level': 'critical',
                'message': '取消成功率下降至98.5%,低于阈值99.0%',
                'time': datetime.now().strftime('%H:%M:%S')
            })
        # Recent cancellation records
        recent_cancels = []
        for i in range(10):
            recent_cancels.append({
                'order_id': f'ORD20240101{random.randint(10000, 99999)}',
                'user_id': f'user_{random.randint(10000, 99999)}',
                'cancel_time': (datetime.now() - timedelta(minutes=random.randint(1, 60))).strftime('%H:%M:%S'),
                'delay': f'{random.gauss(0, 5):.2f}s',
                'reason': random.choice(list(cancel_reasons.keys())),
                'status': random.choice(['成功', '失败', '重试中'])
            })
        return {
            'today_cancels': today_cancels,
            'today_cancels_trend': random.randint(-50, 50),
            'success_rate': success_rate,
            'success_rate_trend': random.uniform(-0.2, 0.2),
            'avg_delay_deviation': abs(random.gauss(0, 3)),
            'delay_deviation_trend': random.uniform(-0.5, 0.5),
            'current_backlog': random.randint(100, 500),
            'backlog_trend': random.randint(-20, 20),
            'delay_samples': delay_samples,
            'cancel_reasons': cancel_reasons,
            'order_types': order_types,
            'time_series': {
                'time': time_points,
                'cancels': cancels_data,
                'success_rate': success_data
            },
            'alerts': alerts,
            'recent_cancels': recent_cancels
        }
# Dashboard entry point (run with: streamlit run order_cancel_monitor.py)
if __name__ == "__main__":
    monitor = OrderCancelMonitor()
    monitor.run_dashboard()
三、库存预占释放方案
3.1 TTL+DLX库存预占实现
inventory_reservation.py
import pika
import json
import time
from datetime import datetime, timedelta
from typing import Dict, List, Tuple
import threading
from concurrent.futures import ThreadPoolExecutor
class InventoryReservationSystem:
    """Inventory reserve-and-release system.

    Uses per-product-type TTL queues with a dead-letter exchange: when a
    reservation message expires, it dead-letters into a release queue and
    the reserved stock is handed back in batches.
    """

    def __init__(self, rabbitmq_host: str):
        """Initialize the reservation system.

        Args:
            rabbitmq_host: RabbitMQ host address.
        """
        self.connection = pika.BlockingConnection(
            pika.ConnectionParameters(
                host=rabbitmq_host,
                credentials=pika.PlainCredentials('inventory_user', 'InventoryPass@2024'),
                heartbeat=300
            )
        )
        self.channel = self.connection.channel()
        # Reservation configuration
        self.reservation_config = {
            'timeout_seconds': {
                'normal': 10 * 60,   # normal items: 10 minutes
                'hot': 5 * 60,       # hot items: 5 minutes
                'seckill': 2 * 60,   # flash-sale items: 2 minutes
                'presale': 30 * 60   # presale items: 30 minutes
            },
            'batch_size': 50,        # batch-processing size
            'max_retries': 3,        # maximum retries
            'release_strategy': {
                'immediate': True,   # release immediately
                'gradual': False,    # gradual release
                'batch_delay': 1000  # batch flush interval (ms)
            }
        }
        # Declare inventory queues
        self.setup_inventory_queues()
        # Start the release processor (buffer + timer + consumers)
        self.start_inventory_release_processor()

    def setup_inventory_queues(self):
        """Declare the reservation queues, DLX, release queues, and result queue."""
        # 1. Per-product-type reservation queues with a TTL; expired
        #    messages dead-letter into the matching release queue
        product_types = ['normal', 'hot', 'seckill', 'presale']
        for ptype in product_types:
            queue_name = f"inventory.reserve.{ptype}"
            ttl_seconds = self.reservation_config['timeout_seconds'][ptype] * 1000
            queue_args = {
                'x-dead-letter-exchange': 'inventory.dlx',
                'x-dead-letter-routing-key': f'inventory.release.{ptype}',
                'x-message-ttl': ttl_seconds,
                'x-max-length': 10000,
                'x-overflow': 'reject-publish'
            }
            self.channel.queue_declare(
                queue=queue_name,
                durable=True,
                arguments=queue_args
            )
            print(f"库存预占队列创建: {queue_name}, TTL: {ttl_seconds}ms")
        # 2. Dead-letter exchange plus release queues
        self.channel.exchange_declare(
            exchange='inventory.dlx',
            exchange_type='topic',
            durable=True
        )
        # One release queue per product type
        for ptype in product_types:
            release_queue = f"inventory.release.{ptype}"
            self.channel.queue_declare(
                queue=release_queue,
                durable=True,
                arguments={
                    'x-max-length': 50000,
                    'x-message-ttl': 3600000  # keep release messages 1 hour
                }
            )
            self.channel.queue_bind(
                exchange='inventory.dlx',
                queue=release_queue,
                routing_key=f'inventory.release.{ptype}'
            )
        # 3. Operation-result queue
        self.channel.queue_declare(
            queue='inventory.operation.result',
            durable=True
        )
        print("✅ 库存队列基础设施设置完成")

    def reserve_inventory(self, order_id: str, items: List[Dict]) -> Tuple[bool, Dict]:
        """Reserve inventory for an order.

        Args:
            order_id: order ID.
            items: item dicts with 'product_id', 'quantity', optional 'type'.
        Returns:
            (success, detail dict with reservation IDs or an error).
        """
        try:
            reservation_id = f"RES_{order_id}_{int(time.time()*1000)}"
            reservations = []
            for item in items:
                product_id = item['product_id']
                quantity = item['quantity']
                product_type = item.get('type', 'normal')
                # Build the reservation message
                reserve_message = {
                    'reservation_id': f"{reservation_id}_{product_id}",
                    'order_id': order_id,
                    'product_id': product_id,
                    'sku_code': item.get('sku_code'),
                    'quantity': quantity,
                    'product_type': product_type,
                    'reserve_time': datetime.now().isoformat(),
                    'expected_release_time': None,  # determined by the queue TTL
                    'metadata': {
                        'user_id': item.get('user_id'),
                        'source': 'order_create',
                        'priority': item.get('priority', 1)
                    }
                }
                # Compute the TTL and the expected release time
                ttl_ms = self.reservation_config['timeout_seconds'][product_type] * 1000
                expected_release = datetime.now() + timedelta(milliseconds=ttl_ms)
                reserve_message['expected_release_time'] = expected_release.isoformat()
                # Publish into the matching reservation queue
                target_queue = f"inventory.reserve.{product_type}"
                properties = pika.BasicProperties(
                    delivery_mode=2,  # persistent
                    content_type='application/json',
                    timestamp=int(time.time()),
                    message_id=reserve_message['reservation_id'],
                    expiration=str(ttl_ms)
                )
                self.channel.basic_publish(
                    exchange='',
                    routing_key=target_queue,
                    body=json.dumps(reserve_message, ensure_ascii=False),
                    properties=properties
                )
                reservations.append({
                    'product_id': product_id,
                    'quantity': quantity,
                    'reservation_id': reserve_message['reservation_id'],
                    'timeout_seconds': ttl_ms // 1000
                })
                print(f"✅ 库存预占: {product_id} x{quantity}, 超时: {ttl_ms//1000}秒")
            # Emit a reservation-complete event
            self.send_reservation_event('inventory_reserved', {
                'order_id': order_id,
                'reservation_id': reservation_id,
                'items': len(items),
                'total_quantity': sum(item['quantity'] for item in items),
                'timestamp': datetime.now().isoformat()
            })
            return True, {
                'reservation_id': reservation_id,
                'reservations': reservations,
                'message': '库存预占成功'
            }
        except Exception as e:
            print(f"❌ 库存预占失败: {e}")
            self.send_reservation_event('inventory_reserve_failed', {
                'order_id': order_id,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            })
            return False, {'error': str(e)}

    def confirm_inventory(self, order_id: str, reservation_id: str) -> bool:
        """Confirm a reservation (called after successful payment).

        Args:
            order_id: order ID.
            reservation_id: reservation ID.
        Returns:
            True when the confirmation message was published.
        """
        try:
            # Publish the confirmation to the result queue
            confirm_message = {
                'action': 'confirm',
                'order_id': order_id,
                'reservation_id': reservation_id,
                'confirm_time': datetime.now().isoformat(),
                'confirmed_by': 'payment_service'
            }
            self.channel.basic_publish(
                exchange='',
                routing_key='inventory.operation.result',
                body=json.dumps(confirm_message),
                properties=pika.BasicProperties(
                    delivery_mode=2,
                    content_type='application/json'
                )
            )
            print(f"✅ 库存确认: {reservation_id}")
            # Emit a confirmation event
            self.send_reservation_event('inventory_confirmed', {
                'order_id': order_id,
                'reservation_id': reservation_id,
                'timestamp': datetime.now().isoformat()
            })
            return True
        except Exception as e:
            print(f"❌ 库存确认失败: {e}")
            return False

    def release_inventory_batch(self, messages: List[Dict]):
        """Release a batch of expired reservations, grouped by product.

        Args:
            messages: dead-lettered reservation messages.
        """
        if not messages:
            return
        print(f"批量释放库存,数量: {len(messages)}")
        # Group messages by product so each product is released once
        product_groups = {}
        for msg in messages:
            product_groups.setdefault(msg['product_id'], []).append(msg)
        # Release each product group
        release_results = []
        for product_id, product_messages in product_groups.items():
            total_quantity = sum(m['quantity'] for m in product_messages)
            try:
                # Call the inventory service with all reservation IDs in the
                # group (the original passed a bare, syntactically invalid
                # generator expression here)
                success = self.call_inventory_service_release(
                    product_id,
                    total_quantity,
                    [m['reservation_id'] for m in product_messages]
                )
                if success:
                    release_results.append({
                        'product_id': product_id,
                        'quantity': total_quantity,
                        'success': True,
                        'message': f'释放成功 {len(product_messages)} 个预占'
                    })
                    # Record each successful release
                    for m in product_messages:
                        self.record_release_success(m)
                else:
                    release_results.append({
                        'product_id': product_id,
                        'quantity': total_quantity,
                        'success': False,
                        'message': '库存服务调用失败'
                    })
                    # Retry (or escalate) each message in the failed group
                    for m in product_messages:
                        self.handle_release_failure(m)
            except Exception as e:
                print(f"释放库存异常 {product_id}: {e}")
                release_results.append({
                    'product_id': product_id,
                    'quantity': total_quantity,
                    'success': False,
                    'message': str(e)
                })
        # Emit a batch-completion event
        self.send_reservation_event('inventory_batch_released', {
            'batch_size': len(messages),
            'success_count': sum(1 for r in release_results if r['success']),
            'fail_count': sum(1 for r in release_results if not r['success']),
            'results': release_results,
            'timestamp': datetime.now().isoformat()
        })

    def call_inventory_service_release(self, product_id: str,
                                       quantity: int,
                                       reservation_ids: List[str]) -> bool:
        """Call the inventory service to release stock (mock implementation).

        Args:
            product_id: product ID.
            quantity: total quantity to release.
            reservation_ids: reservation IDs being released.
        Returns:
            True on success (simulated 95% success rate).
        """
        import random
        success = random.random() > 0.05  # 95% success rate
        if success:
            print(f"库存服务释放: {product_id} x{quantity}, 预占IDs: {reservation_ids}")
            return True
        else:
            print(f"库存服务释放失败: {product_id}")
            return False

    def record_release_success(self, message: Dict):
        """Record a successful release (mock: real code writes to a database)."""
        print(f"库存释放成功记录: {message['reservation_id']}")

    def handle_release_failure(self, message: Dict):
        """Retry a failed release with exponential backoff, or escalate."""
        retry_count = message.get('_retry_count', 0)
        if retry_count < self.reservation_config['max_retries']:
            # Retry with exponential backoff (1s, 2s, 4s, ...)
            message['_retry_count'] = retry_count + 1
            retry_delay = 2 ** retry_count
            # Republish through a TTL retry queue
            self.retry_release_message(message, retry_delay)
        else:
            # Retries exhausted: hand over to manual processing
            self.send_to_manual_process(message)
            print(f"库存释放最终失败: {message['reservation_id']}")

    def retry_release_message(self, message: Dict, delay_seconds: int):
        """Republish a release message into a TTL-based retry queue."""
        retry_queue = f"inventory.retry.{message['product_type']}"
        # Declare the retry queue on demand; expired messages dead-letter
        # back into the matching release queue
        self.channel.queue_declare(
            queue=retry_queue,
            durable=True,
            arguments={
                'x-dead-letter-exchange': 'inventory.dlx',
                'x-dead-letter-routing-key': f'inventory.release.{message["product_type"]}',
                'x-message-ttl': delay_seconds * 1000,
                'x-max-length': 1000
            }
        )
        # Publish the retry message
        self.channel.basic_publish(
            exchange='',
            routing_key=retry_queue,
            body=json.dumps(message),
            properties=pika.BasicProperties(
                delivery_mode=2,
                content_type='application/json'
            )
        )
        print(f"库存释放重试: {message['reservation_id']}, 延迟: {delay_seconds}秒")

    def send_to_manual_process(self, message: Dict):
        """Move a permanently failed release into the manual-processing queue."""
        manual_queue = 'inventory.manual.process'
        self.channel.queue_declare(
            queue=manual_queue,
            durable=True
        )
        error_message = {
            'type': 'inventory_release_failure',
            'original_message': message,
            'failure_time': datetime.now().isoformat(),
            'retry_count': message.get('_retry_count', 0)
        }
        self.channel.basic_publish(
            exchange='',
            routing_key=manual_queue,
            body=json.dumps(error_message),
            properties=pika.BasicProperties(
                delivery_mode=2,
                content_type='application/json'
            )
        )
        print(f"库存释放进入人工处理: {message['reservation_id']}")

    def process_inventory_release(self, ch, method, properties, body):
        """Consumer callback for dead-lettered (expired) reservation messages."""
        try:
            message = json.loads(body.decode('utf-8'))
            # Buffer the message; flushed by size here or by the timer.
            # The lock guards against the timer thread flushing concurrently.
            with self._buffer_lock:
                self.release_buffer.append(message)
                buffered = len(self.release_buffer)
            if buffered >= self.reservation_config['batch_size']:
                self.process_release_buffer()
            # Ack after buffering
            ch.basic_ack(delivery_tag=method.delivery_tag)
        except Exception as e:
            print(f"处理库存释放消息异常: {e}")
            ch.basic_nack(delivery_tag=method.delivery_tag, requeue=False)

    def process_release_buffer(self):
        """Flush the buffered release messages as one batch."""
        # Swap the buffer out under the lock, process outside it
        with self._buffer_lock:
            if not self.release_buffer:
                return
            messages_to_process = self.release_buffer
            self.release_buffer = []
        self.release_inventory_batch(messages_to_process)

    def _schedule_batch_flush(self):
        """(Re)arm the one-shot timer that periodically flushes the buffer."""
        interval = self.reservation_config['release_strategy']['batch_delay'] / 1000
        self.batch_timer = threading.Timer(interval, self._flush_and_reschedule)
        self.batch_timer.daemon = True
        self.batch_timer.start()

    def _flush_and_reschedule(self):
        """Timer target: flush the buffer, then re-arm the timer."""
        self.process_release_buffer()
        self._schedule_batch_flush()

    def start_inventory_release_processor(self):
        """Start the batch buffer, the flush timer, and the release consumers."""
        # Buffer shared between the consumer thread and the timer thread
        self.release_buffer = []
        self._buffer_lock = threading.Lock()
        # Periodic flush: the original armed a one-shot Timer that fired
        # only once; the timer is now re-armed after every flush.
        self._schedule_batch_flush()
        # One consumer per product-type release queue
        product_types = ['normal', 'hot', 'seckill', 'presale']
        for ptype in product_types:
            queue_name = f"inventory.release.{ptype}"
            self.channel.basic_consume(
                queue=queue_name,
                on_message_callback=self.process_inventory_release,
                auto_ack=False
            )
        print("库存释放处理器启动...")
        # Run the consume loop on a background thread
        consume_thread = threading.Thread(
            target=self.channel.start_consuming,
            daemon=True
        )
        consume_thread.start()

    def send_reservation_event(self, event_type: str, data: Dict):
        """Emit an inventory event (mock: real code publishes to an event bus)."""
        print(f"[库存事件] {event_type}: {json.dumps(data, indent=2)}")
# Usage example
if __name__ == "__main__":
    # Create the inventory reservation system
    inventory_system = InventoryReservationSystem('192.168.5.101')
    try:
        # Simulate order creation and inventory reservation
        order_id = "ORD20240101000001"
        order_items = [
            {
                'product_id': 'PROD_001',
                'sku_code': 'SKU_001_Black_XL',
                'quantity': 2,
                'type': 'normal',
                'user_id': 'user_123456',
                'priority': 1
            },
            {
                'product_id': 'PROD_002',
                'sku_code': 'SKU_002_White_M',
                'quantity': 1,
                'type': 'hot',
                'user_id': 'user_123456',
                'priority': 2
            },
            {
                'product_id': 'PROD_003',
                'sku_code': 'SKU_003_Red_S',
                'quantity': 3,
                'type': 'seckill',
                'user_id': 'user_123456',
                'priority': 3
            }
        ]
        print(f"创建订单: {order_id}")
        # Reserve the inventory
        success, result = inventory_system.reserve_inventory(order_id, order_items)
        if success:
            print(f"库存预占成功: {result['reservation_id']}")
            # Simulate the user paying
            import time
            time.sleep(3)
            print("用户完成支付...")
            # Confirm the reservation
            confirm_success = inventory_system.confirm_inventory(
                order_id,
                result['reservation_id']
            )
            if confirm_success:
                print("库存确认成功")
            else:
                print("库存确认失败")
        else:
            print(f"库存预占失败: {result['error']}")
        # Keep running so TTL-expired releases can be observed
        time.sleep(120)
    except KeyboardInterrupt:
        print("程序退出")
    finally:
        inventory_system.connection.close()
四、促销定时开始方案
4.1 高精度定时促销系统
promotion_scheduler.py
import pika
import json
import time
import asyncio
import aiohttp
from datetime import datetime, timedelta
from typing import Dict, List, Optional
from concurrent.futures import ThreadPoolExecutor
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
import redis
class PromotionScheduler:
    """促销定时开始系统 (promotion start scheduler).

    Schedules promotion "start" events through the RabbitMQ delayed-message
    exchange plugin, corrects local clock drift via periodic NTP sync, and
    registers APScheduler jobs to verify that scheduled promotions fire.
    """

    def __init__(self, rabbitmq_hosts: list, redis_host: str):
        """
        初始化促销定时系统

        Args:
            rabbitmq_hosts: RabbitMQ集群节点 (list of {'host': str, 'port': int})
            redis_host: Redis地址
        """
        self.rabbitmq_hosts = rabbitmq_hosts
        self.redis_client = redis.Redis(
            host=redis_host,
            port=6379,
            db=1,  # 使用不同的数据库 — isolates promotion keys from other scenes
            decode_responses=True
        )
        # 促销配置
        self.promotion_config = {
            'preheat_minutes': 5,  # 预热时间(分钟)
            'broadcast_strategy': {
                'batch_size': 100,
                'parallel_workers': 10,
                'retry_policy': {
                    'max_retries': 3,
                    'retry_delay': [1, 5, 10]  # 秒
                }
            },
            'time_sync': {
                'ntp_server': 'pool.ntp.org',
                'sync_interval': 60,  # 秒
                'max_offset_ms': 100  # 最大时间偏移
            }
        }
        # 初始化连接
        self.init_connections()
        # 初始化时间同步
        self.init_time_sync()
        # 启动定时任务调度器
        self.scheduler = BackgroundScheduler()
        self.scheduler.start()

    def init_connections(self):
        """初始化连接 — try each cluster node in order, keep the first that works."""
        credentials = pika.PlainCredentials('promotion_user', 'PromotionPass@2024')
        self.connection = None
        for host_info in self.rabbitmq_hosts:
            try:
                parameters = pika.ConnectionParameters(
                    host=host_info['host'],
                    port=host_info['port'],
                    credentials=credentials,
                    heartbeat=300
                )
                self.connection = pika.BlockingConnection(parameters)
                self.channel = self.connection.channel()
                print(f"✅ 连接到RabbitMQ: {host_info['host']}")
                break
            except Exception as e:
                print(f"❌ 连接失败 {host_info['host']}: {e}")
                continue
        if self.connection is None:
            # Fail fast: otherwise the first publish dies with AttributeError.
            raise ConnectionError("无法连接到任何RabbitMQ节点")
        # 设置促销交换机
        self.setup_promotion_exchange()

    def setup_promotion_exchange(self):
        """设置促销交换机: delayed exchange + broadcast fanout + result queue."""
        # 1. 促销延迟交换机 (requires rabbitmq_delayed_message_exchange plugin)
        exchange_args = {
            'x-delayed-type': 'topic',
            'x-max-in-memory-length': 5000
        }
        self.channel.exchange_declare(
            exchange='promotion.delayed',
            exchange_type='x-delayed-message',
            durable=True,
            arguments=exchange_args
        )
        # 2. 促销广播交换机
        self.channel.exchange_declare(
            exchange='promotion.broadcast',
            exchange_type='fanout',
            durable=True
        )
        # 3. 促销结果队列
        self.channel.queue_declare(
            queue='promotion.result',
            durable=True
        )
        print("✅ 促销交换机设置完成")

    def init_time_sync(self):
        """初始化时间同步 — daemon thread polling NTP, recording offset in Redis."""
        import ntplib
        from threading import Thread

        def sync_time():
            while True:
                try:
                    client = ntplib.NTPClient()
                    response = client.request(
                        self.promotion_config['time_sync']['ntp_server']
                    )
                    # 计算时间偏移
                    offset_ms = response.offset * 1000
                    if abs(offset_ms) > self.promotion_config['time_sync']['max_offset_ms']:
                        print(f"⚠️ 时间偏移过大: {offset_ms:.2f}ms")
                    # 更新时间戳基准 (read by calculate_delay_to_target)
                    self.time_offset = offset_ms
                    # 记录时间同步状态
                    self.redis_client.setex(
                        'promotion:time_sync',
                        300,  # 5分钟
                        json.dumps({
                            'offset_ms': offset_ms,
                            'sync_time': datetime.now().isoformat(),
                            'ntp_server': self.promotion_config['time_sync']['ntp_server']
                        })
                    )
                    time.sleep(self.promotion_config['time_sync']['sync_interval'])
                except Exception as e:
                    print(f"时间同步失败: {e}")
                    time.sleep(10)

        # 启动时间同步线程
        time_sync_thread = Thread(target=sync_time, daemon=True)
        time_sync_thread.start()
        print("✅ 时间同步启动")

    def calculate_delay_to_target(self, target_time: datetime) -> int:
        """
        计算到目标时间的延迟

        Args:
            target_time: 目标时间
        Returns:
            延迟毫秒数 (clamped to >= 0)
        """
        now = datetime.now()
        # 考虑时间偏移 — attribute only exists after the first NTP sync
        if hasattr(self, 'time_offset'):
            now = now + timedelta(milliseconds=self.time_offset)
        # 计算延迟
        delay = (target_time - now).total_seconds() * 1000  # 转换为毫秒
        # 确保延迟为正数
        return max(int(delay), 0)

    def schedule_promotion_start(self, promotion_id: str,
                                 start_time: datetime,
                                 promotion_data: Dict) -> bool:
        """
        调度促销开始

        Args:
            promotion_id: 促销ID
            start_time: 开始时间
            promotion_data: 促销数据
        Returns:
            是否调度成功
        """
        try:
            # 1. 计算延迟
            delay_ms = self.calculate_delay_to_target(start_time)
            if delay_ms <= 0:
                print(f"⚠️ 促销 {promotion_id} 开始时间已过或即将开始")
                return False
            # 2. 创建促销消息
            promotion_message = {
                'promotion_id': promotion_id,
                'action': 'start',
                'scheduled_time': start_time.isoformat(),
                'actual_delay_ms': delay_ms,
                'promotion_data': promotion_data,
                'metadata': {
                    'created_at': datetime.now().isoformat(),
                    'source': 'promotion_scheduler',
                    'preheated': False
                }
            }
            # 3. 预热处理(提前5分钟)
            preheat_delay = delay_ms - (self.promotion_config['preheat_minutes'] * 60 * 1000)
            if preheat_delay > 0:
                # 发送预热消息
                self.send_preheat_message(promotion_id, preheat_delay, promotion_data)
            # 4. 发送开始消息
            properties = pika.BasicProperties(
                headers={'x-delay': delay_ms},
                delivery_mode=2,
                content_type='application/json',
                timestamp=int(time.time()),
                message_id=f"promo_{promotion_id}_{int(time.time()*1000)}",
                expiration=str(delay_ms + 60000)  # 延迟+1分钟
            )
            self.channel.basic_publish(
                exchange='promotion.delayed',
                routing_key=f'promotion.start.{promotion_id}',
                body=json.dumps(promotion_message, ensure_ascii=False),
                properties=properties,
                mandatory=True
            )
            # 5. 记录调度状态
            self.record_schedule_status(promotion_id, {
                'scheduled': True,
                'start_time': start_time.isoformat(),
                'delay_ms': delay_ms,
                'scheduled_at': datetime.now().isoformat(),
                'status': 'pending'
            })
            print(f"✅ 促销调度成功: {promotion_id}, 开始时间: {start_time}, 延迟: {delay_ms}ms")
            # 6. 添加监控任务
            self.add_promotion_monitor(promotion_id, start_time)
            return True
        except Exception as e:
            print(f"❌ 促销调度失败: {e}")
            self.record_schedule_status(promotion_id, {
                'scheduled': False,
                'error': str(e),
                'failed_at': datetime.now().isoformat()
            })
            return False

    def send_preheat_message(self, promotion_id: str, delay_ms: int,
                             promotion_data: Dict):
        """发送预热消息 — fires preheat_minutes before the actual start."""
        preheat_message = {
            'promotion_id': promotion_id,
            'action': 'preheat',
            'promotion_data': promotion_data,
            'metadata': {
                'preheat_time': datetime.now().isoformat(),
                'actual_start_delay': delay_ms + (self.promotion_config['preheat_minutes'] * 60 * 1000)
            }
        }
        properties = pika.BasicProperties(
            headers={'x-delay': delay_ms},
            delivery_mode=2,
            content_type='application/json'
        )
        self.channel.basic_publish(
            exchange='promotion.delayed',
            routing_key=f'promotion.preheat.{promotion_id}',
            body=json.dumps(preheat_message, ensure_ascii=False),
            properties=properties
        )
        print(f"🔥 预热消息已发送: {promotion_id}, 延迟: {delay_ms}ms")

    def record_schedule_status(self, promotion_id: str, status_data: Dict):
        """记录调度状态 to Redis with a 7-day TTL."""
        redis_key = f"promotion:schedule:{promotion_id}"
        self.redis_client.setex(
            redis_key,
            7 * 24 * 60 * 60,  # 保留7天
            json.dumps(status_data)
        )

    def add_promotion_monitor(self, promotion_id: str, start_time: datetime):
        """添加促销监控 — APScheduler job 1 minute before the start time."""
        # 使用APScheduler添加定时监控任务
        monitor_time = start_time - timedelta(minutes=1)  # 提前1分钟监控
        self.scheduler.add_job(
            self.check_promotion_status,
            CronTrigger(
                year=monitor_time.year,
                month=monitor_time.month,
                day=monitor_time.day,
                hour=monitor_time.hour,
                minute=monitor_time.minute
            ),
            args=[promotion_id],
            id=f"monitor_{promotion_id}",
            replace_existing=True
        )

    def check_promotion_status(self, promotion_id: str):
        """检查促销状态 — alert if still 'pending' just before start."""
        print(f"检查促销状态: {promotion_id}")
        # 检查Redis中的调度状态
        redis_key = f"promotion:schedule:{promotion_id}"
        status_data = self.redis_client.get(redis_key)
        if status_data:
            status = json.loads(status_data)
            if status.get('status') == 'pending':
                # 发送告警
                self.send_promotion_alert(promotion_id, 'pending_at_start_time')

    def send_promotion_alert(self, promotion_id: str, alert_type: str):
        """发送促销告警 to the 'promotion.alerts' queue via the default exchange."""
        alert_message = {
            'type': alert_type,
            'promotion_id': promotion_id,
            'alert_time': datetime.now().isoformat(),
            'severity': 'warning'
        }
        self.channel.basic_publish(
            exchange='',
            routing_key='promotion.alerts',
            body=json.dumps(alert_message),
            properties=pika.BasicProperties(
                delivery_mode=2,
                content_type='application/json'
            )
        )
        print(f"🚨 促销告警: {promotion_id} - {alert_type}")

    async def broadcast_promotion_start(self, promotion_id: str,
                                        promotion_data: Dict):
        """
        广播促销开始(异步批量处理)

        Args:
            promotion_id: 促销ID
            promotion_data: 促销数据
        """
        # 1. 获取需要通知的服务列表
        services = await self.get_promotion_services()
        # 2. 批量广播
        batch_size = self.promotion_config['broadcast_strategy']['batch_size']
        parallel_workers = self.promotion_config['broadcast_strategy']['parallel_workers']
        # 分批处理
        for i in range(0, len(services), batch_size):
            batch = services[i:i + batch_size]
            # 并行通知
            async with aiohttp.ClientSession() as session:
                tasks = []
                for service in batch:
                    task = self.notify_service_async(
                        session, service, promotion_id, promotion_data
                    )
                    tasks.append(task)
                # 等待所有通知完成
                results = await asyncio.gather(*tasks, return_exceptions=True)
            # 处理结果
            success_count = sum(1 for r in results if r is True)
            fail_count = len(results) - success_count
            print(f"批次 {i//batch_size + 1}: 成功 {success_count}, 失败 {fail_count}")
            # 记录广播结果
            self.record_broadcast_result(
                promotion_id,
                batch,
                results
            )
        # 3. 发送广播完成事件
        self.send_promotion_event('promotion_broadcast_completed', {
            'promotion_id': promotion_id,
            'total_services': len(services),
            'timestamp': datetime.now().isoformat()
        })

    async def get_promotion_services(self) -> List[str]:
        """获取需要通知的服务列表(模拟)."""
        # 实际应该从服务注册中心获取
        await asyncio.sleep(0.1)  # 模拟网络延迟
        return [
            'price-service',
            'inventory-service',
            'coupon-service',
            'cart-service',
            'recommend-service',
            'search-service',
            'notification-service',
            'activity-service',
            'point-service',
            'logistics-service'
        ]

    async def notify_service_async(self, session: aiohttp.ClientSession,
                                   service: str,
                                   promotion_id: str,
                                   promotion_data: Dict) -> bool:
        """
        异步通知服务 (with per-service retries).

        Args:
            session: aiohttp会话
            service: 服务名称
            promotion_id: 促销ID
            promotion_data: 促销数据
        Returns:
            是否通知成功
        """
        max_retries = self.promotion_config['broadcast_strategy']['retry_policy']['max_retries']
        retry_delays = self.promotion_config['broadcast_strategy']['retry_policy']['retry_delay']
        for retry in range(max_retries):
            try:
                # 构建请求URL(模拟)
                url = f"http://{service}.internal/promotion/start"
                async with session.post(url, json={
                    'promotion_id': promotion_id,
                    'promotion_data': promotion_data
                }) as response:
                    if response.status == 200:
                        print(f"✅ 通知服务成功: {service}")
                        return True
                    else:
                        raise Exception(f"HTTP {response.status}")
                # NOTE: the original draft had an unreachable "simulate success"
                # branch here; the async-with above always returns or raises.
            except Exception as e:
                print(f"❌ 通知服务失败 {service} (重试 {retry+1}/{max_retries}): {e}")
                if retry < max_retries - 1:
                    # 等待重试 — clamp index so delay list may be shorter than retries
                    delay = retry_delays[min(retry, len(retry_delays) - 1)]
                    await asyncio.sleep(delay)
        # 最终失败
        return False

    def record_broadcast_result(self, promotion_id: str,
                                services: List[str],
                                results: List):
        """记录广播结果 for one batch to Redis (24h TTL)."""
        redis_key = f"promotion:broadcast:{promotion_id}"
        result_data = {
            'services': services,
            'results': [
                {
                    'service': service,
                    'success': result is True,
                    'timestamp': datetime.now().isoformat()
                }
                for service, result in zip(services, results)
            ],
            'broadcast_time': datetime.now().isoformat()
        }
        self.redis_client.setex(
            redis_key,
            24 * 60 * 60,  # 保留24小时
            json.dumps(result_data)
        )

    def send_promotion_event(self, event_type: str, data: Dict):
        """发送促销事件 to every queue bound to the fanout exchange."""
        self.channel.basic_publish(
            exchange='promotion.broadcast',
            routing_key='',
            body=json.dumps({
                'type': event_type,
                'data': data,
                'timestamp': datetime.now().isoformat()
            }),
            properties=pika.BasicProperties(
                delivery_mode=2,
                content_type='application/json'
            )
        )
# 使用示例
if __name__ == "__main__":
    # 配置
    rabbitmq_hosts = [
        {'host': '192.168.5.101', 'port': 5672},
        {'host': '192.168.5.102', 'port': 5672},
        {'host': '192.168.5.103', 'port': 5672}
    ]
    redis_host = '192.168.5.100'
    # 创建促销调度器
    scheduler = PromotionScheduler(rabbitmq_hosts, redis_host)
    try:
        # 模拟创建促销活动
        promotion_id = "PROMO_20240101_001"
        # 设置开始时间(5分钟后)
        start_time = datetime.now() + timedelta(minutes=5)
        promotion_data = {
            'name': '元旦大促',
            'type': 'discount',
            'discount_rate': 0.3,  # 7折
            'products': [
                {'id': 'PROD_001', 'category': 'electronics'},
                {'id': 'PROD_002', 'category': 'clothing'},
                {'id': 'PROD_003', 'category': 'home'}
            ],
            'rules': {
                'min_amount': 100,
                'max_discount': 500,
                'user_level': ['vip', 'normal']
            },
            'duration_hours': 48
        }
        print(f"创建促销活动: {promotion_id}")
        print(f"计划开始时间: {start_time}")
        # 调度促销开始
        success = scheduler.schedule_promotion_start(
            promotion_id, start_time, promotion_data
        )
        if success:
            print(f"促销调度成功,等待开始...")
            # 异步广播测试
            import asyncio

            async def test_broadcast():
                await scheduler.broadcast_promotion_start(
                    "TEST_PROMO",
                    {'test': True}
                )

            # 在另一个线程中运行异步测试
            import threading

            def run_async():
                asyncio.run(test_broadcast())

            broadcast_thread = threading.Thread(target=run_async, daemon=True)
            broadcast_thread.start()
            # 保持运行以观察调度
            import time
            time.sleep(400)  # 等待6分40秒(包含预热和开始)
        else:
            print("促销调度失败")
    except KeyboardInterrupt:
        print("程序退出")
    finally:
        # Best-effort cleanup: attributes may be missing if __init__ failed early.
        if hasattr(scheduler, 'scheduler'):
            scheduler.scheduler.shutdown()
        if hasattr(scheduler, 'connection'):
            scheduler.connection.close()
五、综合业务场景集成方案
5.1 电商延迟消息统一网关
ecommerce_delay_gateway.py
import pika
import json
import time
import uuid
from datetime import datetime, timedelta
from typing import Dict, Any, Optional
from enum import Enum
import redis
from dataclasses import dataclass, asdict
from abc import ABC, abstractmethod
class DelayScene(Enum):
    """延迟场景枚举 — the closed set of e-commerce delay use cases."""
    ORDER_CANCEL = "order_cancel"            # 订单取消
    INVENTORY_RELEASE = "inventory_release"  # 库存释放
    PROMOTION_START = "promotion_start"      # 促销开始
    COUPON_EXPIRE = "coupon_expire"          # 优惠券过期
    USER_NOTIFICATION = "user_notification"  # 用户通知
    REVIEW_REMINDER = "review_reminder"      # 评价提醒
    LOGISTICS_UPDATE = "logistics_update"    # 物流更新
@dataclass
class DelayMessage:
    """延迟消息基础类 — the unit handed to a DelayStrategy."""
    scene: DelayScene          # which business scenario this message belongs to
    message_id: str            # globally unique message id
    business_id: str           # 业务ID(订单ID、促销ID等)
    payload: Dict[str, Any]    # scenario-specific data
    delay_ms: int              # delay in milliseconds
    metadata: Dict[str, Any]   # bookkeeping (created_at, gateway version, ...)

    def to_dict(self):
        """转换为字典 — manual (not asdict) so the enum serializes as its value."""
        return {
            'scene': self.scene.value,
            'message_id': self.message_id,
            'business_id': self.business_id,
            'payload': self.payload,
            'delay_ms': self.delay_ms,
            'metadata': self.metadata
        }
class DelayStrategy(ABC):
    """延迟策略基类 — contract every concrete delay mechanism implements."""

    @abstractmethod
    def send(self, message: DelayMessage) -> bool:
        """发送延迟消息; returns True on success."""

    @abstractmethod
    def cancel(self, message_id: str) -> bool:
        """取消延迟消息; returns True on success."""

    @abstractmethod
    def get_status(self, message_id: str) -> Dict:
        """获取消息状态 as a dict (at least 'strategy' and 'status')."""
class PluginDelayStrategy(DelayStrategy):
    """插件延迟策略(高精度) — relies on the x-delayed-message exchange plugin."""

    def __init__(self, rabbitmq_host: str):
        self.connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=rabbitmq_host)
        )
        self.channel = self.connection.channel()
        # 设置延迟交换机
        self.channel.exchange_declare(
            exchange='delay.plugin',
            exchange_type='x-delayed-message',
            durable=True,
            arguments={'x-delayed-type': 'direct'}
        )

    def send(self, message: DelayMessage) -> bool:
        """发送延迟消息 — the delay rides in the 'x-delay' header."""
        try:
            properties = pika.BasicProperties(
                headers={'x-delay': message.delay_ms},
                delivery_mode=2,  # persistent
                content_type='application/json',
                message_id=message.message_id,
                timestamp=int(time.time())
            )
            self.channel.basic_publish(
                exchange='delay.plugin',
                routing_key=message.scene.value,
                body=json.dumps(message.to_dict()),
                properties=properties
            )
            return True
        except Exception as e:
            print(f"插件延迟发送失败: {e}")
            return False

    def cancel(self, message_id: str) -> bool:
        # 插件延迟需要通过取消命令队列处理 (not implemented in this demo)
        return True

    def get_status(self, message_id: str) -> Dict:
        return {'strategy': 'plugin', 'status': 'unknown'}
class TTLDelayStrategy(DelayStrategy):
    """TTL延迟策略(可靠) — per-message TTL + dead-letter exchange.

    Declares a holding queue per (scene, business_id); when the message TTL
    expires the broker dead-letters it to 'delay.dlx' for real consumption.
    """

    def __init__(self, rabbitmq_host: str):
        self.connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=rabbitmq_host)
        )
        self.channel = self.connection.channel()

    def send(self, message: DelayMessage) -> bool:
        """发送延迟消息 through a self-expiring TTL holding queue."""
        try:
            # 创建TTL队列 (x-expires auto-deletes it shortly after the TTL fires)
            queue_name = f"delay.ttl.{message.scene.value}.{message.business_id}"
            queue_args = {
                'x-dead-letter-exchange': 'delay.dlx',
                'x-dead-letter-routing-key': message.scene.value,
                'x-message-ttl': message.delay_ms,
                'x-expires': message.delay_ms + 60000
            }
            self.channel.queue_declare(
                queue=queue_name,
                durable=True,
                arguments=queue_args
            )
            # 发送消息
            properties = pika.BasicProperties(
                delivery_mode=2,
                content_type='application/json',
                message_id=message.message_id,
                expiration=str(message.delay_ms)
            )
            self.channel.basic_publish(
                exchange='',
                routing_key=queue_name,
                body=json.dumps(message.to_dict()),
                properties=properties
            )
            return True
        except Exception as e:
            print(f"TTL延迟发送失败: {e}")
            return False

    def cancel(self, message_id: str) -> bool:
        # TTL延迟可以通过删除队列来取消 (not implemented in this demo)
        return True

    def get_status(self, message_id: str) -> Dict:
        return {'strategy': 'ttl', 'status': 'unknown'}
class EcommerceDelayGateway:
    """电商延迟消息统一网关 — routes each scene to the right delay strategy."""

    def __init__(self, config: Dict):
        """
        初始化延迟网关

        Args:
            config: 配置字典 with 'rabbitmq_host' and 'redis_host'
        """
        self.config = config
        self.redis_client = redis.Redis(
            host=config['redis_host'],
            port=6379,
            db=2,
            decode_responses=True
        )
        # 初始化策略
        self.strategies = {
            'plugin': PluginDelayStrategy(config['rabbitmq_host']),
            'ttl': TTLDelayStrategy(config['rabbitmq_host'])
        }
        # 场景策略映射
        self.scene_strategy_map = {
            DelayScene.ORDER_CANCEL: 'plugin',
            DelayScene.INVENTORY_RELEASE: 'ttl',
            DelayScene.PROMOTION_START: 'plugin',
            DelayScene.COUPON_EXPIRE: 'ttl',
            DelayScene.USER_NOTIFICATION: 'plugin',
            DelayScene.REVIEW_REMINDER: 'ttl',
            DelayScene.LOGISTICS_UPDATE: 'ttl'
        }
        # 初始化死信处理
        self.setup_dlx_handlers()

    def setup_dlx_handlers(self):
        """设置死信处理器 — 实际实现应该为每个场景设置死信处理器."""
        pass

    def select_strategy(self, scene: DelayScene, delay_ms: int) -> str:
        """
        选择延迟策略

        Args:
            scene: 延迟场景
            delay_ms: 延迟毫秒数
        Returns:
            策略名称 ('plugin' or 'ttl')
        """
        # 规则1: 超长延迟使用TTL
        if delay_ms > 7 * 24 * 60 * 60 * 1000:  # 超过7天
            return 'ttl'
        # 规则2: 高精度场景使用插件
        if scene in (DelayScene.ORDER_CANCEL, DelayScene.PROMOTION_START):
            return 'plugin'
        # 规则3: 默认使用映射策略
        return self.scene_strategy_map.get(scene, 'ttl')

    def create_delay_message(self, scene: DelayScene,
                             business_id: str,
                             payload: Dict,
                             delay_seconds: int,
                             metadata: Optional[Dict] = None) -> DelayMessage:
        """
        创建延迟消息

        Args:
            scene: 延迟场景
            business_id: 业务ID
            payload: 消息负载
            delay_seconds: 延迟秒数
            metadata: 元数据 (optional; gateway stamps are merged in)
        Returns:
            延迟消息对象
        """
        # Underscore-separated id: scene_businessid_random8
        message_id = f"{scene.value}_{business_id}_{uuid.uuid4().hex[:8]}"
        if metadata is None:
            metadata = {}
        metadata.update({
            'created_at': datetime.now().isoformat(),
            'gateway_version': '1.0.0'
        })
        return DelayMessage(
            scene=scene,
            message_id=message_id,
            business_id=business_id,
            payload=payload,
            delay_ms=delay_seconds * 1000,
            metadata=metadata
        )

    def send_delay_message(self, scene: DelayScene,
                           business_id: str,
                           payload: Dict,
                           delay_seconds: int,
                           strategy: Optional[str] = None) -> Dict:
        """
        发送延迟消息

        Args:
            scene: 延迟场景
            business_id: 业务ID
            payload: 消息负载
            delay_seconds: 延迟秒数
            strategy: 指定策略(可选; auto-selected when None)
        Returns:
            发送结果 dict with 'success' and details
        """
        try:
            # 1. 创建消息
            message = self.create_delay_message(
                scene, business_id, payload, delay_seconds
            )
            # 2. 选择策略
            if strategy is None:
                strategy = self.select_strategy(scene, message.delay_ms)
            if strategy not in self.strategies:
                return {
                    'success': False,
                    'error': f'未知策略: {strategy}'
                }
            # 3. 发送消息
            delay_strategy = self.strategies[strategy]
            success = delay_strategy.send(message)
            if success:
                # 4. 记录发送状态
                self.record_message_status(message, strategy, 'sent')
                # 5. 发送监控事件
                self.send_monitoring_event('delay_message_sent', {
                    'message_id': message.message_id,
                    'scene': scene.value,
                    'strategy': strategy,
                    'delay_seconds': delay_seconds,
                    'timestamp': datetime.now().isoformat()
                })
                return {
                    'success': True,
                    'message_id': message.message_id,
                    'strategy': strategy,
                    'estimated_delivery': (
                        datetime.now() +
                        timedelta(seconds=delay_seconds)
                    ).isoformat()
                }
            else:
                return {
                    'success': False,
                    'error': '发送失败'
                }
        except Exception as e:
            print(f"发送延迟消息异常: {e}")
            return {
                'success': False,
                'error': str(e)
            }

    def cancel_delay_message(self, message_id: str) -> Dict:
        """
        取消延迟消息

        Args:
            message_id: 消息ID
        Returns:
            取消结果 dict with 'success' (and 'error' on failure)
        """
        try:
            # 1. 获取消息信息
            message_info = self.get_message_info(message_id)
            if not message_info:
                return {
                    'success': False,
                    'error': '消息不存在'
                }
            # 2. 根据策略取消
            strategy = message_info.get('strategy')
            if strategy in self.strategies:
                success = self.strategies[strategy].cancel(message_id)
            else:
                success = False
            if success:
                # 3. 更新状态
                self.record_message_status(
                    None, strategy, 'cancelled',
                    message_id=message_id
                )
                # 4. 发送监控事件
                self.send_monitoring_event('delay_message_cancelled', {
                    'message_id': message_id,
                    'timestamp': datetime.now().isoformat()
                })
                return {'success': True}
            else:
                return {
                    'success': False,
                    'error': '取消失败'
                }
        except Exception as e:
            return {
                'success': False,
                'error': str(e)
            }

    def record_message_status(self, message: Optional[DelayMessage],
                              strategy: str,
                              status: str,
                              message_id: Optional[str] = None):
        """记录消息状态 to Redis (30-day TTL). Accepts either the full message
        or just a message_id (for status-only updates such as cancellation)."""
        if message_id is None and message is not None:
            message_id = message.message_id
        status_data = {
            'status': status,
            'strategy': strategy,
            'update_time': datetime.now().isoformat()
        }
        if message is not None:
            status_data['message'] = message.to_dict()
        redis_key = f"delay:message:{message_id}"
        self.redis_client.setex(
            redis_key,
            30 * 24 * 60 * 60,  # 保留30天
            json.dumps(status_data)
        )

    def get_message_info(self, message_id: str) -> Optional[Dict]:
        """获取消息信息 from Redis; None when unknown or expired."""
        redis_key = f"delay:message:{message_id}"
        data = self.redis_client.get(redis_key)
        if data:
            return json.loads(data)
        return None

    def send_monitoring_event(self, event_type: str, data: Dict):
        """发送监控事件 — 实际实现应该发送到监控系统."""
        print(f"[监控] {event_type}: {json.dumps(data, indent=2)}")

    def get_statistics(self) -> Dict:
        """获取统计信息 — per-scene / per-strategy message counts.

        NOTE(review): KEYS is O(N) and blocks Redis; acceptable for a demo,
        use SCAN in production.
        """
        statistics = {}
        for scene in DelayScene:
            pattern = f"delay:message:*{scene.value}*"
            keys = self.redis_client.keys(pattern)
            scene_stats = {
                'total': len(keys),
                'strategies': {}
            }
            # 按策略统计 — guard against keys expiring between KEYS and GET
            for strategy in self.strategies.keys():
                count = 0
                for k in keys:
                    data = self.redis_client.get(k)
                    if data and strategy in data:
                        count += 1
                scene_stats['strategies'][strategy] = count
            statistics[scene.value] = scene_stats
        return statistics
# 使用示例
if __name__ == "__main__":
    # 配置
    config = {
        'rabbitmq_host': '192.168.5.101',
        'redis_host': '192.168.5.100'
    }
    # 创建延迟网关
    gateway = EcommerceDelayGateway(config)
    # 模拟电商业务场景
    # 场景1: 订单取消(15分钟)
    print("\n=== 场景1: 订单取消 ===")
    order_result = gateway.send_delay_message(
        scene=DelayScene.ORDER_CANCEL,
        business_id="ORD20240101000001",
        payload={
            'order_type': 'normal',
            'amount': 199.99,
            'user_id': 'user_123456'
        },
        delay_seconds=15 * 60  # 15分钟
    )
    print(f"订单取消延迟: {order_result}")
    # 场景2: 库存释放(10分钟)
    print("\n=== 场景2: 库存释放 ===")
    inventory_result = gateway.send_delay_message(
        scene=DelayScene.INVENTORY_RELEASE,
        business_id="RES20240101000001",
        payload={
            'product_id': 'PROD_001',
            'quantity': 2,
            'reservation_type': 'order'
        },
        delay_seconds=10 * 60  # 10分钟
    )
    print(f"库存释放延迟: {inventory_result}")
    # 场景3: 促销开始(5分钟)
    print("\n=== 场景3: 促销开始 ===")
    promotion_result = gateway.send_delay_message(
        scene=DelayScene.PROMOTION_START,
        business_id="PROMO20240101001",
        payload={
            'name': '元旦大促',
            'discount': 0.3,
            'scope': 'all_products'
        },
        delay_seconds=5 * 60  # 5分钟
    )
    print(f"促销开始延迟: {promotion_result}")
    # 场景4: 优惠券过期(7天)
    print("\n=== 场景4: 优惠券过期 ===")
    coupon_result = gateway.send_delay_message(
        scene=DelayScene.COUPON_EXPIRE,
        business_id="COUPON20240101001",
        payload={
            'coupon_code': 'NEWYEAR2024',
            'user_id': 'user_123456',
            'face_value': 50
        },
        delay_seconds=7 * 24 * 60 * 60  # 7天
    )
    print(f"优惠券过期延迟: {coupon_result}")
    # 场景5: 用户通知(用户指定时间)
    print("\n=== 场景5: 用户通知 ===")
    notification_result = gateway.send_delay_message(
        scene=DelayScene.USER_NOTIFICATION,
        business_id="USER_NOTIFY_001",
        payload={
            'user_id': 'user_123456',
            'notification_type': 'birthday_wish',
            'content': '生日快乐!'
        },
        delay_seconds=24 * 60 * 60  # 24小时后(生日祝福)
    )
    print(f"用户通知延迟: {notification_result}")
    # 获取统计信息
    print("\n=== 统计信息 ===")
    stats = gateway.get_statistics()
    for scene, scene_stats in stats.items():
        print(f"{scene}: {scene_stats['total']} 条消息")
        for strategy, count in scene_stats['strategies'].items():
            print(f"  {strategy}: {count}")
    # 取消一个消息(示例)
    if order_result['success']:
        print("\n=== 取消订单延迟 ===")
        cancel_result = gateway.cancel_delay_message(
            order_result['message_id']
        )
        print(f"取消结果: {cancel_result}")
5.2 电商延迟队列运维管理平台
delay_queue_manager.py
from flask import Flask, render_template, jsonify, request
import json
from datetime import datetime, timedelta
import redis
import pika

# Flask(name) in the draft was a markdown-mangled Flask(__name__).
app = Flask(__name__)
class DelayQueueManager:
    """延迟队列管理平台 — aggregates queue, business, and alert data."""

    def __init__(self):
        self.redis_client = redis.Redis(
            host='192.168.5.100',
            port=6379,
            db=3,
            decode_responses=True
        )
        # 连接RabbitMQ管理API (HTTP management plugin, port 15672)
        self.rabbitmq_host = '192.168.5.101'
        self.rabbitmq_port = 15672
        self.rabbitmq_user = 'admin'
        self.rabbitmq_pass = 'DelayAdmin@2024'

    def get_queue_stats(self):
        """获取队列统计信息 — delay-related queues from the management API."""
        import requests
        try:
            response = requests.get(
                f'http://{self.rabbitmq_host}:{self.rabbitmq_port}/api/queues',
                auth=(self.rabbitmq_user, self.rabbitmq_pass),
                timeout=5  # don't hang the dashboard if the broker is down
            )
            if response.status_code != 200:
                return []
            queues = response.json()
            # 过滤延迟相关队列
            delay_queues = []
            for queue in queues:
                if any(keyword in queue['name'].lower()
                       for keyword in ['delay', 'dlx', 'cancel', 'reserve']):
                    delay_queues.append({
                        'name': queue['name'],
                        'messages': queue.get('messages', 0),
                        'messages_ready': queue.get('messages_ready', 0),
                        'messages_unacknowledged': queue.get('messages_unacknowledged', 0),
                        'state': queue.get('state', 'unknown'),
                        'type': queue.get('type', 'classic')
                    })
            return delay_queues
        except Exception as e:
            print(f"获取队列统计失败: {e}")
            return []

    def get_business_stats(self):
        """获取业务统计信息 — per-scene stats stored under delay:stats:<scene>."""
        stats_keys = self.redis_client.keys('delay:stats:*')
        business_stats = {}
        for key in stats_keys:
            scene = key.split(':')[2]  # key layout: delay:stats:<scene>
            stats_data = self.redis_client.get(key)
            if stats_data:
                business_stats[scene] = json.loads(stats_data)
        return business_stats

    def get_alerts(self):
        """获取告警信息 — backlog thresholds plus per-scene error rates."""
        alerts = []
        # 检查积压告警
        queues = self.get_queue_stats()
        for queue in queues:
            if queue['messages'] > 10000:
                alerts.append({
                    'level': 'critical',
                    'message': f"队列 {queue['name']} 积压严重: {queue['messages']} 条消息",
                    'time': datetime.now().isoformat()
                })
            elif queue['messages'] > 5000:
                alerts.append({
                    'level': 'warning',
                    'message': f"队列 {queue['name']} 积压: {queue['messages']} 条消息",
                    'time': datetime.now().isoformat()
                })
        # 检查业务异常 (error rate above 5%)
        stats = self.get_business_stats()
        for scene, scene_stats in stats.items():
            if 'error_rate' in scene_stats and scene_stats['error_rate'] > 0.05:
                alerts.append({
                    'level': 'warning',
                    'message': f"场景 {scene} 错误率过高: {scene_stats['error_rate']*100:.2f}%",
                    'time': datetime.now().isoformat()
                })
        return alerts
@app.route('/')
def dashboard():
    """仪表板页面 — aggregated view of queues, business stats, and alerts."""
    manager = DelayQueueManager()
    # 获取数据
    queue_stats = manager.get_queue_stats()
    business_stats = manager.get_business_stats()
    alerts = manager.get_alerts()
    # 计算汇总指标
    total_messages = sum(q['messages'] for q in queue_stats)
    total_queues = len(queue_stats)
    # 按场景统计 — first matching keyword wins per queue
    scene_stats = {}
    for queue in queue_stats:
        for scene in ['order', 'inventory', 'promotion', 'coupon']:
            if scene in queue['name']:
                scene_stats[scene] = scene_stats.get(scene, 0) + queue['messages']
                break
    return render_template('dashboard.html',
                           total_messages=total_messages,
                           total_queues=total_queues,
                           queue_stats=queue_stats,
                           business_stats=business_stats,
                           scene_stats=scene_stats,
                           alerts=alerts)


@app.route('/api/queues')
def api_queues():
    """API: 获取队列信息"""
    manager = DelayQueueManager()
    return jsonify(manager.get_queue_stats())


@app.route('/api/business')
def api_business():
    """API: 获取业务统计"""
    manager = DelayQueueManager()
    return jsonify(manager.get_business_stats())


@app.route('/api/alerts')
def api_alerts():
    """API: 获取告警"""
    manager = DelayQueueManager()
    return jsonify(manager.get_alerts())


@app.route('/api/operations/cancel', methods=['POST'])
def api_cancel():
    """API: 取消延迟消息 — 实际实现应该调用网关的取消方法."""
    data = request.json
    message_id = data.get('message_id')
    return jsonify({
        'success': True,
        'message': f'消息 {message_id} 已取消'
    })


@app.route('/api/operations/retry', methods=['POST'])
def api_retry():
    """API: 重试失败消息 — 实际实现应该从死信队列重新发布消息."""
    data = request.json
    queue_name = data.get('queue_name')
    count = data.get('count', 10)
    return jsonify({
        'success': True,
        'message': f'从 {queue_name} 重试 {count} 条消息'
    })


@app.route('/api/monitoring/metrics')
def api_metrics():
    """API: 监控指标"""
    manager = DelayQueueManager()
    # Fetch queue stats once instead of hitting the management API twice.
    queues = manager.get_queue_stats()
    metrics = {
        'delay_queue_messages_total': sum(q['messages'] for q in queues),
        'delay_queue_count': len(queues),
        'delay_alerts_total': len(manager.get_alerts()),
        'timestamp': datetime.now().isoformat()
    }
    return jsonify(metrics)


if __name__ == '__main__':
    # NOTE(review): debug=True is unsafe in production; kept for the demo.
    app.run(host='0.0.0.0', port=5000, debug=True)
六、部署与运维指南
6.1 生产环境部署架构
docker-compose-production.yml
version: '3.8'

services:
  # RabbitMQ集群
  rabbitmq-node1:
    image: rabbitmq:3.12-management
    container_name: rabbitmq-node1
    hostname: rabbitmq-node1
    environment:
      - RABBITMQ_ERLANG_COOKIE=SECURE_COOKIE_VALUE
      - RABBITMQ_DEFAULT_USER=admin
      - RABBITMQ_DEFAULT_PASS=SecurePass2024
    volumes:
      - rabbitmq-data-node1:/var/lib/rabbitmq
      - ./config/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro
      - ./plugins/rabbitmq_delayed_message_exchange.ez:/opt/rabbitmq/plugins/rabbitmq_delayed_message_exchange.ez
    ports:
      - "5672:5672"
      - "15672:15672"
    networks:
      - rabbitmq-cluster
    deploy:
      resources:
        limits:
          memory: 4G
          cpus: '2'
        reservations:
          memory: 2G
          cpus: '1'

  rabbitmq-node2:
    image: rabbitmq:3.12-management
    container_name: rabbitmq-node2
    hostname: rabbitmq-node2
    environment:
      - RABBITMQ_ERLANG_COOKIE=SECURE_COOKIE_VALUE
      - RABBITMQ_DEFAULT_USER=admin
      - RABBITMQ_DEFAULT_PASS=SecurePass2024
    volumes:
      - rabbitmq-data-node2:/var/lib/rabbitmq
      - ./config/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro
      - ./plugins/rabbitmq_delayed_message_exchange.ez:/opt/rabbitmq/plugins/rabbitmq_delayed_message_exchange.ez
    depends_on:
      - rabbitmq-node1
    networks:
      - rabbitmq-cluster
    # Join node1's cluster after a short startup delay
    command: >
      bash -c "sleep 10 &&
      rabbitmqctl stop_app &&
      rabbitmqctl reset &&
      rabbitmqctl join_cluster rabbit@rabbitmq-node1 &&
      rabbitmqctl start_app"

  rabbitmq-node3:
    image: rabbitmq:3.12-management
    container_name: rabbitmq-node3
    hostname: rabbitmq-node3
    environment:
      - RABBITMQ_ERLANG_COOKIE=SECURE_COOKIE_VALUE
      - RABBITMQ_DEFAULT_USER=admin
      - RABBITMQ_DEFAULT_PASS=SecurePass2024
    volumes:
      - rabbitmq-data-node3:/var/lib/rabbitmq
      - ./config/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro
      - ./plugins/rabbitmq_delayed_message_exchange.ez:/opt/rabbitmq/plugins/rabbitmq_delayed_message_exchange.ez
    depends_on:
      - rabbitmq-node1
    networks:
      - rabbitmq-cluster
    command: >
      bash -c "sleep 15 &&
      rabbitmqctl stop_app &&
      rabbitmqctl reset &&
      rabbitmqctl join_cluster rabbit@rabbitmq-node1 &&
      rabbitmqctl start_app"

  # Redis集群
  redis-master:
    image: redis:7-alpine
    container_name: redis-master
    command: redis-server --requirepass RedisPass2024 --appendonly yes
    volumes:
      - redis-data-master:/data
    ports:
      - "6379:6379"
    networks:
      - rabbitmq-cluster
    deploy:
      resources:
        limits:
          memory: 1G
        reservations:
          memory: 512M

  redis-replica:
    image: redis:7-alpine
    container_name: redis-replica
    command: >
      redis-server --replicaof redis-master 6379
      --masterauth RedisPass2024
      --requirepass RedisPass2024
    depends_on:
      - redis-master
    networks:
      - rabbitmq-cluster

  # HAProxy负载均衡
  haproxy:
    image: haproxy:2.8-alpine
    container_name: haproxy
    volumes:
      - ./config/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro
    ports:
      - "5670:5670"    # AMQP负载均衡
      - "15670:15670"  # 管理界面负载均衡
      - "8888:8888"    # 监控界面
    depends_on:
      - rabbitmq-node1
      - rabbitmq-node2
      - rabbitmq-node3
    networks:
      - rabbitmq-cluster

  # 监控系统
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    volumes:
      - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - rabbitmq-cluster

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=GrafanaPass2024
    volumes:
      - grafana-data:/var/lib/grafana
      - ./config/grafana-dashboards:/etc/grafana/provisioning/dashboards:ro
    ports:
      - "3000:3000"
    depends_on:
      - prometheus
    networks:
      - rabbitmq-cluster

  # 延迟网关服务
  delay-gateway:
    build:
      context: .
      dockerfile: Dockerfile.delay-gateway
    container_name: delay-gateway
    environment:
      - RABBITMQ_HOST=haproxy
      - RABBITMQ_PORT=5670
      - REDIS_HOST=redis-master
      - REDIS_PASSWORD=RedisPass2024
    depends_on:
      - haproxy
      - redis-master
    networks:
      - rabbitmq-cluster
    deploy:
      replicas: 3
      update_config:
        parallelism: 1
        delay: 10s
      restart_policy:
        condition: on-failure

networks:
  rabbitmq-cluster:
    driver: bridge

volumes:
  rabbitmq-data-node1:
  rabbitmq-data-node2:
  rabbitmq-data-node3:
  redis-data-master:
  prometheus-data:
  grafana-data:
6.2 监控告警配置
prometheus-alerts.yml
groups:
  - name: ecommerce_delay_alerts
    rules:
      # 订单取消场景告警
      - alert: OrderCancelDelayHigh
        expr: |
          rate(delay_message_processed_total{scene="order_cancel"}[5m]) < 1
          and delay_queue_messages{queue=~".*order.*cancel.*"} > 1000
        for: 5m
        labels:
          severity: critical
          business: order
        annotations:
          summary: "订单取消延迟处理异常"
          description: |
            订单取消延迟队列积压 {{ $value }} 条消息,
            处理速率下降,可能影响用户体验

      # 库存释放成功率告警
      - alert: InventoryReleaseSuccessRateLow
        expr: |
          delay_message_success_rate{scene="inventory_release"} < 0.95
        for: 10m
        labels:
          severity: warning
          business: inventory
        annotations:
          summary: "库存释放成功率下降"
          description: "库存释放成功率降至 {{ $value | humanizePercentage }}"

      # 促销定时精度告警
      - alert: PromotionTimingAccuracyLow
        expr: |
          delay_timing_accuracy_ms{scene="promotion_start"} > 5000
        for: 5m
        labels:
          severity: warning
          business: promotion
        annotations:
          summary: "促销定时精度下降"
          description: "促销开始时间偏差 {{ $value | humanize }} 毫秒"

      # 整体积压告警
      - alert: TotalDelayBacklogHigh
        expr: |
          sum(delay_queue_messages) > 100000
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: "延迟队列总积压过高"
          description: "所有延迟队列积压 {{ $value | humanize }} 条消息"

      # 死信队列增长告警
      - alert: DeadLetterQueueGrowth
        expr: |
          rate(delay_queue_messages{queue=~".*dlq.*"}[10m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "死信队列持续增长"
          description: "死信队列消息增长速率 {{ $value | humanize }} 条/分钟"
6.3 性能压测脚本
performance_test.py
import asyncio
import aiohttp
import time
import statistics
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
class DelayQueuePerformanceTest:
    """Load-test driver for the delay-queue gateway.

    Fires batches of simulated order-cancel requests at the gateway,
    records per-request success and latency, and prints per-scenario
    statistics plus a final summary report.
    """

    def __init__(self, gateway_url: str):
        """
        Args:
            gateway_url: Base URL of the delay gateway,
                e.g. "http://localhost:8080".
        """
        self.gateway_url = gateway_url
        # One stats dict per completed scenario; filled by
        # test_order_cancel_scene and read by print_summary_report.
        self.results: list = []

    async def test_order_cancel_scene(self, concurrent_users: int,
                                      requests_per_user: int) -> dict:
        """Run one order-cancel load scenario and return its stats.

        Args:
            concurrent_users: Number of simulated users firing in parallel.
            requests_per_user: Requests sent per simulated user.

        Returns:
            Stats dict (throughput, success rate, latency percentiles);
            also appended to ``self.results``.
        """
        print(f"测试订单取消场景: {concurrent_users}并发, {requests_per_user}请求/用户")
        start_time = time.time()
        async with aiohttp.ClientSession() as session:
            tasks = []
            for user_id in range(concurrent_users):
                for req_num in range(requests_per_user):
                    task = self.send_order_cancel_request(
                        session, user_id, req_num
                    )
                    tasks.append(task)
            responses = await asyncio.gather(*tasks, return_exceptions=True)
        end_time = time.time()
        total_time = end_time - start_time
        total_requests = concurrent_users * requests_per_user
        # Analyse results: gather(return_exceptions=True) may hand back
        # raw exceptions, so only count well-formed dict results.
        success_count = sum(1 for r in responses
                            if isinstance(r, dict) and r.get('success'))
        # Collect per-request response times.
        response_times = []
        for r in responses:
            if isinstance(r, dict) and 'response_time' in r:
                response_times.append(r['response_time'])
        stats = {
            'scene': 'order_cancel',
            'concurrent_users': concurrent_users,
            'requests_per_user': requests_per_user,
            'total_requests': total_requests,
            'total_time_seconds': total_time,
            'requests_per_second': total_requests / total_time,
            'success_rate': success_count / total_requests if total_requests > 0 else 0,
            'avg_response_time': statistics.mean(response_times) if response_times else 0,
            # quantiles(n=20) yields 19 cut points; index 18 is the 95th percentile.
            'p95_response_time': statistics.quantiles(response_times, n=20)[18] if len(response_times) >= 20 else 0,
            'timestamp': datetime.now().isoformat()
        }
        self.results.append(stats)
        return stats

    async def send_order_cancel_request(self, session: "aiohttp.ClientSession",
                                        user_id: int, req_num: int) -> dict:
        """Send one order-cancel delay request and time it.

        Returns:
            Dict with ``success``, ``response_time`` and either
            ``order_id`` or ``error``. Never raises; failures are
            reported in the result dict.
        """
        request_start = time.time()
        try:
            # Separators prevent ID collisions (e.g. user 1/req 11 vs
            # user 11/req 1 would otherwise produce the same prefix).
            order_id = f"PERF_ORDER_{user_id}_{req_num}_{int(time.time())}"
            payload = {
                'scene': 'order_cancel',
                'business_id': order_id,
                'payload': {
                    'order_type': 'normal',
                    'amount': 100 + (req_num % 100),
                    'user_id': f'user_{user_id}'
                },
                'delay_seconds': 300,  # 5 minutes
                'strategy': 'plugin'
            }
            # Send the actual HTTP request to the gateway.
            async with session.post(f"{self.gateway_url}/api/delay",
                                    json=payload) as response:
                result = await response.json()
            # Simulate extra request latency.
            await asyncio.sleep(0.05 + (req_num % 10) * 0.01)
            # Simulate a 90% success rate (the real response is ignored
            # here; this keeps the test deterministic in aggregate).
            import random
            success = random.random() > 0.1
            response_time = time.time() - request_start
            return {
                'success': success,
                'order_id': order_id,
                'response_time': response_time
            }
        except Exception as e:
            response_time = time.time() - request_start
            return {
                'success': False,
                'error': str(e),
                'response_time': response_time
            }

    async def run_scenario_test(self) -> None:
        """Run all load scenarios in order, printing stats after each."""
        scenarios = [
            {'concurrent': 10, 'requests': 100},    # low load
            {'concurrent': 50, 'requests': 200},    # medium load
            {'concurrent': 100, 'requests': 500},   # high load
            {'concurrent': 200, 'requests': 1000},  # stress test
        ]
        print("开始性能测试...")
        print("=" * 80)
        for scenario in scenarios:
            stats = await self.test_order_cancel_scene(
                scenario['concurrent'],
                scenario['requests']
            )
            self.print_stats(stats)
            print("-" * 80)
            # Cool-down pause between scenarios.
            await asyncio.sleep(10)
        # Final summary across all scenarios.
        self.print_summary_report()

    def print_stats(self, stats: dict) -> None:
        """Print one scenario's statistics in human-readable form."""
        print(f"场景: {stats['scene']}")
        print(f"并发用户: {stats['concurrent_users']}")
        print(f"总请求数: {stats['total_requests']}")
        print(f"总时间: {stats['total_time_seconds']:.2f}秒")
        print(f"吞吐量: {stats['requests_per_second']:.2f} 请求/秒")
        print(f"成功率: {stats['success_rate']*100:.2f}%")
        print(f"平均响应时间: {stats['avg_response_time']*1000:.2f}毫秒")
        print(f"P95响应时间: {stats['p95_response_time']*1000:.2f}毫秒")

    def print_summary_report(self) -> None:
        """Print a summary comparing the best and worst scenarios."""
        print("\n" + "=" * 80)
        print("性能测试总结报告")
        print("=" * 80)
        if not self.results:
            print("没有测试结果")
            return
        best_scenario = max(self.results, key=lambda x: x['requests_per_second'])
        worst_scenario = min(self.results, key=lambda x: x['requests_per_second'])
        print(f"\n最佳性能场景:")
        print(f"  并发用户: {best_scenario['concurrent_users']}")
        print(f"  吞吐量: {best_scenario['requests_per_second']:.2f} 请求/秒")
        print(f"\n最差性能场景:")
        print(f"  并发用户: {worst_scenario['concurrent_users']}")
        print(f"  吞吐量: {worst_scenario['requests_per_second']:.2f} 请求/秒")
        print(f"\n建议:")
        if best_scenario['success_rate'] < 0.99:
            print("  ⚠️ 成功率需要优化,目标 ≥99.9%")
        if worst_scenario['avg_response_time'] > 1.0:
            print("  ⚠️ 高并发下响应时间过长,需要优化")
        if best_scenario['requests_per_second'] < 100:
            print("  ⚠️ 吞吐量较低,考虑水平扩展")
        print(f"\n测试完成时间: {datetime.now().isoformat()}")
async def main() -> None:
    """Entry point: build the tester and run every load scenario."""
    # Test parameters.
    gateway_url = "http://localhost:8080"  # delay-gateway address
    # Create the tester instance.
    tester = DelayQueuePerformanceTest(gateway_url)
    # Run all scenarios (prints stats and a summary report).
    await tester.run_scenario_test()
# Script entry guard: `name`/"main" would raise NameError — the dunder
# form is required for the guard to work when run as a script.
if __name__ == "__main__":
    asyncio.run(main())
关键成功要素
- 业务场景适配:为不同电商场景选择最优延迟方案
- 精度与可靠性的平衡:高精度场景用插件,长延迟场景用TTL
- 完善的监控体系:实时监控延迟精度、积压情况和成功率
- 自动故障处理:实现幂等性、重试机制和死信队列处理
- 性能优化:批量处理、异步操作、连接池优化
部署建议
- 分阶段实施:先核心场景(订单取消),后扩展场景
- 灰度发布:先小流量测试,逐步扩大范围
- 容量规划:根据业务量预估资源需求
- 灾备准备:建立完整的备份和恢复方案
后续优化方向
- 智能调度:基于实时负载动态调整延迟策略
- 预测分析:利用历史数据预测延迟需求峰值
- 成本优化:根据业务重要性分级存储和处理
- 生态集成:与微服务治理、配置中心等系统深度集成