一、核心设计理念
1.1 追踪目标
text
复制
下载
四大追踪维度:
1. 生产轨迹:消息从哪个应用、哪个机器、什么时间发送
2. 存储轨迹:消息在Broker的存储状态、投递时间
3. 消费轨迹:消息被哪个消费者、何时消费、消费结果
4. 事务轨迹:事务消息的状态变化(提交/回滚)
1.2 架构设计
java
复制
下载
/**
* Layered architecture of the message-trace subsystem.
* Documentation-only holder: neither this class nor the nested Architecture class
* declares any members; the three layers are described in the comments below.
*/
public class MessageTraceArchitecture {
// Three-layer architecture
class Architecture {
/**
* Layer 1: data collection (Producer/Consumer/Broker)
* - Instrumentation: tracing code is planted on the critical paths
* - Asynchronous sending: the main business flow is not blocked
* - Local buffering: records are sent in batches for performance
*/
/**
* Layer 2: data transport (TraceDispatcher)
* - In-memory queue: decouples collection from sending
* - Retry on failure: protects data reliability
* - Flow control: guards against data spikes
*/
/**
* Layer 3: data storage (RocketMQ itself or an external store)
* - Dedicated topic: _TRACE_TOPIC
* - External stores: MySQL, Elasticsearch, HBase
* - Index building: supports fast lookup
*/
}
}
二、核心实现源码解析
2.1 追踪数据模型
java
复制
下载
/**
* Trace data model: the record types produced by the producer, consumer and broker
* instrumentation.
*/
public class TraceBean {
// Basic trace information shared by every span
class TraceContext {
private String traceId; // globally unique trace ID
private String spanId; // ID of the current span
private String parentSpanId; // ID of the parent span
private long timestamp; // timestamp
private String region; // region
private String cell; // cell (deployment unit)
}
// Producer-side trace data
// NOTE(review): ProducerTraceInterceptor calls setTraceId/setSpanId on this type, but no
// such fields are declared here — confirm where they are meant to come from.
class ProducerTraceData {
private String producerGroup; // producer group
private String topic; // topic
private String msgId; // message ID
private String originMsgId; // original message ID
private String tags; // tags
private String keys; // business keys
private String storeHost; // address of the broker that stored the message
private String clientHost; // client address
private long storeTime; // store time
private int retryTimes; // number of retries
private int bodyLength; // message body length
private Map<String, String> properties; // extension properties
}
// Consumer-side trace data
// NOTE(review): ConsumerTraceInterceptor calls setTraceId/setSpanId/setParentSpanId on this
// type, but no such fields are declared here — confirm where they are meant to come from.
class ConsumerTraceData {
private String consumerGroup; // consumer group
private String topic; // topic
private String msgId; // message ID
private String originMsgId; // original message ID
private String storeHost; // broker address
private String clientHost; // client address
private long storeTime; // store time
private long pickupTime; // time the message was pulled
private long consumeStartTime; // consumption start time
private long consumeEndTime; // consumption end time
private ConsumeStatus status; // consumption status
private String failureReason; // failure reason
private Map<String, String> properties; // extension properties
}
// Consumption status enum
enum ConsumeStatus {
CONSUME_SUCCESS, // consumed successfully
CONSUME_FAILURE, // consumption failed
CONSUME_RETRY, // consumption will be retried
CONSUME_PENDING, // consumption pending
CONSUME_NOT_FOUND // message not found
}
// Broker-side storage trace data
class BrokerTraceData {
private String brokerName; // broker name
private String brokerAddr; // broker address
private String topic; // topic
private String msgId; // message ID
// NOTE(review): declared long here, while BrokerTraceProcessor.traceMessageDispatch
// receives the queue ID as an int — confirm the intended type.
private long queueId; // queue ID
private long queueOffset; // queue offset
private long storeTimestamp; // store timestamp
private long storeSize; // stored size
private int retryTimes; // number of retries
private boolean isTrans; // whether this is a transactional message
private String transactionState; // transaction state
}
}
2.2 数据采集埋点实现
java
复制
下载
/**
 * Producer-side trace hook: emits one trace record before a message is sent and one after.
 *
 * <p>The trace context created in {@link #executeBeforeSend} is handed to
 * {@link #executeAfterSend} through a {@code ThreadLocal}, so both callbacks are expected to
 * run on the same thread for a given send.
 */
public class ProducerTraceInterceptor implements SendMessageHook {
    private final ThreadLocal<TraceContext> traceContext = new ThreadLocal<>();

    /**
     * Creates the trace context, stamps it onto the message as user properties and records the
     * "before send" trace point.
     *
     * @param msg         message about to be sent; receives TRACE_ID / SPAN_ID / PARENT_SPAN_ID
     * @param mqClientAPI client API used to look up the producer group
     * @return always {@code null}; the hook never alters the send result
     */
    @Override
    public SendResult executeBeforeSend(
            final Message msg,
            final MQClientAPIImpl mqClientAPI) {
        // 1. Create the trace context for this send.
        TraceContext context = new TraceContext();
        context.setTraceId(generateTraceId());
        context.setSpanId(generateSpanId());
        context.setTimestamp(System.currentTimeMillis());

        // 2. Propagate the trace identifiers to the consumer through message properties.
        String parentSpanId = context.getParentSpanId();
        msg.putUserProperty("TRACE_ID", context.getTraceId());
        msg.putUserProperty("SPAN_ID", context.getSpanId());
        msg.putUserProperty("PARENT_SPAN_ID", parentSpanId != null ? parentSpanId : "");

        // 3. Describe the send. The trace/span IDs are set here as well so that this record
        //    can be correlated with the "after send" record.
        ProducerTraceData traceData = new ProducerTraceData();
        traceData.setTraceId(context.getTraceId());
        traceData.setSpanId(context.getSpanId());
        traceData.setProducerGroup(mqClientAPI.getProducerGroup());
        traceData.setTopic(msg.getTopic());
        traceData.setMsgId(msg.getMsgId());
        traceData.setKeys(msg.getKeys());
        traceData.setTags(msg.getTags());
        traceData.setClientHost(getLocalAddress());
        byte[] body = msg.getBody();
        traceData.setBodyLength(body != null ? body.length : 0);

        // 4. Keep the context for executeAfterSend on the same thread.
        traceContext.set(context);

        // 5. Hand the record over for asynchronous delivery.
        asyncSendTraceData(traceData, TraceType.PRODUCER_SEND_BEFORE);
        return null; // the send result is not modified
    }

    /**
     * Records the "after send" trace point and always clears the thread-local context.
     *
     * @param msg         message that was sent
     * @param sendResult  result of the send; may be {@code null}, in which case no store host
     *                    is recorded
     * @param mqClientAPI client API (unused)
     */
    @Override
    public void executeAfterSend(
            final Message msg,
            final SendResult sendResult,
            final MQClientAPIImpl mqClientAPI) {
        TraceContext context = traceContext.get();
        if (context == null) {
            return;
        }
        try {
            ProducerTraceData traceData = new ProducerTraceData();
            traceData.setTraceId(context.getTraceId());
            traceData.setSpanId(context.getSpanId());
            traceData.setMsgId(msg.getMsgId());
            if (sendResult != null && sendResult.getMessageQueue() != null) {
                traceData.setStoreHost(sendResult.getMessageQueue().getBrokerName());
            }
            // Client-side clock at the moment the send returned.
            traceData.setStoreTime(System.currentTimeMillis());
            traceData.setRetryTimes(msg.getReconsumeTimes());
            asyncSendTraceData(traceData, TraceType.PRODUCER_SEND_AFTER);
        } finally {
            // Clear unconditionally so a failure above cannot leak the context into the next
            // send performed by this (possibly pooled) thread.
            traceContext.remove();
        }
    }

    /**
     * Submits a task that builds the trace message and sends it to the trace topic without
     * blocking the business thread. Send failures are logged, counted and stored locally.
     *
     * @param traceData trace record to deliver
     * @param traceType kind of trace point the record represents
     */
    private void asyncSendTraceData(
            final TraceData traceData,
            final TraceType traceType) {
        TraceDispatcher.getInstance().submit(new Runnable() {
            @Override
            public void run() {
                try {
                    // 1. Build the trace message.
                    Message traceMsg = buildTraceMessage(traceData, traceType);
                    // 2. Send it to the trace topic.
                    TraceProducer producer = TraceProducerManager.getInstance()
                            .getOrCreateProducer();
                    producer.send(traceMsg, new SendCallback() {
                        @Override
                        public void onSuccess(SendResult sendResult) {
                            Metrics.counter("trace_send_success").increment();
                        }

                        @Override
                        public void onException(Throwable e) {
                            // Tracing must never affect the main flow: log, count, keep a copy.
                            log.warn("发送追踪数据失败", e);
                            Metrics.counter("trace_send_failure").increment();
                            storeTraceDataLocally(traceData, traceType);
                        }
                    });
                } catch (Exception e) {
                    log.error("构建追踪消息异常", e);
                }
            }
        });
    }
}
/**
 * Consumer-side trace hook: emits one trace record per traced message before consumption and
 * one after it.
 *
 * <p>A consume call may carry several messages, each with its own trace. The per-message
 * contexts are therefore kept in a map keyed by message ID and stored on the consume context,
 * instead of a single context that every iteration would overwrite.
 */
public class ConsumerTraceInterceptor implements ConsumeMessageHook {

    /**
     * Opens a child span for every message that carries a TRACE_ID property and records the
     * "before consume" trace point. Messages without a trace ID are skipped.
     *
     * @param msgs    messages about to be consumed
     * @param context consume context; receives the map of per-message trace contexts
     */
    @Override
    public void consumeMessageBefore(
            final List<MessageExt> msgs,
            final ConsumeMessageContext context) {
        Map<String, TraceContext> contextsByMsgId = new HashMap<>();
        for (MessageExt msg : msgs) {
            // 1. Read the trace identifiers written by the producer.
            String traceId = msg.getUserProperty("TRACE_ID");
            String parentSpanId = msg.getUserProperty("SPAN_ID");
            if (traceId == null || traceId.isEmpty()) {
                // Tracing was not enabled for this message.
                continue;
            }

            // 2. Open a child span under the producer's span.
            TraceContext traceContext = new TraceContext();
            traceContext.setTraceId(traceId);
            traceContext.setParentSpanId(parentSpanId);
            traceContext.setSpanId(generateSpanId());
            traceContext.setTimestamp(System.currentTimeMillis());

            // 3. Describe the start of consumption.
            ConsumerTraceData traceData = new ConsumerTraceData();
            traceData.setTraceId(traceId);
            traceData.setSpanId(traceContext.getSpanId());
            traceData.setParentSpanId(parentSpanId);
            traceData.setMsgId(msg.getMsgId());
            traceData.setOriginMsgId(msg.getOriginMsgId());
            traceData.setConsumerGroup(context.getConsumerGroup());
            traceData.setTopic(msg.getTopic());
            traceData.setStoreHost(msg.getStoreHost());
            traceData.setPickupTime(System.currentTimeMillis());

            // 4. Remember the span for consumeMessageAfter.
            contextsByMsgId.put(msg.getMsgId(), traceContext);

            // 5. Hand the record over for asynchronous delivery.
            asyncSendTraceData(traceData, TraceType.CONSUME_BEFORE);
        }
        if (!contextsByMsgId.isEmpty()) {
            context.setMqTraceContext(contextsByMsgId);
        }
    }

    /**
     * Records the consumption result for every message that was traced in
     * {@link #consumeMessageBefore}.
     *
     * @param msgs    messages that were consumed
     * @param context consume context holding the per-message trace contexts
     * @param status  status returned by the listener; may be {@code null}
     */
    @Override
    public void consumeMessageAfter(
            final List<MessageExt> msgs,
            final ConsumeMessageContext context,
            final ConsumeConcurrentlyStatus status) {
        Object stored = context.getMqTraceContext();
        if (!(stored instanceof Map)) {
            return;
        }
        @SuppressWarnings("unchecked") // only consumeMessageBefore stores a map here
        Map<String, TraceContext> contextsByMsgId = (Map<String, TraceContext>) stored;

        for (MessageExt msg : msgs) {
            TraceContext traceContext = contextsByMsgId.get(msg.getMsgId());
            if (traceContext == null) {
                // This message was not traced; do not attribute another message's span to it.
                continue;
            }
            ConsumerTraceData traceData = new ConsumerTraceData();
            traceData.setTraceId(traceContext.getTraceId());
            traceData.setSpanId(traceContext.getSpanId());
            traceData.setMsgId(msg.getMsgId());
            traceData.setConsumeStartTime(context.getConsumeStartTime());
            traceData.setConsumeEndTime(System.currentTimeMillis());
            traceData.setStatus(convertToConsumeStatus(status));
            if (status == ConsumeConcurrentlyStatus.RECONSUME_LATER) {
                traceData.setFailureReason("消费失败,需要重试");
            }
            asyncSendTraceData(traceData, TraceType.CONSUME_AFTER);
        }
    }

    /**
     * Maps the listener status to the trace model's status.
     *
     * @param status listener status, possibly {@code null}
     * @return CONSUME_SUCCESS or CONSUME_RETRY for the matching statuses, CONSUME_PENDING for
     *         {@code null} and any other value
     */
    private ConsumeStatus convertToConsumeStatus(
            ConsumeConcurrentlyStatus status) {
        if (status == null) {
            // switch on a null enum would throw; treat it like any unrecognised status.
            return ConsumeStatus.CONSUME_PENDING;
        }
        switch (status) {
            case CONSUME_SUCCESS:
                return ConsumeStatus.CONSUME_SUCCESS;
            case RECONSUME_LATER:
                return ConsumeStatus.CONSUME_RETRY;
            default:
                return ConsumeStatus.CONSUME_PENDING;
        }
    }
}
/**
 * Broker-side trace processor: records a trace point when a message is stored and when it is
 * dispatched to a consumer group.
 */
public class BrokerTraceProcessor {

    /**
     * Records the storage of a message. Does nothing when tracing is disabled for the topic.
     *
     * @param msg    message that was written
     * @param result outcome of the write; offset, timestamp and size are recorded only when it
     *               carries an append result
     */
    public void traceMessageStore(
            final MessageExtBrokerInner msg,
            final PutMessageResult result) {
        if (!isTraceEnabled(msg.getTopic())) {
            return;
        }
        BrokerTraceData traceData = new BrokerTraceData();
        traceData.setBrokerName(brokerController.getBrokerConfig().getBrokerName());
        traceData.setBrokerAddr(brokerController.getBrokerAddr());
        traceData.setTopic(msg.getTopic());
        traceData.setMsgId(msg.getMsgId());
        traceData.setQueueId(msg.getQueueId());
        if (result != null && result.getAppendMessageResult() != null) {
            traceData.setQueueOffset(
                    result.getAppendMessageResult().getLogicsOffset());
            traceData.setStoreTimestamp(
                    result.getAppendMessageResult().getStoreTimestamp());
            // NOTE(review): this is the body length, not the number of bytes appended to the
            // store — confirm which one "store size" is meant to report.
            byte[] body = msg.getBody();
            traceData.setStoreSize(body != null ? body.length : 0);
        }
        asyncSendTraceData(traceData, TraceType.BROKER_STORE);
    }

    /**
     * Records the dispatch of a message to a consumer. Like {@link #traceMessageStore}, it is
     * a no-op when tracing is disabled for the topic, so topics excluded from tracing do not
     * produce dispatch records either.
     *
     * @param consumerGroup consumer group the message is dispatched to (currently not recorded)
     * @param topic         topic of the message
     * @param queueId       queue the message is read from
     * @param queueOffset   offset of the message within the queue
     * @param sysFlag       system flag bits; the retry bit is recorded as retry count 1 or 0
     */
    public void traceMessageDispatch(
            final String consumerGroup,
            final String topic,
            final int queueId,
            final long queueOffset,
            final int sysFlag) {
        if (!isTraceEnabled(topic)) {
            return;
        }
        BrokerTraceData traceData = new BrokerTraceData();
        traceData.setBrokerName(brokerController.getBrokerConfig().getBrokerName());
        traceData.setBrokerAddr(brokerController.getBrokerAddr());
        traceData.setTopic(topic);
        traceData.setQueueId(queueId);
        traceData.setQueueOffset(queueOffset);
        boolean retry = (sysFlag & MessageSysFlag.FLAG_RETRY) != 0;
        traceData.setRetryTimes(retry ? 1 : 0);
        asyncSendTraceData(traceData, TraceType.BROKER_DISPATCH);
    }
}
篇幅限制下面就只能给大家展示小册部分内容了。整理了一份核心面试笔记包括了:Java面试、Spring、JVM、MyBatis、Redis、MySQL、并发编程、微服务、Linux、Springboot、SpringCloud、MQ、Kafka 等。
需要全套面试笔记及答案
【点击此处即可/免费获取】
2.3 数据传输层实现
java
复制
下载
/**
* 追踪数据分发器(核心)
*/
public class TraceDispatcher implements ServiceThread {
// 内存队列(无界队列,但有最大容量限制)
private final BlockingQueue<TraceTask> traceQueue =
new LinkedBlockingQueue<>(10000);
// 批量发送配置
private final int batchSize = 100; // 每批发送数量
private final long batchInterval = 1000; // 批量发送间隔(ms)
private final int maxRetryTimes = 3; // 最大重试次数
// 数据缓冲区
private final List<TraceData> buffer = new ArrayList<>(batchSize);
private long lastSendTime = System.currentTimeMillis();
@Override
public void run() {
log.info("TraceDispatcher started");
while (!this.isStopped()) {
try {
// 1. 从队列获取追踪任务(带超时)
TraceTask task = traceQueue.poll(100, TimeUnit.MILLISECONDS);
if (task != null) {
buffer.add(task.getTraceData());
}
// 2. 检查是否满足发送条件
boolean needSend = buffer.size() >= batchSize ||
(System.currentTimeMillis() - lastSendTime >= batchInterval);
if (needSend && !buffer.isEmpty()) {
// 3. 批量发送
sendBatchData(new ArrayList<>(buffer));
// 4. 清空缓冲区
buffer.clear();
lastSendTime = System.currentTimeMillis();
}
} catch (InterruptedException e) {
log.warn("TraceDispatcher interrupted");
} catch (Exception e) {
log.error("TraceDispatcher error", e);
}
}
}
/**
* 提交追踪任务(线程安全)
*/
public boolean submit(TraceTask task) {
if (traceQueue.size() >= 10000) {
// 队列满了,丢弃数据并记录告警
Metrics.counter("trace_queue_full").increment();
log.warn("Trace queue is full, discard trace data");
return false;
}
try {
return traceQueue.offer(task, 10, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return false;
}
}
/**
* 批量发送追踪数据
*/
private void sendBatchData(List<TraceData> batchData) {
// 1. 序列化批量数据
byte[] batchBytes = serializeBatchData(batchData);
// 2. 构建追踪消息
Message traceMsg = new Message();
traceMsg.setTopic(TraceConstants.TRACE_TOPIC);
traceMsg.setBody(batchBytes);
traceMsg.setTags(TraceConstants.TRACE_TAG);
// 3. 设置批量属性
traceMsg.putUserProperty("BATCH_SIZE",
String.valueOf(batchData.size()));
traceMsg.putUserProperty("BATCH_TIME",
String.valueOf(System.currentTimeMillis()));
// 4. 发送(带重试机制)
sendWithRetry(traceMsg, 0);
}
/**
* 带重试机制的发送
*/
private void sendWithRetry(Message msg, int retryCount) {
try {
SendResult sendResult = traceProducer.send(msg);
if (sendResult.getSendStatus() == SendStatus.SEND_OK) {
Metrics.counter("trace_batch_send_success").increment();
Metrics.histogram("trace_batch_size").update(msg.getBody().length);
} else {
// 发送失败,重试
if (retryCount < maxRetryTimes) {
log.warn("Trace batch send failed, retry {}/{}",
retryCount + 1, maxRetryTimes);
Thread.sleep(100 * (retryCount + 1)); // 指数退避
sendWithRetry(msg, retryCount + 1);
} else {
Metrics.counter("trace_batch_send_failure").increment();
// 最终失败,存储到本地文件
storeToLocalFile(msg);
}
}
} catch (Exception e) {
log.error("Send trace batch error", e);
Metrics.counter("trace_batch_send_exception").increment();
}
}
/**
* 本地文件存储(兜底方案)
*/
private void storeToLocalFile(Message msg) {
String fileName = "trace_failed_" +
System.currentTimeMillis() + ".dat";
File file = new File(traceConfig.getLocalStorePath(), fileName);
try (FileOutputStream fos = new FileOutputStream(file);
BufferedOutputStream bos = new BufferedOutputStream(fos)) {
bos.write(msg.getBody());
bos.flush();
log.info("Store trace data to local file: {}", file.getAbsolutePath());
} catch (IOException e) {
log.error("Store trace data to local file failed", e);
}
}
}
2.4 存储与查询实现
java
复制
下载
/**
 * Trace storage service: a thin facade over the storage backend selected at construction time.
 */
public class TraceStoreService {
    // Backend chosen in the constructor; never reassigned afterwards.
    private final TraceStorage storage;

    /**
     * Selects the storage backend.
     *
     * @param storageType backend to use; values without a dedicated case fall back to
     *                    {@code DefaultTraceStorage}
     */
    public TraceStoreService(StorageType storageType) {
        switch (storageType) {
            case ROCKETMQ:
                this.storage = new RocketMQTraceStorage();
                break;
            case MYSQL:
                this.storage = new MySQLTraceStorage();
                break;
            case ELASTICSEARCH:
                this.storage = new ElasticsearchTraceStorage();
                break;
            case HBASE:
                this.storage = new HBaseTraceStorage();
                break;
            default:
                this.storage = new DefaultTraceStorage();
                break;
        }
    }

    /**
     * Stores one trace record in the backend.
     *
     * @param traceData record to store
     */
    public void store(TraceData traceData) {
        storage.store(traceData);
    }

    /**
     * Looks up the trace of a message.
     *
     * <p>The message ID is tried first; when it resolves to a trace ID the complete trace is
     * returned. When the lookup yields nothing, the topic and time range are used instead.
     *
     * @param msgId     message ID to look up
     * @param topic     topic used by the fallback query
     * @param startTime start of the fallback time range
     * @param endTime   end of the fallback time range
     * @return the trace nodes found by whichever query applied
     */
    public List<TraceNode> queryTrace(String msgId,
                                      String topic,
                                      long startTime,
                                      long endTime) {
        // 1. Look up by message ID; a backend may report a miss as null or as an empty list.
        List<TraceNode> traceNodes = storage.queryByMsgId(msgId);
        if (traceNodes != null && !traceNodes.isEmpty()) {
            // 2. Resolve the trace ID and fetch the complete trace.
            String traceId = traceNodes.get(0).getTraceId();
            if (traceId == null) {
                // No trace ID to expand; the nodes found for the message are the best answer.
                return traceNodes;
            }
            return storage.queryByTraceId(traceId);
        }
        // 3. Fallback: query by topic and time range.
        return storage.queryByTopicAndTime(topic, startTime, endTime);
    }

    /**
     * Aggregates produce, consume and error statistics for a topic over a time range.
     *
     * @param topic         topic to aggregate
     * @param consumerGroup consumer group; consume statistics are skipped when {@code null}
     * @param startTime     start of the time range
     * @param endTime       end of the time range
     * @return the populated statistics object
     */
    public TraceStatistics getStatistics(String topic,
                                         String consumerGroup,
                                         long startTime,
                                         long endTime) {
        TraceStatistics stats = new TraceStatistics();
        // Produce statistics
        stats.setProduceCount(
                storage.countProduce(topic, startTime, endTime));
        stats.setProduceSuccessRate(
                storage.calcProduceSuccessRate(topic, startTime, endTime));
        stats.setAvgProduceLatency(
                storage.calcAvgProduceLatency(topic, startTime, endTime));
        // Consume statistics
        if (consumerGroup != null) {
            stats.setConsumeCount(
                    storage.countConsume(topic, consumerGroup, startTime, endTime));
            stats.setConsumeSuccessRate(
                    storage.calcConsumeSuccessRate(topic, consumerGroup, startTime, endTime));
            stats.setAvgConsumeLatency(
                    storage.calcAvgConsumeLatency(topic, consumerGroup, startTime, endTime));
        }
        // Error statistics (the consumer group may be null here)
        stats.setErrorCount(
                storage.countError(topic, consumerGroup, startTime, endTime));
        stats.setTopErrors(
                storage.getTopErrors(topic, consumerGroup, startTime, endTime, 10));
        return stats;
    }
}
/**
 * Trace storage backed by RocketMQ itself: traces are read back from the trace topic.
 *
 * <p>NOTE(review): the pull consumer is created and configured here but never started in the
 * visible code — confirm that it is started before the first query.
 * <p>NOTE(review): TraceDispatcher writes one message per batch of records, while the queries
 * below deserialize one record per message — confirm the wire format.
 */
public class RocketMQTraceStorage implements TraceStorage {
    private final DefaultMQPullConsumer traceConsumer;

    // Completed query results keyed by trace ID. Entries are never evicted.
    private final Map<String/* traceId */, List<TraceNode>> cache =
            new ConcurrentHashMap<>();

    /** Creates the pull consumer used for queries and registers a queue-change listener. */
    public RocketMQTraceStorage() {
        traceConsumer = new DefaultMQPullConsumer("TraceQueryConsumer");
        traceConsumer.setNamesrvAddr(nameServerAddr);
        traceConsumer.registerMessageQueueListener(
                TraceConstants.TRACE_TOPIC, new MessageQueueListener() {
                    @Override
                    public void messageQueueChanged(String topic,
                                                    Set<MessageQueue> mqAll,
                                                    Set<MessageQueue> mqDivided) {
                        // Queue changes are not handled yet.
                    }
                });
    }

    /**
     * Returns all nodes of a trace, ordered by timestamp.
     *
     * <p>A cached result is returned when present. Otherwise every queue of the trace topic is
     * scanned from its minimum to its maximum offset. Only a complete, non-empty scan result
     * is cached, and only for the requested trace ID, so a cache entry is never partial,
     * unsorted or duplicated by a later scan.
     *
     * @param traceId trace ID to look up
     * @return a new list of matching nodes; empty when nothing matched, the ID was
     *         {@code null} or the scan failed
     */
    @Override
    public List<TraceNode> queryByTraceId(String traceId) {
        if (traceId == null) {
            return new ArrayList<>();
        }
        // 1. Serve from the cache, handing out a copy so callers cannot modify the entry.
        List<TraceNode> cached = cache.get(traceId);
        if (cached != null) {
            return new ArrayList<>(cached);
        }

        // 2. Scan the trace topic.
        List<TraceNode> traceNodes = new ArrayList<>();
        boolean scanCompleted = false;
        try {
            Set<MessageQueue> mqs = traceConsumer.fetchSubscribeMessageQueues(
                    TraceConstants.TRACE_TOPIC);
            for (MessageQueue mq : mqs) {
                long offset = getMinOffset(mq);
                long maxOffset = getMaxOffset(mq);
                while (offset < maxOffset) {
                    PullResult pullResult = traceConsumer.pull(
                            mq, "*", offset, 32); // pull up to 32 messages per call
                    if (pullResult.getPullStatus() != PullStatus.FOUND) {
                        break;
                    }
                    for (MessageExt msg : pullResult.getMsgFoundList()) {
                        TraceData traceData = deserialize(msg.getBody());
                        if (traceData != null && traceId.equals(traceData.getTraceId())) {
                            traceNodes.add(convertToTraceNode(traceData));
                        }
                    }
                    long nextOffset = pullResult.getNextBeginOffset();
                    if (nextOffset <= offset) {
                        // No progress; stop instead of pulling the same range forever.
                        break;
                    }
                    offset = nextOffset;
                }
            }
            scanCompleted = true;
        } catch (Exception e) {
            log.error("Query trace by traceId failed", e);
        }

        // 3. Order by time and remember complete results.
        traceNodes.sort(Comparator.comparingLong(TraceNode::getTimestamp));
        if (scanCompleted && !traceNodes.isEmpty()) {
            cache.put(traceId, new ArrayList<>(traceNodes));
        }
        return traceNodes;
    }

    /**
     * Finds the trace node recorded for a message ID through the index file.
     *
     * <p>Returns a list, matching how TraceStoreService consumes this query, and never
     * {@code null}.
     *
     * @param msgId message ID to look up
     * @return a list holding the matching node, or an empty list when there is none
     */
    @Override
    public List<TraceNode> queryByMsgId(String msgId) {
        List<TraceNode> nodes = new ArrayList<>();
        if (msgId == null) {
            return nodes;
        }
        IndexFile indexFile = getIndexFile();
        List<Long> phyOffsets = indexFile.query(msgId, 0, Long.MAX_VALUE, 1);
        if (phyOffsets != null && !phyOffsets.isEmpty()) {
            MessageExt msg = queryMessageByOffset(phyOffsets.get(0));
            if (msg != null) {
                TraceData traceData = deserialize(msg.getBody());
                nodes.add(convertToTraceNode(traceData));
            }
        }
        return nodes;
    }
}
/**
* Trace node (used for front-end display).
* Nodes can be nested through the children list to form a tree.
*/
public class TraceNode {
private String traceId; // trace ID
private String spanId; // span ID
private String parentSpanId; // parent span ID
private String nodeType; // node type: PRODUCER/BROKER/CONSUMER
private String nodeName; // node name
private String host; // host address
private long timestamp; // timestamp
private long duration; // elapsed time
private Map<String, String> tags; // tags
private String status; // status
private String errorMsg; // error message
// Parent/child hierarchy; starts out as an empty list
private List<TraceNode> children = new ArrayList<>();
}
三、可视化与查询接口
3.1 REST API设计
java
复制
下载
/**
* 轨迹查询REST接口
*/
@RestController
@RequestMapping("/api/trace")
public class TraceController {
@Autowired
private TraceQueryService traceQueryService;
/**
* 根据消息ID查询轨迹
*/
@GetMapping("/byMsgId/{msgId}")
public Response<TraceView> queryByMsgId(@PathVariable String msgId) {
try {
List<TraceNode> traceNodes = traceQueryService.queryByMsgId(msgId);
TraceView traceView = buildTraceView(traceNodes);
return Response.success(traceView);
} catch (Exception e) {
log.error("Query trace by msgId failed", e);
return Response.error("查询失败: " + e.getMessage());
}
}
/**
* 根据时间范围查询轨迹
*/
@GetMapping("/byTimeRange")
public Response<List<TraceView>> queryByTimeRange(
@RequestParam String topic,
@RequestParam long startTime,
@RequestParam long endTime,
@RequestParam(defaultValue = "100") int limit) {
try {
List<TraceView> traceViews = traceQueryService
.queryByTimeRange(topic, startTime, endTime, limit);
return Response.success(traceViews);
} catch (Exception e) {
log.error("Query trace by time range failed", e);
return Response.error("查询失败: " + e.getMessage());
}
}
/**
* 获取轨迹统计
*/
@GetMapping("/statistics")
public Response<TraceStatistics> getStatistics(
@RequestParam String topic,
@RequestParam(required = false) String consumerGroup,
@RequestParam long startTime,
@RequestParam long endTime) {
try {
TraceStatistics stats = traceQueryService
.getStatistics(topic, consumerGroup, startTime, endTime);
return Response.success(stats);
} catch (Exception e) {
log.error("Get trace statistics failed", e);
return Response.error("获取统计失败: " + e.getMessage());
}
}
/**
* 实时轨迹订阅(WebSocket)
*/
@GetMapping("/subscribe/{topic}")
public SseEmitter subscribe(@PathVariable String topic) {
SseEmitter emitter = new SseEmitter(0L);
traceQueryService.subscribe(topic, new TraceListener() {
@Override
public void onTrace(TraceNode traceNode) {
try {
emitter.send(traceNode);
} catch (IOException e) {
emitter.completeWithError(e);
}
}
@Override
public void onComplete() {
emitter.complete();
}
@Override
public void onError(Throwable t) {
emitter.completeWithError(t);
}
});
return emitter;
}
}
/**
* Trace view used for visualisation: one trace with its spans, both as a flat list and as a
* tree rooted at rootSpan.
*/
public class TraceView {
private String traceId; // trace ID
private String rootSpanId; // root span ID
private long startTime; // start time
private long endTime; // end time
private long duration; // total elapsed time
private String status; // overall status
private List<SpanView> spans; // all spans
private Map<String, Object> statistics; // statistics
// Tree representation
private SpanView rootSpan; // root node (for the front-end tree display)
// One span of the trace
public static class SpanView {
private String spanId; // span ID
private String parentSpanId; // parent span ID
private String operationName; // operation name
private String serviceName; // service name
private String host; // host
private long startTime; // start time
private long duration; // elapsed time
private Map<String, String> tags; // tags
private List<LogEntry> logs; // logs
private List<SpanView> children; // child spans
}
// One log entry attached to a span
public static class LogEntry {
private long timestamp; // timestamp
private Map<String, String> fields; // log fields
}
}
3.2 前端展示组件
javascript
复制
下载
// Vue.js component that renders one message trace: a summary header with a timeline,
// the span tree, and a detail panel for the span selected on the timeline.
Vue.component('message-trace', {
  template: `
<div class="trace-container">
<!-- Header summary -->
<div class="trace-header">
<div class="trace-summary">
<span>消息ID: {{ trace.traceId }}</span>
<span>状态: <span :class="statusClass">{{ trace.status }}</span></span>
<span>耗时: {{ formatDuration(trace.duration) }}</span>
</div>
<div class="trace-timeline">
<div class="timeline-bar" :style="timelineStyle"></div>
<div v-for="span in trace.spans"
:key="span.spanId"
class="timeline-span"
:style="getSpanStyle(span)"
:title="getSpanTooltip(span)"
@click="selectedSpan = span">
</div>
</div>
</div>
<!-- Span tree -->
<div class="trace-tree">
<trace-tree-node
v-for="span in rootSpans"
:key="span.spanId"
:node="span"
:depth="0">
</trace-tree-node>
</div>
<!-- Detail panel -->
<div class="trace-detail" v-if="selectedSpan">
<h3>跨度详情</h3>
<div class="detail-content">
<div><strong>操作:</strong> {{ selectedSpan.operationName }}</div>
<div><strong>服务:</strong> {{ selectedSpan.serviceName }}</div>
<div><strong>主机:</strong> {{ selectedSpan.host }}</div>
<div><strong>耗时:</strong> {{ formatDuration(selectedSpan.duration) }}</div>
<div><strong>时间:</strong> {{ formatTime(selectedSpan.startTime) }}</div>
<h4>标签</h4>
<div v-for="(value, key) in selectedSpan.tags" :key="key">
{{ key }}: {{ value }}
</div>
<h4>日志</h4>
<div v-for="(log, logIndex) in selectedSpan.logs" :key="logIndex">
[{{ formatTime(log.timestamp) }}]
<span v-for="(value, key) in log.fields" :key="key">
{{ key }}={{ value }}
</span>
</div>
</div>
</div>
</div>
`,
  props: ['trace'],
  data() {
    return {
      // Span shown in the detail panel; set by clicking a span on the timeline.
      selectedSpan: null
    };
  },
  computed: {
    // Spans without a parent span ID (missing or empty) are the roots of the tree.
    rootSpans() {
      return (this.trace.spans || []).filter(span => !span.parentSpanId);
    },
    timelineStyle() {
      return {
        width: '100%',
        position: 'relative'
      };
    },
    statusClass() {
      return {
        'status-success': this.trace.status === 'SUCCESS',
        'status-error': this.trace.status === 'ERROR',
        'status-warning': this.trace.status === 'WARNING'
      };
    }
  },
  methods: {
    // Positions a span on the timeline as a percentage of the whole trace duration.
    getSpanStyle(span) {
      // A zero or missing total duration would otherwise yield NaN/Infinity percentages.
      const totalDuration = this.trace.duration || 1;
      const startPercent = (span.startTime - this.trace.startTime) / totalDuration * 100;
      const widthPercent = span.duration / totalDuration * 100;
      return {
        left: `${startPercent}%`,
        // Keep very short spans visible.
        width: `${Math.max(widthPercent, 1)}%`,
        backgroundColor: this.getSpanColor(span)
      };
    },
    // Colour by service type; unknown or missing types fall back to grey.
    // NOTE(review): the server-side SpanView shown in this document has no serviceType
    // field — confirm that the payload actually carries it.
    getSpanColor(span) {
      const colors = {
        'PRODUCER': '#4CAF50',
        'BROKER': '#2196F3',
        'CONSUMER': '#FF9800'
      };
      return colors[span.serviceType] || '#9E9E9E';
    },
    getSpanTooltip(span) {
      return `${span.operationName}\n${span.serviceName}\n${this.formatDuration(span.duration)}`;
    },
    // Formats a millisecond duration as ms, s or min; non-numeric input renders as "-".
    formatDuration(duration) {
      if (typeof duration !== 'number' || !isFinite(duration)) {
        return '-';
      }
      if (duration < 1000) {
        return `${duration}ms`;
      } else if (duration < 60000) {
        return `${(duration / 1000).toFixed(2)}s`;
      } else {
        return `${(duration / 60000).toFixed(2)}min`;
      }
    },
    formatTime(timestamp) {
      return new Date(timestamp).toLocaleString();
    }
  }
});
四、生产环境最佳实践
4.1 性能优化配置
properties
复制
下载
# trace.properties
# Comments are kept on their own lines: in the .properties format a '#' that follows a
# value is read as part of that value, not as a comment.

# Basic settings
rocketmq.trace.enabled=true
rocketmq.trace.topic=RMQ_SYS_TRACE_TOPIC

# Performance tuning
rocketmq.trace.batch.size=100
rocketmq.trace.batch.interval=1000
rocketmq.trace.queue.size=10000
rocketmq.trace.max.retry.times=3

# Sampling control (reduces the impact on production)
# 10% sample rate
rocketmq.trace.sample.rate=0.1
# adaptive sampling
rocketmq.trace.sample.strategy=adaptive

# Storage
rocketmq.trace.storage.type=ROCKETMQ
rocketmq.trace.storage.retention.days=7
rocketmq.trace.storage.cleanup.enabled=true

# Monitoring and alerting
rocketmq.trace.monitor.enabled=true
# warn when the queue is 80% full
rocketmq.trace.queue.warning.threshold=0.8
# error-count threshold
rocketmq.trace.send.error.threshold=100
篇幅限制下面就只能给大家展示小册部分内容了。整理了一份核心面试笔记包括了:Java面试、Spring、JVM、MyBatis、Redis、MySQL、并发编程、微服务、Linux、Springboot、SpringCloud、MQ、Kafka 等。
需要全套面试笔记及答案
【点击此处即可/免费获取】
4.2 自适应采样策略
java
复制
下载
/**
 * Adaptive sampling strategy: the sample rate is derived from the current system load and
 * the trace queue backlog each time a sampling decision is made.
 */
public class AdaptiveSamplingStrategy implements SamplingStrategy {
    /** Queue backlog above which the sample rate is cut (80% of a 10000-entry queue). */
    private static final int QUEUE_PRESSURE_THRESHOLD = 8000;

    private double baseSampleRate = 0.1; // base sample rate: 10%
    private double maxSampleRate = 1.0;  // upper bound: 100%
    private double minSampleRate = 0.01; // lower bound: 1%

    // Source of the load and backlog metrics; may be unset.
    private SystemMetricsCollector metricsCollector;

    /**
     * Decides whether a message should be traced.
     *
     * @param topic topic of the message (not used by the decision)
     * @param msgId message ID (not used by the decision)
     * @return {@code true} with a probability equal to the current dynamic rate; the base
     *         rate is used when no metrics collector is available
     */
    @Override
    public boolean shouldSample(String topic, String msgId) {
        double dynamicRate = baseSampleRate;
        if (metricsCollector != null) {
            // 1. Read the current load and backlog.
            double systemLoad = metricsCollector.getSystemLoad();
            int queueSize = metricsCollector.getTraceQueueSize();
            // 2. Derive the rate from them.
            dynamicRate = calculateDynamicRate(systemLoad, queueSize);
        }
        // 3. Draw from a per-thread generator; this runs on the message hot path.
        return java.util.concurrent.ThreadLocalRandom.current().nextDouble() < dynamicRate;
    }

    /**
     * Computes the sample rate for the given load and backlog.
     *
     * @param systemLoad current system load; below 0.3 doubles the base rate, above 0.7
     *                   halves it
     * @param queueSize  current trace queue size; above the pressure threshold the rate is
     *                   divided by four
     * @return the resulting rate, bounded by the configured minimum and maximum
     */
    private double calculateDynamicRate(double systemLoad, int queueSize) {
        double rate = baseSampleRate;
        if (systemLoad < 0.3) {
            // Low load: sample more.
            rate = Math.min(maxSampleRate, rate * 2);
        } else if (systemLoad > 0.7) {
            // High load: sample less.
            rate = Math.max(minSampleRate, rate / 2);
        }
        if (queueSize > QUEUE_PRESSURE_THRESHOLD) {
            // Backlog in the trace queue: back off further.
            rate = Math.max(minSampleRate, rate / 4);
        }
        return rate;
    }

    /**
     * Tells whether a message must always be traced, regardless of the sample rate.
     *
     * @param msg message to inspect; {@code null} is never force-sampled
     * @return {@code true} for transactional messages, messages whose PRIORITY property is
     *         HIGH or CRITICAL, and messages that have been retried
     */
    public boolean isForceSample(Message msg) {
        if (msg == null) {
            return false;
        }
        // 1. Transactional messages
        if (msg.getTransactionId() != null) {
            return true;
        }
        // 2. High-priority messages
        String priority = msg.getProperty("PRIORITY");
        if ("HIGH".equals(priority) || "CRITICAL".equals(priority)) {
            return true;
        }
        // 3. Retried messages
        return msg.getReconsumeTimes() > 0;
    }
}
4.3 故障排查案例
java
复制
下载
/**
 * Case study: investigating a consumption delay with the help of the message trace.
 */
public class ConsumptionDelayInvestigation {

    /**
     * Prints a per-stage timing breakdown for a message and drills into the slowest stage.
     *
     * @param msgId ID of the message to analyse
     */
    public void analyzeDelay(String msgId) {
        // 1. Load the complete trace of the message.
        List<TraceNode> trace = traceService.queryByMsgId(msgId);
        if (trace == null || trace.isEmpty()) {
            System.out.println("未找到消息轨迹: " + msgId);
            return;
        }

        // 2. Sum the durations per stage (node type) and overall.
        Map<String, Long> stageDurations = new HashMap<>();
        long totalDuration = 0;
        for (TraceNode node : trace) {
            long duration = node.getDuration();
            stageDurations.merge(node.getNodeType(), duration, Long::sum);
            totalDuration += duration;
        }

        // 3. The stage with the largest accumulated duration is the bottleneck.
        String bottleneck = null;
        long maxDuration = 0;
        for (Map.Entry<String, Long> entry : stageDurations.entrySet()) {
            if (entry.getValue() > maxDuration) {
                maxDuration = entry.getValue();
                bottleneck = entry.getKey();
            }
        }

        System.out.println("瓶颈分析结果:");
        System.out.println("消息ID: " + msgId);
        // The total is the sum over all stages, not the duration of the slowest stage.
        System.out.println("总耗时: " + totalDuration + "ms");
        System.out.println("瓶颈阶段: " + bottleneck);
        System.out.println("各阶段耗时:");
        stageDurations.forEach((stage, duration) -> {
            System.out.println(" " + stage + ": " + duration + "ms");
        });

        // 4. Drill into the bottleneck. Stages are node types (PRODUCER/BROKER/CONSUMER);
        //    the previously used names are still accepted.
        if ("CONSUMER".equals(bottleneck) || "CONSUME".equals(bottleneck)) {
            analyzeConsumptionBottleneck(trace);
        } else if ("BROKER".equals(bottleneck) || "STORE".equals(bottleneck)) {
            analyzeStorageBottleneck(trace);
        }
    }

    /**
     * Prints the details of every consumer node in the trace, followed by generic tuning
     * suggestions.
     *
     * @param trace complete trace of the message
     */
    private void analyzeConsumptionBottleneck(List<TraceNode> trace) {
        // Select the consumer nodes.
        List<TraceNode> consumeNodes = trace.stream()
                .filter(node -> "CONSUMER".equals(node.getNodeType()))
                .collect(Collectors.toList());

        System.out.println("\n消费阶段详细分析:");
        for (TraceNode node : consumeNodes) {
            System.out.println("消费者: " + node.getNodeName());
            System.out.println(" 开始时间: " + new Date(node.getTimestamp()));
            System.out.println(" 耗时: " + node.getDuration() + "ms");
            System.out.println(" 状态: " + node.getStatus());
            if (node.getTags() != null) {
                System.out.println(" 标签:");
                node.getTags().forEach((key, value) -> {
                    System.out.println(" " + key + ": " + value);
                });
            }
        }

        // Tuning suggestions
        System.out.println("\n优化建议:");
        System.out.println("1. 检查消费者线程池配置");
        System.out.println("2. 检查消息处理逻辑性能");
        System.out.println("3. 考虑增加消费者实例");
        System.out.println("4. 检查网络延迟");
    }
}
五、总结
核心价值
- 全链路可视化:端到端的消息流转跟踪
- 性能分析:识别系统瓶颈,优化架构
- 故障排查:快速定位问题根源
- 容量规划:基于数据做资源规划
关键技术点
- 异步采集:不影响主业务流程
- 批量发送:提高传输效率
- 分级存储:支持不同查询场景
- 智能采样:平衡开销和效果
适用场景
- 金融交易系统:需要严格的审计追踪
- 电商订单系统:排查订单状态异常
- 物流跟踪系统:实时跟踪物流状态
- 微服务架构:跨服务调用链追踪
通过RocketMQ消息轨迹追踪,可以实现从"黑盒"到"白盒"的转变,让消息流转过程完全透明化,极大提升系统的可观测性和可维护性。