I. Log Analysis Platform Overview
A log analysis platform is core infrastructure for operations monitoring.
Core challenges:
- Log collection at scale (TBs per day)
- Real-time search and analysis (second-level response)
- Multi-dimensional aggregation and statistics
- Visualization
Key targets:
- Log throughput: 100+ GB/hour
- Search latency: < 1 second
- Data volume: PB scale
- Availability: 99.99%
II. System Architecture
┌─────────────────────────────────────────────────────────────────┐
│             Log Analysis Platform Architecture (ELK)             │
├─────────────────────────────────────────────────────────────────┤
│                                                                  │
│   ┌──────────────────────────────────────────────────────────┐  │
│   │                    Collection Layer                       │  │
│   │        Filebeat | Metricbeat | Logstash | Fluentd         │  │
│   └──────────────────────────────────────────────────────────┘  │
│                              │                                   │
│                              ▼                                   │
│   ┌──────────────────────────────────────────────────────────┐  │
│   │                  Message Queue Layer                      │  │
│   │                     Kafka | Redis                         │  │
│   └──────────────────────────────────────────────────────────┘  │
│                              │                                   │
│                              ▼                                   │
│   ┌──────────────────────────────────────────────────────────┐  │
│   │                Storage & Compute Layer                    │  │
│   │                    Elasticsearch                          │  │
│   └──────────────────────────────────────────────────────────┘  │
│                              │                                   │
│                              ▼                                   │
│   ┌──────────────────────────────────────────────────────────┐  │
│   │                  Visualization Layer                      │  │
│   │                   Kibana | Grafana                        │  │
│   └──────────────────────────────────────────────────────────┘  │
│                                                                  │
└─────────────────────────────────────────────────────────────────┘
III. Log Collection
1. Filebeat configuration
yaml
# filebeat.yml
filebeat.inputs:
  - type: log
    enabled: true
    paths:
      - /var/log/*.log
      - /var/log/**/*.log
    fields:
      app: order-service
      env: production
    # Stitch multi-line entries (e.g. stack traces): any line that does not
    # start with a date is appended to the previous event
    multiline:
      pattern: '^\d{4}-\d{2}-\d{2}'
      negate: true
      match: after

output.kafka:
  hosts: ["kafka1:9092", "kafka2:9092"]
  topic: 'app-logs'
  partition.round_robin:
    reachable_only: false
  required_acks: 1
  compression: gzip

processors:
  - add_host_metadata:
      when.not.contains.tags: forwarded
  - add_cloud_metadata: ~
  - add_docker_metadata: ~
2. Logstash pipeline
ruby
# pipeline.conf
input {
  kafka {
    bootstrap_servers => "kafka:9092"
    topics            => ["app-logs"]
    group_id          => "logstash"
    codec             => json
  }
}

filter {
  # Parse the event timestamp
  date {
    match  => ["timestamp", "ISO8601"]
    target => "@timestamp"
  }

  # Parse JSON-formatted messages
  if [message] =~ /^\{/ {
    json {
      source => "message"
      target => "parsed"
    }
  }

  # Compute the daily target index name
  mutate {
    add_field => { "[@metadata][index]" => "app-logs-%{+YYYY.MM.dd}" }
  }

  # Mask sensitive values
  mutate {
    gsub => [
      "message", "password=[^&\s]+", "password=***",
      "message", "token=[^&\s]+", "token=***"
    ]
  }
}

output {
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    index => "%{[@metadata][index]}"
    # document_type is deprecated since Elasticsearch 6 and removed in 8
  }
}
3. Custom collector
java
@Service
public class LogCollectorService {

    @Autowired
    private KafkaTemplate<String, LogEntry> kafkaTemplate;

    // Collect one application log line
    // (helpers getHostname/getLocalIp/extractThread omitted for brevity)
    public void collect(String serviceName, String logLine) {
        LogEntry entry = LogEntry.builder()
                .id(IdGenerator.nextId())
                .service(serviceName)
                .content(logLine)
                .hostname(getHostname())
                .ip(getLocalIp())
                .timestamp(System.currentTimeMillis())
                .level(extractLogLevel(logLine))
                .thread(extractThread(logLine))
                .build();
        // Send to Kafka, keyed by service name so one service's logs stay ordered
        kafkaTemplate.send("app-logs", serviceName, entry);
    }

    // Extract the log level from the raw line
    private String extractLogLevel(String logLine) {
        if (logLine.contains("ERROR")) return "ERROR";
        if (logLine.contains("WARN"))  return "WARN";
        if (logLine.contains("INFO"))  return "INFO";
        if (logLine.contains("DEBUG")) return "DEBUG";
        return "UNKNOWN";
    }
}
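On the other side of the topic, a consumer has to move entries from Kafka into storage. A minimal Spring Kafka sketch that feeds the LogStorageService defined in section IV below; it assumes a JSON deserializer for LogEntry and a batch-enabled listener container factory, neither of which is shown in this article:
java
@Component
public class LogConsumer {

    @Autowired
    private LogStorageService logStorageService;  // defined in section IV

    // Batch listener: receives up to max.poll.records entries per poll
    @KafkaListener(topics = "app-logs", groupId = "log-storage")
    public void onLogs(List<LogEntry> entries) {
        // One bulk request per Kafka batch keeps Elasticsearch indexing overhead low
        logStorageService.bulkStore(entries);
    }
}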
IV. Log Storage
1. Elasticsearch index template
json
// Index template (PUT _template/app-logs)
{
  "index_patterns": ["app-logs-*"],
  "settings": {
    "number_of_shards": 5,
    "number_of_replicas": 1,
    "index.refresh_interval": "5s",
    "index.translog.durability": "async"
  },
  "mappings": {
    "properties": {
      "@timestamp":  { "type": "date" },
      "service":     { "type": "keyword" },
      "hostname":    { "type": "keyword" },
      "ip":          { "type": "ip" },
      "level":       { "type": "keyword" },
      "thread":      { "type": "keyword" },
      "message":     { "type": "text", "analyzer": "standard" },
      "trace_id":    { "type": "keyword" },
      "span_id":     { "type": "keyword" },
      "exception":   { "type": "text",
                       "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } },
      "stack_trace": { "type": "text" }
    }
  }
}
Note that exception carries a keyword subfield so it can be used in terms aggregations (see the top-errors query in section V).
2. Index lifecycle management
json
// ILM policy (PUT _ilm/policy/app-logs-policy; the policy name lives in the URL)
{
  "policy": {
    "phases": {
      "hot": {
        "min_age": "0ms",
        "actions": {
          "rollover": {
            "max_age": "1d",
            "max_size": "50gb"
          }
        }
      },
      "warm": {
        "min_age": "7d",
        "actions": {
          "shrink": { "number_of_shards": 1 },
          "forcemerge": { "max_num_segments": 1 }
        }
      },
      "cold": {
        "min_age": "30d",
        "actions": {
          "freeze": {}
        }
      },
      "delete": {
        "min_age": "90d",
        "actions": {
          "delete": {}
        }
      }
    }
  }
}
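Both documents can be applied at startup with the Elasticsearch low-level REST client. A minimal sketch, assuming the two JSON bodies above are bundled as classpath resources named app-logs-template.json and app-logs-policy.json (hypothetical names) and that a RestClient bean is already configured:
java
@Component
public class EsBootstrap {

    @Autowired
    private RestClient restClient;  // org.elasticsearch.client.RestClient (low-level client)

    @PostConstruct
    public void applyTemplateAndPolicy() throws IOException {
        // PUT the index template (section IV.1)
        Request template = new Request("PUT", "/_template/app-logs");
        template.setJsonEntity(readResource("app-logs-template.json"));
        restClient.performRequest(template);

        // PUT the ILM policy (section IV.2)
        Request policy = new Request("PUT", "/_ilm/policy/app-logs-policy");
        policy.setJsonEntity(readResource("app-logs-policy.json"));
        restClient.performRequest(policy);
    }

    private String readResource(String name) throws IOException {
        try (InputStream in = getClass().getClassLoader().getResourceAsStream(name)) {
            return new String(in.readAllBytes(), StandardCharsets.UTF_8);
        }
    }
}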
3. Log storage service
java
@Service
public class LogStorageService {

    // "app-logs" is assumed to be a write alias managed by the ILM rollover above.
    // Spring Data Elasticsearch 4.x: index operations take explicit IndexCoordinates.
    private static final IndexCoordinates INDEX = IndexCoordinates.of("app-logs");

    @Autowired
    private ElasticsearchRestTemplate esTemplate;

    // Index a single log entry
    public void store(LogEntry entry) {
        IndexQuery indexQuery = new IndexQueryBuilder()
                .withId(entry.getId())
                .withObject(entry)
                .build();
        esTemplate.index(indexQuery, INDEX);
    }

    // Bulk-index a batch of entries
    public void bulkStore(List<LogEntry> entries) {
        List<IndexQuery> queries = entries.stream()
                .map(entry -> new IndexQueryBuilder()
                        .withId(entry.getId())
                        .withObject(entry)
                        .build())
                .collect(Collectors.toList());
        esTemplate.bulkIndex(queries, INDEX);
    }
}
V. Log Search
1. Search service
java
@Service
public class LogSearchService {

    @Autowired
    private ElasticsearchOperations elasticsearchOperations;

    // Search logs with paging, newest first, highlighting keyword matches
    public SearchResult search(LogSearchRequest request) {
        NativeSearchQuery query = new NativeSearchQueryBuilder()
                .withQuery(buildQuery(request))
                .withPageable(PageRequest.of(request.getPage(), request.getSize()))
                .withSort(SortBuilders.fieldSort("@timestamp").order(SortOrder.DESC))
                .withHighlightFields(new HighlightBuilder.Field("message"))
                .build();
        SearchHits<LogEntry> hits = elasticsearchOperations.search(query, LogEntry.class);
        return SearchResult.builder()
                .total(hits.getTotalHits())
                .logs(hits.getSearchHits().stream()
                        .map(SearchHit::getContent)
                        .collect(Collectors.toList()))
                .build();
    }

    // Build the bool query from the request filters
    private QueryBuilder buildQuery(LogSearchRequest request) {
        BoolQueryBuilder builder = QueryBuilders.boolQuery();
        // Filter by service
        if (StringUtils.isNotBlank(request.getService())) {
            builder.must(QueryBuilders.termQuery("service", request.getService()));
        }
        // Filter by log level
        if (request.getLevels() != null && !request.getLevels().isEmpty()) {
            builder.must(QueryBuilders.termsQuery("level", request.getLevels()));
        }
        // Full-text keyword search
        if (StringUtils.isNotBlank(request.getKeyword())) {
            builder.must(QueryBuilders.matchQuery("message", request.getKeyword()));
        }
        // Time range
        if (request.getStartTime() != null && request.getEndTime() != null) {
            builder.must(QueryBuilders.rangeQuery("@timestamp")
                    .gte(request.getStartTime())
                    .lte(request.getEndTime()));
        }
        return builder;
    }
}
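A thin controller is enough to expose the search service over HTTP. A sketch; the path and verb are illustrative, not part of the original design:
java
@RestController
@RequestMapping("/api/logs")
public class LogSearchController {

    @Autowired
    private LogSearchService logSearchService;

    // POST /api/logs/search with the filters in the request body
    @PostMapping("/search")
    public SearchResult search(@RequestBody LogSearchRequest request) {
        return logSearchService.search(request);
    }
}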
2. Aggregation analysis
java
@Service
public class LogAggregationService {

    @Autowired
    private ElasticsearchOperations elasticsearchOperations;

    // Aggregate log statistics over a time window
    public AggregationResult aggregate(AggregationRequest request) {
        NativeSearchQuery query = new NativeSearchQueryBuilder()
                .withQuery(QueryBuilders.rangeQuery("@timestamp")
                        .gte(request.getStartTime())
                        .lte(request.getEndTime()))
                .addAggregation(AggregationBuilders.terms("by_level").field("level"))
                .addAggregation(AggregationBuilders.terms("by_service").field("service"))
                .addAggregation(AggregationBuilders.dateHistogram("by_time")
                        .field("@timestamp")
                        .calendarInterval(DateHistogramInterval.HOUR))
                // cardinality = number of distinct trace ids in the window
                .addAggregation(AggregationBuilders.cardinality("unique_traces")
                        .field("trace_id"))
                .withMaxResults(0)  // aggregations only, no hits
                .build();
        SearchHits<LogEntry> hits = elasticsearchOperations.search(query, LogEntry.class);
        AggregationsContainer<?> aggs = hits.getAggregations();
        return parseAggregation(aggs);
    }

    // Top-N exceptions among ERROR logs, broken down by service
    public List<ErrorStat> topErrors(String startTime, String endTime) {
        NativeSearchQuery query = new NativeSearchQueryBuilder()
                .withQuery(QueryBuilders.boolQuery()
                        .must(QueryBuilders.termQuery("level", "ERROR"))
                        .must(QueryBuilders.rangeQuery("@timestamp")
                                .gte(startTime).lte(endTime)))
                .addAggregation(AggregationBuilders.terms("by_exception")
                        .field("exception.keyword")  // keyword subfield from the index template
                        .size(10)
                        .subAggregation(AggregationBuilders.terms("by_service")
                                .field("service")))
                .withMaxResults(0)
                .build();
        return parseTopErrors(elasticsearchOperations.search(query, LogEntry.class));
    }
}
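The parseAggregation helper is referenced but never shown. A sketch of what it might look like, assuming Spring Data Elasticsearch 4.3 (where the container wraps the ES client's Aggregations object) and an AggregationResult with byLevel/uniqueTraces fields, both of which are hypothetical:
java
private AggregationResult parseAggregation(AggregationsContainer<?> container) {
    // Unwrap the Spring Data container to reach the ES client's Aggregations
    Aggregations aggs = ((ElasticsearchAggregations) container).aggregations();

    // level -> document count
    Map<String, Long> byLevel = new LinkedHashMap<>();
    Terms levelTerms = aggs.get("by_level");
    for (Terms.Bucket bucket : levelTerms.getBuckets()) {
        byLevel.put(bucket.getKeyAsString(), bucket.getDocCount());
    }

    // number of distinct trace ids in the window
    Cardinality uniqueTraces = aggs.get("unique_traces");

    return AggregationResult.builder()
            .byLevel(byLevel)
            .uniqueTraces(uniqueTraces.getValue())
            .build();
}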
VI. Log Visualization
1. Kibana dashboard
json
// Kibana dashboard layout (simplified)
{
  "title": "Application Log Monitoring",
  "panels": [
    {
      "title": "Log Volume Trend",
      "type": "line",
      "gridData": { "x": 0, "y": 0, "w": 24, "h": 8 },
      "query": {
        "aggs": {
          "by_time": {
            "date_histogram": { "field": "@timestamp", "fixed_interval": "5m" }
          }
        }
      }
    },
    {
      "title": "Log Level Distribution",
      "type": "pie",
      "gridData": { "x": 24, "y": 0, "w": 12, "h": 8 },
      "query": {
        "aggs": {
          "by_level": { "terms": { "field": "level" } }
        }
      }
    },
    {
      "title": "Per-Service Log Distribution",
      "type": "bar",
      "gridData": { "x": 36, "y": 0, "w": 12, "h": 8 },
      "query": {
        "aggs": {
          "by_service": { "terms": { "field": "service" } }
        }
      }
    }
  ]
}
2. Alerting rule
json
// Elasticsearch Watcher rule (PUT _watcher/watch/error-rate-alert)
{
  "trigger": {
    "schedule": { "interval": "5m" }
  },
  "input": {
    "search": {
      "request": {
        "indices": ["app-logs-*"],
        "body": {
          "query": {
            "bool": {
              "must": [
                { "term": { "level": "ERROR" } },
                { "range": { "@timestamp": { "gte": "now-5m" } } }
              ]
            }
          }
        }
      }
    }
  },
  "condition": {
    "compare": { "ctx.payload.hits.total": { "gt": 100 } }
  },
  "actions": {
    "notify": {
      "webhook": {
        "method": "post",
        "url": "https://alert.example.com/webhook"
      }
    }
  }
}
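On the receiving end of the webhook, a minimal Spring endpoint sketch; the payload is treated as an opaque JSON map, and @Slf4j assumes Lombok (which the LogEntry builder already implies):
java
@Slf4j
@RestController
public class AlertWebhookController {

    // Receives the Watcher notification; in practice this would page on-call
    @PostMapping("/webhook")
    public ResponseEntity<Void> onAlert(@RequestBody Map<String, Object> payload) {
        log.warn("Error-rate alert received: {}", payload);
        return ResponseEntity.ok().build();
    }
}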
VII. Log Tracing
1. Distributed tracing
java
// Inject a trace id into the logging context (MDC) for every request
@Component
public class TraceInterceptor implements HandlerInterceptor {

    @Override
    public boolean preHandle(HttpServletRequest request, HttpServletResponse response,
                             Object handler) {
        // Reuse the caller's trace id if present, otherwise start a new one
        String traceId = request.getHeader("X-Trace-Id");
        if (traceId == null || traceId.isEmpty()) {
            traceId = UUID.randomUUID().toString();
        }
        MDC.put("traceId", traceId);
        response.setHeader("X-Trace-Id", traceId);
        return true;
    }

    @Override
    public void afterCompletion(HttpServletRequest request, HttpServletResponse response,
                                Object handler, Exception ex) {
        // Always clear the MDC to avoid leaking ids across pooled threads
        MDC.remove("traceId");
    }
}
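The interceptor only takes effect once registered with Spring MVC; a minimal registration sketch:
java
@Configuration
public class WebConfig implements WebMvcConfigurer {

    @Autowired
    private TraceInterceptor traceInterceptor;

    @Override
    public void addInterceptors(InterceptorRegistry registry) {
        // Apply trace-id propagation to every request path
        registry.addInterceptor(traceInterceptor).addPathPatterns("/**");
    }
}
With the id in the MDC, a logback pattern containing %X{traceId} stamps it onto every log line, which is what lets the pipeline parse it into the trace_id field queried below.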
2. Request log trail
java
@Service
public class LogTraceService {

    @Autowired
    private ElasticsearchOperations elasticsearchOperations;

    // Reconstruct a request's log trail across services, ordered by time
    public List<LogEntry> traceRequest(String traceId) {
        NativeSearchQuery query = new NativeSearchQueryBuilder()
                .withQuery(QueryBuilders.termQuery("trace_id", traceId))
                .withSort(SortBuilders.fieldSort("@timestamp").order(SortOrder.ASC))
                .build();
        SearchHits<LogEntry> hits = elasticsearchOperations.search(query, LogEntry.class);
        return hits.getSearchHits().stream()
                .map(SearchHit::getContent)
                .collect(Collectors.toList());
    }
}
VIII. Summary
Key design points of a log analysis platform:
- Collection: Filebeat + Logstash
- Buffering: Kafka as the message queue
- Storage: Elasticsearch
- Visualization: Kibana dashboards
Technology options:
- ELK (Elasticsearch + Logstash + Kibana)
- Loki (lightweight log aggregation)
- Splunk (commercial offering)