Flink 的核心就一句话:处理源源不断产生的数据流。一个 Flink 作业通常由三步组成(三步串起来的最小代码骨架见列表后的示例):
- 拿到数据(Source):告诉 Flink 数据从哪来。来源可以是:Kafka(最常用)、文本文件、Socket、MySQL 等。
- 处理数据(Transformation):核心业务逻辑就在这里。就像写 SQL 一样,你可以进行 过滤、转换、分组、聚合、关联 等操作。
- 输出结果(Sink):告诉 Flink 处理完的数据送到哪去。目的地可以是:数据库(MySQL/ClickHouse)、消息队列(Kafka)、前端大屏、文件系统等。
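把这三步串起来,就是一个最小的 Flink 作业骨架。下面是一个仅用于说明结构的示意(用 socket 文本流当 Source、控制台打印当 Sink,主机和端口都是假设值):
java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class SkeletonJob {
    public static void main(String[] args) throws Exception {
        // 1. 创建执行环境
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Source:从 socket 读取文本行(假设本机 9999 端口)
        env.socketTextStream("localhost", 9999)
                // 3. Transformation:过滤空行并做简单转换
                .filter(line -> !line.trim().isEmpty())
                .map(String::toUpperCase)
                // 4. Sink:输出到控制台
                .print();
        // 5. 触发执行
        env.execute("Skeleton Job");
    }
}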
主要解决两种业务问题:
- 实时计算(流处理):数据一来就立刻处理,毫秒/秒级出结果。场景:实时监控异常(如服务器宕机、交易欺诈)、实时大屏展示(如双11GMV)、实时推荐(刷抖音时下个视频的推荐)
- 历史数据回溯(批处理):对已经存在的一大堆数据进行计算。场景:按月统计销售额、分析用户历史行为、数据仓库的ETL。
基于"实时网站用户访问量统计"场景
一、环境依赖
xml
<properties>
<flink.version>1.17.0</flink.version>
</properties>
<dependencies>
<!-- Flink Core -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Flink Kafka Connector -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- JDBC Connector -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-jdbc</artifactId>
<version>3.1.0-1.17</version>
</dependency>
<!-- MySQL Driver -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.33</version>
</dependency>
</dependencies>
二、数据模型定义
java
// 页面访问事件
public class PageViewEvent {
private String pageUrl;
private String userId;
private String eventType; // "page_view"
private Long timestamp;
// 构造器、getter、setter
public PageViewEvent() {}
public PageViewEvent(String pageUrl, String userId, String eventType, Long timestamp) {
this.pageUrl = pageUrl;
this.userId = userId;
this.eventType = eventType;
this.timestamp = timestamp;
}
// getters and setters...
public String getPageUrl() { return pageUrl; }
public void setPageUrl(String pageUrl) { this.pageUrl = pageUrl; }
public String getUserId() { return userId; }
public void setUserId(String userId) { this.userId = userId; }
public String getEventType() { return eventType; }
public void setEventType(String eventType) { this.eventType = eventType; }
public Long getTimestamp() { return timestamp; }
public void setTimestamp(Long timestamp) { this.timestamp = timestamp; }
@Override
public String toString() {
return "PageViewEvent{" +
"pageUrl='" + pageUrl + '\'' +
", userId='" + userId + '\'' +
", eventType='" + eventType + '\'' +
", timestamp=" + timestamp +
'}';
}
}
// 统计结果(单独一个文件)
import java.util.Date; // toString 中用于展示窗口起止时间

public class PageViewResult {
private String pageUrl;
private Long windowStart;
private Long windowEnd;
private Long userCount;
public PageViewResult() {}
public PageViewResult(String pageUrl, Long windowStart, Long windowEnd, Long userCount) {
this.pageUrl = pageUrl;
this.windowStart = windowStart;
this.windowEnd = windowEnd;
this.userCount = userCount;
}
// getters and setters...
public String getPageUrl() { return pageUrl; }
public void setPageUrl(String pageUrl) { this.pageUrl = pageUrl; }
public Long getWindowStart() { return windowStart; }
public void setWindowStart(Long windowStart) { this.windowStart = windowStart; }
public Long getWindowEnd() { return windowEnd; }
public void setWindowEnd(Long windowEnd) { this.windowEnd = windowEnd; }
public Long getUserCount() { return userCount; }
public void setUserCount(Long userCount) { this.userCount = userCount; }
@Override
public String toString() {
return "PageViewResult{" +
"pageUrl='" + pageUrl + '\'' +
", windowStart=" + new Date(windowStart) +
", windowEnd=" + new Date(windowEnd) +
", userCount=" + userCount +
'}';
}
}
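对应地,Kafka 中的每条消息就是一条这样的 JSON(字段名与 PageViewEvent 一一对应,timestamp 为毫秒时间戳,示例值为假设):
json
{"pageUrl": "/home", "userId": "user1", "eventType": "page_view", "timestamp": 1700000000000}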
三、核心逻辑
java
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.OutputTag;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.time.Duration;
public class RealTimePageViewAnalysis {
private static final ObjectMapper objectMapper = new ObjectMapper();
public static void main(String[] args) throws Exception {
//1.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//设置检查点(生产环境需要)
env.enableCheckpointing(5000);//5秒一次检查点
//2.定义数据源-kafka读取
KafkaSource<String> kafkaSource = KafkaSource.<String>builder()
.setBootstrapServers("localhost:9092") // Kafka地址
.setTopics("page-view-topic") // 主题名称
.setGroupId("flink-page-view-group") // 消费者组
.setStartingOffsets(OffsetsInitializer.latest())
.setValueOnlyDeserializer(new SimpleStringSchema())
.build();
//3.添加Kafka Source
DataStream<String> kafkaStream = env.fromSource(
kafkaSource,
WatermarkStrategy.noWatermarks(),
"Kafka Source"
);
//4.数据转换:JSON字符串 -> PageViewEvent对象
SingleOutputStreamOperator<PageViewEvent> eventStream = kafkaStream.map(new MapFunction<String, PageViewEvent>() {
@Override
public PageViewEvent map(String value) throws Exception {
try {
// 假设Kafka中的消息是JSON格式
return objectMapper.readValue(value, PageViewEvent.class);
} catch (Exception e) {
// 解析失败的数据可以单独处理
System.err.println("解析JSON失败: " + value);
return null;
}
}
})
.filter(new FilterFunction<PageViewEvent>() {
@Override
public boolean filter(PageViewEvent event) throws Exception {
// 过滤掉解析失败的数据和无效数据
return event != null && event.getUserId() != null && event.getPageUrl() != null;
}
});
//5.分配水印和时间戳
SingleOutputStreamOperator<PageViewEvent> timedStream = eventStream
.assignTimestampsAndWatermarks(WatermarkStrategy.<PageViewEvent>forBoundedOutOfOrderness(Duration.ofSeconds(5))
.withTimestampAssigner((event, timestamp) -> event.getTimestamp()));
//6.定义侧输出流(用于处理延迟数据)
OutputTag<PageViewEvent> lateDataTag = new OutputTag<PageViewEvent>("late-data"){};
// 7. 核心处理逻辑:过滤 -> 分组 -> 开窗 -> 聚合
SingleOutputStreamOperator<PageViewResult> resultStream = timedStream
// 过滤出页面浏览事件
.filter(new FilterFunction<PageViewEvent>() {
@Override
public boolean filter(PageViewEvent event) throws Exception {
return "page_view".equals(event.getEventType());
}
})
// 按页面URL分组
.keyBy(PageViewEvent::getPageUrl)
// 开1分钟的滚动窗口,允许2秒延迟
.window(TumblingEventTimeWindows.of(Time.minutes(1)))
.allowedLateness(Time.seconds(2))
.sideOutputLateData(lateDataTag)
// 聚合:统计独立用户数
.aggregate(new UserCountAggregate(), new PageViewWindowFunction());
// 8. 输出结果到控制台(测试用)
resultStream.print("页面访问统计");
// 9. 输出结果到MySQL(生产环境)
resultStream.addSink(MySQLSink.create());
// 10. 处理延迟数据(可选)
DataStream<PageViewEvent> lateDataStream = resultStream.getSideOutput(lateDataTag);
lateDataStream.print("延迟数据");
// 11. 执行任务
env.execute("Real-time Page View Analysis");
}
}
四、聚合函数
java
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.util.HashSet;
import java.util.Set;
// 聚合函数:统计独立用户数
public class UserCountAggregate implements AggregateFunction<
PageViewEvent,
Tuple2<Set<String>, Long>, // accumulator: (userSet, count)
Long> { // output: userCount
@Override
public Tuple2<Set<String>, Long> createAccumulator() {
return Tuple2.of(new HashSet<>(), 0L);
}
@Override
public Tuple2<Set<String>, Long> add(PageViewEvent value, Tuple2<Set<String>, Long> accumulator) {
Set<String> userSet = accumulator.f0;
// 如果用户ID不在集合中,则计数+1
if (userSet.add(value.getUserId())) {
return Tuple2.of(userSet, accumulator.f1 + 1);
}
return accumulator;
}
@Override
public Long getResult(Tuple2<Set<String>, Long> accumulator) {
return accumulator.f1;
}
@Override
public Tuple2<Set<String>, Long> merge(Tuple2<Set<String>, Long> a, Tuple2<Set<String>, Long> b) {
a.f0.addAll(b.f0);
return Tuple2.of(a.f0, (long) a.f0.size());
}
}
// 窗口函数:包装结果
public class PageViewWindowFunction implements WindowFunction<
Long, // aggregate result
PageViewResult, // output type
String, // key type (pageUrl)
TimeWindow> { // window type
@Override
public void apply(String pageUrl, TimeWindow window,
Iterable<Long> input, Collector<PageViewResult> out) {
Long userCount = input.iterator().next(); // 获取聚合结果
PageViewResult result = new PageViewResult(
pageUrl,
window.getStart(),
window.getEnd(),
userCount
);
out.collect(result);
}
}
五、MySQL Sink实现
java
import org.apache.flink.connector.jdbc.JdbcConnectionOptions;
import org.apache.flink.connector.jdbc.JdbcExecutionOptions;
import org.apache.flink.connector.jdbc.JdbcSink;
import java.sql.PreparedStatement;
import java.sql.SQLException;
public class MySQLSink {
public static org.apache.flink.streaming.api.functions.sink.SinkFunction<PageViewResult> create() {
return JdbcSink.sink(
// SQL语句
"INSERT INTO page_view_stats (page_url, window_start, window_end, user_count, create_time) " +
"VALUES (?, ?, ?, ?, NOW()) " +
"ON DUPLICATE KEY UPDATE user_count = VALUES(user_count), update_time = NOW()",
// 参数设置
(PreparedStatement ps, PageViewResult result) -> {
ps.setString(1, result.getPageUrl());
ps.setTimestamp(2, new java.sql.Timestamp(result.getWindowStart()));
ps.setTimestamp(3, new java.sql.Timestamp(result.getWindowEnd()));
ps.setLong(4, result.getUserCount());
},
// 执行选项
JdbcExecutionOptions.builder()
.withBatchSize(1000)
.withBatchIntervalMs(200)
.withMaxRetries(5)
.build(),
// 连接选项
new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
.withUrl("jdbc:mysql://localhost:3306/analytics_db")
.withDriverName("com.mysql.cj.jdbc.Driver")
.withUsername("root")
.withPassword("password")
.build()
);
}
}
六、MySQL 表结构
sql
CREATE TABLE page_view_stats (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
page_url VARCHAR(500) NOT NULL,
window_start TIMESTAMP NOT NULL,
window_end TIMESTAMP NOT NULL,
user_count BIGINT NOT NULL,
create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY uk_page_window (page_url, window_start)
);
-- 创建索引
CREATE INDEX idx_window ON page_view_stats(window_start, window_end);
CREATE INDEX idx_page ON page_view_stats(page_url);
七、测试数据生成器
java
// 用于测试的数据生成器
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class TestDataGenerator {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// 生成测试数据(timestamp 使用当前毫秒时间戳,拼接在JSON字符串之外)
long now = System.currentTimeMillis();
DataStream<String> testStream = env.fromElements(
"{\"pageUrl\":\"/home\",\"userId\":\"user1\",\"eventType\":\"page_view\",\"timestamp\":" + now + "}",
"{\"pageUrl\":\"/home\",\"userId\":\"user2\",\"eventType\":\"page_view\",\"timestamp\":" + now + "}",
"{\"pageUrl\":\"/product/123\",\"userId\":\"user1\",\"eventType\":\"page_view\",\"timestamp\":" + now + "}",
"{\"pageUrl\":\"/home\",\"userId\":\"user3\",\"eventType\":\"page_view\",\"timestamp\":" + now + "}"
);
// 实际发送到Kafka时,Flink 1.17 推荐使用 KafkaSink(示意写法见本段代码之后)
// testStream.sinkTo(kafkaSink);
testStream.print();
env.execute("Test Data Generator");
}
}
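上面注释掉的发送步骤,在 Flink 1.17 中可以用 KafkaSink 实现。下面是一段放在 TestDataGenerator 的 main 方法中的示意代码(假设 Kafka 地址为 localhost:9092,主题与前文一致):
java
// 需要的 import
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.base.DeliveryGuarantee;
import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
import org.apache.flink.connector.kafka.sink.KafkaSink;

KafkaSink<String> kafkaSink = KafkaSink.<String>builder()
        .setBootstrapServers("localhost:9092")                     // Kafka地址(假设)
        .setRecordSerializer(KafkaRecordSerializationSchema.builder()
                .setTopic("page-view-topic")                       // 与消费端使用同一主题
                .setValueSerializationSchema(new SimpleStringSchema())
                .build())
        .setDeliveryGuarantee(DeliveryGuarantee.AT_LEAST_ONCE)     // 至少一次语义
        .build();

// 把测试数据写入Kafka
testStream.sinkTo(kafkaSink);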
运行步骤
- 启动Kafka,创建 page-view-topic 主题
- 创建MySQL表 page_view_stats
- 运行Flink作业:打包部署或IDE中直接运行
- 发送测试数据到Kafka主题(命令示例见下)
- 查看结果:观察控制台输出和 MySQL 表 page_view_stats 中的统计数据
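其中创建主题和发送测试数据这两步,可以直接用 Kafka 自带的命令行脚本完成,示例命令如下(脚本路径和地址按实际安装环境调整):
shell
# 创建主题
bin/kafka-topics.sh --create --topic page-view-topic --bootstrap-server localhost:9092

# 启动控制台生产者,逐行粘贴上文格式的JSON消息即可发送
bin/kafka-console-producer.sh --topic page-view-topic --bootstrap-server localhost:9092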