Spark Use Cases
Batch processing (Spark Core)
- Large-scale ETL data processing
- Data warehouse construction
- Offline data analysis
Stream processing (Spark Streaming)
- Near-real-time data processing (micro-batching)
- Real-time monitoring and alerting
- Real-time data synchronization
Interactive queries (Spark SQL)
- Ad-hoc query analysis
- Data exploration
- BI reporting
Machine learning (MLlib)
- Large-scale model training
- Recommender systems
- User profiling
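To make the batch and interactive-query cases concrete, here is a minimal, hedged Java sketch: it loads one day of page-view logs and runs an ad-hoc Spark SQL query over them. The input path and column names are invented for the example.
java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class BatchEtlExample {
    public static void main(String[] args) {
        // Local session for experimentation; on a cluster the master comes from spark-submit
        SparkSession spark = SparkSession.builder()
                .appName("BatchEtlExample")
                .master("local[*]")
                .getOrCreate();

        // Hypothetical input: a day of page-view logs in Parquet with pageUrl / userId columns
        Dataset<Row> logs = spark.read().parquet("/data/page_views/2024-01-01");
        logs.createOrReplaceTempView("page_views");

        // Ad-hoc query: distinct visitors per page
        spark.sql("SELECT pageUrl, COUNT(DISTINCT userId) AS uv " +
                  "FROM page_views GROUP BY pageUrl ORDER BY uv DESC")
             .show(20, false);

        spark.stop();
    }
}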
Implementing Real-Time Website Visitor Statistics with Spark
1. Dependencies
xml
<properties>
    <spark.version>3.4.0</spark.version>
    <!-- Scala binary version of the Spark artifacts -->
    <scala.version>2.12</scala.version>
</properties>

<dependencies>
    <!-- Spark Core -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- Spark SQL -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- Spark Streaming (DStream API) -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- Kafka integration for Spark Streaming (DStream API, section 3) -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- Kafka source for Structured Streaming (section 4) -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql-kafka-0-10_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- MySQL JDBC driver -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.33</version>
    </dependency>
</dependencies>
2. Data Model Definitions
java
import java.io.Serializable;
import java.sql.Timestamp;

// PageViewEvent.java — a single page-view event (one public class per file)
public class PageViewEvent implements Serializable {
    private String pageUrl;
    private String userId;
    private String eventType;
    private Long timestamp;

    public PageViewEvent() {}

    public PageViewEvent(String pageUrl, String userId, String eventType, Long timestamp) {
        this.pageUrl = pageUrl;
        this.userId = userId;
        this.eventType = eventType;
        this.timestamp = timestamp;
    }

    // getters and setters
    public String getPageUrl() { return pageUrl; }
    public void setPageUrl(String pageUrl) { this.pageUrl = pageUrl; }
    public String getUserId() { return userId; }
    public void setUserId(String userId) { this.userId = userId; }
    public String getEventType() { return eventType; }
    public void setEventType(String eventType) { this.eventType = eventType; }
    public Long getTimestamp() { return timestamp; }
    public void setTimestamp(Long timestamp) { this.timestamp = timestamp; }

    @Override
    public String toString() {
        return "PageViewEvent{" +
                "pageUrl='" + pageUrl + '\'' +
                ", userId='" + userId + '\'' +
                ", eventType='" + eventType + '\'' +
                ", timestamp=" + timestamp +
                '}';
    }
}

// PageViewResult.java — windowed aggregation result
public class PageViewResult implements Serializable {
    private String pageUrl;
    private Timestamp windowStart;
    private Timestamp windowEnd;
    private Long userCount;

    public PageViewResult() {}

    public PageViewResult(String pageUrl, Timestamp windowStart, Timestamp windowEnd, Long userCount) {
        this.pageUrl = pageUrl;
        this.windowStart = windowStart;
        this.windowEnd = windowEnd;
        this.userCount = userCount;
    }

    // getters and setters
    public String getPageUrl() { return pageUrl; }
    public void setPageUrl(String pageUrl) { this.pageUrl = pageUrl; }
    public Timestamp getWindowStart() { return windowStart; }
    public void setWindowStart(Timestamp windowStart) { this.windowStart = windowStart; }
    public Timestamp getWindowEnd() { return windowEnd; }
    public void setWindowEnd(Timestamp windowEnd) { this.windowEnd = windowEnd; }
    public Long getUserCount() { return userCount; }
    public void setUserCount(Long userCount) { this.userCount = userCount; }

    @Override
    public String toString() {
        return "PageViewResult{" +
                "pageUrl='" + pageUrl + '\'' +
                ", windowStart=" + windowStart +
                ", windowEnd=" + windowEnd +
                ", userCount=" + userCount +
                '}';
    }
}
3. Core Processing Logic (Spark Streaming)
java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.*;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;

import java.sql.Timestamp;
import java.util.*;
public class RealTimePageViewAnalysisSpark {

    public static void main(String[] args) throws Exception {
        // 1. Spark configuration
        SparkConf sparkConf = new SparkConf()
                .setAppName("RealTimePageViewAnalysisSpark")
                .setMaster("local[2]") // remove in production; set the master via spark-submit
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");

        // 2. Streaming context with a 10-second batch interval
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(10));

        // 3. Kafka consumer parameters
        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "localhost:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "spark-page-view-group");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        // 4. Topic(s) to subscribe to
        Collection<String> topics = Arrays.asList("page-view-topic");

        // 5. Kafka direct stream
        JavaInputDStream<ConsumerRecord<String, String>> stream =
                KafkaUtils.createDirectStream(
                        jssc,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams)
                );

        // 6. Extract the message value
        JavaDStream<String> lines = stream.map(ConsumerRecord::value);

        // 7. Parse the JSON and aggregate each micro-batch
        JavaDStream<PageViewResult> results = lines.transform(rdd -> {
            // Reuse (or lazily create) a SparkSession for SQL processing
            SparkSession spark = SparkSession.builder()
                    .config(rdd.context().getConf())
                    .getOrCreate();

            // Convert the RDD of JSON strings into a typed RDD of events
            JavaRDD<PageViewEvent> eventsRDD = rdd.map(line -> {
                // Simplified JSON parsing; use Jackson in production
                return parseJsonToEvent(line);
            }).filter(event -> event != null && "page_view".equals(event.getEventType()));

            Dataset<Row> eventsDF = spark.createDataFrame(eventsRDD, PageViewEvent.class);

            // Register a temporary view
            eventsDF.createOrReplaceTempView("page_events");

            // Windowed aggregation with Spark SQL
            // (the epoch-millis field is cast to a TIMESTAMP before windowing)
            Dataset<Row> resultDF = spark.sql(
                    "SELECT " +
                    "  pageUrl, " +
                    "  window(CAST(timestamp / 1000 AS TIMESTAMP), '1 minute') AS window, " +
                    "  COUNT(DISTINCT userId) AS userCount " +
                    "FROM page_events " +
                    "WHERE eventType = 'page_view' " +
                    "GROUP BY pageUrl, window " +
                    "ORDER BY window.start, pageUrl"
            );

            // Convert back to an RDD of result objects
            return resultDF.toJavaRDD().map(row -> {
                String pageUrl = row.getString(0);
                Row window = row.getStruct(1);
                Timestamp start = window.getTimestamp(0);
                Timestamp end = window.getTimestamp(1);
                Long userCount = row.getLong(2);
                return new PageViewResult(pageUrl, start, end, userCount);
            });
        });

        // 8. Print the results (foreach runs on the executors; in local mode this reaches the console)
        results.foreachRDD(rdd -> {
            System.out.println("=== Page view statistics ===");
            rdd.foreach(result -> System.out.println(result.toString()));
            System.out.println("============================");
            // Persist to MySQL
            if (!rdd.isEmpty()) {
                saveToMySQL(rdd);
            }
        });

        // 9. Start the streaming job
        jssc.start();
        jssc.awaitTermination();
    }
    // JSON parsing (JSON string -> event object)
    private static PageViewEvent parseJsonToEvent(String json) {
        try {
            // Simplified hand-rolled parsing; use Jackson in production
            json = json.replace("{", "").replace("}", "").replace("\"", "");
            String[] pairs = json.split(",");
            Map<String, String> map = new HashMap<>();
            for (String pair : pairs) {
                String[] keyValue = pair.split(":");
                if (keyValue.length == 2) {
                    map.put(keyValue[0].trim(), keyValue[1].trim());
                }
            }
            return new PageViewEvent(
                    map.get("pageUrl"),
                    map.get("userId"),
                    map.get("eventType"),
                    Long.parseLong(map.get("timestamp"))
            );
        } catch (Exception e) {
            System.err.println("Failed to parse JSON: " + json);
            return null;
        }
    }
    // Persist the results to MySQL
    private static void saveToMySQL(JavaRDD<PageViewResult> rdd) {
        rdd.foreachPartition(partition -> {
            // One JDBC connection per partition
            java.sql.Connection connection = null;
            java.sql.PreparedStatement statement = null;
            try {
                Class.forName("com.mysql.cj.jdbc.Driver");
                connection = java.sql.DriverManager.getConnection(
                        "jdbc:mysql://localhost:3306/analytics_db",
                        "root", "password");
                String sql = "INSERT INTO page_view_stats " +
                        "(page_url, window_start, window_end, user_count, create_time) " +
                        "VALUES (?, ?, ?, ?, NOW()) " +
                        "ON DUPLICATE KEY UPDATE user_count = VALUES(user_count), update_time = NOW()";
                statement = connection.prepareStatement(sql);
                int batchSize = 0;
                while (partition.hasNext()) {
                    PageViewResult result = partition.next();
                    statement.setString(1, result.getPageUrl());
                    statement.setTimestamp(2, result.getWindowStart());
                    statement.setTimestamp(3, result.getWindowEnd());
                    statement.setLong(4, result.getUserCount());
                    statement.addBatch();
                    batchSize++;
                    // Execute the batch every 100 records
                    if (batchSize % 100 == 0) {
                        statement.executeBatch();
                    }
                }
                // Flush any remaining records
                if (batchSize % 100 != 0) {
                    statement.executeBatch();
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                if (statement != null) statement.close();
                if (connection != null) connection.close();
            }
        });
    }
}
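The comments above recommend Jackson over the hand-rolled parser for production. A hedged sketch of what a Jackson-based parseJsonToEvent could look like (jackson-databind ships transitively with Spark; version alignment is assumed to be handled by the build):
java
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

public class JsonParsing {
    // One ObjectMapper per JVM/executor; it is thread-safe once configured
    private static final ObjectMapper MAPPER = new ObjectMapper()
            .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

    public static PageViewEvent parseJsonToEvent(String json) {
        try {
            // Field names in the JSON are expected to match the PageViewEvent getters/setters
            return MAPPER.readValue(json, PageViewEvent.class);
        } catch (Exception e) {
            System.err.println("Failed to parse JSON: " + json);
            return null;
        }
    }
}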
4. A Modern Implementation with Structured Streaming
java
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.Trigger;
import static org.apache.spark.sql.functions.*;
public class StructuredStreamingPageViewAnalysis {

    public static void main(String[] args) throws Exception {
        // 1. Create the SparkSession
        SparkSession spark = SparkSession.builder()
                .appName("StructuredStreamingPageViewAnalysis")
                .master("local[2]")
                .config("spark.sql.shuffle.partitions", "2")
                .getOrCreate();

        // 2. Read the stream from Kafka
        Dataset<Row> kafkaDF = spark
                .readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers", "localhost:9092")
                .option("subscribe", "page-view-topic")
                .option("startingOffsets", "latest")
                .load();

        // 3. Parse the JSON payload
        Dataset<Row> eventsDF = kafkaDF
                .select(
                        expr("CAST(value AS STRING) as json"),
                        expr("CAST(timestamp AS TIMESTAMP) as kafka_timestamp")
                )
                .select(
                        from_json(col("json"),
                                "pageUrl STRING, userId STRING, eventType STRING, timestamp LONG",
                                new java.util.HashMap<String, String>()
                        ).as("data"),
                        col("kafka_timestamp")
                )
                .select(
                        col("data.pageUrl").as("pageUrl"),
                        col("data.userId").as("userId"),
                        col("data.eventType").as("eventType"),
                        col("data.timestamp").as("eventTimestamp"),
                        col("kafka_timestamp")
                )
                // epoch millis -> TimestampType (required by withWatermark / window)
                .withColumn("eventTime", col("eventTimestamp").divide(1000).cast("timestamp"))
                .filter(col("eventType").equalTo("page_view"));
        // 4. Windowed aggregation
        Dataset<Row> resultDF = eventsDF
                .withWatermark("eventTime", "2 minutes") // tolerate up to 2 minutes of late data
                .groupBy(
                        col("pageUrl"),
                        window(col("eventTime"), "1 minute") // 1-minute tumbling window
                )
                .agg(approx_count_distinct("userId").as("userCount"))
                .select(
                        col("pageUrl"),
                        col("window.start").as("windowStart"),
                        col("window.end").as("windowEnd"),
                        col("userCount")
                );

        // 5. Write to the console
        StreamingQuery consoleQuery = resultDF
                .writeStream()
                .outputMode("update")
                .format("console")
                .option("truncate", "false")
                .trigger(Trigger.ProcessingTime("10 seconds"))
                .start();

        // 6. Write to MySQL (foreachBatch)
        StreamingQuery mysqlQuery = resultDF
                .writeStream()
                .outputMode("update")
                .foreachBatch((batchDF, batchId) -> {
                    // Runs once per micro-batch on the driver
                    if (!batchDF.isEmpty()) {
                        batchDF
                                // rename to the MySQL column names before the JDBC write
                                .withColumnRenamed("pageUrl", "page_url")
                                .withColumnRenamed("windowStart", "window_start")
                                .withColumnRenamed("windowEnd", "window_end")
                                .withColumnRenamed("userCount", "user_count")
                                .write()
                                .format("jdbc")
                                .option("url", "jdbc:mysql://localhost:3306/analytics_db")
                                .option("driver", "com.mysql.cj.jdbc.Driver")
                                .option("dbtable", "page_view_stats")
                                .option("user", "root")
                                .option("password", "password")
                                .mode("append")
                                .save();
                        // Note: in update mode the same window can be emitted more than once,
                        // so a plain append may hit the unique key; an upsert like saveToMySQL
                        // in section 3 is safer.
                    }
                })
                .trigger(Trigger.ProcessingTime("10 seconds"))
                .start();

        // 7. Block until either query terminates
        spark.streams().awaitAnyTermination();
    }
}
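One production detail the sketch above leaves out is a checkpoint location; without one, Kafka offsets and window state do not survive a restart. A hedged variant of the MySQL sink with checkpointing enabled (the directory is only an example; use HDFS or S3 in production):
java
// Same sink as step 6 above, plus a checkpoint location (example path only)
StreamingQuery mysqlQuery = resultDF
        .writeStream()
        .outputMode("update")
        .option("checkpointLocation", "/tmp/checkpoints/page-view-mysql")
        .foreachBatch((batchDF, batchId) -> {
            // ... same JDBC write as in step 6 above ...
        })
        .trigger(Trigger.ProcessingTime("10 seconds"))
        .start();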
5. MySQL Table Schema (same as the Flink version)
sql
CREATE TABLE page_view_stats (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    page_url VARCHAR(500) NOT NULL,
    window_start TIMESTAMP NOT NULL,
    window_end TIMESTAMP NOT NULL,
    user_count BIGINT NOT NULL,
    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    UNIQUE KEY uk_page_window (page_url, window_start)
);
Run Steps
- Start ZooKeeper and Kafka
- Create the Kafka topic: kafka-topics.sh --create --topic page-view-topic --bootstrap-server localhost:9092
- Create the MySQL table
- Submit the Spark job:
bash
spark-submit --class RealTimePageViewAnalysisSpark \
--master local[2] \
--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.4.0 \
your-jar-file.jar
- Send test data to Kafka (a producer sketch follows below)
- Check the results in the console and in MySQL
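For the test-data step, a small standalone producer can be used. This is a hedged sketch: the JSON field names match the PageViewEvent model above, and the kafka-clients dependency is assumed to be available (it comes in transitively with the Spark Kafka integration, or can be added explicitly).
java
import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;

public class TestDataProducer {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");
        props.put("key.serializer", StringSerializer.class.getName());
        props.put("value.serializer", StringSerializer.class.getName());

        String[] pages = {"/home", "/product", "/cart"};
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            for (int i = 0; i < 100; i++) {
                // Field names match the PageViewEvent POJO / from_json schema
                String json = String.format(
                        "{\"pageUrl\":\"%s\",\"userId\":\"user_%d\",\"eventType\":\"page_view\",\"timestamp\":%d}",
                        pages[i % pages.length], i % 10, System.currentTimeMillis());
                producer.send(new ProducerRecord<>("page-view-topic", json));
                Thread.sleep(200);
            }
        }
    }
}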