Spark: In-Memory Data Processing

Spark Use Cases

Batch Processing (Spark Core)

  • Large-scale ETL data processing
  • Building data warehouses
  • Offline data analysis

Stream Processing (Spark Streaming)

  • Near-real-time data processing (micro-batching)
  • Real-time monitoring and alerting
  • Real-time data synchronization

Interactive Queries (Spark SQL)

  • Ad-hoc query analysis (a minimal sketch follows this list)
  • Data exploration
  • BI reporting

Machine Learning (MLlib)

  • Large-scale machine learning model training
  • Recommendation systems
  • User profiling
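
To make the batch and interactive-query styles concrete, here is a minimal, self-contained sketch. The class name, sample page list, and local[2] master are illustrative placeholders, not part of the project below.

java
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import java.util.Arrays;
import java.util.List;

// Minimal sketch contrasting the batch (Spark Core) and interactive-query (Spark SQL)
// styles on the same in-memory data.
public class SparkUseCaseSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("SparkUseCaseSketch")
                .master("local[2]")
                .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        List<String> pages = Arrays.asList("/home", "/cart", "/home", "/order");

        // Spark Core: count hits per page with an RDD action
        JavaRDD<String> pageRdd = jsc.parallelize(pages);
        System.out.println(pageRdd.countByValue());

        // Spark SQL: the same count expressed as an ad-hoc query
        Dataset<Row> pageDf = spark.createDataset(pages, Encoders.STRING()).toDF("pageUrl");
        pageDf.createOrReplaceTempView("pages");
        spark.sql("SELECT pageUrl, COUNT(*) AS hits FROM pages GROUP BY pageUrl").show();

        spark.stop();
    }
}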

Implementing Real-Time Website Page-View Statistics with Spark

1. Dependencies

xml
<properties>
    <spark.version>3.4.0</spark.version>
    <scala.version>2.12</scala.version>
</properties>

<dependencies>
    <!-- Spark Core -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    
    <!-- Spark SQL -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    
    <!-- Spark Streaming -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    
    <!-- Spark Kafka Integration -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
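    
    <!-- Kafka source for Structured Streaming (not in the original list; required by the
         spark.readStream().format("kafka") example in section 4) -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql-kafka-0-10_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>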
    
    <!-- MySQL Driver -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.33</version>
    </dependency>
</dependencies>

2. Data Model Definitions

java
import java.io.Serializable;
import java.sql.Timestamp;

// Page view event
public class PageViewEvent implements Serializable {
    private String pageUrl;
    private String userId;
    private String eventType;
    private Long timestamp;
    
    // Constructors, getters, setters
    public PageViewEvent() {}
    
    public PageViewEvent(String pageUrl, String userId, String eventType, Long timestamp) {
        this.pageUrl = pageUrl;
        this.userId = userId;
        this.eventType = eventType;
        this.timestamp = timestamp;
    }
    
    // getters and setters
    public String getPageUrl() { return pageUrl; }
    public void setPageUrl(String pageUrl) { this.pageUrl = pageUrl; }
    
    public String getUserId() { return userId; }
    public void setUserId(String userId) { this.userId = userId; }
    
    public String getEventType() { return eventType; }
    public void setEventType(String eventType) { this.eventType = eventType; }
    
    public Long getTimestamp() { return timestamp; }
    public void setTimestamp(Long timestamp) { this.timestamp = timestamp; }
    
    @Override
    public String toString() {
        return "PageViewEvent{" +
                "pageUrl='" + pageUrl + '\'' +
                ", userId='" + userId + '\'' +
                ", eventType='" + eventType + '\'' +
                ", timestamp=" + timestamp +
                '}';
    }
}

// Aggregation result
public class PageViewResult implements Serializable {
    private String pageUrl;
    private Timestamp windowStart;
    private Timestamp windowEnd;
    private Long userCount;
    
    public PageViewResult() {}
    
    public PageViewResult(String pageUrl, Timestamp windowStart, Timestamp windowEnd, Long userCount) {
        this.pageUrl = pageUrl;
        this.windowStart = windowStart;
        this.windowEnd = windowEnd;
        this.userCount = userCount;
    }
    
    // getters and setters
    public String getPageUrl() { return pageUrl; }
    public void setPageUrl(String pageUrl) { this.pageUrl = pageUrl; }
    
    public Timestamp getWindowStart() { return windowStart; }
    public void setWindowStart(Timestamp windowStart) { this.windowStart = windowStart; }
    
    public Timestamp getWindowEnd() { return windowEnd; }
    public void setWindowEnd(Timestamp windowEnd) { this.windowEnd = windowEnd; }
    
    public Long getUserCount() { return userCount; }
    public void setUserCount(Long userCount) { this.userCount = userCount; }
    
    @Override
    public String toString() {
        return "PageViewResult{" +
                "pageUrl='" + pageUrl + '\'' +
                ", windowStart=" + windowStart +
                ", windowEnd=" + windowEnd +
                ", userCount=" + userCount +
                '}';
    }
}

3. Core Processing Logic (Spark Streaming)

java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.*;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import scala.Tuple2;

import java.util.*;
import java.util.regex.Pattern;

public class RealTimePageViewAnalysisSpark {
	
	public static void main(String[] args) throws Exception {
		
		// 1. Create the Spark configuration
		SparkConf sparkConf = new SparkConf()
			.setAppName("RealTimePageViewAnalysisSpark")
			.setMaster("local[2]") // remove this when submitting to a cluster
			.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
		
		// 2. Create the StreamingContext with a 10-second batch interval
		JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(10));

		// 3. Kafka consumer parameters
		Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "localhost:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "spark-page-view-group");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

		// 4. Topics to subscribe to
        Collection<String> topics = Arrays.asList("page-view-topic");

		// 5. Create the Kafka direct stream
        JavaInputDStream<ConsumerRecord<String, String>> stream =
                KafkaUtils.createDirectStream(
                        jssc,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams)
                );
        
        // 6. Extract the value of each Kafka record
        JavaDStream<String> lines = stream.map(ConsumerRecord::value);
        
        // 7. Parse the JSON and process the data
        JavaDStream<PageViewResult> results = lines.transform(rdd -> {
            // Use Spark SQL to aggregate each micro-batch
            SparkSession spark = SparkSession.builder()
                    .config(rdd.context().getConf())
                    .getOrCreate();
            
            // Convert the RDD to a Dataset
            JavaRDD<PageViewEvent> eventsRDD = rdd.map(line -> {
                // Hand-rolled JSON parsing; prefer Jackson in production (see the sketch after this class)
                return parseJsonToEvent(line);
            }).filter(event -> event != null && "page_view".equals(event.getEventType()));
            
            Dataset<Row> eventsDF = spark.createDataFrame(eventsRDD, PageViewEvent.class);
            
            // Register a temporary view
            eventsDF.createOrReplaceTempView("page_events");
            
            // Windowed aggregation with Spark SQL
            Dataset<Row> resultDF = spark.sql(
                "SELECT " +
                "   pageUrl, " +
                "   window(FROM_UNIXTIME(timestamp/1000), '1 minute') as window, " +
                "   COUNT(DISTINCT userId) as userCount " +
                "FROM page_events " +
                "WHERE eventType = 'page_view' " +
                "GROUP BY pageUrl, window " +
                "ORDER BY window.start, pageUrl"
            );
            
            // Map back to an RDD of result objects
            return resultDF.toJavaRDD().map(row -> {
                String pageUrl = row.getString(0);
                Row window = row.getStruct(1);
                Timestamp start = window.getTimestamp(0);
                Timestamp end = window.getTimestamp(1);
                Long userCount = row.getLong(2);
                
                return new PageViewResult(pageUrl, start, end, userCount);
            });
        });

		// 8. Print the results to the console
        results.foreachRDD(rdd -> {
            System.out.println("=== Page View Statistics ===");
            rdd.foreach(result -> System.out.println(result.toString()));
            System.out.println("=====================");
            
            // Save to MySQL
            if (!rdd.isEmpty()) {
                saveToMySQL(rdd);
            }
        });
        
        // 9. Start the streaming job
        jssc.start();
        jssc.awaitTermination();
	}

	// JSON parsing helper (JSON string -> event object)
	private static PageViewEvent parseJsonToEvent(String json){
		try{
			// Simplified JSON parsing; prefer Jackson in production
			json = json.replace("{", "").replace("}", "").replace("\"", "");
			String[] pairs = json.split(",");

			Map<String, String> map = new HashMap<>();
            for (String pair : pairs) {
                // limit = 2 so values containing ':' (e.g. URLs) are not cut short
                String[] keyValue = pair.split(":", 2);
                if (keyValue.length == 2) {
                    map.put(keyValue[0].trim(), keyValue[1].trim());
                }
            }

			return new PageViewEvent(
                map.get("pageUrl"),
                map.get("userId"), 
                map.get("eventType"),
                Long.parseLong(map.get("timestamp"))
            );
		}catch(Exception e){
			System.err.println("解析JSON失败: " + json);
            return null;
		}
	}

	// Save results to MySQL
	private static void saveToMySQL(JavaRDD<PageViewResult> rdd){
		rdd.foreachPartition(partition -> {
            // Create one JDBC connection per partition
            java.sql.Connection connection = null;
            java.sql.PreparedStatement statement = null;
            
            try {
                Class.forName("com.mysql.cj.jdbc.Driver");
                connection = java.sql.DriverManager.getConnection(
                    "jdbc:mysql://localhost:3306/analytics_db", 
                    "root", "password");
                
                String sql = "INSERT INTO page_view_stats " +
                           "(page_url, window_start, window_end, user_count, create_time) " +
                           "VALUES (?, ?, ?, ?, NOW()) " +
                           "ON DUPLICATE KEY UPDATE user_count = VALUES(user_count), update_time = NOW()";
                
                statement = connection.prepareStatement(sql);
                
                int batchSize = 0;
                while (partition.hasNext()) {
                    PageViewResult result = partition.next();
                    
                    statement.setString(1, result.getPageUrl());
                    statement.setTimestamp(2, result.getWindowStart());
                    statement.setTimestamp(3, result.getWindowEnd());
                    statement.setLong(4, result.getUserCount());
                    statement.addBatch();
                    
                    batchSize++;
                    // Execute the batch every 100 records
                    if (batchSize % 100 == 0) {
                        statement.executeBatch();
                    }
                }
                
                // Flush any remaining records
                if (batchSize % 100 != 0) {
                    statement.executeBatch();
                }
                
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                if (statement != null) statement.close();
                if (connection != null) connection.close();
            }
        });
	}

	
}
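
The parseJsonToEvent helper above is deliberately simplistic. As its comments note, Jackson is the better choice in production; below is a minimal sketch of what that replacement could look like (it assumes jackson-databind is on the classpath, which Spark already pulls in transitively).

java
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

// Sketch of a Jackson-based replacement for parseJsonToEvent.
public class JsonEventParser {

    // ObjectMapper is thread-safe once configured and can be shared across tasks
    private static final ObjectMapper MAPPER = new ObjectMapper()
            .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

    public static PageViewEvent parse(String json) {
        try {
            // Binds JSON fields to the PageViewEvent bean by name
            // (pageUrl, userId, eventType, timestamp)
            return MAPPER.readValue(json, PageViewEvent.class);
        } catch (Exception e) {
            System.err.println("Failed to parse JSON: " + json);
            return null;
        }
    }
}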

4. A Modern Implementation with Structured Streaming

java
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.Trigger;
import static org.apache.spark.sql.functions.*;

public class StructuredStreamingPageViewAnalysis {

    public static void main(String[] args) throws Exception {
        
        // 1. Create the SparkSession
        SparkSession spark = SparkSession.builder()
                .appName("StructuredStreamingPageViewAnalysis")
                .master("local[2]")
                .config("spark.sql.shuffle.partitions", "2")
                .getOrCreate();
        
        // 2. Read the stream from Kafka
        Dataset<Row> kafkaDF = spark
                .readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers", "localhost:9092")
                .option("subscribe", "page-view-topic")
                .option("startingOffsets", "latest")
                .load();
        
        // 3. Parse the JSON payload
        Dataset<Row> eventsDF = kafkaDF
                .select(
                    expr("CAST(value AS STRING) as json"),
                    expr("CAST(timestamp AS TIMESTAMP) as kafka_timestamp")
                )
                .select(
                    from_json(col("json"), 
                        "pageUrl STRING, userId STRING, eventType STRING, timestamp LONG"
                    ).as("data"),
                    col("kafka_timestamp")
                )
                .select(
                    col("data.pageUrl").as("pageUrl"),
                    col("data.userId").as("userId"),
                    col("data.eventType").as("eventType"),
                    col("data.timestamp").as("eventTimestamp"),
                    col("kafka_timestamp")
                )
                .withColumn("eventTime", from_unixtime(col("eventTimestamp").divide(1000)))
                .filter(col("eventType").equalTo("page_view"));
        
        // 4. Windowed aggregation
        Dataset<Row> resultDF = eventsDF
                .withWatermark("eventTime", "2 minutes")  // accept events up to 2 minutes late
                .groupBy(
                    col("pageUrl"),
                    window(col("eventTime"), "1 minute")  // 1-minute tumbling window
                )
                .agg(approx_count_distinct("userId").as("userCount"))
                .select(
                    col("pageUrl"),
                    col("window.start").as("windowStart"),
                    col("window.end").as("windowEnd"),
                    col("userCount")
                );
        
        // 5. Write to the console
        StreamingQuery consoleQuery = resultDF
                .writeStream()
                .outputMode("update")
                .format("console")
                .option("truncate", "false")
                .trigger(Trigger.ProcessingTime("10 seconds"))
                .start();
        
        // 6. Write to MySQL (via foreachBatch)
        StreamingQuery mysqlQuery = resultDF
                .writeStream()
                .outputMode("update")
                .foreachBatch((batchDF, batchId) -> {
                    // called once for each micro-batch
                    if (!batchDF.isEmpty()) {
                        batchDF.write()
                                .format("jdbc")
                                .option("url", "jdbc:mysql://localhost:3306/analytics_db")
                                .option("driver", "com.mysql.cj.jdbc.Driver")
                                .option("dbtable", "page_view_stats")
                                .option("user", "root")
                                .option("password", "password")
                                .mode("append")
                                .save();
                    }
                })
                .trigger(Trigger.ProcessingTime("10 seconds"))
                .start();
        
        // 7. Wait for either query to terminate
        spark.streams().awaitAnyTermination();
    }
}
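
One caveat about the JDBC sink above: resultDF runs in update output mode, so the same (pageUrl, windowStart) row can be emitted more than once as late events arrive, and a plain append will then collide with the UNIQUE KEY defined on page_view_stats in section 5. Below is a sketch of one way around this, reusing the ON DUPLICATE KEY UPDATE statement from the DStream version; the connection URL and credentials are the same placeholders as elsewhere in this post, and the snippet is meant to replace step 6 inside the main method above.

java
        // Upsert-style alternative to the append sink above
        StreamingQuery mysqlUpsertQuery = resultDF
                .writeStream()
                .outputMode("update")
                .foreachBatch((batchDF, batchId) -> {
                    batchDF.toJavaRDD().foreachPartition(rows -> {
                        try (java.sql.Connection conn = java.sql.DriverManager.getConnection(
                                     "jdbc:mysql://localhost:3306/analytics_db", "root", "password");
                             java.sql.PreparedStatement ps = conn.prepareStatement(
                                     "INSERT INTO page_view_stats " +
                                     "(page_url, window_start, window_end, user_count, create_time) " +
                                     "VALUES (?, ?, ?, ?, NOW()) " +
                                     "ON DUPLICATE KEY UPDATE user_count = VALUES(user_count), update_time = NOW()")) {
                            while (rows.hasNext()) {
                                Row r = rows.next();
                                ps.setString(1, r.getString(r.fieldIndex("pageUrl")));
                                ps.setTimestamp(2, r.getTimestamp(r.fieldIndex("windowStart")));
                                ps.setTimestamp(3, r.getTimestamp(r.fieldIndex("windowEnd")));
                                ps.setLong(4, r.getLong(r.fieldIndex("userCount")));
                                ps.addBatch();
                            }
                            ps.executeBatch();
                        }
                    });
                })
                .trigger(Trigger.ProcessingTime("10 seconds"))
                .start();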

5. MySQL Table Schema (same as in the Flink version)

sql
CREATE TABLE page_view_stats (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    page_url VARCHAR(500) NOT NULL,
    window_start TIMESTAMP NOT NULL,
    window_end TIMESTAMP NOT NULL,
    user_count BIGINT NOT NULL,
    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    UNIQUE KEY uk_page_window (page_url, window_start)
);

How to Run

  1. Start ZooKeeper and Kafka
  2. Create the Kafka topic: kafka-topics.sh --create --topic page-view-topic --bootstrap-server localhost:9092
  3. Create the MySQL table
  4. Submit the Spark job:
bash
spark-submit --class RealTimePageViewAnalysisSpark \
             --master local[2] \
             --packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.4.0 \
             your-jar-file.jar
  5. Send test data to Kafka (see the example below)
  6. Check the results in the console and in MySQL
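
For step 5, the Kafka console producer is the quickest way to push a sample event; the JSON fields below match the PageViewEvent model, and the values are placeholders. Note that the MySQL driver must also be available to the job for the JDBC writes, for example by adding mysql:mysql-connector-java:8.0.33 to --packages or bundling it in the application jar.

bash
# Send a sample page_view event (type each JSON line at the producer prompt)
kafka-console-producer.sh --topic page-view-topic --bootstrap-server localhost:9092
> {"pageUrl":"/home","userId":"u1001","eventType":"page_view","timestamp":1700000000000}

# Check the aggregated results in MySQL
mysql -uroot -p analytics_db \
      -e "SELECT * FROM page_view_stats ORDER BY window_start DESC LIMIT 10;"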