Using Spark from Spring Boot to Process Files on HDFS

1. Add the dependencies

xml
<!-- Spark dependencies -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>3.2.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>3.2.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-mllib -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.12</artifactId>
            <version>3.2.2</version>
        </dependency>
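
Note: the _2.12 suffix on each artifact is the Scala version the Spark binaries were built against, so all three artifacts must share the same suffix and version. spark-core also pulls in the Hadoop client transitively, which is what provides the org.apache.hadoop classes used in the service below.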

2. Configure Spark

  • Create a configuration class that registers the Spark settings with Spring
java
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

// Hand the Spark setup to the Spring container
@Configuration
public class SparkConfig {

    // Values injected from application.yml
    @Value("${spark.master}")
    private String sparkMaster;

    @Value("${spark.appName}")
    private String sparkAppName;

    @Value("${hdfs.user}")
    private String hdfsUser;

    @Value("${hdfs.path}")
    private String hdfsPath;

    @Bean
    public SparkConf sparkConf() {
        SparkConf conf = new SparkConf();
        conf.setMaster(sparkMaster);
        conf.setAppName(sparkAppName);
        // HDFS settings: Spark forwards any key prefixed with "spark.hadoop."
        // into the Hadoop Configuration it builds
        conf.set("spark.hadoop.fs.defaultFS", hdfsPath);
        // Hadoop resolves the client user from the HADOOP_USER_NAME
        // environment variable or system property
        System.setProperty("HADOOP_USER_NAME", hdfsUser);
        return conf;
    }

    @Bean
    public SparkSession sparkSession() {
        return SparkSession.builder()
                .config(sparkConf())
                .getOrCreate();
    }
}
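
For reference, a minimal application.yml matching the @Value keys above could look like the sketch below. The master URL, application name, and user are illustrative placeholders; the HDFS address is the one used later in this post:

yaml
spark:
  master: local[*]
  appName: subway-analysis
hdfs:
  user: root
  path: hdfs://192.168.44.128:9000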

3. Controller and service

  • The controller class

    java
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.web.bind.annotation.GetMapping;
    import org.springframework.web.bind.annotation.RequestMapping;
    import org.springframework.web.bind.annotation.RestController;
    import xyz.zzj.traffic_main_code.service.SparkReadHdfs;
    
    @RestController
    @RequestMapping("/spark")
    public class SparkController {
    
        @Autowired
        private SparkReadHdfs sparkService;
    
        @GetMapping("/run")
        public String runSparkJob() {
            // Path of the input CSV file on HDFS
            String filePath = "hdfs://192.168.44.128:9000/subwayData.csv";
            sparkService.sparkSubway(filePath);
            return "Spark job executed successfully!";
        }
    }
  • The service that processes the subway data
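
The post never shows the SparkReadHdfs interface that the implementation below refers to; a minimal sketch consistent with that code (package and method signature inferred from the implementation) would be:

java
package xyz.zzj.traffic_main_code.service;

public interface SparkReadHdfs {

    // Read the subway CSV from HDFS, clean and aggregate it,
    // and write the results back to HDFS
    void sparkSubway(String filePath);
}

And the implementation itself: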

java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import xyz.zzj.traffic_main_code.service.SparkReadHdfs;

import java.io.IOException;
import java.net.URI;
import static org.apache.spark.sql.functions.*;

@Service
public class SparkReadHdfsImpl implements SparkReadHdfs {

    private final SparkSession spark;

    @Value("${hdfs.user}")
    private String hdfsUser;

    @Value("${hdfs.path}")
    private String hdfsPath;

    @Autowired
    public SparkReadHdfsImpl(SparkSession spark) {
        this.spark = spark;
    }

    /**
     * Read the subway CSV from HDFS, clean and aggregate it,
     * and write the result sets back to HDFS.
     * @param filePath full HDFS path of the input CSV
     */
    @Override
    public void sparkSubway(String filePath) {
        try {
            // Point this Spark context's Hadoop configuration at the target HDFS
            JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
            Configuration hadoopConf = jsc.hadoopConfiguration();
            hadoopConf.set("fs.defaultFS", hdfsPath);
            // Hadoop resolves the client user from the HADOOP_USER_NAME
            // environment variable or system property
            System.setProperty("HADOOP_USER_NAME", hdfsUser);

            // Read the CSV file from HDFS
            Dataset<Row> df = spark.read()
                    .option("header", "true")      // first row holds the column names
                    .option("inferSchema", "true") // infer the column types automatically
                    .csv(filePath);

            // Show the full DataFrame while debugging
            // df.show(Integer.MAX_VALUE, false);

            // Clean and transform the DataFrame
            // Drop rows with missing values in the three columns of interest
            // and keep the cleaned result for the steps below
            Dataset<Row> df1 = df.select("number", "people", "dateTime").na().drop();
            // Cast the columns to their proper types; the dateTime strings
            // use the Chinese pattern "yyyy年MM月dd日"
            Dataset<Row> df2 = df1.select(
                    col("number").cast(DataTypes.IntegerType),
                    col("people").cast(DataTypes.IntegerType),
                    to_date(col("dateTime"), "yyyy年MM月dd日").alias("dateTime")
            );
            // Deduplicate
            Dataset<Row> df3 = df2.dropDuplicates();
            // Filter: the people column must be non-negative
            Dataset<Row> df4 = df3.filter(col("people").geq(0));
            // df4.show();
            // Aggregate: total daily ridership, grouped by dateTime
            Dataset<Row> df6 = df4.groupBy("dateTime").agg(sum("people").alias("total_people"));
            // df6.show();
            sparkForSubway(df6, "/time_subwayData.csv");
            // Aggregate: the highest single ridership value per day (this keeps
            // only the maximum; recovering the subway number that produced it
            // would require a window function or a self-join)
            Dataset<Row> df7 = df4.groupBy("dateTime").agg(max("people").alias("max_people"));
            sparkForSubway(df7, "/everyday_max_subwayData.csv");
            // Aggregate: daily passenger-flow intensity, the daily total divided by 632.84
            Dataset<Row> df8 = df4.groupBy("dateTime").agg(sum("people").divide(632.84).alias("strength"));
            sparkForSubway(df8, "/everyday_strength_subwayData.csv");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private static void sparkForSubway(Dataset<Row> df, String targetPath) throws IOException {
        // Write the result to a temporary directory on HDFS;
        // coalesce(1) forces a single part file
        // (note: the HDFS address is hardcoded to the example cluster)
        df.coalesce(1)
                .write().mode("overwrite")
                .option("header", "true")
                .csv("hdfs://192.168.44.128:9000/time_subwayData");
        // Create a Hadoop configuration
        Configuration conf = new Configuration();
        // Get a FileSystem handle for the target HDFS
        FileSystem fs = FileSystem.get(URI.create("hdfs://192.168.44.128:9000"), conf);
        // The temporary directory Spark just wrote; it is reused by every call
        Path tempDir = new Path("/time_subwayData");
        FileStatus[] files = fs.listStatus(tempDir);
        // Delete the target file first if it already exists
        Path targetFile = new Path(targetPath);
        if (fs.exists(targetFile)) {
            fs.delete(targetFile, true); // true = recursive delete
        }
        // Rename the single part file to the target path
        for (FileStatus file : files) {
            if (file.isFile() && file.getPath().getName().startsWith("part-")) {
                fs.rename(file.getPath(), targetFile);
            }
        }
        // Remove the temporary directory
        fs.delete(tempDir, true);
    }

}
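
A note on the write-then-rename step in sparkForSubway: Spark always writes a directory of part files rather than a single file, so coalesce(1) collapses the output into one part file, which is then renamed to the desired .csv path before the temporary directory is removed. This is convenient for small result sets, but coalescing to a single partition funnels the entire write through one task, so it will not scale to large outputs.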

4. Run it
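
Start the Spring Boot application and send a GET request to http://localhost:8080/spark/run (assuming the default server port 8080). The endpoint runs the Spark job synchronously, and once it finishes the three result files sit in the HDFS root as /time_subwayData.csv, /everyday_max_subwayData.csv, and /everyday_strength_subwayData.csv.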
