spark==centos安装hadoop集群,安装spark standalone集群,编写pyspark/java/scala代码使用集群

master地址 hadoop100:8080

历史服务器 hadoop100:18080

hdfs地址 http://hadoop100:9870/dfshealth.html#tab-overview

1

centos安装hadoop集群,

上传文件到hdfs

2

安装spark standalone集群,查看自带的pyspark使用的python版本,然后安装anaconda并创建该版本的python虚拟环境,再安装对应版本的pyspark依赖包

3 python pyspark代码

pycharm远程选择python解释器

编写pyspark代码

python 复制代码
import time

from pyspark.sql import SparkSession
from datetime import datetime

# Timestamp (yyyyMMddHHmmss) used to make the Spark application name unique.
current_time_str = datetime.now().strftime("%Y%m%d%H%M%S")
print(current_time_str)

# Create a SparkSession against the standalone cluster.
# Event logging is enabled so the run shows up in the history
# server (hadoop100:18080).
spark = SparkSession.builder \
    .appName(f"Demo{current_time_str}") \
    .master('spark://192.168.111.100:7077') \
    .config("spark.pyspark.python", "python") \
    .config("spark.eventLog.enabled", "true") \
    .config("spark.eventLog.dir", "hdfs://hadoop100:9820/directory") \
    .getOrCreate()

# Read the CSV file from HDFS; first row is the header and column
# types are inferred from the data.
flights_df = spark.read.csv("hdfs://hadoop100:9820/input/flights.csv", header=True, inferSchema=True)

# Keep flights whose FLIGHT_NUMBER exceeds 98, then count rows per airline.
result_f = flights_df.filter(flights_df['FLIGHT_NUMBER'] > 98)
result = result_f.groupBy("AIRLINE").count().orderBy('AIRLINE')
print(result.collect())

# time.sleep(2000)

# Fix: release cluster resources when done — the original script never
# stopped the session, unlike the Java/Scala versions of this demo.
spark.stop()

4编写java代码

编写好后传到服务器打jar包后执行

pom.xml

xml 复制代码
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>demo_java_spark</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <!-- Spark 3.5.0 built against Scala 2.12; must match the cluster version. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>3.5.0</version>
        </dependency>
        <!-- SparkSession / Dataset API used by SparkApp. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>3.5.0</version>
        </dependency>
    </dependencies>

    <build>

        <plugins>
            <!-- Maven Compiler Plugin -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
            </plugin>

            <!-- Maven Shade Plugin -->
            <!-- Builds an uber-jar at `package` so spark-submit gets all
                 dependencies in one artifact. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.2.4</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <createDependencyReducedPom>true</createDependencyReducedPom>
                            <transformers>
                                <!-- Set Main-Class in the manifest so the jar is runnable
                                     (SparkApp is in the default package). -->
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>SparkApp</mainClass>
                                </transformer>
                            </transformers>
                            <filters>
                                <!-- Strip dependency jar signatures; stale signature files in
                                     a shaded jar cause SecurityException at runtime. -->
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <!-- Additional configuration. -->
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>


</project>

代码

java 复制代码
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;

import java.text.SimpleDateFormat;
import java.util.Date;

public class SparkApp {
    /**
     * Entry point: connects to the standalone Spark cluster, reads
     * flights.csv from HDFS, counts flights per airline where the
     * flight number exceeds 98, and prints the result.
     */
    public static void main(String[] args) {
        // Timestamp (yyyyMMddHHmmss) used to make the application name unique.
        String currentTimeStr = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
        System.out.println(currentTimeStr);

        // Build the SparkSession; event logging feeds the history server
        // (hadoop100:18080).
        SparkSession spark = SparkSession.builder()
                .appName("Demo" + currentTimeStr)
                .master("spark://192.168.111.100:7077")
                .config("spark.eventLog.enabled", "true")
                .config("spark.eventLog.dir", "hdfs://hadoop100:9820/directory")
                .getOrCreate();

        // Read the CSV from HDFS; first row is the header, column types inferred.
        Dataset<Row> flightsDf = spark.read()
                .option("header", "true")
                .option("inferSchema", "true")
                .csv("hdfs://hadoop100:9820/input/flights.csv");

        // Keep flights with FLIGHT_NUMBER > 98, then count per airline.
        Dataset<Row> resultF = flightsDf.filter(flightsDf.col("FLIGHT_NUMBER").gt(98));
        Dataset<Row> result = resultF.groupBy("AIRLINE").count().orderBy("AIRLINE");

        // Print the result table.
        result.show();

        // Keep the driver alive (2000 s) so the Spark UI stays reachable.
        try {
            Thread.sleep(2000 * 1000L);
        } catch (InterruptedException e) {
            // Fix: restore the interrupt flag instead of swallowing the
            // interruption with printStackTrace().
            Thread.currentThread().interrupt();
        }

        // Release cluster resources.
        spark.stop();
    }
}

5编写scala代码

pom.xml

xml 复制代码
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>demo_java_spark</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <!-- Spark 3.5.0 built against Scala 2.12; must match the cluster version. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>3.5.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>3.5.0</version>
        </dependency>
    </dependencies>

    <build>

        <plugins>
<!--            &lt;!&ndash; Maven Compiler Plugin &ndash;&gt;-->
<!--            <plugin>-->
<!--                <groupId>org.apache.maven.plugins</groupId>-->
<!--                <artifactId>maven-compiler-plugin</artifactId>-->
<!--                <version>3.8.1</version>-->
<!--            </plugin>-->

            <!-- Maven Compiler Plugin for Scala -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <!-- Fix: pin the plugin version; an unversioned plugin makes the
                     build non-reproducible and newer Maven versions reject it. -->
                <version>4.8.1</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>


            <!-- Maven Shade Plugin: builds an uber-jar for spark-submit. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.2.4</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <createDependencyReducedPom>true</createDependencyReducedPom>
                            <transformers>
                                <!-- Set Main-Class in the manifest.
                                     NOTE(review): the demo declares `package scala`, which is
                                     reserved for the Scala standard library — consider moving
                                     SparkApp to e.g. org.example and updating this value. -->
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>scala.SparkApp</mainClass>
                                </transformer>
                            </transformers>
                            <filters>
                                <!-- Strip dependency jar signatures; stale signature files in
                                     a shaded jar cause SecurityException at runtime. -->
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <!-- Additional configuration. -->
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>


</project>

SparkApp.scala

scala 复制代码
package scala

import org.apache.spark.sql.SparkSession

import java.text.SimpleDateFormat
import java.util.Date

object SparkApp {
  /** Reads flights.csv from HDFS via the standalone Spark cluster and
    * prints per-airline counts of flights whose number exceeds 98. */
  def main(args: Array[String]): Unit = {
    // Timestamp (yyyyMMddHHmmss) keeps each application name unique.
    val timestamp = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date())
    println(timestamp)

    // Build the session; event logging feeds the history server UI.
    val session = SparkSession
      .builder()
      .appName(s"Demo$timestamp")
      .master("spark://192.168.111.100:7077")
      .config("spark.eventLog.enabled", "true")
      .config("spark.eventLog.dir", "hdfs://hadoop100:9820/directory")
      .getOrCreate()

    // Load the CSV with a header row and inferred column types.
    val flights = session.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("hdfs://hadoop100:9820/input/flights.csv")

    // Flights numbered above 98, counted per airline, ordered by airline.
    val counts = flights
      .filter(flights("FLIGHT_NUMBER") > 98)
      .groupBy("AIRLINE")
      .count()
      .orderBy("AIRLINE")

    // Display the result table.
    counts.show()

    // Keep the driver alive (2000 s) so the Spark UI remains reachable.
    Thread.sleep(2000 * 1000)

    // Release cluster resources.
    session.stop()
  }
}
相关推荐
孟意昶16 小时前
Spark专题-第一部分:Spark 核心概述(2)-Spark 应用核心组件剖析
大数据·spark·big data
IT学长编程17 小时前
计算机毕业设计 基于Hadoop的健康饮食推荐系统的设计与实现 Java 大数据毕业设计 Hadoop毕业设计选题【附源码+文档报告+安装调试】
java·大数据·hadoop·毕业设计·课程设计·推荐算法·毕业论文
半梦半醒*18 小时前
正则表达式
linux·运维·开发语言·正则表达式·centos·运维开发
智海观潮20 小时前
Spark SQL | 目前Spark社区最活跃的组件之一
大数据·spark
Lx35220 小时前
Hadoop数据一致性保障:处理分布式系统常见问题
大数据·hadoop
IT学长编程1 天前
计算机毕业设计 基于Hadoop豆瓣电影数据可视化分析设计与实现 Python 大数据毕业设计 Hadoop毕业设计选题【附源码+文档报告+安装调试
大数据·hadoop·python·django·毕业设计·毕业论文·豆瓣电影数据可视化分析
盛源_011 天前
hadoop的api操作对象存储
hdfs·spark
Dobby_051 天前
【Hadoop】Yarn:Hadoop 生态的资源操作系统
大数据·hadoop·分布式·yarn
笨蛋少年派1 天前
安装Hadoop中遇到的一些问题和解决
大数据·hadoop·分布式
SonOfWind03111 天前
CentOS搭建本地源
linux·运维·centos