Creating a Spark SQL environment, using the Spark DSL, and Spark on Hive

The relevant code is shown below:

Spark SQL

scala
package com.shujia.spark.sql

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object Demo1DataFrame {
  def main(args: Array[String]): Unit = {

    //1. Create the Spark SQL session
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("df")
      //set the number of partitions for RDDs produced after a shuffle
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    import spark.implicits._

    //2. Read the data
    //DataFrame: an RDD plus a table schema, so SQL can be written against it
    val studentDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING,age INT,sex STRING,clazz STRING")
      .load("data/students.txt")

    //inspect the data
    studentDF.show()

    //create a temporary view
    studentDF.createOrReplaceTempView("students")

    //process the data with SQL
    val clazzNumDF: DataFrame = spark.sql(
      """
        |select clazz,count(1) as num
        |from students
        |group by clazz
        |""".stripMargin)

    clazzNumDF.show()

    import org.apache.spark.sql.functions._
    //process the data with the DSL
    val clazzNum: DataFrame = studentDF
      .groupBy("clazz")
      .agg(count("id") as "num")

    //save the result
    clazzNum
      .write
      .format("csv")
      .option("sep", "\t")
    //.save("data/clazz_num")

    //process the data with the RDD API
    val kvDS: RDD[(String, Int)] = studentDF
      //convert to an RDD
      .rdd
      .map {
        //each row of the DataFrame is a Row object
        case Row(id, name, age, sex, clazz: String) => (clazz, 1)
      }

    kvDS
      .reduceByKey(_ + _)
      .foreach(println)

  }
}
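
Note that the DataFrameWriter in the block above is never given a terminating save call, so the commented section does not actually write anything. Below is a minimal sketch of completing the write, continuing from the code above; the data/clazz_num path comes from the commented-out line, and the overwrite mode is an assumption so the job can be re-run:

scala
//sketch only: persist the aggregated result as tab-separated CSV
clazzNum
  .write
  .format("csv")
  .option("sep", "\t")
  .mode("overwrite") //assumption: replace any previous output
  .save("data/clazz_num")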

Spark DSL

scala
package com.shujia.spark.sql

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, SparkSession}

object Demo2DSL {
  def main(args: Array[String]): Unit = {
    //create the Spark SQL session
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("dsl")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    //read the data and create DataFrames
    val studentDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING,age INT,sex STRING,clazz STRING")
      .load("data/students.txt")

    val scoreDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,cid STRING,score DOUBLE")
      .load("data/score.txt")

    import spark.implicits._
    import org.apache.spark.sql.functions._

    //1. select
    studentDF.select("name", "age").show()
    //$"age": 获取列对象
    studentDF.select($"name", $"age" + 1 as "age").show()
    //use functions inside select
    studentDF.select(substring($"clazz", 1, 2) as "type").show()
    studentDF.selectExpr("age+1 as age").show()

    //2. where
    studentDF.where($"sex" =!= "男" and $"age" === 23).show()
    studentDF.where(substring($"clazz", 1, 2) === "文科").show()
    studentDF.where($"name" isin("葛德曜", "符半双", "羿彦昌")).show()

    //3. aggregate in agg after groupBy
    studentDF
      .groupBy($"clazz")
      .agg(count($"id") as "count", round(avg($"age"), 2) as "avgAge")
      .show()

    //4. filter after aggregation (having)
    /**
     * select clazz,count(1) as count from
     * students
     * group by clazz
     * having count > 80
     */
    studentDF
      .groupBy($"clazz")
      .agg(count($"id") as "count", round(avg($"age"), 2) as "avgAge")
      .where($"count" > 80)
      .show()

    //5. order by
    studentDF
      .groupBy($"clazz")
      .agg(count($"id") as "count", round(avg($"age"), 2) as "avgAge")
      .orderBy($"count".desc)
      .show()

    //6. limit
    studentDF
      .groupBy($"clazz")
      .agg(count($"id") as "count", round(avg($"age"), 2) as "avgAge")
      .orderBy($"count".desc)
      .limit(10)
      .show()

    //7. show: behaves like an action
    studentDF.show()
    studentDF.show(10)
    studentDF.show(10, truncate = false)


    //8. join
    studentDF.as("a").join(scoreDF.as("b"), $"a.id" === $"b.id", "inner").show()

    studentDF
      .as("a") //取别名
      .join(scoreDF.as("b"), $"a.id" === $"b.id", "inner")
      .groupBy($"name")
      .agg(sum($"score") as "sumScore")
      .show()

    //9. row_number
    /**
     * select * from (
     *    select *,row_number() over(partition by clazz order by sumScore desc) as r from(
     *       select a.id,name,clazz,sum(score) as sumScore from
     *       student as a
     *       join
     *       score as b
     *       on a.id=b.id
     *       group by a.id,name,clazz
     *    ) as c
     * ) as d
     * where r<=10
     */
    studentDF
      .as("a") //取别名
      .join(scoreDF.as("b"), $"a.id" === $"b.id", "inner")
      .groupBy($"a.id", $"name", $"clazz")
      .agg(sum($"score") as "sumScore")
      //.select($"id", $"name", $"clazz", $"sumScore", row_number() over Window.partitionBy($"clazz").orderBy($"sumScore".desc) as "r")
      //withColumn: add a new column on top of the DataFrame above
      .withColumn("r", row_number() over Window.partitionBy($"clazz").orderBy($"sumScore".desc))
      .where($"r" <= 10)
      .show()
  }
}
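
The windowed top-10 query can also be run as plain SQL, matching the statement sketched in the comment above. A minimal sketch, continuing from the two DataFrames above; registering them as the temporary views students and score is an assumption:

scala
//sketch only: same top-10-per-class query via spark.sql on temp views
studentDF.createOrReplaceTempView("students")
scoreDF.createOrReplaceTempView("score")

spark.sql(
  """
    |select * from (
    |  select c.*,
    |         row_number() over(partition by clazz order by sumScore desc) as r
    |  from (
    |    select a.id, name, clazz, sum(score) as sumScore
    |    from students a
    |    join score b on a.id = b.id
    |    group by a.id, name, clazz
    |  ) as c
    |) as d
    |where r <= 10
    |""".stripMargin).show()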

Spark on Hive

Integrate Spark SQL with Hive directly in code.

If the code needs to be submitted to the cluster, run:

bash
spark-submit --master yarn --deploy-mode client --num-executors 2 --executor-cores 1 --executor-memory 2G --class com.shujia.spark.sql.Demo4SparkOnHive spark-1.0.jar

Code:

scala
package com.shujia.spark.sql

import org.apache.spark.sql.{DataFrame, SparkSession}

object Demo4SparkOnHive {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("dsl")
      .config("spark.sql.shuffle.partitions", 1)
      //enable Hive metastore support
      .enableHiveSupport()
      .getOrCreate()
    import spark.implicits._
    import org.apache.spark.sql.functions._

    //work with Hive tables
    spark.sql(
      """
        |show tables
        |""".stripMargin).show()

    //query the Hive table with SQL
    spark.sql(
      """
        |select clazz,count(1) as num from
        |students
        |group by clazz
        |""".stripMargin).show()

    //load the Hive table as a DataFrame
    val studentDF: DataFrame = spark.table("students")

    //cache the DataFrame when it is used more than once
    studentDF.cache()

    studentDF
      .groupBy($"clazz")
      .agg(count($"id") as "num")
      .show()

    studentDF
      .groupBy($"sex")
      .agg(count($"id") as "num")
      .show()

    //release the cache
    studentDF.unpersist()

    //submit the job to the cluster with:
    //spark-submit --master yarn --deploy-mode client --num-executors 2 --executor-cores 1 --executor-memory 2G --class com.shujia.spark.sql.Demo4SparkOnHive spark-1.0.jar
  }
}
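
Since Hive support is enabled, results can also be written back into the Hive metastore. A minimal sketch, continuing from the code above; the target table name clazz_num and the overwrite mode are assumptions:

scala
//sketch only: save an aggregated result as a managed Hive table
studentDF
  .groupBy($"clazz")
  .agg(count($"id") as "num")
  .write
  .mode("overwrite") //assumption: replace the table if it already exists
  .saveAsTable("clazz_num") //hypothetical table name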