Code examples:
Spark SQL
scala
package com.shujia.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object Demo1DataFrame {
  def main(args: Array[String]): Unit = {
    // 1. Create the Spark SQL session
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("df")
      // Number of partitions for RDDs produced after a shuffle
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    import spark.implicits._

    // 2. Read the data
    // DataFrame: an RDD plus a table schema, so that SQL can be written against it
    val studentDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING,age INT,sex STRING,clazz STRING")
      .load("data/students.txt")

    // Inspect the data
    studentDF.show()

    // Register a temporary view
    studentDF.createOrReplaceTempView("students")

    // Process the data with SQL
    val clazzNumDF: DataFrame = spark.sql(
      """
        |select clazz,count(1) as num
        |from students
        |group by clazz
        |""".stripMargin)
    clazzNumDF.show()

    import org.apache.spark.sql.functions._

    // Process the data with the DSL
    val clazzNum: DataFrame = studentDF
      .groupBy("clazz")
      .agg(count("id") as "num")

    // Save the result
    clazzNum
      .write
      .format("csv")
      .option("sep", "\t")
      //.save("data/clazz_num") // uncomment to actually write; fails if the path already exists

    // Process the data with the RDD API
    val kvRDD: RDD[(String, Int)] = studentDF
      // Convert to an RDD
      .rdd
      .map {
        // Each row of a DataFrame is a Row object
        case Row(id, name, age, sex, clazz: String) => (clazz, 1)
      }

    kvRDD
      .reduceByKey(_ + _)
      .foreach(println)
  }
}
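Besides the untyped DataFrame, the same file can be read as a typed Dataset by mapping rows onto a case class. A minimal sketch, assuming the same data/students.txt layout as above; the Student case class and the Demo1Dataset object are introduced here purely for illustration:
scala
package com.shujia.spark.sql

import org.apache.spark.sql.{Dataset, SparkSession}

// Hypothetical case class mirroring the schema used above
case class Student(id: String, name: String, age: Int, sex: String, clazz: String)

object Demo1Dataset {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("ds")
      .getOrCreate()

    import spark.implicits._

    // as[Student] turns the untyped DataFrame into a typed Dataset
    val studentDS: Dataset[Student] = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING,age INT,sex STRING,clazz STRING")
      .load("data/students.txt")
      .as[Student]

    // Fields are now accessed in a type-safe way instead of by column name
    studentDS.filter(s => s.age > 22).show()
  }
}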
Spark DSL
scala
package com.shujia.spark.sql

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, SparkSession}

object Demo2DSL {
  def main(args: Array[String]): Unit = {
    // Create the Spark SQL session
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("dsl")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    // Read the data and create DataFrames
    val studentDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING,age INT,sex STRING,clazz STRING")
      .load("data/students.txt")

    val scoreDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,cid STRING,score DOUBLE")
      .load("data/score.txt")

    import spark.implicits._
    import org.apache.spark.sql.functions._

    // 1. select
    studentDF.select("name", "age").show()
    // $"age": gets a Column object
    studentDF.select($"name", $"age" + 1 as "age").show()
    // Functions can be used inside select
    studentDF.select(substring($"clazz", 1, 2) as "type").show()
    studentDF.selectExpr("age+1 as age").show()

    // 2. where
    studentDF.where($"sex" =!= "男" and $"age" === 23).show()
    studentDF.where(substring($"clazz", 1, 2) === "文科").show()
    studentDF.where($"name" isin("葛德曜", "符半双", "羿彦昌")).show()

    // 3. groupBy, then aggregate inside agg
    studentDF
      .groupBy($"clazz")
      .agg(count($"id") as "count", round(avg($"age"), 2) as "avgAge")
      .show()

    // 4. Filter after aggregation (having)
    /**
     * select clazz,count(1) as count from
     * students
     * group by clazz
     * having count > 80
     */
    studentDF
      .groupBy($"clazz")
      .agg(count($"id") as "count", round(avg($"age"), 2) as "avgAge")
      .where($"count" > 80)
      .show()

    // 5. order by
    studentDF
      .groupBy($"clazz")
      .agg(count($"id") as "count", round(avg($"age"), 2) as "avgAge")
      .orderBy($"count".desc)
      .show()

    // 6. limit
    studentDF
      .groupBy($"clazz")
      .agg(count($"id") as "count", round(avg($"age"), 2) as "avgAge")
      .orderBy($"count".desc)
      .limit(10)
      .show()

    // 7. show: behaves like an action
    studentDF.show()
    studentDF.show(10)
    studentDF.show(10, truncate = false)

    // 8. join
    studentDF.as("a").join(scoreDF.as("b"), $"a.id" === $"b.id", "inner").show()
    studentDF
      .as("a") // alias the table
      .join(scoreDF.as("b"), $"a.id" === $"b.id", "inner")
      .groupBy($"name")
      .agg(sum($"score") as "sumScore")
      .show()

    // 9. row_number
    /**
     * select * from (
     *   select *,row_number() over(partition by clazz order by sumScore desc) as r from (
     *     select a.id,name,clazz,sum(score) as sumScore from
     *     student as a
     *     join
     *     score as b
     *     on a.id=b.id
     *     group by a.id,name,clazz
     *   ) as c
     * ) as d
     * where r<=10
     */
    studentDF
      .as("a") // alias the table
      .join(scoreDF.as("b"), $"a.id" === $"b.id", "inner")
      .groupBy($"a.id", $"name", $"clazz")
      .agg(sum($"score") as "sumScore")
      //.select($"id", $"name", $"clazz", $"sumScore", row_number() over Window.partitionBy($"clazz").orderBy($"sumScore".desc) as "r")
      // withColumn: adds a new column to the DataFrame above
      .withColumn("r", row_number() over Window.partitionBy($"clazz").orderBy($"sumScore".desc))
      .where($"r" <= 10)
      .show()
  }
}
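The joins above are all inner joins; the third argument to join selects the type ("left", "right", "full", etc.). A minimal sketch, assuming the spark session, imports, and DataFrames from Demo2DSL are still in scope, using a left join to find students with no score rows:
scala
// "left" keeps every student row, filling score columns with null when nothing matched
studentDF
  .as("a")
  .join(scoreDF.as("b"), $"a.id" === $"b.id", "left")
  // keep only students that matched no score rows at all
  .where($"b.id".isNull)
  .select($"a.id", $"name", $"clazz")
  .show()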
Spark on Hive
Integrate Spark SQL and Hive directly in code. To submit the job to a cluster:
bash
spark-submit --master yarn --deploy-mode client --num-executors 2 --executor-cores 1 --executor-memory 2G --class com.shujia.spark.sql.Demo4SparkOnHive spark-1.0.jar
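Here --deploy-mode client keeps the driver on the submitting machine while YARN starts 2 executors with 1 core and 2 GB of memory each; --class must be the fully qualified name of the object to run, and spark-1.0.jar is the packaged application.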
Code:
scala
package com.shujia.spark.sql

import org.apache.spark.sql.{DataFrame, SparkSession}

object Demo4SparkOnHive {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("dsl")
      .config("spark.sql.shuffle.partitions", 1)
      // Enable Hive metastore support
      .enableHiveSupport()
      .getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._

    // Use Hive tables
    spark.sql(
      """
        |show tables
        |""".stripMargin).show()

    // Query a Hive table with SQL
    spark.sql(
      """
        |select clazz,count(1) as num from
        |students
        |group by clazz
        |""".stripMargin).show()

    // Get a table as a DataFrame
    val studentDF: DataFrame = spark.table("students")

    // Cache the DataFrame when it is used more than once
    studentDF.cache()

    studentDF
      .groupBy($"clazz")
      .agg(count($"id") as "num")
      .show()

    studentDF
      .groupBy($"sex")
      .agg(count($"id") as "num")
      .show()

    // Release the cache
    studentDF.unpersist()

    // To run on a cluster, submit the jar:
    // spark-submit --master yarn --deploy-mode client --num-executors 2 --executor-cores 1 --executor-memory 2G --class com.shujia.spark.sql.Demo4SparkOnHive spark-1.0.jar
  }
}
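Results can also be written back to Hive. A minimal sketch, assuming the session and studentDF from Demo4SparkOnHive are in scope; the target table name clazz_num is hypothetical:
scala
// saveAsTable writes the result as a Hive-managed table (clazz_num is a hypothetical name)
studentDF
  .groupBy($"clazz")
  .agg(count($"id") as "num")
  .write
  .mode("overwrite") // replace the table if it already exists
  .saveAsTable("clazz_num")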