- Goal: find the app_ids that satisfy a given set of conditions
- Query conditions: table name, time window, node name
- Time window: widen it a little; the modification-time filters are somewhat imprecise in practice
- Event-log location: the directory configured by spark.history.fs.logDirectory
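If you are unsure where the logs live, the running session can be asked for the configured directory. A minimal sketch, assuming a spark-shell session; spark.history.fs.logDirectory is a history-server setting and may be unset in an ordinary application, hence the fallback to the path used throughout this note:

```scala
// sketch: look up the event-log directory from the active session.
// spark.history.fs.logDirectory is a history-server config and may be absent here,
// so fall back to the path this note uses.
val logDir = spark.conf
  .getOption("spark.history.fs.logDirectory")
  .getOrElse("hdfs://hdfs-cluster/spark-history")
println(s"event logs: $logDir")
```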
1. spark-sql
- Restrict the time window first;
- The data is read line by line, but app_ids have to be filtered at whole-file granularity, so tag every line against each condition;
- Aggregate to app_id granularity and pick out the rows that satisfy the conditions;
```sql
-- set the session time zone
set spark.sql.session.timeZone=GMT+8;
-- create a text data-source view over the event logs, bounded by file modification time
create temporary view view_name using text options ('path'='hdfs://hdfs-cluster/spark-history/*', 'modifiedAfter'='2023-08-21T08:00:00', 'modifiedBefore'='2023-08-21T14:00:00' );
with tmp as ( -- tag each line against each condition
select
input_file_name() as file_name,
if( value like '%tbl_name%', 1, 0) as table_name,
if( value like '%core-1-7%', 1, 0) as host_01,
if( value like '%core-1-10%', 1, 0) as host_02
from
view_name
),
tmp2 as ( -- roll up to app_id (file) granularity
select
file_name,
sum(table_name) as table_name,
sum(host_01) as host_01,
sum(host_02) as host_02
from
tmp
group by
file_name
)
select
*
from
tmp2
where
table_name > 0
order by
file_name
;
```
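Only the table-name tag is filtered above; the host tags stay in the output so you can see at a glance which of the matching applications also touched those nodes (add host_01 > 0 etc. to the WHERE clause to tighten the filter). The returned file_name is the full HDFS path, whose last component is the app_id; approach 2 below extracts it with split(input_file_name(), 'history/')[1].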
2. Whole-file read
- Pre-filter candidate app_ids first with a rough single-condition pass;
- Read each file in as a single row;
- Filter on all the conditions at once;
```scala
import spark.implicits._
// find candidate app_ids
val sql_create_view =
"""
|create temporary view view_name using text options ('path'='hdfs://hdfs-cluster/spark-history/*', 'modifiedAfter'='2023-08-21T00:00:00', 'modifiedBefore'='2023-08-21T23:00:00' )
|""".stripMargin
val sql_filter_app_id =
"""
|select
| split( input_file_name(), 'history/')[1] as file_name
|from
| view_name
|where
| value like '%trandw.dwd_log_app_open_di%'
|group by
| split( input_file_name(), 'history/')[1]
|""".stripMargin
spark.sql(sql_create_view)
val df_app_ids = spark.sql(sql_filter_app_id)
val app_ids = df_app_ids.collect().map(_.getString(0)).mkString(",")
// read each whole file as a single (file_name, contents) row;
// the {a,b,c} glob limits the read to the candidate app_ids found above
val rdd = spark.sparkContext.wholeTextFiles(s"hdfs://hdfs-cluster/spark-history/{${app_ids}}", 20).repartition(12)
val df = rdd.toDF("file_name", "value")
df.createOrReplaceTempView("tmp")
// keep files where every condition matches somewhere in the log
val sql_str =
"""
|select
| file_name
|from
| tmp
|where
| value like '%tbl_name%'
| and value like '%core-1-7%'
| and value like '%core-1-10%'
|""".stripMargin
spark.sql(sql_str).show(100, false)  // don't truncate, so the full file paths are visible
```
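Reading whole files means a single LIKE scans an entire log, so all the conditions can be ANDed in one pass instead of the tag-and-aggregate step of approach 1. The trade-off is memory: wholeTextFiles materializes each file as one string, which is why the candidate app_ids are narrowed down first, and why the result is repartitioned to spread the large single-file rows across tasks.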