Start ZooKeeper on all three nodes: zkServer.sh start
Building the Data Warehouse
Start the Hive Metastore service on spark01:
nohup hive --service metastore > /export/data/metastore.log 2>&1 &
1 Data Warehouse Design
Eight tables will be created in Hive across the ODS, DWD, and ADS layers. Enter the Hive CLI with the hive command and run the DDL below.
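All of the DDL targets the database user_behavior_db; if it has not been created yet, create it first:
CREATE DATABASE IF NOT EXISTS user_behavior_db;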
1.1 Creating the ODS-Layer Table
CREATE EXTERNAL TABLE user_behavior_db.ods_user_behavior (
    page_info STRUCT<
        page_id: INT,
        page_url: STRING,
        product_id: INT,
        category: STRING
    >,
    behavior_info STRUCT<
        user_id: INT,
        behavior_type: STRING,
        action_time: STRING,
        location: STRING
    >,
    device_info STRUCT<
        operating_system: STRING,
        access_method: STRING,
        browser_type: STRING,
        app_version: STRING
    >
)
PARTITIONED BY (dt STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'
LOCATION '/user_behavior/ods'
TBLPROPERTIES ('compression.codec'='org.apache.hadoop.io.compress.GzipCodec');
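Nested STRUCT fields are read with dot notation. A quick smoke test once data has been loaded (the dt value below is a placeholder for one of your actual partition dates):
SELECT page_info.page_id,
       behavior_info.user_id,
       device_info.browser_type
FROM user_behavior_db.ods_user_behavior
WHERE dt = '2023-01-01'  -- placeholder partition date
LIMIT 5;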
1.2 Creating the DWD-Layer Tables
Create the dimension table dim_date for storing date-related attributes:
CREATE EXTERNAL TABLE user_behavior_db.dim_date (
date_key STRING,
date_value DATE,
day_of_week STRING,
month INT,
year INT,
day_of_year INT,
day_of_month INT,
quarter INT
)
STORED AS ORC
LOCATION '/user_behavior/dwd/dim_date'
TBLPROPERTIES ('orc.compress' = 'SNAPPY');
Create the dimension table dim_time for storing time-of-day attributes:
CREATE EXTERNAL TABLE user_behavior_db.dim_time (
time_key STRING,
time_value STRING,
hours24 INT,
minutes INT,
seconds INT,
am_pm STRING
)
STORED AS ORC
LOCATION '/user_behavior/dwd/dim_time'
TBLPROPERTIES ('orc.compress' = 'SNAPPY');
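Both dimension tables use a *_key string as a surrogate key, which the DWD detail table created in section 2.2.3 joins against. Once populated, the dimensions can also answer calendar questions on their own; for example (assuming an English default locale, so day_of_week holds values like 'Saturday'):
-- List all Saturdays in the third quarter of 2023
SELECT date_value
FROM user_behavior_db.dim_date
WHERE quarter = 3 AND day_of_week = 'Saturday';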
1.3 Creating the ADS-Layer Tables
Create the table ads_visit_counts_2023 for storing traffic-analysis results:
CREATE EXTERNAL TABLE user_behavior_db.ads_visit_counts_2023 (
month_info STRING,
day_info STRING,
quarter_info STRING,
am_pm_info STRING,
week_info STRING,
group_type INT,
visit_count INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user_behavior/ads/ads_visit_counts_2023';
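Judging by its columns, this table holds visit counts at several granularities (month, day, quarter, AM/PM, weekday), with group_type marking which granularity a row belongs to. As an illustrative sketch only, here is the kind of statement that could populate the monthly slice, assuming the DWD detail table user_behavior_detail from section 2.2.3 and an assumed encoding of group_type = 1 for "by month":
-- Hypothetical monthly roll-up; group_type = 1 is an assumed encoding
INSERT INTO TABLE user_behavior_db.ads_visit_counts_2023
SELECT
    concat(cast(yearinfo AS STRING), '-', lpad(cast(monthinfo AS STRING), 2, '0')) AS month_info,
    cast(NULL AS STRING) AS day_info,
    cast(NULL AS STRING) AS quarter_info,
    cast(NULL AS STRING) AS am_pm_info,
    cast(NULL AS STRING) AS week_info,
    1 AS group_type,
    count(*) AS visit_count
FROM user_behavior_db.user_behavior_detail
GROUP BY yearinfo, monthinfo;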
Create the table ads_sale_counts_2023 for storing product-analysis results:
CREATE EXTERNAL TABLE user_behavior_db.ads_sale_counts_2023 (
product_id INT,
sale_type INT,
sale_count INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user_behavior/ads/ads_sale_counts_2023';
Create the table ads_device_counts_2023 for storing device-analysis results:
CREATE EXTERNAL TABLE user_behavior_db.ads_device_counts_2023 (
hour_interval STRING,
device_type STRING,
access_count INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user_behavior/ads/ads_device_counts_2023';
Create the table ads_recommend_2023 for storing recommender-system results:
CREATE EXTERNAL TABLE user_behavior_db.ads_recommend_2023 (
user_id INT,
product_id INT,
rating DOUBLE,
rmse DOUBLE
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user_behavior/ads/ads_recommend_2023';
Create the table ads_sale_city_2026 for storing regional-analysis results:
CREATE EXTERNAL TABLE user_behavior_db.ads_sale_city_2026 (
city STRING,
sale_count INT,
create_time TIMESTAMP
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user_behavior/ads/ads_sale_city_2026';
2 Loading Data into the Data Warehouse
Vim's improvements over Vi fall into three areas:
- Visibility and efficiency: syntax highlighting, multiple windows/tabs, visual mode, and multi-level undo greatly speed up editing;
- Customization and extensibility: the .vimrc configuration file and a plugin system let the editor be tailored as needed;
- Usability: mouse support, spell checking, incremental search, and similar features lower the barrier to entry.
Install vim with: yum install vim
Alternatively, plain vi works as well.
2.1 Loading Data into the ODS-Layer Table
The following steps are performed on spark01.
vim /export/servers/load_user_behavior.sh
#!/bin/bash
# Source data directory on HDFS
SOURCE_DIR=/origin_data/log/user_behaviors
# HiveServer2 JDBC URL; also selects the target database user_behavior_db
HIVE_SERVER2_URL="jdbc:hive2://spark01:10000/user_behavior_db"
# Hive user to connect as
HIVE_USER="root"
# Iterate over each subdirectory of the source directory;
# $NF takes the last column of the ls output, which is robust across Hadoop versions
for dir in $(hadoop fs -ls $SOURCE_DIR | awk 'NR>1 {print $NF}')
do
# Extract the partition date from the subdirectory name
dt=$(basename $dir)
echo "Processing directory: $dir with partition date: $dt"
# Load this directory into the matching dt partition of ods_user_behavior
beeline -u "$HIVE_SERVER2_URL" -n "$HIVE_USER" -e "
LOAD DATA INPATH '$dir' INTO TABLE ods_user_behavior PARTITION (dt='$dt');
"
# Report whether the load for this partition succeeded
if [ $? -eq 0 ]; then
echo "Successfully loaded data for partition: $dt"
else
echo "Failed to load data for partition: $dt"
fi
done
Make the script executable:
chmod 775 /export/servers/load_user_behavior.sh
Start the HiveServer2 service:
nohup hiveserver2 > /export/data/hiveserver2.log 2>&1 &
Run the script:
sh /export/servers/load_user_behavior.sh
Inspect the data in table ods_user_behavior:
hive
use user_behavior_db;
select * from ods_user_behavior;
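Beyond the full scan above, listing the partitions is a quick sanity check that one partition was created per log date:
SHOW PARTITIONS ods_user_behavior;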
2.2 Loading Data into the DWD-Layer Tables
The following steps are performed on spark02.
2.2.1 Populating the dimension table dim_date
Install the pyspark package:
sudo dnf install -y python3-pip --allowerasing
pip install pyspark==3.4.3
Write the load script:
vim /export/servers/load_dim_date.py
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, date_format
spark = SparkSession.builder \
.appName("load_dim_date") \
.config("hive.metastore.uris", "thrift://spark01:9083") \
.enableHiveSupport() \
.getOrCreate()
# Generate one row per calendar date in 2023
start_date = "2023-01-01"
end_date = "2023-12-31"
date_df = spark.sql(f"SELECT sequence(to_date('{start_date}'), "
f"to_date('{end_date}'), interval 1 day) as date_seq")
date_seq_df = date_df.selectExpr("explode(date_seq) as date_value")
# Derive the calendar attributes from date_value
date_addColumn_df = date_seq_df \
.withColumn('date_key',
date_format(col('date_value'), 'yyyyMMdd').cast("string")) \
.withColumn('day_of_week',
date_format(col('date_value'), 'EEEE').cast("string")) \
.withColumn('month',
date_format(col('date_value'), 'MM').cast("int")) \
.withColumn('year',
date_format(col('date_value'), 'yyyy').cast("int")) \
.withColumn('day_of_year',
date_format(col('date_value'), 'D').cast("int")) \
.withColumn('day_of_month',
date_format(col('date_value'), 'd').cast("int")) \
.withColumn('quarter',
expr("ceil(month/3)").cast("int"))
date_select_df = date_addColumn_df.select(
'date_key', 'date_value', 'day_of_week', 'month', 'year',
'day_of_year', 'day_of_month', 'quarter'
)
# HDFS directory where the dim_date table stores its data
table_location = '/user_behavior/dwd/dim_date'
date_select_df.write \
.mode('overwrite') \
.format('orc') \
.option('path', table_location) \
.saveAsTable('user_behavior_db.dim_date')
Submit the job to the Spark cluster:
spark-submit --master yarn --deploy-mode cluster /export/servers/load_dim_date.py
Inspect the dim_date data (run in the Hive CLI on spark01, as before):
select * from dim_date limit 10;
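As an additional check, dim_date should contain exactly one row per day of 2023, i.e. 365 rows:
SELECT count(*) FROM dim_date;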
2.2.2 Populating the dimension table dim_time
vim /export/servers/load_dim_time.py
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format
spark = SparkSession.builder \
.appName("load_dim_time") \
.config("hive.metastore.uris", "thrift://spark01:9083") \
.enableHiveSupport() \
.getOrCreate()
# Generate one row per second of the day; Spark resolves these
# time-only strings against the current date
start_time = "00:00:00"
end_time = "23:59:59"
time_df = spark.sql(f"""
SELECT sequence(to_timestamp('{start_time}'),
to_timestamp('{end_time}'), interval 1 second) as time_seq
""")
time_seq_df = time_df.selectExpr("explode(time_seq) as time_value")
# Derive the clock attributes from the raw timestamp first; time_value is
# overwritten last, because once it holds the formatted 'HH:mm:ss' string
# it can no longer be parsed by date_format
time_addColumn_df = time_seq_df \
.withColumn('time_key',
date_format(col('time_value'), 'HHmmss').cast("string")) \
.withColumn('hours24',
date_format(col('time_value'), 'HH').cast("int")) \
.withColumn('minutes',
date_format(col('time_value'), 'mm').cast("int")) \
.withColumn('seconds',
date_format(col('time_value'), 'ss').cast("int")) \
.withColumn('am_pm',
date_format(col('time_value'), 'a').cast("string")) \
.withColumn('time_value',
date_format(col('time_value'), 'HH:mm:ss').cast("string"))
time_select_df = time_addColumn_df.select(
'time_key', 'time_value', 'hours24', 'minutes', 'seconds', 'am_pm'
)
# HDFS directory where the dim_time table stores its data
table_location = '/user_behavior/dwd/dim_time'
time_select_df.write.mode('overwrite') \
.format('orc') \
.option('path', table_location) \
.saveAsTable('user_behavior_db.dim_time')
Submit the Spark job:
spark-submit --master yarn --deploy-mode cluster /export/servers/load_dim_time.py
Inspect the data (run in the Hive CLI on spark01, as before):
select * from dim_time limit 10;
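dim_time should contain one row per second of the day, i.e. 24 × 60 × 60 = 86,400 rows:
SELECT count(*) FROM dim_time;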
2.2.3 Loading data into the table user_behavior_detail
First create the DWD detail table:
CREATE EXTERNAL TABLE user_behavior_db.user_behavior_detail (
page_id INT,
page_url STRING,
product_id INT,
category STRING,
user_id INT,
behavior_type STRING,
operating_system STRING,
access_method STRING,
browser_type STRING,
app_version STRING,
province STRING,
city STRING,
action_date_key STRING,
action_time_key STRING
)
PARTITIONED BY (
yearinfo INT,
monthinfo INT,
dayinfo INT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'
LOCATION '/user_behavior/dwd/user_behavior_detail'
TBLPROPERTIES ('compression.codec'='org.apache.hadoop.io.compress.GzipCodec');
Set the dynamic-partition parameters so that Hive can create the year/month/day partitions from the query results (nonstrict mode allows every partition column to be determined dynamically):
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.max.dynamic.partitions=1000;
SET hive.exec.max.dynamic.partitions.pernode=1000;
Run the data-load statement; note that the partition columns yearinfo, monthinfo, and dayinfo must come last in the SELECT list:
INSERT OVERWRITE TABLE user_behavior_db.user_behavior_detail
PARTITION (yearinfo,monthinfo,dayinfo)
SELECT
page_info.page_id,
page_info.page_url,
page_info.product_id,
page_info.category,
behavior_info.user_id,
behavior_info.behavior_type,
device_info.operating_system,
CASE device_info.access_method
WHEN 'browser' THEN '1'
WHEN 'app' THEN '0'
ELSE device_info.access_method
END AS access_method,
device_info.browser_type,
device_info.app_version,
split(behavior_info.location,',')[0] AS province,
split(behavior_info.location,',')[1] AS city,
date_format(behavior_info.action_time, 'yyyyMMdd') AS action_date_key,
date_format(behavior_info.action_time, 'HHmmss') AS action_time_key,
year(dt) AS yearinfo,
month(dt) AS monthinfo,
day(dt) AS dayinfo
FROM user_behavior_db.ods_user_behavior;
Inspect the data (run in the Hive CLI on spark01, as before):
select * from user_behavior_detail limit 5;
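With the detail table and both dimension tables in place, a typical DWD query joins them on the surrogate keys. The following illustrative query (not part of the original pipeline) counts behaviors by day of week and AM/PM:
SELECT d.day_of_week, t.am_pm, count(*) AS behavior_count
FROM user_behavior_db.user_behavior_detail u
JOIN user_behavior_db.dim_date d ON u.action_date_key = d.date_key
JOIN user_behavior_db.dim_time t ON u.action_time_key = t.time_key
GROUP BY d.day_of_week, t.am_pm;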