目录
实现步骤:
1、Hive数仓建模
在scripts目录创建 HQL 脚本,定义两层表结构,直接关联 HDFS 清洗后的数据:
-- 1. Warehouse database initialisation (IF NOT EXISTS keeps the script idempotent).
CREATE DATABASE IF NOT EXISTS ecommerce_dw;
USE ecommerce_dw;
-- 2. ODS layer: raw data, mapped 1:1 onto the cleaned CSV on HDFS.
-- Every column is STRING so malformed values load without conversion errors;
-- Spark casts them to typed columns when building the DWD layer.
-- 'skip.header.line.count' drops the CSV header row on read (text tables only).
-- NOTE(review): a bare comma delimiter breaks if any field contains an embedded
-- comma — confirm the upstream export never produces quoted fields.
CREATE EXTERNAL TABLE IF NOT EXISTS ods_ecommerce_transactions (
Transaction_ID STRING COMMENT '交易ID(原始格式,Spark清洗前为字符串)',
User_Name STRING COMMENT '用户名(用户唯一标识,CSV原始字段)',
Age STRING COMMENT '用户年龄(原始格式,Spark后续转为INT)',
Country STRING COMMENT '交易国家(与CSV一致)',
Product_Category STRING COMMENT '商品品类(原始格式)',
Purchase_Amount STRING COMMENT '交易金额(原始格式,Spark后续转为DECIMAL)',
Payment_Method STRING COMMENT '支付方式(原始格式)',
Transaction_Date STRING COMMENT '交易日期(原始格式:yyyy/M/d)'
)
COMMENT '跨境电商交易原始数据ODS层(与CSV字段1:1对应)'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/raw_data/transactions/'
TBLPROPERTIES (
'skip.header.line.count' = '1',
'serialization.encoding' = 'UTF-8'
);
-- 3. DWD layer: cleaned transaction detail (schema matches the Spark output).
-- Partitioned by transaction day (dt) and stored as snappy-compressed Parquet.
-- WARNING: 'external.table.purge'='true' makes DROP TABLE delete the HDFS data
-- even though the table is EXTERNAL — keep only if that is intended.
CREATE EXTERNAL TABLE IF NOT EXISTS dwd_ecommerce_transactions (
Transaction_ID INT COMMENT '交易ID(清洗后去重去空)',
User_Name STRING COMMENT '用户名(用户唯一标识)',
Age INT COMMENT '用户年龄(18-100岁,过滤异常值)',
Country STRING COMMENT '交易国家',
Product_Category STRING COMMENT '商品品类',
Purchase_Amount DECIMAL(10,2) COMMENT '交易金额(保留2位小数,单位:默认美元)',
Payment_Method STRING COMMENT '支付方式',
Transaction_Date DATE COMMENT '交易日期(yyyy-MM-dd)'
)
COMMENT '跨境电商交易清洗明细DWD层(与Spark输出对齐)'
PARTITIONED BY (dt DATE COMMENT '分区字段:交易日期(按日分区)')
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/processed_data/transactions_clean/'
TBLPROPERTIES (
'parquet.compression' = 'snappy',
'external.table.purge' = 'true'
);
-- 4. DWD partition metadata refresh.
-- After Spark writes new partition directories, Hive must discover them before
-- queries can see the data.
-- NOTE(review): MSCK only detects directories laid out as .../dt=YYYY-MM-DD/ —
-- confirm the Spark job partitions by "dt" with exactly that naming.
MSCK REPAIR TABLE dwd_ecommerce_transactions;
2、执行Hive建表脚本
# 1. Make sure Hive Metastore and HiveServer2 are running (skip if already
#    started in a previous step). Redirect output so the two services do not
#    interleave their logs in a shared nohup.out.
nohup hive --service metastore > metastore.log 2>&1 &
nohup hive --service hiveserver2 > hiveserver2.log 2>&1 &
# 2. Execute the DDL script (-f: run statements from a file)
hive -f /home/Hadoop/ecommerce_dw_project/scripts/hive_create_table.hql
# 3. Verify the tables were created (inside the Hive CLI).
#    Note: the Hive CLI uses '--' for comments, not '#'.
hive
USE ecommerce_dw;
SHOW TABLES; -- expected: ods_ecommerce_transactions, dwd_ecommerce_transactions
DESC dwd_ecommerce_transactions; -- inspect the DWD table schema
SELECT * FROM dwd_ecommerce_transactions LIMIT 10; -- sample rows; success if it returns data
quit; -- leave the Hive CLI
3、业务分析
在scripts目录建分析脚本
①基础验证脚本
功能:验证 ODS/DWD 层数据完整性、一致性,确保数据链路通畅。
-- Basic Verification for ODS & DWD Layers
-- Checks row counts, key cardinalities and cross-layer distribution
-- consistency so a broken pipeline step is caught before analysis runs.
USE ecommerce_dw;
-- ==============================================
-- ODS Layer Verification
-- ==============================================
SELECT 'ODS Layer Structure' AS verification_step;
DESCRIBE ods_ecommerce_transactions;
SELECT 'ODS Layer Core Metrics' AS verification_step;
-- Transaction_Date in ODS is a STRING in 'yyyy/M/d' form; a plain MIN/MAX
-- compares lexicographically (e.g. '2023/10/1' < '2023/2/1') and reports the
-- wrong date range, so the string is parsed into a real DATE before aggregating.
SELECT
COUNT(*) AS total_raw_records,
COUNT(DISTINCT Transaction_ID) AS unique_transaction_ids,
COUNT(DISTINCT User_Name) AS unique_users,
COUNT(DISTINCT Country) AS unique_countries,
COUNT(DISTINCT Product_Category) AS unique_categories,
MIN(TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(Transaction_Date, 'yyyy/M/d')))) AS earliest_transaction_date,
MAX(TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(Transaction_Date, 'yyyy/M/d')))) AS latest_transaction_date
FROM ods_ecommerce_transactions;
-- ==============================================
-- DWD Layer Verification
-- ==============================================
SELECT 'DWD Layer Partitions' AS verification_step;
SHOW PARTITIONS dwd_ecommerce_transactions;
SELECT 'DWD Layer Core Metrics' AS verification_step;
SELECT
COUNT(*) AS total_cleaned_records,
COUNT(DISTINCT Transaction_ID) AS unique_transaction_ids,
COUNT(DISTINCT User_Name) AS unique_users,
MIN(Age) AS min_age,
MAX(Age) AS max_age,
MIN(Purchase_Amount) AS min_purchase_amount,
MAX(Purchase_Amount) AS max_purchase_amount,
COUNT(DISTINCT dt) AS transaction_days
FROM dwd_ecommerce_transactions;
-- ==============================================
-- Data Consistency (ODS vs DWD)
-- ==============================================
-- The per-category / per-payment-method distributions should roughly match
-- between layers; large gaps point to rows dropped (or duplicated) by cleaning.
SELECT 'Category Distribution Comparison' AS verification_step;
SELECT
'ODS' AS data_layer,
Product_Category,
COUNT(*) AS transaction_count
FROM ods_ecommerce_transactions
GROUP BY Product_Category
UNION ALL
SELECT
'DWD' AS data_layer,
Product_Category,
COUNT(*) AS transaction_count
FROM dwd_ecommerce_transactions
GROUP BY Product_Category
ORDER BY Product_Category, data_layer;
SELECT 'Payment Method Distribution Comparison' AS verification_step;
SELECT
'ODS' AS data_layer,
Payment_Method,
COUNT(*) AS transaction_count
FROM ods_ecommerce_transactions
GROUP BY Payment_Method
UNION ALL
SELECT
'DWD' AS data_layer,
Payment_Method,
COUNT(*) AS transaction_count
FROM dwd_ecommerce_transactions
GROUP BY Payment_Method
ORDER BY Payment_Method, data_layer;
-- ==============================================
-- Verification Completed
-- ==============================================
SELECT 'Basic Verification Finished' AS result;
执行:
# 用beeline执行(推荐,兼容Hive集群)
beeline -u jdbc:hive2://node1:10000 -n Hadoop -f basic_verification.sql > basic_verification_result.txt
# 或直接在Hive CLI中执行
hive -f basic_verification.sql > basic_verification_result.txt
②业务分析脚本
功能:分析交易规模、用户价值、商品表现、地域 & 支付偏好
-- Core Business Analysis
-- Each analysis below follows the same pattern: create a Parquet result table
-- (idempotent), overwrite it from the DWD layer, then echo it for the run log.
USE ecommerce_dw;
-- ==============================================
-- 1. Transaction Scale Analysis (Time Dimension)
-- ==============================================
SELECT '1. Monthly Transaction Scale' AS analysis_topic;
-- Result table: transaction count / revenue / average order value per month.
CREATE TABLE IF NOT EXISTS monthly_transaction_scale (
transaction_year INT,
transaction_month INT,
transaction_count INT,
total_sales DECIMAL(12,2),
average_order_value DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/monthly_transaction_scale';
-- NOTE(review): ORDER BY on an INSERT forces a single reducer, and Hive tables
-- give no stored-order guarantee anyway — consider dropping it here and
-- ordering in the final SELECT instead.
INSERT OVERWRITE TABLE monthly_transaction_scale
SELECT
YEAR(dt) AS transaction_year,
MONTH(dt) AS transaction_month,
COUNT(*) AS transaction_count,
SUM(Purchase_Amount) AS total_sales,
AVG(Purchase_Amount) AS average_order_value
FROM dwd_ecommerce_transactions
GROUP BY YEAR(dt), MONTH(dt)
ORDER BY transaction_year, transaction_month;
SELECT * FROM monthly_transaction_scale;
SELECT '1. Quarterly Transaction Scale' AS analysis_topic;
-- Result table: transaction count / revenue / average order value per calendar
-- quarter (same pattern as the monthly table, bucketed with QUARTER()).
CREATE TABLE IF NOT EXISTS quarterly_transaction_scale (
transaction_year INT,
transaction_quarter INT,
transaction_count INT,
total_sales DECIMAL(12,2),
average_order_value DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/quarterly_transaction_scale';
INSERT OVERWRITE TABLE quarterly_transaction_scale
SELECT
YEAR(dt) AS transaction_year,
QUARTER(dt) AS transaction_quarter,
COUNT(*) AS transaction_count,
SUM(Purchase_Amount) AS total_sales,
AVG(Purchase_Amount) AS average_order_value
FROM dwd_ecommerce_transactions
GROUP BY YEAR(dt), QUARTER(dt)
ORDER BY transaction_year, transaction_quarter;
SELECT * FROM quarterly_transaction_scale;
-- ==============================================
-- 2. User Analysis
-- ==============================================
SELECT '2. User Age Group Analysis' AS analysis_topic;
-- Result table: user count, transaction volume and revenue per age bracket.
CREATE TABLE IF NOT EXISTS user_age_group_analysis (
age_group STRING,
user_count INT,
transaction_count INT,
total_sales DECIMAL(12,2),
average_order_value DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/user_age_group_analysis';
-- Derive the age bracket once in an inner query and aggregate by the derived
-- column, instead of repeating the CASE expression in SELECT and GROUP BY.
INSERT OVERWRITE TABLE user_age_group_analysis
SELECT
age_group,
COUNT(DISTINCT User_Name) AS user_count,
COUNT(*) AS transaction_count,
SUM(Purchase_Amount) AS total_sales,
AVG(Purchase_Amount) AS average_order_value
FROM (
SELECT
User_Name,
Purchase_Amount,
CASE
WHEN Age BETWEEN 18 AND 25 THEN '18-25'
WHEN Age BETWEEN 26 AND 35 THEN '26-35'
WHEN Age BETWEEN 36 AND 45 THEN '36-45'
WHEN Age BETWEEN 46 AND 55 THEN '46-55'
ELSE '56+'
END AS age_group
FROM dwd_ecommerce_transactions
) bracketed
GROUP BY age_group
ORDER BY total_sales DESC;
SELECT * FROM user_age_group_analysis;
SELECT '2. User Value Segmentation' AS analysis_topic;
-- Result table: users bucketed by lifetime transaction count, with spend totals.
CREATE TABLE IF NOT EXISTS user_value_segmentation (
user_value_segment STRING,
user_count INT,
total_transactions INT,
total_spend_amount DECIMAL(12,2),
avg_user_spend DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/user_value_segmentation';
-- FIX: in HiveQL a CTE must come BEFORE the INSERT it feeds
-- ("WITH ... INSERT OVERWRITE TABLE ... SELECT ...");
-- the original "INSERT ... WITH ..." ordering does not parse.
WITH user_transaction_count AS (
SELECT
User_Name,
COUNT(*) AS transaction_count,
SUM(Purchase_Amount) AS total_spend
FROM dwd_ecommerce_transactions
GROUP BY User_Name
)
INSERT OVERWRITE TABLE user_value_segmentation
SELECT
CASE
WHEN transaction_count >= 1000 THEN 'High_Value_1000+'
WHEN transaction_count >= 500 THEN 'Mid_High_Value_500_999'
WHEN transaction_count >= 100 THEN 'Mid_Value_100_499'
ELSE 'Regular_Value_Below_100'
END AS user_value_segment,
COUNT(User_Name) AS user_count,
SUM(transaction_count) AS total_transactions,
SUM(total_spend) AS total_spend_amount,
AVG(total_spend) AS avg_user_spend
FROM user_transaction_count
GROUP BY
CASE
WHEN transaction_count >= 1000 THEN 'High_Value_1000+'
WHEN transaction_count >= 500 THEN 'Mid_High_Value_500_999'
WHEN transaction_count >= 100 THEN 'Mid_Value_100_499'
ELSE 'Regular_Value_Below_100'
END
ORDER BY total_spend_amount DESC;
SELECT * FROM user_value_segmentation;
-- ==============================================
-- 3. Product Analysis
-- ==============================================
SELECT '3. Product Category Performance' AS analysis_topic;
-- Result table: per-category sales volume, revenue, AOV and revenue share (%).
CREATE TABLE IF NOT EXISTS product_category_performance (
product_category STRING,
sales_count INT,
total_sales DECIMAL(12,2),
category_aov DECIMAL(10,2),
sales_contribution_pct DECIMAL(5,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/product_category_performance';
-- FIX: Hive does not support scalar subqueries in the SELECT list, so the
-- original "(SELECT SUM(...) FROM dwd...)" grand total fails to compile.
-- Compute the share with a window total over the grouped result instead,
-- which also scans the fact table only once.
INSERT OVERWRITE TABLE product_category_performance
SELECT
Product_Category,
sales_count,
total_sales,
category_aov,
(total_sales / SUM(total_sales) OVER ()) * 100 AS sales_contribution_pct
FROM (
SELECT
Product_Category,
COUNT(*) AS sales_count,
SUM(Purchase_Amount) AS total_sales,
AVG(Purchase_Amount) AS category_aov
FROM dwd_ecommerce_transactions
GROUP BY Product_Category
) category_agg
ORDER BY total_sales DESC;
SELECT * FROM product_category_performance;
SELECT '3. Quarterly Product Category Trend' AS analysis_topic;
-- Result table: per-category sales volume and revenue by year/quarter.
-- NOTE(review): column names "year" and "quarter" may clash with reserved
-- keywords on some Hive versions — confirm, or rename to
-- transaction_year / transaction_quarter for safety.
CREATE TABLE IF NOT EXISTS quarterly_product_trend (
product_category STRING,
year INT,
quarter INT,
sales_count INT,
quarterly_sales DECIMAL(12,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/quarterly_product_trend';
INSERT OVERWRITE TABLE quarterly_product_trend
SELECT
Product_Category,
YEAR(dt) AS year,
QUARTER(dt) AS quarter,
COUNT(*) AS sales_count,
SUM(Purchase_Amount) AS quarterly_sales
FROM dwd_ecommerce_transactions
GROUP BY Product_Category, YEAR(dt), QUARTER(dt)
ORDER BY Product_Category, year, quarter;
SELECT * FROM quarterly_product_trend;
-- ==============================================
-- 4. Geography & Payment Method Analysis
-- ==============================================
SELECT '4. Country-wise Performance' AS analysis_topic;
-- Result table: per-country transaction volume, revenue, AOV and active users.
CREATE TABLE IF NOT EXISTS country_performance (
country STRING,
transaction_count INT,
total_sales DECIMAL(12,2),
country_aov DECIMAL(10,2),
active_user_count INT
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/country_performance';
-- Aggregate per country in a named derived table, then project and rank it.
INSERT OVERWRITE TABLE country_performance
SELECT
country,
transaction_count,
total_sales,
country_aov,
active_user_count
FROM (
SELECT
Country AS country,
COUNT(*) AS transaction_count,
SUM(Purchase_Amount) AS total_sales,
AVG(Purchase_Amount) AS country_aov,
COUNT(DISTINCT User_Name) AS active_user_count
FROM dwd_ecommerce_transactions
GROUP BY Country
) country_agg
ORDER BY total_sales DESC;
SELECT * FROM country_performance;
SELECT '4. Payment Method Preference by Country' AS analysis_topic;
-- Result table: payment-method usage per country, with each method's share (%)
-- of that country's transactions.
CREATE TABLE IF NOT EXISTS payment_preference_by_country (
country STRING,
payment_method STRING,
payment_count INT,
country_payment_share_pct DECIMAL(5,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/payment_preference_by_country';
-- Count per (country, payment method) in an inner query, carry the per-country
-- total along via a window sum, then convert it to a percentage share.
INSERT OVERWRITE TABLE payment_preference_by_country
SELECT
Country,
Payment_Method,
payment_count,
(payment_count / country_total) * 100 AS country_payment_share_pct
FROM (
SELECT
Country,
Payment_Method,
COUNT(*) AS payment_count,
SUM(COUNT(*)) OVER (PARTITION BY Country) AS country_total
FROM dwd_ecommerce_transactions
GROUP BY Country, Payment_Method
) counted
ORDER BY Country, payment_count DESC;
SELECT * FROM payment_preference_by_country;
-- ==============================================
-- Analysis Completed
-- ==============================================
SELECT 'All Business Analysis Completed' AS result;
执行:
# 执行并保存结果到文件
beeline -u jdbc:hive2://node1:10000 -n Hadoop -f business_analysis.sql > business_analysis_result.txt
4、提交到git仓库(对应自己的文件)
# 1. 进入项目根目录
cd /home/Hadoop/ecommerce_dw_project/
# 2. 添加新增文件到Git暂存区
git add scripts/spark_preprocess.py
git add scripts/hive_create_table.hql
git add scripts/basic_verification.sql
git add scripts/business_analysis.sql
# 3. 提交到本地仓库(写清楚提交内容,方便回溯)
git commit -m "完成全流程:数据集上传+Spark预处理+Hive建模+业务分析"
# 4. 查看提交记录(验证提交成功)
git log --oneline -n 5
# 预期输出:最新一条记录为上述commit信息
案例小结
算是非常浅地过了一遍流程。
自我反思:这样的流程只能算是粗略的入门实践,肯定还有很多技术或工具可以加速、优化它。期间遇到的报错其实都有系统的排查方法可循,但因为缺乏经验,只能低效地逐个试错排查。
知识体系尚不成熟,没有指导时就像无头苍蝇乱撞,练习也容易流于自我感动。初学阶段仍需要方向性的指导,否则效率低、收效小。
对于案例的启动,初心是过一遍流程,但是真正的项目是为了解决问题而来,意识到这点即可。
未来案例可以升级。