跨境电商大数据分析系统案例:③建模、分析与暂时收尾

目录

实现步骤:

1、Hive数仓建模

2、执行Hive建表脚本

3、业务分析

①基础验证脚本

②业务分析脚本

4、提交到git仓库(对应自己的文件)

案例小结


实现步骤:

1、Hive数仓建模

在 scripts 目录创建 HQL 脚本,定义两层表结构,直接关联 HDFS 上清洗后的数据:

复制代码
-- 1. Initialize the warehouse database (IF NOT EXISTS keeps the script idempotent).
CREATE DATABASE IF NOT EXISTS ecommerce_dw;
USE ecommerce_dw;

-- 2. ODS layer: raw data layer.
-- Field names match the CSV 1:1 and every column is typed STRING so the raw
-- values load as-is without conversion failures; Spark performs casting later.
CREATE EXTERNAL TABLE IF NOT EXISTS ods_ecommerce_transactions (
    Transaction_ID STRING COMMENT '交易ID(原始格式,Spark清洗前为字符串)',
    User_Name STRING COMMENT '用户名(用户唯一标识,CSV原始字段)',
    Age STRING COMMENT '用户年龄(原始格式,Spark后续转为INT)',
    Country STRING COMMENT '交易国家(与CSV一致)',
    Product_Category STRING COMMENT '商品品类(原始格式)',
    Purchase_Amount STRING COMMENT '交易金额(原始格式,Spark后续转为DECIMAL)',
    Payment_Method STRING COMMENT '支付方式(原始格式)',
    Transaction_Date STRING COMMENT '交易日期(原始格式:yyyy/M/d)'
)
COMMENT '跨境电商交易原始数据ODS层(与CSV字段1:1对应)'
ROW FORMAT DELIMITED
-- NOTE(review): a plain comma delimiter breaks if any CSV field contains an
-- embedded comma — confirm the source data is comma-free, or use OpenCSVSerde.
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/raw_data/transactions/'
TBLPROPERTIES (
    -- Skip the CSV header row when Hive scans the files.
    'skip.header.line.count' = '1',
    'serialization.encoding' = 'UTF-8'
);


-- 3. DWD layer: cleaned detail layer; schema must mirror the Spark output.
-- NOTE(review): assumes the Spark job writes Purchase_Amount as DECIMAL(10,2)
-- and Transaction_Date as DATE in the Parquet files — if Spark actually wrote
-- DOUBLE/STRING, Hive will read NULLs for these columns; verify the Spark schema.
CREATE EXTERNAL TABLE IF NOT EXISTS dwd_ecommerce_transactions (
    Transaction_ID INT COMMENT '交易ID(清洗后去重去空)',
    User_Name STRING COMMENT '用户名(用户唯一标识)',
    Age INT COMMENT '用户年龄(18-100岁,过滤异常值)',
    Country STRING COMMENT '交易国家',
    Product_Category STRING COMMENT '商品品类',
    Purchase_Amount DECIMAL(10,2) COMMENT '交易金额(保留2位小数,单位:默认美元)',
    Payment_Method STRING COMMENT '支付方式',
    Transaction_Date DATE COMMENT '交易日期(yyyy-MM-dd)'
)
COMMENT '跨境电商交易清洗明细DWD层(与Spark输出对齐)'
PARTITIONED BY (dt DATE COMMENT '分区字段:交易日期(按日分区)')
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/processed_data/transactions_clean/'
TBLPROPERTIES (
    'parquet.compression' = 'snappy',
    -- CAUTION: purge=true makes DROP TABLE delete the underlying HDFS files
    -- even though the table is EXTERNAL — dropping this table destroys the
    -- Spark job's output.
    'external.table.purge' = 'true'
);

-- 4. DWD partition metadata.
-- Spark writes dt=YYYY-MM-DD directories straight to HDFS, so Hive must
-- rescan the table location to register new partitions before queries see them.
MSCK REPAIR TABLE dwd_ecommerce_transactions;

2、执行Hive建表脚本

复制代码
# 1. Ensure Hive Metastore and HiveServer2 are running (skip if already started).
nohup hive --service metastore &
nohup hive --service hiveserver2 &

# 2. Execute the HQL script (-f: run statements from a file).
hive -f /home/Hadoop/ecommerce_dw_project/scripts/hive_create_table.hql

# 3. Verify the tables were created (inside the Hive CLI).
#    FIX: '#' is NOT a comment character in HiveQL — the original trailing
#    "# ..." fragments would be parsed as HQL inside the CLI and raise errors;
#    HiveQL comments use '--'.
hive
USE ecommerce_dw;
SHOW TABLES;  -- expect: ods_ecommerce_transactions, dwd_ecommerce_transactions
DESC dwd_ecommerce_transactions;  -- inspect the DWD table schema
SELECT * FROM dwd_ecommerce_transactions LIMIT 10;  -- rows visible => data link OK
quit;  -- exit the Hive CLI

3、业务分析

在 scripts 目录创建分析脚本:

①基础验证脚本

功能:验证 ODS/DWD 层数据完整性、一致性,确保数据链路通畅。

复制代码
-- Basic Verification for ODS & DWD Layers
-- Purpose: confirm record counts, value ranges, and ODS->DWD distribution
-- consistency before running business analysis.
USE ecommerce_dw;

-- ==============================================
-- ODS Layer Verification
-- ==============================================
SELECT 'ODS Layer Structure' AS verification_step;
DESCRIBE ods_ecommerce_transactions;

SELECT 'ODS Layer Core Metrics' AS verification_step;
SELECT
  COUNT(*) AS total_raw_records,
  COUNT(DISTINCT Transaction_ID) AS unique_transaction_ids,
  COUNT(DISTINCT User_Name) AS unique_users,
  COUNT(DISTINCT Country) AS unique_countries,
  COUNT(DISTINCT Product_Category) AS unique_categories,
  -- FIX: ODS Transaction_Date is a raw 'yyyy/M/d' STRING without zero padding,
  -- so plain MIN/MAX compares lexicographically and returns wrong answers
  -- (e.g. '2024/10/1' sorts before '2024/2/1'). Parse to a real date first;
  -- unparseable rows become NULL and are ignored by the aggregates.
  MIN(TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(Transaction_Date, 'yyyy/M/d')))) AS earliest_transaction_date,
  MAX(TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(Transaction_Date, 'yyyy/M/d')))) AS latest_transaction_date
FROM ods_ecommerce_transactions;

-- ==============================================
-- DWD Layer Verification
-- ==============================================
SELECT 'DWD Layer Partitions' AS verification_step;
SHOW PARTITIONS dwd_ecommerce_transactions;

SELECT 'DWD Layer Core Metrics' AS verification_step;
SELECT
  COUNT(*) AS total_cleaned_records,
  COUNT(DISTINCT Transaction_ID) AS unique_transaction_ids,
  COUNT(DISTINCT User_Name) AS unique_users,
  -- Age/amount ranges should fall inside the cleaning rules (18-100, >= 0).
  MIN(Age) AS min_age,
  MAX(Age) AS max_age,
  MIN(Purchase_Amount) AS min_purchase_amount,
  MAX(Purchase_Amount) AS max_purchase_amount,
  COUNT(DISTINCT dt) AS transaction_days
FROM dwd_ecommerce_transactions;

-- ==============================================
-- Data Consistency (ODS vs DWD)
-- ==============================================
-- Side-by-side per-category counts; DWD counts should be <= ODS counts
-- (cleaning only removes rows).
SELECT 'Category Distribution Comparison' AS verification_step;
SELECT 
  'ODS' AS data_layer,
  Product_Category,
  COUNT(*) AS transaction_count
FROM ods_ecommerce_transactions
GROUP BY Product_Category
UNION ALL
SELECT 
  'DWD' AS data_layer,
  Product_Category,
  COUNT(*) AS transaction_count
FROM dwd_ecommerce_transactions
GROUP BY Product_Category
ORDER BY Product_Category, data_layer;

SELECT 'Payment Method Distribution Comparison' AS verification_step;
SELECT 
  'ODS' AS data_layer,
  Payment_Method,
  COUNT(*) AS transaction_count
FROM ods_ecommerce_transactions
GROUP BY Payment_Method
UNION ALL
SELECT 
  'DWD' AS data_layer,
  Payment_Method,
  COUNT(*) AS transaction_count
FROM dwd_ecommerce_transactions
GROUP BY Payment_Method
ORDER BY Payment_Method, data_layer;

-- ==============================================
-- Verification Completed
-- ==============================================
SELECT 'Basic Verification Finished' AS result;

执行:

复制代码
# Run with beeline (recommended: connects to HiveServer2 over JDBC, works on clusters)
beeline -u jdbc:hive2://node1:10000 -n Hadoop -f basic_verification.sql > basic_verification_result.txt

# Or run directly with the Hive CLI
hive -f basic_verification.sql > basic_verification_result.txt

②业务分析脚本

功能:分析交易规模、用户价值、商品表现、地域 & 支付偏好

复制代码
-- Core Business Analysis
-- Each section materializes one result table under analysis_results/ and then
-- echoes its contents; CREATE TABLE IF NOT EXISTS + INSERT OVERWRITE keeps
-- every section idempotent and safe to re-run.
USE ecommerce_dw;

-- ==============================================
-- 1. Transaction Scale Analysis (Time Dimension)
-- ==============================================
SELECT '1. Monthly Transaction Scale' AS analysis_topic;
CREATE TABLE IF NOT EXISTS monthly_transaction_scale (
  transaction_year INT,
  transaction_month INT,
  transaction_count INT,
  total_sales DECIMAL(12,2),
  average_order_value DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/monthly_transaction_scale';

INSERT OVERWRITE TABLE monthly_transaction_scale
SELECT
  YEAR(dt) AS transaction_year,
  MONTH(dt) AS transaction_month,
  COUNT(*) AS transaction_count,
  SUM(Purchase_Amount) AS total_sales,
  AVG(Purchase_Amount) AS average_order_value
FROM dwd_ecommerce_transactions
GROUP BY YEAR(dt), MONTH(dt)
-- NOTE(review): ORDER BY inside an INSERT forces a single reducer and Parquet
-- readers give no ordering guarantee anyway — it only affects file layout;
-- consider sorting at query time instead.
ORDER BY transaction_year, transaction_month;

SELECT * FROM monthly_transaction_scale;

SELECT '1. Quarterly Transaction Scale' AS analysis_topic;
CREATE TABLE IF NOT EXISTS quarterly_transaction_scale (
  transaction_year INT,
  transaction_quarter INT,
  transaction_count INT,
  total_sales DECIMAL(12,2),
  average_order_value DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/quarterly_transaction_scale';

INSERT OVERWRITE TABLE quarterly_transaction_scale
SELECT
  YEAR(dt) AS transaction_year,
  QUARTER(dt) AS transaction_quarter,
  COUNT(*) AS transaction_count,
  SUM(Purchase_Amount) AS total_sales,
  AVG(Purchase_Amount) AS average_order_value
FROM dwd_ecommerce_transactions
GROUP BY YEAR(dt), QUARTER(dt)
ORDER BY transaction_year, transaction_quarter;

SELECT * FROM quarterly_transaction_scale;

-- ==============================================
-- 2. User Analysis
-- ==============================================
SELECT '2. User Age Group Analysis' AS analysis_topic;
CREATE TABLE IF NOT EXISTS user_age_group_analysis (
  age_group STRING,
  user_count INT,
  transaction_count INT,
  total_sales DECIMAL(12,2),
  average_order_value DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/user_age_group_analysis';

INSERT OVERWRITE TABLE user_age_group_analysis
SELECT
  -- Bucket transactions into age bands; DWD cleaning already restricts Age
  -- to 18-100, so '56+' is the only open-ended bucket.
  CASE
    WHEN Age BETWEEN 18 AND 25 THEN '18-25'
    WHEN Age BETWEEN 26 AND 35 THEN '26-35'
    WHEN Age BETWEEN 36 AND 45 THEN '36-45'
    WHEN Age BETWEEN 46 AND 55 THEN '46-55'
    ELSE '56+'
  END AS age_group,
  COUNT(DISTINCT User_Name) AS user_count,
  COUNT(*) AS transaction_count,
  SUM(Purchase_Amount) AS total_sales,
  AVG(Purchase_Amount) AS average_order_value
FROM dwd_ecommerce_transactions
-- Hive cannot GROUP BY a SELECT alias, so the CASE expression is repeated.
GROUP BY
  CASE
    WHEN Age BETWEEN 18 AND 25 THEN '18-25'
    WHEN Age BETWEEN 26 AND 35 THEN '26-35'
    WHEN Age BETWEEN 36 AND 45 THEN '36-45'
    WHEN Age BETWEEN 46 AND 55 THEN '46-55'
    ELSE '56+'
  END
ORDER BY total_sales DESC;

SELECT * FROM user_age_group_analysis;

SELECT '2. User Value Segmentation' AS analysis_topic;
CREATE TABLE IF NOT EXISTS user_value_segmentation (
  user_value_segment STRING,
  user_count INT,
  total_transactions INT,
  total_spend_amount DECIMAL(12,2),
  avg_user_spend DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/user_value_segmentation';

-- FIX: in HiveQL a CTE must precede the INSERT statement
-- ("WITH ... INSERT OVERWRITE ... SELECT"); the original
-- "INSERT OVERWRITE ... WITH ... SELECT" order is a parse error.
WITH user_transaction_count AS (
  -- Per-user transaction count and lifetime spend.
  SELECT
    User_Name,
    COUNT(*) AS transaction_count,
    SUM(Purchase_Amount) AS total_spend
  FROM dwd_ecommerce_transactions
  GROUP BY User_Name
)
INSERT OVERWRITE TABLE user_value_segmentation
SELECT
  -- Segment thresholds are transaction counts per user over the whole period.
  CASE
    WHEN transaction_count >= 1000 THEN 'High_Value_1000+'
    WHEN transaction_count >= 500 THEN 'Mid_High_Value_500_999'
    WHEN transaction_count >= 100 THEN 'Mid_Value_100_499'
    ELSE 'Regular_Value_Below_100'
  END AS user_value_segment,
  COUNT(User_Name) AS user_count,
  SUM(transaction_count) AS total_transactions,
  SUM(total_spend) AS total_spend_amount,
  AVG(total_spend) AS avg_user_spend
FROM user_transaction_count
GROUP BY
  CASE
    WHEN transaction_count >= 1000 THEN 'High_Value_1000+'
    WHEN transaction_count >= 500 THEN 'Mid_High_Value_500_999'
    WHEN transaction_count >= 100 THEN 'Mid_Value_100_499'
    ELSE 'Regular_Value_Below_100'
  END
ORDER BY total_spend_amount DESC;

SELECT * FROM user_value_segmentation;

-- ==============================================
-- 3. Product Analysis
-- ==============================================
SELECT '3. Product Category Performance' AS analysis_topic;
CREATE TABLE IF NOT EXISTS product_category_performance (
  product_category STRING,
  sales_count INT,
  total_sales DECIMAL(12,2),
  category_aov DECIMAL(10,2),
  sales_contribution_pct DECIMAL(5,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/product_category_performance';

INSERT OVERWRITE TABLE product_category_performance
SELECT
  Product_Category,
  COUNT(*) AS sales_count,
  SUM(Purchase_Amount) AS total_sales,
  AVG(Purchase_Amount) AS category_aov,
  -- FIX: Hive does not support scalar subqueries in the SELECT list, so the
  -- original "(SELECT SUM(...) FROM dwd_...)" inline form fails to parse.
  -- A window aggregate over the grouped result (SUM of the per-category SUMs
  -- over an empty frame = grand total) computes the same share in one pass.
  (SUM(Purchase_Amount) / SUM(SUM(Purchase_Amount)) OVER ()) * 100 AS sales_contribution_pct
FROM dwd_ecommerce_transactions
GROUP BY Product_Category
ORDER BY total_sales DESC;

SELECT * FROM product_category_performance;

SELECT '3. Quarterly Product Category Trend' AS analysis_topic;
-- NOTE(review): `year`/`quarter` double as Hive function names; they are
-- accepted as column names here, but backtick-quoting would be safer.
CREATE TABLE IF NOT EXISTS quarterly_product_trend (
  product_category STRING,
  year INT,
  quarter INT,
  sales_count INT,
  quarterly_sales DECIMAL(12,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/quarterly_product_trend';

INSERT OVERWRITE TABLE quarterly_product_trend
SELECT
  Product_Category,
  YEAR(dt) AS year,
  QUARTER(dt) AS quarter,
  COUNT(*) AS sales_count,
  SUM(Purchase_Amount) AS quarterly_sales
FROM dwd_ecommerce_transactions
GROUP BY Product_Category, YEAR(dt), QUARTER(dt)
ORDER BY Product_Category, year, quarter;

SELECT * FROM quarterly_product_trend;

-- ==============================================
-- 4. Geography & Payment Method Analysis
-- ==============================================
SELECT '4. Country-wise Performance' AS analysis_topic;
CREATE TABLE IF NOT EXISTS country_performance (
  country STRING,
  transaction_count INT,
  total_sales DECIMAL(12,2),
  country_aov DECIMAL(10,2),
  active_user_count INT
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/country_performance';

INSERT OVERWRITE TABLE country_performance
SELECT
  Country,
  COUNT(*) AS transaction_count,
  SUM(Purchase_Amount) AS total_sales,
  AVG(Purchase_Amount) AS country_aov,
  COUNT(DISTINCT User_Name) AS active_user_count
FROM dwd_ecommerce_transactions
GROUP BY Country
ORDER BY total_sales DESC;

SELECT * FROM country_performance;

SELECT '4. Payment Method Preference by Country' AS analysis_topic;
CREATE TABLE IF NOT EXISTS payment_preference_by_country (
  country STRING,
  payment_method STRING,
  payment_count INT,
  country_payment_share_pct DECIMAL(5,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/payment_preference_by_country';

INSERT OVERWRITE TABLE payment_preference_by_country
SELECT
  Country,
  Payment_Method,
  COUNT(*) AS payment_count,
  -- Window over the grouped rows: SUM(COUNT(*)) partitioned by country is the
  -- country's total transaction count, so this is each payment method's
  -- within-country share as a percentage.
  (COUNT(*) / SUM(COUNT(*)) OVER (PARTITION BY Country)) * 100 AS country_payment_share_pct
FROM dwd_ecommerce_transactions
GROUP BY Country, Payment_Method
ORDER BY Country, payment_count DESC;

SELECT * FROM payment_preference_by_country;

-- ==============================================
-- Analysis Completed
-- ==============================================
SELECT 'All Business Analysis Completed' AS result;

执行:

复制代码
# Run the analysis via beeline and save the output to a file
beeline -u jdbc:hive2://node1:10000 -n Hadoop -f business_analysis.sql > business_analysis_result.txt

4、提交到git仓库(对应自己的文件)

复制代码
# 1. Enter the project root directory
cd /home/Hadoop/ecommerce_dw_project/

# 2. Stage the newly added files
git add scripts/spark_preprocess.py
git add scripts/hive_create_table.hql
git add scripts/basic_verification.sql
git add scripts/business_analysis.sql

# 3. Commit to the local repository (a descriptive message eases later tracing)
git commit -m "完成全流程:数据集上传+Spark预处理+Hive建模+业务分析"

# 4. Inspect recent history to confirm the commit landed
git log --oneline -n 5
# Expected: the newest entry shows the commit message above

案例小结

算是非常浅地过了一遍流程。

自我反思:像这样的流程只能算是粗略的,肯定还有很多技术或工具可以加速或更好地实现它。期间遇到的报错本可以用一些方法提前规避,但因为缺乏经验,只能低效率地挨个排查。

知识体系尚不成熟,没有指导时就像无头苍蝇,看起来像是一场自嗨;初学阶段还是需要有方向上的指导,否则效率低、成效小。

对于案例的启动,初心是过一遍流程,但是真正的项目是为了解决问题而来,意识到这点即可。

未来案例可以升级。

相关推荐
Cisyam^1 小时前
openGauss + LangChain Agent实战:从自然语言到SQL的智能数据分析助手
sql·数据分析·langchain
yumgpkpm2 小时前
腾讯TBDS和Cloud Data AI CMP 比较的缺陷在哪里?
hive·hadoop·elasticsearch·zookeeper·spark·kafka·hbase
CC数学建模2 小时前
被问爆的 “高颜值 + 强功能” 学生管理系统!Flask+MySQL 全栈开发,自带数据分析 + 幸福指标,毕设 / 竞赛直接
mysql·数据分析·flask
咚咚王者5 小时前
人工智能之数据分析 Matplotlib:第四章 图形类型
人工智能·数据分析·matplotlib
语落心生7 小时前
大宗供应链企业舆情指标系统设计(一)舆情指标设计
数据分析
语落心生7 小时前
餐饮供应链的数仓设计思考 (五) 系统稳定性与SLA保障体系
数据分析
语落心生7 小时前
餐饮供应链的数仓设计思考 (四) 餐饮连锁企业数据模型可解释性
数据分析
语落心生7 小时前
餐饮供应链的数仓设计思考 (三) 数据管道与核心系统API对接方案
数据分析
语落心生8 小时前
餐饮供应链的数仓设计思考 (二) 餐饮连锁企业深度业务模型分析
数据分析