hive电子商务消费行为分析
|-----------------------------------------------------------------------------------|
| 1. 掌握Zeppelin的使用 2. 了解数据结构 3 .数据清洗 4. 基于Hive的数据分析 |
1.物料准备
|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| (1)Customer表 |----------------------|--------------------------------------------| | customer_details | details | | customer_id | Int, 1 - 500 | | first_name | string | | last_name | string | | email | string, such as willddy@gmail.com | | gender | string, Male or female | | address | string | | country | string | | language | string | | job | string, job title/position | | credit_type | string, credit card type, such as visa | | credit_no | string, credit card number | (2)Store表 |---------------------|--------------------------------| | store_details | details | | store_id | Int, 1 - 5 | | store_name | string | | employee_number | Int, 在store有多少employee | (3)Review表 |---------------------|-------------------| | store_review | details | | stransaction_id | Int, 1 - 8000 | | store_id | Int, 1 - 5 | | review_store | Int, 1 - 5 | (4)Transaction表 |-------------------------|-----------------------------------| | transaction_details | details | | transaction_id | Int, 1 - 1000 | | customer_id | Int, 1 - 500 | | store_id | Int, 1 - 5 | | price | decimal, such as 5.08 | | product | string, things bought | | date | string, when to purchase | | time | string, what time to purchase | |
2. 实现数据清洗
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| 2.1 找出PII (personal information identification) 或PCI (personal confidential information) 数据进行加密或hash sql --数据加密处理 select * from ext_customer_details; create view if not exists vm_customer_details as select customer_id, first_name, unbase64(last_name) as last_name, unbase64(email) as email, gender, unbase64(address) as address, country, language, job, credit_type, unbase64(concat(unbase64(credit_no),'kb23')) as credit_no from ext_customer_details; show views ; show tables ; select * from vw_customer_details;
|
| 2.2 重新组织transaction数据按照日期YYYY-MM做分区,并对transaction_details中的重复数据生成新ID sql --交易详情表 按月创建分区表 create table if not exists transaction_details ( transaction_id string, customer_id string, store_id string, price decimal(8,2), product string, purchase_date string, purchase_time string ) partitioned by (purchase_month string); ------ext_transaction_details 重复数据生成新的id------------- --查询重复的数据 with base as ( select row_number() over (partition by transaction_id order by 1) as rn, * from ext_transaction_details ) select * from base where rn > 1; --打开动态分区 set hive.exec.dynamic.partition=true; set hive.exec.dynamic.partition.mode=nonstrict; --重复id处理方式 transaction_id_fix_rn --> 123456_fix_2 with base as ( select transaction_id, customer_id, store_id, price, product, purchase_date, purchase_time, from_unixtime(unix_timestamp(purchase_date, 'yyyy-MM-dd'),'yyyy-MM') as purchase_month, row_number() over (partition by transaction_id order by store_id) as rn from ext_transaction_details) insert overwrite table transaction_details partition (purchase_month) select `if`(rn=1,transaction_id, concat(transaction_id, '_fix_', rn)), customer_id, store_id, price, product, purchase_date, purchase_time,purchase_month from base; show partitions transaction_details;
|
| 2.3 过滤掉store_review中没有评分的数据 %hive use shopping; -- select * from ext_store_review where review_score == ''; create view vw_store_review as( select transaction_id, store_id, review_score from ext_store_review where review_score <> ''); |
3. Customer分析(zeppelin和datagrip查询)
|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| 3.1 找出顾客最常用的信用卡 %hive use shopping; select country, credit_type, count(distinct credit_no) as credit_cnt from vm_customer_details group by country,credit_type order by credit_cnt desc; |
| 3.2 找出客户资料中排名前五的职位名称 %hive use shopping; select job, count(1) as pn from vw_customer_details group by job order by pn desc limit 10; |
| 3.3 在美国女性最常用的信用卡 %hive use shopping; select credit_type, count(1) as credit_cnt from vw_customer_details where country='United States' and gender = 'Female' group by credit_type order by credit_cnt desc limit 3; |
| 3.4 按性别和国家进行客户统计 %hive use shopping; select country, gender, count(1) as cn from vw_customer_details group by country,gender order by cn desc limit 3; |
4. Transaction分析
|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| 4.1 计算每月总收入 4.2 计算每个季度的总收入 4.3 按年计算总收入 4.4 按工作日计算总收入 4.5 按时间段计算总收入(需要清理数据) 4.6 按时间段计算平均消费 4.4 按工作日计算平均消费 4.8 计算年、月、日的交易总数 4.9 找出交易量最大的10个客户 4.10 找出消费最多的前10位顾客 sql -- 1计算每月总收入 select purchase_month, sum(price) as monthSUM from transaction_details group by purchase_month;
sql -- 2计算每季度总收入 with base as ( select price, concat_ws('-',substr(purchase_date,1,4),cast(ceil(month(purchase_date)/3.0) as string))as year_quarter from transaction_details) select year_quarter, sum(price) as year_quarterSUM from base group by year_quarter; -- 3计算每年度总收入 select substr(purchase_date,1,4), sum(price) as yearSUM from transaction_details group by substr(purchase_date,1,4); -- 4计算工作日收入 select `dayofweek`(purchase_date), sum(price) as daySUM from transaction_details group by `dayofweek`(purchase_date) having `dayofweek`(purchase_date) in (2,3,4,5,6); -- 5按时间段计算总收入(需要清理数据) with base as ( select price,purchase_time, if(purchase_time like '%M',from_unixtime(unix_timestamp(purchase_time,'hh:mm aa'),'HH:mm'),purchase_time) as time_format from transaction_details), timeformat as ( select price, purchase_time, time_format, (cast(split(time_format,':')[0] as decimal(4,2))+ cast(split(time_format,':')[1] as decimal(4,2))/60) as purchase_time_in_hr from base), t1 as (select price, purchase_time, time_format, purchase_time_in_hr, `if`(purchase_time_in_hr > 5 and purchase_time_in_hr <= 8, 'early morning', `if`(purchase_time_in_hr > 8 and purchase_time_in_hr <= 11, 'morning', `if`(purchase_time_in_hr > 11 and purchase_time_in_hr <= 13, 'noon', `if`(purchase_time_in_hr > 13 and purchase_time_in_hr <= 18, 'afternoon', `if`(purchase_time_in_hr > 18 and purchase_time_in_hr <= 22, 'evening', 'night'))))) as time_bucket from timeformat) select time_bucket, sum(price) bucketSUM, avg(price) avgprice from t1 group by time_bucket ;
sql -- 6按时间段计算平均消费 -- 7按工作日计算平均消费 select `dayofweek`(purchase_date), avg(price) as dayAVG from transaction_details where `dayofweek`(purchase_date) in (2,3,4,5,6) group by `dayofweek`(purchase_date); -- 8计算年、月、日的交易总数 select year(purchase_date) currentYear, count(1) as sumCount from transaction_details group by year(purchase_date); select substring(purchase_date,1,7) currentMonth, count(1) as sumCount from transaction_details group by substring(purchase_date,1,7); select day(purchase_date) currentDay, count(1) as sumCount from transaction_details group by day(purchase_date); -- 9找出交易量最大的10个客户 with base as ( select customer_id,count(transaction_id) as trans_cnt,sum(price) as customerSUM from transaction_details group by customer_id), cust_detail as( select concat_ws(' ', cd.first_name, '***') as cust_name, base.* , dense_rank() over (order by trans_cnt desc ) rn from base join vw_customer_details cd on base.customer_id=cd.customer_id) select * from cust_detail where rn<=10; -- 10找出消费最多的前10位顾客 with base as ( select customer_id,sum(price) as customerSUM from transaction_details group by customer_id), cust_detail as( select concat_ws(' ', cd.first_name, '***') as cust_name, base.* , dense_rank() over (order by customerSUM desc ) rn from base join vw_customer_details cd on base.customer_id=cd.customer_id) select * from cust_detail where rn<=10;
|
| Mysql 中建表查询 7.11 统计该期间交易数量最少的用户 7.12 计算每个季度的独立客户总数 7.13 计算每周的独立客户总数 7.14 计算整个活动客户平均花费的最大值 7.15 统计每月花费最多的客户 7.16 统计每月访问次数最多的客户 7.17 按总价找出最受欢迎的5种产品 7.18 根据购买频率找出最畅销的5种产品 7.19 根据客户数量找出最受欢迎的5种产品 sql -- 7.11统计该期间交易数量最少的用户 WITH t1 AS(SELECT customer_id,COUNT(1) c FROM transaction_details GROUP BY customer_id ORDER BY c), t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY t1.c) as rn FROM t1) SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1; -- 7.12计算每个季度的独立客户总数 SELECT DISTINCT COUNT(customer_id), concat_ws('-',substr(purchase_date,1,4),cast(ceil(month(purchase_date)/3.0) as CHAR(20)))as year_quarter from transaction_details GROUP BY year_quarter ORDER BY (year_quarter+0); -- 7.13计算每周的独立客户总数 SELECT DISTINCT COUNT(customer_id), concat_ws('-',substr(purchase_date,1,4),cast(WEEKOFYEAR(purchase_date) as CHAR(20)))as week_year from transaction_details GROUP BY week_year; -- 7.14计算整个活动客户平均花费的最大值 WITH t1 AS(SELECT customer_id, avg(price) as costAVG FROM transaction_details GROUP BY customer_id), t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY costAVG DESC) as rn FROM t1) SELECT * FROM t2 WHERE rn=1; -- 7.15统计每月花费最多的客户 WITH t1 AS(SELECT purchase_month,customer_id, sum(price) as costSUM FROM transaction_details GROUP BY purchase_month,customer_id), t2 AS(SELECT t1.*,DENSE_RANK() over(PARTITION by purchase_month ORDER BY costSUM DESC) as rn FROM t1) SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1; -- 7.16统计每月访问次数最多的客户 WITH t1 AS(SELECT purchase_month,customer_id, count(customer_id) as costCount FROM transaction_details GROUP BY purchase_month,customer_id), t2 AS(SELECT t1.*,DENSE_RANK() over(PARTITION by purchase_month ORDER BY costCount DESC) as rn FROM t1) SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1; -- 7.17按总价找出最受欢迎的5种产品 WITH t1 AS(SELECT product,sum(price) as proSUM FROM transaction_details GROUP BY product), t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proSUM DESC) as rn FROM t1) SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5; -- 7.18根据购买频率找出最畅销的5种产品 WITH t1 AS(SELECT product,COUNT(product) as proCount FROM transaction_details GROUP BY product), t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proCount DESC) as rn FROM t1) SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5; -- 7.19根据客户数量找出最受欢迎的5种产品 WITH t1 AS(SELECT product,COUNT(customer_id) as proCount FROM transaction_details GROUP BY product), t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proCount DESC) as rn FROM t1) SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5;
|