Step 1:将数据从MySQL迁移到HDFS
mysql_to_hdfs.sh
注意:这里sqoop数据迁移 连接的MySQL地址 要仔细比照,不要用老师原有的那个ip,否则就会出现连接被拒绝访问,出现同步半天hdfs那边什么都没有的情况。要用集群主机的那个ip
为了用户下单访问快,数据存在MySQL数据库上。数据分析的时候,需要把今天和过去非常多天的数据放在一起 所以数据量大 需要放在Hadoop的hive上分析。所以需要把数据从MySQL上迁移到HDFS上。
bash
#! /bin/bash
# mysql_to_hdfs.sh — nightly Sqoop export of the duoduo_db MySQL tables into
# HDFS staging directories (/origin_data/duoduo_db/db/<table>/<date>).
# Usage: mysql_to_hdfs.sh <table_name|first|all> [yyyy-MM-dd]
sqoop=/bigdata/sqoop-1.4.7.bin__hadoop-2.6.0/bin/sqoop
# Backticks run a Linux command and capture its stdout into the variable.
# `date -d '-1 day' +%F` prints yesterday's date as yyyy-MM-dd:
#   -d '-1 day' shifts one day back ('+1 day' would shift forward);
#   +%F is the fixed yyyy-MM-dd output format.
do_date=`date -d '-1 day' +%F`
# If a second argument (an explicit business date) was supplied, use it;
# otherwise keep the default "yesterday" computed above.
if [[ -n "$2" ]]; then
do_date=$2
fi
# Unlike Python, a bash function does not declare its parameters up front;
# they are read positionally inside the body:
#   $1 = MySQL table name (also the HDFS staging subdirectory)
#   $2 = free-form SELECT statement ending in a WHERE clause
# Sqoop requires "$CONDITIONS" inside a --query import so it can inject its
# split predicate; with --num-mappers 1 it is effectively a no-op.
# --delete-target-dir makes the import idempotent for re-runs of a day.
# NOTE(review): the password is passed in plain text on the command line
# (visible in `ps` output and shell history) — consider --password-file.
import_data(){
$sqoop import \
--connect jdbc:mysql://192.168.10.130:3306/duoduo_db \
--username root \
--password Mzp_2022! \
--target-dir /origin_data/duoduo_db/db/$1/$do_date \
--delete-target-dir \
--query "$2 and \$CONDITIONS" \
--num-mappers 1 \
--fields-terminated-by '\t'
}
# ---- Daily incremental imports ---------------------------------------------
# Each query pulls only the rows whose business timestamp(s) fall on $do_date.
# NOTE(review): date_format(col,...)='$do_date' applies a function to the
# column, so MySQL cannot use an index on it; a half-open range predicate on
# the raw timestamp would be sargable — confirm before changing.
import_order_info(){
import_data order_info "select
id,
final_total_amount,
order_status,
user_id,
out_trade_no,
create_time,
operate_time,
province_id,
benefit_reduce_amount,
original_total_amount,
feight_fee
from order_info
where (date_format(create_time,'%Y-%m-%d')='$do_date'
or date_format(operate_time,'%Y-%m-%d')='$do_date')"
}
# Coupon usage: a row is picked up on the day it was claimed, used at order
# time, or used at payment time.
import_coupon_use(){
import_data coupon_use "select
id,
coupon_id,
user_id,
order_id,
coupon_status,
get_time,
using_time,
used_time
from coupon_use
where (date_format(get_time,'%Y-%m-%d')='$do_date'
or date_format(using_time,'%Y-%m-%d')='$do_date'
or date_format(used_time,'%Y-%m-%d')='$do_date')"
}
import_order_status_log(){
import_data order_status_log "select
id,
order_id,
order_status,
operate_time
from order_status_log
where date_format(operate_time,'%Y-%m-%d')='$do_date'"
}
import_activity_order(){
import_data activity_order "select
id,
activity_id,
order_id,
create_time
from activity_order
where (date_format(create_time,'%Y-%m-%d')='$do_date')"
}
import_user_info(){
import_data "user_info" "select
id,
name,
birthday,
gender,
email,
user_level,
create_time,
operate_time
from user_info
where (DATE_FORMAT(create_time,'%Y-%m-%d')='$do_date'
or DATE_FORMAT(operate_time,'%Y-%m-%d')='$do_date')"
}
# Joins order_info so only details of orders present in order_info are pulled;
# od.* prefixes disambiguate columns that exist in both tables.
import_order_detail(){
import_data order_detail "select
od.id,
order_id,
user_id,
sku_id,
sku_name,
order_price,
sku_num,
od.create_time
from order_detail od
join order_info oi
on od.order_id=oi.id
where DATE_FORMAT(od.create_time,'%Y-%m-%d')='$do_date'"
}
import_payment_info(){
import_data "payment_info" "select
id,
out_trade_no,
order_id,
user_id,
alipay_trade_no,
total_amount,
subject,
payment_type,
payment_time
from payment_info
where DATE_FORMAT(payment_time,'%Y-%m-%d')='$do_date'"
}
import_comment_info(){
import_data comment_info "select
id,
user_id,
sku_id,
spu_id,
order_id,
appraise,
create_time
from comment_info
where date_format(create_time,'%Y-%m-%d')='$do_date'"
}
import_order_refund_info(){
import_data order_refund_info "select
id,
user_id,
order_id,
sku_id,
refund_type,
refund_num,
refund_amount,
refund_reason_type,
create_time
from order_refund_info
where date_format(create_time,'%Y-%m-%d')='$do_date'"
}
# ---- Full-table imports ----------------------------------------------------
# "where 1=1" keeps the query syntactically valid when import_data appends
# "and \$CONDITIONS"; these tables are re-imported in full on every run.
import_sku_info(){
import_data sku_info "select
id,
spu_id,
price,
sku_name,
sku_desc,
weight,
tm_id,
category3_id,
create_time
from sku_info where 1=1"
}
import_base_category1(){
import_data "base_category1" "select
id,
name
from base_category1 where 1=1"
}
import_base_category2(){
import_data "base_category2" "select
id,
name,
category1_id
from base_category2 where 1=1"
}
import_base_category3(){
import_data "base_category3" "select
id,
name,
category2_id
from base_category3 where 1=1"
}
import_base_province(){
import_data base_province "select
id,
name,
region_id,
area_code,
iso_code
from base_province
where 1=1"
}
import_base_region(){
import_data base_region "select
id,
region_name
from base_region
where 1=1"
}
import_base_trademark(){
import_data base_trademark "select
tm_id,
tm_name
from base_trademark
where 1=1"
}
import_spu_info(){
import_data spu_info "select
id,
spu_name,
category3_id,
tm_id
from spu_info
where 1=1"
}
import_favor_info(){
import_data favor_info "select
id,
user_id,
sku_id,
spu_id,
is_cancel,
create_time,
cancel_time
from favor_info
where 1=1"
}
import_cart_info(){
import_data cart_info "select
id,
user_id,
sku_id,
cart_price,
sku_num,
sku_name,
create_time,
operate_time,
is_ordered,
order_time
from cart_info
where 1=1"
}
import_coupon_info(){
import_data coupon_info "select
id,
coupon_name,
coupon_type,
condition_amount,
condition_num,
activity_id,
benefit_amount,
benefit_discount,
create_time,
range_type,
spu_id,
tm_id,
category3_id,
limit_num,
operate_time,
expire_time
from coupon_info
where 1=1"
}
import_activity_info(){
import_data activity_info "select
id,
activity_name,
activity_type,
start_time,
end_time,
create_time
from activity_info
where 1=1"
}
import_activity_rule(){
import_data activity_rule "select
id,
activity_id,
condition_amount,
condition_num,
benefit_amount,
benefit_discount,
benefit_level
from activity_rule
where 1=1"
}
import_base_dic(){
import_data base_dic "select
dic_code,
dic_name,
parent_code,
create_time,
operate_time
from base_dic
where 1=1"
}
# Dispatch on the first argument: a single table name imports just that
# table; "first" performs the initial load of every table, including the
# static base_province / base_region dimensions; "all" is the routine daily
# load, which deliberately skips those two static dimension tables.
case $1 in
"order_info")
import_order_info
;;
"base_category1")
import_base_category1
;;
"base_category2")
import_base_category2
;;
"base_category3")
import_base_category3
;;
"order_detail")
import_order_detail
;;
"sku_info")
import_sku_info
;;
"user_info")
import_user_info
;;
"payment_info")
import_payment_info
;;
"base_province")
import_base_province
;;
"base_region")
import_base_region
;;
"base_trademark")
import_base_trademark
;;
"activity_info")
import_activity_info
;;
"activity_order")
import_activity_order
;;
"cart_info")
import_cart_info
;;
"comment_info")
import_comment_info
;;
"coupon_info")
import_coupon_info
;;
"coupon_use")
import_coupon_use
;;
"favor_info")
import_favor_info
;;
"order_refund_info")
import_order_refund_info
;;
"order_status_log")
import_order_status_log
;;
"spu_info")
import_spu_info
;;
"activity_rule")
import_activity_rule
;;
"base_dic")
import_base_dic
;;
"first")
import_base_category1
import_base_category2
import_base_category3
import_order_info
import_order_detail
import_sku_info
import_user_info
import_payment_info
import_base_province
import_base_region
import_base_trademark
import_activity_info
import_activity_order
import_cart_info
import_comment_info
import_coupon_use
import_coupon_info
import_favor_info
import_order_refund_info
import_order_status_log
import_spu_info
import_activity_rule
import_base_dic
;;
"all")
import_base_category1
import_base_category2
import_base_category3
import_order_info
import_order_detail
import_sku_info
import_user_info
import_payment_info
import_base_trademark
import_activity_info
import_activity_order
import_cart_info
import_comment_info
import_coupon_use
import_coupon_info
import_favor_info
import_order_refund_info
import_order_status_log
import_spu_info
import_activity_rule
import_base_dic
;;
esac
echo "数据同步完成。"
Step 2:创建ODS层表
bash
#!/bin/bash
# Recreates the duoduo_db Hive database and its ODS-layer tables.
# WARNING: "drop database ... cascade" wipes the entire database (all tables
# and metadata) before recreating it — this is an initialization script.
# Every table's LOCATION pins its storage to /origin_data/duoduo_db/ods/ods_<table>/.
hive=/bigdata/apache-hive-2.3.3-bin/bin/hive
sql="
drop database if exists duoduo_db cascade;
create database duoduo_db;
use duoduo_db;
drop table if exists ods_order_info;
create external table ods_order_info(
id string COMMENT '订单号',
final_total_amount decimal(10,2) COMMENT '订单金额',
order_status string COMMENT '订单状态',
user_id string COMMENT '用户id',
out_trade_no string COMMENT '支付流水号',
create_time string COMMENT '创建时间',
operate_time string COMMENT '操作时间',
province_id string COMMENT '省份ID',
benefit_reduce_amount decimal(10,2) COMMENT '优惠金额',
original_total_amount decimal(10,2) COMMENT '原价金额',
feight_fee decimal(10,2) COMMENT '运费'
) COMMENT '订单表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_order_info/';
drop table if exists ods_order_detail;
create external table ods_order_detail(
id string COMMENT '订单编号',
order_id string COMMENT '订单号',
user_id string COMMENT '用户id',
sku_id string COMMENT '商品id',
sku_name string COMMENT '商品名称',
order_price decimal(10,2) COMMENT '商品价格',
sku_num bigint COMMENT '商品数量',
create_time string COMMENT '创建时间'
) COMMENT '订单详情表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_order_detail/';
drop table if exists ods_sku_info;
create external table ods_sku_info(
id string COMMENT 'skuId',
spu_id string COMMENT 'spuid',
price decimal(10,2) COMMENT '价格',
sku_name string COMMENT '商品名称',
sku_desc string COMMENT '商品描述',
weight string COMMENT '重量',
tm_id string COMMENT '品牌id',
category3_id string COMMENT '品类id',
create_time string COMMENT '创建时间'
) COMMENT 'SKU商品表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_sku_info/';
drop table if exists ods_user_info;
create external table ods_user_info(
id string COMMENT '用户id',
name string COMMENT '姓名',
birthday string COMMENT '生日',
gender string COMMENT '性别',
email string COMMENT '邮箱',
user_level string COMMENT '用户等级',
create_time string COMMENT '创建时间',
operate_time string COMMENT '操作时间'
) COMMENT '用户表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_user_info/';
drop table if exists ods_base_category1;
create external table ods_base_category1(
id string COMMENT 'id',
name string COMMENT '名称'
) COMMENT '商品一级分类表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_base_category1/';
drop table if exists ods_base_category2;
-- Fixed two inconsistencies with every sibling ODS table: the table was
-- declared managed (missing "external", so a re-run's DROP would delete the
-- HDFS data), and its location lacked the ods_ prefix required by the
-- /origin_data/duoduo_db/ods/ods_<table>/ convention.
create external table ods_base_category2(
id string COMMENT 'id',
name string COMMENT '名称',
category1_id string COMMENT '一级品类id'
) COMMENT '商品二级分类表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_base_category2/';
drop table if exists ods_base_category3;
create external table ods_base_category3(
id string COMMENT ' id',
name string COMMENT '名称',
category2_id string COMMENT '二级品类id'
) COMMENT '商品三级分类表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_base_category3/';
drop table if exists ods_payment_info;
create external table ods_payment_info(
id bigint COMMENT '编号',
out_trade_no string COMMENT '对外业务编号',
order_id string COMMENT '订单编号',
user_id string COMMENT '用户编号',
alipay_trade_no string COMMENT '支付宝交易流水编号',
total_amount decimal(16,2) COMMENT '支付金额',
subject string COMMENT '交易内容',
payment_type string COMMENT '支付类型',
payment_time string COMMENT '支付时间'
) COMMENT '支付流水表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_payment_info/';
-- base_province / base_region are static dimension tables that do not change
-- day to day, so they carry no dt partition (loaded once, not daily).
drop table if exists ods_base_province;
create external table ods_base_province (
id bigint COMMENT '编号',
name string COMMENT '省份名称',
region_id string COMMENT '地区ID',
area_code string COMMENT '地区编码',
iso_code string COMMENT 'iso编码'
) COMMENT '省份表'
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_base_province/';
drop table if exists ods_base_region;
create external table ods_base_region (
id bigint COMMENT '编号',
region_name string COMMENT '地区名称'
) COMMENT '地区表'
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_base_region/';
drop table if exists ods_base_trademark;
create external table ods_base_trademark (
tm_id bigint COMMENT '编号',
tm_name string COMMENT '品牌名称'
) COMMENT '品牌表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_base_trademark/';
drop table if exists ods_order_status_log;
create external table ods_order_status_log (
id bigint COMMENT '编号',
order_id string COMMENT '订单ID',
order_status string COMMENT '订单状态',
operate_time string COMMENT '修改时间'
) COMMENT '订单状态表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_order_status_log/';
drop table if exists ods_spu_info;
create external table ods_spu_info(
id string COMMENT 'spuid',
spu_name string COMMENT 'spu名称',
category3_id string COMMENT '品类id',
tm_id string COMMENT '品牌id'
) COMMENT 'SPU商品表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_spu_info/';
drop table if exists ods_comment_info;
create external table ods_comment_info(
id string COMMENT '编号',
user_id string COMMENT '用户ID',
sku_id string COMMENT '商品sku',
spu_id string COMMENT '商品spu',
order_id string COMMENT '订单ID',
appraise string COMMENT '评价',
create_time string COMMENT '评价时间'
) COMMENT '商品评论表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_comment_info/';
drop table if exists ods_order_refund_info;
create external table ods_order_refund_info(
id string COMMENT '编号',
user_id string COMMENT '用户ID',
order_id string COMMENT '订单ID',
sku_id string COMMENT '商品ID',
refund_type string COMMENT '退款类型',
refund_num bigint COMMENT '退款件数',
refund_amount decimal(16,2) COMMENT '退款金额',
refund_reason_type string COMMENT '退款原因类型',
create_time string COMMENT '退款时间'
) COMMENT '退单表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_order_refund_info/';
drop table if exists ods_cart_info;
create external table ods_cart_info(
id string COMMENT '编号',
user_id string COMMENT '用户id',
sku_id string COMMENT 'skuid',
cart_price string COMMENT '放入购物车时价格',
sku_num string COMMENT '数量',
sku_name string COMMENT 'sku名称 (冗余)',
create_time string COMMENT '创建时间',
operate_time string COMMENT '修改时间',
is_ordered string COMMENT '是否已经下单',
order_time string COMMENT '下单时间'
) COMMENT '加购表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_cart_info/';
drop table if exists ods_favor_info;
create external table ods_favor_info(
id string COMMENT '编号',
user_id string COMMENT '用户id',
sku_id string COMMENT 'skuid',
spu_id string COMMENT 'spuid',
is_cancel string COMMENT '是否取消',
create_time string COMMENT '收藏时间',
cancel_time string COMMENT '取消时间'
) COMMENT '商品收藏表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_favor_info/';
drop table if exists ods_coupon_use;
-- Fixed copy-paste column comments: user_id/order_id were labelled
-- 'skuid'/'spuid', which mislabels the table metadata in the Hive catalog.
create external table ods_coupon_use(
id string COMMENT '编号',
coupon_id string COMMENT '优惠券ID',
user_id string COMMENT '用户id',
order_id string COMMENT '订单id',
coupon_status string COMMENT '优惠券状态',
get_time string COMMENT '领取时间',
using_time string COMMENT '使用时间(下单)',
used_time string COMMENT '使用时间(支付)'
) COMMENT '优惠券领用表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_coupon_use/';
-- ods_coupon_info / ods_activity_info: definition tables, re-imported in
-- full each day into the dt partition (see the Sqoop script's "where 1=1").
drop table if exists ods_coupon_info;
create external table ods_coupon_info(
id string COMMENT '购物券编号',
coupon_name string COMMENT '购物券名称',
coupon_type string COMMENT '购物券类型 1 现金券 2 折扣券 3 满减券 4 满件打折券',
condition_amount string COMMENT '满额数',
condition_num string COMMENT '满件数',
activity_id string COMMENT '活动编号',
benefit_amount string COMMENT '减金额',
benefit_discount string COMMENT '折扣',
create_time string COMMENT '创建时间',
range_type string COMMENT '范围类型 1、商品 2、品类 3、品牌',
spu_id string COMMENT '商品id',
tm_id string COMMENT '品牌id',
category3_id string COMMENT '品类id',
limit_num string COMMENT '最多领用次数',
operate_time string COMMENT '修改时间',
expire_time string COMMENT '过期时间'
) COMMENT '优惠券表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_coupon_info/';
drop table if exists ods_activity_info;
create external table ods_activity_info(
id string COMMENT '编号',
activity_name string COMMENT '活动名称',
activity_type string COMMENT '活动类型',
start_time string COMMENT '开始时间',
end_time string COMMENT '结束时间',
create_time string COMMENT '创建时间'
) COMMENT '活动表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_activity_info/';
drop table if exists ods_activity_order;
-- Fixed copy-paste column comments (were '优惠券ID'/'skuid'/'领取时间',
-- copied from the coupon table) to match the actual columns.
create external table ods_activity_order(
id string COMMENT '编号',
activity_id string COMMENT '活动id',
order_id string COMMENT '订单id',
create_time string COMMENT '创建时间'
) COMMENT '活动订单关联表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_activity_order/';
drop table if exists ods_activity_rule;
create external table ods_activity_rule(
id string COMMENT '编号',
activity_id string COMMENT '活动ID',
condition_amount string COMMENT '满减金额',
condition_num string COMMENT '满减件数',
benefit_amount string COMMENT '优惠金额',
benefit_discount string COMMENT '优惠折扣',
benefit_level string COMMENT '优惠级别'
) COMMENT '优惠规则表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_activity_rule/';
-- ods_base_dic: shared code-dictionary table (编码字典表).
drop table if exists ods_base_dic;
create external table ods_base_dic(
dic_code string COMMENT '编号',
dic_name string COMMENT '编码名称',
parent_code string COMMENT '父编码',
create_time string COMMENT '创建日期',
operate_time string COMMENT '操作日期'
) COMMENT '编码字典表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_base_dic/';
"
# Run the DDL with the hive binary configured at the top of the script.
# (Previously this invoked bare "hive", ignoring the $hive variable and
# silently depending on hive being on PATH.)
$hive -e "$sql"
echo "建库建表完成。"
ODS层所有的建表SQL语句都会在结尾加一句"location '/origin_data/duoduo_db/ods/ods_order_info/'; " 这句话指定Hive上存储的数据会被存在hdfs上哪个位置。
Step 3:将HDFS数据加载进ODS层的Hive数据库
bash
#!/bin/bash
# Loads the staged HDFS files (produced by the Sqoop script) into the
# ODS-layer Hive tables, one dt partition per business date.
APP=duoduo_db
hive=/bigdata/apache-hive-2.3.3-bin/bin/hive
# Business date: use the explicit second argument when given, otherwise
# default to yesterday.
if [[ -n "$2" ]]; then
do_date=$2
else
do_date=`date -d "-1 day" +%F`
fi
# load data inpath ...  : HDFS path of the staged data files to load
# OVERWRITE into table  : destination Hive table (replaces that partition)
# partition(dt=...)     : dt is a virtual partition column — it is NOT a
#   field inside the data files; Hive records the value in the partition
#   directory name (dt=<date>) and serves it back at query time, which is
#   how each day's files are kept isolated in their own partition.
# ods_base_province / ods_base_region do not change with the date, so they
# are loaded without a partition (sql2) and only on the "first" run.
sql1="
load data
inpath '/origin_data/$APP/db/order_info/$do_date'
OVERWRITE into table ${APP}.ods_order_info
partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/order_detail/$do_date' OVERWRITE into table ${APP}.ods_order_detail partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/sku_info/$do_date' OVERWRITE into table ${APP}.ods_sku_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/user_info/$do_date' OVERWRITE into table ${APP}.ods_user_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/payment_info/$do_date' OVERWRITE into table ${APP}.ods_payment_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_category1/$do_date' OVERWRITE into table ${APP}.ods_base_category1 partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_category2/$do_date' OVERWRITE into table ${APP}.ods_base_category2 partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_category3/$do_date' OVERWRITE into table ${APP}.ods_base_category3 partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_trademark/$do_date' OVERWRITE into table ${APP}.ods_base_trademark partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/activity_info/$do_date' OVERWRITE into table ${APP}.ods_activity_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/activity_order/$do_date' OVERWRITE into table ${APP}.ods_activity_order partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/cart_info/$do_date' OVERWRITE into table ${APP}.ods_cart_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/comment_info/$do_date' OVERWRITE into table ${APP}.ods_comment_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/coupon_info/$do_date' OVERWRITE into table ${APP}.ods_coupon_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/coupon_use/$do_date' OVERWRITE into table ${APP}.ods_coupon_use partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/favor_info/$do_date' OVERWRITE into table ${APP}.ods_favor_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/order_refund_info/$do_date' OVERWRITE into table ${APP}.ods_order_refund_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/order_status_log/$do_date' OVERWRITE into table ${APP}.ods_order_status_log partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/spu_info/$do_date' OVERWRITE into table ${APP}.ods_spu_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/activity_rule/$do_date' OVERWRITE into table ${APP}.ods_activity_rule partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_dic/$do_date' OVERWRITE into table ${APP}.ods_base_dic partition(dt='$do_date');
"
sql2="
load data inpath '/origin_data/$APP/db/base_province/$do_date' OVERWRITE into table ${APP}.ods_base_province;
load data inpath '/origin_data/$APP/db/base_region/$do_date' OVERWRITE into table ${APP}.ods_base_region;
"
# Dispatch on the first argument: "first" additionally loads the two static
# dimension tables (sql2); "all" performs only the daily partitioned loads.
# (Conventional case-arm syntax, matching the Sqoop script's style.)
case $1 in
"first")
    $hive -e "$sql1"
    $hive -e "$sql2"
    ;;
"all")
    $hive -e "$sql1"
    ;;
esac
echo "数据加载完成。"
【问1】:请问这句把HDFS上的数据加载进Hive的代码,会把数据文件从'/origin_data/$APP/db/order_detail/$do_date' 移动到'/origin_data/$APP/ods/ods_order_detail/dt=$do_date'去吗?是剪切还是复制?剪切到哪个位置,如何设定?
- 你的 SQL 执行后,一定会发生「剪切」
执行这句命令后,HDFS 上 /origin_data/$APP/db/order_detail/$do_date 目录下的所有数据文件 ,会被彻底剪切(move) 到 Hive 表 ${APP}.ods_order_detail 对应分区 dt='$do_date' 的 HDFS 存储目录下,原目录的数据文件会被清空。
为什么是「剪切」不是「复制」?【关键核心原理】
原因:HDFS 是分布式文件系统,文件在集群内移动的「IO 成本极低」(本质只是修改 NameNode 的元数据,DataNode 的真实数据块不用动),Hive 设计时为了节省集群存储、避免重复数据,对 HDFS 内的 load 操作默认用「剪切」,这是 Hive 的最优设计。
数据文件被「剪切到哪里」?由谁决定?【核心规则】
✅ 数据最终存储路径 = Hive 表的「存储位置」 + "分区字段 = 分区值"
问:数据存入ODS层Hive数据库,那请问数据文件存在HDFS的什么位置?
答:
ODS层所有的建表语句都会在结尾加一句"location '/origin_data/duoduo_db/ods/ods_order_info/'; " 这句话指定Hive上存储的数据会被存在hdfs上哪个位置。
也就是'/origin_data/duoduo_db/ods/ods_表名/';
sql
create external table ods_order_info(
id string COMMENT '订单号',
final_total_amount decimal(10,2) COMMENT '订单金额',
order_status string COMMENT '订单状态',
user_id string COMMENT '用户id',
out_trade_no string COMMENT '支付流水号',
create_time string COMMENT '创建时间',
operate_time string COMMENT '操作时间',
province_id string COMMENT '省份ID',
benefit_reduce_amount decimal(10,2) COMMENT '优惠金额',
original_total_amount decimal(10,2) COMMENT '原价金额',
feight_fee decimal(10,2) COMMENT '运费'
) COMMENT '订单表'
PARTITIONED BY (dt string)
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/ods/ods_order_info/';
Hive 的分区表有固定的分区目录命名规范 :分区字段名=分区值,这是 Hive 的元数据约定,不可修改。你的表是按 dt 分区,分区值是 $do_date,所以分区目录就是 dt=$do_date。
【问2】:为什么你这里加载数据的时候选择用overwrite覆盖写入,而不是使用 没有overwrite的追加写入?
sql
load data inpath
'/origin_data/$APP/db/order_info/$do_date'
OVERWRITE into
table ${APP}.ods_order_info
partition(dt='$do_date');
答:
你的场景是 ODS 层(数仓贴源层)加载订单明细,用 OVERWRITE 是行业标准写法,因为 ODS 层一般按天全量同步,当天的数据需要覆盖当天的分区,避免重复数据。下一天的数据又写在新的dt分区里,不会干扰前面日期的数据。
【问】:
数据导入后代码,没有报错,是不是意味着数据导入就成功了?没有错误?所以要如何检查数据是否出错了?
LOAD DATA 只做「文件移动」,不做「数据校验」
Hive 的 LOAD DATA 是一个纯文件级别的操作 ,它只会把文件剪切 / 复制到表的存储目录,不会校验文件的内容和表的字段是否匹配!
- 比如:你的表定义了 3 个字段,但数据文件里有 5 列,LOAD 操作依然成功;
- 比如:数据文件是乱码、格式错误,LOAD 操作也依然成功;
- 只有当你执行
select * from ods_order_detail where dt='$do_date'查询时,才会发现数据解析异常。