Big data processing workflow
1. Import the tables from the cloud MySQL instance into HDFS with Sqoop
bash
sqoop import \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table price_data \
--target-dir /test/agriculture/price_data \
-m 1
bash
# Append attempt (this did not work)
sqoop import \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table price_data \
--target-dir /user/hive/warehouse/agriculture.db/price_data \
-m 1 \
--append
A classic.
bash
# Full import (the Hive table does not exist yet; it is created automatically during import)
sqoop import \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table variety_regional_average \
--hive-import \
--hive-database agriculture \
--create-hive-table \
--fields-terminated-by ',' \
--null-string '\N' \
--null-non-string '\N' \
-m 1
sqoop import \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table variety_national_average \
--hive-import \
--hive-database agriculture \
--create-hive-table \
--fields-terminated-by ',' \
--null-string '\N' \
--null-non-string '\N' \
-m 1
2. After connecting to Spark from IDEA, make the tables in HDFS readable by Spark
If Spark cannot be reached, start the Thrift server service:
bash
# Start the service
start-thriftserver.sh
# Stop the service
stop-thriftserver.sh
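Once the Thrift server is up, a quick smoke test over the IDEA/JDBC connection confirms that Spark can see the warehouse (a minimal sketch; it assumes the agriculture database from section 2.1 already exists):
sql
-- Run over the JDBC/Thrift connection to confirm the session works
SHOW DATABASES;
SHOW TABLES IN agriculture;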
Deleting files in HDFS
bash
hdfs dfs -rm -r /user/hive/warehouse/agriculture.db/variety_market_average
# Use the -skipTrash option to delete the files permanently instead of moving them to the trash
hdfs dfs -rm -r -skipTrash /test/agriculture/price_data
hdfs dfs -rm -r -skipTrash /user/hive/warehouse/agriculture.db/variety_national_average
hdfs dfs -rm -r -skipTrash /user/hive/warehouse/agriculture.db/variety_regional_average
hdfs dfs -rm -r -skipTrash /user/hive/warehouse/agriculture.db/weekly_paper_data200
hdfs dfs -rm -r -skipTrash /user/root/price_data
2.1 Create the database
sql
CREATE DATABASE IF NOT EXISTS agriculture;
USE agriculture;
2.2 Create a table with the same structure as the one just placed in HDFS
sql
CREATE TABLE IF NOT EXISTS price_data (
collectDate DATE,
market STRING,
variety STRING,
highestPrice FLOAT,
minimumPrice FLOAT,
bulkPrice FLOAT,
province STRING
)
USING parquet
OPTIONS ('path' '/user/hive/warehouse/agriculture.db/price_data');
2.3 Register the text file in HDFS as a temporary view and infer the schema
sql
CREATE OR REPLACE TEMPORARY VIEW hdfs_price_data
USING text
OPTIONS (
'path' '/test/agriculture/price_data'
);
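Before parsing, it can help to peek at a few raw lines to confirm the delimiter and column order match the split/CAST logic below (a quick check, not part of the pipeline):
sql
-- Each row of the text-backed view is a single 'value' column holding one CSV line
SELECT value FROM hdfs_price_data LIMIT 5;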
2.4 Insert the data into the Spark table
sql
INSERT INTO TABLE agriculture.price_data
SELECT
TO_DATE(split(value, ',')[0]) AS collectDate,
split(value, ',')[1] AS market,
split(value, ',')[2] AS variety,
CAST(split(value, ',')[3] AS FLOAT) AS highestPrice,
CAST(split(value, ',')[4] AS FLOAT) AS minimumPrice,
CAST(split(value, ',')[5] AS FLOAT) AS bulkPrice,
split(value, ',')[6] AS province
FROM hdfs_price_data;
3. Process the data
3.1 Daily report data
3.1.1 Daily change of the 200 index
Run this code directly in MySQL.
sql
-- This query selects rows from daily_exponent starting from '2023-05-26', computes the day-over-day ratio for each index, and inserts the results into DAILY_PAPER_DATA200.
INSERT INTO DAILY_PAPER_DATA200 (collectDate, PYesterdayExponent, PTodayExponent, PLinkRelativeRatio, VYesterdayExponent, VTodayExponent, VLinkRelativeRatio, GYesterdayExponent, GTodayExponent, GLinkRelativeRatio)
SELECT
de.collectdate AS collectDate,
lag(de.productexponent) OVER (ORDER BY de.collectdate) AS PYesterdayExponent,
de.productexponent AS PTodayExponent,
(de.productexponent / lag(de.productexponent) OVER (ORDER BY de.collectdate) - 1) AS PLinkRelativeRatio,
lag(de.vegetablebasketexponent) OVER (ORDER BY de.collectdate) AS VYesterdayExponent,
de.vegetablebasketexponent AS VTodayExponent,
(de.vegetablebasketexponent / lag(de.vegetablebasketexponent) OVER (ORDER BY de.collectdate) - 1) AS VLinkRelativeRatio,
lag(de.grainoilexponent) OVER (ORDER BY de.collectdate) AS GYesterdayExponent,
de.grainoilexponent AS GTodayExponent,
(de.grainoilexponent / lag(de.grainoilexponent) OVER (ORDER BY de.collectdate) - 1) AS GLinkRelativeRatio
FROM daily_exponent de
WHERE de.collectdate >= '2023-05-26'
ORDER BY de.collectdate;
3.1.2 Daily change for each variety
Run this code directly in MySQL.
sql
-- For each date from 2023-08-25 up to the latest date, compute yesterday's price, today's price, and the day-over-day ratio for every variety, then insert the rows into DAILY_PAPER_DATA_VARIETY.
DELIMITER //
CREATE PROCEDURE PopulateDailyData()
BEGIN
DECLARE currentDate DATE;
DECLARE maxDate DATE;
-- Find the latest date
SELECT MAX(collectDate) INTO maxDate FROM VARIETY_NATIONAL_AVERAGE;
-- Initialize the current date
SET currentDate = '2023-08-25';
-- Loop over the date range and fill data for each variety
WHILE currentDate <= maxDate DO
INSERT INTO DAILY_PAPER_DATA_VARIETY (collectDate, variety, yesterdayPrice, todayPrice, linkRelativeRatio)
SELECT
currentDate AS collectDate,
vna.variety,
IFNULL(vna1.averagePrice, 0) AS yesterdayPrice,
IFNULL(vna2.averagePrice, 0) AS todayPrice,
IFNULL((vna2.averagePrice - vna1.averagePrice) / vna1.averagePrice, 0) AS linkRelativeRatio
FROM
(SELECT DISTINCT variety FROM VARIETY_NATIONAL_AVERAGE) AS vna
LEFT JOIN
VARIETY_NATIONAL_AVERAGE AS vna1
ON
vna.variety = vna1.variety AND DATE_SUB(currentDate, INTERVAL 1 DAY) = vna1.collectDate
LEFT JOIN
VARIETY_NATIONAL_AVERAGE AS vna2
ON
vna.variety = vna2.variety AND currentDate = vna2.collectDate;
SET currentDate = DATE_ADD(currentDate, INTERVAL 1 DAY);
END WHILE;
END //
DELIMITER ;
-- Call the stored procedure
CALL PopulateDailyData();
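A quick way to confirm the procedure filled the table is to look at one day's rows (the date here is just an example):
sql
-- Spot-check a single day's output
SELECT * FROM DAILY_PAPER_DATA_VARIETY
WHERE collectDate = '2023-08-25'
ORDER BY variety;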
3.2 Weekly report data
3.2.1 Varieties
3.2.1.1 Weekly report data (last week's prices)
sql
-- Create and register a temporary view filtering the target date range and varieties
CREATE OR REPLACE TEMPORARY VIEW filtered_data AS
SELECT
collectDate,
variety,
bulkPrice
FROM
agriculture.price_data
WHERE
collectDate BETWEEN '2023-08-24' AND '2023-08-30'
AND variety IN (
'猪肉', '羊肉', '牛肉', '鸡蛋', '白条鸡',
'活草鱼', '活鲫鱼', '活鲤鱼', '白鲢活鱼', '花鲢活鱼', '大带鱼', '大黄花鱼',
'菠菜', '莴笋', '豆角', '韭菜', '菜花',
'胡萝卜', '油菜', '西红柿', '青椒', '土豆',
'富士苹果', '巨峰葡萄', '香蕉', '菠萝', '西瓜', '鸭梨'
);
sql
-- Compute the average bulkPrice for each variety
CREATE OR REPLACE TEMPORARY VIEW week_avg AS
SELECT
variety,
AVG(bulkPrice) AS weekPrice
FROM
filtered_data
GROUP BY
variety;
sql
-- Create the agriculture.weekAvg table
CREATE TABLE IF NOT EXISTS weekAvg (
firstDate DATE,
variety STRING,
weekPrice FLOAT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
sql
-- Insert the results into the agriculture.weekAvg table
INSERT INTO TABLE agriculture.weekAvg
SELECT
'2023-08-24' AS firstDate,
variety,
weekPrice
FROM
week_avg;
sql
DROP VIEW IF EXISTS filtered_data;
DROP VIEW IF EXISTS week_avg;
3.2.1.2 Weekly report data (this week's prices)
Same as above: change the date range and append the results to the agriculture.weekAvg table.
3.2.1.3 Merge the two weeks
sql
-- Create the WEEKLY_PAPER_DATA table
CREATE TABLE WEEKLY_PAPER_DATA (
firstDate DATE,
variety VARCHAR(16),
lastWeekPrice FLOAT,
thisWeekPrice FLOAT,
linkRelativeRatio FLOAT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
sql
-- Insert the two weeks of data into WEEKLY_PAPER_DATA
INSERT INTO TABLE WEEKLY_PAPER_DATA
SELECT
wb.firstDate AS firstDate,
wa.variety AS variety,
wa.weekPrice AS lastWeekPrice,
wb.weekPrice AS thisWeekPrice,
(wb.weekPrice - wa.weekPrice) / wa.weekPrice AS linkRelativeRatio
FROM
weekAvg wa
JOIN
weekAvg wb
ON
wa.variety = wb.variety
AND wa.firstDate = '2023-08-24' -- last week's start date
AND wb.firstDate = '2023-08-31'; -- this week's start date
3.2.1.4 Export the processed table from HDFS to the cloud MySQL with Sqoop
bash
sqoop export \
--connect jdbc:mysql://rm-cn-lbj3dbho9001fbgo.rwlb.rds.aliyuncs.com:3306/agriculture \
--username root \
--password Ncp123456 \
--table weekly_paper_data \
--fields-terminated-by ',' \
--export-dir /user/hive/warehouse/agriculture.db/weekly_paper_data \
--input-null-string '\N' \
--input-null-non-string '\N'
3.2.2 Indices
3.2.2.1 Process the data in Spark
sql
-- Create the table
CREATE TABLE WEEKLY_PAPER_DATA200 (
firstDate DATE,
PLastWeekExponent FLOAT,
PThisWeekExponent FLOAT,
PLinkRelativeRatio FLOAT,
VLastWeekExponent FLOAT,
VThisWeekExponent FLOAT,
VLinkRelativeRatio FLOAT,
GLastWeekExponent FLOAT,
GThisWeekExponent FLOAT,
GLinkRelativeRatio FLOAT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
sql
-- Create a temporary view holding the date parameters
CREATE OR REPLACE TEMPORARY VIEW date_parameters AS
SELECT
'2023-08-24' AS last_week_start,
'2023-08-30' AS last_week_end,
'2023-08-31' AS this_week_start,
'2023-09-06' AS this_week_end;
sql
-- Compute the weekly figures and insert them into WEEKLY_PAPER_DATA200
INSERT INTO WEEKLY_PAPER_DATA200
SELECT
dp.this_week_start AS firstDate,
AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.productexponent END) AS PLastWeekExponent,
AVG(CASE WHEN de.collectdate BETWEEN dp.this_week_start AND dp.this_week_end THEN de.productexponent END) AS PThisWeekExponent,
((AVG(CASE WHEN de.collectdate BETWEEN dp.this_week_start AND dp.this_week_end THEN de.productexponent END) - AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.productexponent END)) / AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.productexponent END)) AS PLinkRelativeRatio,
AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.vegetablebasketexponent END) AS VLastWeekExponent,
AVG(CASE WHEN de.collectdate BETWEEN dp.this_week_start AND dp.this_week_end THEN de.vegetablebasketexponent END) AS VThisWeekExponent,
((AVG(CASE WHEN de.collectdate BETWEEN dp.this_week_start AND dp.this_week_end THEN de.vegetablebasketexponent END) - AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.vegetablebasketexponent END)) / AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.vegetablebasketexponent END)) AS VLinkRelativeRatio,
AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.grainoilexponent END) AS GLastWeekExponent,
AVG(CASE WHEN de.collectdate BETWEEN dp.this_week_start AND dp.this_week_end THEN de.grainoilexponent END) AS GThisWeekExponent,
((AVG(CASE WHEN de.collectdate BETWEEN dp.this_week_start AND dp.this_week_end THEN de.grainoilexponent END) - AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.grainoilexponent END)) / AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.grainoilexponent END)) AS GLinkRelativeRatio
FROM daily_exponent de
CROSS JOIN date_parameters dp
GROUP BY dp.this_week_start, dp.this_week_end;
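To confirm the week was written, one row per week should appear in the table; the date below matches this_week_start above:
sql
-- Check the newly inserted weekly row
SELECT * FROM WEEKLY_PAPER_DATA200 WHERE firstDate = '2023-08-31';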
3.2.2.2 Export the processed table from HDFS to the cloud MySQL with Sqoop
bash
sqoop export \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table weekly_paper_data200 \
--fields-terminated-by ',' \
--export-dir /user/hive/warehouse/agriculture.db/weekly_paper_data200 \
--input-null-string '\N' \
--input-null-non-string '\N'
3.3 Market data (national and provincial daily average prices)
3.3.1 National
sql
-- Create the VARIETY_NATIONAL_AVERAGE table
CREATE TABLE VARIETY_NATIONAL_AVERAGE (
collectDate DATE,
variety VARCHAR(16),
averagePrice FLOAT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
sql
-- Compute the average bulkPrice and insert it into VARIETY_NATIONAL_AVERAGE
INSERT INTO TABLE VARIETY_NATIONAL_AVERAGE
SELECT
collectDate,
variety,
AVG(bulkPrice) AS averagePrice
FROM
price_data
GROUP BY
collectDate,
variety;
bash
# Export the data from HDFS to the cloud MySQL with Sqoop
sqoop export \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table variety_national_average \
--export-dir /user/hive/warehouse/agriculture.db/variety_national_average \
--input-fields-terminated-by "\001"
It is better not to use Sqoop for this export; I spent a whole evening on it and kept getting errors. Exporting through the DataGrip GUI is easier.
Appending:
sql
-- This query overwrites VARIETY_NATIONAL_AVERAGE with the full recomputation, so both the previously exported days and the newly added days end up in the table, which effectively achieves an append.
INSERT OVERWRITE TABLE VARIETY_NATIONAL_AVERAGE
SELECT
collectDate,
variety,
AVG(bulkPrice) AS averagePrice
FROM
price_data
GROUP BY
collectDate,
variety;
Note: before exporting through the DataGrip GUI, first empty the existing table in Navicat.
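The emptying step can also be done with a single statement in MySQL instead of clicking through a GUI (a sketch; it assumes the whole table should be cleared):
sql
-- Run in MySQL before re-exporting; removes all existing rows
TRUNCATE TABLE variety_national_average;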
3.3.2 Provincial
sql
-- Create the table
CREATE TABLE VARIETY_REGIONAL_AVERAGE (
collectDate DATE,
variety VARCHAR(16),
region VARCHAR(16),
averagePrice FLOAT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
sql
-- Compute the average bulkPrice and insert it into VARIETY_REGIONAL_AVERAGE
INSERT INTO TABLE VARIETY_REGIONAL_AVERAGE
SELECT
collectDate,
variety,
province,
AVG(bulkPrice) AS averagePrice
FROM
price_data
GROUP BY
collectDate,
variety,
province;
bash
# Export the data from HDFS to the cloud MySQL with Sqoop
sqoop export \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table variety_regional_average \
--export-dir /user/hive/warehouse/agriculture.db/variety_regional_average \
--input-fields-terminated-by "\001"
It is better not to use Sqoop for this export; I spent a whole evening on it and kept getting errors. Exporting through the DataGrip GUI is easier.
Appending:
sql
-- This query overwrites VARIETY_REGIONAL_AVERAGE with the full recomputation, so both the previously exported days and the newly added days end up in the table, which effectively achieves an append.
INSERT OVERWRITE TABLE VARIETY_REGIONAL_AVERAGE
SELECT
collectDate,
variety,
province,
AVG(bulkPrice) AS averagePrice
FROM
price_data
GROUP BY
collectDate,
variety,
province;
Note: before exporting through the DataGrip GUI, first empty the existing table in Navicat.
4. Export data with Sqoop
Plain insert into MySQL: it neither overwrites nor updates existing rows.
bash
sqoop export \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table variety_national_average \
--fields-terminated-by ',' \
--export-dir /user/hive/warehouse/agriculture.db/variety_national_average \
--input-null-string '\N' \
--input-null-non-string '\N'
sqoop export \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table variety_regional_average \
--fields-terminated-by ',' \
--export-dir /user/hive/warehouse/agriculture.db/variety_regional_average \
--input-null-string '\N' \
--input-null-non-string '\N'
hwj got the append (upsert) to work. The key is --update-key: the MySQL table must have a primary key, the Hive table must have a column corresponding to that key, and the values in that column must match the MySQL primary-key values one to one.
bash
sqoop export --connect jdbc:mysql://address:3306/farmer \
--username root \
--password 123456Aa@ \
--table t_briefpriceinfo \
--export-dir /user/hive/warehouse/farmer.db/briefpriceinfo \
--input-fields-terminated-by ',' \
--input-null-non-string '\N' \
--input-null-string '\N' \
--columns varietyname,avg_middleprice,processingdate,undulate,briefid \
--update-mode allowinsert \
--update-key briefid
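For this upsert to work, the MySQL side needs a primary key matching --update-key. A minimal sketch of what t_briefpriceinfo could look like (the column types are assumptions; the real table may differ):
sql
-- Hypothetical DDL for the MySQL target table; briefid is the --update-key column
CREATE TABLE IF NOT EXISTS t_briefpriceinfo (
    briefid INT PRIMARY KEY,
    varietyname VARCHAR(16),
    avg_middleprice FLOAT,
    processingdate DATE,
    undulate FLOAT
);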
hwj's core trick: how to give a column in Hive an auto-increment effect
sql
INSERT OVERWRITE TABLE briefPriceInfo
SELECT
p.varietyName,
p.avg_middlePrice,
p.processingDate,
CASE
WHEN p.prev_avg_middlePrice IS NOT NULL THEN (p.avg_middlePrice - p.prev_avg_middlePrice) / p.prev_avg_middlePrice
ELSE NULL
END AS undulate,
row_number() over (order by 1) as briefID
FROM (
SELECT
a.varietyName,
a.avg_middlePrice,
a.processingDate,
LAG(a.avg_middlePrice) OVER (PARTITION BY a.varietyName ORDER BY a.processingDate) AS prev_avg_middlePrice
FROM priceResult a
) p;
Big data processing workflow (daily append)
1. price_data (daily prices)
1.1 Crawl the daily prices from the website, lightly clean them in Excel, and append them into the cloud database
1.2 Use Sqoop to overwrite the Hive warehouse directly with the updated data from the cloud database (the smart way)
bash
# Full overwrite import (into an existing Hive table)
sqoop import \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table price_data \
--hive-import \
--hive-database agriculture \
--hive-table price_data \
--fields-terminated-by ',' \
--hive-overwrite \
--null-string '\N' \
--null-non-string '\N' \
-m 1
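After the overwrite import, a quick way to confirm in Spark that the new day arrived is to check the latest date in the Hive table (a simple sanity check, not part of the pipeline):
sql
-- Latest date present in the re-imported table
SELECT MAX(collectDate) FROM agriculture.price_data;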
1.3 The clumsy way to load the data (for reference only)
1.3.1 Delete the files on HDFS
bash
# Use the -skipTrash option to delete the files permanently instead of moving them to the trash
hdfs dfs -rm -r -skipTrash /test/agriculture/price_data
hdfs dfs -rm -r -skipTrash /user/hive/warehouse/agriculture.db/price_data
1.3.2 Drop the old table in Spark
sql
drop table price_data;
1.3.3 Import the updated data from the cloud database with Sqoop
bash
sqoop import \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table price_data \
--target-dir /test/agriculture/price_data \
-m 1
1.3.4 Create the table and insert the data
1.3.4.1 Use the existing database
sql
USE agriculture;
1.3.4.2 Create a table with the same structure as the one just placed in HDFS
sql
CREATE TABLE IF NOT EXISTS price_data (
collectDate DATE,
market STRING,
variety STRING,
highestPrice FLOAT,
minimumPrice FLOAT,
bulkPrice FLOAT,
province STRING
)
USING parquet
OPTIONS ('path' '/user/hive/warehouse/agriculture.db/price_data');
1.3.5 Register the text file in HDFS as a temporary view and infer the schema
sql
CREATE OR REPLACE TEMPORARY VIEW hdfs_price_data
USING text
OPTIONS (
'path' '/test/agriculture/price_data'
);
1.3.6 Insert the data into the Spark table
sql
INSERT INTO TABLE agriculture.price_data
SELECT
TO_DATE(split(value, ',')[0]) AS collectDate,
split(value, ',')[1] AS market,
split(value, ',')[2] AS variety,
CAST(split(value, ',')[3] AS FLOAT) AS highestPrice,
CAST(split(value, ',')[4] AS FLOAT) AS minimumPrice,
CAST(split(value, ',')[5] AS FLOAT) AS bulkPrice,
split(value, ',')[6] AS province
FROM hdfs_price_data;
2. daily_exponent (daily price indices)
2.1 Find the data on the website and enter it into MySQL by hand
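If the entry is done with a SQL statement rather than a GUI, it looks roughly like this (the column names are inferred from the queries elsewhere in this document; the values are placeholders):
sql
-- Hypothetical manual insert into daily_exponent; replace the placeholder values with the day's indices
INSERT INTO daily_exponent (collectdate, productexponent, vegetablebasketexponent, grainoilexponent)
VALUES ('2023-09-13', 100.0, 100.0, 100.0);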
2.2 Use Sqoop to overwrite the Hive warehouse directly with the updated data from the cloud database
bash
# Full overwrite import (into an existing Hive table)
sqoop import \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table daily_exponent \
--hive-import \
--hive-database agriculture \
--hive-table daily_exponent \
--fields-terminated-by ',' \
--hive-overwrite \
--null-string '\N' \
--null-non-string '\N' \
-m 1
3.variety_national_average
3.1 Process the data in Spark
sql
-- This query overwrites VARIETY_NATIONAL_AVERAGE with the full recomputation, so both the previously exported days and the newly added days end up in the table, which effectively achieves an append.
INSERT OVERWRITE TABLE VARIETY_NATIONAL_AVERAGE
SELECT
collectDate,
variety,
AVG(bulkPrice) AS averagePrice
FROM
price_data
GROUP BY
collectDate,
variety;
3.2 Export the data to MySQL with Sqoop
Note: before exporting, first empty the existing table in Navicat.
bash
# A full export does not overwrite existing rows
sqoop export \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table variety_national_average \
--fields-terminated-by ',' \
--export-dir /user/hive/warehouse/agriculture.db/variety_national_average \
--input-null-string '\N' \
--input-null-non-string '\N'
Note: before exporting through the DataGrip GUI, first empty the existing table in Navicat.
4.variety_regional_average
4.1 Process the data in Spark
sql
-- This query overwrites VARIETY_REGIONAL_AVERAGE with the full recomputation, so both the previously exported days and the newly added days end up in the table, which effectively achieves an append.
INSERT OVERWRITE TABLE VARIETY_REGIONAL_AVERAGE
SELECT
collectDate,
variety,
province,
AVG(bulkPrice) AS averagePrice
FROM
price_data
GROUP BY
collectDate,
variety,
province;
4.2 Export the data to MySQL with Sqoop
Note: before exporting, first empty the existing table in Navicat.
bash
sqoop export \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table variety_regional_average \
--fields-terminated-by ',' \
--export-dir /user/hive/warehouse/agriculture.db/variety_regional_average \
--input-null-string '\N' \
--input-null-non-string '\N'
Note: before exporting through the DataGrip GUI, first empty the existing table in Navicat.
5.daily_paper_data200
5.1 First empty the old table in MySQL
5.2 Run the code
Run this code directly in MySQL.
sql
-- This query selects rows from daily_exponent starting from '2023-05-26', computes the day-over-day ratio for each index, and inserts the results into DAILY_PAPER_DATA200.
INSERT INTO DAILY_PAPER_DATA200 (collectDate, PYesterdayExponent, PTodayExponent, PLinkRelativeRatio, VYesterdayExponent, VTodayExponent, VLinkRelativeRatio, GYesterdayExponent, GTodayExponent, GLinkRelativeRatio)
SELECT
de.collectdate AS collectDate,
lag(de.productexponent) OVER (ORDER BY de.collectdate) AS PYesterdayExponent,
de.productexponent AS PTodayExponent,
(de.productexponent / lag(de.productexponent) OVER (ORDER BY de.collectdate) - 1) AS PLinkRelativeRatio,
lag(de.vegetablebasketexponent) OVER (ORDER BY de.collectdate) AS VYesterdayExponent,
de.vegetablebasketexponent AS VTodayExponent,
(de.vegetablebasketexponent / lag(de.vegetablebasketexponent) OVER (ORDER BY de.collectdate) - 1) AS VLinkRelativeRatio,
lag(de.grainoilexponent) OVER (ORDER BY de.collectdate) AS GYesterdayExponent,
de.grainoilexponent AS GTodayExponent,
(de.grainoilexponent / lag(de.grainoilexponent) OVER (ORDER BY de.collectdate) - 1) AS GLinkRelativeRatio
FROM daily_exponent de
WHERE de.collectdate >= '2023-05-26'
ORDER BY de.collectdate;
6.daily_paper_data_variety
6.1 First empty the old table in MySQL
6.2 Run the code
Run this code directly in MySQL.
sql
-- For each date from 2023-08-25 up to the latest date, compute yesterday's price, today's price, and the day-over-day ratio for the 28 varieties of special interest, then insert the rows into DAILY_PAPER_DATA_VARIETY.
DELIMITER //
CREATE PROCEDURE PopDailyData()
BEGIN
DECLARE currentDate DATE;
DECLARE maxDate DATE;
-- Find the latest date
SELECT MAX(collectDate) INTO maxDate FROM VARIETY_NATIONAL_AVERAGE;
-- Initialize the current date
SET currentDate = '2023-08-25';
-- Loop over the date range and fill data for each variety
WHILE currentDate <= maxDate DO
INSERT INTO DAILY_PAPER_DATA_VARIETY (collectDate, variety, yesterdayPrice, todayPrice, linkRelativeRatio)
SELECT
currentDate AS collectDate,
dv.variety,
IFNULL(vna1.averagePrice, 0) AS yesterdayPrice,
IFNULL(vna2.averagePrice, 0) AS todayPrice,
IFNULL((vna2.averagePrice - vna1.averagePrice) / vna1.averagePrice, 0) AS linkRelativeRatio
FROM
Desired_Varieties AS dv
JOIN
VARIETY_NATIONAL_AVERAGE AS vna1
ON
dv.variety = vna1.variety AND DATE_SUB(currentDate, INTERVAL 1 DAY) = vna1.collectDate
JOIN
VARIETY_NATIONAL_AVERAGE AS vna2
ON
dv.variety = vna2.variety AND currentDate = vna2.collectDate;
SET currentDate = DATE_ADD(currentDate, INTERVAL 1 DAY);
END WHILE;
END //
DELIMITER ;
-- Call the stored procedure
CALL PopDailyData();
7. weekly_paper_data (weekly append)
7.1 Insert this week's data
sql
-- Create and register a temporary view filtering the target date range and varieties
CREATE OR REPLACE TEMPORARY VIEW filtered_data AS
SELECT
collectDate,
variety,
bulkPrice
FROM
agriculture.price_data
WHERE
collectDate BETWEEN '2023-08-31' AND '2023-09-06'
AND variety IN (
'猪肉', '羊肉', '牛肉', '鸡蛋', '白条鸡',
'活草鱼', '活鲫鱼', '活鲤鱼', '白鲢活鱼', '花鲢活鱼', '大带鱼', '大黄花鱼',
'菠菜', '莴笋', '豆角', '韭菜', '菜花',
'胡萝卜', '油菜', '西红柿', '青椒', '土豆',
'富士苹果', '巨峰葡萄', '香蕉', '菠萝', '西瓜', '鸭梨'
);
sql
-- Compute the average bulkPrice for each variety
CREATE OR REPLACE TEMPORARY VIEW week_avg AS
SELECT
variety,
AVG(bulkPrice) AS weekPrice
FROM
filtered_data
GROUP BY
variety;
sql
-- Insert the results into the agriculture.weekAvg table
INSERT INTO TABLE agriculture.weekAvg
SELECT
'2023-08-31' AS firstDate,
variety,
weekPrice
FROM
week_avg;
sql
-- Drop the temporary views
DROP VIEW IF EXISTS filtered_data;
DROP VIEW IF EXISTS week_avg;
7.2 Merge the two weeks
sql
-- Insert the two weeks of data into WEEKLY_PAPER_DATA
INSERT INTO TABLE WEEKLY_PAPER_DATA
SELECT
wb.firstDate AS firstDate,
wa.variety AS variety,
wa.weekPrice AS lastWeekPrice,
wb.weekPrice AS thisWeekPrice,
(wb.weekPrice - wa.weekPrice) / wa.weekPrice AS linkRelativeRatio
FROM
weekAvg wa
JOIN
weekAvg wb
ON
wa.variety = wb.variety
AND wa.firstDate = '2023-08-24' -- last week's start date
AND wb.firstDate = '2023-08-31'; -- this week's start date
7.3 Export the processed table from HDFS to the cloud MySQL with Sqoop
First empty the old table in MySQL.
bash
sqoop export \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table weekly_paper_data \
--fields-terminated-by ',' \
--export-dir /user/hive/warehouse/agriculture.db/weekly_paper_data \
--input-null-string '\N' \
--input-null-non-string '\N'
8. weekly_paper_data200 (weekly append)
8.1 Insert this week's data
sql
-- Update the date parameters
CREATE OR REPLACE TEMPORARY VIEW date_parameters AS
SELECT
'2023-08-24' AS last_week_start,
'2023-08-30' AS last_week_end,
'2023-08-31' AS this_week_start,
'2023-09-06' AS this_week_end;
sql
-- Compute the weekly figures and insert them into WEEKLY_PAPER_DATA200
INSERT INTO WEEKLY_PAPER_DATA200
SELECT
dp.this_week_start AS firstDate,
AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.productexponent END) AS PLastWeekExponent,
AVG(CASE WHEN de.collectdate BETWEEN dp.this_week_start AND dp.this_week_end THEN de.productexponent END) AS PThisWeekExponent,
((AVG(CASE WHEN de.collectdate BETWEEN dp.this_week_start AND dp.this_week_end THEN de.productexponent END) - AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.productexponent END)) / AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.productexponent END)) AS PLinkRelativeRatio,
AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.vegetablebasketexponent END) AS VLastWeekExponent,
AVG(CASE WHEN de.collectdate BETWEEN dp.this_week_start AND dp.this_week_end THEN de.vegetablebasketexponent END) AS VThisWeekExponent,
((AVG(CASE WHEN de.collectdate BETWEEN dp.this_week_start AND dp.this_week_end THEN de.vegetablebasketexponent END) - AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.vegetablebasketexponent END)) / AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.vegetablebasketexponent END)) AS VLinkRelativeRatio,
AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.grainoilexponent END) AS GLastWeekExponent,
AVG(CASE WHEN de.collectdate BETWEEN dp.this_week_start AND dp.this_week_end THEN de.grainoilexponent END) AS GThisWeekExponent,
((AVG(CASE WHEN de.collectdate BETWEEN dp.this_week_start AND dp.this_week_end THEN de.grainoilexponent END) - AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.grainoilexponent END)) / AVG(CASE WHEN de.collectdate BETWEEN dp.last_week_start AND dp.last_week_end THEN de.grainoilexponent END)) AS GLinkRelativeRatio
FROM daily_exponent de
CROSS JOIN date_parameters dp
GROUP BY dp.this_week_start, dp.this_week_end;
8.2 Export the processed table from HDFS to the cloud MySQL with Sqoop
First empty the old table in MySQL.
bash
sqoop export \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table weekly_paper_data200 \
--fields-terminated-by ',' \
--export-dir /user/hive/warehouse/agriculture.db/weekly_paper_data200 \
--input-null-string '\N' \
--input-null-non-string '\N'
Airflow scheduling
Note: before scheduling, make sure the two raw tables price_data and daily_exponent have been updated in MySQL.
1. Start Airflow
- Start-up test
- Delete old records (only needed from the second start onward):
bash
rm -f /root/airflow/airflow-*
- Start Redis (the message queue):
bash
nohup /opt/redis-4.0.9/src/redis-server /opt/redis-4.0.9/src/redis.conf > output.log 2>&1 &
ps -ef | grep redis
- Start Airflow:
bash
# Start the services as background processes
airflow webserver -D
airflow scheduler -D
airflow celery flower -D
airflow celery worker -D
- Check the web UIs:
  - Airflow Web UI: node1:8085 (username/password: admin)
  - Celery Web UI: node1:5555
2. Create the Python file (no need to enter Docker)
bash
# Default directory where Airflow automatically detects DAG files
mkdir -p /root/airflow/dags
cd /root/airflow/dags
vim agriculture_add_daily.py
3. Copy the complete code into agriculture_add_daily.py
No Chinese characters are allowed anywhere in the file.
python
# -*- coding:utf-8 -*-
from airflow import DAG
from airflow.providers.mysql.operators.mysql import MySqlOperator
from airflow.operators.bash import BashOperator
from airflow.providers.apache.spark.operators.spark_sql import SparkSqlOperator
from airflow.utils.dates import days_ago
from datetime import date
default_args = {
'owner': 'airflow',
}
dag = DAG(
'agriculture_bigdata_add_daily',
default_args=default_args,
schedule_interval=None,
start_date=days_ago(1),
tags=['agriculture'],
)
add_into_price_data_cmd=r'''
/opt/sqoop/bin/sqoop import \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table price_data \
--hive-import \
--hive-database agriculture \
--hive-table price_data \
--fields-terminated-by ',' \
--hive-overwrite \
--null-string '\N' \
--null-non-string '\N' \
-m 1
'''
# 1
sqoop_import_price_data_task = BashOperator(
task_id='sqoop_import_price_data_task',
bash_command=add_into_price_data_cmd,
dag=dag
)
add_into_daily_exponent_cmd=r'''
/opt/sqoop/bin/sqoop import \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table daily_exponent \
--hive-import \
--hive-database agriculture \
--hive-table daily_exponent \
--fields-terminated-by ',' \
--hive-overwrite \
--null-string '\N' \
--null-non-string '\N' \
-m 1
'''
# 2
sqoop_import_daily_exponent_task = BashOperator(
task_id='sqoop_import_daily_exponent_task',
bash_command=add_into_daily_exponent_cmd,
dag=dag
)
insert_into_variety_national_average_sql = r"""
INSERT OVERWRITE TABLE agriculture.VARIETY_NATIONAL_AVERAGE
SELECT
collectDate,
variety,
AVG(bulkPrice) AS averagePrice
FROM
agriculture.price_data
GROUP BY
collectDate,
variety;
"""
# 3
spark_insert_into_variety_national_average_sql_job = SparkSqlOperator(
sql=insert_into_variety_national_average_sql,
master="local",
conn_id="agriculture_spark_connection",
task_id="spark_insert_into_variety_national_average_sql_job",
dag=dag
)
# 4
delete_variety_national_average_task = MySqlOperator(
task_id='delete_variety_national_average_task',
mysql_conn_id='agriculture_mysql_connection',
sql=r"""DELETE FROM variety_national_average;""",
dag=dag
)
add_into_variety_national_average_cmd=r'''
/opt/sqoop/bin/sqoop export \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table variety_national_average \
--fields-terminated-by ',' \
--export-dir /user/hive/warehouse/agriculture.db/variety_national_average \
--input-null-string '\N' \
--input-null-non-string '\N'
'''
# 5
sqoop_import_variety_national_average_task = BashOperator(
task_id='sqoop_import_variety_national_average_task',
bash_command=add_into_variety_national_average_cmd,
dag=dag
)
insert_into_variety_regional_average_sql = r"""
INSERT OVERWRITE TABLE agriculture.VARIETY_REGIONAL_AVERAGE
SELECT
collectDate,
variety,
province,
AVG(bulkPrice) AS averagePrice
FROM
agriculture.price_data
GROUP BY
collectDate,
variety,
province;
"""
# 6
spark_insert_into_variety_regional_average_sql_job = SparkSqlOperator(
sql=insert_into_variety_regional_average_sql,
master="local",
conn_id="agriculture_spark_connection",
task_id="spark_insert_into_variety_regional_average_sql_job",
dag=dag
)
# 7
delete_variety_regional_average_task = MySqlOperator(
task_id='delete_variety_regional_average_task',
mysql_conn_id='agriculture_mysql_connection',
sql=r"""DELETE FROM variety_regional_average;""",
dag=dag
)
add_into_variety_regional_average_cmd=r'''
/opt/sqoop/bin/sqoop export \
--connect jdbc:mysql://address:3306/agriculture \
--username root \
--password 123456 \
--table variety_regional_average \
--fields-terminated-by ',' \
--export-dir /user/hive/warehouse/agriculture.db/variety_regional_average \
--input-null-string '\N' \
--input-null-non-string '\N'
'''
# 8
sqoop_import_variety_regional_average_task = BashOperator(
task_id='sqoop_import_variety_regional_average_task',
bash_command=add_into_variety_regional_average_cmd,
dag=dag
)
# 9
delete_daily_paper_data200_task = MySqlOperator(
task_id='delete_daily_paper_data200_task',
mysql_conn_id='agriculture_mysql_connection',
sql=r"""DELETE FROM daily_paper_data200;""",
dag=dag
)
insert_into_daily_paper_data200_sql = r"""
INSERT INTO DAILY_PAPER_DATA200 (collectDate, PYesterdayExponent, PTodayExponent, PLinkRelativeRatio, VYesterdayExponent, VTodayExponent, VLinkRelativeRatio, GYesterdayExponent, GTodayExponent, GLinkRelativeRatio)
SELECT
de.collectdate AS collectDate,
lag(de.productexponent) OVER (ORDER BY de.collectdate) AS PYesterdayExponent,
de.productexponent AS PTodayExponent,
(de.productexponent / lag(de.productexponent) OVER (ORDER BY de.collectdate) - 1) AS PLinkRelativeRatio,
lag(de.vegetablebasketexponent) OVER (ORDER BY de.collectdate) AS VYesterdayExponent,
de.vegetablebasketexponent AS VTodayExponent,
(de.vegetablebasketexponent / lag(de.vegetablebasketexponent) OVER (ORDER BY de.collectdate) - 1) AS VLinkRelativeRatio,
lag(de.grainoilexponent) OVER (ORDER BY de.collectdate) AS GYesterdayExponent,
de.grainoilexponent AS GTodayExponent,
(de.grainoilexponent / lag(de.grainoilexponent) OVER (ORDER BY de.collectdate) - 1) AS GLinkRelativeRatio
FROM daily_exponent de
WHERE de.collectdate >= '2023-05-26'
ORDER BY de.collectdate;
"""
# 10
insert_into_daily_paper_data200_mysql_task = MySqlOperator(
task_id='insert_into_daily_paper_data200_mysql_task',
mysql_conn_id='agriculture_mysql_connection',
sql=insert_into_daily_paper_data200_sql,
dag=dag
)
# 11
delete_daily_paper_data_variety_task = MySqlOperator(
task_id='delete_daily_paper_data_variety_task',
mysql_conn_id='agriculture_mysql_connection',
sql=r"""DELETE FROM daily_paper_data_variety;""",
dag=dag
)
insert_into_daily_paper_data_variety_sql = r"""
CALL PopDailyData();
"""
# 12
insert_into_daily_paper_data_variety_mysql_task = MySqlOperator(
task_id='insert_into_daily_paper_data_variety_mysql_task',
mysql_conn_id='agriculture_mysql_connection',
sql=insert_into_daily_paper_data_variety_sql,
dag=dag
)
# 1 3 4 5 11 12
sqoop_import_price_data_task >> spark_insert_into_variety_national_average_sql_job >> delete_variety_national_average_task >> sqoop_import_variety_national_average_task >> delete_daily_paper_data_variety_task >> insert_into_daily_paper_data_variety_mysql_task
# 1 6 7 8
sqoop_import_price_data_task >> spark_insert_into_variety_regional_average_sql_job >> delete_variety_regional_average_task >> sqoop_import_variety_regional_average_task
# 2 9 10
sqoop_import_daily_exponent_task >> delete_daily_paper_data200_task >> insert_into_daily_paper_data200_mysql_task
4. Submit the Python DAG
- Either way of submitting takes a while to take effect.
- Automatic submission: wait for Airflow to detect the file
  - Put the finished program into Airflow's DAG directory
  - Default path: /root/airflow/dags
- Manual submission: run the file by hand so Airflow picks it up and loads it
python agriculture_add_daily.py
- Scheduling states
  - No status (scheduler created empty task instance): the scheduled task has been created, but no task instance has been produced yet
  - Scheduled (scheduler determined task instance needs to run): a task instance has been generated and is waiting to run
  - Queued (scheduler sent task to executor to run on the queue): the task is in the queue, waiting for the executor to run it
  - Running (worker picked up a task and is now running it): the task is executing on a worker node
  - Success (task completed): the task finished successfully
5. Checks
5.1 Hive warehouse checks
- price_data grows by roughly 7,000 rows
- daily_exponent gains a row for today's date
- variety_national_average grows by roughly 372 rows
- variety_regional_average grows by roughly 2,872 rows
sql
select * from agriculture.price_data;
select * from agriculture.daily_exponent;
select * from agriculture.VARIETY_NATIONAL_AVERAGE;
select * from agriculture.VARIETY_REGIONAL_AVERAGE;
| Table \ Date | 9.5 | 9.6 | 9.7 | 9.8 | 9.9 | 9.10 | 9.11 | 9.12 | 9.13 |
|---|---|---|---|---|---|---|---|---|---|
| price_data | 92299 | 99607 | 106989 | 114432 | 121121 | 127151 | 134718 | 142262 | 149646 |
| daily_exponent | 103 | 104 | 105 | 106 | 106 | 108 | 109 | 110 | 111 |
| variety_national_average | 4841 | 5229 | 5619 | 6016 | 6366 | 6736 | 7130 | 7517 | 7894 |
| variety_regional_average | 37330 | 40259 | 43209 | 46173 | 49013 | 51679 | 54661 | 57629 | 60545 |
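Since the check is really about row counts, COUNT(*) queries reproduce the figures in the table above more directly than SELECT * (a sketch, run against the same Hive tables):
sql
-- Row-count checks for the Hive warehouse tables
SELECT COUNT(*) FROM agriculture.price_data;
SELECT COUNT(*) FROM agriculture.daily_exponent;
SELECT COUNT(*) FROM agriculture.VARIETY_NATIONAL_AVERAGE;
SELECT COUNT(*) FROM agriculture.VARIETY_REGIONAL_AVERAGE;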
5.2 MySQL checks
- variety_national_average grows by roughly 372 rows
- variety_regional_average grows by roughly 2,872 rows
- daily_paper_data200 gains a row for today's date
- daily_paper_data_variety grows by 28 rows
| Table \ Date | 9.5 | 9.6 | 9.7 | 9.8 | 9.9 | 9.10 | 9.11 | 9.12 | 9.13 |
|---|---|---|---|---|---|---|---|---|---|
| variety_national_average | 4841 | 5229 | 5619 | 6016 | 6366 | 6736 | 7130 | 7517 | 7894 |
| variety_regional_average | 37330 | 40259 | 43209 | 46173 | 49013 | 51679 | 54661 | 57629 | 60545 |
| daily_paper_data200 | 103 | 104 | 105 | 106 | 106 | 108 | 109 | 110 | 111 |
| daily_paper_data_variety | 336 | 364 | 392 | 420 | 448 | 476 | 504 | 532 | 560 |