Hive maps structured data files onto database tables and offers SQL-style querying, translating SQL statements into MapReduce, Tez, or Spark jobs.
- SQL interface: queries are written in HiveQL, a SQL-like dialect
- Large-scale processing: handles datasets at the petabyte level
- Data warehousing: well suited to offline batch processing and warehouse construction
- Extensibility: custom UDFs, UDAFs, and UDTFs can be plugged in, as sketched below
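As a quick illustration of that extension point, the snippet below registers and calls a scalar UDF from HiveQL. It is a minimal sketch: the jar path, function name, and class are hypothetical stand-ins for your own build (the class would extend Hive's UDF or GenericUDF API), and `page_view_logs` refers to the log table created in section 1 below.

```sql
-- Register a custom UDF packaged in a jar (path and class are hypothetical)
ADD JAR hdfs:///user/hive/udfs/my-udfs.jar;
CREATE TEMPORARY FUNCTION normalize_url AS 'com.example.hive.udf.NormalizeUrl';

-- Once registered, it can be used like any built-in function
SELECT normalize_url(page_url) AS url, COUNT(*) AS pv
FROM page_view_logs
WHERE dt = '2024-01-01'
GROUP BY normalize_url(page_url);
```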
Use cases
- Data warehouse construction
  - Enterprise data warehouses (EDW)
  - Query engine over a data lake
  - Historical data analysis
- ETL processing
  - Data cleansing and transformation
  - Data format conversion
  - Data quality checks
- Offline data analysis
  - Daily, weekly, and monthly reports
  - User behavior analysis
  - Business metric computation
- Data exploration
  - Exploratory analysis by data scientists
  - Ad-hoc query analysis
Implementing "Website Page-View Statistics" with Hive
1. Environment Setup
Hive Table Schema Design
```sql
-- Raw log table (stores raw records ingested from Kafka or flat files)
CREATE TABLE IF NOT EXISTS page_view_logs (
    log_id      BIGINT    COMMENT 'log ID',
    page_url    STRING    COMMENT 'page URL',
    user_id     STRING    COMMENT 'user ID',
    event_type  STRING    COMMENT 'event type',
    server_time TIMESTAMP COMMENT 'server-side timestamp',
    client_time TIMESTAMP COMMENT 'client-side timestamp',
    ip_address  STRING    COMMENT 'IP address',
    user_agent  STRING    COMMENT 'user agent',
    other_info  STRING    COMMENT 'additional info'
) COMMENT 'raw page-view log table'
PARTITIONED BY (dt STRING COMMENT 'date partition')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

-- Daily statistics result table
CREATE TABLE IF NOT EXISTS page_view_daily_stats (
    stat_date       STRING    COMMENT 'statistics date',
    page_url        STRING    COMMENT 'page URL',
    pv_count        BIGINT    COMMENT 'page views',
    uv_count        BIGINT    COMMENT 'unique visitors',
    avg_pv_per_user DOUBLE    COMMENT 'average views per user',
    peak_hour       INT       COMMENT 'peak traffic hour',
    create_time     TIMESTAMP COMMENT 'record creation time'
) COMMENT 'daily page-view statistics table'
PARTITIONED BY (stat_month STRING COMMENT 'month partition')
STORED AS ORC;  -- columnar ORC storage; no row delimiters apply

-- Hourly statistics table (for near-real-time analysis)
CREATE TABLE IF NOT EXISTS page_view_hourly_stats (
    stat_hour   STRING    COMMENT 'statistics hour',
    page_url    STRING    COMMENT 'page URL',
    pv_count    BIGINT    COMMENT 'page views',
    uv_count    BIGINT    COMMENT 'unique visitors',
    create_time TIMESTAMP COMMENT 'record creation time'
) COMMENT 'hourly page-view statistics table'
PARTITIONED BY (stat_date STRING COMMENT 'date partition')
STORED AS ORC;
```
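To exercise the queries below without a real log feed, a few rows can be inserted directly; INSERT ... VALUES requires Hive 0.14 or later, and every value here is made up purely for testing.

```sql
-- Insert a few synthetic rows into one date partition (test data only)
INSERT INTO TABLE page_view_logs PARTITION (dt='2024-01-01')
VALUES
  (1, '/home',    'u001', 'page_view', '2024-01-01 09:15:00', '2024-01-01 09:14:58', '10.0.0.1', 'Mozilla/5.0', NULL),
  (2, '/item/42', 'u001', 'page_view', '2024-01-01 09:16:30', '2024-01-01 09:16:29', '10.0.0.1', 'Mozilla/5.0', NULL),
  (3, '/home',    'u002', 'page_view', '2024-01-01 21:05:00', '2024-01-01 21:04:59', '10.0.0.2', 'Mozilla/5.0', NULL);
```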
2. Data Import
Loading Data from HDFS
```sql
-- Load a file into the raw log table (moves the file into the table's HDFS location)
LOAD DATA INPATH '/user/hive/warehouse/logs/page_views_20240101.log'
INTO TABLE page_view_logs
PARTITION (dt='2024-01-01');

-- Alternatively, use an external table over the existing HDFS directory
-- (recommended: dropping the table does not delete the underlying data)
CREATE EXTERNAL TABLE IF NOT EXISTS page_view_logs_external (
    log_id      BIGINT,
    page_url    STRING,
    user_id     STRING,
    event_type  STRING,
    server_time TIMESTAMP,
    client_time TIMESTAMP,
    ip_address  STRING,
    user_agent  STRING,
    other_info  STRING
) COMMENT 'external page-view log table'
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LOCATION '/user/hive/warehouse/logs/';

-- Register any partition directories that already exist on HDFS
MSCK REPAIR TABLE page_view_logs_external;
```
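MSCK REPAIR rescans the entire table directory, which gets slow as partitions accumulate. When the ingest job knows exactly which date it just wrote, registering that single partition is cheaper; a minimal sketch, assuming the directory layout follows the dt=YYYY-MM-DD convention:

```sql
-- Register a single known partition instead of rescanning everything
ALTER TABLE page_view_logs_external
ADD IF NOT EXISTS PARTITION (dt='2024-01-01')
LOCATION '/user/hive/warehouse/logs/dt=2024-01-01';
```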
3. Core Analysis Queries
Daily Statistics
```sql
-- Daily page-view statistics. The peak hour is computed in a separate
-- subquery (ranked with ROW_NUMBER) and joined back to the per-page
-- aggregates; Hive's LATERAL VIEW only accepts table-generating
-- functions, not correlated subqueries.
INSERT OVERWRITE TABLE page_view_daily_stats PARTITION (stat_month='2024-01')
SELECT
    daily.dt AS stat_date,
    daily.page_url,
    daily.pv_count,                                      -- total page views
    daily.uv_count,                                      -- unique visitors
    ROUND(daily.pv_count / daily.uv_count, 2) AS avg_pv_per_user,
    peak.peak_hour,                                      -- busiest hour of the day
    CURRENT_TIMESTAMP AS create_time
FROM (
    -- PV/UV per page for the target day
    SELECT
        dt,
        page_url,
        COUNT(*) AS pv_count,
        COUNT(DISTINCT user_id) AS uv_count
    FROM page_view_logs
    WHERE dt = '2024-01-01'
      AND event_type = 'page_view'
    GROUP BY dt, page_url
) daily
JOIN (
    -- Hour with the highest PV for each page
    SELECT page_url, CAST(stat_hour AS INT) AS peak_hour
    FROM (
        SELECT
            page_url,
            DATE_FORMAT(server_time, 'HH') AS stat_hour,
            ROW_NUMBER() OVER (PARTITION BY page_url ORDER BY COUNT(*) DESC) AS rn
        FROM page_view_logs
        WHERE dt = '2024-01-01'
          AND event_type = 'page_view'
        GROUP BY page_url, DATE_FORMAT(server_time, 'HH')
    ) t
    WHERE rn = 1
) peak ON daily.page_url = peak.page_url;

-- Inspect the results
SELECT * FROM page_view_daily_stats
WHERE stat_month = '2024-01'
ORDER BY pv_count DESC
LIMIT 10;
```
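The insert above hardcodes stat_month. With dynamic partitioning enabled (the scheduling script in section 6 sets the same two flags), Hive can derive the month partition from the data itself; a sketch of the simplified insert, assuming dt is always formatted as yyyy-MM-dd:

```sql
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;

-- The dynamic partition column must come last in the SELECT list
INSERT OVERWRITE TABLE page_view_daily_stats PARTITION (stat_month)
SELECT
    dt AS stat_date,
    page_url,
    COUNT(*) AS pv_count,
    COUNT(DISTINCT user_id) AS uv_count,
    ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) AS avg_pv_per_user,
    0 AS peak_hour,                  -- simplified: peak hour not computed here
    CURRENT_TIMESTAMP AS create_time,
    SUBSTR(dt, 1, 7) AS stat_month   -- '2024-01-01' -> '2024-01'
FROM page_view_logs
WHERE dt = '2024-01-01'
  AND event_type = 'page_view'
GROUP BY dt, page_url;
```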
Hourly Statistics (Near Real-Time)
```sql
-- Hourly page-view statistics
INSERT OVERWRITE TABLE page_view_hourly_stats PARTITION (stat_date='2024-01-01')
SELECT
    CONCAT(DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), ':00') AS stat_hour,
    page_url,
    COUNT(*) AS pv_count,
    COUNT(DISTINCT user_id) AS uv_count,
    CURRENT_TIMESTAMP AS create_time
FROM page_view_logs
WHERE dt = '2024-01-01'
  AND event_type = 'page_view'
  AND server_time >= '2024-01-01 00:00:00'
  AND server_time <  '2024-01-02 00:00:00'
GROUP BY
    DATE_FORMAT(server_time, 'yyyy-MM-dd HH'),
    page_url;

-- Inspect the hourly results
SELECT * FROM page_view_hourly_stats
WHERE stat_date = '2024-01-01'
ORDER BY stat_hour, pv_count DESC;
```
4. Advanced Analysis Queries
User Navigation Path Analysis
```sql
-- Page-to-page transition analysis
WITH user_page_events AS (
    SELECT
        user_id,
        page_url,
        server_time,
        LAG(page_url)  OVER (PARTITION BY user_id ORDER BY server_time) AS prev_page,
        LEAD(page_url) OVER (PARTITION BY user_id ORDER BY server_time) AS next_page
    FROM page_view_logs
    WHERE dt = '2024-01-01'
      AND event_type = 'page_view'
),
page_transitions AS (
    SELECT
        COALESCE(prev_page, 'ENTRY') AS from_page,   -- NULL prev_page marks an entry page
        page_url AS to_page,
        COUNT(*) AS transition_count
    FROM user_page_events
    GROUP BY COALESCE(prev_page, 'ENTRY'), page_url
)
SELECT
    from_page,
    to_page,
    transition_count,
    -- share of all transitions leaving from_page
    ROUND(transition_count * 100.0 / SUM(transition_count) OVER (PARTITION BY from_page), 2) AS percentage
FROM page_transitions
ORDER BY from_page, transition_count DESC;
```
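The CTE above treats a user's whole day as a single sequence. Real path analysis usually splits each user's events into sessions first, starting a new session after an idle gap; the 30-minute threshold below is a common convention, not something the log schema dictates.

```sql
-- Assign a session number per user: an idle gap over 30 minutes starts a new session
WITH events AS (
    SELECT
        user_id,
        page_url,
        server_time,
        UNIX_TIMESTAMP(server_time)
          - LAG(UNIX_TIMESTAMP(server_time))
              OVER (PARTITION BY user_id ORDER BY server_time) AS gap_seconds
    FROM page_view_logs
    WHERE dt = '2024-01-01'
      AND event_type = 'page_view'
)
SELECT
    user_id,
    page_url,
    server_time,
    -- running count of session starts = session number within the day
    SUM(CASE WHEN gap_seconds IS NULL OR gap_seconds > 1800 THEN 1 ELSE 0 END)
        OVER (PARTITION BY user_id ORDER BY server_time) AS session_id
FROM events;
```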
Trend Analysis for Top Pages
```sql
-- Hourly trend for the day's top-5 pages
SELECT
    page_url,
    DATE_FORMAT(server_time, 'HH') AS hour,
    COUNT(*) AS pv_count,
    COUNT(DISTINCT user_id) AS uv_count
FROM page_view_logs
WHERE dt = '2024-01-01'
  AND event_type = 'page_view'
  AND page_url IN (
      SELECT page_url
      FROM page_view_daily_stats
      WHERE stat_month = '2024-01'      -- partition pruning
        AND stat_date = '2024-01-01'
      ORDER BY pv_count DESC
      LIMIT 5
  )
GROUP BY page_url, DATE_FORMAT(server_time, 'HH')
ORDER BY page_url, hour;
```
5. Calling Hive from a Java Program
Hive JDBC Connection Example
```java
import java.sql.*;

public class HivePageViewAnalysis {
    private static final String DRIVER_NAME = "org.apache.hive.jdbc.HiveDriver";
    private static final String CONNECTION_URL = "jdbc:hive2://localhost:10000/analytics_db";

    public static void main(String[] args) {
        Connection connection = null;
        Statement statement = null;
        ResultSet resultSet = null;
        try {
            // 1. Load the Hive JDBC driver
            Class.forName(DRIVER_NAME);
            // 2. Open the connection
            connection = DriverManager.getConnection(CONNECTION_URL, "hive", "");
            statement = connection.createStatement();
            // 3. Run the daily statistics job
            String dailyStatsSQL =
                "INSERT OVERWRITE TABLE page_view_daily_stats PARTITION(stat_month='2024-01') " +
                "SELECT " +
                "  dt as stat_date, " +
                "  page_url, " +
                "  COUNT(*) as pv_count, " +
                "  COUNT(DISTINCT user_id) as uv_count, " +
                "  ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) as avg_pv_per_user, " +
                "  0 as peak_hour, " + // simplified: peak hour omitted
                "  CURRENT_TIMESTAMP as create_time " +
                "FROM page_view_logs " +
                "WHERE dt = '2024-01-01' " +
                "  AND event_type = 'page_view' " +
                "GROUP BY dt, page_url";
            System.out.println("Running daily statistics job...");
            statement.execute(dailyStatsSQL);
            // 4. Query the results
            String querySQL =
                "SELECT stat_date, page_url, pv_count, uv_count, avg_pv_per_user " +
                "FROM page_view_daily_stats " +
                "WHERE stat_month = '2024-01' " +
                "ORDER BY pv_count DESC " +
                "LIMIT 10";
            resultSet = statement.executeQuery(querySQL);
            // 5. Print the results
            System.out.println("=== Top 10 pages by views ===");
            System.out.println("date\t\tpage URL\t\tPV\tUV\tavg PV/user");
            System.out.println("------------------------------------------------------------");
            while (resultSet.next()) {
                String statDate = resultSet.getString("stat_date");
                String pageUrl = resultSet.getString("page_url");
                long pvCount = resultSet.getLong("pv_count");
                long uvCount = resultSet.getLong("uv_count");
                double avgPv = resultSet.getDouble("avg_pv_per_user");
                System.out.printf("%s\t%s\t%d\t%d\t%.2f%n",
                        statDate, pageUrl, pvCount, uvCount, avgPv);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release JDBC resources
            try {
                if (resultSet != null) resultSet.close();
                if (statement != null) statement.close();
                if (connection != null) connection.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}
```
6. Shell Script Scheduling
Daily Statistics Job Script
```bash
#!/bin/bash
# Hive daily statistics job
# Usage: ./daily_pageview_stats.sh 2024-01-01

# Argument check
if [ $# -ne 1 ]; then
    echo "Usage: $0 <date>"
    echo "Example: $0 2024-01-01"
    exit 1
fi

STAT_DATE=$1
STAT_MONTH=$(date -d "$STAT_DATE" +%Y-%m)

echo "Starting page-view statistics job for date: $STAT_DATE"

# Run the Hive SQL
hive -e "
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;

-- Daily statistics
INSERT OVERWRITE TABLE page_view_daily_stats PARTITION(stat_month='$STAT_MONTH')
SELECT
    dt as stat_date,
    page_url,
    COUNT(*) as pv_count,
    COUNT(DISTINCT user_id) as uv_count,
    ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) as avg_pv_per_user,
    0 as peak_hour,
    CURRENT_TIMESTAMP as create_time
FROM page_view_logs
WHERE dt = '$STAT_DATE'
  AND event_type = 'page_view'
GROUP BY dt, page_url;

-- Hourly statistics
INSERT OVERWRITE TABLE page_view_hourly_stats PARTITION(stat_date='$STAT_DATE')
SELECT
    CONCAT(DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), ':00') as stat_hour,
    page_url,
    COUNT(*) as pv_count,
    COUNT(DISTINCT user_id) as uv_count,
    CURRENT_TIMESTAMP as create_time
FROM page_view_logs
WHERE dt = '$STAT_DATE'
  AND event_type = 'page_view'
GROUP BY DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), page_url;
"

# Check the exit status
if [ $? -eq 0 ]; then
    echo "Page-view statistics job succeeded"
    # Send a notification (optional)
    # send_notification "PageView Stats" "Daily stats completed for $STAT_DATE"
else
    echo "Page-view statistics job failed"
    exit 1
fi
```
7. Exporting Data to MySQL
Hive-to-MySQL Export
Hive 3.0 and later ship a JDBC storage handler that can map a MySQL table into Hive (the MySQL JDBC driver jar must be on Hive's classpath). On versions where the handler is read-only, export with Apache Sqoop instead.
```sql
-- External Hive table backed by a MySQL table via the JDBC storage handler
CREATE EXTERNAL TABLE page_view_stats_mysql (
    stat_date       STRING,
    page_url        STRING,
    pv_count        BIGINT,
    uv_count        BIGINT,
    avg_pv_per_user DOUBLE,
    peak_hour       INT,
    create_time     TIMESTAMP
)
STORED BY 'org.apache.hive.storage.jdbc.JdbcStorageHandler'
TBLPROPERTIES (
    "hive.sql.database.type" = "MYSQL",
    "hive.sql.jdbc.driver"   = "com.mysql.jdbc.Driver",
    "hive.sql.jdbc.url"      = "jdbc:mysql://localhost:3306/analytics_db",
    "hive.sql.dbcp.username" = "root",
    "hive.sql.dbcp.password" = "password",
    "hive.sql.table"         = "page_view_stats"
);

-- Push the January results to MySQL
INSERT INTO TABLE page_view_stats_mysql
SELECT
    stat_date,
    page_url,
    pv_count,
    uv_count,
    avg_pv_per_user,
    peak_hour,
    create_time
FROM page_view_daily_stats
WHERE stat_month = '2024-01';
```
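Whichever route is used, the target table has to exist in MySQL first. A minimal sketch of matching DDL, with column types chosen to mirror the Hive schema (the VARCHAR lengths and index are assumptions):

```sql
-- Run in MySQL, not Hive: target table for the export
CREATE TABLE IF NOT EXISTS page_view_stats (
    stat_date       VARCHAR(10),
    page_url        VARCHAR(512),
    pv_count        BIGINT,
    uv_count        BIGINT,
    avg_pv_per_user DOUBLE,
    peak_hour       INT,
    create_time     TIMESTAMP,
    KEY idx_stat_date (stat_date)   -- assumed access pattern: lookups by date
);
```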
8. Performance Tuning
Hive Tuning Parameters
```sql
-- Set tuning parameters at the top of the Hive session or script
SET hive.exec.parallel = true;                  -- run independent stages in parallel
SET hive.exec.parallel.thread.number = 8;
SET hive.auto.convert.join = true;              -- map joins for small tables
SET hive.merge.mapfiles = true;                 -- merge small output files
SET hive.merge.mapredfiles = true;
SET hive.merge.size.per.task = 256000000;
SET hive.merge.smallfiles.avgsize = 128000000;
SET hive.exec.compress.output = true;
-- current name of the deprecated mapred.output.compression.codec
SET mapreduce.output.fileoutputformat.compress.codec = org.apache.hadoop.io.compress.SnappyCodec;
```
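On Hive 2.x and later, two settings beyond file merging are usually worth checking; both assume Tez is installed and the tables use a columnar format such as ORC:

```sql
SET hive.execution.engine = tez;               -- Tez DAGs instead of classic MapReduce
SET hive.vectorized.execution.enabled = true;  -- batch row processing for ORC/Parquet
SET hive.cbo.enable = true;                    -- cost-based optimizer (uses table stats)
```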
Run Steps
- Prepare the Hadoop environment: make sure HDFS and the Hive services are running
- Create the Hive tables: execute the DDL statements above
- Import data: upload the log files to HDFS and load them into Hive
- Run the analysis jobs: execute the Hive SQL or the Java program
- Check the results: query the statistics tables in Hive or MySQL