常用odps(hive)语法

基础

-- Basic projection: list the columns you need.
-- `table_name` is a placeholder; the bare word TABLE is a reserved keyword
-- and cannot be used as an unquoted identifier.

SELECT col1, col2 FROM table_name;

-- Column aliases: the AS keyword is optional, both forms are shown.

SELECT col1 AS name1, col2 name2 FROM table_name;

-- DISTINCT removes duplicate rows from the result.

SELECT DISTINCT department FROM employees;

-- LIMIT caps the number of rows returned.

SELECT * FROM table_name LIMIT 100;

-- Row filtering with WHERE; both conditions must hold.

SELECT * FROM sales WHERE amount > 1000 AND region = '华东';

关联

JOIN类型 左表 右表 适用场景
INNER JOIN(JOIN) 匹配记录 匹配记录 找共同数据
LEFT JOIN 所有记录 匹配记录 找左表有但右表没有的数据
RIGHT JOIN 匹配记录 所有记录 找右表有但左表没有的数据
FULL JOIN 所有记录 所有记录 完整对比两个表差异

聚合查询

sql 复制代码
-- Common aggregate functions, computed over the sales rows of one dt value.
SELECT
    COUNT(*)                AS total_rows,    -- every row, NULLs included
    COUNT(DISTINCT user_id) AS unique_users,  -- distinct non-NULL user_id values
    SUM(amount)             AS total_amount,
    AVG(amount)             AS avg_amount,
    MAX(amount)             AS max_amount,
    MIN(amount)             AS min_amount,
    STDDEV(amount)          AS std_amount     -- standard deviation of amount
FROM
    sales
WHERE
    dt = '2023-01-01';

-- GROUP BY: one output row per department.
-- HAVING filters the groups after aggregation (here: more than 5 employees).
SELECT
    department,
    COUNT(*)    AS emp_count,
    AVG(salary) AS avg_salary
FROM
    employees
GROUP BY
    department
HAVING
    COUNT(*) > 5;

字符串函数

sql 复制代码
-- Concatenation (CONCAT_WS takes the separator as its first argument)
SELECT CONCAT('Hello', ' ', 'World');       -- Hello World
SELECT CONCAT_WS('-', '2023', '01', '01');  -- 2023-01-01

-- Substrings (positions are 1-based)
SELECT SUBSTR('Hello World', 1, 5);  -- Hello
SELECT SUBSTRING('Hello World', 7);  -- World

-- Length
SELECT LENGTH('Hello');     -- 5
SELECT CHAR_LENGTH('你好');  -- 2

-- Case conversion
SELECT LOWER('HELLO');  -- hello
SELECT UPPER('hello');  -- HELLO

-- Whitespace trimming: both sides, left only, right only
SELECT TRIM('  hello  ');  -- hello
SELECT LTRIM('  hello');   -- hello
SELECT RTRIM('hello  ');   -- hello

-- Replacement
SELECT REPLACE('Hello World', 'World', 'MaxCompute');  -- Hello MaxCompute
sql 复制代码
-- Position lookup (1-based). Note the argument order differs:
-- INSTR(string, substring) versus LOCATE(substring, string).
SELECT INSTR('hello world', 'world');  -- 7
SELECT LOCATE('lo', 'hello world');    -- 4

-- Regular-expression match
SELECT 'abc123' RLIKE '^[a-z]+[0-9]+$';  -- true

-- Splitting: SPLIT returns an array, indexed from 0
SELECT SPLIT('a,b,c,d', ',')[0];       -- a
SELECT EXPLODE(SPLIT('a,b,c', ','));   -- one output row per element

-- JSON extraction by path
SELECT GET_JSON_OBJECT('{"name":"John","age":30}', '$.name');  -- John

日期时间函数

sql 复制代码
-- Current time
SELECT GETDATE();          -- current date and time
SELECT CURRENT_TIMESTAMP;  -- current timestamp

-- Parsing a string into a date / formatting a date as a string
SELECT TO_DATE('2023-01-01', 'yyyy-MM-dd');
SELECT DATE_FORMAT(GETDATE(), 'yyyy-MM-dd HH:mm:ss');

-- Date arithmetic; 'dd' selects the day as the unit
SELECT DATEADD(GETDATE(), 7, 'dd');                 -- 7 days from now
SELECT DATEDIFF('2023-01-10', '2023-01-01', 'dd');  -- 9 (days apart)

-- Extracting individual parts of a date / datetime
SELECT YEAR('2023-01-01');           -- 2023
SELECT MONTH('2023-01-01');          -- 1
SELECT DAY('2023-01-01');            -- 1
SELECT HOUR('2023-01-01 15:30:00');  -- 15
-- Weeks start on Monday and a week that spans two years is assigned to the
-- year holding most of its days. 2023-01-01 is a Sunday, so it belongs to
-- the last week of 2022, not to week 1 of 2023.
SELECT WEEKOFYEAR('2023-01-01');     -- 52

条件函数与CASE语句

sql 复制代码
-- Simple CASE: compares one expression (department) against a list of values.
SELECT
    name,
    CASE department
        WHEN 'IT' THEN '技术部'
        WHEN 'HR' THEN '人力资源部'
        ELSE '其他部门'  -- fallback for every other (or NULL) department
    END AS dept_name
FROM
    employees;

-- Searched CASE: each WHEN carries its own condition; branches are tested
-- top to bottom and the first one that holds wins.
SELECT
    score,
    CASE
        WHEN score >= 90 THEN '优秀'
        WHEN score >= 80 THEN '良好'
        WHEN score >= 60 THEN '及格'
        ELSE '不及格'
    END AS grade
FROM
    students;

-- COALESCE returns its first non-NULL argument.
SELECT COALESCE(NULL, NULL, 'default');  -- default

-- IF(condition, value_when_true, value_when_false)
SELECT IF(score >= 60, '及格', '不及格') FROM scores;

-- NVL(x, fallback): fallback when x is NULL.
-- NVL2(x, a, b): a when x is NOT NULL, otherwise b.
-- Both read a column, so they need a FROM clause to resolve it
-- (`your_table` is a placeholder).
SELECT NVL(null_col, 'default_value') FROM your_table;
SELECT NVL2(col, 'not_null', 'is_null') FROM your_table;

-- DECODE(expr, v1, r1, v2, r2, ..., default): behaves like a simple CASE.
SELECT DECODE(status, 1, '激活', 0, '禁用', '未知') FROM users;

窗口函数(重要)

sql 复制代码
-- Ranking functions, all ordered by score from high to low.
SELECT
    name,
    score,
    ROW_NUMBER() OVER (ORDER BY score DESC) AS rn,   -- unique, consecutive numbers
    RANK()       OVER (ORDER BY score DESC) AS rk,   -- ties share a rank, gaps follow
    DENSE_RANK() OVER (ORDER BY score DESC) AS drk   -- ties share a rank, no gaps
FROM
    students;
-- Aggregates as window functions: every employee row is kept and the
-- per-department figures are attached to it.

SELECT
    department,
    name,
    salary,
    SUM(salary) OVER (PARTITION BY department) AS dept_total,
    AVG(salary) OVER (PARTITION BY department) AS dept_avg,
    -- Moving average: the current row plus the two rows before it,
    -- in hire_date order within the department.
    AVG(salary) OVER (
        PARTITION BY department
        ORDER BY hire_date
        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ) AS moving_avg
FROM
    employees;

-- Offset functions: read a value from another row of the same window.
SELECT
    dt,
    sales,
    LAG(sales, 1)  OVER (ORDER BY dt) AS prev_day_sales,   -- one row back
    LEAD(sales, 1) OVER (ORDER BY dt) AS next_day_sales,   -- one row ahead
    -- earliest dt's sales within each month
    FIRST_VALUE(sales) OVER (PARTITION BY month ORDER BY dt) AS month_first_sales
FROM
    daily_sales;

ARRAY/MAP/炸开处理

sql 复制代码
-- ARRAY: build an array, count its elements, test membership.
SELECT
    ARRAY(1, 2, 3)                    AS arr,
    SIZE(ARRAY(1, 2, 3))              AS arr_size,
    ARRAY_CONTAINS(ARRAY(1, 2, 3), 2) AS contains_2;

-- MAP: build a map, list its keys and values, look up one key.
-- A column alias cannot be referenced by sibling expressions in the same
-- SELECT list, so the map is built in a derived table first and then
-- reused by name in the outer query.
-- `keys` / `values` are back-quoted because VALUES is a reserved word.
SELECT
    my_map,
    MAP_KEYS(my_map)   AS `keys`,
    MAP_VALUES(my_map) AS `values`,
    my_map['key1']     AS value_by_key
FROM (
    SELECT MAP('key1', 'value1', 'key2', 'value2') AS my_map
) m;

-- EXPLODE: expands one array value into multiple rows, one per element.
SELECT
    EXPLODE(SPLIT('a,b,c', ',')) AS item;

-- COLLECT_LIST / COLLECT_SET: the reverse direction, gathering the rows of
-- each group into a single array (LIST keeps duplicates, SET drops them).
SELECT
    department,
    COLLECT_LIST(name) AS name_list,
    COLLECT_SET(name)  AS name_set
FROM
    employees
GROUP BY
    department;

常见示例

sql 复制代码
-- De-duplicate with ROW_NUMBER: number each user's logins from newest to
-- oldest and keep only row 1, i.e. the latest login per user_id.
SELECT *
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY login_time DESC) AS rn
    FROM login_logs
) ranked
WHERE rn = 1;

-- Hierarchy traversal.
-- Oracle-style START WITH / CONNECT BY PRIOR is not HiveQL syntax, so the
-- tree is walked with a fixed number of self-joins instead.
-- This version returns the root employees (manager_id IS NULL) plus two
-- levels of reports; add one more CTE per extra level you need.
-- NOTE(review): if your engine version offers recursive CTEs, prefer those
-- for trees of unknown depth - confirm against its documentation.
WITH level_1 AS (
    SELECT *
    FROM employees
    WHERE manager_id IS NULL
),
level_2 AS (
    SELECT e.*
    FROM employees e
    INNER JOIN level_1 p ON e.manager_id = p.id
),
level_3 AS (
    SELECT e.*
    FROM employees e
    INNER JOIN level_2 p ON e.manager_id = p.id
)
SELECT * FROM level_1
UNION ALL
SELECT * FROM level_2
UNION ALL
SELECT * FROM level_3;

-- Random sample of up to 100 rows: keep each row with probability 0.001
-- (about 0.1% of the table), then cap the result at 100 rows.
-- Fewer than 100 rows come back when the 0.1% sample itself is smaller.
SELECT *
FROM
    large_table
WHERE
    RAND() < 0.001
LIMIT 100;

-- Stratified sample: shuffle the products inside each category with a
-- random ordering, then keep the first 10 of every category.
WITH shuffled AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY category ORDER BY RAND()) AS rn
    FROM products
)
SELECT *
FROM shuffled
WHERE rn <= 10;  -- at most 10 rows per category

-- Good: the partition column dt is filtered with a literal, so only the
-- matching partition is read. The position of the dt predicate inside the
-- WHERE clause does not matter - AND is commutative and the optimizer
-- prunes partitions either way.
SELECT * FROM sales 
WHERE dt = '2023-01-01' AND amount > 1000;

-- Bad: no predicate on the partition column at all, so every partition
-- of the table has to be scanned.
SELECT * FROM sales 
WHERE amount > 1000;


-- Joining a small table to a large one: the MAPJOIN hint asks the engine to
-- load the small side into memory instead of shuffling both sides.
-- The hint must name the alias used in the query (b), not the underlying
-- table name, because the table is aliased in the FROM clause.
SELECT /*+ MAPJOIN(b) */
    a.*, b.*
FROM large_table a
JOIN small_table b ON a.id = b.id;

常用检测

sql 复制代码
-- Quick profile of one column: row count, NULL / non-NULL counts,
-- cardinality and value range.
SELECT
    COUNT(*)                     AS total,
    COUNT(col1)                  AS not_null_count,  -- COUNT(col) skips NULLs
    COUNT(DISTINCT col1)         AS unique_count,
    SUM(IF(col1 IS NULL, 1, 0))  AS null_count,
    MIN(col1)                    AS min_value,
    MAX(col1)                    AS max_value
FROM
    your_table;


-- Attach a running serial number (1, 2, 3, ...) to every row, ordered by id.
SELECT
    ROW_NUMBER() OVER (ORDER BY t.id) AS serial_no,
    t.*
FROM
    your_table t;

-- Generate a date sequence (time dimension): 365 consecutive days starting
-- at 2023-01-01.
-- The inner query numbers the rows of some_table 0, 1, 2, ...; the outer
-- filter keeps offsets 0..364. Filtering on n (rather than LIMIT, which
-- may keep any 365 rows of the numbered set) guarantees a gap-free
-- sequence. some_table only serves as a row source and must contain at
-- least 365 rows.
SELECT 
    DATEADD('2023-01-01', n, 'dd') AS date_seq
FROM (
    SELECT ROW_NUMBER() OVER (ORDER BY id) - 1 AS n
    FROM some_table
) t
WHERE n < 365;
相关推荐
jinxinyuuuus1 天前
vsGPU:硬件参数的数据仓库设计、ETL流程与前端OLAP分析
前端·数据仓库·etl
编织幻境的妖1 天前
ETL、数据仓库与数据湖详解
数据仓库·etl
爱吃大芒果1 天前
Flutter 本地存储方案:SharedPreferences、SQFlite 与 Hive
开发语言·javascript·hive·hadoop·flutter·华为·harmonyos
咨询qq 8762239651 天前
三相逆变器MPC控制:从原理到仿真实践
数据仓库
shjita1 天前
hadoop运行jar包的相关配置参考!
大数据·hadoop·分布式
yumgpkpm1 天前
AI大模型手机的“简单替换陷阱”与Hadoop、Cloudera CDP 7大数据底座的关系探析
大数据·人工智能·hadoop·华为·spark·kafka·cloudera
yumgpkpm1 天前
(简略)AI 大模型 手机的“简单替换陷阱”与Hadoop、Cloudera CDP 7大数据底座的关系探析
人工智能·hive·zookeeper·flink·spark·kafka·开源
yumgpkpm1 天前
Cloudera CDP 7.3下载地址、方式,开源适配 CMP 7.3(或类 CDP 的 CMP 7.13 平台,如华为鲲鹏 ARM 版)值得推荐
大数据·hive·hadoop·分布式·华为·开源·cloudera
笨蛋少年派2 天前
数据仓库系统建设:数据采集、预处理与集成
数据仓库