目录
- 摘要
- 一、基础查询
  - [1.1 SELECT语句](<#1.1 SELECT语句>)
  - [1.2 条件过滤](<#1.2 条件过滤>)
  - [1.3 排序与限制](<#1.3 排序与限制>)
- 二、聚合查询
  - [2.1 基本聚合](<#2.1 基本聚合>)
  - [2.2 分组聚合](<#2.2 分组聚合>)
  - [2.3 分组集](<#2.3 分组集>)
- 三、连接查询
  - [3.1 连接类型](<#3.1 连接类型>)
  - [3.2 内连接](<#3.2 内连接>)
  - [3.3 左连接](<#3.3 左连接>)
  - [3.4 多表连接](<#3.4 多表连接>)
- 四、子查询
  - [4.1 标量子查询](<#4.1 标量子查询>)
  - [4.2 行子查询](<#4.2 行子查询>)
  - [4.3 表子查询](<#4.3 表子查询>)
- 五、窗口函数
  - [5.1 排序函数](<#5.1 排序函数>)
  - [5.2 聚合窗口函数](<#5.2 聚合窗口函数>)
  - [5.3 偏移函数](<#5.3 偏移函数>)
- 六、时间序列查询
  - [6.1 时间窗口](<#6.1 时间窗口>)
  - [6.2 时间对齐](<#6.2 时间对齐>)
  - [6.3 重采样](<#6.3 重采样>)
- 七、复杂查询实战
  - [7.1 分组Top-N](<#7.1 分组Top-N>)
  - [7.2 同比环比](<#7.2 同比环比>)
  - [7.3 连续区间](<#7.3 连续区间>)
- 八、查询优化
  - [8.1 执行计划](<#8.1 执行计划>)
  - [8.2 优化建议](<#8.2 优化建议>)
  - [8.3 常见问题](<#8.3 常见问题>)
- 九、总结
- 参考资料
摘要
本文系统介绍DolphinDB SQL查询语言。从基础SELECT语句到复杂的多表关联,从条件过滤到分组聚合,从子查询到窗口函数,逐步带领读者掌握DolphinDB SQL查询的核心技能。通过丰富的实战案例,帮助读者编写高效的数据查询语句。
一、基础查询
1.1 SELECT语句
python
// Build a 10-row in-memory sample table used by the queries in this section:
// id 1..10, a repeating A/B/C category, an integer value, and ten consecutive
// dates starting at 2024.01.01.
t = table(
1..10 as id,
`A`B`C`A`B`C`A`B`C`A as category,
10 20 30 15 25 35 12 22 32 18 as value,
2024.01.01 + 0..9 as date
)
// Select every column
select * from t
// Select only the named columns
select id, category, value from t
// Column aliases: rename columns in the result with `as`
select id as device_id, value as temperature from t
// Computed column: the expression value * 1.8 + 32 is returned as `fahrenheit`
select id, value, value * 1.8 + 32 as fahrenheit from t
1.2 条件过滤
python
// WHERE with a single predicate
select * from t where id > 5
// Multiple predicates combined with `and`
select * from t where id > 3 and value < 30
// IN: keep rows whose category is one of the listed symbols
select * from t where category in [`A, `B]
// BETWEEN: range filter on value with bounds 15 and 30
select * from t where value between 15 and 30
// LIKE: pattern match; "A%" matches categories that start with A
select * from t where category like "A%"
// NULL check: keep rows whose value is not NULL
select * from t where value is not NULL
1.3 排序与限制
python
// Sorting: by value descending, then by category with value descending inside each category
select * from t order by value desc
select * from t order by category, value desc
// Limiting the row count with `top`; combined with `order by` it returns the 5 largest values
select top 5 * from t
select top 5 * from t order by value desc
// Pagination
// NOTE(review): confirm the target DolphinDB version accepts the `limit ... offset ...` form.
select * from t limit 5 offset 3 // skip 3 rows, then return 5 rows starting at the 4th
二、聚合查询
2.1 基本聚合
python
// Aggregate functions applied to the whole table (no grouping)
select count(*) as cnt from t
select sum(value) as total from t
select avg(value) as average from t
select max(value) as max_val, min(value) as min_val from t
// Several aggregates computed in a single query: row count, sum, mean and standard deviation
select count(*) as cnt,
sum(value) as total,
avg(value) as average,
std(value) as std_dev
from t
2.2 分组聚合
python
// GROUP BY: one result row per category with its row count, sum and mean
select category,
count(*) as cnt,
sum(value) as total,
avg(value) as average
from t
group by category
// Grouping by several columns: one result row per (category, date) combination
select category,
date,
count(*) as cnt,
avg(value) as avg_val
from t
group by category, date
// HAVING: filter the groups after aggregation; only categories whose mean exceeds 20 remain
select category, avg(value) as avg_val
from t
group by category
having avg(value) > 20
2.3 分组集
python
// NOTE(review): the three queries below use standard-SQL grouping-set syntax;
// confirm that the target DolphinDB version supports them before relying on this section.
// ROLLUP: in standard SQL, totals per (category, date), subtotals per category, and a grand total
select category, date, sum(value) as total
from t
group by rollup(category, date)
// CUBE: in standard SQL, totals for every combination of the listed columns
select category, date, sum(value) as total
from t
group by cube(category, date)
// GROUPING SETS: totals for exactly the listed groupings: by category, by date, and by both
select category, date, sum(value) as total
from t
group by grouping sets((category), (date), (category, date))
三、连接查询
3.1 连接类型
| 连接类型 | 说明 | 结果 |
|---|---|---|
| INNER JOIN | 内连接 | 交集 |
| LEFT JOIN | 左连接 | 左表全部 |
| RIGHT JOIN | 右连接 | 右表全部 |
| FULL JOIN | 全连接 | 并集 |
3.2 内连接
python
// Sample tables: t1 holds ids 1..5 while t2 only holds ids 1..3,
// so an inner join on id keeps exactly the three ids present in both tables.
t1 = table(1..5 as id, `A`B`C`D`E as name)
t2 = table(1..3 as id, 100..102 as value)
// Inner join through the built-in equi-join function.
// ej keeps only the rows that match in both tables; lj is the LEFT join and
// would also return ids 4 and 5 with a NULL value.
select * from ej(t1, t2, `id)
// Equivalent form using SQL join syntax
select t1.id, t1.name, t2.value
from t1
inner join t2 on t1.id = t2.id
3.3 左连接
python
// Left join on id: every row of t1 is kept; rows without a match in t2 get NULL in value
select * from lj(t1, t2, `id)
/*
Expected result when t2 only contains ids 1..3:
id name value
1 A 100
2 B 101
3 C 102
4 D NULL
5 E NULL
*/
3.4 多表连接
python
// Multi-table join: three tables that all carry ids 1..5
t1 = table(1..5 as id, `A`B`C`D`E as name)
t2 = table(1..5 as id, 100..104 as value)
t3 = table(1..5 as id, `X`Y`Z`X`Y as type)
// Chain two left joins, both keyed on t1.id, to pull value from t2 and type from t3
select t1.id, t1.name, t2.value, t3.type
from t1
left join t2 on t1.id = t2.id
left join t3 on t1.id = t3.id
四、子查询
4.1 标量子查询
python
// Scalar subquery (returns a single value): keep rows whose value exceeds the table-wide average
select * from t
where value > (select avg(value) from t)
// Scalar subquery in the SELECT list: attach the table-wide average to every row as avg_value
select id, value,
(select avg(value) from t) as avg_value
from t
4.2 行子查询
python
// Subquery returning multiple rows: keep every row whose category has at least one value above 25
select * from t
where category in (select distinct category from t where value > 25)
// EXISTS subquery: keep the rows of t1 for which t2 contains a row with the same id
select * from t1
where exists (select * from t2 where t2.id = t1.id)
4.3 表子查询
python
// Table subquery (returns a table): aggregate per category in the inner query,
// then filter the aggregated rows in the outer query
select category, avg_val
from (
select category, avg(value) as avg_val
from t
group by category
)
where avg_val > 20
// WITH clause (CTE): the same per-category aggregation, named `stats` and then filtered
with
stats as (
select category, avg(value) as avg_val
from t
group by category
)
select * from stats where avg_val > 20
五、窗口函数
5.1 排序函数
python
// ROW_NUMBER: number the rows inside each category, largest value first
select id, category, value,
row_number() over (partition by category order by value desc) as rank
from t
// RANK: tied rows share a rank and the ranks after a tie are skipped
select id, category, value,
rank() over (order by value desc) as rank
from t
// DENSE_RANK: tied rows share a rank and no rank is skipped afterwards
select id, category, value,
dense_rank() over (order by value desc) as rank
from t
5.2 聚合窗口函数
python
// Cumulative aggregates over rows ordered by id:
// cumsum is the running total; mavg averages the current row and the 2 rows before it
select id, value,
sum(value) over (order by id) as cumsum,
avg(value) over (order by id rows between 2 preceding and current row) as mavg
from t
// Partitioned window: the running total restarts for each category
select id, category, value,
sum(value) over (partition by category order by id) as category_cumsum
from t
5.3 偏移函数
python
// LAG / LEAD: value of the previous / next row when rows are ordered by id
select id, value,
    lag(value, 1) over (order by id) as prev_value,
    lead(value, 1) over (order by id) as next_value
from t
// FIRST / LAST: first and last value of the whole ordered set.
// A window that only specifies `order by` ends at the current row, so last()
// over such a window would simply repeat the current row's value.
// The explicit frame extends the window to the final row.
select id, value,
    first(value) over (order by id) as first_val,
    last(value) over (order by id rows between unbounded preceding and unbounded following) as last_val
from t
六、时间序列查询
6.1 时间窗口
python
// Build a 100-row time series with one sample per minute.
// The literal carries millisecond precision (.000), which makes the column a
// TIMESTAMP, so a step of 60000 means 60,000 ms = 1 minute. Without the .000
// the literal is a second-precision DATETIME and the same step would space
// the rows 60,000 seconds apart.
t = table(
    1..100 as id,
    2024.01.01T00:00:00.000 + (0..99) * 60000 as timestamp, // one row per minute
    rand(100.0, 100) as value
)
// Aggregate per 10-minute window: bar(timestamp, 10m) maps each timestamp to
// the start of its 10-minute bucket
select bar(timestamp, 10m) as time_window,
    avg(value) as avg_val,
    max(value) as max_val,
    count(*) as cnt
from t
group by bar(timestamp, 10m)
6.2 时间对齐
python
// Time-range filter: keep rows whose timestamp lies between 2024.01.01 00:00:00 and 01:00:00
select * from t
where timestamp between 2024.01.01T00:00:00 and 2024.01.01T01:00:00
// Hourly aggregation: bar(timestamp, 1h) assigns each row to its hour bucket
select bar(timestamp, 1h) as hour,
avg(value) as avg_val
from t
group by bar(timestamp, 1h)
6.3 重采样
python
// Resampling from 1-minute rows to 5-minute bars:
// per bucket, take the first, largest, smallest and last value as open / high / low / close
select bar(timestamp, 5m) as time_5m,
first(value) as open,
max(value) as high,
min(value) as low,
last(value) as close
from t
group by bar(timestamp, 5m)
七、复杂查询实战
7.1 分组Top-N
python
// Top 3 rows per category: number the rows inside each category by value descending
// in the inner query, then keep the rows numbered 1 to 3
select * from (
select id, category, value,
row_number() over (partition by category order by value desc) as rank
from t
) where rank <= 3
7.2 同比环比
python
// Period-over-period change: compare each row with the previous row ordered by date;
// mom_rate = (value - previous value) / previous value
select date, value,
lag(value, 1) over (order by date) as prev_value,
(value - lag(value, 1) over (order by date)) / lag(value, 1) over (order by date) as mom_rate
from t
// Year-over-year change: compare each row with the row 12 positions earlier.
// NOTE(review): lag(value, 12) only means "same period last year" when t holds
// exactly one row per month; confirm the data granularity before reusing this query.
select date, value,
lag(value, 12) over (order by date) as prev_year_value,
(value - lag(value, 12) over (order by date)) / lag(value, 12) over (order by date) as yoy_rate
from t
7.3 连续区间
python
// Find runs of consecutive ids whose value exceeds 20.
// After the filter, id - row_number() stays constant for as long as the ids are
// consecutive, so it serves as the key (grp) of each run.
// The outer query lists its aggregates explicitly: `select *` cannot be combined
// with `group by`, because the non-grouped columns have no single value per group.
select grp,
    min(id) as start_id,
    max(id) as end_id,
    count(*) as run_length
from (
    select id, value,
        id - row_number() over (order by id) as grp
    from t
    where value > 20
)
group by grp
having count(*) >= 3 // keep runs of 3 or more consecutive rows
八、查询优化
8.1 执行计划
python
// Show the execution plan. DolphinDB exposes it through the [HINT_EXPLAIN] hint
// placed directly after `select`; there is no standalone `explain` statement.
select [HINT_EXPLAIN] * from t where id > 5
// Measure how long a query takes to run
timer select count(*) from t
8.2 优化建议
| 优化项 | 说明 |
|---|---|
| 分区裁剪 | 在分区列上过滤 |
| 索引使用 | 在索引列上查询 |
| 减少扫描 | 只查询需要的列 |
| 避免全表 | 使用WHERE条件 |
8.3 常见问题
python
// Avoid SELECT *: list only the columns that are needed
select id, value from t // good
select * from t // avoid
// Filter on the partitioning column (assumed here to be date)
select id, value from t
where date between 2024.01.01 and 2024.01.31 // good
// Do not wrap the filtered column in a function.
// Both queries below select the rows of year 2024; the first one expresses the
// condition as a plain range on the column itself.
select id, value from t where date between 2024.01.01 and 2024.12.31 // good
select id, value from t where year(date) = 2024 // avoid
九、总结
本文系统介绍了DolphinDB SQL查询:
- 基础查询:SELECT、WHERE、ORDER BY
- 聚合查询:GROUP BY、HAVING、分组集
- 连接查询:内连接、左连接、多表连接
- 子查询:标量、行、表子查询
- 窗口函数:排序、聚合、偏移
- 时间序列:时间窗口、重采样
- 查询优化:执行计划、优化建议
思考题:
- 如何选择合适的连接类型?
- 窗口函数和GROUP BY有什么区别?
- 如何优化大数据量查询?