1. 基本数据表结构
假设有一个登录记录表:
sql
-- Login history: one row per login event. The same user may appear more
-- than once for the same day, so streak queries deduplicate
-- (user_id, login_date) before analysing the data.
CREATE TABLE login_log (
    user_id    INT  NOT NULL,  -- user who logged in
    login_date DATE NOT NULL   -- calendar day of the login (no time part)
);
示例数据:
sql
-- Sample data: user 1 has two separate 3-day streaks, user 2 never reaches
-- 3 consecutive days. The column list is explicit so the statement keeps
-- working if columns are added to or reordered in login_log.
INSERT INTO login_log (user_id, login_date) VALUES
    (1, '2024-01-01'),
    (1, '2024-01-02'),
    (1, '2024-01-03'),
    (1, '2024-01-05'),  -- gap: user 1 has no login on 2024-01-04
    (1, '2024-01-06'),
    (1, '2024-01-07'),
    (2, '2024-01-01'),
    (2, '2024-01-02'),
    (2, '2024-01-04');  -- gap: user 2 has no login on 2024-01-03
2. 方法一:使用窗口函数(推荐)
2.1 查询连续登录3天及以上的用户
sql
-- Gaps-and-islands: list every run of at least 3 consecutive login days
-- per user, with its first day, last day and length.
WITH distinct_logins AS (
    -- Collapse several logins on the same day into a single row.
    SELECT DISTINCT
        user_id,
        login_date
    FROM login_log
),
numbered_logins AS (
    SELECT
        user_id,
        login_date,
        ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY login_date) AS seq
    FROM distinct_logins
),
islands AS (
    -- Inside one run of consecutive days the date and seq both advance by 1,
    -- so (login_date - seq days) stays constant and identifies the run.
    SELECT
        user_id,
        login_date,
        DATE_SUB(login_date, INTERVAL seq DAY) AS island_key
    FROM numbered_logins
)
SELECT
    user_id,
    MIN(login_date) AS start_date,
    MAX(login_date) AS end_date,
    COUNT(*) AS consecutive_days
FROM islands
GROUP BY user_id, island_key
HAVING COUNT(*) >= 3
ORDER BY user_id, start_date;
2.2 使用LEAD/LAG函数的简化版
sql
-- List every login date that belongs to a run of at least 3 consecutive
-- days, using the two previous and two following distinct login dates.
WITH distinct_logins AS (
    -- One row per user per day, so neighbouring rows are distinct dates.
    SELECT DISTINCT
        user_id,
        login_date
    FROM login_log
),
login_neighbors AS (
    SELECT
        user_id,
        login_date,
        LAG(login_date, 1) OVER w AS prev_date,
        LAG(login_date, 2) OVER w AS prev2_date,
        LEAD(login_date, 1) OVER w AS next_date,
        LEAD(login_date, 2) OVER w AS next2_date
    FROM distinct_logins
    WINDOW w AS (PARTITION BY user_id ORDER BY login_date)
)
SELECT
    user_id,
    login_date AS consecutive_date
FROM login_neighbors
WHERE
    -- Last day of a 3-day window: the login two rows back is exactly 2 days
    -- earlier (dates are distinct and ordered, so the row between them must
    -- be the day in the middle).
    prev2_date = login_date - INTERVAL 2 DAY
    -- Middle day: both direct neighbours are adjacent calendar days.
    OR (
        prev_date = login_date - INTERVAL 1 DAY
        AND next_date = login_date + INTERVAL 1 DAY
    )
    -- First day of a 3-day window: the login two rows ahead is exactly
    -- 2 days later.
    OR next2_date = login_date + INTERVAL 2 DAY
ORDER BY user_id, consecutive_date;
3. 方法二:使用自连接
sql
-- Users who logged in on three consecutive days (d, d+1, d+2), found by
-- joining login_log to itself once per following day.
-- The inner joins already require the d+1 and d+2 rows to exist, so no
-- additional EXISTS checks are needed. DISTINCT collapses the repeated
-- matches produced by longer streaks or by several logins on one day.
SELECT DISTINCT
    day1.user_id
FROM login_log AS day1
INNER JOIN login_log AS day2
    ON day2.user_id = day1.user_id
    AND day2.login_date = day1.login_date + INTERVAL 1 DAY
INNER JOIN login_log AS day3
    ON day3.user_id = day1.user_id
    AND day3.login_date = day1.login_date + INTERVAL 2 DAY;
4. 方法三:使用递归CTE(复杂但功能强大)
sql
-- Longest login streak per user, computed by walking each user's login
-- days in date order with a recursive CTE. Only users whose longest streak
-- is at least 3 days are returned.
-- NOTE: the recursion depth equals the number of distinct login days of a
-- user; MySQL aborts the query once cte_max_recursion_depth (default 1000)
-- is exceeded, so raise that setting for long histories.
WITH RECURSIVE login_steps AS (
    -- One row per user per day, numbered in date order. Deduplicating here
    -- keeps the recursion at exactly one row per step; joining the raw
    -- table would multiply rows whenever a day has several logins.
    SELECT
        user_id,
        login_date,
        ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY login_date) AS seq
    FROM (
        SELECT DISTINCT
            user_id,
            login_date
        FROM login_log
    ) AS distinct_logins
),
consecutive_login AS (
    -- Anchor: each user's first login day starts a streak of length 1.
    SELECT
        user_id,
        login_date,
        seq,
        login_date AS start_date,
        1 AS consecutive_days
    FROM login_steps
    WHERE seq = 1

    UNION ALL

    -- Recursive step: move to the user's next login day. If it is the very
    -- next calendar day the streak grows, otherwise a new streak starts.
    SELECT
        cl.user_id,
        nxt.login_date,
        nxt.seq,
        CASE
            WHEN nxt.login_date = cl.login_date + INTERVAL 1 DAY
                THEN cl.start_date
            ELSE nxt.login_date
        END AS start_date,
        CASE
            WHEN nxt.login_date = cl.login_date + INTERVAL 1 DAY
                THEN cl.consecutive_days + 1
            ELSE 1
        END AS consecutive_days
    FROM consecutive_login AS cl
    INNER JOIN login_steps AS nxt
        ON nxt.user_id = cl.user_id
        AND nxt.seq = cl.seq + 1
)
SELECT
    user_id,
    MAX(consecutive_days) AS max_consecutive_days
FROM consecutive_login
GROUP BY user_id
HAVING MAX(consecutive_days) >= 3;
5. 实用查询示例
5.1 查询每个用户的最大连续登录天数
sql
-- Longest run of consecutive login days for every user.
WITH distinct_logins AS (
    -- Collapse several logins on the same day into a single row.
    SELECT DISTINCT
        user_id,
        login_date
    FROM login_log
),
ranked_logs AS (
    -- Inside one run of consecutive days the date and the row number both
    -- advance by 1, so (login_date - row number days) is constant per run.
    SELECT
        user_id,
        login_date,
        DATE_SUB(
            login_date,
            INTERVAL ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY login_date) DAY
        ) AS group_date
    FROM distinct_logins
),
streaks AS (
    -- One row per run. This step is deliberately not aliased "groups":
    -- GROUPS is a reserved word in MySQL 8.0.2 and later.
    SELECT
        user_id,
        group_date,
        COUNT(*) AS consecutive_days
    FROM ranked_logs
    GROUP BY user_id, group_date
)
SELECT
    user_id,
    MAX(consecutive_days) AS max_consecutive_days
FROM streaks
GROUP BY user_id
-- user_id breaks ties so the output order is deterministic.
ORDER BY max_consecutive_days DESC, user_id;
5.2 查询指定时间段内的连续登录
sql
-- Runs of at least 7 consecutive login days inside January 2024.
-- Streaks are cut at the period boundaries because only logins within the
-- period are considered.
WITH period_logins AS (
    -- Distinct login days within the inclusive date range.
    SELECT DISTINCT
        user_id,
        login_date
    FROM login_log
    WHERE login_date >= '2024-01-01'
      AND login_date <= '2024-01-31'
),
numbered_logins AS (
    SELECT
        user_id,
        login_date,
        ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY login_date) AS seq
    FROM period_logins
),
islands AS (
    -- (login_date - seq days) is constant within one run of consecutive days.
    SELECT
        user_id,
        login_date,
        DATE_SUB(login_date, INTERVAL seq DAY) AS island_key
    FROM numbered_logins
)
SELECT
    user_id,
    COUNT(*) AS consecutive_days,
    MIN(login_date) AS start_date,
    MAX(login_date) AS end_date
FROM islands
GROUP BY user_id, island_key
HAVING COUNT(*) >= 7  -- minimum streak length in days
ORDER BY consecutive_days DESC;
5.3 查询连续登录中断的情况
sql
-- Breaks in login activity: for each pair of successive login days that
-- are more than one day apart, report both days and the days missed.
WITH distinct_logins AS (
    SELECT DISTINCT
        user_id,
        login_date
    FROM login_log
),
login_pairs AS (
    -- Pair every login day with the user's next login day (NULL for the
    -- user's last login).
    SELECT
        user_id,
        login_date,
        LEAD(login_date) OVER (PARTITION BY user_id ORDER BY login_date) AS next_date
    FROM distinct_logins
),
login_gaps AS (
    SELECT
        user_id,
        login_date,
        next_date,
        DATEDIFF(next_date, login_date) AS gap_days
    FROM login_pairs
)
SELECT
    user_id,
    login_date AS last_login_before_gap,
    next_date AS next_login_after_gap,
    gap_days - 1 AS missed_days
FROM login_gaps
-- A NULL gap (last login of a user) is filtered out by this comparison.
WHERE gap_days > 1
ORDER BY user_id, login_date;
6. 性能优化建议
- 创建索引:
sql
-- Composite index matching the PARTITION BY user_id ORDER BY login_date
-- pattern and the (user_id, login_date) lookups used by the queries.
CREATE INDEX idx_user_login
    ON login_log (user_id, login_date);
- 分区表:如果数据量很大,按月份或用户ID范围分区
- 物化视图:对于频繁查询的结果可以创建物化视图
- 定期清理:删除历史数据,只保留最近N天的数据
7. 不同数据库的语法差异
| 函数/特性 | MySQL | PostgreSQL | SQL Server | Oracle |
|---|---|---|---|---|
| 日期加减 | DATE_ADD() | + INTERVAL | DATEADD() | + INTERVAL |
| 日期差 | DATEDIFF() | - | DATEDIFF() | - |
| 行号 | ROW_NUMBER()(8.0+) | ROW_NUMBER() | ROW_NUMBER() | ROW_NUMBER() |
| 递归CTE | 支持(8.0+) | 支持 | 支持 | 支持 |
选择哪种方法取决于:
- 数据量:大数据量建议使用窗口函数
- 查询频率:频繁查询建议建立物化视图
- 数据库版本:确保支持相关函数
- 业务需求:需要实时结果,还是可以接受一定延迟