初赛已结束,没进决赛。
sql
WITH RECURSIVE
-- a: puzzles whose text, after removing '?' and line feeds, is at least
-- 30 characters long. The puzzle is flattened to one string: line feeds are
-- dropped and every '?' becomes '0'.
a AS (
    SELECT
        id AS rn,
        replace(replace(puzzle, chr(10), ''), '?', '0') AS b
    FROM sudoku9_9
    WHERE length(replace(replace(puzzle, '?', ''), chr(10), '')) >= 30
),
-- b: same puzzles, oriented for the sequential solver. flag = 0 when
-- characters 1-40 contain more non-'0' characters than characters 42-81 and
-- the string is kept as is; otherwise flag = 1 and the string is reversed.
b AS (
    SELECT
        scored.rn,
        CASE scored.flag
            WHEN 0 THEN scored.b
            ELSE reverse(scored.b)
        END AS b,
        scored.flag
    FROM (
        SELECT
            a.rn,
            a.b,
            CASE
                WHEN length(replace(substr(a.b, 1, 40), '0', ''))
                     > length(replace(substr(a.b, 42, 40), '0', ''))
                    THEN 0
                ELSE 1
            END AS flag
        FROM a
    ) AS scored
),
-- d: the integers 1..81, produced by counting up from 1.
d(lp) AS (
    VALUES (1)
    UNION ALL
    SELECT d.lp + 1
    FROM d
    WHERE d.lp < 81
),
-- grid: for each cell position pos (1..81), its zero-based row r, column c
-- and 3x3 box g, all derived with integer division and modulo.
grid AS (
    SELECT
        d.lp AS pos,
        (d.lp - 1) / 9 AS r,
        (d.lp - 1) % 9 AS c,
        (d.lp - 1) / 9 / 3 * 3 + (d.lp - 1) % 9 / 3 AS g
    FROM d
)
-- all_pos: one row per (cell position, digit 1..9) with precomputed masks.
-- A (unit, digit) pair maps to bit index unit*9 + n - 1 (0..80). Indices
-- 0..42 are stored in the *_l bigint at that bit; indices 43..80 are stored
-- in the *_h bigint at bit (index - 42). The other half is 0.
,all_pos AS (
    SELECT
        grid.pos,
        digit.n,
        CASE WHEN bits.r_bit > 42 THEN 1::bigint << (bits.r_bit - 42) ELSE 0 END AS r_h,
        CASE WHEN bits.c_bit > 42 THEN 1::bigint << (bits.c_bit - 42) ELSE 0 END AS c_h,
        CASE WHEN bits.g_bit > 42 THEN 1::bigint << (bits.g_bit - 42) ELSE 0 END AS g_h,
        CASE WHEN bits.r_bit > 42 THEN 0 ELSE 1::bigint << bits.r_bit END AS r_l,
        CASE WHEN bits.c_bit > 42 THEN 0 ELSE 1::bigint << bits.c_bit END AS c_l,
        CASE WHEN bits.g_bit > 42 THEN 0 ELSE 1::bigint << bits.g_bit END AS g_l
    FROM grid
    CROSS JOIN generate_series(1, 9) AS digit(n)
    CROSS JOIN LATERAL (
        SELECT
            grid.r * 9 + digit.n - 1 AS r_bit,
            grid.c * 9 + digit.n - 1 AS c_bit,
            grid.g * 9 + digit.n - 1 AS g_bit
    ) AS bits
)
-- t: breadth-first solver for the puzzles in b that fills empty cells in
-- string order.
--   rn        puzzle id
--   s         current board string ('0' = empty cell)
--   rs_*/cs_*/gs_*  used-digit masks for rows / columns / boxes, split into
--             a high and a low bigint exactly as laid out in all_pos
--   next_pos  position of the next '0' in s, or 0 when the board is full
,t(rn, s, rs_h, cs_h, gs_h, rs_l, cs_l, gs_l, next_pos) AS (
    -- Seed: add up the mask bits of every given digit. SUM behaves like a
    -- bitwise OR as long as no digit is given twice in the same row, column
    -- or box -- NOTE(review): not checked here; confirm inputs are valid.
    SELECT
        p.rn,
        CAST(p.b AS text),
        SUM(ap.r_h)::bigint AS rs_h,
        SUM(ap.c_h)::bigint AS cs_h,
        SUM(ap.g_h)::bigint AS gs_h,
        SUM(ap.r_l)::bigint AS rs_l,  -- was mislabeled rs_h (copy-paste)
        SUM(ap.c_l)::bigint AS cs_l,
        SUM(ap.g_l)::bigint AS gs_l,
        position('0' in p.b)
    FROM b AS p
    INNER JOIN all_pos AS ap
        -- '0' cells never match because all_pos.n ranges over 1..9
        ON ap.n = SUBSTR(p.b, ap.pos, 1)::int
    GROUP BY p.rn, p.b
    UNION ALL
    -- Step: place every digit at next_pos whose row, column and box bits
    -- are all still clear, then advance next_pos to the following '0'.
    SELECT
        t.rn,
        SUBSTR(t.s, 1, t.next_pos - 1) || ap.n || SUBSTR(t.s, t.next_pos + 1),
        t.rs_h + ap.r_h,
        t.cs_h + ap.c_h,
        t.gs_h + ap.g_h,
        t.rs_l + ap.r_l,
        t.cs_l + ap.c_l,
        t.gs_l + ap.g_l,
        CASE
            WHEN position('0' in SUBSTR(t.s, t.next_pos + 1)) > 0
                THEN position('0' in SUBSTR(t.s, t.next_pos + 1)) + t.next_pos
            ELSE 0
        END
    FROM t
    INNER JOIN all_pos AS ap
        -- next_pos = 0 matches no all_pos row, which ends the recursion
        ON ap.pos = t.next_pos
    WHERE (t.rs_h & ap.r_h) = 0
      AND (t.cs_h & ap.c_h) = 0
      AND (t.gs_h & ap.g_h) = 0
      AND (t.rs_l & ap.r_l) = 0
      AND (t.cs_l & ap.c_l) = 0
      AND (t.gs_l & ap.g_l) = 0
)
-- rev_result: one output row per puzzle in b, with the puzzle and its
-- solution restored to the original orientation (reversed back when
-- flag <> 0) and re-split into 9-character lines joined by line feeds.
-- The id comes from b, the preserved side of the LEFT JOIN, so a puzzle
-- with no completed board in t still carries its id (result is NULL).
-- Taking it from the joined solution made the id NULL in that case.
,rev_result AS (
    SELECT
        p.rn AS id,
        rtrim(
            regexp_replace(
                CASE p.flag WHEN 0 THEN p.b ELSE reverse(p.b) END,
                '(.{9})', '\1' || chr(10), 'g'
            ),
            chr(10)
        ) AS puzzle,
        rtrim(
            regexp_replace(
                CASE p.flag WHEN 0 THEN sol.s ELSE reverse(sol.s) END,
                '(.{9})', '\1' || chr(10), 'g'
            ),
            chr(10)
        ) AS result
    FROM b AS p
    LEFT JOIN (
        -- keep a single completed board (next_pos = 0) per puzzle
        SELECT
            ranked.rn,
            ranked.s
        FROM (
            SELECT
                t.rn,
                t.s,
                row_number() OVER (PARTITION BY t.rn ORDER BY t.rn) AS resn
            FROM t
            WHERE t.next_pos = 0
        ) AS ranked
        WHERE ranked.resn = 1
    ) AS sol
        ON sol.rn = p.rn
),
-- initial: starting state for puzzles whose text, after removing '?' and
-- line feeds, is shorter than 30 characters. board is the flattened puzzle
-- ('?' -> '0', line feeds removed). rows / cols / boxes are 9-element int
-- arrays; bit (digit - 1) is set for every digit already present in that
-- unit. positions lists the 1-based positions of the '0' cells.
initial AS (
    SELECT
        src.id,
        src.puzzle AS board,
        -- Row masks: the SUM result is cast back to int
        (
            SELECT array_agg(row_mask.m)
            FROM (
                SELECT SUM(CASE WHEN digit.val > 0 THEN 1 << (digit.val - 1) ELSE 0 END)::int AS m
                FROM generate_series(0, 8) AS row_no
                INNER JOIN LATERAL (
                    SELECT SUBSTR(src.puzzle, row_no * 9 + k, 1) AS ch
                    FROM generate_series(1, 9) AS k
                ) AS cell ON true
                CROSS JOIN LATERAL (SELECT (cell.ch::text)::int AS val) AS digit
                GROUP BY row_no
                ORDER BY row_no
            ) AS row_mask
        ) AS rows,
        -- Column masks
        (
            SELECT array_agg(col_mask.m)
            FROM (
                SELECT SUM(CASE WHEN digit.val > 0 THEN 1 << (digit.val - 1) ELSE 0 END)::int AS m
                FROM generate_series(1, 9) AS col_no
                INNER JOIN LATERAL (
                    SELECT SUBSTR(src.puzzle, (k - 1) * 9 + col_no, 1) AS ch
                    FROM generate_series(1, 9) AS k
                ) AS cell ON true
                CROSS JOIN LATERAL (SELECT (cell.ch::text)::int AS val) AS digit
                GROUP BY col_no
                ORDER BY col_no
            ) AS col_mask
        ) AS cols,
        -- Box masks: box_no selects the 3x3 box, k walks its nine cells
        (
            SELECT array_agg(box_mask.m)
            FROM (
                SELECT SUM(CASE WHEN digit.val > 0 THEN 1 << (digit.val - 1) ELSE 0 END)::int AS m
                FROM generate_series(0, 8) AS box_no
                INNER JOIN LATERAL (
                    SELECT SUBSTR(
                               src.puzzle,
                               (box_no / 3) * 27 + (box_no % 3) * 3 + ((k - 1) / 3) * 9 + ((k - 1) % 3) + 1,
                               1
                           ) AS ch
                    FROM generate_series(1, 9) AS k
                ) AS cell ON true
                CROSS JOIN LATERAL (SELECT (cell.ch::text)::int AS val) AS digit
                GROUP BY box_no
                ORDER BY box_no
            ) AS box_mask
        ) AS boxes,
        -- Positions of the cells that are still empty
        (
            SELECT array_agg(p.pos::smallint)
            FROM generate_series(1, 81) AS p(pos)
            WHERE SUBSTR(src.puzzle, p.pos, 1) = '0'
        ) AS positions
    FROM (
        SELECT
            id,
            replace(replace(puzzle, '?', '0'), chr(10), '') AS puzzle
        FROM sudoku9_9
        WHERE length(replace(replace(puzzle, '?', ''), chr(10), '')) < 30
    ) AS src
),
-- solve: recursive solver for the puzzles in initial. Each row is a board
-- state: the board string, the used-digit masks per row / column / box, a
-- solved flag and the array of still-empty positions. Every iteration
-- extends each unsolved board at one chosen empty position.
solve AS (
-- Seed: every puzzle from initial, marked as not solved.
SELECT id, board::text board, rows, cols, boxes, false as solved,positions FROM initial
UNION ALL
(
-- current_level: the boards from the previous iteration that still have work left.
WITH current_level AS (
SELECT * FROM solve WHERE NOT solved
),
-- all_candidates: one row per (board, empty position) with the 9-bit mask of
-- digits not yet used in that position's row, column and box.
all_candidates AS (
SELECT id,
cl.board, cl.rows, cl.cols, cl.boxes,
pos, positions,
-- Key fix: cast every array element to ::int and parenthesize the bitwise operations
( ( (cl.rows[(pos-1)/9 + 1]::int | cl.cols[(pos-1)%9 + 1]::int | cl.boxes[((pos-1)/27*3 + (pos-1)%9/3) + 1]::int) # 511 ) & 511 )::int as available_mask
FROM (select *,unnest(positions) pos from current_level) cl
),
-- best_pos: per board, keep the single position with the fewest available
-- digits; ties are ordered by the number of bits set in that position's row mask.
best_pos AS (
select * from(
SELECT
*,
row_number()over(partition by id, board order by bit_count(available_mask::bit(9)) ASC , bit_count(rows[(pos-1)/9 + 1]::int::bit(9)))rn
FROM all_candidates
)q
where rn=1
),
-- next_step: one child board per available digit at the chosen position,
-- with the row / column / box masks updated and the position removed from
-- the remaining-positions array.
next_step AS (
SELECT id,
SUBSTR(bp.board, 1, bp.pos - 1) || n.val || SUBSTR(bp.board, bp.pos + 1) as next_board,
bp.rows[:r_idx-1] || ((bp.rows[r_idx]::int | (1 << (n.val-1)))::int) || bp.rows[r_idx+1:] as next_rows,
bp.cols[:c_idx-1] || ((bp.cols[c_idx]::int | (1 << (n.val-1)))::int) || bp.cols[c_idx+1:] as next_cols,
bp.boxes[:b_idx-1] || ((bp.boxes[b_idx]::int | (1 << (n.val-1)))::int) || bp.boxes[b_idx+1:] as next_boxes
, array_remove(positions, pos)rem_pos
FROM best_pos bp
-- Candidate digits run from the lowest set bit of available_mask to the
-- highest set bit; digits in between whose bit is clear are dropped by the
-- WHERE clause below.
CROSS JOIN LATERAL(select * from generate_series(position('1' in reverse(bp.available_mask::int::bit(9)::text)), 10-position(b'1' in bp.available_mask::bit(9))) n(val) )n
-- 1-based array indexes of the row, column and box that contain pos.
CROSS JOIN LATERAL (
SELECT ((pos-1)/9) + 1 as r_idx,
((pos-1)%9) + 1 as c_idx,
((pos-1)/27*3 + (pos-1)%9/3) + 1 as b_idx
) idx
-- Explicitly cast bp.available_mask to int and test the digit's bit with &
WHERE ((bp.available_mask::int >> (n.val-1)) & 1) = 1
)
-- A child board is solved when it contains no '0' any more.
SELECT id,
next_board, next_rows, next_cols, next_boxes,
position('0' IN next_board) = 0 , rem_pos
FROM next_step
)
)
-- Final output: id, puzzle and result for every puzzle. The first branch
-- covers the puzzles handled by solve (board re-split into 9-character
-- lines, '0' shown as '?'); the second branch appends the rows from
-- rev_result. A puzzle without a solved board keeps a NULL result.
SELECT
    i.id,
    rtrim(
        replace(
            regexp_replace(i.board, '(.{9})', '\1' || chr(10), 'g'),
            '0', '?'
        ),
        chr(10)
    ) AS puzzle,
    rtrim(
        regexp_replace(first_solution.board, '(.{9})', '\1' || chr(10), 'g'),
        chr(10)
    ) AS result
FROM initial AS i
LEFT JOIN (
    -- keep one solved board per puzzle
    SELECT
        numbered.id,
        numbered.board
    FROM (
        SELECT
            solve.id,
            row_number() OVER (PARTITION BY solve.id) AS rn,
            solve.board
        FROM solve
        WHERE solve.solved
    ) AS numbered
    WHERE numbered.rn = 1
) AS first_solution
    ON i.id = first_solution.id
UNION ALL
SELECT
    rev_result.id,
    replace(rev_result.puzzle, '0', '?'),
    rev_result.result
FROM rev_result
;
思路说明
对于一句SQL解决数独问题,由于语言的限制,递归只支持BFS,无法实现回溯,基本上只能采用穷举法。
10年前itpub版主newkid就已经提供了利用二进制位高效判断可选数的Oracle程序。我把二进制掩码由number(38)改成高位和低位两部分的bigint版本,postgresql也能用。
之所以选择postgresql而不是oracle平台,测试用postgresql解决17位数独比oracle快2倍,解决示例1000题快10倍。
高效的原因一是预计算了所有可能位置的可选数字的二进制数,避免重复计算坐标(有大量取模和除法操作)和掩码。二是用整数保存掩码,直接整体二进制底层操作,避免数组操作开销。
在本届大赛公布的次日,postgresql大神德哥发表了他用AI完成的按最小可选数量(MRV)动态选点的程序。
我对他们的程序做了以下优化:
newkid顺序选点高度依赖已知数的位置,已知数在前几行越密集,产生的候选解空间就越小,如果已知数在前后分布不一致,用reverse字符串翻转将它处理后再翻转可以提速1倍,从2.2秒到1.1秒。(AMD 8845H 16G WSL pg 17.7)
还试过行列互换、旋转、三组整体移位等变换,效果不明显。
德哥原版在pg 17.2触发了bug,把left join改为inner join规避。
原版的候选点从81个位置中选,改为从动态更新的剩余位置positions中选unnest(positions) ,速度提升了30%。这是德哥第二版。
现在手中有两个高效的版本,一个是优化newkid的,顺序选点+翻转,在处理简单(已知数大于或等于30)问题时比德哥版本快1倍。另一个是优化德哥的,在处理特别难(已知数等于17)问题时比前面版本快几十倍,而在处理中等(已知数29到30)难题时,两者用时接近。
怎么在一个程序中实现对两种难度的题目分治?
试过直接改造德哥版本,简单题用顺序选点策略(未加入翻转)代替计算最小可选数量,解决1000个示例,尝试不同的已知数据阈值,当把难题阈值设为35时,结果最佳,约2秒,比德哥第二版再提升20%。这是德哥第三版。
最后用30个已知数为分界点,缝合两个大神版本的效果最好。结果1秒。
其实仅翻转和缝合版本在处理示例数据时差距微小,加入德哥版本的考虑主要是正式比赛数据可能加入更难的题目,组委会说明有最高55个未知数的题,这样德哥版本的优势就体现了。
一些优化技巧的采用,最终将示例数据处理时间达到860毫秒。
在以count_c为第一关键字的基础上,如果排名相同,使用不同的第二关键字会影响速度:不区分时结果不稳定;设为pos时结果稳定,但并非总是最优。
引入计算较简单,开销不大的greatest(pos前密度,pos后密度)作排序第二关键字,优先选出数字密度更大处的选点。密度=数字个数*100/字符串长度。
用一个known跟踪变量记录已知数数量,当known达到某个阈值(比如35)后改用pos做第二关键字。
用bit_count位操作9-bit_count(not_available_mask::bit(9))as c_count代替字符串去0求长度 replace((not_available_mask::bit(9))::text, '0', '')来求1的个数,快10%
再用 ORDER BY id, board, c_count ASC , bit_count(rows[(pos-1)/9 + 1]::int::bit(9)),bit_count代替 pos作第二关键字,用局部密度最大代替一侧密度最大, 提高20%,
用带上下限的CROSS JOIN LATERAL(select * from generate_series(position('1' in reverse(bp.available_mask::int::bit(9)::text)), 10-position(b'1' in bp.available_mask::bit(9))) n(val) )n
替换CROSS JOIN generate_series(1, 9) n(val) ,
把原始distinct on 去重换成 row_number partition,同时去掉不再使用的known跟踪变量和greatest运算,提升10%。