NineData第三届数据库编程大赛:用一条 SQL 解数独问题——我的参赛程序

初赛已结束,没进决赛。

sql 复制代码
WITH RECURSIVE  
-- a: puzzles that have at least 30 characters other than '?' and line feed,
-- flattened to one string (line feeds removed, '?' rewritten as '0').
a AS (
    SELECT
        src.id AS rn,
        replace(replace(src.puzzle, chr(10), ''), '?', '0') AS b
    FROM sudoku9_9 AS src
    WHERE length(replace(replace(src.puzzle, '?', ''), chr(10), '')) >= 30
),
-- b: choose the orientation in which each board is searched.
-- flag = 0 (board kept as is) when characters 1-40 hold strictly more
-- non-'0' characters than characters 42-81; otherwise flag = 1 and the
-- board is stored reversed.
b AS (
    SELECT
        scored.rn,
        CASE WHEN scored.flag = 0 THEN scored.b ELSE reverse(scored.b) END AS b,
        scored.flag
    FROM (
        SELECT
            a.rn,
            a.b,
            CASE
                WHEN length(replace(substr(a.b, 1, 40), '0', ''))
                   > length(replace(substr(a.b, 42, 40), '0', ''))
                    THEN 0
                ELSE 1
            END AS flag
        FROM a
    ) AS scored
),
-- d: the integers 1 through 81, one row per board position.
d(lp) AS (
    SELECT 1
    UNION ALL
    SELECT d.lp + 1
    FROM d
    WHERE d.lp < 81
),
-- grid: for every position 1..81, its zero-based row (r), column (c)
-- and 3x3 box (g) index, all computed with integer division.
grid AS (
    SELECT
        d.lp AS pos,
        (d.lp - 1) / 9 AS r,
        (d.lp - 1) % 9 AS c,
        ((d.lp - 1) / 9 / 3) * 3 + ((d.lp - 1) % 9) / 3 AS g
    FROM d
)
-- all_pos: one row per (position, digit) pair with the single-bit masks that
-- placing digit n at that position would occupy in its row, column and box.
-- The bit index is unit * 9 + n - 1 (0..80). Indexes above 42 are stored in
-- the *_h columns shifted down by 42; all others go into the *_l columns.
,all_pos AS (
    SELECT
        cell.pos,
        digit.n,
        CASE WHEN cell.r * 9 + digit.n - 1 > 42
             THEN 1::bigint << (cell.r * 9 + digit.n - 1 - 42)
             ELSE 0
        END AS r_h,
        CASE WHEN cell.c * 9 + digit.n - 1 > 42
             THEN 1::bigint << (cell.c * 9 + digit.n - 1 - 42)
             ELSE 0
        END AS c_h,
        CASE WHEN cell.g * 9 + digit.n - 1 > 42
             THEN 1::bigint << (cell.g * 9 + digit.n - 1 - 42)
             ELSE 0
        END AS g_h,
        CASE WHEN cell.r * 9 + digit.n - 1 > 42
             THEN 0
             ELSE 1::bigint << (cell.r * 9 + digit.n - 1)
        END AS r_l,
        CASE WHEN cell.c * 9 + digit.n - 1 > 42
             THEN 0
             ELSE 1::bigint << (cell.c * 9 + digit.n - 1)
        END AS c_l,
        CASE WHEN cell.g * 9 + digit.n - 1 > 42
             THEN 0
             ELSE 1::bigint << (cell.g * 9 + digit.n - 1)
        END AS g_l
    FROM grid AS cell
    CROSS JOIN generate_series(1, 9) AS digit(n)
)
-- t: level-by-level search over the boards in b, always filling the first
-- remaining '0' of the board string.
--   rn                  puzzle id
--   s                   current board string
--   rs_*/cs_*/gs_*      occupied-digit bit masks for rows / columns / boxes,
--                       split into high (_h) and low (_l) bigint halves
--   next_pos            1-based position of the next '0', or 0 when none is left
,t(rn, s, rs_h, cs_h, gs_h, rs_l, cs_l, gs_l, next_pos) AS (
    -- Anchor: add up the masks of every given digit. A '0' cell matches no
    -- all_pos row because n only ranges over 1..9.
    SELECT
        p.rn,
        CAST(p.b AS text),
        SUM(ap.r_h)::bigint AS rs_h,
        SUM(ap.c_h)::bigint AS cs_h,
        SUM(ap.g_h)::bigint AS gs_h,
        SUM(ap.r_l)::bigint AS rs_l,
        SUM(ap.c_l)::bigint AS cs_l,
        SUM(ap.g_l)::bigint AS gs_l,
        position('0' in p.b) AS next_pos
    FROM all_pos AS ap
    INNER JOIN b AS p
        ON ap.n = SUBSTR(p.b, ap.pos, 1)::int
    GROUP BY p.rn, p.b
    UNION ALL
    -- Step: try every digit at next_pos whose row, column and box bits are
    -- all still clear. Because the bits are clear, + acts as bitwise OR.
    SELECT
        t.rn,
        SUBSTR(t.s, 1, t.next_pos - 1) || cand.n || SUBSTR(t.s, t.next_pos + 1),
        t.rs_h + cand.r_h,
        t.cs_h + cand.c_h,
        t.gs_h + cand.g_h,
        t.rs_l + cand.r_l,
        t.cs_l + cand.c_l,
        t.gs_l + cand.g_l,
        -- Locate the next '0' after the cell just filled; 0 means the board is full.
        CASE
            WHEN position('0' in SUBSTR(t.s, t.next_pos + 1)) > 0
                THEN position('0' in SUBSTR(t.s, t.next_pos + 1)) + t.next_pos
            ELSE 0
        END
    FROM t
    INNER JOIN all_pos AS cand
        ON cand.pos = t.next_pos
    WHERE (t.rs_h & cand.r_h) = 0
      AND (t.cs_h & cand.c_h) = 0
      AND (t.gs_h & cand.g_h) = 0
      AND (t.rs_l & cand.r_l) = 0
      AND (t.cs_l & cand.c_l) = 0
      AND (t.gs_l & cand.g_l) = 0
)
-- rev_result: one output row per board in b, with puzzle and solution
-- restored to their original orientation and re-broken into lines of nine
-- characters. The id comes from b (the preserved side of the LEFT JOIN) so a
-- puzzle without a solution keeps its id and gets a NULL result.
,rev_result AS (
    SELECT
        b.rn AS id,
        rtrim(regexp_replace(CASE b.flag WHEN 0 THEN b.b ELSE reverse(b.b) END, '(.{9})', '\1' || chr(10), 'g'), chr(10)) AS puzzle,
        rtrim(regexp_replace(CASE b.flag WHEN 0 THEN solved.s ELSE reverse(solved.s) END, '(.{9})', '\1' || chr(10), 'g'), chr(10)) AS result
    FROM b
    LEFT JOIN (
        SELECT
            ranked.rn,
            ranked.s
        FROM (
            SELECT
                t.rn,
                t.s,
                -- Ordering by s makes the choice deterministic when a puzzle
                -- has more than one completed board.
                row_number() OVER (PARTITION BY t.rn ORDER BY t.s) AS resn
            FROM t
            WHERE t.next_pos = 0
        ) AS ranked
        WHERE ranked.resn = 1
    ) AS solved
        ON solved.rn = b.rn
),
-- initial: starting state for puzzles with fewer than 30 characters other
-- than '?' and line feed.
--   board      flattened puzzle string ('?' rewritten as '0', line feeds removed)
--   rows/cols/boxes
--              9-element int arrays; bit (d - 1) of element k is set when
--              digit d already appears in row / column / box k
--   positions  positions (1..81) of the cells that are still '0'
-- The element order of every array is fixed inside array_agg itself rather
-- than inherited from a sorted subquery.
initial AS (
    SELECT
        src.id,
        src.puzzle AS board,
        -- Row masks: row index 0..8, cell offset 1..9 within the row.
        (
            SELECT array_agg(per_row.m ORDER BY per_row.r)
            FROM (
                SELECT
                    rws.r,
                    SUM(CASE WHEN v.val > 0 THEN 1 << (v.val - 1) ELSE 0 END)::int AS m
                FROM generate_series(0, 8) AS rws(r)
                INNER JOIN LATERAL (
                    SELECT SUBSTR(src.puzzle, rws.r * 9 + k.i, 1) AS ch
                    FROM generate_series(1, 9) AS k(i)
                ) AS cell ON true
                CROSS JOIN LATERAL (SELECT (cell.ch::text)::int AS val) AS v
                GROUP BY rws.r
            ) AS per_row
        ) AS rows,
        -- Column masks: column index 1..9, stepping down the rows.
        (
            SELECT array_agg(per_col.m ORDER BY per_col.c)
            FROM (
                SELECT
                    cls.c,
                    SUM(CASE WHEN v.val > 0 THEN 1 << (v.val - 1) ELSE 0 END)::int AS m
                FROM generate_series(1, 9) AS cls(c)
                INNER JOIN LATERAL (
                    SELECT SUBSTR(src.puzzle, (k.i - 1) * 9 + cls.c, 1) AS ch
                    FROM generate_series(1, 9) AS k(i)
                ) AS cell ON true
                CROSS JOIN LATERAL (SELECT (cell.ch::text)::int AS val) AS v
                GROUP BY cls.c
            ) AS per_col
        ) AS cols,
        -- Box masks: box index 0..8, walking the nine cells of each 3x3 box.
        (
            SELECT array_agg(per_box.m ORDER BY per_box.b)
            FROM (
                SELECT
                    bxs.b,
                    SUM(CASE WHEN v.val > 0 THEN 1 << (v.val - 1) ELSE 0 END)::int AS m
                FROM generate_series(0, 8) AS bxs(b)
                INNER JOIN LATERAL (
                    SELECT SUBSTR(src.puzzle, (bxs.b / 3) * 27 + (bxs.b % 3) * 3 + ((k.i - 1) / 3) * 9 + ((k.i - 1) % 3) + 1, 1) AS ch
                    FROM generate_series(1, 9) AS k(i)
                ) AS cell ON true
                CROSS JOIN LATERAL (SELECT (cell.ch::text)::int AS val) AS v
                GROUP BY bxs.b
            ) AS per_box
        ) AS boxes,
        -- Empty cells, in ascending position order.
        (
            SELECT array_agg(p.pos::smallint ORDER BY p.pos)
            FROM generate_series(1, 81) AS p(pos)
            WHERE SUBSTR(src.puzzle, p.pos, 1) = '0'
        ) AS positions
    FROM (
        SELECT
            s9.id,
            replace(replace(s9.puzzle, '?', '0'), chr(10), '') AS puzzle
        FROM sudoku9_9 AS s9
        WHERE length(replace(replace(s9.puzzle, '?', ''), chr(10), '')) < 30
    ) AS src
),
-- solve: level-by-level search over the boards in `initial`.
-- Each iteration takes every board not yet flagged as solved, picks ONE empty
-- cell per board (best_pos) and emits one child board per digit that is still
-- allowed in that cell (next_step).
-- Columns: id, board, rows, cols, boxes (digit bit masks per unit),
--          solved (true once no '0' is left), positions (remaining empty cells).
solve AS (  
    -- Seed: every board from `initial` starts with solved = false.
    SELECT id, board::text board, rows, cols, boxes, false as solved,positions FROM initial  
    UNION ALL  
    (  
        -- Boards produced by the previous iteration that still need work.
        WITH current_level AS (  
            SELECT * FROM solve WHERE NOT solved
        ),  
        -- One row per (board, remaining empty position).
        -- available_mask: the OR of the row, column and box masks is flipped
        -- with XOR 511 and limited to 9 bits, so bit (d - 1) is set when
        -- digit d is in none of the three masks.
        all_candidates AS (  
            SELECT id, 
                cl.board, cl.rows, cl.cols, cl.boxes,  
                pos,  positions, 
                -- Key fix: cast every array element to ::int and parenthesize the bitwise operations
                ( ( (cl.rows[(pos-1)/9 + 1]::int | cl.cols[(pos-1)%9 + 1]::int | cl.boxes[((pos-1)/27*3 + (pos-1)%9/3) + 1]::int) # 511 ) & 511 )::int as available_mask  
            FROM (select *,unnest(positions) pos from current_level) cl  
        ),  
        -- Keep a single position per (id, board): fewest available digits
        -- first, ties broken by the number of bits set in that position's row
        -- mask (ascending, the default sort direction).
        best_pos AS (  
          select * from(    
            SELECT    
                *,  
                row_number()over(partition by id, board order by bit_count(available_mask::bit(9)) ASC  , bit_count(rows[(pos-1)/9 + 1]::int::bit(9)))rn
            FROM all_candidates
            )q
          where rn=1    
        ),  
        -- Expand the chosen position: write digit n.val into the board string,
        -- set its bit in the matching row / column / box array element, and
        -- drop the position from the list of remaining empty cells.
        next_step AS (  
            SELECT   id, 
                SUBSTR(bp.board, 1, bp.pos - 1) || n.val || SUBSTR(bp.board, bp.pos + 1) as next_board,  
                bp.rows[:r_idx-1] || ((bp.rows[r_idx]::int | (1 << (n.val-1)))::int) || bp.rows[r_idx+1:] as next_rows,  
                bp.cols[:c_idx-1] || ((bp.cols[c_idx]::int | (1 << (n.val-1)))::int) || bp.cols[c_idx+1:] as next_cols,  
                bp.boxes[:b_idx-1] || ((bp.boxes[b_idx]::int | (1 << (n.val-1)))::int) || bp.boxes[b_idx+1:] as next_boxes  
                , array_remove(positions, pos)rem_pos
            FROM best_pos bp  
            -- Candidate digits run from the lowest set bit of available_mask
            -- (first '1' in the reversed bit text) to the highest set bit
            -- (10 minus the position of the first '1' from the left); the
            -- WHERE clause below removes unset bits in between.
            -- With an all-zero mask both position() calls return 0, the series
            -- is 0..10, and the WHERE clause rejects every value, so a dead-end
            -- board produces no children.
            CROSS JOIN LATERAL(select * from generate_series(position('1' in reverse(bp.available_mask::int::bit(9)::text)), 10-position(b'1' in bp.available_mask::bit(9))) n(val)  )n
            -- 1-based array indexes of the row, column and box containing pos.
            CROSS JOIN LATERAL (  
                SELECT ((pos-1)/9) + 1 as r_idx,   
                       ((pos-1)%9) + 1 as c_idx,   
                       ((pos-1)/27*3 + (pos-1)%9/3) + 1 as b_idx  
            ) idx  
            -- Explicitly treat bp.available_mask as int and test the bit with &
            WHERE ((bp.available_mask::int >> (n.val-1)) & 1) = 1  
        )  
        -- A child board is flagged solved when it no longer contains a '0'.
        SELECT   id, 
            next_board, next_rows, next_cols, next_boxes,  
            position('0' IN next_board) = 0  , rem_pos
        FROM next_step  
    )  
)
-- Final output (id, puzzle, result).
-- First branch: boards handled by `solve`, with one solved board kept per id;
-- the puzzle is re-broken into lines of nine characters and '0' is shown as
-- '?' again. Second branch: boards handled by `t`, already formatted in
-- rev_result apart from the '0' -> '?' substitution.
SELECT
    init.id,
    rtrim(replace(regexp_replace(init.board, '(.{9})', '\1' || chr(10), 'g'), '0', '?'), chr(10)) AS puzzle,
    rtrim(regexp_replace(first_solution.board, '(.{9})', '\1' || chr(10), 'g'), chr(10)) AS result
FROM initial AS init
LEFT JOIN (
    SELECT
        ranked.id,
        ranked.board
    FROM (
        SELECT
            solve.id,
            row_number() OVER (PARTITION BY solve.id) AS rn,
            solve.board
        FROM solve
        WHERE solve.solved
    ) AS ranked
    WHERE ranked.rn = 1
) AS first_solution
    ON init.id = first_solution.id
UNION ALL
SELECT
    rev_result.id,
    replace(rev_result.puzzle, '0', '?'),
    rev_result.result
FROM rev_result
;

思路说明

对于一句SQL解决数独问题,由于语言的限制,递归只支持BFS,无法实现回溯,基本上只能采用穷举法。

10年前itpub版主newkid就已经提供了利用二进制位高效判断可选数的Oracle程序。我把二进制掩码由number(38)改成高位和低位两部分的bigint版本,postgresql也能用。

之所以选择postgresql而不是oracle平台,测试用postgresql解决17位数独比oracle快2倍,解决示例1000题快10倍。

高效的原因一是预计算了所有可能位置的可选数字的二进制数,避免重复计算坐标(有大量取模和除法操作)和掩码。二是用整数保存掩码,直接整体二进制底层操作,避免数组操作开销。

在本届大赛公布的次日,postgresql大神德哥发表了他用AI完成的按最小可选数量(MRV)动态选点的程序。

我对他们的程序做了以下优化:

newkid顺序选点高度依赖已知数的位置,已知数在前几行越密集,产生的候选解空间就越小,如果已知数在前后分布不一致,用reverse字符串翻转将它处理后再翻转可以提速1倍,从2.2秒到1.1秒。(AMD 8845H 16G WSL pg 17.7)

还试过行列互换、旋转、三组整体移位等变换,效果不明显。

德哥原版在pg 17.2触发了bug,把left join改为inner join规避。

原版的候选点从81个位置中选,改为从动态更新的剩余位置positions中选unnest(positions) ,速度提升了30%。这是德哥第二版。

现在手中有两个高效的版本:一个是优化newkid的,顺序选点+翻转,在处理简单(已知数大于或等于30)问题时比德哥版本快1倍;另一个是优化德哥的,在处理特别难(已知数等于17)问题时比前一个版本快几十倍。而在处理中等(已知数29到30)难题时,两者用时接近。

怎么在一个程序中实现对两种难度的题目分治?

试过直接改造德哥版本:简单题用顺序选点策略(未加入翻转)代替计算最小可选数量。用它解决1000个示例,尝试不同的已知数阈值,当把难题阈值设为35时结果最佳,约2秒,比德哥第二版再提升20%。这是德哥第三版。

最后用30个已知数为分界点,缝合两个大神版本的效果最好。结果1秒。

其实仅翻转和缝合版本在处理示例数据时差距微小,加入德哥版本的考虑主要是正式比赛数据可能加入更难的题目,组委会说明有最高55个未知数的题,这样德哥版本的优势就体现了。

再采用以下一些优化技巧,最终将示例数据的处理时间降到860毫秒。

在以c_count(可选数个数)作为第一关键字的基础上,如果排名相同,使用不同的第二关键字会影响速度:不设第二关键字时结果不稳定;设为pos时结果稳定,但并非总是最优。

引入计算较简单,开销不大的greatest(pos前密度,pos后密度)作排序第二关键字,优先选出数字密度更大处的选点。密度=数字个数*100/字符串长度。

用一个known跟踪变量记录已知数数量,当known达到某个阈值(比如35)后改用pos做第二关键字。

用bit_count位操作9-bit_count(not_available_mask::bit(9))as c_count代替字符串去0求长度 replace((not_available_mask::bit(9))::text, '0', '')来求1的个数,快10%

再用 ORDER BY id, board, c_count ASC, bit_count(rows[(pos-1)/9 + 1]::int::bit(9)),即以所在行掩码的bit_count代替pos作第二关键字,用局部密度最大代替一侧密度最大,提高20%。

用带上下限的CROSS JOIN LATERAL(select * from generate_series(position('1' in reverse(bp.available_mask::int::bit(9)::text)), 10-position(b'1' in bp.available_mask::bit(9))) n(val) )n

替换CROSS JOIN generate_series(1, 9) n(val) ,

把原始distinct on 去重换成 row_number partition,同时去掉不再使用的known跟踪变量和greatest运算,提升10%。

相关推荐
NAGNIP5 小时前
一文搞懂深度学习中的通用逼近定理!
人工智能·算法·面试
冬奇Lab6 小时前
一天一个开源项目(第36篇):EverMemOS - 跨 LLM 与平台的长时记忆 OS,让 Agent 会记忆更会推理
人工智能·开源·资讯
冬奇Lab6 小时前
OpenClaw 源码深度解析(一):Gateway——为什么需要一个"中枢"
人工智能·开源·源码阅读
AngelPP10 小时前
OpenClaw 架构深度解析:如何把 AI 助手搬到你的个人设备上
人工智能
宅小年10 小时前
Claude Code 换成了Kimi K2.5后,我再也回不去了
人工智能·ai编程·claude
九狼10 小时前
Flutter URL Scheme 跨平台跳转
人工智能·flutter·github
ZFSS10 小时前
Kimi Chat Completion API 申请及使用
前端·人工智能
天翼云开发者社区11 小时前
春节复工福利就位!天翼云息壤2500万Tokens免费送,全品类大模型一键畅玩!
人工智能·算力服务·息壤
知识浅谈11 小时前
教你如何用 Gemini 将课本图片一键转为精美 PPT
人工智能
Ray Liang12 小时前
被低估的量化版模型,小身材也能干大事
人工智能·ai·ai助手·mindx