NineData第三届数据库编程大赛:用一条 SQL 解数独问题——我的参赛程序

初赛已结束,没进决赛。

sql 复制代码
WITH RECURSIVE
-- a: puzzles that keep at least 30 characters once '?' and newlines are
-- removed. Each puzzle is flattened to one string: newlines are stripped
-- and every '?' placeholder is rewritten as '0'.
a AS (
    SELECT
        id AS rn,
        replace(replace(puzzle, chr(10), ''), '?', '0') AS b
    FROM sudoku9_9
    WHERE length(replace(replace(puzzle, '?', ''), chr(10), '')) >= 30
),
-- b: orientation step for the sequential solver.
-- flag = 0 when characters 1-40 contain more non-'0' characters than
-- characters 42-81; the string is then kept as is. Otherwise flag = 1 and
-- the string is reversed. flag is carried along so the orientation can be
-- undone when results are printed.
b AS (
    SELECT
        scored.rn,
        CASE WHEN scored.flag = 0 THEN scored.b ELSE reverse(scored.b) END AS b,
        scored.flag
    FROM (
        SELECT
            a.rn,
            a.b,
            CASE
                WHEN length(replace(substr(a.b, 1, 40), '0', ''))
                   > length(replace(substr(a.b, 42, 40), '0', ''))
                    THEN 0
                ELSE 1
            END AS flag
        FROM a
    ) AS scored
),
-- d: the integers 1..81, produced by a recursive counter.
d(lp) AS (
    SELECT 1
    UNION ALL
    SELECT prev.lp + 1
    FROM d AS prev
    WHERE prev.lp < 81
),
-- grid: for every position 1..81, its zero-based row (r), column (c) and
-- 3x3 box (g) index, all derived with integer division and modulo.
grid AS (
    SELECT
        d.lp AS pos,
        (d.lp - 1) / 9 AS r,
        (d.lp - 1) % 9 AS c,
        ((d.lp - 1) / 9 / 3) * 3 + ((d.lp - 1) % 9) / 3 AS g
    FROM d
)
,all_pos AS (
    -- One row per (position, digit) pair with precomputed single-bit masks.
    -- The bit index of a (unit, digit) pair is unit * 9 + digit - 1.
    -- Indexes above 42 are stored in the *_h column, shifted down by 42;
    -- all other indexes are stored in the *_l column. The unused half of
    -- each pair is 0.
    SELECT
        grid.pos,
        digit.n,
        CASE WHEN bits.r_bit > 42 THEN 1::bigint << (bits.r_bit - 42) ELSE 0 END AS r_h,
        CASE WHEN bits.c_bit > 42 THEN 1::bigint << (bits.c_bit - 42) ELSE 0 END AS c_h,
        CASE WHEN bits.g_bit > 42 THEN 1::bigint << (bits.g_bit - 42) ELSE 0 END AS g_h,
        CASE WHEN bits.r_bit > 42 THEN 0 ELSE 1::bigint << bits.r_bit END AS r_l,
        CASE WHEN bits.c_bit > 42 THEN 0 ELSE 1::bigint << bits.c_bit END AS c_l,
        CASE WHEN bits.g_bit > 42 THEN 0 ELSE 1::bigint << bits.g_bit END AS g_l
    FROM grid
    CROSS JOIN generate_series(1, 9) AS digit(n)
    CROSS JOIN LATERAL (
        SELECT
            grid.r * 9 + digit.n - 1 AS r_bit,
            grid.c * 9 + digit.n - 1 AS c_bit,
            grid.g * 9 + digit.n - 1 AS g_bit
    ) AS bits
)
,t(rn, s,rs_h,cs_h,gs_h,rs_l,cs_l,gs_l,next_pos) AS (
    -- Sequential solver over the oriented puzzles in b.
    --   rn        puzzle id
    --   s         current board string
    --   rs_*/cs_*/gs_*  row / column / box occupancy masks (high and low
    --             words, as laid out by all_pos)
    --   next_pos  position of the next '0' in s, or 0 when none is left
    --
    -- Seed: sum the single-bit masks of every given digit of each puzzle.
    -- Each aliased name below matches the CTE column list; the low-word row
    -- mask was previously mislabelled rs_h.
    SELECT
        b.rn,
        CAST(b.b AS text),
        SUM(all_pos.r_h)::bigint AS rs_h,
        SUM(all_pos.c_h)::bigint AS cs_h,
        SUM(all_pos.g_h)::bigint AS gs_h,
        SUM(all_pos.r_l)::bigint AS rs_l,
        SUM(all_pos.c_l)::bigint AS cs_l,
        SUM(all_pos.g_l)::bigint AS gs_l,
        position('0' IN b.b)
    FROM all_pos
    INNER JOIN b
        ON all_pos.n = SUBSTR(b.b, all_pos.pos, 1)::int
    GROUP BY b.rn, b.b
    UNION ALL
    -- Step: write every digit whose row, column and box bits are all still
    -- clear into position next_pos, then advance next_pos to the following
    -- '0' (0 when the board has no '0' left).
    SELECT
        t.rn,
        SUBSTR(t.s, 1, t.next_pos - 1) || ap.n || SUBSTR(t.s, t.next_pos + 1),
        t.rs_h + ap.r_h,
        t.cs_h + ap.c_h,
        t.gs_h + ap.g_h,
        t.rs_l + ap.r_l,
        t.cs_l + ap.c_l,
        t.gs_l + ap.g_l,
        CASE
            WHEN position('0' IN SUBSTR(t.s, t.next_pos + 1)) > 0
                THEN position('0' IN SUBSTR(t.s, t.next_pos + 1)) + t.next_pos
            ELSE 0
        END
    FROM t
    INNER JOIN all_pos AS ap
        ON t.next_pos = ap.pos
    WHERE (t.rs_h & ap.r_h) = 0
      AND (t.cs_h & ap.c_h) = 0
      AND (t.gs_h & ap.g_h) = 0
      AND (t.rs_l & ap.r_l) = 0
      AND (t.cs_l & ap.c_l) = 0
      AND (t.gs_l & ap.g_l) = 0
)
,rev_result AS (
    -- Output rows for the sequential solver: one row per puzzle in b, with
    -- the original orientation restored (flag <> 0 means the board was
    -- reversed in b) and a newline inserted after every 9 characters.
    --
    -- id is taken from b, the preserved side of the LEFT JOIN, so a puzzle
    -- for which t produced no finished board still carries its id (result
    -- is NULL in that case). Taking it from the joined side would yield a
    -- NULL id for such rows.
    SELECT
        b.rn AS id,
        rtrim(
            regexp_replace(
                CASE b.flag WHEN 0 THEN b.b ELSE reverse(b.b) END,
                '(.{9})', '\1' || chr(10), 'g'
            ),
            chr(10)
        ) AS puzzle,
        rtrim(
            regexp_replace(
                CASE b.flag WHEN 0 THEN solved.s ELSE reverse(solved.s) END,
                '(.{9})', '\1' || chr(10), 'g'
            ),
            chr(10)
        ) AS result
    FROM b
    LEFT JOIN (
        -- Keep a single finished board (next_pos = 0) per puzzle.
        SELECT
            ranked.rn,
            ranked.s
        FROM (
            SELECT
                t.rn,
                t.s,
                row_number() OVER (PARTITION BY t.rn ORDER BY t.rn) AS resn
            FROM t
            WHERE t.next_pos = 0
        ) AS ranked
        WHERE ranked.resn = 1
    ) AS solved
        ON solved.rn = b.rn
),
initial AS (
    -- Starting state for the mask-array solver. The source rows are the
    -- puzzles that keep fewer than 30 characters once '?' and newlines are
    -- removed, flattened with '?' rewritten as '0'.
    SELECT
        id,
        puzzle AS board,
        -- Row masks: for each row 0..8, the sum of 1 << (digit - 1) over its
        -- non-zero cells, cast to int and collected into an array.
        (
            SELECT array_agg(m)
            FROM (
                SELECT SUM(CASE WHEN val > 0 THEN 1 << (val - 1) ELSE 0 END)::int AS m
                FROM generate_series(0, 8) AS row_no
                CROSS JOIN LATERAL (
                    SELECT SUBSTR(puzzle, row_no * 9 + i, 1) AS ch
                    FROM generate_series(1, 9) AS i
                ) AS cell
                CROSS JOIN LATERAL (SELECT (ch::text)::int AS val) AS digit
                GROUP BY row_no
                ORDER BY row_no
            ) AS row_masks
        ) AS rows,
        -- Column masks: same encoding, one entry per column 1..9.
        (
            SELECT array_agg(m)
            FROM (
                SELECT SUM(CASE WHEN val > 0 THEN 1 << (val - 1) ELSE 0 END)::int AS m
                FROM generate_series(1, 9) AS col_no
                CROSS JOIN LATERAL (
                    SELECT SUBSTR(puzzle, (i - 1) * 9 + col_no, 1) AS ch
                    FROM generate_series(1, 9) AS i
                ) AS cell
                CROSS JOIN LATERAL (SELECT (ch::text)::int AS val) AS digit
                GROUP BY col_no
                ORDER BY col_no
            ) AS col_masks
        ) AS cols,
        -- Box masks: same encoding, one entry per 3x3 box 0..8.
        (
            SELECT array_agg(m)
            FROM (
                SELECT SUM(CASE WHEN val > 0 THEN 1 << (val - 1) ELSE 0 END)::int AS m
                FROM generate_series(0, 8) AS box_no
                CROSS JOIN LATERAL (
                    SELECT
                        SUBSTR(
                            puzzle,
                            (box_no / 3) * 27 + (box_no % 3) * 3 + ((i - 1) / 3) * 9 + ((i - 1) % 3) + 1,
                            1
                        ) AS ch
                    FROM generate_series(1, 9) AS i
                ) AS cell
                CROSS JOIN LATERAL (SELECT (ch::text)::int AS val) AS digit
                GROUP BY box_no
                ORDER BY box_no
            ) AS box_masks
        ) AS boxes,
        -- Positions (1..81) of the cells that still hold '0'.
        (
            SELECT array_agg(pos::smallint)
            FROM generate_series(1, 81) AS p(pos)
            WHERE SUBSTR(puzzle, p.pos, 1) = '0'
        ) AS positions
    FROM (
        SELECT
            id,
            replace(replace(puzzle, '?', '0'), chr(10), '') AS puzzle
        FROM sudoku9_9
        WHERE length(replace(replace(puzzle, '?', ''), chr(10), '')) < 30
    ) AS src
),
-- solve: level-by-level search over the boards in initial. Each recursion
-- step takes every unsolved board, picks one empty position for it, and
-- emits one child board per digit still allowed at that position.
-- Columns: id, board, rows/cols/boxes (per-unit digit masks), solved,
-- positions (remaining empty positions).
solve AS (  
    -- Seed: every row of initial, marked as not solved.
    SELECT id, board::text board, rows, cols, boxes, false as solved,positions FROM initial  
    UNION ALL  
    (  
        -- Boards produced by the previous step that still need work.
        WITH current_level AS (  
            SELECT * FROM solve WHERE NOT solved
        ),  
        -- One row per (board, empty position). available_mask has bit
        -- (digit - 1) set for every digit absent from the position's row,
        -- column and box masks, limited to the low 9 bits.
        all_candidates AS (  
            SELECT id, 
                cl.board, cl.rows, cl.cols, cl.boxes,  
                pos,  positions, 
                -- Key fix: cast every array element to ::int and parenthesize the bitwise operations
                ( ( (cl.rows[(pos-1)/9 + 1]::int | cl.cols[(pos-1)%9 + 1]::int | cl.boxes[((pos-1)/27*3 + (pos-1)%9/3) + 1]::int) # 511 ) & 511 )::int as available_mask  
            FROM (select *,unnest(positions) pos from current_level) cl  
        ),  
        -- Keep one position per (id, board): fewest set bits in
        -- available_mask first, then fewest set bits in the position's row
        -- mask. Remaining ties are broken arbitrarily by row_number().
        best_pos AS (  
          select * from(    
            SELECT    
                *,  
                row_number()over(partition by id, board order by bit_count(available_mask::bit(9)) ASC  , bit_count(rows[(pos-1)/9 + 1]::int::bit(9)))rn
            FROM all_candidates
            )q
          where rn=1    
        ),  
        -- For the chosen position, build one child per allowed digit: write
        -- the digit into the board, set its bit in the row/column/box mask
        -- entries, and drop the position from the remaining list.
        next_step AS (  
            SELECT   id, 
                SUBSTR(bp.board, 1, bp.pos - 1) || n.val || SUBSTR(bp.board, bp.pos + 1) as next_board,  
                bp.rows[:r_idx-1] || ((bp.rows[r_idx]::int | (1 << (n.val-1)))::int) || bp.rows[r_idx+1:] as next_rows,  
                bp.cols[:c_idx-1] || ((bp.cols[c_idx]::int | (1 << (n.val-1)))::int) || bp.cols[c_idx+1:] as next_cols,  
                bp.boxes[:b_idx-1] || ((bp.boxes[b_idx]::int | (1 << (n.val-1)))::int) || bp.boxes[b_idx+1:] as next_boxes  
                , array_remove(positions, pos)rem_pos
            FROM best_pos bp  
            -- Candidate digits run from the lowest set bit of the mask (first
            -- '1' in the reversed bit string) to the highest set bit (10 minus
            -- the position of the first '1' from the left).
            -- NOTE(review): when the mask is 0 both position() calls return 0,
            -- so the range is 0..10 and rejection relies solely on the WHERE
            -- bit test below; for val = 0 that test shifts by -1 -- confirm
            -- this is well-defined on the target platform.
            CROSS JOIN LATERAL(select * from generate_series(position('1' in reverse(bp.available_mask::int::bit(9)::text)), 10-position(b'1' in bp.available_mask::bit(9))) n(val)  )n
            -- 1-based indexes of the position's row, column and box in the
            -- mask arrays.
            CROSS JOIN LATERAL (  
                SELECT ((pos-1)/9) + 1 as r_idx,   
                       ((pos-1)%9) + 1 as c_idx,   
                       ((pos-1)/27*3 + (pos-1)%9/3) + 1 as b_idx  
            ) idx  
            -- Explicitly treat bp.available_mask as int and test the bit with &
            WHERE ((bp.available_mask::int >> (n.val-1)) & 1) = 1  
        )  
        -- A child is solved when its board no longer contains '0'.
        SELECT   id, 
            next_board, next_rows, next_cols, next_boxes,  
            position('0' IN next_board) = 0  , rem_pos
        FROM next_step  
    )  
)
-- Final output: one row per puzzle from each solver.
-- First branch: puzzles handled by solve, printed with '0' turned back into
-- '?' and a newline after every 9 characters; result is NULL when solve
-- produced no solved board for the puzzle.
SELECT
    i.id,
    rtrim(
        replace(
            regexp_replace(i.board, '(.{9})', '\1' || chr(10), 'g'),
            '0', '?'
        ),
        chr(10)
    ) AS puzzle,
    rtrim(
        regexp_replace(v.board, '(.{9})', '\1' || chr(10), 'g'),
        chr(10)
    ) AS result
FROM initial AS i
LEFT JOIN (
    -- Keep a single solved board per puzzle.
    SELECT
        ranked.id,
        ranked.board
    FROM (
        SELECT
            id,
            row_number() OVER (PARTITION BY id) AS rn,
            board
        FROM solve
        WHERE solved
    ) AS ranked
    WHERE ranked.rn = 1
) AS v
    ON i.id = v.id
UNION ALL
-- Second branch: puzzles handled by the sequential solver.
SELECT
    id,
    replace(puzzle, '0', '?'),
    result
FROM rev_result
;

思路说明

对于一句SQL解决数独问题,由于语言的限制,递归只支持BFS,无法实现回溯,基本上只能采用穷举法。

10年前itpub版主newkid就已经提供了利用二进制位高效判断可选数的Oracle程序。我把二进制掩码由number(38)改成高位和低位两部分的bigint版本,postgresql也能用。

之所以选择postgresql而不是oracle平台,是因为测试发现:用postgresql解决17位数独比oracle快2倍,解决1000道示例题快10倍。

高效的原因一是预计算了所有可能位置的可选数字的二进制数,避免重复计算坐标(有大量取模和除法操作)和掩码。二是用整数保存掩码,直接整体二进制底层操作,避免数组操作开销。

在本届大赛公布的次日,postgresql大神德哥发表了他用AI完成的按最小可选数量(MRV)动态选点的程序。

我对他们的程序做了以下优化:

newkid顺序选点高度依赖已知数的位置,已知数在前几行越密集,产生的候选解空间就越小,如果已知数在前后分布不一致,用reverse字符串翻转将它处理后再翻转可以提速1倍,从2.2秒到1.1秒。(AMD 8845H 16G WSL pg 17.7)

还试过行列互换、旋转、三组整体移位等变换,效果不明显。

德哥原版在pg 17.2触发了bug,把left join改为inner join规避。

原版的候选点从81个位置中选,改为从动态更新的剩余位置positions中选unnest(positions) ,速度提升了30%。这是德哥第二版。

现在手中有两个高效的版本:一个是优化newkid的,顺序选点+翻转,在处理简单(已知数大于或等于30)问题时比德哥版本快1倍;另一个是优化德哥的,在处理特别难(已知数等于17)问题时比前面版本快几十倍,而在处理中等(已知数29到30)难题时,两者用时接近。

怎么在一个程序中实现对两种难度的题目分治?

试过直接改造德哥版本:简单题用顺序选点策略(未加入翻转)代替计算最小可选数量。解决1000个示例时尝试了不同的已知数阈值,当把难题阈值设为35时结果最佳,约2秒,比德哥第二版再提升20%。这是德哥第三版。

最后用30个已知数为分界点,缝合两个大神版本的效果最好。结果1秒。

其实仅翻转和缝合版本在处理示例数据时差距微小,加入德哥版本的考虑主要是正式比赛数据可能加入更难的题目,组委会说明有最高55个未知数的题,这样德哥版本的优势就体现了。

采用以下一些优化技巧后,最终将示例数据的处理时间缩短到860毫秒。

在以c_count为第一关键字的基础上,如果排名相同,使用不同的第二关键字会影响速度:不区分时结果不稳定;设为pos时结果稳定,但并非总是最优。

引入计算较简单,开销不大的greatest(pos前密度,pos后密度)作排序第二关键字,优先选出数字密度更大处的选点。密度=数字个数*100/字符串长度。

用一个known跟踪变量记录已知数数量,当known达到某个阈值(比如35)后改用pos做第二关键字。

用bit_count位操作9-bit_count(not_available_mask::bit(9))as c_count代替字符串去0求长度 replace((not_available_mask::bit(9))::text, '0', '')来求1的个数,快10%

再用 ORDER BY id, board, c_count ASC, bit_count(rows[(pos-1)/9 + 1]::int::bit(9)),即用bit_count代替pos作第二关键字,用局部密度最大代替一侧密度最大,速度提高20%。

用带上下限的CROSS JOIN LATERAL(select * from generate_series(position('1' in reverse(bp.available_mask::int::bit(9)::text)), 10-position(b'1' in bp.available_mask::bit(9))) n(val) )n

替换CROSS JOIN generate_series(1, 9) n(val) ,

把原始distinct on 去重换成 row_number partition,同时去掉不再使用的known跟踪变量和greatest运算,提升10%。

相关推荐
Evand J15 小时前
【MATLAB代码介绍】基于累积概率的三维轨迹匹配与定位,由轨迹匹配和卡尔曼滤波形成算法,带测试结果演示
算法·matlab·滤波·定位·导航·轨迹匹配
土豆.exe15 小时前
若爱 (IfAI) v0.2.6 - 智能体进化:任务拆解与环境感知
人工智能
千金裘换酒15 小时前
LeetCode 链表两数相加
算法·leetcode·链表
NAGNIP15 小时前
一文搞懂机器学习中的优化方法!
算法
colfree15 小时前
Scanpy
人工智能·机器学习
Sammyyyyy15 小时前
Rust 1.92.0 发布:Never Type 进一步稳定
java·算法·rust
Akamai中国15 小时前
基准测试:Akamai云上的NVIDIA RTX Pro 6000 Blackwell
人工智能·云计算·云服务·云存储
alonewolf_9915 小时前
深入解析G1与ZGC垃圾收集器:原理、调优与选型指南
java·jvm·算法