前文有一个高效的sql,好奇能否把它改写成python程序,让DeepSeek改写,告诉他:
把如下sql精确翻译成python,用列表代替表,注意这是递归CTE。
sql
-- n: the integers 1..5, generated with Oracle's CONNECT BY LEVEL.
with n as (
select level n from dual connect by level<=5
)
-- A: cross join of n with itself, one row per (x, y) pair.
-- Each row gets its own power of two in "bit" (2^(rownum-1)), so a set of
-- rows can be represented as the sum of their bits.
,A as (select n1.n x, n2.n y, power(2,rownum-1) bit from n n1,n n2)
-- C: for every row a1 of A, bit2 is the sum of the bits of all rows a2 with
-- (a1.x-a2.x)*(a1.y-a2.y)<=0. a1 itself always qualifies (the product is 0).
,C as (
select a1.bit,a1.x,a1.y
,sum(a2.bit) bit2
from a a1,a a2
where (a1.x-a2.x)*(a1.y-a2.y)<=0
group by a1.bit,a1.x,a1.y
)
-- b: recursive CTE. cnt = number of rows combined so far, bit = sum of their
-- bits, bit2 = bitwise AND of their bit2 masks.
,b(cnt,bit,bit2) as (
-- Anchor member: every row of C on its own.
select 1,c.bit,c.bit2
from c
union all
-- Recursive member: extend a row of b with a row of C whose bit is larger
-- than b.bit and is present in b.bit2.
select b.cnt+1,b.bit+c.bit,bitand(b.bit2,c.bit2)
from b,c
where c.bit>b.bit and bitand(b.bit2,c.bit)>0
)
-- Final query: keep the rows of b with the highest cnt (rank 1 by cnt desc),
-- join back to A on the bits set in r.bit, and concatenate the matching
-- points as '(x,y)' text, one output row per r.bit.
select listagg('('||a.x||','||a.y||')') within group(order by r.bit)
from (select * from (
select b.*,rank() over(order by cnt desc) rnk from b
)
where rnk=1
) r join a on bitand(r.bit,a.bit)>0
group by r.bit;
他翻译的结果如下:
python
def main(size=5):
    """Find the largest point sets on a ``size`` x ``size`` grid and print a summary.

    This is a step-by-step Python rendering of the recursive-CTE query, with
    lists of dicts standing in for the tables n, A, C and b. Two points are
    compatible when ``(x1 - x2) * (y1 - y2) <= 0``. The search grows sets of
    pairwise-compatible points one point at a time, then reports the size of
    the largest sets and how many of them exist.

    Args:
        size: Side length of the grid; coordinates run from 1 to ``size``.
            Defaults to 5, the value hard-coded in the SQL.

    Returns:
        None. Two summary lines are printed; nothing is printed when the
        grid is empty (``size`` < 1).
    """
    # with n as (select level n from dual connect by level<=5)
    n_list = list(range(1, size + 1))  # level starts at 1

    # A as (select n1.n x, n2.n y, power(2,rownum-1) bit from n n1,n n2)
    A = []
    bit_value = 1
    for x in n_list:
        for y in n_list:
            A.append({'x': x, 'y': y, 'bit': bit_value})
            bit_value *= 2

    # C as (select a1.bit,a1.x,a1.y, sum(a2.bit) bit2 from a a1,a a2
    #       where (a1.x-a2.x)*(a1.y-a2.y)<=0 group by a1.bit,a1.x,a1.y)
    C = []
    for a1 in A:
        bit2_sum = 0
        for a2 in A:
            if (a1['x'] - a2['x']) * (a1['y'] - a2['y']) <= 0:
                bit2_sum += a2['bit']
        C.append({
            'bit': a1['bit'],
            'x': a1['x'],
            'y': a1['y'],
            'bit2': bit2_sum,
        })

    # Recursive CTE b(cnt,bit,bit2)
    b = []
    # Anchor member: select 1,c.bit,c.bit2 from c
    for c_row in C:
        b.append({
            'cnt': 1,
            'bit': c_row['bit'],
            'bit2': c_row['bit2'],
        })

    # Recursive member: emulate the recursive CTE.
    # NOTE: every pass re-scans the whole of b (not only the rows added by
    # the previous pass) and relies on the duplicate check below to
    # terminate. A recursive CTE only sees the previous level, so this is
    # much slower than the SQL. The listing is kept in this form on purpose:
    # the text that follows it measures and removes these two costs.
    changed = True
    while changed:
        changed = False
        new_rows = []
        for b_row in b:
            for c_row in C:
                # where c.bit > b.bit and bitand(b.bit2, c.bit) > 0
                if (c_row['bit'] > b_row['bit'] and
                        (b_row['bit2'] & c_row['bit']) > 0):
                    new_row = {
                        'cnt': b_row['cnt'] + 1,
                        'bit': b_row['bit'] + c_row['bit'],
                        'bit2': b_row['bit2'] & c_row['bit2'],
                    }
                    # Skip the row if the same bit combination already exists.
                    if not any(existing['bit'] == new_row['bit']
                               for existing in b + new_rows):
                        new_rows.append(new_row)
                        changed = True
        b.extend(new_rows)

    # Rows with the highest cnt:
    # select * from (select b.*,rank() over(order by cnt desc) rnk from b)
    # where rnk=1
    if not b:
        return
    max_cnt = max(row['cnt'] for row in b)
    max_rows = [row for row in b if row['cnt'] == max_cnt]

    # Build the coordinate string of every largest set.
    results = []
    for r in max_rows:
        points = []
        for a_row in A:
            # where bitand(r.bit, a.bit) > 0
            if (r['bit'] & a_row['bit']) > 0:
                points.append(f"({a_row['x']},{a_row['y']})")
        # listagg ... within group(order by r.bit): ordering by r.bit is
        # constant inside each group, so the "(x,y)" strings are sorted
        # as text instead.
        points.sort()
        results.append(''.join(points))

    # Print the summary.
    print(f"最大集合大小: {max_cnt}")
    print(f"找到 {len(results)} 个最大集合:")
    # Uncomment to list every set:
    # for i, result in enumerate(results, 1):
    #     print(f"{i}: {result}")


if __name__ == "__main__":
    main()
以上程序的运行结果正确,但速度很慢,5x5个点,用时12秒。
C:\d>timer64 python sql2py.py
最大集合大小: 9
找到 70 个最大集合:
Kernel Time = 0.078 = 0%
User Time = 11.984 = 99%
Process Time = 12.062 = 99% Virtual Memory = 10 MB
Global Time = 12.072 = 100% Physical Memory = 14 MB
查看上述代码,发现他不懂得递归CTE只能看到上一层递归的结果这个道理,每次都对整个b列表遍历,所以将相应遍历代码改为如下
python
# Recursive part: emulate the recursive CTE.
changed = True
last_cnt=0
while changed:
    changed = False
    new_rows = []
    last_cnt+=1
    # Join only the rows whose cnt equals last_cnt with C, instead of all of b.
    temp_rows = [row for row in b if row['cnt'] == last_cnt]
    for b_row in temp_rows:
通过每次循环递增last_cnt,限制只在cnt等于last_cnt的子集中遍历,速度快了4倍,仍然不能令人满意
C:\d>timer64 python sql2py2.py
最大集合大小: 9
找到 70 个最大集合:
Kernel Time = 0.031 = 1%
User Time = 2.703 = 98%
Process Time = 2.734 = 99% Virtual Memory = 10 MB
Global Time = 2.750 = 100% Physical Memory = 14 MB
再看代码,还有一段有误:下面这个去重判断在SQL中并不存在,属于画蛇添足。由于条件c.bit>b.bit保证每个点集只会按bit从小到大的顺序被生成一次,只遍历上一层以后不可能再产生重复行,而这个判断每生成一行都要扫描整个b列表。看来他不会优化程序,但是会添加自己的思想。
python
# Check whether a row with the same bit combination already exists
# (scans all of b plus the rows collected so far in this pass).
if not any(existing['bit'] == new_row['bit'] for existing in b + new_rows):
把上述判断去掉,速度快了上百倍(User Time从2.703秒降到0.015秒)。继续测试6x6、7x7也都很快,下面依次是5x5、6x6、7x7的运行结果。
C:\d>timer64 python sql2py2b.py
最大集合大小: 9
找到 70 个最大集合:
Kernel Time = 0.015 = 36%
User Time = 0.015 = 36%
Process Time = 0.031 = 72% Virtual Memory = 10 MB
Global Time = 0.043 = 100% Physical Memory = 14 MB
C:\d>timer64 python sql2py2b.py
最大集合大小: 11
找到 252 个最大集合:
Kernel Time = 0.015 = 6%
User Time = 0.234 = 90%
Process Time = 0.250 = 96% Virtual Memory = 36 MB
Global Time = 0.258 = 100% Physical Memory = 40 MB
C:\d>timer64 python sql2py2b.py
最大集合大小: 13
找到 924 个最大集合:
Kernel Time = 0.093 = 3%
User Time = 2.968 = 96%
Process Time = 3.062 = 100% Virtual Memory = 317 MB
Global Time = 3.061 = 100% Physical Memory = 321 MB
前面的代码从b列表中反复筛选row['cnt'] == last_cnt的行,比较低效,考虑递归都在后面追加行,可以用列表的索引来处理
python
# 递归部分: 模拟递归CTE
changed = True
last_idx=0
while changed:
changed = False
new_rows = []
cur_idx=len(b)-1
for b_row in b[last_idx:]:
# 保持源代码
last_idx=cur_idx
b.extend(new_rows)
这个改动收益很少,7x7的集合只节省了0.1秒。
C:\d>timer64 python sql2py3b.py
最大集合大小: 13
找到 924 个最大集合:
Kernel Time = 0.093 = 3%
User Time = 2.812 = 96%
Process Time = 2.906 = 99% Virtual Memory = 315 MB
Global Time = 2.913 = 100% Physical Memory = 319 MB
后记:
对于这个特定的需求,递归CTE的前面各层用过一次就可以丢弃,所以可以用新算出的层代替整个b列表,这种改动涉及的代码是最少的,效果相同。
python
# Keep only the newest level in b; an empty level leaves b untouched, so
# after the loop b still holds the last non-empty level.
if new_rows:
    b = new_rows
# Replaces the previous statement: b.extend(new_rows)
还能节省0.05秒,内存也节省一半。
C:\d>timer64 python sql2py4.py
最大集合大小: 13
找到 924 个最大集合:
Kernel Time = 0.078 = 2%
User Time = 2.765 = 96%
Process Time = 2.843 = 99% Virtual Memory = 145 MB
Global Time = 2.863 = 100% Physical Memory = 150 MB
用生成式语句代替循环和判断,
python
# Recursive part: emulate the recursive CTE.
# Each pass joins the current level b with C in a single comprehension and
# then replaces b with the new level; the loop ends when a pass yields no rows.
changed = True
while changed:
    changed = False
    new_rows = [
        {
            'cnt': b_row['cnt'] + 1,
            'bit': b_row['bit'] + c_row['bit'],
            'bit2': b_row['bit2'] & c_row['bit2'],
        }
        for b_row in b
        for c_row in C
        # where c.bit > b.bit and bitand(b.bit2, c.bit) > 0
        if c_row['bit'] > b_row['bit'] and (b_row['bit2'] & c_row['bit']) > 0
    ]
    if new_rows:
        b = new_rows
        changed = True
还能再快0.2秒
C:\d>timer64 python sql2py4b.py
最大集合大小: 13
找到 924 个最大集合:
Kernel Time = 0.140 = 5%
User Time = 2.515 = 94%
Process Time = 2.656 = 99% Virtual Memory = 145 MB
Global Time = 2.672 = 100% Physical Memory = 150 MB