四种python工具包用SQL查询csv和parquet文件的方法比较

chdb 和 duckdb

sql 复制代码

root@kylin-pc:/# python3
Python 3.14.3 (main, Feb 24 2026, 22:48:09) [GCC 14.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import chdb
>>> chdb.sql("select sum(l_quantity) from file('/par/lineitem.parquet')")
1529738036

>>> chdb.sql("select sum(l_quantity) from file('/par/lineitem.csv')")
1529738036

>>> import duckdb
>>> duckdb.sql("select sum(l_quantity) from '/par/lineitem.parquet'")
┌─────────────────┐
│ sum(l_quantity) │
│  decimal(38,2)  │
├─────────────────┤
│   1529738036.00 │
└─────────────────┘

>>> duckdb.sql("select sum(l_quantity) from '/par/lineitem.csv'")
100% ▕██████████████████████████████████████▏ (00:00:06.93 elapsed)     
┌─────────────────┐
│ sum(l_quantity) │
│     double      │
├─────────────────┤
│    1529738036.0 │
└─────────────────┘

polars

sql 复制代码

>>> import polars as pl

>>> df=pl.read_csv("/par/lineitem.csv", separator=",")
>>> ctx = pl.SQLContext(my_table=df, eager=True)
>>> result = ctx.execute("select sum(l_quantity) from my_table")
>>> print(result)
shape: (1, 1)
┌────────────┐
│ l_quantity │
│ ---        │
│ f64        │
╞════════════╡
│ 1.5297e9   │
└────────────┘

>>> df=pl.read_parquet("/par/lineitem.parquet")
>>> ctx = pl.SQLContext(my_table=df, eager=True)
>>> result = ctx.execute("select sum(l_quantity) from my_table")
>>> print(result)
shape: (1, 1)
┌───────────────┐
│ l_quantity    │
│ ---           │
│ decimal[15,2] │
╞═══════════════╡
│ 1529738036.00 │
└───────────────┘

datafusion

sql 复制代码

>>> from datafusion import SessionContext
>>> ctx = SessionContext()


>>> ctx.register_csv("lineitem", "/par/lineitem.csv")
>>> ctx.sql("select sum(l_quantity) from lineitem")
DataFrame()
+--------------------------+
| sum(lineitem.l_quantity) |
+--------------------------+
| 1529738036.0             |
+--------------------------+
>>> ctx.register_parquet("lineitem2", "/par/lineitem.parquet")
>>> ctx.sql("select sum(l_quantity) from lineitem2")
DataFrame()
+---------------------------+
| sum(lineitem2.l_quantity) |
+---------------------------+
| 1529738036.00             |
+---------------------------+

databend

sql 复制代码

>>> import databend
>>> ctx = databend.SessionContext()
>>> ctx.register_csv("lineitem", "/par/lineitem.csv")
Traceback (most recent call last):
  File "<python-input-2>", line 1, in <module>
    ctx.register_csv("lineitem", "/par/lineitem.csv")
    ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: DataFrame collect error: SemanticError. Code: 1065, Text = [QUERY-CTX] Query from CSV file lacks column positions. Specify as $1, $2, etc..

<Backtrace disabled by default. Please use RUST_BACKTRACE=1 to enable> 
>>> ctx.register_parquet("lineitem2", "/par/lineitem.parquet")
>>> ctx.sql("select sum(l_quantity) from lineitem2")
┌─────────────────────┐
│   sum(l_quantity)   │
│ Decimal(18, 2) NULL │
├─────────────────────┤
│ 1529738036.00       │
└─────────────────────┘

databend的register_csv方式据开发人员说准备废弃了。

对1000万行数据group by的性能比较

root@kylin-pc:/# time python /par/test_duck2.py
┌───────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
│ a │ b0 │ b1 │ b2 │ b3 │ b4 │ b5 │ b6 │
│ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │
├───────┼────────┼────────┼────────┼────────┼────────┼────────┼────────┤
│ 0 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │ 714286 │
│ 1 │ 714286 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │
└───────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘

0.09658598899841309
┌───────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
│ a │ b0 │ b1 │ b2 │ b3 │ b4 │ b5 │ b6 │
│ int64 │ int128 │ int128 │ int128 │ int128 │ int128 │ int128 │ int128 │
├───────┼────────┼────────┼────────┼────────┼────────┼────────┼────────┤
│ 0 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │ 714286 │
│ 1 │ 714286 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │
└───────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘

0.01613473892211914

real 0m0.868s
user 0m2.157s
sys 0m0.181s

root@kylin-pc:/# time python /par/test_ch2.py
0,714286,714286,714286,714285,714286,714285,714286
1,714286,714286,714286,714286,714285,714286,714285

0.23157906532287598
0,714286,714286,714286,714285,714286,714285,714286
1,714286,714286,714286,714286,714285,714286,714285

0.05921816825866699

real 0m0.694s
user 0m2.803s
sys 0m0.814s

root@kylin-pc:/# time python /par/test_pl.py
shape: (2, 8)
┌─────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
│ a ┆ b0 ┆ b1 ┆ b2 ┆ b3 ┆ b4 ┆ b5 ┆ b6 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 │
╞═════╪════════╪════════╪════════╪════════╪════════╪════════╪════════╡
│ 0 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714285 ┆ 714286 ┆ 714285 ┆ 714286 │
│ 1 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714285 ┆ 714286 ┆ 714285 │
└─────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
0.15063023567199707
shape: (2, 8)
┌─────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
│ a ┆ b0 ┆ b1 ┆ b2 ┆ b3 ┆ b4 ┆ b5 ┆ b6 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 │
╞═════╪════════╪════════╪════════╪════════╪════════╪════════╪════════╡
│ 1 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714285 ┆ 714286 ┆ 714285 │
│ 0 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714285 ┆ 714286 ┆ 714285 ┆ 714286 │
└─────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
0.11575460433959961

real 0m1.090s
user 0m1.985s
sys 0m0.454s

test_duck2.py

python 复制代码

# Benchmark: two equivalent "pivot by b" GROUP BY queries on DuckDB.
# s1 counts conditionally in a single pass over t; s2 first aggregates by
# (a, b) and then pivots the much smaller intermediate result.
import time

import duckdb


conn = duckdb.connect(":memory:")

# Test table: 10,000,000 rows with a = i%2, b = i%7, c = i%11.
conn.query("create table t as select i%2 a, i%7 b,i%11 c from range(10000000)t(i)")


s1="""
select a,
count(case b when 0 then 1 end)b0 , 
count(case b when 1 then 1 end)b1 , 
count(case b when 2 then 1 end)b2 , 
count(case b when 3 then 1 end)b3 , 
count(case b when 4 then 1 end)b4 , 
count(case b when 5 then 1 end)b5 ,
count(case b when 6 then 1 end)b6  
from t group by a;
"""

s2="""
with t1 as(select count(*)cnt ,a,b from t group by a,b)
select a,
sum(case b when 0 then cnt end)b0 , 
sum(case b when 1 then cnt end)b1 , 
sum(case b when 2 then cnt end)b2 , 
sum(case b when 3 then cnt end)b3 , 
sum(case b when 4 then cnt end)b4 , 
sum(case b when 5 then cnt end)b5 ,
sum(case b when 6 then cnt end)b6  
from t1 group by a;
"""


def run_timed(sql):
    """Run *sql* on ``conn``, print the result, then print elapsed seconds."""
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock and may jump while a measurement is in progress.
    started = time.perf_counter()
    res = conn.query(sql)
    # The result must be printed BEFORE the clock is read again: the query
    # result is materialised while it is rendered, so reading the clock
    # earlier would not measure the query itself.
    print(res)
    print(time.perf_counter() - started)


for statement in (s1, s2):
    run_timed(statement)

test_ch2.py

python 复制代码

# Benchmark: two equivalent "pivot by b" GROUP BY queries on chdb.
# s1 counts conditionally in a single pass over t; s2 first aggregates by
# (a, b) and then pivots the much smaller intermediate result.
import time

import chdb


conn = chdb.connect(":memory:")

# Test table: 10,000,000 rows with a = i%2, b = i%7, c = i%11.
conn.query("create table t as select i%2 a, i%7 b,i%11 c from (select number i from numbers(10000000))")


s1="""
select a,
count(case b when 0 then 1 end)b0 , 
count(case b when 1 then 1 end)b1 , 
count(case b when 2 then 1 end)b2 , 
count(case b when 3 then 1 end)b3 , 
count(case b when 4 then 1 end)b4 , 
count(case b when 5 then 1 end)b5 ,
count(case b when 6 then 1 end)b6  
from t group by a;
"""

s2="""
with t1 as(select count(*)cnt ,a,b from t group by a,b)
select a,
sum(case b when 0 then cnt end)b0 , 
sum(case b when 1 then cnt end)b1 , 
sum(case b when 2 then cnt end)b2 , 
sum(case b when 3 then cnt end)b3 , 
sum(case b when 4 then cnt end)b4 , 
sum(case b when 5 then cnt end)b5 ,
sum(case b when 6 then cnt end)b6  
from t1 group by a;
"""


def run_timed(sql):
    """Run *sql* on ``conn``, print the result, then print elapsed seconds."""
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock and may jump while a measurement is in progress.
    started = time.perf_counter()
    res = conn.query(sql)
    # Read the clock only after the result has been printed, so the
    # measurement covers both running the query and rendering its output.
    print(res)
    print(time.perf_counter() - started)


for statement in (s1, s2):
    run_timed(statement)

test_pl.py

python 复制代码

# Benchmark: two equivalent "pivot by b" GROUP BY queries on the polars
# SQL interface. s1 counts conditionally in a single pass over t; s2 first
# aggregates by (a, b) and then pivots the much smaller intermediate result.
import time

import polars as pl

# Source column i = 0 .. 9,999,999. list(range(...)) builds the same list as
# the element-by-element comprehension without the Python-level loop.
data = {"i": list(range(10000000))}
df = pl.LazyFrame(data)

ctx = pl.SQLContext(my_table=df, eager=True)

# Register table t (a = i%2, b = i%7, c = i%11); the returned value is not needed.
ctx.execute("create table t as select i%2 a, i%7 b,i%11 c from my_table")


s1="""
select a,
count(case b when 0 then 1 end)b0 , 
count(case b when 1 then 1 end)b1 , 
count(case b when 2 then 1 end)b2 , 
count(case b when 3 then 1 end)b3 , 
count(case b when 4 then 1 end)b4 , 
count(case b when 5 then 1 end)b5 ,
count(case b when 6 then 1 end)b6  
from t group by a;
"""

s2="""
with t1 as(select count(*)cnt ,a,b from t group by a,b)
select a,
sum(case b when 0 then cnt end)b0 , 
sum(case b when 1 then cnt end)b1 , 
sum(case b when 2 then cnt end)b2 , 
sum(case b when 3 then cnt end)b3 , 
sum(case b when 4 then cnt end)b4 , 
sum(case b when 5 then cnt end)b5 ,
sum(case b when 6 then cnt end)b6  
from t1 group by a;
"""


def run_timed(sql):
    """Run *sql* on ``ctx``, print the result, then print elapsed seconds."""
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock and may jump while a measurement is in progress.
    started = time.perf_counter()
    res = ctx.execute(sql)
    # Read the clock only after the result has been printed, so the
    # measurement covers both running the query and rendering its output.
    print(res)
    print(time.perf_counter() - started)


for statement in (s1, s2):
    run_timed(statement)

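注意有的工具是惰性的：query()只返回一个尚未执行的查询对象，真正的计算发生在print()把它转换成字符串输出的时候。Python会在调用print()之前先从左到右求出所有参数的值，所以写在同一个print()里的time.time()-t在查询真正执行之前就已经算好了，导致计时结果小得离谱。所以不能简写为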

复制代码

t=time.time()
res =conn.query(s1)
# Anti-pattern, shown on purpose: every argument is evaluated before print()
# starts rendering res, so time.time()-t is computed before a lazily
# evaluated result has been executed and the reported time is far too small.
print(res,time.time()-t)

┌───────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
│   a   │   b0   │   b1   │   b2   │   b3   │   b4   │   b5   │   b6   │
│ int64 │ int64  │ int64  │ int64  │ int64  │ int64  │ int64  │ int64  │
├───────┼────────┼────────┼────────┼────────┼────────┼────────┼────────┤
│     0 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │ 714286 │
│     1 │ 714286 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │
└───────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
 0.0005764961242675781