- chdb
sql
root@kylin-pc:/# python3
Python 3.14.3 (main, Feb 24 2026, 22:48:09) [GCC 14.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import chdb
>>> chdb.sql("select sum(l_quantity) from file('/par/lineitem.parquet')")
1529738036
>>> chdb.sql("select sum(l_quantity) from file('/par/lineitem.csv')")
1529738036
>>> import duckdb
>>> duckdb.sql("select sum(l_quantity) from '/par/lineitem.parquet'")
┌─────────────────┐
│ sum(l_quantity) │
│ decimal(38,2) │
├─────────────────┤
│ 1529738036.00 │
└─────────────────┘
>>> duckdb.sql("select sum(l_quantity) from '/par/lineitem.csv'")
100% ▕██████████████████████████████████████▏ (00:00:06.93 elapsed)
┌─────────────────┐
│ sum(l_quantity) │
│ double │
├─────────────────┤
│ 1529738036.0 │
└─────────────────┘
- polars
sql
>>> import polars as pl
>>> df=pl.read_csv("/par/lineitem.csv", separator=",")
>>> ctx = pl.SQLContext(my_table=df, eager=True)
>>> result = ctx.execute("select sum(l_quantity) from my_table")
>>> print(result)
shape: (1, 1)
┌────────────┐
│ l_quantity │
│ --- │
│ f64 │
╞════════════╡
│ 1.5297e9 │
└────────────┘
>>> df=pl.read_parquet("/par/lineitem.parquet")
>>> ctx = pl.SQLContext(my_table=df, eager=True)
>>> result = ctx.execute("select sum(l_quantity) from my_table")
>>> print(result)
shape: (1, 1)
┌───────────────┐
│ l_quantity │
│ --- │
│ decimal[15,2] │
╞═══════════════╡
│ 1529738036.00 │
└───────────────┘
- datafusion
sql
>>> from datafusion import SessionContext
>>> ctx = SessionContext()
>>> ctx.register_csv("lineitem", "/par/lineitem.csv")
>>> ctx.sql("select sum(l_quantity) from lineitem")
DataFrame()
+--------------------------+
| sum(lineitem.l_quantity) |
+--------------------------+
| 1529738036.0 |
+--------------------------+
>>> ctx.register_parquet("lineitem2", "/par/lineitem.parquet")
>>> ctx.sql("select sum(l_quantity) from lineitem2")
DataFrame()
+---------------------------+
| sum(lineitem2.l_quantity) |
+---------------------------+
| 1529738036.00 |
+---------------------------+
- databend
sql
>>> import databend
>>> ctx = databend.SessionContext()
>>> ctx.register_csv("lineitem", "/par/lineitem.csv")
Traceback (most recent call last):
File "<python-input-2>", line 1, in <module>
ctx.register_csv("lineitem", "/par/lineitem.csv")
~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: DataFrame collect error: SemanticError. Code: 1065, Text = [QUERY-CTX] Query from CSV file lacks column positions. Specify as $1, $2, etc..
<Backtrace disabled by default. Please use RUST_BACKTRACE=1 to enable>
>>> ctx.register_parquet("lineitem2", "/par/lineitem.parquet")
>>> ctx.sql("select sum(l_quantity) from lineitem2")
┌─────────────────────┐
│ sum(l_quantity) │
│ Decimal(18, 2) NULL │
├─────────────────────┤
│ 1529738036.00 │
└─────────────────────┘
databend的register_csv方式据开发人员说准备废弃了。
-
对1000万行数据group by的性能比较
root@kylin-pc:/# time python /par/test_duck2.py
┌───────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
│ a │ b0 │ b1 │ b2 │ b3 │ b4 │ b5 │ b6 │
│ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │
├───────┼────────┼────────┼────────┼────────┼────────┼────────┼────────┤
│ 0 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │ 714286 │
│ 1 │ 714286 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │
└───────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
0.09658598899841309
┌───────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
│ a │ b0 │ b1 │ b2 │ b3 │ b4 │ b5 │ b6 │
│ int64 │ int128 │ int128 │ int128 │ int128 │ int128 │ int128 │ int128 │
├───────┼────────┼────────┼────────┼────────┼────────┼────────┼────────┤
│ 0 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │ 714286 │
│ 1 │ 714286 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │
└───────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
0.01613473892211914
real 0m0.868s
user 0m2.157s
sys 0m0.181s
root@kylin-pc:/# time python /par/test_ch2.py
0,714286,714286,714286,714285,714286,714285,714286
1,714286,714286,714286,714286,714285,714286,714285
0.23157906532287598
0,714286,714286,714286,714285,714286,714285,714286
1,714286,714286,714286,714286,714285,714286,714285
0.05921816825866699
real 0m0.694s
user 0m2.803s
sys 0m0.814s
root@kylin-pc:/# time python /par/test_pl.py
shape: (2, 8)
┌─────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
│ a ┆ b0 ┆ b1 ┆ b2 ┆ b3 ┆ b4 ┆ b5 ┆ b6 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 │
╞═════╪════════╪════════╪════════╪════════╪════════╪════════╪════════╡
│ 0 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714285 ┆ 714286 ┆ 714285 ┆ 714286 │
│ 1 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714285 ┆ 714286 ┆ 714285 │
└─────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
0.15063023567199707
shape: (2, 8)
┌─────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
│ a ┆ b0 ┆ b1 ┆ b2 ┆ b3 ┆ b4 ┆ b5 ┆ b6 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 │
╞═════╪════════╪════════╪════════╪════════╪════════╪════════╪════════╡
│ 1 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714285 ┆ 714286 ┆ 714285 │
│ 0 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714285 ┆ 714286 ┆ 714285 ┆ 714286 │
└─────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
0.11575460433959961
real 0m1.090s
user 0m1.985s
sys 0m0.454s
test_duck2.py
python
"""Benchmark two equivalent pivot-style GROUP BY queries in DuckDB.

Builds an in-memory table ``t`` with 10,000,000 rows, then runs and times
two queries that both count rows per ``a`` with one output column per
value of ``b``.
"""
import time

import duckdb

conn = duckdb.connect(":memory:")
conn.query("create table t as select i%2 a, i%7 b,i%11 c from range(10000000)t(i)")

# Variant 1: a single pass over t with one conditional count per b value.
s1="""
select a,
count(case b when 0 then 1 end)b0 ,
count(case b when 1 then 1 end)b1 ,
count(case b when 2 then 1 end)b2 ,
count(case b when 3 then 1 end)b3 ,
count(case b when 4 then 1 end)b4 ,
count(case b when 5 then 1 end)b5 ,
count(case b when 6 then 1 end)b6
from t group by a;
"""

# Variant 2: pre-aggregate counts per (a, b) in a CTE, then pivot by
# summing the pre-aggregated counts.
s2="""
with t1 as(select count(*)cnt ,a,b from t group by a,b)
select a,
sum(case b when 0 then cnt end)b0 ,
sum(case b when 1 then cnt end)b1 ,
sum(case b when 2 then cnt end)b2 ,
sum(case b when 3 then cnt end)b3 ,
sum(case b when 4 then cnt end)b4 ,
sum(case b when 5 then cnt end)b5 ,
sum(case b when 6 then cnt end)b6
from t1 group by a;
"""


def run_timed(sql):
    """Run *sql*, print its result, then print the elapsed seconds.

    Uses ``time.perf_counter()``, a monotonic high-resolution clock, so the
    measurement is not affected by system clock adjustments.
    """
    start = time.perf_counter()
    res = conn.query(sql)
    # Printing the result stays inside the timed region on purpose: the
    # query result may be evaluated lazily, i.e. only when it is rendered.
    print(res)
    # Separate print() call: the elapsed time must be computed only after
    # the result has been rendered.
    print(time.perf_counter() - start)


run_timed(s1)
run_timed(s2)
test_ch2.py
python
"""Benchmark two equivalent pivot-style GROUP BY queries in chDB.

Builds an in-memory table ``t`` with 10,000,000 rows, then runs and times
two queries that both count rows per ``a`` with one output column per
value of ``b``.
"""
import time

import chdb

conn = chdb.connect(":memory:")
conn.query("create table t as select i%2 a, i%7 b,i%11 c from (select number i from numbers(10000000))")

# Variant 1: a single pass over t with one conditional count per b value.
s1="""
select a,
count(case b when 0 then 1 end)b0 ,
count(case b when 1 then 1 end)b1 ,
count(case b when 2 then 1 end)b2 ,
count(case b when 3 then 1 end)b3 ,
count(case b when 4 then 1 end)b4 ,
count(case b when 5 then 1 end)b5 ,
count(case b when 6 then 1 end)b6
from t group by a;
"""

# Variant 2: pre-aggregate counts per (a, b) in a CTE, then pivot by
# summing the pre-aggregated counts.
s2="""
with t1 as(select count(*)cnt ,a,b from t group by a,b)
select a,
sum(case b when 0 then cnt end)b0 ,
sum(case b when 1 then cnt end)b1 ,
sum(case b when 2 then cnt end)b2 ,
sum(case b when 3 then cnt end)b3 ,
sum(case b when 4 then cnt end)b4 ,
sum(case b when 5 then cnt end)b5 ,
sum(case b when 6 then cnt end)b6
from t1 group by a;
"""


def run_timed(sql):
    """Run *sql*, print its result, then print the elapsed seconds.

    Uses ``time.perf_counter()``, a monotonic high-resolution clock, so the
    measurement is not affected by system clock adjustments.
    """
    start = time.perf_counter()
    res = conn.query(sql)
    # Printing the result stays inside the timed region so that the
    # measurement covers the same work regardless of whether the engine
    # evaluates eagerly or only when the result is rendered.
    print(res)
    # Separate print() call: the elapsed time must be computed only after
    # the result has been rendered.
    print(time.perf_counter() - start)


run_timed(s1)
run_timed(s2)
test_pl.py
python
"""Benchmark two equivalent pivot-style GROUP BY queries in Polars SQL.

Registers a 10,000,000-row frame as ``my_table``, derives table ``t`` from
it through the SQL context, then runs and times two queries that both
count rows per ``a`` with one output column per value of ``b``.
"""
import time

import polars as pl

# list(range(...)) builds the same list as the element-by-element
# comprehension, without the Python-level loop.
data = {"i": list(range(10000000))}
df = pl.LazyFrame(data)
ctx = pl.SQLContext(my_table=df, eager=True)
# Only the side effect (registering table t in the context) is needed; the
# returned frame is intentionally discarded.
ctx.execute("create table t as select i%2 a, i%7 b,i%11 c from my_table")

# Variant 1: a single pass over t with one conditional count per b value.
s1="""
select a,
count(case b when 0 then 1 end)b0 ,
count(case b when 1 then 1 end)b1 ,
count(case b when 2 then 1 end)b2 ,
count(case b when 3 then 1 end)b3 ,
count(case b when 4 then 1 end)b4 ,
count(case b when 5 then 1 end)b5 ,
count(case b when 6 then 1 end)b6
from t group by a;
"""

# Variant 2: pre-aggregate counts per (a, b) in a CTE, then pivot by
# summing the pre-aggregated counts.
s2="""
with t1 as(select count(*)cnt ,a,b from t group by a,b)
select a,
sum(case b when 0 then cnt end)b0 ,
sum(case b when 1 then cnt end)b1 ,
sum(case b when 2 then cnt end)b2 ,
sum(case b when 3 then cnt end)b3 ,
sum(case b when 4 then cnt end)b4 ,
sum(case b when 5 then cnt end)b5 ,
sum(case b when 6 then cnt end)b6
from t1 group by a;
"""


def run_timed(sql):
    """Run *sql*, print its result, then print the elapsed seconds.

    Uses ``time.perf_counter()``, a monotonic high-resolution clock, so the
    measurement is not affected by system clock adjustments.
    """
    start = time.perf_counter()
    res = ctx.execute(sql)
    # Printing the result stays inside the timed region so the measurement
    # matches the other engines' scripts, which also time the rendering.
    print(res)
    # Separate print() call: the elapsed time must be computed only after
    # the result has been rendered.
    print(time.perf_counter() - start)


run_timed(s1)
run_timed(s2)
注意有的工具是惰性的:查询结果要到 print(res) 真正输出时才会被计算。而 Python 会先按从左到右的顺序求出所有参数的值,然后才调用 print(),因此 time.time()-t 在查询实际执行之前就已经算好了,导致计时结果异常偏小。所以不能简写为
# Anti-pattern, shown on purpose -- do not time a query this way.
# Python evaluates every argument before calling print(), so time.time()-t is
# computed before print() renders res. When res is lazy, the query has not
# executed at that point and the printed duration excludes the query itself.
t=time.time()
res =conn.query(s1)
print(res,time.time()-t)
┌───────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
│ a │ b0 │ b1 │ b2 │ b3 │ b4 │ b5 │ b6 │
│ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │
├───────┼────────┼────────┼────────┼────────┼────────┼────────┼────────┤
│ 0 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │ 714286 │
│ 1 │ 714286 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │
└───────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
0.0005764961242675781