四种python工具包用SQL查询csv和parquet文件的方法比较

  1. chdb
sql 复制代码
root@kylin-pc:/# python3
Python 3.14.3 (main, Feb 24 2026, 22:48:09) [GCC 14.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import chdb
>>> chdb.sql("select sum(l_quantity) from file('/par/lineitem.parquet')")
1529738036

>>> chdb.sql("select sum(l_quantity) from file('/par/lineitem.csv')")
1529738036

>>> import duckdb
>>> duckdb.sql("select sum(l_quantity) from '/par/lineitem.parquet'")
┌─────────────────┐
│ sum(l_quantity) │
│  decimal(38,2)  │
├─────────────────┤
│   1529738036.00 │
└─────────────────┘

>>> duckdb.sql("select sum(l_quantity) from '/par/lineitem.csv'")
100% ▕██████████████████████████████████████▏ (00:00:06.93 elapsed)     
┌─────────────────┐
│ sum(l_quantity) │
│     double      │
├─────────────────┤
│    1529738036.0 │
└─────────────────┘
  1. polars
sql 复制代码
>>> import polars as pl

>>> df=pl.read_csv("/par/lineitem.csv", separator=",")
>>> ctx = pl.SQLContext(my_table=df, eager=True)
>>> result = ctx.execute("select sum(l_quantity) from my_table")
>>> print(result)
shape: (1, 1)
┌────────────┐
│ l_quantity │
│ ---        │
│ f64        │
╞════════════╡
│ 1.5297e9   │
└────────────┘

>>> df=pl.read_parquet("/par/lineitem.parquet")
>>> ctx = pl.SQLContext(my_table=df, eager=True)
>>> result = ctx.execute("select sum(l_quantity) from my_table")
>>> print(result)
shape: (1, 1)
┌───────────────┐
│ l_quantity    │
│ ---           │
│ decimal[15,2] │
╞═══════════════╡
│ 1529738036.00 │
└───────────────┘
  1. datafusion
sql 复制代码
>>> from datafusion import SessionContext
>>> ctx = SessionContext()


>>> ctx.register_csv("lineitem", "/par/lineitem.csv")
>>> ctx.sql("select sum(l_quantity) from lineitem")
DataFrame()
+--------------------------+
| sum(lineitem.l_quantity) |
+--------------------------+
| 1529738036.0             |
+--------------------------+
>>> ctx.register_parquet("lineitem2", "/par/lineitem.parquet")
>>> ctx.sql("select sum(l_quantity) from lineitem2")
DataFrame()
+---------------------------+
| sum(lineitem2.l_quantity) |
+---------------------------+
| 1529738036.00             |
+---------------------------+
  1. databend
sql 复制代码
>>> import databend
>>> ctx = databend.SessionContext()
>>> ctx.register_csv("lineitem", "/par/lineitem.csv")
Traceback (most recent call last):
  File "<python-input-2>", line 1, in <module>
    ctx.register_csv("lineitem", "/par/lineitem.csv")
    ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: DataFrame collect error: SemanticError. Code: 1065, Text = [QUERY-CTX] Query from CSV file lacks column positions. Specify as $1, $2, etc..

<Backtrace disabled by default. Please use RUST_BACKTRACE=1 to enable> 
>>> ctx.register_parquet("lineitem2", "/par/lineitem.parquet")
>>> ctx.sql("select sum(l_quantity) from lineitem2")
┌─────────────────────┐
│   sum(l_quantity)   │
│ Decimal(18, 2) NULL │
├─────────────────────┤
│ 1529738036.00       │
└─────────────────────┘

databend的register_csv方式据开发人员说准备废弃了。

  1. 对1000万行数据group by的性能比较

    root@kylin-pc:/# time python /par/test_duck2.py
    ┌───────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
    │ a │ b0 │ b1 │ b2 │ b3 │ b4 │ b5 │ b6 │
    │ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │
    ├───────┼────────┼────────┼────────┼────────┼────────┼────────┼────────┤
    │ 0 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │ 714286 │
    │ 1 │ 714286 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │
    └───────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘

    0.09658598899841309
    ┌───────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
    │ a │ b0 │ b1 │ b2 │ b3 │ b4 │ b5 │ b6 │
    │ int64 │ int128 │ int128 │ int128 │ int128 │ int128 │ int128 │ int128 │
    ├───────┼────────┼────────┼────────┼────────┼────────┼────────┼────────┤
    │ 0 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │ 714286 │
    │ 1 │ 714286 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │
    └───────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘

    0.01613473892211914

    real 0m0.868s
    user 0m2.157s
    sys 0m0.181s

    root@kylin-pc:/# time python /par/test_ch2.py
    0,714286,714286,714286,714285,714286,714285,714286
    1,714286,714286,714286,714286,714285,714286,714285

    0.23157906532287598
    0,714286,714286,714286,714285,714286,714285,714286
    1,714286,714286,714286,714286,714285,714286,714285

    0.05921816825866699

    real 0m0.694s
    user 0m2.803s
    sys 0m0.814s

    root@kylin-pc:/# time python /par/test_pl.py
    shape: (2, 8)
    ┌─────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
    │ a ┆ b0 ┆ b1 ┆ b2 ┆ b3 ┆ b4 ┆ b5 ┆ b6 │
    │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
    │ i64 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 │
    ╞═════╪════════╪════════╪════════╪════════╪════════╪════════╪════════╡
    │ 0 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714285 ┆ 714286 ┆ 714285 ┆ 714286 │
    │ 1 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714285 ┆ 714286 ┆ 714285 │
    └─────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
    0.15063023567199707
    shape: (2, 8)
    ┌─────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
    │ a ┆ b0 ┆ b1 ┆ b2 ┆ b3 ┆ b4 ┆ b5 ┆ b6 │
    │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
    │ i64 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 │
    ╞═════╪════════╪════════╪════════╪════════╪════════╪════════╪════════╡
    │ 1 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714285 ┆ 714286 ┆ 714285 │
    │ 0 ┆ 714286 ┆ 714286 ┆ 714286 ┆ 714285 ┆ 714286 ┆ 714285 ┆ 714286 │
    └─────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
    0.11575460433959961

    real 0m1.090s
    user 0m1.985s
    sys 0m0.454s

test_duck2.py

python 复制代码
"""Time two equivalent pivot-style GROUP BY queries on an in-memory DuckDB.

Both queries report, for each value of ``a``, how many rows have
``b == 0`` ... ``b == 6``.  For each query the script prints the result and
then the elapsed seconds for issuing the query and printing it.
"""
import time

import duckdb

conn = duckdb.connect(":memory:")

# 10,000,000 rows with a = i%2, b = i%7, c = i%11 (c is not read by the
# queries below).
conn.query("create table t as select i%2 a, i%7 b,i%11 c from range(10000000)t(i)")

# Variant 1: a single pass over t, one conditional count per value of b.
s1 = """
select a,
count(case b when 0 then 1 end)b0 ,
count(case b when 1 then 1 end)b1 ,
count(case b when 2 then 1 end)b2 ,
count(case b when 3 then 1 end)b3 ,
count(case b when 4 then 1 end)b4 ,
count(case b when 5 then 1 end)b5 ,
count(case b when 6 then 1 end)b6
from t group by a;
"""

# Variant 2: pre-aggregate to one row per (a, b), then pivot the small
# intermediate result with conditional sums.
s2 = """
with t1 as(select count(*)cnt ,a,b from t group by a,b)
select a,
sum(case b when 0 then cnt end)b0 ,
sum(case b when 1 then cnt end)b1 ,
sum(case b when 2 then cnt end)b2 ,
sum(case b when 3 then cnt end)b3 ,
sum(case b when 4 then cnt end)b4 ,
sum(case b when 5 then cnt end)b5 ,
sum(case b when 6 then cnt end)b6
from t1 group by a;
"""

for sql in (s1, s2):
    # perf_counter() is the clock intended for measuring short durations;
    # time.time() follows the wall clock, which can be adjusted mid-run.
    start = time.perf_counter()
    res = conn.query(sql)
    # Print the result BEFORE reading the clock again: the result may be
    # lazy, in which case the query only really runs when it is rendered.
    print(res)
    print(time.perf_counter() - start)

test_ch2.py

python 复制代码
"""Time two equivalent pivot-style GROUP BY queries on an in-memory chdb.

Both queries report, for each value of ``a``, how many rows have
``b == 0`` ... ``b == 6``.  For each query the script prints the result and
then the elapsed seconds for issuing the query and printing it.
"""
import time

import chdb

conn = chdb.connect(":memory:")

# 10,000,000 rows with a = i%2, b = i%7, c = i%11 (c is not read by the
# queries below).
conn.query("create table t as select i%2 a, i%7 b,i%11 c from (select number i from numbers(10000000))")

# Variant 1: a single pass over t, one conditional count per value of b.
s1 = """
select a,
count(case b when 0 then 1 end)b0 ,
count(case b when 1 then 1 end)b1 ,
count(case b when 2 then 1 end)b2 ,
count(case b when 3 then 1 end)b3 ,
count(case b when 4 then 1 end)b4 ,
count(case b when 5 then 1 end)b5 ,
count(case b when 6 then 1 end)b6
from t group by a;
"""

# Variant 2: pre-aggregate to one row per (a, b), then pivot the small
# intermediate result with conditional sums.
s2 = """
with t1 as(select count(*)cnt ,a,b from t group by a,b)
select a,
sum(case b when 0 then cnt end)b0 ,
sum(case b when 1 then cnt end)b1 ,
sum(case b when 2 then cnt end)b2 ,
sum(case b when 3 then cnt end)b3 ,
sum(case b when 4 then cnt end)b4 ,
sum(case b when 5 then cnt end)b5 ,
sum(case b when 6 then cnt end)b6
from t1 group by a;
"""

for sql in (s1, s2):
    # perf_counter() is the clock intended for measuring short durations;
    # time.time() follows the wall clock, which can be adjusted mid-run.
    start = time.perf_counter()
    res = conn.query(sql)
    # Print the result BEFORE reading the clock again, so that rendering
    # (and any deferred work behind it) is inside the measured interval.
    print(res)
    print(time.perf_counter() - start)

test_pl.py

python 复制代码
"""Time two equivalent pivot-style GROUP BY queries through polars' SQLContext.

Both queries report, for each value of ``a``, how many rows have
``b == 0`` ... ``b == 6``.  For each query the script prints the result and
then the elapsed seconds for executing the query and printing it.
"""
import time

import polars as pl

# 10,000,000 consecutive integers; list(range(n)) avoids the per-element
# Python loop of an equivalent copy-comprehension.
data = {"i": list(range(10000000))}
df = pl.LazyFrame(data)

ctx = pl.SQLContext(my_table=df, eager=True)

# Derive the test table t with a = i%2, b = i%7, c = i%11 (c is not read by
# the queries below).
result = ctx.execute("create table t as select i%2 a, i%7 b,i%11 c from my_table")

# Variant 1: a single pass over t, one conditional count per value of b.
s1 = """
select a,
count(case b when 0 then 1 end)b0 ,
count(case b when 1 then 1 end)b1 ,
count(case b when 2 then 1 end)b2 ,
count(case b when 3 then 1 end)b3 ,
count(case b when 4 then 1 end)b4 ,
count(case b when 5 then 1 end)b5 ,
count(case b when 6 then 1 end)b6
from t group by a;
"""

# Variant 2: pre-aggregate to one row per (a, b), then pivot the small
# intermediate result with conditional sums.
s2 = """
with t1 as(select count(*)cnt ,a,b from t group by a,b)
select a,
sum(case b when 0 then cnt end)b0 ,
sum(case b when 1 then cnt end)b1 ,
sum(case b when 2 then cnt end)b2 ,
sum(case b when 3 then cnt end)b3 ,
sum(case b when 4 then cnt end)b4 ,
sum(case b when 5 then cnt end)b5 ,
sum(case b when 6 then cnt end)b6
from t1 group by a;
"""

for sql in (s1, s2):
    # perf_counter() is the clock intended for measuring short durations;
    # time.time() follows the wall clock, which can be adjusted mid-run.
    start = time.perf_counter()
    res = ctx.execute(sql)
    # Print the result BEFORE reading the clock again, so that rendering
    # is inside the measured interval, as it was in the original script.
    print(res)
    print(time.perf_counter() - start)

注意有的工具是惰性的:查询结果直到被打印(转换成字符串)时才真正执行。Python会先按从左到右的顺序求出print()的全部参数,然后才在print()内部把res转成字符串,因此time.time()-t在查询真正执行之前就已经算好了,导致出现偏小的奇怪计时结果。所以不能简写为

复制代码
# Anti-pattern, kept on purpose as the counter-example: every argument of
# print() is evaluated before print() itself runs, so time.time()-t is
# computed before print() converts res to text. If res is lazy, the query
# work happens after the clock was read and the printed duration is
# misleadingly small.
t=time.time()
res =conn.query(s1)
print(res,time.time()-t)

┌───────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐
│   a   │   b0   │   b1   │   b2   │   b3   │   b4   │   b5   │   b6   │
│ int64 │ int64  │ int64  │ int64  │ int64  │ int64  │ int64  │ int64  │
├───────┼────────┼────────┼────────┼────────┼────────┼────────┼────────┤
│     0 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │ 714286 │
│     1 │ 714286 │ 714286 │ 714286 │ 714286 │ 714285 │ 714286 │ 714285 │
└───────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘
 0.0005764961242675781
相关推荐
清水白石0082 小时前
Python 并发三剑客:多线程、多进程与协程的实战抉择
java·服务器·python
2301_793804692 小时前
更优雅的测试:Pytest框架入门
jvm·数据库·python
V1ncent Chen2 小时前
SQL大师之路 10 连接基础
数据库·sql·mysql·数据分析
sevenlin2 小时前
MySQL数据库(SQL分类)
数据库·sql·mysql
dinl_vin2 小时前
python:常用的基础工具包
开发语言·python
wefly20172 小时前
无需安装、开箱即用!m3u8live.cn 在线 HLS 播放器,调试直播流效率翻倍
前端·后端·python·前端开发工具·后端开发工具
2301_815482933 小时前
用Python实现自动化的Web测试(Selenium)
jvm·数据库·python
将心ONE3 小时前
melo tts安装使用
python
weixin_505154463 小时前
博维数孪创新引领,3D作业指导助力制造业升级
大数据·人工智能·3d·数字孪生·数据可视化·产品交互展示