DeepSeek辅助测试不同文件格式的读写性能和大小

网上看到有测试多种文件格式的脚本,想添加 Vortex 的测试,于是把以下文档和原始代码上传给 DeepSeek。

python 复制代码
Install
pip install vortex-data
Convert
You can either use your own Parquet file or download the example file used here.

Use Arrow to read a Parquet file and then use array() to construct an uncompressed Vortex array:

import pyarrow.parquet as pq
import vortex as vx
parquet = pq.read_table("_static/example.parquet")
vtx = vx.array(parquet)
vtx.nbytes
141024
Write
Use write() to write the Vortex array to disk:

import vortex as vx
vx.io.write(vtx, "example.vortex")
Small Vortex files (this one is just 71KiB) currently have substantial overhead relative to their size. This will be addressed shortly. On files with at least tens of megabytes of data, Vortex is similar to or smaller than Parquet.

from os.path import getsize
getsize("example.vortex") / getsize("_static/example.parquet")
2.0...
Read
Use open() to open and read the Vortex array from disk:

import vortex as vx
cvtx = vx.open("example.vortex").scan().read_all()
Vortex is architected to achieve fast random access, in many cases hundreds of times faster than what can be achieved with Parquet.

If you have an external index that gives you specific rows to pull out of the Vortex file, you can skip a lot more IO and decoding and read just the data that is relevant to you:

import vortex as vx
vf = vx.open("example.vortex")
# row indices must be ordered and unique
indices = vx.array([1, 2, 10])
result = vf.scan(indices=indices).read_all()
assert len(result) == 3

优化附件test_fmt代码,尽量减少冗余,并添加vortex文件格式测试,调用方法见附件py-vortex

它优化后的脚本如下:

python 复制代码
import os
import time
import shutil
import pandas as pd
import numpy as np

# Optional dependencies: each name is bound to None when its import fails.
try:
    import pyarrow as pa
    import pyarrow.parquet as pq
except ImportError:
    pa = None
    pq = None

# NOTE: ImportError (including ModuleNotFoundError) is also raised when the
# package is installed but one of its own imports fails, so vx being None
# does not by itself prove that vortex is missing.
try:
    import vortex as vx
except ImportError:
    vx = None

# NOTE(review): h5py and feather are not referenced by the code visible here;
# confirm whether these guards are still needed.
try:
    import h5py
except ImportError:
    h5py = None

try:
    import feather
except ImportError:
    feather = None


# Seed NumPy's global random generator so repeated runs are reproducible
np.random.seed(42)


def generate_random_data(num_records=1000000):
    """Build a DataFrame of ``num_records`` synthetic rows.

    Columns, in order: ``id`` (0 .. num_records - 1), ``name`` (one of
    'Alice', 'Bob', 'Charlie'), ``age`` (integers from 20 up to, but not
    including, 80) and ``salary`` (floats drawn uniformly between 20000 and
    100000). Random values come from NumPy's global random state.
    """
    ids = np.arange(num_records)

    # Draw in a fixed order (name, age, salary) so a given seed always
    # produces the same frame.
    names = np.random.choice(['Alice', 'Bob', 'Charlie'], size=num_records)
    ages = np.random.randint(20, 80, size=num_records)
    salaries = np.random.uniform(20000, 100000, size=num_records)

    columns = {'id': ids, 'name': names, 'age': ages, 'salary': salaries}
    return pd.DataFrame(columns)


def benchmark_format(df, filename, write_func, read_func, format_name):
    """Time one write/read round trip for a file format and print a report.

    Args:
        df: Data passed unchanged to ``write_func``.
        filename: Path written by ``write_func`` and read by ``read_func``.
        write_func: Callable invoked as ``write_func(df, filename)``.
        read_func: Callable invoked as ``read_func(filename)``; its return
            value is discarded.
        format_name: Label used in the printed report.

    Returns:
        A tuple ``(write_time, read_time, file_size, score)`` where the times
        are in seconds, ``file_size`` is in bytes and ``score`` is
        ``write_time * read_time * size_in_MiB``.
    """
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock, which can be adjusted while a measurement is running.
    started = time.perf_counter()
    write_func(df, filename)
    write_time = time.perf_counter() - started

    file_size = os.path.getsize(filename)
    size_mb = file_size / (1024 * 1024)

    started = time.perf_counter()
    read_func(filename)
    read_time = time.perf_counter() - started

    # Composite score: write time * read time * file size (MiB)
    score = write_time * read_time * size_mb

    print(f"\n{format_name} 测试结果:")
    print(f"  写入时间: {write_time:.2f} 秒")
    print(f"  读取时间: {read_time:.2f} 秒")
    print(f"  文件大小: {size_mb:.2f} MB")
    print(f"  综合评分: {score:.2f}")

    return write_time, read_time, file_size, score


def test_csv(df, filename):
    """Benchmark CSV: written without the index column, read back by pandas."""
    def write_csv(frame, path):
        frame.to_csv(path, index=False)

    return benchmark_format(df, filename, write_csv, pd.read_csv, "CSV")


def test_parquet(df, filename):
    """Benchmark Parquet using the pyarrow engine with SNAPPY compression."""
    def write_parquet(frame, path):
        frame.to_parquet(path, engine='pyarrow', compression='SNAPPY')

    def read_parquet(path):
        return pd.read_parquet(path, engine='pyarrow')

    return benchmark_format(df, filename, write_parquet, read_parquet, "Parquet")


def test_feather(df, filename):
    """Benchmark Feather through pandas' to_feather / read_feather."""
    def write_feather(frame, path):
        frame.to_feather(path)

    return benchmark_format(df, filename, write_feather, pd.read_feather, "Feather")


def test_hdf5(df, filename):
    """Benchmark HDF5: the frame is stored under key 'data' in a fresh file."""
    def write_hdf(frame, path):
        # mode='w' starts a new file rather than appending to an existing one
        frame.to_hdf(path, key='data', mode='w')

    def read_hdf(path):
        return pd.read_hdf(path, key='data')

    return benchmark_format(df, filename, write_hdf, read_hdf, "HDF5")


def test_vortex(df, filename):
    """Benchmark the Vortex format, going through an Arrow table.

    The pandas -> Arrow conversion happens inside the timed write step, so the
    write time covers the whole DataFrame-to-file path. The other formats in
    this file are also timed starting from the DataFrame; converting before
    the timer starts would understate Vortex's write cost.

    Args:
        df: DataFrame to write.
        filename: Path of the Vortex file to create and read back.

    Returns:
        The ``(write_time, read_time, file_size, score)`` tuple produced by
        ``benchmark_format``.
    """
    def write_func(d, f):
        # DataFrame -> Arrow Table -> Vortex array -> file
        table = pa.Table.from_pandas(d)
        vx.io.write(vx.array(table), f)

    def read_func(f):
        # NOTE(review): the scan result is returned as-is, while the other
        # formats read back into pandas; confirm whether a conversion to
        # pandas should be part of the timed read for a like-for-like result.
        return vx.open(f).scan().read_all()

    return benchmark_format(df, filename, write_func, read_func, "Vortex")


def cleanup_files(*filenames):
    """Delete each of the given files, skipping paths that do not exist.

    Args:
        *filenames: Paths of the files to remove.
    """
    for filename in filenames:
        # Attempt the removal directly instead of checking first, so a file
        # that vanishes between a check and the removal cannot raise.
        try:
            os.remove(filename)
        except FileNotFoundError:
            pass


def main(num_records=20_000_000):
    """Benchmark every available format on one generated dataset and rank them.

    Generates the data once, runs each format's test function against it,
    prints a summary ordered by composite score (lowest first), and removes
    the generated files afterwards, even when a benchmark raises.

    Args:
        num_records: Number of rows to generate. Defaults to 20,000,000;
            pass a smaller value for a quick run.
    """
    # Each entry: (format name, output file, test function)
    formats = [
        ('csv', 'random_data.csv', test_csv),
        ('parquet', 'random_data.parquet', test_parquet),
        ('feather', 'random_data.feather', test_feather),
        ('hdf5', 'random_data.h5', test_hdf5),
    ]

    # Vortex is benchmarked only when both vortex and pyarrow were imported
    if vx is not None and pa is not None:
        formats.append(('vortex', 'random_data.vortex', test_vortex))
    else:
        print("警告: vortex 或 pyarrow 未安装,跳过 Vortex 测试")

    print(f"开始生成 {num_records:,} 条随机数据...")
    df = generate_random_data(num_records=num_records)
    print("数据生成完成")

    results = []
    filenames = []

    try:
        for fmt_name, filename, test_func in formats:
            # Record the path before running, so a partially written file is
            # still cleaned up if the benchmark fails midway.
            filenames.append(filename)
            result = test_func(df, filename)
            results.append((fmt_name, result))

        # Summary, sorted by composite score (the fourth element of each result)
        print("\n" + "=" * 60)
        print("综合性能对比 (评分越低越好):")
        print("=" * 60)
        for fmt_name, (_, _, _, score) in sorted(results, key=lambda x: x[1][3]):
            print(f"{fmt_name.upper():10} 综合评分: {score:.2f}")

    finally:
        # Remove the generated files
        cleanup_files(*filenames)
        print("\n临时文件已清理")

# Run the benchmark only when this file is executed as a script
if __name__ == "__main__":
    main()

再用 pip install 安装 feather-format、h5py、tables 和 vortex-data,都没有报错。

但是运行脚本时,Vortex 测试被跳过了:脚本把 import 失败当成了“未安装”,只打印了一条警告。

复制代码
python3 test_fmt2.py
警告: vortex 或 pyarrow 未安装,跳过 Vortex 测试
开始生成 20,000,000 条随机数据...
数据生成完成

CSV 测试结果:
  写入时间: 58.77 秒
  读取时间: 8.98 秒
  文件大小: 678.03 MB
  综合评分: 357682.29

Parquet 测试结果:
  写入时间: 3.16 秒
  读取时间: 0.71 秒
  文件大小: 258.49 MB
  综合评分: 582.74

Feather 测试结果:
  写入时间: 1.02 秒
  读取时间: 0.51 秒
  文件大小: 393.72 MB
  综合评分: 205.16

HDF5 测试结果:
  写入时间: 9.37 秒
  读取时间: 4.85 秒
  文件大小: 764.01 MB
  综合评分: 34740.01

============================================================
综合性能对比 (评分越低越好):
============================================================
FEATHER    综合评分: 205.16
PARQUET    综合评分: 582.74
HDF5       综合评分: 34740.01
CSV        综合评分: 357682.29

临时文件已清理

手工 import vortex 时报错:

复制代码
python3
Python 3.14.3 (main, Feb 24 2026, 22:48:09) [GCC 14.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import vortex as vx
Traceback (most recent call last):
  File "/usr/local/lib/python3.14/site-packages/vortex/substrait.py", line 23, in <module>
    from substrait.extensions.extensions_pb2 import SimpleExtensionDeclaration, SimpleExtensionURI
ImportError: cannot import name 'SimpleExtensionURI' from 'substrait.extensions.extensions_pb2' (/usr/local/lib/python3.14/site-packages/substrait/extensions/extensions_pb2.py)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<python-input-0>", line 1, in <module>
    import vortex as vx
  File "/usr/local/lib/python3.14/site-packages/vortex/__init__.py", line 6, in <module>
    from . import _lib, arrays, dataset, expr, file, io, ray, registry, scan
  File "/usr/local/lib/python3.14/site-packages/vortex/dataset.py", line 18, in <module>
    from .arrow.expression import ensure_vortex_expression
  File "/usr/local/lib/python3.14/site-packages/vortex/arrow/expression.py", line 14, in <module>
    from ..substrait import extended_expression
  File "/usr/local/lib/python3.14/site-packages/vortex/substrait.py", line 27, in <module>
    from substrait.gen.proto.algebra_pb2 import Expression, FunctionArgument
ModuleNotFoundError: No module named 'substrait.gen'

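看 PyPI 上 whl 文件名是 vortex_data-0.64.0-cp311-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl。(注:cp311-abi3 表示该 wheel 使用稳定 ABI,适用于 CPython 3.11 及以上版本,并不限定只能用 3.11;从上面的报错看,直接原因是已安装的 substrait 包里缺少 vortex 期望的模块和名称,可能与该环境中安装到的 substrait 版本有关。)

改用 Python 3.11 的机器测试,果然就可以了。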

复制代码
Vortex 测试结果:
  写入时间: 2.21 秒
  读取时间: 0.26 秒
  文件大小: 148.04 MB
  综合评分: 84.42

============================================================
综合性能对比 (评分越低越好):
============================================================
VORTEX     综合评分: 84.42
PARQUET    综合评分: 1190.20
FEATHER    综合评分: 2050.87
HDF5       综合评分: 43331.76
CSV        综合评分: 146389.94
相关推荐
IT知识分享2 小时前
从零开发在线简繁转换工具:OpenCC 实战、避坑经验与方案选型
javascript·python
2601_959480152 小时前
Moneta Markets亿汇:“比特币反弹走势仍脆弱”
人工智能
lunzi_08262 小时前
【学习笔记】《Python编程 从入门到实践》第8章:函数定义、参数传递与模块导入
笔记·python·学习
ULIi096kr2 小时前
MySQL解决Too many connections报错:连接数爆满排查、优化与永久解决方案
数据库·mysql·adb
没事别瞎琢磨2 小时前
六、输出捕获与截断
人工智能·node.js
嘉子的秃头日记2 小时前
TRO 2026|轮椅也能“猜到”用户想往哪走?
大数据·人工智能·机器学习
杨运交2 小时前
[030][Web模块]Spring Boot 验证与 OpenAPI 集成实战:从校验规则到文档生成
前端·spring boot·python
2601_957190902 小时前
极致裸眼沉浸!飞行影院重塑文旅游玩新体验
大数据·人工智能·旅游
Meinianda2 小时前
我用Agent 使用瑞幸官方MCP下了一单:过程全记录,优缺点分析
人工智能
没事别瞎琢磨2 小时前
七、敏感路径预检——Protected Paths
人工智能·node.js