DeepSeek辅助测试不同文件格式的读写性能和大小

在网上看到有测试多种格式的脚本，想添加vortex的测试，于是把以下文档和原始代码上传。

python 复制代码

Install
pip install vortex-data
Convert
You can either use your own Parquet file or download the example file used here.

Use Arrow to read a Parquet file and then use array() to construct an uncompressed Vortex array:

import pyarrow.parquet as pq
import vortex as vx
parquet = pq.read_table("_static/example.parquet")
vtx = vx.array(parquet)
vtx.nbytes
141024
Write
Use write() to write the Vortex array to disk:

import vortex as vx
vx.io.write(vtx, "example.vortex")
Small Vortex files (this one is just 71KiB) currently have substantial overhead relative to their size. This will be addressed shortly. On files with at least tens of megabytes of data, Vortex is similar to or smaller than Parquet.

from os.path import getsize
getsize("example.vortex") / getsize("_static/example.parquet")
2.0...
Read
Use open() to open and read the Vortex array from disk:

import vortex as vx
cvtx = vx.open("example.vortex").scan().read_all()
Vortex is architected to achieve fast random access, in many cases hundreds of times faster than what can be achieved with Parquet.

If you have an external index that gives you specific rows to pull out of the Vortex file, you can skip a lot more IO and decoding and read just the data that is relevant to you:

import vortex as vx
vf = vx.open("example.vortex")
# row indices must be ordered and unique
indices = vx.array([1, 2, 10])
result = vf.scan(indices=indices).read_all()
assert len(result) == 3

优化附件test_fmt代码，尽量减少冗余，并添加vortex文件格式测试，调用方法见附件py-vortex

他优化后的脚本如下：

python 复制代码

import os
import time
import shutil
import pandas as pd
import numpy as np

# 可选依赖
try:
    import pyarrow as pa
    import pyarrow.parquet as pq
except ImportError:
    pa = None
    pq = None

try:
    import vortex as vx
except ImportError:
    vx = None

try:
    import h5py
except ImportError:
    h5py = None

try:
    import feather
except ImportError:
    feather = None


# 设置随机数种子以确保可重复性
np.random.seed(42)


def generate_random_data(num_records=1000000):
    """生成随机数据"""
    data = {
        'id': np.arange(num_records),
        'name': np.random.choice(['Alice', 'Bob', 'Charlie'], size=num_records),
        'age': np.random.randint(20, 80, size=num_records),
        'salary': np.random.uniform(20000, 100000, size=num_records)
    }
    return pd.DataFrame(data)


def benchmark_format(df, filename, write_func, read_func, format_name):
    """通用基准测试函数"""
    # 写入测试
    start_time = time.time()
    write_func(df, filename)
    write_time = time.time() - start_time

    # 文件大小
    file_size = os.path.getsize(filename)

    # 读取测试
    start_time = time.time()
    read_func(filename)
    read_time = time.time() - start_time

    # 综合评分：写入时间 * 读取时间 * 文件大小(MB)
    score = write_time * read_time * (file_size / (1024 * 1024))

    print(f"\n{format_name} 测试结果:")
    print(f"  写入时间: {write_time:.2f} 秒")
    print(f"  读取时间: {read_time:.2f} 秒")
    print(f"  文件大小: {file_size / (1024 * 1024):.2f} MB")
    print(f"  综合评分: {score:.2f}")

    return write_time, read_time, file_size, score


def test_csv(df, filename):
    """CSV 格式测试"""
    return benchmark_format(
        df, filename,
        lambda d, f: d.to_csv(f, index=False),
        lambda f: pd.read_csv(f),
        "CSV"
    )


def test_parquet(df, filename):
    """Parquet 格式测试"""
    return benchmark_format(
        df, filename,
        lambda d, f: d.to_parquet(f, engine='pyarrow', compression='SNAPPY'),
        lambda f: pd.read_parquet(f, engine='pyarrow'),
        "Parquet"
    )


def test_feather(df, filename):
    """Feather 格式测试"""
    return benchmark_format(
        df, filename,
        lambda d, f: d.to_feather(f),
        lambda f: pd.read_feather(f),
        "Feather"
    )


def test_hdf5(df, filename):
    """HDF5 格式测试"""
    return benchmark_format(
        df, filename,
        lambda d, f: d.to_hdf(f, key='data', mode='w'),
        lambda f: pd.read_hdf(f, key='data'),
        "HDF5"
    )


def test_vortex(df, filename):
    """Vortex 格式测试"""
    # 将 DataFrame 转换为 Arrow Table，再转换为 Vortex Array
    table = pa.Table.from_pandas(df)

    def write_func(d, f):
        vtx = vx.array(table)
        vx.io.write(vtx, f)

    def read_func(f):
        return vx.open(f).scan().read_all()

    return benchmark_format(
        df, filename,
        write_func,
        read_func,
        "Vortex"
    )


def cleanup_files(*filenames):
    """清理文件"""
    for filename in filenames:
        if os.path.exists(filename):
            os.remove(filename)


def main():
    # 配置
    num_records = 20_000_000  # 2千万条记录
    formats = [
        ('csv', 'random_data.csv', test_csv),
        ('parquet', 'random_data.parquet', test_parquet),
        ('feather', 'random_data.feather', test_feather),
        ('hdf5', 'random_data.h5', test_hdf5),
    ]

    # 如果 vortex 可用，添加到测试列表
    if vx is not None and pa is not None:
        formats.append(('vortex', 'random_data.vortex', test_vortex))
    else:
        print("警告: vortex 或 pyarrow 未安装，跳过 Vortex 测试")

    print(f"开始生成 {num_records:,} 条随机数据...")
    df = generate_random_data(num_records=num_records)
    print("数据生成完成")

    results = []
    filenames = []

    try:
        for fmt_name, filename, test_func in formats:
            filenames.append(filename)
            result = test_func(df, filename)
            results.append((fmt_name, result))

        # 输出汇总信息
        print("\n" + "=" * 60)
        print("综合性能对比 (评分越低越好):")
        print("=" * 60)
        for fmt_name, (_, _, _, score) in sorted(results, key=lambda x: x[1][3]):
            print(f"{fmt_name.upper():10} 综合评分: {score:.2f}")

    finally:
        # 清理生成的文件
        cleanup_files(*filenames)
        print("\n临时文件已清理")


if __name__ == "__main__":
    main()

再用pip install安装feather-format，h5py, tables 和vortex-data，都没有报错。

但是调用时报错。

复制代码

python3 test_fmt2.py
警告: vortex 或 pyarrow 未安装，跳过 Vortex 测试
开始生成 20,000,000 条随机数据...
数据生成完成

CSV 测试结果:
  写入时间: 58.77 秒
  读取时间: 8.98 秒
  文件大小: 678.03 MB
  综合评分: 357682.29

Parquet 测试结果:
  写入时间: 3.16 秒
  读取时间: 0.71 秒
  文件大小: 258.49 MB
  综合评分: 582.74

Feather 测试结果:
  写入时间: 1.02 秒
  读取时间: 0.51 秒
  文件大小: 393.72 MB
  综合评分: 205.16

HDF5 测试结果:
  写入时间: 9.37 秒
  读取时间: 4.85 秒
  文件大小: 764.01 MB
  综合评分: 34740.01

============================================================
综合性能对比 (评分越低越好):
============================================================
FEATHER    综合评分: 205.16
PARQUET    综合评分: 582.74
HDF5       综合评分: 34740.01
CSV        综合评分: 357682.29

临时文件已清理

手工import也报错。

复制代码

python3
Python 3.14.3 (main, Feb 24 2026, 22:48:09) [GCC 14.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import vortex as vx
Traceback (most recent call last):
  File "/usr/local/lib/python3.14/site-packages/vortex/substrait.py", line 23, in <module>
    from substrait.extensions.extensions_pb2 import SimpleExtensionDeclaration, SimpleExtensionURI
ImportError: cannot import name 'SimpleExtensionURI' from 'substrait.extensions.extensions_pb2' (/usr/local/lib/python3.14/site-packages/substrait/extensions/extensions_pb2.py)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<python-input-0>", line 1, in <module>
    import vortex as vx
  File "/usr/local/lib/python3.14/site-packages/vortex/__init__.py", line 6, in <module>
    from . import _lib, arrays, dataset, expr, file, io, ray, registry, scan
  File "/usr/local/lib/python3.14/site-packages/vortex/dataset.py", line 18, in <module>
    from .arrow.expression import ensure_vortex_expression
  File "/usr/local/lib/python3.14/site-packages/vortex/arrow/expression.py", line 14, in <module>
    from ..substrait import extended_expression
  File "/usr/local/lib/python3.14/site-packages/vortex/substrait.py", line 27, in <module>
    from substrait.gen.proto.algebra_pb2 import Expression, FunctionArgument
ModuleNotFoundError: No module named 'substrait.gen'

看pypi上whl文件名是vortex_data-0.64.0-cp311-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl,

改用python 3.11的机器测试, 果然就可以了。

复制代码

Vortex 测试结果:
  写入时间: 2.21 秒
  读取时间: 0.26 秒
  文件大小: 148.04 MB
  综合评分: 84.42

============================================================
综合性能对比 (评分越低越好):
============================================================
VORTEX     综合评分: 84.42
PARQUET    综合评分: 1190.20
FEATHER    综合评分: 2050.87
HDF5       综合评分: 43331.76
CSV        综合评分: 146389.94