DeepSeek辅助测试不同文件格式的读写性能和大小

网上看到有测试多种格式的脚本,想添加vortex的测试,于是把以下文档和原始代码上传。

python 复制代码
Install
pip install vortex-data
Convert
You can either use your own Parquet file or download the example file used here.

Use Arrow to read a Parquet file and then use array() to construct an uncompressed Vortex array:

import pyarrow.parquet as pq
import vortex as vx
parquet = pq.read_table("_static/example.parquet")
vtx = vx.array(parquet)
vtx.nbytes
141024
Write
Use write() to write the Vortex array to disk:

import vortex as vx
vx.io.write(vtx, "example.vortex")
Small Vortex files (this one is just 71KiB) currently have substantial overhead relative to their size. This will be addressed shortly. On files with at least tens of megabytes of data, Vortex is similar to or smaller than Parquet.

from os.path import getsize
getsize("example.vortex") / getsize("_static/example.parquet")
2.0...
Read
Use open() to open and read the Vortex array from disk:

import vortex as vx
cvtx = vx.open("example.vortex").scan().read_all()
Vortex is architected to achieve fast random access, in many cases hundreds of times faster than what can be achieved with Parquet.

If you have an external index that gives you specific rows to pull out of the Vortex file, you can skip a lot more IO and decoding and read just the data that is relevant to you:

import vortex as vx
vf = vx.open("example.vortex")
# row indices must be ordered and unique
indices = vx.array([1, 2, 10])
result = vf.scan(indices=indices).read_all()
assert len(result) == 3

优化附件test_fmt代码,尽量减少冗余,并添加vortex文件格式测试,调用方法见附件py-vortex

它优化后的脚本如下:

python 复制代码
import os
import time
import shutil
import pandas as pd
import numpy as np

# Optional dependencies: each import falls back to None when it fails, so the
# rest of the script can test for availability with `is not None`.
# `except ImportError` also covers ModuleNotFoundError (its subclass), so a
# package that is installed but fails while importing ends up as None too.
try:
    import pyarrow as pa
    import pyarrow.parquet as pq
except ImportError:
    pa = None
    pq = None

try:
    import vortex as vx
except ImportError:
    vx = None

try:
    import h5py
except ImportError:
    h5py = None

try:
    import feather
except ImportError:
    feather = None

# Seed NumPy's global random state so the generated data is reproducible
np.random.seed(42)


def generate_random_data(num_records=1000000):
    """Build a DataFrame of synthetic records.

    Columns:
        id: consecutive integers starting at 0.
        name: one of 'Alice', 'Bob', 'Charlie', picked at random.
        age: random integer in [20, 80).
        salary: random float drawn uniformly from [20000, 100000).

    Values are drawn from NumPy's global random state in the order
    name, age, salary.
    """
    ids = np.arange(num_records)
    names = np.random.choice(['Alice', 'Bob', 'Charlie'], size=num_records)
    ages = np.random.randint(20, 80, size=num_records)
    salaries = np.random.uniform(20000, 100000, size=num_records)

    columns = {'id': ids, 'name': names, 'age': ages, 'salary': salaries}
    return pd.DataFrame(columns)


def benchmark_format(df, filename, write_func, read_func, format_name):
    """Time one write and one read for a file format and print the results.

    Args:
        df: Data handed to ``write_func``.
        filename: Path that ``write_func`` writes and ``read_func`` reads.
        write_func: Callable invoked as ``write_func(df, filename)``.
        read_func: Callable invoked as ``read_func(filename)``; its return
            value is discarded.
        format_name: Label used in the printed report.

    Returns:
        Tuple ``(write_time, read_time, file_size, score)``: the two elapsed
        times in seconds, the file size in bytes, and the combined score
        ``write_time * read_time * size_in_MB`` (lower is better).
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # intervals are not affected by system clock adjustments.
    started = time.perf_counter()
    write_func(df, filename)
    write_time = time.perf_counter() - started

    file_size = os.path.getsize(filename)
    size_mb = file_size / (1024 * 1024)

    started = time.perf_counter()
    read_func(filename)
    read_time = time.perf_counter() - started

    # Combined score: write time * read time * file size (MB)
    score = write_time * read_time * size_mb

    print(f"\n{format_name} 测试结果:")
    print(f"  写入时间: {write_time:.2f} 秒")
    print(f"  读取时间: {read_time:.2f} 秒")
    print(f"  文件大小: {size_mb:.2f} MB")
    print(f"  综合评分: {score:.2f}")

    return write_time, read_time, file_size, score


def test_csv(df, filename):
    """Benchmark CSV: write without the index column, read back with pandas."""
    def write_csv(frame, path):
        return frame.to_csv(path, index=False)

    def read_csv(path):
        return pd.read_csv(path)

    return benchmark_format(df, filename, write_csv, read_csv, "CSV")


def test_parquet(df, filename):
    """Benchmark Parquet using the pyarrow engine with SNAPPY compression."""
    def write_parquet(frame, path):
        return frame.to_parquet(path, engine='pyarrow', compression='SNAPPY')

    def read_parquet(path):
        return pd.read_parquet(path, engine='pyarrow')

    return benchmark_format(df, filename, write_parquet, read_parquet, "Parquet")


def test_feather(df, filename):
    """Benchmark Feather via pandas' to_feather / read_feather."""
    def write_feather(frame, path):
        return frame.to_feather(path)

    def read_feather(path):
        return pd.read_feather(path)

    return benchmark_format(df, filename, write_feather, read_feather, "Feather")


def test_hdf5(df, filename):
    """Benchmark HDF5: store the frame under key 'data' in a freshly written file."""
    def write_hdf(frame, path):
        return frame.to_hdf(path, key='data', mode='w')

    def read_hdf(path):
        return pd.read_hdf(path, key='data')

    return benchmark_format(df, filename, write_hdf, read_hdf, "HDF5")


def test_vortex(df, filename):
    """Benchmark the Vortex format.

    The DataFrame -> Arrow Table -> Vortex array conversion is performed
    inside the timed write step, so the write time covers the same work
    (starting from a pandas DataFrame) as the other format tests.

    Args:
        df: pandas DataFrame to write.
        filename: Path of the Vortex file to create and read back.

    Returns:
        The ``(write_time, read_time, file_size, score)`` tuple produced by
        ``benchmark_format``.

    Raises:
        ImportError: If ``vortex`` or ``pyarrow`` could not be imported.
    """
    if vx is None or pa is None:
        raise ImportError("test_vortex requires both vortex and pyarrow")

    def write_func(d, f):
        # Convert the frame that was actually passed in, within the timed region.
        table = pa.Table.from_pandas(d)
        vx.io.write(vx.array(table), f)

    def read_func(f):
        # NOTE: the result stays a Vortex array; it is not converted back to
        # a pandas DataFrame, unlike the pandas-based readers.
        return vx.open(f).scan().read_all()

    return benchmark_format(
        df, filename,
        write_func,
        read_func,
        "Vortex"
    )


def cleanup_files(*filenames):
    """Delete the given files, silently ignoring any that do not exist.

    Args:
        *filenames: Paths of files to remove.

    Raises:
        OSError: For removal failures other than a missing file
            (for example, a path that is a directory).
    """
    for filename in filenames:
        # Attempt the removal directly instead of checking existence first:
        # this avoids a race between the check and the delete.
        try:
            os.remove(filename)
        except FileNotFoundError:
            pass


def main():
    """Run every available format benchmark on one dataset and print a ranking.

    Generates the random DataFrame once, runs each registered format test
    against it, prints the formats sorted by combined score (lower is
    better), and always removes the files that were created.

    Formats whose required library failed to import are skipped with a
    warning instead of aborting the whole run.
    """
    # Configuration
    num_records = 20_000_000  # 20 million records
    formats = [('csv', 'random_data.csv', test_csv)]

    # Parquet (engine='pyarrow') and Feather are both written through pyarrow,
    # so gate them on its availability the same way the Vortex test is gated.
    if pa is not None:
        formats.append(('parquet', 'random_data.parquet', test_parquet))
        formats.append(('feather', 'random_data.feather', test_feather))
    else:
        print("警告: pyarrow 未安装,跳过 Parquet 和 Feather 测试")

    formats.append(('hdf5', 'random_data.h5', test_hdf5))

    # Add Vortex to the test list only when it is usable
    if vx is not None and pa is not None:
        formats.append(('vortex', 'random_data.vortex', test_vortex))
    else:
        print("警告: vortex 或 pyarrow 未安装,跳过 Vortex 测试")

    print(f"开始生成 {num_records:,} 条随机数据...")
    df = generate_random_data(num_records=num_records)
    print("数据生成完成")

    results = []
    filenames = []

    try:
        for fmt_name, filename, test_func in formats:
            # Register the file before running the test so that it is
            # cleaned up even if the test fails part-way through.
            filenames.append(filename)
            result = test_func(df, filename)
            results.append((fmt_name, result))

        # Print the summary, best (lowest) score first
        print("\n" + "=" * 60)
        print("综合性能对比 (评分越低越好):")
        print("=" * 60)
        for fmt_name, (_, _, _, score) in sorted(results, key=lambda x: x[1][3]):
            print(f"{fmt_name.upper():10} 综合评分: {score:.2f}")

    finally:
        # Remove the generated files
        cleanup_files(*filenames)
        print("\n临时文件已清理")

# Run the benchmark only when executed as a script, not when imported
if __name__ == "__main__":
    main()

再用 pip install 安装 feather-format、h5py、tables 和 vortex-data,都没有报错。

但是运行时 Vortex 测试被跳过了(脚本里 import vortex 失败)。

复制代码
python3 test_fmt2.py
警告: vortex 或 pyarrow 未安装,跳过 Vortex 测试
开始生成 20,000,000 条随机数据...
数据生成完成

CSV 测试结果:
  写入时间: 58.77 秒
  读取时间: 8.98 秒
  文件大小: 678.03 MB
  综合评分: 357682.29

Parquet 测试结果:
  写入时间: 3.16 秒
  读取时间: 0.71 秒
  文件大小: 258.49 MB
  综合评分: 582.74

Feather 测试结果:
  写入时间: 1.02 秒
  读取时间: 0.51 秒
  文件大小: 393.72 MB
  综合评分: 205.16

HDF5 测试结果:
  写入时间: 9.37 秒
  读取时间: 4.85 秒
  文件大小: 764.01 MB
  综合评分: 34740.01

============================================================
综合性能对比 (评分越低越好):
============================================================
FEATHER    综合评分: 205.16
PARQUET    综合评分: 582.74
HDF5       综合评分: 34740.01
CSV        综合评分: 357682.29

临时文件已清理

手工import也报错。

复制代码
python3
Python 3.14.3 (main, Feb 24 2026, 22:48:09) [GCC 14.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import vortex as vx
Traceback (most recent call last):
  File "/usr/local/lib/python3.14/site-packages/vortex/substrait.py", line 23, in <module>
    from substrait.extensions.extensions_pb2 import SimpleExtensionDeclaration, SimpleExtensionURI
ImportError: cannot import name 'SimpleExtensionURI' from 'substrait.extensions.extensions_pb2' (/usr/local/lib/python3.14/site-packages/substrait/extensions/extensions_pb2.py)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<python-input-0>", line 1, in <module>
    import vortex as vx
  File "/usr/local/lib/python3.14/site-packages/vortex/__init__.py", line 6, in <module>
    from . import _lib, arrays, dataset, expr, file, io, ray, registry, scan
  File "/usr/local/lib/python3.14/site-packages/vortex/dataset.py", line 18, in <module>
    from .arrow.expression import ensure_vortex_expression
  File "/usr/local/lib/python3.14/site-packages/vortex/arrow/expression.py", line 14, in <module>
    from ..substrait import extended_expression
  File "/usr/local/lib/python3.14/site-packages/vortex/substrait.py", line 27, in <module>
    from substrait.gen.proto.algebra_pb2 import Expression, FunctionArgument
ModuleNotFoundError: No module named 'substrait.gen'

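看 PyPI 上的 whl 文件名是 vortex_data-0.64.0-cp311-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl。

改用 Python 3.11 的机器测试,果然就可以了。(注:cp311-abi3 表示该 wheel 面向 CPython 3.11 及以上版本的稳定 ABI;上面的报错出在 substrait 包的导入上,所以问题也可能是 3.14 环境里安装的 substrait 版本与 vortex 不兼容,而不一定是 Python 版本本身。)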

复制代码
Vortex 测试结果:
  写入时间: 2.21 秒
  读取时间: 0.26 秒
  文件大小: 148.04 MB
  综合评分: 84.42

============================================================
综合性能对比 (评分越低越好):
============================================================
VORTEX     综合评分: 84.42
PARQUET    综合评分: 1190.20
FEATHER    综合评分: 2050.87
HDF5       综合评分: 43331.76
CSV        综合评分: 146389.94
相关推荐
eastyuxiao1 天前
思维导图拆解项目范围 3 个真实落地案例
大数据·运维·人工智能·流程图
风落无尘1 天前
《智能重生:从垃圾堆到AI工程师》——第五章 代码与灵魂
服务器·网络·人工智能
冬奇Lab1 天前
RAG 系列(八):RAG 评估体系——用数据说话
人工智能·llm
landyjzlai1 天前
蓝迪哥玩转Ai(8)---端侧AI:RK3588 端侧大语言模型(LLM)开发实战指南
人工智能·python
S1998_1997111609•X1 天前
论当今社会主义与人文关怀人格思想下的恶意仿生注入污染蜜罐描述进行函数值非法侵入爬虫的咼忄乂癿〇仺⺋.
数据库·网络协议·百度·ssh·开闭原则
我叫黑大帅1 天前
如何通过 Python 实现招聘平台自动投递
后端·python·面试
其实防守也摸鱼1 天前
CTF密码学综合教学指南--第九章
开发语言·网络·python·安全·网络安全·密码学·ctf
ZhengEnCi1 天前
05-自注意力机制详解 🧠
人工智能·pytorch·深度学习