Python pandas: a detailed efficiency comparison of CSV, HDF5, and Parquet file handling

Without further ado, the conclusions first:

1. Python: CSV vs. HDF5

I have 100 small CSV files stored locally (stock trading data), about 30 MB in total.

Loop over and read all files: read_csvlist: 664.34 ms
Write the 100 DataFrames, one HDF5 key each: write_hdf5_by_keys: 2579.56 ms (30 MB of CSV -> 130 MB HDF5)
Read the 100 DataFrames back, one HDF5 key each: read_hdf5_by_keys: 1407.38 ms
Concatenate into one DataFrame and write it: write_onecsv_hdf5: 1127.10 ms (30 MB of CSV -> 26 MB HDF5)
Read the single DataFrame back: read_one_df_hdf5: 786.55 ms
Concatenate into one DataFrame, write via DataFrame.to_hdf: write_onecsv_hdf5_pandas: 1137.50 ms

Conclusion: the more keys an HDF5 file contains, the worse the compression; with 100 keys the file ends up about 4x larger than the source CSVs. With a single key it actually compresses, by roughly 10% (30 MB -> 26 MB).
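The size figures above are simply read off the output files. A minimal sketch of how to check them yourself (file names as in the scripts below; os.path.getsize returns bytes):

import os

def size_mb(path):
    # file size in megabytes
    return os.path.getsize(path) / (1024 * 1024)

csv_total = sum(size_mb(os.path.join('data_csv', f)) for f in os.listdir('data_csv'))
print(f"source CSVs  : {csv_total:.1f} MB")
print(f"per-key HDF5 : {size_mb('pd_HDF.h5'):.1f} MB")      # roughly 4x larger here
print(f"single-key   : {size_mb('pd_HDF_one.h5'):.1f} MB")  # roughly 10% smaller here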

2. Python: CSV vs. Parquet

Same dataset: 100 small CSV files stored locally (stock trading data), about 30 MB in total.

df memory usage: 45.0+ MB
Loop over and read all files: read_csvlist: 780.23 ms
Concatenate into one DataFrame, write to Parquet: write_parquet: 1120.26 ms (30 MB of CSV -> 10 MB Parquet)
Read back from Parquet: read_parquet: 131.98 ms
Concatenate into one DataFrame, write to CSV with compression: write_to_onecsv_zip: 8871.84 ms (30 MB of CSV -> 11.5 MB)
Read the compressed CSV back: read_onecsv_zip: 535.38 ms
Concatenate into one DataFrame, write to CSV without compression: write_to_onecsv: 2439.60 ms
Read the uncompressed CSV back: read_onecsv: 432.28 ms

Conclusion: Parquet writes at roughly the same speed as single-key HDF5, and reads are genuinely fast; the ~70% compression (30 MB -> 10 MB) is impressive. Writing CSV with compression enabled is genuinely slow.
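The benchmark only exercises the gzip codec. As an untimed sketch, pandas (with the pyarrow engine) also accepts other codecs such as snappy or zstd, which usually trade a little compression ratio for faster writes; switching codec is a one-argument change. The output file names here are made up for illustration:

import pandas as pd

df = pd.read_parquet('data_gzip.parquet')  # reuse the file the script below produces
df.to_parquet('data_snappy.parquet', compression='snappy', index=False)  # pyarrow's default codec
df.to_parquet('data_zstd.parquet', compression='zstd', index=False)      # needs a pyarrow build with zstd support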

The full source is below; run it yourself.
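Both scripts expect a data_csv/ directory of CSV files. If you don't have similar stock data on hand, a hypothetical generator like the one below (column names and row counts are made up, not the original dataset) produces 100 small files so the benchmarks run end to end:

import os
import numpy as np
import pandas as pd

os.makedirs('data_csv', exist_ok=True)
rng = np.random.default_rng(0)
for i in range(100):
    n = 5000  # rows per file; tune this to approximate ~30 MB in total
    df = pd.DataFrame({
        'datetime': pd.date_range('2024-01-02 09:30', periods=n, freq='s'),
        'price': rng.normal(10, 1, n).round(2),
        'volume': rng.integers(100, 10000, n),
    })
    df.to_csv(f'data_csv/stock_{i:03d}.csv', index=False)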

Script 1: CSV vs. HDF5 (Python)
import os
import time

import pandas as pd


def get_filelist():
    # walk data_csv/ and collect file names (not used by the benchmark, kept for reference)
    file_names = []
    for root, dirs, files in os.walk('data_csv'):
        for file in files:
            file_names.append(file)
    print(file_names)
    return file_names

def get_files_in_current_directory():
    # list everything in data_csv/
    entries = os.listdir('data_csv')
    # keep only files (skip directories)
    files = [entry for entry in entries if os.path.isfile(os.path.join('data_csv', entry))]
    return files

def read_csvlist():
    file_list = get_files_in_current_directory()
    file_dflist = []
    start_time = time.time()
    for f in file_list:
        # read each CSV into its own DataFrame
        df = pd.read_csv('data_csv/' + f)
        file_dflist.append(df)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_csvlist: {elapsed_time_ms:.2f} ms")
    return file_dflist


def write_hdf5_by_keys(listdf):
    file_list = get_files_in_current_directory()
    # open (or create) an HDF5 file with zlib compression
    with pd.HDFStore('pd_HDF.h5', complevel=9, complib='zlib') as store:
        # each DataFrame is stored under 'gp_daily/<filename>';
        # the group 'gp_daily' is created automatically if it does not exist
        start_time = time.time()
        for f, df in zip(file_list, listdf):
            # drop the '.' so the file name becomes a clean HDF5 node name
            key = f.replace('.', '')
            store.put('gp_daily/' + key, df)

        end_time = time.time()
        elapsed_time_ms = (end_time - start_time) * 1000
        print(f"write_hdf5_by_keys: {elapsed_time_ms:.2f} ms")


def read_hdf5_by_keys():
    # read each DataFrame back from its own key
    with pd.HDFStore('pd_HDF.h5') as store:
        file_list = get_files_in_current_directory()
        start_time = time.time()
        file_dflist = []
        for f in file_list:
            key = f.replace('.', '')
            file_dflist.append(store.get('gp_daily/' + key))

        end_time = time.time()
        elapsed_time_ms = (end_time - start_time) * 1000
        print(f"read_hdf5_by_keys: {elapsed_time_ms:.2f} ms")
        return file_dflist




def write_onecsv_hdf5(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # open (or create) an HDF5 file and store everything under a single key
    with pd.HDFStore('pd_HDF_one.h5', mode='w', complevel=5, complib='zlib') as store:
        store.put('onecsv_hdf', df)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_onecsv_hdf5: {elapsed_time_ms:.2f} ms")


def read_one_df_hdf5():
    start_time = time.time()
    # read the single-key store back into one DataFrame
    with pd.HDFStore('pd_HDF_one.h5', mode='r') as store:
        df = store.get('onecsv_hdf')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_one_df_hdf5: {elapsed_time_ms:.2f} ms")
    return df



def write_onecsv_hdf5_pandas(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # same single-key write, but through the DataFrame.to_hdf interface
    # (the '-' in the key only disables natural-naming access; storage still works)
    df.to_hdf('pd_HDF_one_pandas.h5', key='test-csv', complevel=5, mode='w', complib='zlib')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_onecsv_hdf5_pandas: {elapsed_time_ms:.2f} ms")



if __name__ == '__main__':
    print( get_files_in_current_directory() )
    listdf = read_csvlist()
    write_hdf5_by_keys(listdf)
    read_hdf5_by_keys()
    write_onecsv_hdf5(listdf)
    read_one_df_hdf5()

    write_onecsv_hdf5_pandas(listdf)
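If you want to double-check what ended up in the per-key store, a quick inspection sketch using the pd.HDFStore API (file name as in the script above):

import pandas as pd

with pd.HDFStore('pd_HDF.h5', mode='r') as store:
    print(len(store.keys()))   # should be 100, one key per source CSV
    print(store.keys()[:5])    # e.g. ['/gp_daily/...', ...]
    print(store.info())        # per-node summary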
Script 2: CSV vs. Parquet (Python)
import os
import time
from time import sleep

import pandas as pd


def get_files_in_current_directory():
    # list everything in data_csv/
    entries = os.listdir('data_csv')
    # keep only files (skip directories)
    files = [entry for entry in entries if os.path.isfile(os.path.join('data_csv', entry))]
    return files

def read_csvlist():
    file_list = get_files_in_current_directory()
    file_dflist = []
    start_time = time.time()
    for f in file_list:
        # read each CSV into its own DataFrame
        df = pd.read_csv('data_csv/' + f)
        file_dflist.append(df)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_csvlist: {elapsed_time_ms:.2f} ms")
    return file_dflist


def write_to_onecsv(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # write a single uncompressed CSV, without the index
    df.to_csv(path_or_buf='concat_one.csv', index=False)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_to_onecsv: {elapsed_time_ms:.2f} ms")
    sleep(1)

def read_onecsv():
    start_time = time.time()
    # read the uncompressed CSV back
    df = pd.read_csv(filepath_or_buffer='concat_one.csv')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_onecsv: {elapsed_time_ms:.2f} ms")
    sleep(1)


def write_to_onecsv_zip(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # write a single CSV with gzip compression, without the index
    df.to_csv(path_or_buf='concat_one_gzip.csv', index=False, compression='gzip')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_to_onecsv_zip: {elapsed_time_ms:.2f} ms")
    sleep(1)

def read_onecsv_zip():
    start_time = time.time()
    # read the gzip-compressed CSV back
    df = pd.read_csv(filepath_or_buffer='concat_one_gzip.csv', compression='gzip')
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_onecsv_zip: {elapsed_time_ms:.2f} ms")
    sleep(1)


def write_parquet(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # write Parquet with the gzip codec, without the index
    # impressive: 30 MB of CSV compresses to about 10 MB, a solid CSV replacement
    df.to_parquet('data_gzip.parquet', compression='gzip', index=False)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_parquet: {elapsed_time_ms:.2f} ms")
    sleep(1)



def read_parquet():
    start_time = time.time()
    # read the Parquet file back into one DataFrame
    df = pd.read_parquet('data_gzip.parquet')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_parquet: {elapsed_time_ms:.2f} ms")

    print(df.info())
    sleep(1)


if __name__ == '__main__':
    dflist1 = read_csvlist()
    write_parquet(dflist1)
    read_parquet()

    write_to_onecsv_zip(dflist1)
    read_onecsv_zip()

    write_to_onecsv(dflist1)
    read_onecsv()
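To see where the ~70% reduction comes from, you can also inspect the Parquet file's metadata; a minimal sketch, assuming pyarrow is the installed engine and using the file name from the script above:

import os
import pyarrow.parquet as pq

meta = pq.ParquetFile('data_gzip.parquet').metadata
print(meta.num_rows, meta.num_row_groups, meta.num_columns)  # row/row-group/column counts
print(f"{os.path.getsize('data_gzip.parquet') / (1024 * 1024):.1f} MB on disk")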