Python pandas: a detailed efficiency comparison of CSV, HDF5, and Parquet files

Let's skip the preamble and start with the conclusions:

1. Python: comparing CSV with HDF5

I have 100 small CSV files stored locally (stock trading data), about 30 MB in total.

```
Read all files in a loop:                    read_csvlist: 664.34 ms
Write 100 DataFrames, one key each:          write_hdf5_by_keys: 2579.56 ms    (30 MB CSV -> 130 MB HDF5)
Read 100 DataFrames, one key each:           read_hdf5_by_keys: 1407.38 ms
Merge 100 DataFrames into one, write:        write_onecsv_hdf5: 1127.10 ms     (30 MB CSV -> 26 MB HDF5)
Merge 100 DataFrames into one, read:         read_one_df_hdf5: 786.55 ms
Same write via the pandas to_hdf interface:  write_onecsv_hdf5_pandas: 1137.50 ms
```

Conclusion: the more keys an HDF5 store has, the worse it compresses. With one key per file, the store grew to roughly 4x the original size (30 MB -> 130 MB); with a single key, it compressed normally, saving about 10% (30 MB -> 26 MB).
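
To see the per-key overhead directly, here is a minimal sketch (assuming the two HDF5 files produced by the scripts below exist) that lists each store's keys and its size on disk:

```python
import os
import pandas as pd

# A minimal sketch, assuming pd_HDF.h5 (100 keys) and pd_HDF_one.h5 (one key)
# were produced by the benchmark scripts below.
for path in ('pd_HDF.h5', 'pd_HDF_one.h5'):
    with pd.HDFStore(path, mode='r') as store:
        keys = store.keys()
    size_mb = os.path.getsize(path) / 1024 / 1024
    print(f'{path}: {len(keys)} keys, {size_mb:.1f} MB')
```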

2. Python: comparing CSV with Parquet

Same setup: 100 small CSV files stored locally (stock trading data), about 30 MB in total.

```
df memory usage: 45.0+ MB
Read all files in a loop:                    read_csvlist: 780.23 ms
Merge into one df, write Parquet:            write_parquet: 1120.26 ms     (30 MB CSV -> 10 MB parquet)
Read back from Parquet:                      read_parquet: 131.98 ms
Merge into one df, write gzip CSV:           write_to_onecsv_zip: 8871.84 ms    (30 MB CSV -> 11.5 MB)
Read back from gzip CSV:                     read_onecsv_zip: 535.38 ms
Merge into one df, write plain CSV:          write_to_onecsv: 2439.60 ms
Read back from plain CSV:                    read_onecsv: 432.28 ms
```

Conclusion: Parquet writes are slightly slower than HDF5 writes, but reads are genuinely fast, and a roughly 70% size reduction (30 MB -> 10 MB) is impressive. Writing CSV with compression enabled is genuinely slow.
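
One caveat on the write numbers: the script uses compression='gzip', while pandas' default Parquet codec is snappy, which typically writes faster at a somewhat lower compression ratio. A quick sketch to compare the two (the demo file names are made up, and pyarrow is assumed to be installed):

```python
import os

import numpy as np
import pandas as pd

# A sketch on synthetic data; benchmark both codecs on your own files.
df = pd.DataFrame(np.random.randn(100_000, 4), columns=list('abcd'))
df.to_parquet('demo_snappy.parquet', compression='snappy', index=False)
df.to_parquet('demo_gzip.parquet', compression='gzip', index=False)
for p in ('demo_snappy.parquet', 'demo_gzip.parquet'):
    print(f'{p}: {os.path.getsize(p) / 1024:.0f} KB')
```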

The source is below; run it yourself. First, the CSV vs. HDF5 script:

```python
import os
import time

import pandas as pd


def get_filelist():
    # os.walk alternative: collect file names recursively under data_csv
    file_names = []
    for root, dirs, files in os.walk('data_csv'):
        for file in files:
            file_names.append(file)
    print(file_names)

def get_files_in_current_directory():
    # List everything under data_csv, then keep only files (skip directories)
    entries = os.listdir('data_csv')
    files = [entry for entry in entries if os.path.isfile(os.path.join('data_csv', entry))]
    return files

def read_csvlist():
    # Read every CSV under data_csv into its own DataFrame
    file_list = get_files_in_current_directory()
    file_dflist = []
    start_time = time.time()
    for f in file_list:
        df = pd.read_csv('data_csv/' + f)
        file_dflist.append(df)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_csvlist: {elapsed_time_ms:.2f} ms")
    return file_dflist


def write_hdf5_by_keys(listdf):
    file_list = get_files_in_current_directory()
    # Open (or create) an HDF5 file with zlib compression
    with pd.HDFStore('pd_HDF.h5', complevel=9, complib='zlib') as store:
        # Store each DataFrame under 'gp_daily/<name>'; the 'gp_daily'
        # group is created automatically if it does not exist
        start_time = time.time()
        for idx, f in enumerate(file_list):
            # strip the '.' from the file name so it is a valid HDF5 key
            key = f.replace('.', '')
            store.put('gp_daily/' + key, listdf[idx])

        end_time = time.time()
        elapsed_time_ms = (end_time - start_time) * 1000
        print(f"write_hdf5_by_keys: {elapsed_time_ms:.2f} ms")


def read_hdf5_by_keys():
    # Read each DataFrame back from its own key
    with pd.HDFStore('pd_HDF.h5') as store:
        file_list = get_files_in_current_directory()
        start_time = time.time()
        file_dflist = []
        for f in file_list:
            key = f.replace('.', '')
            file_dflist.append(store.get('gp_daily/' + key))

        end_time = time.time()
        elapsed_time_ms = (end_time - start_time) * 1000
        print(f"read_hdf5_by_keys: {elapsed_time_ms:.2f} ms")
        return file_dflist




def write_onecsv_hdf5(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # Open (or create) an HDF5 file and store the merged DataFrame under one key
    with pd.HDFStore('pd_HDF_one.h5', mode='w', complevel=5, complib='zlib') as store:
        store.put('onecsv_hdf', df)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_onecsv_hdf5: {elapsed_time_ms:.2f} ms")


def read_one_df_hdf5():
    start_time = time.time()
    # Read the merged DataFrame back from its single key
    with pd.HDFStore('pd_HDF_one.h5', mode='r') as store:
        df = store.get('onecsv_hdf')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_one_df_hdf5: {elapsed_time_ms:.2f} ms")



def write_onecsv_hdf5_pandas(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # Same single-key write, but via the DataFrame.to_hdf convenience wrapper
    df.to_hdf('pd_HDF_one_pandas.h5', key='test-csv', complevel=5, mode='w', complib='zlib')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_onecsv_hdf5_pandas: {elapsed_time_ms:.2f} ms")



if __name__ == '__main__':
    print(get_files_in_current_directory())
    listdf = read_csvlist()
    write_hdf5_by_keys(listdf)
    read_hdf5_by_keys()
    write_onecsv_hdf5(listdf)
    read_one_df_hdf5()

    write_onecsv_hdf5_pandas(listdf)
```
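
A side note on the code: the start/stop timing boilerplate repeats in every function. A small decorator, sketched below (not part of the original benchmark), would keep it in one place; decorating read_csvlist with @timed would replace its manual time.time() bookkeeping:

```python
import time
from functools import wraps


def timed(func):
    # Print the wrapped function's wall-clock time in ms and pass its
    # return value through unchanged.
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        print(f"{func.__name__}: {(time.time() - start) * 1000:.2f} ms")
        return result
    return wrapper
```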
And the CSV vs. Parquet script:

```python
import os
import time
from time import sleep

import pandas as pd


def get_files_in_current_directory():
    # List everything under data_csv, then keep only files (skip directories)
    entries = os.listdir('data_csv')
    files = [entry for entry in entries if os.path.isfile(os.path.join('data_csv', entry))]
    return files

def read_csvlist():
    # Read every CSV under data_csv into its own DataFrame
    file_list = get_files_in_current_directory()
    file_dflist = []
    start_time = time.time()
    for f in file_list:
        df = pd.read_csv('data_csv/' + f)
        file_dflist.append(df)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_csvlist: {elapsed_time_ms:.2f} ms")
    return file_dflist


def write_to_onecsv(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # Write one uncompressed CSV, without the index
    df.to_csv(path_or_buf='concat_one.csv', index=False)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_to_onecsv: {elapsed_time_ms:.2f} ms")
    sleep(1)

def read_onecsv():
    start_time = time.time()
    # Read the uncompressed CSV back
    df = pd.read_csv(filepath_or_buffer='concat_one.csv')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_onecsv: {elapsed_time_ms:.2f} ms")
    sleep(1)


def write_to_onecsv_zip(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # Write one CSV with gzip compression, without the index
    df.to_csv(path_or_buf='concat_one_gzip.csv', index=False, compression='gzip')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_to_onecsv_zip: {elapsed_time_ms:.2f} ms")
    sleep(1)

def read_onecsv_zip():
    start_time = time.time()
    # Read the gzip-compressed CSV back
    df = pd.read_csv(filepath_or_buffer='concat_one_gzip.csv', compression='gzip')
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_onecsv_zip: {elapsed_time_ms:.2f} ms")
    sleep(1)


def write_parquet(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # Write Parquet with gzip compression and no index (pandas picks the
    # pyarrow engine if installed); 30 MB of CSV compressed to about 10 MB
    df.to_parquet('data_gzip.parquet', compression='gzip', index=False)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_parquet: {elapsed_time_ms:.2f} ms")
    sleep(1)



def read_parquet():
    start_time = time.time()
    # Read the whole Parquet file back into one DataFrame
    df = pd.read_parquet('data_gzip.parquet')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_parquet: {elapsed_time_ms:.2f} ms")

    print(df.info())
    sleep(1)


if __name__ == '__main__':
    dflist1 = read_csvlist()
    write_parquet(dflist1)
    read_parquet()

    write_to_onecsv_zip(dflist1)
    read_onecsv_zip()

    write_to_onecsv(dflist1)
    read_onecsv()
```
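
Part of why read_parquet is so fast is that the format is columnar, so you can load only the columns you need. A sketch below; 'date' and 'close' are hypothetical column names, since the stock files' schema isn't shown here:

```python
import pandas as pd

# 'date' and 'close' are hypothetical column names; substitute the columns
# your CSV files actually contain.
df_subset = pd.read_parquet('data_gzip.parquet', columns=['date', 'close'])
print(df_subset.info())
```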