Python pandas: a detailed efficiency comparison of CSV, HDF5, and Parquet files

No preamble; conclusions first:

  1. Python: comparing CSV and HDF5

I have 100 small CSV files stored locally (stock trading data), about 30 MB in total.

read all 100 CSVs in a loop: read_csvlist: 664.34 ms
write 100 DataFrames, one key each: write_hdf5_by_keys: 2579.56 ms  (30 MB of CSV -> 130 MB HDF5)
read 100 DataFrames back by key: read_hdf5_by_keys: 1407.38 ms
concatenate into one DataFrame, write: write_onecsv_hdf5: 1127.10 ms  (30 MB of CSV -> 26 MB HDF5)
read the single DataFrame back: read_one_df_hdf5: 786.55 ms
single-DataFrame write via DataFrame.to_hdf: write_onecsv_hdf5_pandas: 1137.50 ms

Conclusion: the more keys an HDF5 file holds, the worse the compression. With 100 keys the output grew to roughly 4x the input (30 MB -> 130 MB); with a single key it shrank by about 10% (30 MB -> 26 MB).
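If you want to double-check those ratios, a quick sketch that prints the on-disk sizes after running the first script below (file names match that script):

import os

# Print the on-disk size of the per-key and the single-key HDF5 files.
for path in ('pd_HDF.h5', 'pd_HDF_one.h5'):
    print(f'{path}: {os.path.getsize(path) / 1024 / 1024:.1f} MB')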

  2. Python: comparing CSV and Parquet

Same setup: 100 small CSV files of stock trading data, about 30 MB in total.

df memory usage: 45.0+ MB
read all 100 CSVs in a loop: read_csvlist: 780.23 ms
concatenate into one DataFrame, write Parquet: write_parquet: 1120.26 ms  (30 MB CSV -> 10 MB Parquet)
read back from Parquet: read_parquet: 131.98 ms
concatenate, write one gzip-compressed CSV: write_to_onecsv_zip: 8871.84 ms  (30 MB CSV -> 11.5 MB)
read the compressed CSV back: read_onecsv_zip: 535.38 ms
concatenate, write one uncompressed CSV: write_to_onecsv: 2439.60 ms
read the uncompressed CSV back: read_onecsv: 432.28 ms

Conclusion: Parquet writes are slightly slower than HDF5, but reads are genuinely fast. The ~70% compression (30 MB -> 10 MB) is impressive. Writing a compressed CSV is genuinely slow.
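Gzip is only one choice of codec; pandas passes other codecs straight through to pyarrow, and it can be worth trading a few points of compression for faster writes. A minimal sketch (it assumes data_gzip.parquet from the second script below already exists; snappy/zstd support depends on your pyarrow build):

import os
import pandas as pd

# Rewrite the same data with different codecs and compare on-disk sizes.
df = pd.read_parquet('data_gzip.parquet')
for codec in ('snappy', 'gzip', 'zstd'):
    out = f'data_{codec}.parquet'
    df.to_parquet(out, compression=codec, index=False)
    print(f'{codec}: {os.path.getsize(out) / 1024 / 1024:.1f} MB')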

The full source is below; run it yourself.
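The data_csv directory itself isn't included, so here is a sketch that generates a comparable synthetic corpus first (column names, row counts, and value ranges are my guesses, not the original data):

import os
import numpy as np
import pandas as pd

# Write 100 small CSVs of made-up per-minute "stock" rows into data_csv/.
os.makedirs('data_csv', exist_ok=True)
rng = np.random.default_rng(0)
for i in range(100):
    df = pd.DataFrame({
        'date': pd.date_range('2024-01-01', periods=5000, freq='min'),
        'price': rng.random(5000) * 100,
        'volume': rng.integers(1, 10_000, 5000),
    })
    df.to_csv(f'data_csv/stock_{i:03d}.csv', index=False)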

First script (CSV vs. HDF5):

import os
import time

import pandas as pd


def get_filelist():
    # Walk data_csv recursively and collect every file name.
    file_names = []
    for root, dirs, files in os.walk('data_csv'):
        for file in files:
            file_names.append(file)
    return file_names

def get_files_in_current_directory():
    # List everything under data_csv and keep only the files (skip directories).
    entries = os.listdir('data_csv')
    files = [entry for entry in entries if os.path.isfile(os.path.join('data_csv', entry))]
    return files

def read_csvlist():
    # Read every CSV in data_csv into its own DataFrame and time the loop.
    file_list = get_files_in_current_directory()
    file_dflist = []
    start_time = time.time()
    for f in file_list:
        df = pd.read_csv('data_csv/' + f)
        file_dflist.append(df)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_csvlist: {elapsed_time_ms:.2f} ms")
    return file_dflist


def write_hdf5_by_keys(listdf):
    file_list = get_files_in_current_directory()
    # Open (or create) an HDF5 file with zlib compression.
    with pd.HDFStore('pd_HDF.h5', complevel=9, complib='zlib') as store:
        # Store each DataFrame under its own key inside the 'gp_daily' group;
        # the group is created automatically if it does not exist.
        start_time = time.time()
        for f, df in zip(file_list, listdf):
            key = f.replace('.', '')  # strip '.' so the name is a valid HDF5 node name
            store.put('gp_daily/' + key, df)

        end_time = time.time()
        elapsed_time_ms = (end_time - start_time) * 1000
        print(f"write_hdf5_by_keys: {elapsed_time_ms:.2f} ms")


def read_hdf5_by_keys():
    # Read each DataFrame back from its own key.
    with pd.HDFStore('pd_HDF.h5') as store:
        file_list = get_files_in_current_directory()
        start_time = time.time()
        file_dflist = []
        for f in file_list:
            key = f.replace('.', '')
            file_dflist.append(store.get('gp_daily/' + key))

        end_time = time.time()
        elapsed_time_ms = (end_time - start_time) * 1000
        print(f"read_hdf5_by_keys: {elapsed_time_ms:.2f} ms")
        return file_dflist




def write_onecsv_hdf5(listdf):
    # Concatenate everything into one DataFrame and write it under a single key.
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    with pd.HDFStore('pd_HDF_one.h5', mode='w', complevel=5, complib='zlib') as store:
        store.put('onecsv_hdf', df)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_onecsv_hdf5: {elapsed_time_ms:.2f} ms")


def read_one_df_hdf5():
    start_time = time.time()
    # Read the single DataFrame back from its key.
    with pd.HDFStore('pd_HDF_one.h5', mode='r') as store:
        df = store.get('onecsv_hdf')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_one_df_hdf5: {elapsed_time_ms:.2f} ms")



def write_onecsv_hdf5_pandas(listdf):
    # Same single-key write, but through DataFrame.to_hdf instead of HDFStore.
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # Note: a key containing '-' triggers a NaturalNameWarning from PyTables.
    df.to_hdf('pd_HDF_one_pandas.h5', key='test-csv', complevel=5, mode='w', complib='zlib')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_onecsv_hdf5_pandas: {elapsed_time_ms:.2f} ms")



if __name__ == '__main__':
    print( get_files_in_current_directory() )
    listdf = read_csvlist()
    write_hdf5_by_keys(listdf)
    read_hdf5_by_keys()
    write_onecsv_hdf5(listdf)
    read_one_df_hdf5()

    write_onecsv_hdf5_pandas(listdf)
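As an aside, every benchmark above repeats the same start/stop/print bookkeeping; a small decorator (my refactor, not part of the original scripts) would keep the timing logic in one place:

import time
from functools import wraps

import pandas as pd

def timed(func):
    # Print each call's wall-clock duration in milliseconds.
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        print(f'{func.__name__}: {(time.time() - start) * 1000:.2f} ms')
        return result
    return wrapper

@timed
def read_one_df_hdf5():
    with pd.HDFStore('pd_HDF_one.h5', mode='r') as store:
        return store.get('onecsv_hdf')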
Second script (CSV vs. Parquet):

import os
import time
from time import sleep

import pandas as pd


def get_files_in_current_directory():
    # List everything under data_csv and keep only the files (skip directories).
    entries = os.listdir('data_csv')
    files = [entry for entry in entries if os.path.isfile(os.path.join('data_csv', entry))]
    return files

def read_csvlist():
    # Read every CSV in data_csv into its own DataFrame and time the loop.
    file_list = get_files_in_current_directory()
    file_dflist = []
    start_time = time.time()
    for f in file_list:
        df = pd.read_csv('data_csv/' + f)
        file_dflist.append(df)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_csvlist: {elapsed_time_ms:.2f} ms")
    return file_dflist


def write_to_onecsv(listdf):
    # Concatenate into one DataFrame and write a plain, uncompressed CSV.
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    df.to_csv(path_or_buf='concat_one.csv', index=False)

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_to_onecsv: {elapsed_time_ms:.2f} ms")
    sleep(1)  # pause between benchmarks

def read_onecsv():
    start_time = time.time()
    # Read the uncompressed CSV back.
    df = pd.read_csv(filepath_or_buffer='concat_one.csv')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_onecsv: {elapsed_time_ms:.2f} ms")
    sleep(1)


def write_to_onecsv_zip(listdf):
    # Concatenate into one DataFrame and write a gzip-compressed CSV.
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    df.to_csv(path_or_buf='concat_one_gzip.csv', index=False, compression='gzip')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_to_onecsv_zip: {elapsed_time_ms:.2f} ms")
    sleep(1)

def read_onecsv_zip():
    start_time = time.time()
    # Read the gzip-compressed CSV back.
    df = pd.read_csv(filepath_or_buffer='concat_one_gzip.csv', compression='gzip')
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_onecsv_zip: {elapsed_time_ms:.2f} ms")
    sleep(1)


def write_parquet(listdf):
    # Concatenate into one DataFrame and write it to Parquet,
    # using the pyarrow engine with gzip compression and no index.
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    df.to_parquet('data_gzip.parquet', compression='gzip', index=False)  # impressive: 30 MB shrank to 10 MB, a real CSV replacement

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_parquet: {elapsed_time_ms:.2f} ms")
    sleep(1)



def read_parquet():
    # Read the whole Parquet file back into one DataFrame.
    start_time = time.time()
    df = pd.read_parquet('data_gzip.parquet')

    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_parquet: {elapsed_time_ms:.2f} ms")

    print(df.info())
    sleep(1)


if __name__ == '__main__':
    dflist1 = read_csvlist()
    write_parquet(dflist1)
    read_parquet()

    write_to_onecsv_zip(dflist1)
    read_onecsv_zip()

    write_to_onecsv(dflist1)
    read_onecsv()
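Part of why read_parquet is so fast is the columnar layout: you can read a subset of columns without touching the rest of the file. A sketch (column names follow the synthetic generator above; substitute your own):

import pandas as pd

# Read only two columns from the Parquet file written by write_parquet().
df = pd.read_parquet('data_gzip.parquet', columns=['date', 'price'])
print(df.info())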