python pandas 处理 csv 文件、hdf5 文件、parquet 文件效率详细对比

废话不多说,先放结论:

  1. python 处理csv 和hdf5对比

我本地存了100个小的csv文件(内容是股票交易数据),总30M

复制代码
遍历读出来全部:read_csvlist: 664.34 毫秒
100份df数据按照key写入、读出:write_hdf5_by_keys: 2579.56 毫秒  30M 写入后 130M hdf5
100份df数据按照key写入、读出:read_hdf5_by_keys: 1407.38 毫秒
100份df数据合并一个df写入、读出:write_onecsv_hdf5: 1127.10 毫秒  30Mcsv 写入后 26M hdf5
100份df数据合并一个df写入、读出:read_one_df_hdf5: 786.55 毫秒
100份df数据合并一个df写入、读出,用pandas接口:write_onecsv_hdf5_pandas: 1137.50 毫秒

结论: hdf5 key 越多,压缩效果越差——本例 100 个 key 时文件反而膨胀约 4 倍(30M → 130M);正常单个 key 时可压缩约 10%(30M → 26M)

  2. python 处理csv 和parquet对比

我本地存了100个小的csv文件(内容是股票交易数据),总30M

复制代码
df memory usage: 45.0+ MB
遍历读出来全部:read_csvlist: 780.23 毫秒
合并为一个df写入parquet:write_parquet: 1120.26 毫秒  30M csv --> 10M pq
从parquet读出:read_parquet: 131.98 毫秒
合并为一个df写入csv启用压缩:write_to_onecsv_zip: 8871.84 毫秒 30M csv --> 11.5M
从csv读出压缩数据:read_onecsv_zip: 535.38 毫秒
合并为一个df写入csv无压缩:write_to_onecsv: 2439.60 毫秒
从csv读出无压缩数据:read_onecsv: 432.28 毫秒

结论: parquet 写入稍慢于hdf5,读是真的快;压缩率70%很惊艳;csv启用压缩是真的慢;

源码放上来,你们自己跑吧

python 复制代码
def get_filelist(root='data_csv'):
    """Recursively collect file names under *root* and return them.

    Fixes: the original only printed the list and returned ``None``,
    making it unusable by callers; the root directory is now a
    parameter (default unchanged) instead of a hard-coded constant.

    Parameters
    ----------
    root : str
        Directory tree walked with ``os.walk`` (default ``'data_csv'``).

    Returns
    -------
    list[str]
        Bare file names (no directory components) found anywhere under *root*.
    """
    file_names = []
    for _dirpath, _dirnames, files in os.walk(root):
        file_names.extend(files)
    print(file_names)  # kept for compatibility with the original's output
    return file_names

def get_files_in_current_directory():
    """Return the names of regular files directly inside ``data_csv``.

    Sub-directories are excluded; only entries that pass
    ``os.path.isfile`` are kept.
    """
    base = 'data_csv'
    return [
        name
        for name in os.listdir(base)
        if os.path.isfile(os.path.join(base, name))
    ]

def read_csvlist():
    """Read every CSV under ``data_csv`` into a DataFrame, timing the loop.

    Fixes: removed the dead ``char_to_remove``/``new_string`` locals
    (computed but never used) and replaced Unix-only string path
    concatenation with ``os.path.join``.

    Returns
    -------
    list[pandas.DataFrame]
        One DataFrame per file, in directory-listing order.
    """
    file_list = get_files_in_current_directory()
    file_dflist = []
    start_time = time.time()
    for f in file_list:
        file_dflist.append(pd.read_csv(os.path.join('data_csv', f)))
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_csvlist: {elapsed_time_ms:.2f} 毫秒")
    return file_dflist


def write_hdf5_by_keys(listdf):
    """Store each DataFrame under its own HDF5 key and time the writes.

    Each key lives under the ``gp_daily`` group and is derived from the
    source file name with '.' stripped ('.' is not valid in an HDF5 node
    name). Fixes: removed the unused ``file_dflist`` local and replaced
    the manual ``idx`` counter with ``zip``.

    Parameters
    ----------
    listdf : list[pandas.DataFrame]
        DataFrames in the same order as ``get_files_in_current_directory()``.
    """
    file_list = get_files_in_current_directory()
    # complevel=9 zlib: maximum compression; per-key overhead still
    # dominates (see the article's 30M -> 130M result for 100 keys).
    with pd.HDFStore('pd_HDF.h5', complevel=9, complib='zlib') as store:
        start_time = time.time()
        for fname, df in zip(file_list, listdf):
            store.put('gp_daily/' + fname.replace('.', ''), df)
        end_time = time.time()
        elapsed_time_ms = (end_time - start_time) * 1000
        print(f"write_hdf5_by_keys: {elapsed_time_ms:.2f} 毫秒")


def read_hdf5_by_keys():
    """Read every per-file key back out of ``pd_HDF.h5``, timing the loop.

    Key naming must mirror ``write_hdf5_by_keys``: file name with '.'
    stripped, under the ``gp_daily`` group. Fixes: removed the unused
    ``idx`` counter.

    Returns
    -------
    list[pandas.DataFrame]
        DataFrames in directory-listing order.
    """
    with pd.HDFStore('pd_HDF.h5') as store:
        file_list = get_files_in_current_directory()
        start_time = time.time()
        file_dflist = []
        for f in file_list:
            file_dflist.append(store.get('gp_daily/' + f.replace('.', '')))
        end_time = time.time()
        elapsed_time_ms = (end_time - start_time) * 1000
        print(f"read_hdf5_by_keys: {elapsed_time_ms:.2f} 毫秒")
        return file_dflist




def write_onecsv_hdf5(listdf):
    """Concatenate *listdf* into one DataFrame and write it to a single
    zlib-compressed HDF5 key, printing the elapsed write time.

    The concat happens before the timer starts, so only the HDF5 write
    itself is measured.
    """
    merged = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    with pd.HDFStore('pd_HDF_one.h5', mode='w', complevel=5, complib='zlib') as store:
        store.put('onecsv_hdf', merged)
    elapsed_time_ms = (time.time() - start_time) * 1000
    print(f"write_onecsv_hdf5: {elapsed_time_ms:.2f} 毫秒")


def read_one_df_hdf5():
    """Read the single merged DataFrame back from ``pd_HDF_one.h5``.

    Fixes: the original read the DataFrame and discarded it; it is now
    returned so the helper is usable beyond pure timing (timing output
    is unchanged).

    Returns
    -------
    pandas.DataFrame
        The contents of the ``onecsv_hdf`` key.
    """
    start_time = time.time()
    with pd.HDFStore('pd_HDF_one.h5', mode='r') as store:
        df = store.get('onecsv_hdf')
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_one_df_hdf5: {elapsed_time_ms:.2f} 毫秒")
    return df



def write_onecsv_hdf5_pandas(listdf):
    """Same benchmark as ``write_onecsv_hdf5`` but through the
    ``DataFrame.to_hdf`` convenience API instead of an explicit
    ``HDFStore``; prints the elapsed write time."""
    combined = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    combined.to_hdf(
        'pd_HDF_one_pandas.h5',
        key='test-csv',
        mode='w',
        complevel=5,
        complib='zlib',
    )
    elapsed_time_ms = (time.time() - start_time) * 1000
    print(f"write_onecsv_hdf5_pandas: {elapsed_time_ms:.2f} 毫秒")



if __name__ == '__main__':
    # List the input files, then run each HDF5 benchmark in turn.
    print(get_files_in_current_directory())
    dataframes = read_csvlist()

    # One key per source file.
    write_hdf5_by_keys(dataframes)
    read_hdf5_by_keys()

    # Everything merged under a single key.
    write_onecsv_hdf5(dataframes)
    read_one_df_hdf5()

    write_onecsv_hdf5_pandas(dataframes)
python 复制代码
def get_files_in_current_directory():
    """List the regular files directly inside ``data_csv``,
    skipping any sub-directories."""
    folder = 'data_csv'
    entries = os.listdir(folder)
    files = []
    for entry in entries:
        if os.path.isfile(os.path.join(folder, entry)):
            files.append(entry)
    return files

def read_csvlist():
    """Read every CSV under ``data_csv`` into a DataFrame, timing the loop.

    Fixes: removed the dead ``char_to_remove``/``new_string`` locals
    (computed but never used) and replaced Unix-only string path
    concatenation with ``os.path.join``.

    Returns
    -------
    list[pandas.DataFrame]
        One DataFrame per file, in directory-listing order.
    """
    file_list = get_files_in_current_directory()
    file_dflist = []
    start_time = time.time()
    for f in file_list:
        file_dflist.append(pd.read_csv(os.path.join('data_csv', f)))
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_csvlist: {elapsed_time_ms:.2f} 毫秒")
    return file_dflist


def write_to_onecsv(listdf):
    """Merge *listdf* into one DataFrame and write it to an uncompressed
    CSV (``concat_one.csv``, no index column), printing the elapsed
    write time. Sleeps 1s afterwards to separate benchmark runs."""
    merged = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    merged.to_csv(path_or_buf='concat_one.csv', index=False)
    elapsed_time_ms = (time.time() - start_time) * 1000
    print(f"write_to_onecsv: {elapsed_time_ms:.2f} 毫秒")
    sleep(1)

def read_onecsv():
    """Read ``concat_one.csv`` (uncompressed), printing the elapsed time.

    Fixes: the original read the DataFrame and discarded it; it is now
    returned so the helper is usable beyond pure timing. The trailing
    1s sleep that separates benchmark runs is kept.

    Returns
    -------
    pandas.DataFrame
        The file contents.
    """
    start_time = time.time()
    df = pd.read_csv(filepath_or_buffer='concat_one.csv')
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_onecsv: {elapsed_time_ms:.2f} 毫秒")
    sleep(1)
    return df


def write_to_onecsv_zip(listdf):
    """Merge *listdf* into one DataFrame and write it as a
    gzip-compressed CSV (``concat_one_gzip.csv``, no index column),
    printing the elapsed write time. Sleeps 1s afterwards to separate
    benchmark runs."""
    merged = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    merged.to_csv(
        path_or_buf='concat_one_gzip.csv',
        index=False,
        compression='gzip',
    )
    elapsed_time_ms = (time.time() - start_time) * 1000
    print(f"write_to_onecsv_zip: {elapsed_time_ms:.2f} 毫秒")
    sleep(1)

def read_onecsv_zip():
    """Read the gzip-compressed ``concat_one_gzip.csv``, printing the
    elapsed time.

    Fixes: the original read the DataFrame and discarded it; it is now
    returned so the helper is usable beyond pure timing. The trailing
    1s sleep that separates benchmark runs is kept.

    Returns
    -------
    pandas.DataFrame
        The decompressed file contents.
    """
    start_time = time.time()
    df = pd.read_csv(filepath_or_buffer='concat_one_gzip.csv', compression='gzip')
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_onecsv_zip: {elapsed_time_ms:.2f} 毫秒")
    sleep(1)
    return df


def write_parquet(listdf):
    """Merge *listdf* into one DataFrame and write it as a
    gzip-compressed parquet file (``data_gzip.parquet``, no index),
    printing the elapsed write time.

    Fixes: removed an unused ``get_files_in_current_directory()`` call
    and the stale "HDF5 file" comment copied from the hdf5 benchmark.
    Sleeps 1s afterwards to separate benchmark runs.

    Parameters
    ----------
    listdf : list[pandas.DataFrame]
        DataFrames to concatenate and persist.
    """
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # Author's benchmark: ~30M of csv compresses to ~10M of parquet.
    df.to_parquet('data_gzip.parquet', compression='gzip', index=False)
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_parquet: {elapsed_time_ms:.2f} 毫秒")
    sleep(1)



def read_parquet():
    """Read ``data_gzip.parquet`` back, printing the elapsed time and
    the DataFrame's ``info()`` summary.

    Fixes: the original read the DataFrame and discarded it; it is now
    returned so the helper is usable beyond pure timing. The trailing
    1s sleep that separates benchmark runs is kept.

    Returns
    -------
    pandas.DataFrame
        The parquet file contents.
    """
    start_time = time.time()
    df = pd.read_parquet('data_gzip.parquet')
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_parquet: {elapsed_time_ms:.2f} 毫秒")
    print(df.info())
    sleep(1)
    return df


if __name__ == '__main__':
    # Benchmark parquet vs gzip-csv vs plain csv on the same merged data.
    frames = read_csvlist()

    write_parquet(frames)
    read_parquet()

    write_to_onecsv_zip(frames)
    read_onecsv_zip()

    write_to_onecsv(frames)
    read_onecsv()
相关推荐
跟橙姐学代码2 分钟前
不要再用 print() 了!Python logging 库才是调试的终极武器
前端·python
小叶lr43 分钟前
python 从pycharm部署到新环境
开发语言·python·pycharm
2301_763471031 小时前
Python单元测试(unittest)实战指南
python
xiaoxiongip6661 小时前
动态ip适合挂什么项目
网络·爬虫·python·网络协议·tcp/ip·ip
Q_Q5110082851 小时前
springboot+python+uniapp基于微信小程序的旅游服务系统景点信息展示 路线推荐 在线预约 评论互动系统
spring boot·python·微信小程序·django·flask·uni-app
E___V___E2 小时前
设计模式--装饰器模式
python·设计模式·装饰器模式
Dan.Qiao2 小时前
pycharm如何处理python项目间引用
ide·python·pycharm
万粉变现经纪人3 小时前
如何解决pip安装报错ModuleNotFoundError: No module named ‘sympy’问题
python·beautifulsoup·pandas·scikit-learn·pyqt·pip·scipy
xiaohouzi1122333 小时前
Python读取视频-硬解和软解
python·opencv·ffmpeg·视频编解码·gstreamer
念念不忘 必有回响3 小时前
Pygame模块化实战:从零构建Aliens射击游戏全流程(一)
python·游戏·pygame