No long preamble; here are the conclusions first:
- Python: CSV vs. HDF5
  - Input: 100 small local CSV files (stock trading data), ~30 MB in total.
  - Read them all in a loop: read_csvlist: 664.34 ms
  - Write the 100 DataFrames under one key each: write_hdf5_by_keys: 2579.56 ms (30 MB of CSV → 130 MB HDF5)
  - Read them back key by key: read_hdf5_by_keys: 1407.38 ms
  - Concatenate into one DataFrame and write under a single key: write_onecsv_hdf5: 1127.10 ms (30 MB of CSV → 26 MB HDF5)
  - Read the single DataFrame back: read_one_df_hdf5: 786.55 ms
  - Same single-DataFrame write via DataFrame.to_hdf: write_onecsv_hdf5_pandas: 1137.50 ms
  - Conclusion: the more keys an HDF5 file holds, the worse the compression; with 100 keys the file is roughly 4x larger than the source CSVs. With a single key it shrinks by about 10%.
- Python: CSV vs. Parquet
  - Input: the same 100 small local CSV files (stock trading data), ~30 MB in total.
  - DataFrame memory usage: 45.0+ MB
  - Read them all in a loop: read_csvlist: 780.23 ms
  - Concatenate into one DataFrame and write Parquet: write_parquet: 1120.26 ms (30 MB of CSV → 10 MB Parquet)
  - Read it back from Parquet: read_parquet: 131.98 ms
  - Concatenate into one DataFrame and write gzip-compressed CSV: write_to_onecsv_zip: 8871.84 ms (30 MB of CSV → 11.5 MB)
  - Read the compressed CSV back: read_onecsv_zip: 535.38 ms
  - Concatenate into one DataFrame and write uncompressed CSV: write_to_onecsv: 2439.60 ms
  - Read the uncompressed CSV back: read_onecsv: 432.28 ms
  - Conclusion: Parquet writes are slightly slower than HDF5, but reads are genuinely fast; the roughly 70% size reduction (30 MB → 10 MB) is impressive; writing CSV with compression enabled is really slow. (A quick way to check the on-disk sizes yourself is sketched right after this list.)
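The size numbers above were read off the filesystem by hand. If you want to reproduce the compression ratios after running the two scripts below, a minimal sketch along these lines prints them all at once; it reuses the output file names from those scripts, and `dir_size_mb` / `file_size_mb` are just helper names made up for this snippet:

```python
import os

def dir_size_mb(path):
    """Total size of the files directly under a directory, in MB."""
    return sum(
        os.path.getsize(os.path.join(path, name))
        for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    ) / 1024 / 1024

def file_size_mb(path):
    return os.path.getsize(path) / 1024 / 1024

print(f"data_csv (source CSVs): {dir_size_mb('data_csv'):.1f} MB")
for name in ['pd_HDF.h5', 'pd_HDF_one.h5', 'pd_HDF_one_pandas.h5',
             'data_gzip.parquet', 'concat_one.csv', 'concat_one_gzip.csv']:
    if os.path.exists(name):
        print(f"{name:22s} {file_size_mb(name):.1f} MB")
```

Run it only after both scripts have finished, so all of the output files exist.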
The source code is below; run it yourself.
```python
import os
import time

import pandas as pd

def get_filelist():
    # Alternative using os.walk (recursive); not used by the benchmark below.
    file_names = []
    for root, dirs, files in os.walk('data_csv'):
        for file in files:
            file_names.append(file)
    return file_names

def get_files_in_current_directory():
    # List everything under data_csv and keep only the files (skip directories).
    entries = os.listdir('data_csv')
    files = [entry for entry in entries if os.path.isfile(os.path.join('data_csv', entry))]
    return files
def read_csvlist():
    # Read every CSV under data_csv into its own DataFrame and time the whole loop.
    file_list = get_files_in_current_directory()
    file_dflist = []
    start_time = time.time()
    for f in file_list:
        df = pd.read_csv('data_csv/' + f)
        file_dflist.append(df)
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_csvlist: {elapsed_time_ms:.2f} ms")
    return file_dflist
def write_hdf5_by_keys(listdf):
    file_list = get_files_in_current_directory()
    # Open (or create) an HDF5 file and store each DataFrame under its own key
    # inside the 'gp_daily' group; the group is created automatically if missing.
    with pd.HDFStore('pd_HDF.h5', complevel=9, complib='zlib') as store:
        start_time = time.time()
        for f, df in zip(file_list, listdf):
            key = f.replace('.', '')  # e.g. 'xxx.csv' -> key 'xxxcsv'
            store.put('gp_daily/' + key, df)
        end_time = time.time()
        elapsed_time_ms = (end_time - start_time) * 1000
        print(f"write_hdf5_by_keys: {elapsed_time_ms:.2f} ms")
def read_hdf5_by_keys():
    # Read each per-file dataset back out of the 'gp_daily' group.
    with pd.HDFStore('pd_HDF.h5') as store:
        file_list = get_files_in_current_directory()
        start_time = time.time()
        file_dflist = []
        for f in file_list:
            key = f.replace('.', '')
            file_dflist.append(store.get('gp_daily/' + key))
        end_time = time.time()
        elapsed_time_ms = (end_time - start_time) * 1000
        print(f"read_hdf5_by_keys: {elapsed_time_ms:.2f} ms")
    return file_dflist
def write_onecsv_hdf5(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # Open (or create) an HDF5 file and store the concatenated DataFrame under a single key.
    with pd.HDFStore('pd_HDF_one.h5', mode='w', complevel=5, complib='zlib') as store:
        store.put('onecsv_hdf', df)
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_onecsv_hdf5: {elapsed_time_ms:.2f} ms")
def read_one_df_hdf5():
    start_time = time.time()
    # Read the single-key dataset back.
    with pd.HDFStore('pd_HDF_one.h5', mode='r') as store:
        df = store.get('onecsv_hdf')
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_one_df_hdf5: {elapsed_time_ms:.2f} ms")
def write_onecsv_hdf5_pandas(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # Same as write_onecsv_hdf5, but via the DataFrame.to_hdf convenience wrapper.
    df.to_hdf('pd_HDF_one_pandas.h5', key='test-csv', complevel=5, mode='w', complib='zlib')
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_onecsv_hdf5_pandas: {elapsed_time_ms:.2f} ms")
if __name__ == '__main__':
    print(get_files_in_current_directory())
    listdf = read_csvlist()
    write_hdf5_by_keys(listdf)
    read_hdf5_by_keys()
    write_onecsv_hdf5(listdf)
    read_one_df_hdf5()
    write_onecsv_hdf5_pandas(listdf)
```
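Two things I did not benchmark above, sketched here under the same file layout (pd_HDF.h5 with keys under gp_daily): HDFStore.put defaults to the 'fixed' format, and passing format='table' writes more slowly but makes a key appendable and queryable with store.select; and store.keys() lists everything already in the file, so the key names do not have to be re-derived from the CSV file names. The function names below (write_hdf5_by_keys_table, read_all_keys) and the file pd_HDF_table.h5 are only illustrative:

```python
import pandas as pd

def write_hdf5_by_keys_table(listdf, keys):
    # Same layout as write_hdf5_by_keys, but format='table' (slower to write,
    # supports append and on-disk queries via store.select).
    with pd.HDFStore('pd_HDF_table.h5', mode='w', complevel=9, complib='zlib') as store:
        for key, df in zip(keys, listdf):
            store.put('gp_daily/' + key, df, format='table')

def read_all_keys(path='pd_HDF.h5'):
    # store.keys() returns every stored path, e.g. '/gp_daily/xxxcsv'.
    with pd.HDFStore(path, mode='r') as store:
        return {key: store.get(key) for key in store.keys()}
```

The second script compares Parquet against plain and gzip-compressed CSV: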
```python
import os
import time
from time import sleep

import pandas as pd

def get_files_in_current_directory():
    # List everything under data_csv and keep only the files (skip directories).
    entries = os.listdir('data_csv')
    files = [entry for entry in entries if os.path.isfile(os.path.join('data_csv', entry))]
    return files
def read_csvlist():
    # Read every CSV under data_csv into its own DataFrame and time the whole loop.
    file_list = get_files_in_current_directory()
    file_dflist = []
    start_time = time.time()
    for f in file_list:
        df = pd.read_csv('data_csv/' + f)
        file_dflist.append(df)
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_csvlist: {elapsed_time_ms:.2f} ms")
    return file_dflist
def write_to_onecsv(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # Plain CSV, no compression, no index column.
    df.to_csv(path_or_buf='concat_one.csv', index=False)
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_to_onecsv: {elapsed_time_ms:.2f} ms")
    sleep(1)
def read_onecsv():
    start_time = time.time()
    # Read the uncompressed CSV back.
    df = pd.read_csv(filepath_or_buffer='concat_one.csv')
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_onecsv: {elapsed_time_ms:.2f} ms")
    sleep(1)
def write_to_onecsv_zip(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # CSV with gzip compression, no index column.
    df.to_csv(path_or_buf='concat_one_gzip.csv', index=False, compression='gzip')
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_to_onecsv_zip: {elapsed_time_ms:.2f} ms")
    sleep(1)
def read_onecsv_zip():
    start_time = time.time()
    # Read the gzip-compressed CSV back; compression must be given explicitly
    # because the file name does not end in .gz.
    df = pd.read_csv(filepath_or_buffer='concat_one_gzip.csv', compression='gzip')
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_onecsv_zip: {elapsed_time_ms:.2f} ms")
    sleep(1)
def write_parquet(listdf):
    df = pd.concat(listdf, ignore_index=True)
    start_time = time.time()
    # Write Parquet with gzip compression and without the index;
    # impressive: 30 MB of CSV compresses down to about 10 MB.
    df.to_parquet('data_gzip.parquet', compression='gzip', index=False)
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"write_parquet: {elapsed_time_ms:.2f} ms")
    sleep(1)
def read_parquet():
    start_time = time.time()
    df = pd.read_parquet('data_gzip.parquet')
    end_time = time.time()
    elapsed_time_ms = (end_time - start_time) * 1000
    print(f"read_parquet: {elapsed_time_ms:.2f} ms")
    df.info()
    sleep(1)
if __name__ == '__main__':
    dflist1 = read_csvlist()
    write_parquet(dflist1)
    read_parquet()
    write_to_onecsv_zip(dflist1)
    read_onecsv_zip()
    write_to_onecsv(dflist1)
    read_onecsv()
```
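Part of why the Parquet read is so fast is that the format is columnar, so a follow-up worth trying (not benchmarked above) is reading only the columns you need; pandas' read_parquet takes a columns argument. Since the column names of the stock CSVs are not shown here, this sketch simply takes the first two names from the file's schema rather than hard-coding any:

```python
import pandas as pd
import pyarrow.parquet as pq

# Grab the first two column names from the Parquet schema (placeholder choice).
cols = pq.read_schema('data_gzip.parquet').names[:2]

# The columnar layout lets read_parquet skip the other columns entirely.
df_subset = pd.read_parquet('data_gzip.parquet', columns=cols)
df_subset.info()
```

On wide tables this cuts read time and memory further; the fixed-format HDF5 files written above have no equivalent column pruning on read.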