利用DeepSeek编写验证xlsx格式文件中是否启用sharedStrings.xml对读写效率影响python程序

让他分别用100*10000个单元格有100、1000、10000个不同的1-200字符长的大写英文字母字符串测试.

一开始DeepSeek没有找到启用sharedStrings.xml的写xlsx模块，我自己找了pyxlsbwriter的例子告诉他才改好的。

python 复制代码

import os
import time
import random
import string
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import matplotlib.pyplot as plt
import numpy as np
from pyxlsbwriter import XlsxWriter

def generate_random_string(length):
    """生成指定长度的大写英文字母随机字符串"""
    return ''.join(random.choices(string.ascii_uppercase, k=length))

def create_test_data(rows, cols, unique_strings):
    """
    创建测试数据
    rows: 行数
    cols: 列数
    unique_strings: 不同字符串的数量
    """
    # 生成唯一字符串池
    string_pool = [generate_random_string(random.randint(1, 200)) for _ in range(unique_strings)]
    
    # 创建数据矩阵
    data = []
    for i in range(rows):
        row = []
        for j in range(cols):
            # 从字符串池中随机选择一个字符串
            row.append(random.choice(string_pool))
        data.append(row)
    
    return data


def write_excel_with_shared_strings(data, filename):
    """使用shared strings写入Excel文件（使用pyxlsbwriter库确保生成sharedStrings.xml）"""
    
    
    # 使用XlsxWriter创建xlsx文件，默认会使用shared strings
    with XlsxWriter(filename, compressionLevel=6) as writer:
        writer.add_sheet("Sheet1")
        writer.write_sheet(data)    


def write_excel_without_shared_strings(data, filename):
    """不使用shared strings写入Excel文件（转换为pandas DataFrame再保存）"""
    df = pd.DataFrame(data)
    df.to_excel(filename, index=False, header=False)

def read_excel(filename):
    """读取Excel文件并测量时间"""
    start_time = time.time()
    df = pd.read_excel(filename, header=None)
    end_time = time.time()
    return end_time - start_time, df

def test_scenario(rows, cols, unique_strings, output_dir="test_results"):
    """测试一个场景：特定行、列和唯一字符串数量的情况"""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # 创建测试数据
    print(f"生成测试数据: {rows}行 x {cols}列, {unique_strings}个唯一字符串")
    data = create_test_data(rows, cols, unique_strings)
    
    # 测试启用shared strings的情况
    with_shared_file = os.path.join(output_dir, f"with_shared_{rows}_{cols}_{unique_strings}.xlsx")
    start_time = time.time()
    write_excel_with_shared_strings(data, with_shared_file)
    with_shared_write_time = time.time() - start_time
    
    with_shared_read_time, with_shared_df = read_excel(with_shared_file)
    with_shared_size = os.path.getsize(with_shared_file)
    
    # 测试不启用shared strings的情况
    without_shared_file = os.path.join(output_dir, f"without_shared_{rows}_{cols}_{unique_strings}.xlsx")
    start_time = time.time()
    write_excel_without_shared_strings(data, without_shared_file)
    without_shared_write_time = time.time() - start_time
    
    without_shared_read_time, without_shared_df = read_excel(without_shared_file)
    without_shared_size = os.path.getsize(without_shared_file)
    
    # 验证数据一致性
    assert with_shared_df.equals(without_shared_df), "两种方式保存的数据不一致!"
    
    # 返回结果
    return {
        'rows': rows,
        'cols': cols,
        'unique_strings': unique_strings,
        'with_shared_write_time': with_shared_write_time,
        'with_shared_read_time': with_shared_read_time,
        'with_shared_size': with_shared_size,
        'without_shared_write_time': without_shared_write_time,
        'without_shared_read_time': without_shared_read_time,
        'without_shared_size': without_shared_size
    }

def main():
    """主函数"""
    # 测试配置
    rows = 100
    cols = 10000
    unique_strings_list = [100, 1000, 10000, 100000]
    
    results = []
    
    # 运行测试
    for unique_strings in unique_strings_list:
        result = test_scenario(rows, cols, unique_strings)
        results.append(result)
        
        # 打印当前测试结果
        print(f"\n测试结果 (唯一字符串数: {unique_strings}):")
        print(f"启用shared strings - 写入时间: {result['with_shared_write_time']:.2f}s, "
              f"读取时间: {result['with_shared_read_time']:.2f}s, "
              f"文件大小: {result['with_shared_size']/1024/1024:.2f}MB")
        print(f"禁用shared strings - 写入时间: {result['without_shared_write_time']:.2f}s, "
              f"读取时间: {result['without_shared_read_time']:.2f}s, "
              f"文件大小: {result['without_shared_size']/1024/1024:.2f}MB")
    
    # 绘制结果图表
    plot_results(results)

def plot_results(results):
    from pylab import mpl
    # 设置显示中文字体
    mpl.rcParams["font.sans-serif"] = ["SimSun"]
    # 设置正常显示符号
    mpl.rcParams["axes.unicode_minus"] = False

    """绘制测试结果图表"""
    unique_strings = [r['unique_strings'] for r in results]
    
    # 创建图表
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 10))
    
    # 写入时间对比
    with_shared_write_times = [r['with_shared_write_time'] for r in results]
    without_shared_write_times = [r['without_shared_write_time'] for r in results]
    
    ax1.plot(unique_strings, with_shared_write_times, 'o-', label='启用shared strings')
    ax1.plot(unique_strings, without_shared_write_times, 'o-', label='禁用shared strings')
    ax1.set_xlabel('唯一字符串数量')
    ax1.set_ylabel('写入时间 (秒)')
    ax1.set_title('写入时间对比')
    ax1.legend()
    ax1.grid(True)
    
    # 读取时间对比
    with_shared_read_times = [r['with_shared_read_time'] for r in results]
    without_shared_read_times = [r['without_shared_read_time'] for r in results]
    
    ax2.plot(unique_strings, with_shared_read_times, 'o-', label='启用shared strings')
    ax2.plot(unique_strings, without_shared_read_times, 'o-', label='禁用shared strings')
    ax2.set_xlabel('唯一字符串数量')
    ax2.set_ylabel('读取时间 (秒)')
    ax2.set_title('读取时间对比')
    ax2.legend()
    ax2.grid(True)
    
    # 文件大小对比
    with_shared_sizes = [r['with_shared_size']/1024/1024 for r in results]
    without_shared_sizes = [r['without_shared_size']/1024/1024 for r in results]
    
    ax3.plot(unique_strings, with_shared_sizes, 'o-', label='启用shared strings')
    ax3.plot(unique_strings, without_shared_sizes, 'o-', label='禁用shared strings')
    ax3.set_xlabel('唯一字符串数量')
    ax3.set_ylabel('文件大小 (MB)')
    ax3.set_title('文件大小对比')
    ax3.legend()
    ax3.grid(True)
    
    # 总时间对比
    with_shared_total_times = [r['with_shared_write_time'] + r['with_shared_read_time'] for r in results]
    without_shared_total_times = [r['without_shared_write_time'] + r['without_shared_read_time'] for r in results]
    
    ax4.plot(unique_strings, with_shared_total_times, 'o-', label='启用shared strings')
    ax4.plot(unique_strings, without_shared_total_times, 'o-', label='禁用shared strings')
    ax4.set_xlabel('唯一字符串数量')
    ax4.set_ylabel('总时间 (秒)')
    ax4.set_title('总时间 (写入+读取) 对比')
    ax4.legend()
    ax4.grid(True)
    
    plt.tight_layout()
    plt.savefig('shared_strings_performance_comparison.png', dpi=300)
    plt.show()
    
    # 打印详细结果表格
    print("\n详细测试结果:")
    print("唯一字符串数 | 启用shared写入时间 | 禁用shared写入时间 | 启用shared读取时间 | 禁用shared读取时间 | 启用shared文件大小 | 禁用shared文件大小")
    for r in results:
        print(f"{r['unique_strings']:>12} | {r['with_shared_write_time']:>17.2f} | {r['without_shared_write_time']:>17.2f} | "
              f"{r['with_shared_read_time']:>17.2f} | {r['without_shared_read_time']:>17.2f} | "
              f"{r['with_shared_size']/1024/1024:>17.2f} | {r['without_shared_size']/1024/1024:>17.2f}")

if __name__ == "__main__":
    main()

执行结果

bash 复制代码

生成测试数据: 100行 x 10000列, 100个唯一字符串

测试结果 (唯一字符串数: 100):
启用shared strings - 写入时间: 0.91s, 读取时间: 2.70s, 文件大小: 1.60MB
禁用shared strings - 写入时间: 10.37s, 读取时间: 9.21s, 文件大小: 12.16MB
生成测试数据: 100行 x 10000列, 1000个唯一字符串

测试结果 (唯一字符串数: 1000):
启用shared strings - 写入时间: 0.89s, 读取时间: 2.94s, 文件大小: 2.24MB
禁用shared strings - 写入时间: 11.71s, 读取时间: 9.52s, 文件大小: 53.34MB
生成测试数据: 100行 x 10000列, 10000个唯一字符串

测试结果 (唯一字符串数: 10000):
启用shared strings - 写入时间: 0.85s, 读取时间: 2.97s, 文件大小: 3.29MB
禁用shared strings - 写入时间: 12.11s, 读取时间: 9.60s, 文件大小: 64.52MB
生成测试数据: 100行 x 10000列, 100000个唯一字符串

测试结果 (唯一字符串数: 100000):
启用shared strings - 写入时间: 3.10s, 读取时间: 3.77s, 文件大小: 9.54MB
禁用shared strings - 写入时间: 12.65s, 读取时间: 9.96s, 文件大小: 66.06MB

结果显示禁用shared strings时写入更慢且文件更大，而启用时文件更小且写入更快，随着唯一字符串数量增加，启用shared strings的效率下降，不启用没有变化。验证了shared strings在重复字符串场景下的存储优化效果。