我们知道xlsx文件实质上是包含众多底层xml文件的zip压缩文件,那么容易想到可以利用搬移底层xml文件实现一个快速编辑xlsx文件的python程序,实现(1)从多个xlsx文件中读取sheet,合并到一个新xlsx文件中,合并后的sheet名采用原始文件名_sheet名的形式(2)可以从某些xlsx文件中指定要保存到新文件的sheet。思路是:将xlsx解压缩到临时目录,将xml文件移动到同一个xl\worksheets 目录,修改workbook.xml中的sheets的引用。假定所有xlsx都未使用共享字符串。把这些需求发给DeepSeek,他给出了如下程序。
python
import os
import shutil
import tempfile
import zipfile
from pathlib import Path
import xml.etree.ElementTree as ET
from typing import List, Dict, Union
class FastXlsxEditor:
def __init__(self):
self.namespace = {'ns': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
def merge_sheets(self,
input_files: List[str],
output_file: str,
selected_sheets: Dict[str, List[str]] = None) -> None:
"""
合并多个xlsx文件的sheet到一个新文件中
Args:
input_files: 输入文件路径列表
output_file: 输出文件路径
selected_sheets: 可选,指定要从哪些文件保存哪些sheet
{文件名: [sheet名列表]}
"""
# 创建临时工作目录
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
merged_dir = temp_path / "merged"
merged_dir.mkdir()
# 解压第一个文件作为基础模板
base_file = input_files[0]
self._extract_zip(base_file, merged_dir)
# 处理workbook.xml和sheets
self._process_workbook_and_sheets(merged_dir, input_files, selected_sheets)
# 重新打包为xlsx文件
self._create_zip(merged_dir, output_file)
def _extract_zip(self, zip_path: str, extract_dir: Path) -> None:
"""解压xlsx文件"""
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extract_dir)
def _create_zip(self, source_dir: Path, output_file: str) -> None:
"""创建xlsx文件"""
# 确保输出目录存在
os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)
with zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
for root, dirs, files in os.walk(source_dir):
for file in files:
file_path = os.path.join(root, file)
# 在zip文件中保持相对路径
arcname = os.path.relpath(file_path, source_dir)
zipf.write(file_path, arcname)
def _process_workbook_and_sheets(self,
merged_dir: Path,
input_files: List[str],
selected_sheets: Dict[str, List[str]] = None) -> None:
"""处理workbook.xml和所有sheet文件"""
workbook_path = merged_dir / "xl" / "workbook.xml"
worksheets_dir = merged_dir / "xl" / "worksheets"
shared_strings_path = merged_dir / "xl" / "sharedStrings.xml"
# 如果存在共享字符串文件,删除它(根据假设不使用共享字符串)
if shared_strings_path.exists():
shared_strings_path.unlink()
# 解析基础workbook.xml
tree = ET.parse(workbook_path)
root = tree.getroot()
# 找到sheets元素
sheets_elem = root.find('.//ns:sheets', self.namespace)
if sheets_elem is None:
raise ValueError("在workbook.xml中找不到sheets元素")
# 清空原有的sheets
sheets_elem.clear()
sheet_id = 1
relationships = []
# 处理每个输入文件
for file_path in input_files:
file_name = Path(file_path).stem
# 创建临时目录处理当前文件
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
self._extract_zip(file_path, temp_path)
# 获取该文件的所有sheet或选定的sheet
file_sheets = self._get_sheets_from_file(temp_path, file_name, selected_sheets)
# 复制sheet文件并更新workbook
for sheet_name, sheet_file in file_sheets:
new_sheet_name = f"{file_name}_{sheet_name}"
new_sheet_file = f"sheet{sheet_id}.xml"
# 复制sheet文件
source_sheet_path = temp_path / "xl" / "worksheets" / sheet_file
target_sheet_path = worksheets_dir / new_sheet_file
shutil.copy2(source_sheet_path, target_sheet_path)
# 在workbook.xml中添加sheet引用
sheet_elem = ET.SubElement(sheets_elem, 'sheet')
sheet_elem.set('name', new_sheet_name)
sheet_elem.set('sheetId', str(sheet_id))
sheet_elem.set('r:id', f"rId{sheet_id}")
# 记录关系
relationships.append({
'Id': f"rId{sheet_id}",
'Type': "http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet",
'Target': f"worksheets/{new_sheet_file}"
})
sheet_id += 1
# 保存修改后的workbook.xml
tree.write(workbook_path, encoding='UTF-8', xml_declaration=True)
# 更新relationships文件
self._update_relationships(merged_dir, relationships)
def _get_sheets_from_file(self,
extracted_dir: Path,
file_name: str,
selected_sheets: Dict[str, List[str]] = None) -> List[tuple]:
"""从解压的文件中获取sheet信息"""
workbook_path = extracted_dir / "xl" / "workbook.xml"
tree = ET.parse(workbook_path)
root = tree.getroot()
sheets = []
sheets_elem = root.find('.//ns:sheets', self.namespace)
if sheets_elem is not None:
for sheet_elem in sheets_elem.findall('ns:sheet', self.namespace):
sheet_name = sheet_elem.get('name')
sheet_r_id = sheet_elem.get('r:id')
# 获取实际的sheet文件名
sheet_file = self._get_sheet_filename(extracted_dir, sheet_r_id)
# 检查是否在选定的sheet中
if selected_sheets:
if file_name in selected_sheets:
if sheet_name in selected_sheets[file_name]:
sheets.append((sheet_name, sheet_file))
# 如果文件不在selected_sheets中,跳过所有sheet
else:
continue
else:
# 如果没有指定selected_sheets,包含所有sheet
sheets.append((sheet_name, sheet_file))
return sheets
def _get_sheet_filename(self, extracted_dir: Path, r_id: str) -> str:
"""通过关系ID获取sheet文件名"""
rels_path = extracted_dir / "xl" / "_rels" / "workbook.xml.rels"
tree = ET.parse(rels_path)
root = tree.getroot()
for rel in root.findall('.//{http://schemas.openxmlformats.org/package/2006/relationships}Relationship'):
if rel.get('Id') == r_id:
target = rel.get('Target')
return os.path.basename(target)
raise ValueError(f"找不到关系ID: {r_id}")
def _update_relationships(self, merged_dir: Path, relationships: List[Dict]) -> None:
"""更新workbook的关系文件"""
rels_dir = merged_dir / "xl" / "_rels"
rels_dir.mkdir(exist_ok=True)
rels_path = rels_dir / "workbook.xml.rels"
# 创建新的relationships根元素
root = ET.Element('Relationships')
root.set('xmlns', 'http://schemas.openxmlformats.org/package/2006/relationships')
for rel in relationships:
rel_elem = ET.SubElement(root, 'Relationship')
rel_elem.set('Id', rel['Id'])
rel_elem.set('Type', rel['Type'])
rel_elem.set('Target', rel['Target'])
# 写入relationships文件
tree = ET.ElementTree(root)
tree.write(rels_path, encoding='UTF-8', xml_declaration=True)
# 测试代码
def create_sample_xlsx():
"""创建测试用的xlsx文件(使用openpyxl创建简单示例)"""
try:
from openpyxl import Workbook
# 创建测试文件1
wb1 = Workbook()
ws1 = wb1.active
ws1.title = "Data"
ws1['A1'] = "File1_Data_A1"
ws1['B1'] = "File1_Data_B1"
ws2 = wb1.create_sheet("Summary")
ws2['A1'] = "File1_Summary_A1"
wb1.save("file1.xlsx")
# 创建测试文件2
wb2 = Workbook()
ws1 = wb2.active
ws1.title = "Data"
ws1['A1'] = "File2_Data_A1"
ws2 = wb2.create_sheet("Report")
ws2['A1'] = "File2_Report_A1"
wb2.save("file2.xlsx")
print("测试文件创建完成")
except ImportError:
print("请安装openpyxl来创建测试文件: pip install openpyxl")
# 使用示例
def main():
create_sample_xlsx()
editor = FastXlsxEditor()
# 示例1: 合并所有文件的所有sheet
input_files = ["file1.xlsx", "file2.xlsx"]
editor.merge_sheets(input_files, "merged_all.xlsx")
# 示例2: 只合并特定文件的特定sheet
selected_sheets = {
"file1": ["Data"], # 从file1.xlsx只取Data
"file2": ["Report"] # 从file2.xlsx只取Report
# file3.xlsx不会被包含,因为没有在selected_sheets中指定
}
editor.merge_sheets(input_files, "merged_selected.xlsx", selected_sheets)
if __name__ == "__main__":
main()
结果执行报错
file_sheets = self._get_sheets_from_file(temp_path, file_name, selected_sheets)
File "C:\d\mergexlsx1.py", line 152, in _get_sheets_from_file
sheet_file = self._get_sheet_filename(extracted_dir, sheet_r_id)
File "C:\d\mergexlsx1.py", line 179, in _get_sheet_filename
raise ValueError(f"找不到关系ID: {r_id}")
经过调试,发现r_id的值是None, 让他修改后,还是不行,于是让他用手工读取xml文件的方法获取r:id,他给出了如下代码,
python
def _get_sheets_from_file(self,
extracted_dir: Path,
file_name: str,
selected_sheets: Dict[str, List[str]] = None) -> List[tuple]:
"""从解压的文件中获取sheet信息"""
workbook_path = extracted_dir / "xl" / "workbook.xml"
# 手工读取XML文件内容
with open(workbook_path, 'r', encoding='utf-8') as f:
content = f.read()
sheets = []
# 手工解析sheet元素
import re
# 查找所有的sheet元素
sheet_pattern = r'<sheet[^>]*name="([^"]*)"[^>]*r:id="([^"]*)"[^>]*>'
matches = re.findall(sheet_pattern, content)
for sheet_name, sheet_r_id in matches:
print(f"找到sheet: name='{sheet_name}', r:id='{sheet_r_id}'")
# 获取实际的sheet文件名
sheet_file = self._get_sheet_filename(extracted_dir, sheet_r_id)
# 检查是否在选定的sheet中
if selected_sheets:
if file_name in selected_sheets:
if sheet_name in selected_sheets[file_name]:
sheets.append((sheet_name, sheet_file))
# 如果文件不在selected_sheets中,跳过所有sheet
else:
continue
else:
# 如果没有指定selected_sheets,包含所有sheet
sheets.append((sheet_name, sheet_file))
return sheets
替换原有的函数后,能够合并成功了,用rusty_sheet能正确读出。
python ../mergexlsx3.py
找到sheet: name='Data', r:id='rId1'
找到sheet: name='Summary', r:id='rId2'
找到sheet: name='Data', r:id='rId1'
找到sheet: name='Report', r:id='rId2'
找到sheet: name='Data', r:id='rId1'
找到sheet: name='Summary', r:id='rId2'
找到sheet: name='Data', r:id='rId1'
找到sheet: name='Report', r:id='rId2'
\d\duckdb141 -unsigned -cmd "load '/d/2sh/rusty_sheet.duckdb_extension';"
D from read_sheets(['/d/mgxlsx/file*.xlsx'],header=0,sheet_name_column='sn',file_name_column='fn');
┌──────────────────┬───────────────┬─────────┬──────────────────────┐
│ A │ B │ sn │ fn │
│ varchar │ varchar │ varchar │ varchar │
├──────────────────┼───────────────┼─────────┼──────────────────────┤
│ File1_Data_A1 │ File1_Data_B1 │ Data │ /d\mgxlsx\file1.xlsx │
│ File1_Summary_A1 │ NULL │ Summary │ /d\mgxlsx\file1.xlsx │
│ File2_Data_A1 │ NULL │ Data │ /d\mgxlsx\file2.xlsx │
│ File2_Report_A1 │ NULL │ Report │ /d\mgxlsx\file2.xlsx │
└──────────────────┴───────────────┴─────────┴──────────────────────┘
D from read_sheets(['/d/mgxlsx/merged*.xlsx'],header=0,sheet_name_column='sn',file_name_column='fn');
┌──────────────────┬───────────────┬───────────────┬────────────────────────────────┐
│ A │ B │ sn │ fn │
│ varchar │ varchar │ varchar │ varchar │
├──────────────────┼───────────────┼───────────────┼────────────────────────────────┤
│ File1_Data_A1 │ File1_Data_B1 │ file1_Data │ /d\mgxlsx\merged_all.xlsx │
│ File1_Summary_A1 │ NULL │ file1_Summary │ /d\mgxlsx\merged_all.xlsx │
│ File2_Data_A1 │ NULL │ file2_Data │ /d\mgxlsx\merged_all.xlsx │
│ File2_Report_A1 │ NULL │ file2_Report │ /d\mgxlsx\merged_all.xlsx │
│ File1_Data_A1 │ File1_Data_B1 │ file1_Data │ /d\mgxlsx\merged_selected.xlsx │
│ File2_Report_A1 │ NULL │ file2_Report │ /d\mgxlsx\merged_selected.xlsx │
└──────────────────┴───────────────┴───────────────┴────────────────────────────────┘
但有个问题,用wps打开合并后的文件,看不到有数据的sheet, 而只能看到一个空白的sheet1,还需要进一步研究。