第一、单个PDF文件的重命名:
提取PDF文档中的合同编号和借款人姓名,并按"合同编号_姓名"的格式重命名该文件。
python
import pdfplumber
import re
import os
def extract_contract_info(pdf_path):
    """Return ``(contract_number, borrower_name)`` read from a contract PDF.

    Pages are scanned in order; the first page on which BOTH the contract
    number label and the borrower label are found supplies the result.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``(contract_number, borrower_name)`` tuple of strings, or
        ``(None, None)`` when no page contains both fields.
    """
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # A page without a text layer may yield None; treat it as empty
            # so re.search() does not raise TypeError.
            text = page.extract_text() or ""
            # Contract number: exactly the first 12 alphanumerics after the
            # label. The colon may be ASCII ":" or full-width (U+FF1A).
            contract_match = re.search(
                r'合同编号[:\uff1a]\s*([A-Za-z0-9]{12})', text
            )
            # Borrower: everything up to a newline or an opening parenthesis,
            # ASCII "(" or full-width (U+FF08).
            borrower_match = re.search(
                r'借款人[:\uff1a]\s*([^\n(\uff08]+)', text
            )
            if contract_match and borrower_match:
                contract_number = contract_match.group(1)
                # Remove every kind of whitespace (spaces, tabs, "\r",
                # ideographic space), not just ASCII spaces.
                borrower_name = re.sub(r'\s+', '', borrower_match.group(1))
                return contract_number, borrower_name
    return None, None
def rename_pdf_file(pdf_path):
    """Rename a contract PDF to ``<contract_number>_<borrower_name>.pdf``.

    The file stays in its directory. If the target name is taken by another
    file, ``_1``, ``_2``, ... is appended until a free name is found. A file
    that already carries its target name is left untouched, so running the
    script repeatedly does not keep renaming it.

    Args:
        pdf_path: Path of the PDF file to rename.

    Returns:
        None. The outcome is reported on stdout.
    """
    contract_number, borrower_name = extract_contract_info(pdf_path)
    if contract_number and borrower_name:
        # Drop characters that are not allowed in Windows file names, plus
        # control characters that the borrower pattern may have captured.
        borrower_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '', borrower_name)
    if not (contract_number and borrower_name):
        print("未能提取合同信息,无法重命名")
        return

    dir_name = os.path.dirname(pdf_path)
    base_name = f"{contract_number}_{borrower_name}"
    current_path = os.path.normcase(os.path.abspath(pdf_path))

    new_filename = f"{base_name}.pdf"
    new_filepath = os.path.join(dir_name, new_filename)
    # Resolve name collisions by appending an increasing counter.
    counter = 1
    while os.path.exists(new_filepath):
        # The "collision" is the file itself: it is already named correctly.
        if os.path.normcase(os.path.abspath(new_filepath)) == current_path:
            print(f"文件名已符合格式,无需重命名: {new_filename}")
            return
        new_filename = f"{base_name}_{counter}.pdf"
        new_filepath = os.path.join(dir_name, new_filename)
        counter += 1

    os.rename(pdf_path, new_filepath)
    print(f"文件已重命名为: {new_filename}")
# Usage example: rename one contract PDF in place. This runs as soon as the
# script is executed (there is no ``__main__`` guard around it).
pdf_path = r'D:\xmfile\案件杂货表\部门\鲍晓文\《个人借款额度合同》XZ_20201224.pdf'
rename_pdf_file(pdf_path)
第二、批量重命名PDF文件:
遍历各子文件夹,提取每个PDF文档中的合同编号和借款人姓名,并按"合同编号_姓名"的格式重命名文件。
python
import pdfplumber
import re
import os
from typing import Tuple, Optional
def validate_pdf(filepath: str) -> bool:
    """Report whether *filepath* can be opened as a PDF with at least one page.

    Args:
        filepath: Path of the file to check.

    Returns:
        True if pdfplumber opens the file and it has one or more pages;
        False if it has no pages or opening it raises an error.
    """
    try:
        with pdfplumber.open(filepath) as pdf:
            return len(pdf.pages) > 0
    except Exception:
        # Any failure to open/parse means "not a usable PDF". Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        return False
def clean_borrower_name(name: str) -> str:
    """Reduce a raw borrower capture to a bare name usable in a file name.

    Two passes are applied, in this order:

    1. Every character that is not a letter or digit is removed. This covers
       whitespace, punctuation and the underscore, which is reserved as the
       separator in ``<contract_number>_<name>.pdf``.
    2. Any run of 11 or more digits (optionally mixed with ``X``/``x``) is
       removed, which drops trailing mobile numbers (11 digits) and ID-card
       numbers (15 or 18 characters). Because separators are already gone,
       digit groups that were split by spaces or dashes count as one run.

    Args:
        name: Raw text captured after the borrower label.

    Returns:
        The cleaned name; may be an empty string.
    """
    # "\W" excludes "_" (it is a word character), so it is listed explicitly.
    # CJK ideographs are word characters and are therefore kept.
    name = re.sub(r'[\W_]+', '', name)
    # Strip phone / ID-card numbers that may follow the name.
    name = re.sub(r'[\dXx]{11,}', '', name)
    return name
def extract_contract_info(pdf_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Extract the contract number and borrower name from a contract PDF.

    The text of all pages is joined with newlines and searched once, so the
    two fields may sit on different pages.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        ``(contract_number, borrower_name)`` where the number is truncated to
        its first 12 characters and the name has been passed through
        ``clean_borrower_name``; ``(None, None)`` if the file fails
        ``validate_pdf``, a field is missing, the cleaned name is shorter
        than 2 characters, or an error occurs (the error is printed).
    """
    if not validate_pdf(pdf_path):
        return None, None
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may yield None for a page without text.
            full_text = "\n".join(
                page.extract_text() or "" for page in pdf.pages
            )
        # Contract number: several label spellings, optional colon (ASCII or
        # full-width U+FF1A), then 10-15 alphanumerics.
        contract_match = re.search(
            r'(合同编号|合同号|编号)[:\uff1a]?\s*([A-Za-z0-9]{10,15})',
            full_text,
            re.IGNORECASE
        )
        # Borrower: several label spellings, then 2-20 characters (lazy) up to
        # a newline, a parenthesis (ASCII or full-width U+FF08/U+FF09), an
        # ID-card / contact label, or the end of the text.
        borrower_match = re.search(
            r'(借款人|借款方|甲方)[:\uff1a]?\s*'
            r'([^\n()\uff08\uff09]{2,20}?)'
            r'(?=[\n()\uff08\uff09]|身份证|联系方式|$)',
            full_text,
            re.IGNORECASE
        )
        if contract_match and borrower_match:
            contract_number = contract_match.group(2)[:12]  # keep first 12 only
            raw_borrower_name = borrower_match.group(2)
            borrower_name = clean_borrower_name(raw_borrower_name)
            if len(borrower_name) >= 2:  # a name needs at least 2 characters
                return contract_number, borrower_name
    except Exception as e:
        # Best-effort: report the problem and let the caller skip this file.
        print(f"⚠️ 处理 {os.path.basename(pdf_path)} 时发生错误: {str(e)}")
    return None, None
def rename_pdf_file(pdf_path: str) -> None:
    """Rename a contract PDF to ``<contract_number>_<borrower_name>.pdf``.

    The file stays in its directory. When the target name is taken by another
    file, ``_1``, ``_2``, ... is appended until a free name is found. A file
    that already carries its target name (with or without a suffix) is left
    alone, so repeated runs do not flip its name back and forth.

    Args:
        pdf_path: Path of the PDF file to rename.

    Returns:
        None. Success, skip and failure are all reported on stdout; a failed
        ``os.rename`` is printed rather than raised.
    """
    original_name = os.path.basename(pdf_path)
    contract_number, borrower_name = extract_contract_info(pdf_path)
    if not (contract_number and borrower_name):
        print(f"⚠️ 跳过: {original_name} (未提取到有效信息)")
        return

    dir_name = os.path.dirname(pdf_path)
    base_name = f"{contract_number}_{borrower_name}"
    extension = ".pdf"
    current_path = os.path.normcase(os.path.abspath(pdf_path))

    # Find the first candidate name that is free; counter 0 means "no suffix".
    counter = 0
    while True:
        suffix = f"_{counter}" if counter else ""
        new_name = f"{base_name}{suffix}{extension}"
        new_path = os.path.join(dir_name, new_name)
        # The candidate is the file itself: it is already named correctly.
        if os.path.normcase(os.path.abspath(new_path)) == current_path:
            print(f"⚠️ 跳过: {original_name} (文件名已符合格式)")
            return
        if not os.path.exists(new_path):
            break
        counter += 1

    try:
        os.rename(pdf_path, new_path)
    except OSError as e:
        print(f"❌ 重命名失败: {original_name} → {new_name} | 错误: {str(e)}")
    else:
        print(f"✅ 成功: {original_name} → {new_name}")
def batch_rename_pdfs_in_subfolders(root_dir: str) -> None:
    """Rename every PDF found in the immediate sub-folders of *root_dir*.

    Only one level is visited: each directory directly under *root_dir* is
    listed, and every entry in it whose name ends with ``.pdf`` (case
    insensitive) is passed to ``rename_pdf_file``. Files directly inside
    *root_dir* and deeper nesting levels are not touched.

    Counters printed in the final summary:
      * processed - ``rename_pdf_file`` returned without raising (this
        includes files it chose to skip, since it reports those itself);
      * skipped   - entries that do not end with ``.pdf``;
      * errors    - ``rename_pdf_file`` raised an exception.

    Args:
        root_dir: Directory whose sub-folders contain the PDFs.

    Returns:
        None. Progress and the summary are printed to stdout.
    """
    if not os.path.isdir(root_dir):
        print(f"❌ 错误: 目录不存在 {root_dir}")
        return

    processed_files = 0
    skipped_files = 0
    error_files = 0

    # Sorted listings make the run order, and therefore which of two
    # colliding files receives the "_1" suffix, reproducible.
    for person_dir in sorted(os.listdir(root_dir)):
        person_dir_path = os.path.join(root_dir, person_dir)
        if not os.path.isdir(person_dir_path):
            continue
        print(f"\n🔍 正在处理: {person_dir}")
        for filename in sorted(os.listdir(person_dir_path)):
            if not filename.lower().endswith('.pdf'):
                skipped_files += 1
                continue
            pdf_path = os.path.join(person_dir_path, filename)
            try:
                rename_pdf_file(pdf_path)
            except Exception as e:
                # One bad file must not abort the rest of the batch.
                print(f"❌ 处理 {filename} 时出错: {str(e)}")
                error_files += 1
            else:
                processed_files += 1

    print(f"\n📊 处理完成: 共处理 {processed_files} 个文件, 跳过 {skipped_files} 个, 错误 {error_files} 个")
if __name__ == "__main__":
    # Usage example: default root folder holding one sub-folder per person.
    department_dir = r'D:\xmfile\案件杂货表\部门'
    # Input validation: keep asking until an existing DIRECTORY is given.
    # isdir (rather than exists) matches the check the batch function makes,
    # so a path pointing at a regular file is re-prompted here as well.
    while not os.path.isdir(department_dir):
        print(f"目录不存在: {department_dir}")
        # Trim whitespace and the quotes that drag-and-drop / "copy as path"
        # put around a path.
        department_dir = input("请输入正确的目录路径: ").strip().strip('"\'')
    batch_rename_pdfs_in_subfolders(department_dir)