根据图片中特殊的标识将图片进行分组
python
def group_images_by_project(png_img_paths):
    """
    Group PNG page images into projects based on marker text in each image.

    A project starts on an image whose extracted text contains at least one
    project heading (as reported by ``batch_match_all_projects``) and ends on
    the image whose text contains "七、牵头单位". Images seen while a project
    is open are appended to it; images seen while no project is open are
    dropped.

    Args:
        png_img_paths: Image path strings in their original page order. The
            order is preserved inside every group. Paths that do not end in
            ".png" (case-insensitive) are skipped.

    Returns:
        dict: Maps sequential keys ("project_1", "project_2", ...) to the list
        of image paths that belong to that project.
    """
    project_image_groups = {}
    current_project_images = []  # images collected for the open project
    project_counter = 1          # number used for the next project key
    is_in_project = False        # True between a start marker and its end

    for img_path in png_img_paths:
        # 1. Skip anything that is not a PNG file.
        if not img_path.lower().endswith('.png'):
            print(f"跳过非PNG文件:{img_path}")
            continue

        # 2. Extract the text of this image only.
        try:
            img = Image.open(img_path)
            try:
                # `extract_image_text` is defined elsewhere; a None/empty
                # result is normalised to "" so the marker checks below
                # cannot fail on a non-string value.
                img_text = extract_image_text(img) or ""
            finally:
                # Close the handle even when text extraction raises.
                img.close()
        except Exception as e:
            print(f"处理图片 {img_path} 失败:{e}")
            # Best effort: an unreadable page is still kept in the open
            # project so the page sequence stays complete.
            if is_in_project:
                current_project_images.append(img_path)
            continue

        # 3. Does this image contain the heading of a new project?
        # NOTE(review): an image holding both a project heading and the end
        # marker is handled as a start only (the end check is an `elif`).
        project_start_list = batch_match_all_projects(img_text)
        if project_start_list:
            # A previous project that never reached its end marker is
            # saved before the new one is opened.
            if is_in_project and current_project_images:
                project_key = f"project_{project_counter}"
                project_image_groups[project_key] = current_project_images.copy()
                print(f"保存项目 {project_key},图片数量:{len(current_project_images)}")
                project_counter += 1
            # Open the new project with the current image as its first page.
            is_in_project = True
            current_project_images = [img_path]
            project_name = project_start_list[0]  # raw heading, used for logging only
            print(f"开始新项目 {project_counter}(原始名称:{project_name}),对应图片:{img_path}")
        # 4. End marker: the current image is the last page of the project.
        elif "七、牵头单位" in img_text and is_in_project:
            current_project_images.append(img_path)
            project_key = f"project_{project_counter}"
            project_image_groups[project_key] = current_project_images.copy()
            print(f"结束项目 {project_key},图片数量:{len(current_project_images)}")
            # Reset state for the next project.
            is_in_project = False
            current_project_images = []
            project_counter += 1
        # 5. Neither start nor end: a middle page of the open project.
        elif is_in_project:
            current_project_images.append(img_path)

    # 6. Save a trailing project that never reached its end marker.
    if is_in_project and current_project_images:
        project_key = f"project_{project_counter}"
        project_image_groups[project_key] = current_project_images
        print(f"处理未结束项目 {project_key},图片数量:{len(current_project_images)}")

    return project_image_groups
## 提炼图片索引
def batch_match_all_projects(text):
    """
    Extract every project heading from *text*, in order of appearance.

    A heading is "项目", optional whitespace, one or more digits, optional
    whitespace, and an optional trailing colon. Both the ASCII colon and the
    fullwidth colon (U+FF1A) are accepted. The leading word boundary means
    "项目" must not directly follow another word character, so "本项目1" is
    not a heading.

    Args:
        text: Text to scan. ``None`` or an empty string yields no matches.

    Returns:
        list[str]: The raw matched headings; empty when nothing matches.
    """
    if not text:
        return []
    # The fullwidth colon is written as an escape so it cannot be silently
    # normalised into a second ASCII colon by an editor or encoding pass.
    full_pattern = r'\b(项目\s*\d+\s*[:\uFF1A]?)'
    return re.findall(full_pattern, text)