【Python】论文长截图、页面分割、水印去除、整合PDF

有的学校的论文只能在线预览，且存在水印。为保存到本地方便查阅，可以使用以下工作流进行处理：

用浏览器打开在线论文预览界面；
使用 FastStone Capture 软件截长图；
将论文按页数进行分割；
按照阈值消除浅色的背景水印；
整合为A4尺寸的PDF文件；
使用WPS将PDF转为OCR版本（可选）。

以下代码为上述流程的第三、四、五步，注释都在代码中，随取随用。

python 复制代码

import os
from PIL import Image
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
import numpy as np

def process_image(input_image_path, output_dir="split_images", total_pages=138, watermark_threshold=230):
    """
    Split a long screenshot into equal-height pages and whiten light watermarks.

    The image is converted to RGB, stretched vertically (LANCZOS resampling)
    so that its height is an exact multiple of ``total_pages``, and cut into
    ``total_pages`` horizontal slices. In every slice, each pixel whose R, G
    and B values are all greater than ``watermark_threshold`` is replaced by
    pure white. Slices are written to ``output_dir`` as ``slice_001.png``,
    ``slice_002.png``, ...

    Args:
        input_image_path: Path of the long screenshot to split.
        output_dir: Directory receiving the slice PNGs; created if missing.
        total_pages: Number of slices to produce; must be at least 1.
        watermark_threshold: Channel value above which a pixel counts as
            watermark/background and is turned white.

    Returns:
        List of the saved slice file paths, in page order.

    Raises:
        ValueError: If ``total_pages`` is smaller than 1.
    """
    # Validate first so a bad page count fails before any file or directory
    # is touched (the slice-height computation would otherwise divide by zero).
    if total_pages < 1:
        raise ValueError(f"total_pages must be at least 1, got {total_pages!r}")

    os.makedirs(output_dir, exist_ok=True)

    # Normalise to 3-channel RGB: grayscale/palette images yield 2-D arrays
    # (no axis 2) and RGBA images have 4 channels, both of which break the
    # per-pixel RGB mask below. convert() loads the pixel data, so the file
    # handle can be released right away.
    with Image.open(input_image_path) as source:
        img = source.convert("RGB")
    width, height = img.size

    # Round the height up to the next multiple of total_pages so that every
    # slice has exactly the same height.
    slice_height = (height + total_pages - 1) // total_pages
    stretched_height = slice_height * total_pages
    if stretched_height != height:
        img = img.resize((width, stretched_height), Image.Resampling.LANCZOS)

    image_files = []
    for page in range(total_pages):
        top = page * slice_height
        slice_img = img.crop((0, top, width, top + slice_height))

        # np.array() returns a writable copy of the slice's pixels.
        pixels = np.array(slice_img)

        # A pixel is treated as watermark only if all three channels are
        # brighter than the threshold; darker text pixels are left untouched.
        light = np.all(pixels > watermark_threshold, axis=2)
        pixels[light] = 255

        output_file = os.path.join(output_dir, f"slice_{page + 1:03d}.png")
        Image.fromarray(pixels).save(output_file)
        image_files.append(output_file)

    return image_files

def create_pdf(image_files, pdf_file="combined_output.pdf"):
    """
    Combine images into a single PDF, one A4 page per image.

    Each image is scaled to the largest size that fits the A4 page while
    keeping its aspect ratio, and is centred on the page.

    Args:
        image_files: Iterable of image file paths, in the desired page order.
        pdf_file: Path of the PDF file to write.
    """
    c = canvas.Canvas(pdf_file, pagesize=A4)
    a4_width, a4_height = A4
    page_aspect = a4_width / a4_height

    for img_file in image_files:
        # Only the dimensions are needed here; the context manager closes the
        # file handle immediately instead of leaking one handle per page.
        with Image.open(img_file) as img:
            img_width, img_height = img.size

        aspect = img_width / img_height
        if aspect > page_aspect:
            # Image is relatively wider than the page: fit to the page width.
            new_width = a4_width
            new_height = new_width / aspect
        else:
            # Image is relatively taller than the page: fit to the page height.
            new_height = a4_height
            new_width = new_height * aspect

        # Centre the scaled image on the page.
        x = (a4_width - new_width) / 2
        y = (a4_height - new_height) / 2

        c.drawImage(img_file, x, y, width=new_width, height=new_height)
        c.showPage()

    c.save()

def main(input_image_path, output_dir="split_images", pdf_file="combined_output.pdf",
         total_pages=138, watermark_threshold=230):
    """
    Run the whole pipeline: split the screenshot, then build the PDF.

    Calls ``process_image`` to produce the page slices and passes the
    resulting file list to ``create_pdf``; finally prints where the outputs
    were written.

    Args:
        input_image_path: Path of the long screenshot to process.
        output_dir: Directory in which the page slices are stored.
        pdf_file: Path of the PDF file to write.
        total_pages: Number of pages the screenshot is split into.
        watermark_threshold: Brightness threshold used for watermark removal.
    """
    slice_paths = process_image(
        input_image_path,
        output_dir,
        total_pages,
        watermark_threshold,
    )
    create_pdf(slice_paths, pdf_file)

    # Report completion and the output locations, one message per line.
    summary = (
        "处理完成！",
        f"切片图片保存在: {output_dir}",
        f"PDF文件保存为: {pdf_file}",
    )
    for message in summary:
        print(message)

if __name__ == "__main__":
    # Example usage: point this at your own long screenshot before running.
    main(input_image_path=r"C:\Users\Administrator\Desktop\test\2025-01-06_102531.png")

最后得到的PDF是纯图片格式的，可以使用WPS进行OCR识别，转换后即可直接选取文字。