【python】快速实现pdf批量去除指定位置水印

最近下载了10本电子书，每页的上头都被水印给霸占了，对于强迫症的人来说，看着很不舒服，果断下手除之

图示：

处理前：

处理后：

每本书都有大几百页，手动处理繁琐耗时，现在实现自动化大概2分钟可以处理1200页，快哉快哉，哈哈哈，直接附上代码了，比较简单：

主要实现对指定文件夹下的所有pdf进行处理，结果保存在output文件夹下；

python 复制代码

import fitz  # PyMuPDF库
import os


def fill_pdf_margins(input_pdf_path, output_pdf_path,
                     x_range=(0.3, 0.7), y_range=(0.95, 1.0),
                     fill_color=(1, 1, 1)):
    """
    Paint a solid rectangle over the same relative region of every page.

    The PDF is opened, a filled rectangle is drawn on each page (on top of
    the existing content, ``overlay=True``), and the result is saved to a
    new file. The region is expressed as fractions of the page width and
    height, so it adapts to pages of different sizes. Page coordinates are
    measured from the top-left corner of ``page.rect``, with y growing
    downwards.

    NOTE(review): the original inline comment described the default region
    as the *top* 5% of the page, but the fractions 0.95-1.0 select the
    *bottom* 5% in this coordinate system. The defaults are kept unchanged
    so existing callers get the same output; pass ``y_range=(0.0, 0.05)``
    to cover the top strip instead - confirm which one is intended.

    Args:
        input_pdf_path (str): Path of the PDF to read.
        output_pdf_path (str): Path the modified PDF is written to.
        x_range (tuple): ``(start, end)`` horizontal extent of the covered
            region as fractions of the page width.
        y_range (tuple): ``(start, end)`` vertical extent of the covered
            region as fractions of the page height.
        fill_color (tuple): RGB colour with components in 0..1, used for
            both the border and the fill; ``(1, 1, 1)`` is white.

    Returns:
        None. Progress and errors are reported with ``print``; a missing
        input file or a processing error does not raise.
    """
    # A missing input is reported and skipped rather than raised.
    if not os.path.exists(input_pdf_path):
        print(f"错误：找不到输入文件 '{input_pdf_path}'")
        return

    x_start, x_end = x_range
    y_start, y_end = y_range

    try:
        # The context manager closes the document even when drawing or
        # saving fails, so a long batch run does not leak open handles.
        with fitz.open(input_pdf_path) as doc:
            print(f"开始处理文件: {input_pdf_path}，共 {len(doc)} 页")

            for page_num, page in enumerate(doc, start=1):
                page_rect = page.rect
                print(f"处理第 {page_num} 页，尺寸: {page_rect}----{page_rect.width}---{page_rect.height}")

                # Convert the relative region into absolute page coordinates.
                cover_rect = fitz.Rect(
                    page_rect.x0 + page_rect.width * x_start,
                    page_rect.y0 + page_rect.height * y_start,
                    page_rect.x0 + page_rect.width * x_end,
                    page_rect.y0 + page_rect.height * y_end,
                )

                # Border and fill share one colour so no outline is visible.
                page.draw_rect(cover_rect, color=fill_color, fill=fill_color, overlay=True)

                print(f"  - 已处理第 {page_num} 页")

            # Save all modified pages as a new, compacted PDF.
            doc.save(output_pdf_path, garbage=4, deflate=True, clean=True)

        print(f"\n处理完成！输出文件已保存为: {output_pdf_path}")

    except Exception as e:
        # Best-effort batch behaviour: report the failure and let the
        # caller continue with the next file.
        print(f"处理过程中发生错误: {e}")

def pdf_single(input_file, output_file):
    """
    Process one PDF, creating a small sample input first if it is missing.

    When ``input_file`` does not exist, a three-page test PDF is written to
    that path so the pipeline can be tried without a real document. The
    file is then passed to ``fill_pdf_margins``.

    Args:
        input_file (str): Path of the PDF to process (created if absent).
        output_file (str): Path the processed PDF is written to.
    """
    if not os.path.exists(input_file):
        # Report the real path instead of a hard-coded 'input.pdf'.
        print(f"未找到 {input_file}，正在创建一个用于测试的PDF文件...")
        # The context manager closes the document even if saving fails.
        with fitz.open() as test_doc:
            for i in range(3):
                page = test_doc.new_page()
                # "china-s" is PyMuPDF's built-in Simplified Chinese font;
                # the default Latin font has no glyphs for this text.
                page.insert_text(
                    (72, 72),
                    f"这是第 {i + 1} 页的测试内容。\n" * 20,
                    fontsize=11,
                    fontname="china-s",
                )
            test_doc.save(input_file)
        print(f"测试PDF '{input_file}' 已创建。")

    # Run the actual region-covering step.
    fill_pdf_margins(input_file, output_file)

def all_raw(path='./raw', output='./output'):
    """
    Process every PDF found under ``path`` and write results to ``output``.

    The directory tree is walked recursively. Each file whose name ends in
    ``.pdf`` (case-insensitive) is passed to ``pdf_single``; the result is
    written into ``output`` under the same file name. Results are not
    mirrored into sub-folders, so PDFs with identical names in different
    sub-directories overwrite one another.

    Args:
        path (str): Root directory to search for PDFs.
        output (str): Directory that receives the processed files; it is
            created if it does not exist.
    """
    os.makedirs(output, exist_ok=True)
    output_abs = os.path.abspath(output)

    for root, dirs, files in os.walk(path):
        # Do not descend into the output directory when it lives under
        # ``path``; otherwise already-processed files would be picked up.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != output_abs
        ]
        for file in files:
            # Match the extension case-insensitively so '.PDF' is included.
            if not file.lower().endswith(".pdf"):
                continue
            input_file = os.path.join(root, file)
            output_file = os.path.join(output, file)
            print(f"正在处理文件: {input_file}--- {output_file}")
            pdf_single(input_file, output_file)
            print(f"处理完成: {input_file}--- {output_file}")
# --- Usage: run this file as a script to batch-process the default folder ---
if __name__ == "__main__":
    all_raw()