#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""
@author: JHC000abc@gmail.com
@file: util_pdf.py
@time: 2025/3/28 15:22
@desc:提取pdf第一页图片 ,检查url中pdf是否完整
"""
import os
import time
import requests
from requests.exceptions import RequestException
from io import BytesIO
import PyPDF2
import pypdf
from pdf2image import convert_from_path
from uuid import uuid4
class PdfCheck:
    """Inspect a local PDF and render its pages to PNG images.

    The instance is a context manager: entering it opens ``file`` and parses
    it with PyPDF2 so that ``num_pages`` is populated; leaving it closes the
    file handle. The rendering helpers call ``pdf2image.convert_from_path``
    directly on the path and therefore also work outside a ``with`` block.

    Attributes:
        file: Path of the PDF to inspect.
        tmp_file: Directory that receives the first-page image.
        first_name: Target path of the first-page PNG (random UUID name).
        reader: ``PyPDF2.PdfReader`` once the context has been entered.
        file_obj: Open binary handle while inside the context, else ``None``.
        num_pages: Page count read in ``__enter__``; ``0`` before that.
        pages: Legacy alias kept for older callers; mirrors ``num_pages``.
        poppler_path: Optional override for the Poppler binary directory.
    """

    # Historical per-method defaults. They differ ("./" vs "../"), so at most
    # one of them can be right for a given working directory; pass
    # ``poppler_path`` to the constructor to use one explicit location.
    _FIRST_PAGE_POPPLER_PATH = r"./poppler/Library/bin"
    _ALL_PAGES_POPPLER_PATH = r"../poppler/Library/bin"

    def __init__(self, file, tmp_file="./tmp", poppler_path=None):
        """Store the paths and make sure the temporary directory exists.

        Args:
            file: Path of the PDF file.
            tmp_file: Directory for the first-page image; created if missing.
            poppler_path: Directory containing the Poppler binaries. When
                ``None`` each method falls back to its historical default.
        """
        self.file = file
        self.reader = None
        self.file_obj = None
        self.pages = 0
        # Defined up front so the attribute exists before ``__enter__`` runs.
        self.num_pages = 0
        self.tmp_file = tmp_file
        self.poppler_path = poppler_path
        os.makedirs(self.tmp_file, exist_ok=True)
        self.first_name = f"{self.tmp_file}/{uuid4()}.png"
        print("首页图片:", self.first_name)

    def __enter__(self):
        """Open the PDF, parse it and record the page count.

        Returns:
            This instance.

        Raises:
            OSError: If the file cannot be opened.
            Exception: Whatever PyPDF2 raises for an unreadable document;
                the file handle is closed before the error propagates.
        """
        self.file_obj = open(self.file, 'rb')
        try:
            self.reader = PyPDF2.PdfReader(self.file_obj)
            self.num_pages = len(self.reader.pages)
        except BaseException:
            # ``__exit__`` is not invoked when ``__enter__`` raises, so the
            # handle has to be released here to avoid leaking it.
            self._close()
            raise
        self.pages = self.num_pages
        print("pdf 页数:", self.num_pages)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the file handle; exceptions from the body are not suppressed."""
        self._close()

    def _close(self):
        """Close the underlying file handle if one is open."""
        if self.file_obj:
            self.file_obj.close()
        self.file_obj = None

    def _resolve_poppler_path(self, default):
        """Return the configured Poppler directory, or ``default`` if unset."""
        if self.poppler_path is not None:
            return self.poppler_path
        return default

    def save_first_page(self):
        """Render page 1 of the PDF and save it as ``self.first_name``.

        Returns:
            The path of the written PNG, or ``None`` when no image was
            produced.
        """
        images = convert_from_path(
            self.file,
            first_page=1,
            last_page=1,
            poppler_path=self._resolve_poppler_path(self._FIRST_PAGE_POPPLER_PATH),
        )
        if not images:
            print(f"无法提取{self.file}首页图片。")
            return None

        images[0].save(self.first_name, 'PNG')
        print(f"{self.file} 首页图片已保存至 {self.first_name}")
        return self.first_name

    def save_page_to_image(self, save_path="images"):
        """Render every page of the PDF to ``<save_path>/<n>.png``.

        Pages are numbered from 1. All rendered pages are held in memory
        before being written, so very large documents need a lot of RAM.

        Args:
            save_path: Output directory; created if missing.

        Returns:
            List of written file paths in page order, or ``None`` when no
            image was produced.
        """
        os.makedirs(save_path, exist_ok=True)
        images = convert_from_path(
            self.file,
            poppler_path=self._resolve_poppler_path(self._ALL_PAGES_POPPLER_PATH),
        )
        if not images:
            # The original message was copy-pasted from save_first_page and
            # wrongly spoke of the "first page" image.
            print(f"无法提取{self.file}页面图片。")
            return None

        res = []
        for ind, image in enumerate(images, 1):
            save_file = os.path.join(save_path, f"{ind}.png")
            image.save(save_file, 'PNG')
            print(f"{self.file} 图片 {ind} 已保存至 {save_file}")
            res.append(save_file)
        return res

    def check_pdf_integrity_with_pypdf(self, url, timeout=30, verify=False):
        """Download a PDF from ``url`` and check that pypdf can parse it.

        The whole response body is loaded into memory, which is costly for
        large files. The check passes when the response is served as
        ``application/pdf`` and pypdf can enumerate the pages.

        Args:
            url: Address of the PDF to download.
            timeout: Timeout in seconds passed to ``requests.get``.
            verify: TLS certificate verification flag passed to
                ``requests.get``. SECURITY: the default ``False`` keeps the
                historical behaviour but disables certificate checks; pass
                ``True`` (or a CA bundle path) for untrusted networks.

        Returns:
            ``True`` if the document could be opened, ``False`` on any
            download, content-type or parsing failure.
        """
        try:
            response = requests.get(url, timeout=timeout, verify=verify)
            response.raise_for_status()

            # Drop parameters such as "; charset=..." before comparing.
            content_type = response.headers.get('Content-Type', '').split(';')[0].strip().lower()
            if 'application/pdf' not in content_type:
                print(f"【失败】Content-Type 错误: {content_type}。这不是一个 PDF。")
                return False

            reader = pypdf.PdfReader(BytesIO(response.content))
            # Touching the page list forces pypdf to walk the document
            # structure, which is where a damaged file is expected to fail.
            num_pages = len(reader.pages)
            print(f"【成功】pypdf 成功打开文件。文件结构完整,总页数: {num_pages}")
            return True
        except pypdf.errors.PdfReadError as e:
            print(f"【失败】PDF 文件结构损坏。pypdf 无法读取。错误信息: {e}")
        except requests.exceptions.HTTPError as e:
            # ``response`` may be absent on a manually constructed HTTPError.
            status_code = getattr(e.response, "status_code", None)
            print(f"【失败】HTTP 错误: 状态码 {status_code} ({e})")
        except RequestException as e:
            print(f"【失败】网络请求错误: {e}")
        except Exception as e:
            # Last-resort boundary: this method reports failure via its
            # return value instead of raising.
            print(f"【失败】发生未知错误: {e}")
        return False
if __name__ == '__main__':
    # Manual smoke run: render every page of a sample report to PNG files
    # and print the page count followed by the elapsed wall-clock seconds.
    began = time.time()
    sample_pdf = r"D:\Desktop\10001231-6+中国平安保险(集团)股份有限公司2024年度可持续发展报告(A)+2025-03-20.pdf"
    with PdfCheck(sample_pdf) as pdf:
        pdf.save_page_to_image()
        print(pdf.num_pages)
    finished = time.time()
    print("dif", finished - began)