Step 1
提取 PDF 中的图片，并另存。
Step 2
去除灰色纸张背景。
python
import os

import cv2
import numpy as np
import PyPDF2
from PIL import ImageEnhance,Image,ImageFilter
from skimage.filters import unsharp_mask
from skimage.filters import gaussian
from skimage.restoration import denoise_tv_chambolle
# Steps 1-2: extract every embedded image from ./001.pdf, strip its colour,
# raise its contrast, and save the result as a JPEG under ./image_heibai/.
local = './'
output_dir = "./image_heibai/"
raw_tmp_path = os.path.join(local, "99_tmp.jpg")  # image bytes as taken from the PDF
gray_tmp_path = "./0_tmp.jpg"  # desaturated copy handed from Pillow to OpenCV

# cv2.imwrite does not create missing folders; it only returns False, so the
# output directory has to exist before the first write.
os.makedirs(output_dir, exist_ok=True)

pic_n = 0  # running index of saved images, counted across all pages

# All page access happens inside the with-block so the PDF handle is closed
# once extraction is finished (or if it fails part-way).
with open(os.path.join(local, '001.pdf'), 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(pdf_reader.pages)
    print("num : "+str(num_pages))

    # Walk every page
    for page_num, page_obj in enumerate(pdf_reader.pages):
        # A page without /Resources or /XObject has nothing to extract here;
        # skip it instead of aborting the whole run with a KeyError.
        try:
            page_objs = page_obj['/Resources']['/XObject'].get_object()
        except KeyError:
            continue

        # Walk every XObject on the page and keep only the images
        for obj_name in page_objs:
            img_obj = page_objs[obj_name]
            if img_obj['/Subtype'] != '/Image':
                continue

            # Dump the image data to a temporary file.
            # NOTE(review): the bytes are written verbatim into a .jpg file,
            # which assumes the embedded images are JPEG-encoded - confirm
            # for PDFs that use other image encodings.
            with open(raw_tmp_path, 'wb') as img_file:
                img_file.write(img_obj.get_data())

            #################################################################
            # Image processing
            # Saturation: a factor this close to 0 removes almost all colour.
            with Image.open(raw_tmp_path) as image:
                C_image = ImageEnhance.Color(image).enhance(0.001)
                C_image.save(gray_tmp_path)

            img = cv2.imread(gray_tmp_path)
            # Per-pixel contrast adjustment: alpha = contrast, beta = brightness
            img_contrast = cv2.convertScaleAbs(img, alpha=1.4, beta=0)
            #################################################################

            # File name pattern: <image index>_page_<page index>.jpg, both
            # zero-padded to four digits.
            img_path = f"{output_dir}{pic_n:04d}_page_{page_num:04d}.jpg"
            if cv2.imwrite(img_path, img_contrast):
                print("image : "+img_path)
            else:
                # Surface the failure instead of reporting a file that was
                # never written.
                print("image write failed : "+img_path)
            pic_n = pic_n+1
Step 3
去除黑色边框。
Step 4
去除阴影部分，使文字清晰。