# python
import requests
from lxml import etree
import os
import urllib
from PIL import Image
from pymysql.converters import escape_string
import sys
import time
from selenium import webdriver
import warnings
import pdfplumber
from urllib import request
import io
import socks
import urllib.parse
import pymysql
import socks
import configparser
import hashlib
import threading
import urllib.parse
import re
def read_pdf(path, proxies=None, timeout=(3.2, 10), download_image=False):
    """Extract text (and optionally embedded images) from a PDF.

    Parameters:
        path: HTTP(S) URL, or a local file path (may be URL-encoded and/or a
            file:/// URI; forward slashes are converted to backslashes, so
            local handling is Windows-oriented).
        proxies: requests-style proxy mapping, e.g. {'http': 'http://...'};
            defaults to no proxies.
        timeout: (connect, read) timeout tuple passed to requests.get.
        download_image: when True, save each page's images as N.png files
            under a directory derived from the path.

    Returns:
        Concatenated text of all pages, or '' when the PDF cannot be opened.
    """
    # Avoid the mutable-default-argument pitfall; {} and None behave the same
    # when forwarded to requests.get.
    if proxies is None:
        proxies = {}
    if not isinstance(path, str) or path == '':
        print("路径为空或格式不对!")
        return ''  # bail out instead of crashing on path[0:4] below
    if path[0:4] == "http":
        try:
            print(proxies)
            data = requests.get(url=path, timeout=timeout, proxies=proxies)
            f = io.BytesIO(data.content)
        except Exception as e:
            print(e, "打开链接失败")
            return ''
    else:
        try:
            # Local paths may arrive URL-encoded and as file:/// URIs.
            path = urllib.parse.unquote(path)
            path = path.replace('file:///', '').replace('/', '\\')
            f = open(path, 'rb')
        except Exception as e:
            print(e, "打开本地文件失败")
            return ''  # previously fell through and crashed in pdfplumber.open
    text = ''
    old_path = os.getcwd()
    if download_image:
        # Derive an output directory from the path (scheme stripped for URLs).
        im_path = path.replace('https://', '').replace("http://", '')
        os.makedirs(im_path, exist_ok=True)
        os.chdir(im_path)
    try:
        with pdfplumber.open(f) as pdf:
            # Counter spans all pages so a later page's images no longer
            # overwrite an earlier page's files.
            img_index = 0
            for page in pdf.pages:
                # extract_text() returns None for pages without text; treat as ''.
                text += page.extract_text() or ''
                if download_image:
                    for img in page.images:
                        with open('{}.png'.format(img_index), 'wb+') as f_img:
                            f_img.write(img['stream'].get_data())
                        img_index += 1
    finally:
        # Restore cwd and close the stream even if pdfplumber raises.
        os.chdir(old_path)
        f.close()
    return text
# 注：可用库自己筛选一下，我把全部的加进来了；proxies 是 http 代理，path 是路径，download_image 是是否存成本地图片文件。