import io
import os
import urllib.parse

import requests

import pdfplumber
def read_pdf(path, proxies=None, timeout=(3.2, 10), download_image=False, headers=None):
    """Extract the text (and optionally the images) of a PDF.

    ``path`` may be an http(s) URL — the PDF is downloaded with ``requests``
    and also saved locally as ``99.pdf`` — or a local path, optionally with a
    ``file:///`` prefix and percent-encoding (decoded for Windows-style paths).

    Parameters
    ----------
    path : str
        URL or local file path of the PDF.
    proxies : dict, optional
        Proxy mapping forwarded to ``requests.get`` for URL downloads.
    timeout : tuple, optional
        (connect, read) timeout forwarded to ``requests.get``.
    download_image : bool, optional
        When True, each page's embedded images are written as ``<i>.png``
        inside a directory named after the URL/path.
    headers : dict, optional
        Extra HTTP headers forwarded to ``requests.get``.

    Returns
    -------
    str
        The concatenated text of all pages, or ``''`` on any failure.
    """
    # Avoid mutable default arguments (original signature used {}).
    proxies = {} if proxies is None else proxies
    headers = {} if headers is None else headers
    # Bad input: report and bail out instead of crashing on path[0:4] below.
    if not isinstance(path, str) or path == '':
        print("路径为空或格式不对!")
        return ''
    if path[0:4] == "http":
        try:
            print(proxies)
            print(headers)
            data = requests.get(url=path, timeout=timeout, proxies=proxies, headers=headers)
            print(data)
            # Keep a local copy of the download, as the original did.
            with open('99.pdf', 'wb+') as f2:
                f2.write(data.content)
            f = io.BytesIO(data.content)
        except Exception as e:
            print(e, "打开链接失败")
            return ''
    else:
        try:
            # Decode percent-escapes and strip the file:/// scheme; the
            # backslash replacement assumes a Windows filesystem.
            path = urllib.parse.unquote(path)
            path = path.replace('file:///', '').replace('/', '\\')
            f = open(path, 'rb')
        except Exception as e:
            print(e, "打开本地文件失败")
            # Original fell through with f='' and crashed in pdfplumber.
            return ''
    text = ''
    old_path = os.getcwd()
    if download_image:
        # Mirror the URL into a directory name and work inside it so the
        # extracted images land next to each other.
        im_path = path.replace('https://', '').replace("http://", '')
        os.makedirs(im_path, exist_ok=True)
        os.chdir(im_path)
    try:
        with pdfplumber.open(f) as pdf:
            # Walk every page, collecting all text (including table text).
            for page in pdf.pages:
                # extract_text() returns None for pages without text; the
                # original crashed on `text += None`.
                text += page.extract_text() or ''
                if download_image:
                    # NOTE(review): the counter restarts on every page, so
                    # later pages overwrite earlier images — preserved from
                    # the original; confirm whether that is intended.
                    for i, img in enumerate(page.images):
                        with open('{}.png'.format(i), 'wb+') as f_img:
                            f_img.write(img['stream'].get_data())
    finally:
        # Always restore the working directory and release the file handle,
        # even if pdfplumber raises.
        os.chdir(old_path)
        f.close()
    return text
# Example configuration for fetching a PDF through a local proxy.
proxies = {'http': '192.168.1.122:1080', 'https': '192.168.1.122:1080'}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36 Edg/139.0.0.0',
    # 'cookie': '.ASPXANONYMOUS=txzams-d65KJ-plLxuK7ohwAwg6cmLo5fJzCbD3i4CaFY7sH2EYFw2jvP3bw64iuYiVJbpNQBxFLir7s-_8p65Huzw9Sab4REdtsGyvLi4E0hge-0; _ga=GA1.1.509430065.1753846772; dnn_IsMobile=False; ARRAffinity=7604675fe895ac43d4eee5ed64a571e723c5cb50da2e00ebe078cb3d6f359b1c; _ga_CSLL4ZEK4L=GS2.1.s1755051758o5g1t1755052205j60l0h0; _ga_313558765=GS2.1.s1755051759o5g1t1755052205j60l0h0',
    'sec-fetch-dest': 'document',
}

if __name__ == "__main__":
    import sys

    # Bug fix: `url` was never defined (NameError at call time).
    # Take the PDF URL/path from the command line instead, and only run
    # when executed as a script so importing this module has no side effects.
    if len(sys.argv) > 1:
        url = sys.argv[1]
        read_pdf(url, proxies=proxies, headers=headers)
    else:
        print("用法: python script.py <pdf-url-or-path>")