import io
import os
import urllib.parse

import requests

import pdfplumber
def read_pdf(path, proxies=None, timeout=(3.2, 10), download_image=False, headers=None):
    """Extract the text (and optionally the images) of a PDF.

    ``path`` may be an http(s) URL — the PDF is downloaded with ``requests``
    and also saved locally as ``99.pdf`` — or a local path, optionally with a
    ``file:///`` prefix and percent-encoding (decoded for Windows-style paths).

    Parameters
    ----------
    path : str
        URL or local file path of the PDF.
    proxies : dict, optional
        Proxy mapping forwarded to ``requests.get`` for URL downloads.
    timeout : tuple, optional
        (connect, read) timeout forwarded to ``requests.get``.
    download_image : bool, optional
        When True, each page's embedded images are written as ``<i>.png``
        inside a directory named after the URL/path.
    headers : dict, optional
        Extra HTTP headers forwarded to ``requests.get``.

    Returns
    -------
    str
        The concatenated text of all pages, or ``''`` on any failure.
    """
    # Avoid mutable default arguments (original signature used {}).
    proxies = {} if proxies is None else proxies
    headers = {} if headers is None else headers
    # Bad input: report and bail out instead of crashing on path[0:4] below.
    if not isinstance(path, str) or path == '':
        print("路径为空或格式不对!")
        return ''
    if path[0:4] == "http":
        try:
            print(proxies)
            print(headers)
            data = requests.get(url=path, timeout=timeout, proxies=proxies, headers=headers)
            print(data)
            # Keep a local copy of the download, as the original did.
            with open('99.pdf', 'wb+') as f2:
                f2.write(data.content)
            f = io.BytesIO(data.content)
        except Exception as e:
            print(e, "打开链接失败")
            return ''
    else:
        try:
            # Decode percent-escapes and strip the file:/// scheme; the
            # backslash replacement assumes a Windows filesystem.
            path = urllib.parse.unquote(path)
            path = path.replace('file:///', '').replace('/', '\\')
            f = open(path, 'rb')
        except Exception as e:
            print(e, "打开本地文件失败")
            # Original fell through with f='' and crashed in pdfplumber.
            return ''
    text = ''
    old_path = os.getcwd()
    if download_image:
        # Mirror the URL into a directory name and work inside it so the
        # extracted images land next to each other.
        im_path = path.replace('https://', '').replace("http://", '')
        os.makedirs(im_path, exist_ok=True)
        os.chdir(im_path)
    try:
        with pdfplumber.open(f) as pdf:
            # Walk every page, collecting all text (including table text).
            for page in pdf.pages:
                # extract_text() returns None for pages without text; the
                # original crashed on `text += None`.
                text += page.extract_text() or ''
                if download_image:
                    # NOTE(review): the counter restarts on every page, so
                    # later pages overwrite earlier images — preserved from
                    # the original; confirm whether that is intended.
                    for i, img in enumerate(page.images):
                        with open('{}.png'.format(i), 'wb+') as f_img:
                            f_img.write(img['stream'].get_data())
    finally:
        # Always restore the working directory and release the file handle,
        # even if pdfplumber raises.
        os.chdir(old_path)
        f.close()
    return text
# Example configuration for fetching a PDF through a local proxy.
proxies = {'http': '192.168.1.122:1080', 'https': '192.168.1.122:1080'}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36 Edg/139.0.0.0',
    # 'cookie': '.ASPXANONYMOUS=txzams-d65KJ-plLxuK7ohwAwg6cmLo5fJzCbD3i4CaFY7sH2EYFw2jvP3bw64iuYiVJbpNQBxFLir7s-_8p65Huzw9Sab4REdtsGyvLi4E0hge-0; _ga=GA1.1.509430065.1753846772; dnn_IsMobile=False; ARRAffinity=7604675fe895ac43d4eee5ed64a571e723c5cb50da2e00ebe078cb3d6f359b1c; _ga_CSLL4ZEK4L=GS2.1.s1755051758o5g1t1755052205j60l0h0; _ga_313558765=GS2.1.s1755051759o5g1t1755052205j60l0h0',
    'sec-fetch-dest': 'document',
}

if __name__ == "__main__":
    import sys

    # Bug fix: `url` was never defined (NameError at call time).
    # Take the PDF URL/path from the command line instead, and only run
    # when executed as a script so importing this module has no side effects.
    if len(sys.argv) > 1:
        url = sys.argv[1]
        read_pdf(url, proxies=proxies, headers=headers)
    else:
        print("用法: python script.py <pdf-url-or-path>")