PyMuPDF 提取 PDF 表格的速度非常快，相比其他库是更好的选择。
行列较多的表格打印成 PDF 后，会因页面宽度和分页而被拆分成多个表格；提取出的多个表格需要先合并为一个表格，再进行数据处理。
下面的代码中，merge 函数用于合并表格，addOneAxis0 用于合并仅因分页而产生的多个子表。
python
def addOneAxis0(data, one):
    """Stack the rows of sub-table ``one`` below the accumulated table ``data``.

    Used to re-join a table that was split into several sub-tables only by
    page breaks, where every sub-table starts with the same title row.

    Args:
        data: Rows accumulated so far (a list of rows); empty on the first call.
        one: The next sub-table, as a list of rows whose first row is the
            title row.

    Returns:
        A new list of rows. Neither ``data`` nor ``one`` is modified, and the
        result is never the same list object as ``one``.
    """
    if not data:
        # First sub-table: keep its title row. Return a copy so the result
        # never aliases the caller's table (later calls already return a
        # fresh list via concatenation).
        return list(one)
    # Later sub-tables: drop their first (title) row before stacking.
    return data + one[1:]
def _extend_last_rows(output, rows):
    """Append each row's cells (minus its first cell) to the last rows of ``output``."""
    if not rows:
        # Nothing to add. Without this guard ``output[-0:]`` would select
        # every row of ``output`` instead of none.
        return
    # zip stops at the shorter side, so a short ``output`` is handled too.
    for target, source in zip(output[-len(rows):], rows):
        target.extend(source[1:])


def merge(data):
    """Merge sub-tables of one split table back into a single table.

    Each table is a list of rows; row 0 is the title row and the first cell of
    every row is the row label. Tables are processed in order and each one is
    classified by the first title after the label column:

    * that title is among the first table's titles -> the table holds further
      rows, so its data rows are appended at the bottom;
    * that title is already in the merged title row -> the table holds further
      columns for the most recently added rows, so its data rows (label cell
      dropped) are appended to the last rows of the result;
    * otherwise -> the table introduces new columns, so all of its rows
      including the title row (label cell dropped) are appended to the last
      ``len(table)`` rows of the result.

    Args:
        data: List of tables in reading order. The caller's tables and rows
            are not modified.

    Returns:
        The merged table as a new list of new row lists. An empty list is
        returned when ``data`` contains no non-empty table.
    """
    tables = [table for table in data if table]  # ignore empty extractions
    if not tables:
        return []

    # Copy every row: the rows are extended in place below, and the caller's
    # tables must stay untouched.
    output = [list(row) for row in tables[0]]
    # Snapshot of the first table's titles (label column excluded). It is
    # deliberately NOT updated when output[0] grows, so it keeps identifying
    # the first column group.
    titles = output[0][1:]

    for table in tables[1:]:
        new_titles = table[0][1:]
        if not new_titles:
            # Only a label column: there is no title to classify the table by
            # and no column to add, so skip it.
            continue
        first_title = new_titles[0]
        if first_title in titles:
            # Same columns as the first table: these are additional rows.
            output.extend(list(row) for row in table[1:])
        elif first_title in output[0]:
            # Columns whose titles were already merged: add data rows only.
            _extend_last_rows(output, table[1:])
        else:
            # New columns: add the title row together with the data rows.
            _extend_last_rows(output, table)
    return output
def getDataMass(file_name):
    """Extract the trailing tables of a PDF and split them into two groups.

    Pages are scanned from the last page backwards until a page without any
    table is found. Only the first table found on each page is extracted.
    The extracted tables are then processed in document order:

    * tables whose top-left cell is "溶液标签" ("solution label") are
      collected and combined with ``merge``;
    * all other tables are stacked row-wise with ``addOneAxis0``.

    Side effects: sets the module-level globals ``doc`` (the opened document,
    left open) and ``curvePage`` (the index of the last page visited by the
    backward scan: the page on which no table was found, or 0 if every page
    had a table, or ``None`` if the document has no pages). The labelled
    tables are printed before and after merging.

    Args:
        file_name: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A tuple ``(data, mass)``: the merged labelled table and the stacked
        remaining table. ``data`` is an empty list when no labelled table
        was found.
    """
    global doc, curvePage
    doc = fitz.open(file_name)  # stays open; exposed through the global

    data = []
    mass = []
    tables = []
    page_index = None
    # Walk backwards from the last page; stop at the first page with no table.
    for page_index in reversed(range(doc.page_count)):
        tabs = doc[page_index].find_tables()
        if not tabs.tables:
            break
        tables.append(tabs[0].extract())
    tables.reverse()  # restore document order

    for table in tables:
        if table[0][0] == "溶液标签":
            data.append(table)
        else:
            mass = addOneAxis0(mass, table)

    curvePage = page_index
    print(data)
    # merge() needs at least one table; keep an empty result when none matched.
    data = merge(data) if data else []
    print(data)
    return (data, mass)