目录
报错内容
在使用 PyMuPDF(fitz)解析 PDF 时,出现如下报错:
bash
MuPDF error: format error: object is not a stream
MuPDF error: syntax error: invalid ICC colorspace
MuPDF error: syntax error: unknown cid font type
MuPDF error: format error: object is not a stream
MuPDF error: syntax error: invalid ICC colorspace
MuPDF error: syntax error: unknown cid font type
MuPDF error: syntax error: unknown cid font type
MuPDF error: syntax error: unknown cid font type
MuPDF error: library error: zlib error: invalid stored block lengths
试错方案
这些是尝试解决问题的过程,都是无效方案,可以跳过
因为我是记录bug笔记,所以写在这里
捕获更具体的异常
失败
python
import fitz
def pdf2text(pdf_file):
    """Extract the plain text of every page of a PDF with PyMuPDF.

    Args:
        pdf_file: Path of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        A dict with the keys ``document`` (text of all pages concatenated in
        page order), ``metadata`` (the input path) and ``format``
        (``"pdf_text"``), or ``False`` when any exception was raised.
    """
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_file) as doc:
            texts = ''.join(page.get_text() for page in doc)
        return {
            'document': texts,
            'metadata': pdf_file,
            'format': "pdf_text",
        }
    # NOTE(review): confirm that the installed PyMuPDF version really exposes
    # ``fitz.FitzError``; if the attribute is missing, evaluating this clause
    # while an exception is propagating raises AttributeError instead.
    except fitz.FitzError as fe:
        print("MuPDF specific error:", fe)
        return False
    except Exception as e:
        print(e)
        return False
设置超时
失败
python
import fitz
import signal
def handler(signum, frame):
    """Signal handler that turns a delivered signal into ``TimeoutError``.

    Args:
        signum: Number of the delivered signal (unused).
        frame: Stack frame that was interrupted (unused).

    Raises:
        TimeoutError: Always, with the message ``"Operation timed out"``.
    """
    raise TimeoutError("Operation timed out")


# Install the handler for SIGALRM so that ``signal.alarm(n)`` raises
# TimeoutError after ``n`` seconds.
signal.signal(signal.SIGALRM, handler)
def pdf2text(pdf_file):
    """Extract PDF text, arming a 10-second SIGALRM around each page.

    Relies on a SIGALRM handler that raises ``TimeoutError`` having been
    installed at module level.

    Per the Python ``signal`` documentation, Python-level handlers only run
    in the main thread and only between bytecode instructions, so a call
    that is stuck inside native code is not interrupted until it returns.
    That is presumably why the notes mark this attempt as failed.

    Args:
        pdf_file: Path of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        A dict with the keys ``document``, ``metadata`` and ``format``
        (``"pdf_text"``), or ``False`` on timeout or any other exception.
    """
    try:
        pieces = []
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_file) as doc:
            for page in doc:
                signal.alarm(10)  # arm a 10-second timeout for this page
                try:
                    pieces.append(page.get_text())
                finally:
                    # Always disarm, so a failure in get_text() cannot leave
                    # a pending alarm that fires later in unrelated code.
                    signal.alarm(0)
        return {
            'document': ''.join(pieces),
            'metadata': pdf_file,
            'format': "pdf_text",
        }
    except TimeoutError as te:
        print("Timeout error:", te)
        return False
    except Exception as e:
        print(e)
        return False
使用子进程
单独运行文件成功了,但是在服务调用时仍然卡死
python
import fitz
import multiprocessing
def extract_text(pdf_file):
    """Worker function: extract the plain text of every page of a PDF.

    Args:
        pdf_file: Path of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        A dict with the keys ``document`` (text of all pages concatenated in
        page order), ``metadata`` (the input path) and ``format``
        (``"pdf_text"``), or ``None`` when any exception was raised.
    """
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_file) as doc:
            texts = ''.join(page.get_text() for page in doc)
        return {
            'document': texts,
            'metadata': pdf_file,
            'format': "pdf_text",
        }
    except Exception as e:
        print(f"Error in subprocess: {e}")
        return None
def pdf2text(pdf_file):
    """Run ``extract_text`` in a one-worker process pool with a 5 s timeout.

    Args:
        pdf_file: Path of the PDF, forwarded to ``extract_text``.

    Returns:
        Whatever ``extract_text`` returned (a dict, or ``None`` if the worker
        caught an error), or ``False`` when the worker did not answer in time
        or the main process hit another exception.
    """
    try:
        # Leaving the ``with`` block terminates the pool, which also stops a
        # worker that is still busy after the timeout.
        with multiprocessing.Pool(1) as pool:
            result = pool.apply_async(extract_text, (pdf_file,))
            return result.get(timeout=5)  # wait at most 5 seconds
    except multiprocessing.TimeoutError:
        print("Subprocess timed out.")
        return False
    except Exception as e:
        print(f"Main process error: {e}")
        return False
使用线程池
失败
python
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import fitz
def parse_pdf_page(pdf_file):
    """Extract the plain text of every page of a PDF with PyMuPDF.

    Args:
        pdf_file: Path of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        On success, a dict with the keys ``document`` (text of all pages
        concatenated in page order), ``metadata`` (the input path) and
        ``format`` (``"pdf_text"``). On any exception, the exception message
        as a ``str``, so callers must check the type of the result.
    """
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_file) as doc:
            texts = ''.join(page.get_text() for page in doc)
        return {
            'document': texts,
            'metadata': pdf_file,
            'format': "pdf_text",
        }
    except Exception as e:
        return str(e)
def pdf2text_with_threadpool(pdf_file, timeout=30):
    """Run ``parse_pdf_page`` in a worker thread and stop waiting on timeout.

    Args:
        pdf_file: Path of the PDF, forwarded to ``parse_pdf_page``.
        timeout: Maximum number of seconds to wait for the result.

    Returns:
        The value returned by ``parse_pdf_page``, or ``False`` when no result
        arrived within ``timeout`` seconds.

    Note:
        A thread cannot be killed. After a timeout the worker keeps running
        in the background until the parse call returns, and it may still
        delay interpreter shutdown.
    """
    executor = ThreadPoolExecutor(max_workers=1)
    future = executor.submit(parse_pdf_page, pdf_file)
    try:
        return future.result(timeout=timeout)
    except TimeoutError:
        print("PDF parsing timed out.")
        return False
    finally:
        # wait=False: using the executor as a context manager would call
        # shutdown(wait=True) and block here until the hung worker finished,
        # which makes the timeout above ineffective.
        executor.shutdown(wait=False)
# Example call from within the service
result = pdf2text_with_threadpool('your_pdf_file.pdf')
成功方案
使用 subprocess 调用外部脚本
脚本中设置子进程超时反馈
主进程
python
import subprocess
import json
# Main process: run the parser in a separate Python interpreter so that a
# hang inside the PDF library cannot block this process.
pdf_file_path = "your_pdf_file"

# Launch pdf_demo.py with the PDF path as its only argument and capture
# whatever it prints as text.
result = subprocess.run(
    [
        'python',
        'pdf_demo.py',
        pdf_file_path,
    ],
    capture_output=True,
    text=True,
)

try:
    # On success the script prints one JSON document on stdout. ``result``
    # is rebound to the parsed value only if decoding succeeds.
    result = json.loads(result.stdout)
    print(result)
except json.JSONDecodeError:
    # Anything else on stdout (for example the script's timeout message) is
    # not JSON; ``result`` is still the CompletedProcess here.
    print("未解析", repr(result.stdout))
外部脚本
pdf_demo.py
python
import fitz
import multiprocessing
import sys
import json
def pdf2text(pdf_file):
    """Extract the plain text of every page of a PDF with PyMuPDF.

    Exceptions are not caught here; they propagate to the caller.

    Args:
        pdf_file: Path of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        A dict with the keys ``document`` (text of all pages concatenated in
        page order), ``metadata`` (the input path) and ``format``
        (``"pdf_text"``).
    """
    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_file) as doc:
        texts = ''.join(page.get_text() for page in doc)
    return {
        'document': texts,
        'metadata': pdf_file,
        'format': "pdf_text",
    }
if __name__ == "__main__":
    # Usage: python pdf_demo.py <pdf_file>
    # Prints the extraction result as JSON on stdout, or a plain-text
    # message when parsing times out or fails.
    if len(sys.argv) < 2:
        # Message goes to stderr; stdout stays empty for the calling process.
        sys.exit("usage: python pdf_demo.py <pdf_file>")
    pdf_file_path = sys.argv[1]

    # Parse in a one-worker pool so the wait can be bounded.
    pool = multiprocessing.Pool(1)
    result = pool.apply_async(pdf2text, (pdf_file_path,))
    try:
        data = result.get(timeout=5)  # wait at most 5 seconds
        print(json.dumps(data, ensure_ascii=False, indent=4))
    except multiprocessing.TimeoutError:
        # Stop the worker that is still busy before reporting the timeout.
        pool.terminate()
        print("Subprocess timed out.")
    except Exception as e:
        print(f"Main process error: {e}")
    finally:
        pool.close()
        pool.join()