# Step 1: convert the PDF into one PNG image per page.
import os
from pdf2image import convert_from_path
# Path of the PDF file to convert.
pdf_path = '/Users/xxx/2022.pdf'
# Folder that receives the rendered page images.
output_folder = './output_images2022'
# Filename prefix of the rendered pages (page_1.png, page_2.png, ...).
output_name = 'page'
# Create the output folder when it is missing. exist_ok=True replaces the
# separate existence check, so there is no window between check and creation.
os.makedirs(output_folder, exist_ok=True)
# Render the PDF pages at a resolution of 300 DPI.
images = convert_from_path(pdf_path, dpi=300)
# Save every rendered page as a PNG file; page numbers start at 1.
for page_number, image in enumerate(images, start=1):
    image.save(f'{output_folder}/{output_name}_{page_number}.png', 'PNG')
# Step 2: OCR the page images and count keyword occurrences.
from PIL import ImageEnhance
import pytesseract
from PIL import Image
from openpyxl import Workbook
# 配置 Tesseract 的路径(如果需要)
# pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract' # Mac 的路径
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # Windows 的路径
# 打开图片
# image_path = "/Users/xxx/page_3.png" # 替换为你的图片路径
def enhance_image(img, contrast_factor=2.0):
    """Return a grayscale, contrast-adjusted copy of *img* for OCR.

    Args:
        img: Image object providing ``convert`` (the caller in this file
            passes the result of ``Image.open``).
        contrast_factor: Factor handed to ``ImageEnhance.Contrast.enhance``.
            Defaults to 2.0, the value that used to be hard-coded.

    Returns:
        The enhanced image.
    """
    # Mode 'L' is the grayscale mode requested by the original code.
    grayscale = img.convert('L')
    return ImageEnhance.Contrast(grayscale).enhance(contrast_factor)
def allimngs(image_path, lang="chi_sim"):
    """Run OCR on one image file and return its text without spaces.

    Args:
        image_path: Path of the image file to read.
        lang: Tesseract language code passed to ``image_to_string``.
            Defaults to "chi_sim", the value that used to be hard-coded.

    Returns:
        The recognised text with every ' ' character removed. Other
        whitespace, such as newlines, is left untouched.
    """
    # Open the file in a context manager so its handle is released as soon
    # as the enhanced copy exists; previously the file was never closed.
    with Image.open(image_path) as source:
        prepared = enhance_image(source)
    text = pytesseract.image_to_string(prepared, lang=lang)
    return text.replace(' ', '')
# 统计子字符串出现次数
class TrieNode:
    """One node of the keyword trie."""

    # A trie allocates one node per stored character, so skip the
    # per-instance __dict__.
    __slots__ = ("children", "keywords")

    def __init__(self):
        # Maps the next character to the child node reached through it.
        self.children = {}
        # Keywords that end exactly at this node; empty for pure prefixes.
        self.keywords = []
class Trie:
    """Prefix tree that stores keywords character by character."""

    def __init__(self):
        # The root represents the empty prefix.
        self.root = TrieNode()

    def insert(self, keyword):
        """Add *keyword* to the trie.

        Walks down from the root, creating missing nodes on the way, and
        records the keyword on the node reached by its last character. An
        empty keyword is recorded on the root itself.
        """
        node = self.root
        for char in keyword:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.keywords.append(keyword)
def count_keywords(text, keywords):
    """Count non-overlapping keyword occurrences in *text*.

    The text is scanned from left to right. At every position the longest
    keyword starting there is counted and the scan jumps past it; when no
    keyword starts at the position, the scan advances by one character.

    Args:
        text: String to search.
        keywords: Iterable of keyword strings. Duplicates are ignored.

    Returns:
        Dict mapping every distinct keyword to its number of occurrences,
        in the order the keywords were first given. An empty-string keyword
        is kept in the result but always has a count of 0.
    """
    # Deduplicate while keeping the caller's order, so the result order is
    # the same on every run (a set would reorder the keywords arbitrarily).
    keywords = list(dict.fromkeys(keywords))

    trie = Trie()
    for kw in keywords:
        trie.insert(kw)

    counters = {kw: 0 for kw in keywords}
    i = 0
    n = len(text)
    while i < n:
        node = trie.root
        # Keywords of the deepest terminal node seen so far. They must be
        # remembered separately: the walk can continue past that node into
        # a longer prefix that never completes, and the node reached last
        # then carries no keywords at all.
        matched = None
        end_pos = i
        for j in range(i, n):
            node = node.children.get(text[j])
            if node is None:
                break  # no keyword continues with this character
            if node.keywords:
                matched = node.keywords
                end_pos = j + 1  # one past the end of the match
        if matched:
            for kw in matched:
                counters[kw] += 1
            i = end_pos  # skip the matched span: matches never overlap
        else:
            i += 1
    return counters
if __name__ == "__main__":
    keywords = [
        '矮小',
        '安于现状',
        '暗藏',
        '暗淡',
        '暗黑',
    ]
    workbook = Workbook()
    sheet = workbook.active

    # OCR pages 1..108 and join their text in one pass instead of growing
    # a string inside the loop.
    page_texts = [
        allimngs(f"/Users/xxx/output_images2022/page_{page}.png")
        for page in range(1, 109)
    ]
    # Drop spaces and line breaks so a keyword that was split across a line
    # break can still be matched.
    all_text = ''.join(page_texts).replace(' ', '').replace('\n', '')

    result = count_keywords(all_text, keywords)

    # Write one row per keyword (column A: keyword, column B: count),
    # following the order of the keyword list so that the spreadsheet
    # layout is the same on every run.
    for num, k in enumerate(dict.fromkeys(keywords), start=1):
        v = result[k]
        sheet[f'A{num}'] = k
        sheet[f'B{num}'] = v
        print(k, v, num)
    workbook.save(filename='2022.xlsx')