import base64
import io
from PIL import Image
import pyautogui
import time
from datetime import datetime
from ollama import chat
import os
import json


def capture_screenshot(region=None):
    """
    Capture a screenshot of the screen.

    Args:
        region: capture region as (left, top, width, height); full screen if None
    Returns:
        PIL.Image: the screenshot image, or None on failure
    """
    try:
        if region:
            screenshot = pyautogui.screenshot(region=region)
        else:
            screenshot = pyautogui.screenshot()
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Screenshot captured")
        return screenshot
    except Exception as e:
        print(f"[ERROR] Screenshot failed: {e}")
        return None


def image_to_base64(image):
    """
    Convert a PIL image to a base64-encoded string.

    Args:
        image: PIL image object
    Returns:
        str: base64-encoded PNG data
    """
    buffer = io.BytesIO()
    image.save(buffer, format='PNG')
    return base64.b64encode(buffer.getvalue()).decode('utf-8')


def ollama_vlm_ocr(image=None, region=None, model='glm-ocr', prompt=None):
    """
    Run OCR with an Ollama vision-language model.

    Args:
        image: PIL image object; a screenshot is taken automatically if None
        region: capture region as (left, top, width, height)
        model: Ollama model name, default 'glm-ocr'
        prompt: custom prompt; defaults to a generic OCR prompt
    Returns:
        dict: recognition result
    """
    # Take a screenshot if no image was provided
    if image is None:
        image = capture_screenshot(region)
        if image is None:
            return {"success": False, "error": "Screenshot failed"}
    # Convert the image to base64
    try:
        image_base64 = image_to_base64(image)
    except Exception as e:
        return {"success": False, "error": f"Image conversion failed: {e}"}
    # Default prompt
    if prompt is None:
        prompt = (
            "Please recognize all text in this image and output it in "
            "top-to-bottom, left-to-right order. For tables or special "
            "formats, preserve the original structure as much as possible."
        )
    # Build the message payload containing both the prompt and the image
    messages = [
        {
            'role': 'user',
            'content': prompt,
            'images': [image_base64]
        }
    ]
    try:
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Calling Ollama model '{model}' for OCR...")
        response = chat(
            model=model,
            messages=messages,
        )
        result_text = response.message.content
        print(f"[{datetime.now().strftime('%H:%M:%S')}] OCR finished")
        return {
            "success": True,
            "text": result_text,
            "model": model,
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        print(f"[ERROR] Ollama call failed: {e}")
        return {"success": False, "error": str(e)}


def save_ocr_result(result, filename=None):
    """
    Save an OCR result to a JSON file.

    Args:
        result: OCR result dict
        filename: output file name; auto-generated if None
    """
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"ocr_result_{timestamp}.json"
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f"OCR result saved to: {filename}")
        return filename
    except Exception as e:
        print(f"[ERROR] Failed to save file: {e}")
        return None


def auto_ocr_cycle(interval=5, region=None, model='glm-ocr'):
    """
    Run OCR in a loop: capture a screenshot and recognize it at a fixed interval.

    Args:
        interval: seconds between captures
        region: capture region
        model: OCR model name
    """
    print("=== Automatic OCR started ===")
    print(f"Capture interval: {interval}s")
    print(f"Model: {model}")
    if region:
        print(f"Capture region: {region}")
    else:
        print("Capture region: full screen")
    print("Press Ctrl+C to quit\n")
    try:
        while True:
            start_time = time.time()
            # Run one OCR pass
            result = ollama_vlm_ocr(region=region, model=model)
            if result["success"]:
                print(f"Recognized text:\n{result['text']}\n")
                # Save the result
                save_ocr_result(result)
            else:
                print(f"OCR failed: {result['error']}\n")
            # Keep the loop close to the requested interval
            elapsed = time.time() - start_time
            sleep_time = max(0, interval - elapsed)
            if sleep_time > 0:
                time.sleep(sleep_time)
    except KeyboardInterrupt:
        print("\n\nProgram exited")


# Example usage
if __name__ == "__main__":
    # Method 1: single OCR pass (full screen)
    print("=== Single OCR example ===")
    result = ollama_vlm_ocr(model='glm-ocr')
    if result["success"]:
        print("Recognized text:")
        print(result["text"])
        save_ocr_result(result)
    print("\n" + "="*50 + "\n")

    # Method 2: OCR on a specific screen region
    # result = ollama_vlm_ocr(region=(100, 100, 800, 600))

    # Method 3: custom prompt
    # custom_prompt = "Please recognize the mathematical formulas and numbers in the image"
    # result = ollama_vlm_ocr(prompt=custom_prompt)

    # Method 4: start the automatic OCR loop
    # auto_ocr_cycle(interval=10)  # run every 10 seconds
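
    # Method 5: OCR on an image file loaded from disk instead of a screenshot.
    # A minimal sketch using the functions defined above: 'example.png' is a
    # placeholder path, and it assumes the same 'glm-ocr' model is already
    # available in the local Ollama instance.
    # img = Image.open("example.png")
    # result = ollama_vlm_ocr(image=img, model='glm-ocr')
    # if result["success"]:
    #     print(result["text"])
    #     save_ocr_result(result)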