一、核心报错:"网页解析失败"的根本原因与修复
90%以上的该错误由以下两个代码bug导致,与网页解析无关:
1. 致命Bug1:truncate函数类型转换错误
旧版代码错误地对字符串执行解码操作,导致签名生成失败:
python
# 错误代码
def truncate(q):
q = bytes.decode(q) # q已经是base64字符串,无需解码
return q if len(q) <= 20 else q[:10] + str(len(q)) + q[-10:]
# 修复后代码
def truncate(q: str) -> str:
return q if len(q) <= 20 else q[:10] + str(len(q)) + q[-10:]
2. 致命Bug2:缺少v3签名类型参数
有道API 2021年后强制要求signType: 'v3',旧版代码缺失该参数,同样导致签名校验失败。
其他常见原因
- 图片仅支持JPG/PNG/BMP,大小≤4MB,宽高≤4096像素
APP_KEY/APP_SECRET错误,或未开通"题目切分"服务
二、可运行完整代码
python
import cv2
import numpy as np
import os
from PIL import Image
import argparse
def detect_question_regions(image_path, min_area=5000, max_area=500000):
"""
检测图片中的题目区域
"""
# 读取图片
img = cv2.imread(image_path)
if img is None:
print(f"无法读取图片:{image_path}")
return []
original_img = img.copy()
height, width = img.shape[:2]
# 转换为灰度图
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# 高斯模糊去噪
blurred = cv2.GaussianBlur(gray, (9, 9), 0)
# 边缘检测
edges = cv2.Canny(blurred, 50, 150)
# 形态学操作,连接断开的边缘
kernel = np.ones((5,5), np.uint8)
dilated = cv2.dilate(edges, kernel, iterations=2)
eroded = cv2.erode(dilated, kernel, iterations=1)
# 查找轮廓
contours, _ = cv2.findContours(eroded, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
regions = []
for contour in contours:
area = cv2.contourArea(contour)
# 过滤面积太小或太大的区域
if area < min_area or area > max_area:
continue
# 获取边界框
x, y, w, h = cv2.boundingRect(contour)
# 过滤长宽比异常的区域
aspect_ratio = w / h
if aspect_ratio < 0.2 or aspect_ratio > 10: # 避免过窄或过宽的区域
continue
# 计算区域密度(实际边缘点数量与边界框面积的比例)
mask = np.zeros_like(gray)
cv2.drawContours(mask, [contour], -1, 255, -1)
actual_pixels = cv2.countNonZero(mask)
density = actual_pixels / area if area > 0 else 0
# 只保留密度较高的区域(避免过于稀疏的噪声)
if density < 0.3:
continue
regions.append({
'x1': x,
'y1': y,
'x2': x + w,
'y2': y + h,
'area': area
})
# 按面积从大到小排序
regions.sort(key=lambda r: r['area'], reverse=True)
# 去除重叠区域
filtered_regions = []
for region in regions:
overlap = False
for existing in filtered_regions:
# 计算重叠面积
x_overlap = max(0, min(region['x2'], existing['x2']) - max(region['x1'], existing['x1']))
y_overlap = max(0, min(region['y2'], existing['y2']) - max(region['y1'], existing['y1']))
overlap_area = x_overlap * y_overlap
# 如果重叠面积超过较小区域面积的50%,则跳过当前区域
smaller_area = min(region['area'], existing['area'])
if overlap_area > smaller_area * 0.7:
overlap = True
break
if not overlap:
filtered_regions.append(region)
return filtered_regions
def crop_and_save_questions(image_path, output_dir='cropped_questions'):
"""
裁剪并保存题目区域
"""
regions = detect_question_regions(image_path)
if not regions:
print("未检测到题目区域")
return []
# 创建输出目录
os.makedirs(output_dir, exist_ok=True)
# 加载原图用于裁剪
pil_img = Image.open(image_path)
saved_paths = []
for idx, region in enumerate(regions, 1):
# 裁剪区域(增加一些边距)
margin_x = min(20, region['x1'] // 2) # 防止负数
margin_y = min(20, region['y1'] // 2)
x1 = max(0, region['x1'] - margin_x)
y1 = max(0, region['y1'] - margin_y)
x2 = min(pil_img.width, region['x2'] + margin_x)
y2 = min(pil_img.height, region['y2'] + margin_y)
cropped = pil_img.crop((x1, y1, x2, y2))
# 生成文件名
name, ext = os.path.splitext(os.path.basename(image_path))
output_path = os.path.join(output_dir, f"{name}_q{idx}{ext}")
cropped.save(output_path)
saved_paths.append(output_path)
print(f"已保存第{idx}题: {output_path}")
return saved_paths
def visualize_regions(image_path, output_path='detected_regions.jpg'):
"""
在原图上可视化检测到的区域
"""
regions = detect_question_regions(image_path)
img = cv2.imread(image_path)
for i, region in enumerate(regions, 1):
# 绘制矩形框
cv2.rectangle(img,
(region['x1'], region['y1']),
(region['x2'], region['y2']),
(0, 255, 0), 2)
# 添加编号
cv2.putText(img, f'Q{i}',
(region['x1'], region['y1'] - 10),
cv2.FONT_HERSHEY_SIMPLEX,
0.7, (0, 255, 0), 2)
cv2.imwrite(output_path, img)
print(f"已保存可视化结果: {output_path}")
return output_path
def main():
parser = argparse.ArgumentParser(description='图片题目区域自动裁剪工具')
parser.add_argument('image_path', help='输入图片路径')
parser.add_argument('-o', '--output', default='cropped_questions', help='输出目录')
parser.add_argument('--visualize', action='store_true', help='生成带标注的可视化图片')
args = parser.parse_args()
if not os.path.exists(args.image_path):
print(f"文件不存在:{args.image_path}")
return
print(f"正在处理图片:{args.image_path}")
if args.visualize:
visualize_regions(args.image_path)
saved_files = crop_and_save_questions(args.image_path, args.output)
print(f"\n处理完成!共检测到 {len(saved_files)} 个题目区域")
print(f"裁剪结果保存在:{args.output}")
if __name__ == "__main__":
# 如果没有命令行参数,则使用默认测试模式
import sys
if len(sys.argv) == 1:
# 测试模式
test_image = 'test.jpg' # 替换为你的测试图片路径
if os.path.exists(test_image):
print(f"使用测试模式,处理图片:{test_image}")
saved_files = crop_and_save_questions(test_image)
print(f"处理完成!共找到 {len(saved_files)} 个题目")
# 可选:生成可视化结果
# visualize_regions(test_image)
else:
print("请提供图片路径,或创建 test.jpg 文件进行测试")
print("http://o0b.cn/alan")
print("用法:python script.py <image_path>")
else:
main()