订单截图隐私信息自动脱敏工具
一、工具概述
本工具是基于 Python + OCR 技术开发的电商订单截图隐私自动脱敏工具,针对淘宝、京东、拼多多等主流电商平台订单截图,自动识别并对收货地址、手机号码等敏感信息进行马赛克打码,实现隐私保护。
工具支持单张截图、横向多图拼接、竖向长截图等多种格式,自动分区识别、精准定位打码区域,全程无需人工干预。
二、核心功能
1. 图片加载与识别
- 支持从网络 URL 批量加载订单截图
- 自动识别图片尺寸与截图比例
- 对横向拼接截图自动分区,避免左右区域文字混淆
2. OCR 文字识别与排版还原
- 使用 Tesseract OCR 识别中文、数字、字母
- 按文字坐标自动分行、分列,还原截图排版结构
- 根据字符间距智能判断是否添加空格,提升文本可读性
3. 隐私信息智能识别
(1)手机号码识别
- 识别 11 位国内手机号
- 兼容含 `*`/`x` 等脱敏格式号码
- 自动过滤订单号、快递单号、金额等干扰数字
(2)收货地址识别
- 识别包含省、市、区、街道、路、小区、楼栋、单元、门牌号的完整地址
- 支持单行地址、多行连续地址
- 自动清洗特殊符号、乱码、空格等干扰内容
- 采用关键词匹配 + 地理库解析双重校验,降低误判率
(3)干扰信息自动过滤
- 自动过滤订单关键词:订单、实付、运费、物流、合计、时间、商家等
- 自动过滤店铺名称:旗舰店、专营店、工厂店、商城等,避免误打码
4. 自动马赛克打码
- 对识别出的手机号区域自动打码
- 对识别出的收货地址区域自动打码
- 打码区域自动外扩留白,确保隐私完全覆盖
- 横向多图分区精准打码,不跨区、不错位
5. 批量处理与结果输出
- 支持批量 URL 列表自动处理
- 处理结果自动保存至 `output` 目录
- 控制台实时输出识别日志与打码区域
- 处理完成后自动预览结果图片
三、适用场景
- 电商订单截图隐私脱敏
- 订单截图分享前隐私保护
- 订单截图批量归档与数据安全处理
- 横向/竖向拼接订单截图自动脱敏
- 淘宝、京东、拼多多、抖音电商等多平台订单统一处理
四、处理流程
- 加载网络图片
- 图片尺寸与分区判断
- OCR 文字识别与坐标提取
- 文本分行、间距处理
- 手机号码识别
- 收货地址识别与过滤
- 对应区域自动马赛克打码
- 结果图片保存与预览
五、运行环境
- Python 3.8 及以上
- 依赖库:opencv-python、numpy、pytesseract、pillow、requests、jionlp
- 需安装 Tesseract-OCR 并配置中文语言包
六、工具特点
- 全自动处理:无需框选、无需配置,一键批量运行
- 高识别准确率:双重地址识别规则,降低误判、漏判
- 多格式兼容:支持单张、横向 2~4 张拼接、竖向长截图
- 精准打码:基于文字坐标区域打码,不跨区、不漏覆盖
- 本地安全:所有识别与打码在本地完成,不上传任何数据
- 易部署使用:代码结构清晰,可直接集成到业务系统
bash
#!/usr/bin/env python
#-*- coding: utf-8 -*-
# vim:fenc=utf-8
# @author tlwlmy
# @version 2026-04-07
import cv2
import numpy as np
import pytesseract
import re
import requests
from io import BytesIO
from PIL import Image
from collections import defaultdict
import os
import jionlp as jio
# ===================== 配置 =====================
# Tesseract language code passed as ``lang`` to pytesseract.image_to_data
# ('chi_sim' is the simplified-Chinese language pack).
OCR_LANG = 'chi_sim'
# Matches a phone-number-like token that is neither preceded nor followed by
# a digit. Two alternatives:
#   1. ``1``, one digit in 3-9, then 8 or 9 more digits (10 or 11 digits total);
#   2. any stretch of text containing a run of 4+ ``*``, ``+`` or ``x``
#      characters (intended for already-masked numbers such as 138****1234).
# NOTE(review): alternative 2 requires no digits at all, so any line that
# contains e.g. "xxxx" or "****" matches -- confirm this breadth is intended.
# NOTE(review): alternative 1 also accepts 10-digit numbers; possibly a
# deliberate tolerance for OCR dropping a digit -- confirm.
PHONE_REGEX = re.compile(r'(?<!\d)(?:1[3-9]\d{8,9}|.*?[*+x]{4,}.*?)(?!\d)')
# Matches a whole string made of 2-4 CJK characters with optional surrounding
# whitespace (a candidate person name).
# NOTE(review): not referenced by any function visible in this file.
NAME_REGEX = re.compile(r'^\s*[\u4e00-\u9fa5]{2,4}\s*$')
# Field labels that look like short names but are not.
# NOTE(review): not referenced by any function visible in this file.
EXCLUDE_NAMES = {"收货人","收件人","姓名","收货","地址","订单","买家","卖家"}
# ===================== 安全马赛克 =====================
def mosaic_region(img, y, line_h, x1, x2):
    """Pixelate a horizontal band of ``img`` in place and return ``img``.

    The band covers rows ``y - 8`` to ``y + line_h + 8`` (8 px of vertical
    padding on each side) and columns ``x1`` to ``x2``. Both ranges are
    clamped to the image bounds; if the clamped band is empty the image is
    returned untouched.

    Args:
        img: Image array indexed as ``img[row, col]``; it is modified in place.
        y: Top row of the text line to hide.
        line_h: Height of the text line in pixels.
        x1: Left column of the band (inclusive).
        x2: Right column of the band (exclusive).

    Returns:
        The same ``img`` object that was passed in.
    """
    h, w = img.shape[:2]
    padding = 8
    y1 = max(0, y - padding)
    y2 = min(h, y + line_h + padding)
    # Clamp the columns as well: a negative start would wrap around when
    # slicing, and an end beyond the image width would make the ROI narrower
    # than (x2 - x1), so the upscaled mosaic could not be written back.
    x1 = max(0, x1)
    x2 = min(w, x2)
    if y2 <= y1 or x2 <= x1:
        return img
    roi = img[y1:y2, x1:x2]
    if roi.size <= 0:
        return img
    # Shrink to a fixed 16x16 grid, then blow back up with nearest-neighbour
    # sampling so each grid cell becomes one flat-coloured block.
    block = 16
    small = cv2.resize(roi, (block, block), interpolation=cv2.INTER_NEAREST)
    mosaic = cv2.resize(small, (x2 - x1, y2 - y1), interpolation=cv2.INTER_NEAREST)
    img[y1:y2, x1:x2] = mosaic
    return img
def is_valid_address(address: str) -> bool:
    """Heuristically decide whether ``address`` looks like a postal address.

    The text is rejected when it contains an order-related keyword or when the
    stripped text contains a shop-name marker. Otherwise it is split on
    whitespace; each piece is reduced to CJK characters, ASCII letters and
    digits, and the text is accepted as soon as one piece of at least six
    characters contains an address suffix (province, city, road, building,
    ...) preceded by at least one such character.

    Args:
        address: One line of recognised text.

    Returns:
        True if any piece looks like an address, otherwise False.
    """
    stripped = address.strip()

    # Lines that talk about the order itself are never addresses.
    order_words = (
        "订单", "买家", "卖家", "商家", "实付", "运费", "快递", "物流",
        "发货", "售后", "合计", "小计", "支付宝", "交易", "时间",
    )
    if any(word in address for word in order_words):
        return False

    # Shop names must not be mistaken for addresses.
    shop_re = re.compile(
        r'.*(?:旗舰店|专营店|专卖店|直营店|官方店|生活馆|精品店|百货店|好物店|优品店|严选|商城|超市|铺|小店|工厂店|源头厂|厂家直供|多多|包邮特卖|土特产|批发店|秒杀店|女装店|男装店|美妆店|母婴店|零食店|数码店|家纺店|车品店|文具店|家电店|宠物用品店).*$',
        re.IGNORECASE
    )
    if shop_re.fullmatch(stripped) is not None:
        return False

    # A piece qualifies when an address suffix follows at least one CJK
    # character, digit or ASCII letter somewhere inside it.
    suffix_re = re.compile(
        r'^.*[\u4e00-\u9fa5\dA-Za-z]+(省|市|区|县|镇|乡|街道|路|街|巷|弄|里|大道|号楼|单元|室|层|苑|花园|小区|公寓|栋|座|坊|邨|村|社区|组).*$',
        re.IGNORECASE
    )
    for chunk in stripped.split():
        cleaned = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', chunk)
        # Short fragments carry too little context to be judged.
        if len(cleaned) >= 6 and suffix_re.fullmatch(cleaned):
            return True
    return False
# ===================== 地址匹配 =====================
def is_address_line(line_text):
    """Return True when jionlp recognises a location in ``line_text``.

    Before calling jionlp the line is screened: after removing spaces,
    ASCII parentheses and digits it must still have at least four characters
    and must not contain any order-related keyword.

    Args:
        line_text: One line of recognised text.

    Returns:
        True if ``jio.parse_location`` reports at least one administrative
        level for the line; False otherwise, including when the parser raises.
    """
    text = line_text.strip().replace(" ", "").replace("(", "").replace(")", "")
    text = re.sub(r'\d+', '', text)
    if len(text) < 4:
        return False
    order_keywords = (
        "订单", "买家", "卖家", "商家", "实付", "运费", "快递", "物流",
        "发货", "售后", "合计", "小计", "支付宝", "交易", "时间",
    )
    if any(kw in text for kw in order_keywords):
        return False
    # NOTE(review): the 'town' and 'village' keys are presumably only filled
    # in when parse_location is called with town_village=True -- confirm
    # against the jionlp documentation.
    levels = ('province', 'city', 'county', 'town', 'village')
    try:
        res = jio.parse_location(line_text)
        return any(res.get(level) is not None for level in levels)
    except Exception:
        # Best effort: a parser failure means "not an address". Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit through.
        return False
# ===================== OCR行合并:返回 文本 + y + h + 子图x1x2 =====================
def _split_sub_regions(full_w, full_h):
    """Return the (x1, x2) column span of each side-by-side sub-image."""
    # A non-positive height cannot give a meaningful ratio; treat the image
    # as a single region instead of dividing by zero.
    aspect = full_w / full_h if full_h > 0 else 0.0
    if aspect < 0.9:
        n = 1
    elif aspect < 1.4:
        n = 2
    elif aspect < 1.95:
        n = 3
    else:
        n = 4
    pw = full_w // n
    # The last region absorbs the remainder of the integer division so that
    # the right-most columns always belong to a region.
    return [(i * pw, full_w if i == n - 1 else (i + 1) * pw) for i in range(n)]


def group_ocr_by_line(data, full_w, full_h, line_height_threshold=20):
    """Group OCR words into text lines, separately for each sub-image.

    The image is split into 1-4 equal-width vertical regions according to its
    width/height ratio (< 0.9 -> 1, < 1.4 -> 2, < 1.95 -> 3, otherwise 4).
    Words whose ``top`` lies within ``line_height_threshold`` pixels of an
    existing line's anchor join that line; within one line and region the
    words are ordered by ``left`` and concatenated, inserting one space for
    every two character-widths of horizontal gap.

    Args:
        data: Mapping with parallel lists under the keys ``"text"``,
            ``"left"``, ``"top"``, ``"width"`` and ``"height"``.
        full_w: Image width in pixels.
        full_h: Image height in pixels.
        line_height_threshold: Maximum vertical distance, in pixels, between
            a word and a line anchor for the word to join that line.

    Returns:
        A list of ``(text, y, h, x1, x2)`` tuples ordered by line anchor and
        then by region index. ``y`` and ``h`` are the top and height of the
        first word seen on the line; ``x1`` and ``x2`` are the region bounds.
    """
    sub_regions = _split_sub_regions(full_w, full_h)
    last_idx = len(sub_regions) - 1

    group = defaultdict(lambda: defaultdict(list))
    line_pos = {}
    for i, word in enumerate(data["text"]):
        word = word.strip()
        if not word:
            continue
        x = data["left"][i]
        y = data["top"][i]
        h = data["height"][i]
        w = data["width"][i]

        # Attach the word to the first known line that is close enough,
        # otherwise start a new line anchored at this word.
        ly = next(
            (cy for cy in line_pos if abs(y - cy) < line_height_threshold),
            None,
        )
        if ly is None:
            ly = y
            line_pos[ly] = (y, h)

        # Pick the region containing the word's left edge; coordinates
        # outside the image fall back to the nearest edge region.
        sub_idx = last_idx if x >= sub_regions[-1][1] else 0
        for idx, (sx1, sx2) in enumerate(sub_regions):
            if sx1 <= x < sx2:
                sub_idx = idx
                break
        group[ly][sub_idx].append((x, w, word))

    res = []
    for ly in sorted(group):
        y_pos, h_pos = line_pos[ly]
        for idx in sorted(group[ly]):
            pieces = []
            prev_x2 = None  # right edge of the previous word
            for x, w, word in sorted(group[ly][idx], key=lambda v: v[0]):
                # w / len(word) approximates one character's width; a
                # zero-width box gives no scale, so no spaces are added.
                if prev_x2 is not None and w > 0:
                    gap = x - prev_x2
                    pieces.append(" " * max(0, gap * len(word) // (w * 2)))
                pieces.append(word)
                prev_x2 = x + w
            x1, x2 = sub_regions[idx]
            res.append(("".join(pieces), y_pos, h_pos, x1, x2))
    return res
# ===================== 处理单张图片 =====================
def process_one(img_url, save_path):
    """Download one screenshot, mosaic phone and address lines, and save it.

    Every OCR line matching ``PHONE_REGEX`` is pixelated together with the
    closest preceding line of the same sub-image; every other line accepted
    by ``is_address_line`` or ``is_valid_address`` is pixelated on its own.

    Args:
        img_url: HTTP(S) URL of the screenshot.
        save_path: File path the processed image is written to.

    Returns:
        The processed image array.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the downloaded bytes cannot be decoded as an image.
        OSError: If the result cannot be written to ``save_path``.
    """
    resp = requests.get(img_url, timeout=20)
    # Fail here with a clear HTTP error instead of trying to decode an
    # error page as an image further down.
    resp.raise_for_status()
    img = cv2.imdecode(np.frombuffer(resp.content, np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        raise ValueError(f"cannot decode image downloaded from {img_url!r}")
    result = img.copy()
    H, W = img.shape[:2]
    print(f"尺寸:{W}x{H}")
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    data = pytesseract.image_to_data(rgb, lang=OCR_LANG, output_type=pytesseract.Output.DICT)
    # Each entry already carries the x-range of the sub-image it belongs to.
    lines = group_ocr_by_line(data, W, H)
    for idx, (text, y, h, x1_region, x2_region) in enumerate(lines):
        t = text.strip()
        if PHONE_REGEX.search(t):
            mosaic_region(result, y - 10, h + 20, x1_region, x2_region)
            # Also hide the line just above the phone number. The entry at
            # idx - 1 may be the same row of a neighbouring sub-image, so
            # walk back to the nearest entry of this sub-image instead.
            for _, py, ph, px1, px2 in reversed(lines[:idx]):
                if (px1, px2) == (x1_region, x2_region):
                    mosaic_region(result, py, ph, px1, px2)
                    break
            print(f"区域[{x1_region}-{x2_region}] {t}")
            print(f" 手机号 → 打码")
        elif is_address_line(t) or is_valid_address(t):
            mosaic_region(result, y - 10, h + 20, x1_region, x2_region)
            print(f"区域[{x1_region}-{x2_region}] {t}")
            print(f" 地址 → 打码")
    # cv2.imwrite reports failure through its return value; surface it so a
    # missing output file is not mistaken for success.
    if not cv2.imwrite(save_path, result):
        raise OSError(f"failed to write image to {save_path!r}")
    return result
# ===================== 批量处理 =====================
if __name__ == "__main__":
    # Placeholder URLs: replace each "xxx" with a real screenshot URL.
    IMAGE_URLS = [
        "xxx",  # single screenshot
        "xxx",  # two screenshots side by side
        "xxx",  # two screenshots stacked vertically
    ]
    os.makedirs("output", exist_ok=True)
    # Process every URL in order, saving each result under output/ and
    # opening a preview window for it.
    for seq, url in enumerate(IMAGE_URLS, start=1):
        print(f"\n===== 处理第 {seq} 张 =====")
        out_img = process_one(url, f"output/result_{seq:02d}.png")
        cv2.imshow(f"结果 {seq}", out_img)
    # Give the GUI event loop one tick, then close all preview windows.
    cv2.waitKey(1)
    cv2.destroyAllWindows()
    print("\n✅ 全部处理完成!已保存到 output 文件夹")