在做安**的数据获取时,发现接口返回的数据经过了woff字体混淆,这里记录一种使用OCR识别来还原文字的方法:
1、核心原理是解析woff文件,把其中的每个字形渲染成图片,再将字体编码与字形图片一一对应,便于之后对数据进行解密解析。
2、使用的是muggle_ocr,当然可以使用其他的图片识别技术。
3、对解析结果进行转换或优化,整体识别率可达98%左右。
代码如下(Python):
# from font_transfer import *
import base64
import io
import json
import os
import threading
import traceback

# TF_CPP_MIN_LOG_LEVEL only takes effect if it is already in the environment
# when TensorFlow's native library is loaded. muggle_ocr presumably pulls in
# TensorFlow at import time (TODO confirm), so the variable is set *before*
# the third-party imports instead of after them.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import muggle_ocr
import requests
from fontTools.ttLib import TTFont
from fontTools.ttLib.woff2 import decompress
from PIL import Image, ImageDraw, ImageFont
class FontTransfer:
    """Decode font-obfuscated text by rendering each glyph and running OCR on it.

    For a given font, every code point in the font's best cmap is drawn onto a
    small white image and handed to the muggle_ocr engine. The resulting
    ``glyph name -> recognised text`` table is then used to substitute glyph
    names that appear in an encoded string.
    """

    # Not used by any method of this class; kept so that external code
    # referring to ``FontTransfer._instance_lock`` keeps working.
    _instance_lock = threading.Lock()

    # Recognition results that are known misreadings, mapped to the character
    # that should be used instead.
    _OCR_CORRECTIONS = {'玫': '玖', '参': '叁', '染': '柒', '忏': '仟', '挪': '捌', '青': '壹'}

    def __init__(self, font_size=20):
        """Create the OCR engine and remember the rendering size.

        :param font_size: size passed to ``ImageFont.truetype`` when loading
            the font; the square canvas is ``font_size + 4`` on each side.
        """
        self.font_size = font_size
        self.image_size = self.font_size + 4
        # Another OCR engine can be substituted here, as long as it is driven
        # the same way in draw_font_word.
        self.ocr = muggle_ocr.SDK(model_type=muggle_ocr.ModelType.OCR)
        # Never written by the methods below; kept for backward compatibility.
        self.res_dict = dict()

    def get_chars_from_font(self, ttf):
        """Return the code points of *ttf* whose glyph has a non-zero ``xMax``.

        :param ttf: an opened font object exposing ``ttf['cmap'].getBestCmap()``
            and ``ttf['glyf'][glyph_name]`` (e.g. a fontTools ``TTFont``).
        :return: dict mapping code point (int) to glyph name (str).
        """
        # getBestCmap() may return None when the font has no usable Unicode
        # cmap subtable; treat that as "no characters".
        best_cmap = ttf['cmap'].getBestCmap() or {}
        glyph_table = ttf['glyf']
        # Glyphs without an outline (space, for example) may carry no xMax
        # attribute at all, so read it defensively instead of raising
        # AttributeError; they are filtered out just like xMax == 0.
        return {
            code_point: glyph_name
            for code_point, glyph_name in best_cmap.items()
            if getattr(glyph_table[glyph_name], 'xMax', 0)
        }

    def draw_font_word(self, char_unicode, font, v):
        """Render one character on a white canvas and return the OCR result.

        :param char_unicode: the character to draw (str).
        :param font: a PIL font object used for drawing.
        :param v: glyph name of the character; not used by the rendering
            itself, accepted for backward compatibility with callers.
        :return: whatever ``self.ocr.predict`` returns for the JPEG bytes.
        """
        board = Image.new('RGB', (self.image_size, self.image_size), (255, 255, 255))
        draw = ImageDraw.Draw(board)

        if hasattr(draw, 'textsize'):
            # Older Pillow: keep the exact measurement the layout was tuned for.
            text_width, text_height = draw.textsize(char_unicode, font=font)
        else:
            # Pillow 10 removed textsize(). The right/bottom edges of the
            # bounding box anchored at the origin give the matching
            # measurement for single-line text.
            _, _, text_width, text_height = draw.textbbox((0, 0), char_unicode, font=font)

        # Centre the glyph on the canvas; the extra -8 vertical shift is the
        # hand-tuned offset carried over from the original layout.
        half = self.image_size / 2
        origin = (half - text_width / 2, half - text_height / 2 - 8)
        draw.text(origin, char_unicode, font=font, fill=0)

        jpeg_buffer = io.BytesIO()
        board.save(jpeg_buffer, format='JPEG')
        return self.ocr.predict(jpeg_buffer.getvalue())

    def get_font_transfer_dict(self, font_path="", font_body=""):
        """Build the ``glyph name -> OCR result`` table for a font.

        Exactly one source is used; ``font_path`` wins when both are given.

        :param font_path: path of a font file on disk.
        :param font_body: base64 string of a WOFF/WOFF2 font file.
        :return: dict mapping glyph name to the OCR result of its rendering.
        :raises ValueError: if neither ``font_path`` nor ``font_body`` is given.
        """
        if not font_path and not font_body:
            # Previously this fell through to an UnboundLocalError.
            raise ValueError("either font_path or font_body must be provided")

        if font_path:
            ttf = TTFont(font_path)
            char_dict = self.get_chars_from_font(ttf)
            font = ImageFont.truetype(font_path, self.font_size)
        else:
            compressed = io.BytesIO(base64.b64decode(font_body))
            ttf = TTFont(compressed)
            char_dict = self.get_chars_from_font(ttf)
            # The web font is first converted to a plain sfnt stream via
            # fontTools' decompress(); PIL loads that converted copy.
            plain = io.BytesIO()
            decompress(compressed, plain)
            font = ImageFont.truetype(font=io.BytesIO(plain.getvalue()), size=self.font_size)

        return {
            glyph_name: self.draw_font_word(chr(code_point), font, glyph_name)
            for code_point, glyph_name in char_dict.items()
        }

    def change_font_encode(self, font_body, font_encode):
        """Replace every glyph name found in *font_encode* by its recognised text.

        :param font_body: base64 string of the font used to obfuscate the text.
        :param font_encode: text in which characters appear as glyph names.
        :return: the decoded text. On any failure the traceback is printed and
            the text is returned in whatever state it had reached (best effort).
        """
        try:
            glyph_to_text = self.get_font_transfer_dict(font_body=font_body)
            # Substitute longer glyph names first so that a name which is a
            # prefix of another (uni1234 / uni12345) cannot clobber it.
            for glyph_name in sorted(glyph_to_text, key=len, reverse=True):
                recognised = glyph_to_text[glyph_name]
                replacement = self._OCR_CORRECTIONS.get(recognised, recognised)
                font_encode = font_encode.replace(glyph_name, replacement)
            return font_encode
        except Exception:
            # Deliberate best effort: report the problem, keep the caller running.
            traceback.print_exc()
            return font_encode
# Module-level decoder shared by every change_font() call, built with font_size=50.
fft = FontTransfer(font_size=50)


def change_font(font_body, text):
    """Decode *text* that was obfuscated with the font given in *font_body*.

    The text is first turned into its escaped ``%r`` form, then ``\\U000``
    escape prefixes become ``uni``, and single quotes and any remaining
    backslashes are stripped. The result is passed to
    ``fft.change_font_encode`` for glyph-name substitution.

    :param font_body: base64 string of the font file.
    :param text: the obfuscated text.
    :return: the value returned by ``fft.change_font_encode``.
    """
    escaped = '%r' % text
    # Order matters: the \U000 prefix must be rewritten before backslashes go.
    for needle, substitute in (('\\U000', 'uni'), ("'", ''), ('\\', '')):
        escaped = escaped.replace(needle, substitute)
    return fft.change_font_encode(font_body=font_body, font_encode=escaped)
# Cookie header value sent with the request; empty in this listing.
cookie = ''
headers = {
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'cookie': cookie
}
# Request payload. NOTE(review): the xxxxx values (including the bare name used
# for communityId) look like redacted placeholders and must be replaced with
# real values before this can run.
json_data = {
    'type': 'HOUSE',
    'pageNum': 1,
    'pageSize': 50,
    'condition': {
        'communityId': xxxxx,
        'buildingOpenId': 'xxxxxx',
        'unitOpenId': '',
        'floorOpenId': 'xxxxx',
        'bizType': 'HOUSE',
        'keyword': '',
    },
}
# A timeout is set so an unresponsive server cannot block the script forever;
# without one, requests waits indefinitely.
response = requests.post(
    'https://x.xxxx.com/landlord/cross/v1/communities/buildings',
    headers=headers,
    json=json_data,
    timeout=10,
)
# From here on json_data holds the parsed response, not the request payload.
json_data = json.loads(response.text)
if json_data['status'] == "0":
    data = json_data['data']
    # The response carries the font used to obfuscate the text fields.
    fontInfo = data['fontInfo']
    woffFontBody = fontInfo['woffFontBody']
    results = data['results']
    item_count = len(results)
    for res in results:
        encryptBizName = res['encryptBizName']
        decryptBizName = change_font(woffFontBody, text=encryptBizName)
        # Show the obfuscated value next to its decoded form.
        print(encryptBizName, decryptBizName)
else:
    # Any other status: dump the whole response for inspection.
    print(json_data)
以上代码仅供学习使用。