python 使用OCR 识别woff字体文件

做数据获取时发现请求返回的数据与 woff 字体有关(文本经过字体映射处理),这里记录一种使用 OCR 进行识别的方法:

1、核心原理是解析 woff 文件,将其中的每个字形渲染成图片,并把字形编码与对应的图片一一关联,便于之后对数据进行解密解析。

2、使用的是muggle_ocr,当然可以使用其他的图片识别技术。

3、对解析结果进行转换或优化,整体识别率可达98%左右。

python 复制代码
# from font_transfer import *
import requests
import json

import io
import os
import threading
import base64
import muggle_ocr
from PIL import Image, ImageDraw, ImageFont
from fontTools.ttLib import TTFont
from fontTools.ttLib.woff2 import decompress
import traceback

# Set the TF_CPP_MIN_LOG_LEVEL environment variable to '2' (presumably to
# quieten TensorFlow's native logging).
# NOTE(review): this runs after `import muggle_ocr` above; if that import
# already loads TensorFlow, the setting may come too late - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'


class FontTransfer:
    """Map the glyphs of an obfuscated web font to the text they depict.

    Every glyph reachable through the font's cmap is drawn onto a small white
    image and handed to the muggle_ocr engine; the recognised text is collected
    per glyph name.  Another OCR engine could be substituted in ``__init__``.
    """

    # Class-level lock; no method of this class currently acquires it.
    _instance_lock = threading.Lock()

    def __init__(self, font_size=20):
        """Create the OCR engine and store the rendering dimensions.

        :param font_size: size passed to ``ImageFont.truetype`` when a font is
            loaded; the square canvas edge is ``font_size + 4``.
        """
        self.font_size = font_size
        self.image_size = self.font_size + 4
        self.ocr = muggle_ocr.SDK(model_type=muggle_ocr.ModelType.OCR)
        self.res_dict = dict()

    def get_chars_from_font(self, ttf):
        """Return the code point -> glyph name pairs that have a drawable glyph.

        :param ttf: an opened font object exposing ``ttf['cmap'].getBestCmap()``
            and a ``ttf['glyf']`` table indexable by glyph name.
        :return: dict mapping code point (int) to glyph name (str), restricted
            to glyphs whose ``xMax`` is truthy.
        """
        # getBestCmap() may yield None when no usable cmap subtable exists.
        best_cmap = ttf['cmap'].getBestCmap() or {}
        # A glyph object without an xMax attribute is treated like xMax == 0
        # (skipped) instead of raising AttributeError.
        return {
            code: name
            for code, name in best_cmap.items()
            if getattr(ttf['glyf'][name], 'xMax', 0)
        }

    def draw_font_word(self, char_unicode, font, v):
        """Render one character with ``font`` and return the OCR result.

        :param char_unicode: the character (str) to draw.
        :param font: font object accepted by ``ImageDraw.text``.
        :param v: glyph name of the character; not used by this method, kept
            for interface compatibility.
        :return: whatever ``self.ocr.predict`` returns for the rendered JPEG
            bytes (presumably the recognised text).
        """
        board = Image.new('RGB', (self.image_size, self.image_size), (255, 255, 255))
        draw = ImageDraw.ImageDraw(board)

        # ImageDraw.textsize was removed in Pillow 10; prefer textbbox when it
        # exists.  Measured from origin (0, 0), the right/bottom edges of the
        # bounding box are used as the text extent in place of textsize.
        if hasattr(draw, 'textbbox'):
            _, _, text_width, text_height = draw.textbbox((0, 0), char_unicode, font=font)
        else:
            text_width, text_height = draw.textsize(char_unicode, font=font)

        # Centre the text on the canvas; the extra -8 shifts it upwards
        # (empirical offset kept from the original implementation).
        middle = self.image_size / 2
        origin = (middle - text_width / 2, middle - text_height / 2 - 8)
        draw.text(origin, char_unicode, font=font, fill=0)

        # The OCR engine is fed encoded image bytes, so serialise in memory.
        img_byte = io.BytesIO()
        board.save(img_byte, format='JPEG')
        return self.ocr.predict(img_byte.getvalue())

    def get_font_transfer_dict(self, font_path="", font_body=""):
        """Run OCR over every mapped glyph of a font.

        Exactly one source is used: ``font_path`` takes precedence over
        ``font_body`` when both are given.

        :param font_path: path of a font file readable by ``TTFont`` and
            ``ImageFont.truetype``.
        :param font_body: base64 string of a font file; it is passed through
            ``fontTools.ttLib.woff2.decompress`` before Pillow loads it.
        :return: dict mapping glyph name to the OCR result for that glyph.
        :raises ValueError: if both ``font_path`` and ``font_body`` are empty.
        """
        if font_path != '':
            ttf = TTFont(font_path)
            char_dict = self.get_chars_from_font(ttf)
            font = ImageFont.truetype(font_path, self.font_size)
        elif font_body != '':
            font_bytes = io.BytesIO(base64.b64decode(font_body))
            ttf = TTFont(font_bytes)
            char_dict = self.get_chars_from_font(ttf)
            # Rewind so decompression starts at the first byte regardless of
            # how far the parser above has read.
            font_bytes.seek(0)
            decompressed = io.BytesIO()
            decompress(font_bytes, decompressed)
            file_bytes = io.BytesIO(decompressed.getbuffer())
            font = ImageFont.truetype(font=file_bytes, size=self.font_size)
        else:
            # Previously this fell through and failed with UnboundLocalError.
            raise ValueError("either font_path or font_body must be provided")

        return {
            name: self.draw_font_word(chr(code), font, name)
            for code, name in char_dict.items()
        }

    def change_font_encode(self, font_body, font_encode):
        """Replace glyph names inside ``font_encode`` with their OCR text.

        :param font_body: base64 string of the font file.
        :param font_encode: text in which glyph names (the keys returned by
            ``get_font_transfer_dict``) appear literally.
        :return: ``font_encode`` with every glyph name substituted.  On any
            exception the traceback is printed and the text is returned in
            whatever state it had reached (best effort).
        """
        try:
            char_dict = self.get_font_transfer_dict(font_body=font_body)
            # Post-OCR substitutions: each key is replaced by its value
            # (presumably characters the OCR model tends to confuse).
            font_change = {'玫': '玖', '参': '叁', '染': '柒', '忏': '仟', '挪': '捌', '青': '壹'}
            font_dict = {
                name: font_change.get(text, text)
                for name, text in char_dict.items()
            }

            # Substitute longer glyph names first so that a name which is a
            # prefix of another (e.g. 'uni1' vs 'uni12') cannot corrupt it.
            for name in sorted(font_dict, key=len, reverse=True):
                font_encode = font_encode.replace(name, font_dict[name])

            return font_encode
        except Exception:
            traceback.print_exc()
            return font_encode

# Module-level FontTransfer instance used by change_font(); created with font_size=50.
fft = FontTransfer(font_size=50)


def change_font(font_body, text):
    """Decode ``text`` using the font supplied as a base64 string.

    The text is first formatted with ``%r`` so that escaped code points show
    up as backslash sequences; then the ``\\U000`` prefix becomes ``uni``,
    single quotes are dropped and any remaining backslashes are removed.  The
    resulting string is passed to ``fft.change_font_encode``.

    :param font_body: base64 string of the font file.
    :param text: the obfuscated text to decode.
    :return: the value returned by ``fft.change_font_encode``.
    """
    escaped = '%r' % text
    # Order matters: the '\\U000' prefix must be rewritten before the
    # remaining backslashes are stripped.
    substitutions = (
        ('\\U000', 'uni'),
        ("'", ''),
        ('\\', ''),
    )
    for old, new in substitutions:
        escaped = escaped.replace(old, new)
    return fft.change_font_encode(font_body=font_body, font_encode=escaped)



# Demo driver: query the listing endpoint and decode every obfuscated name.
# The cookie and all 'xxxxx' values are redacted placeholders that must be
# filled in before the script can run.
cookie = ''
headers = {
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'cookie': cookie
}

json_data = {
    'type': 'HOUSE',
    'pageNum': 1,
    'pageSize': 50,
    'condition': {
        # NOTE: bare-name placeholder; replace with the real community id.
        'communityId': xxxxx,
        'buildingOpenId': 'xxxxxx',
        'unitOpenId': '',
        'floorOpenId': 'xxxxx',
        'bizType': 'HOUSE',
        'keyword': '',
    },
}

# Without a timeout requests waits indefinitely on an unresponsive server.
response = requests.post(
    'https://x.xxxx.com/landlord/cross/v1/communities/buildings',
    headers=headers,
    json=json_data,
    timeout=10,
)
# From here on json_data holds the decoded response body, not the payload.
json_data = response.json()

if json_data['status'] == "0":
    data = json_data['data']
    fontInfo = data['fontInfo']
    # Base64 font body shipped with the response; used to decode each name.
    woffFontBody = fontInfo['woffFontBody']
    results = data['results']
    item_count = len(results)
    for res in results:
        encryptBizName = res['encryptBizName']

        decryptBizName = change_font(woffFontBody, text=encryptBizName)
        print(encryptBizName, decryptBizName)
else:
    # Any other status: dump the whole response for inspection.
    print(json_data)

该部分代码仅用于学习使用。

相关推荐
张小生1803 分钟前
PyCharm中 argparse 库 的使用方法
python·pycharm
秃头佛爷3 分钟前
Python使用PDF相关组件案例详解
python
Dxy12393102164 分钟前
python下载pdf
数据库·python·pdf
叶知安5 分钟前
如何用pycharm连接sagemath?
ide·python·pycharm
weixin_432702269 分钟前
代码随想录算法训练营第五十五天|图论理论基础
数据结构·python·算法·深度优先·图论
y52364815 分钟前
Javascript监控元素样式变化
开发语言·javascript·ecmascript
菜鸟清风16 分钟前
ChromeDriver下载地址
python
deephub28 分钟前
Tokenformer:基于参数标记化的高效可扩展Transformer架构
人工智能·python·深度学习·架构·transformer
Open-AI1 小时前
Python如何判断一个数是几位数
python
IT技术分享社区1 小时前
C#实战:使用腾讯云识别服务轻松提取火车票信息
开发语言·c#·云计算·腾讯云·共识算法