1. Background
Suppose we have a standard document image: text recognition, layout analysis, and other downstream tasks are then relatively easy. However, when the image is captured with a phone camera, it usually contains shadows, moiré patterns, and page curvature.
So how can we start from a standard document and obtain images that look as if they were taken with a camera?
This post introduces document data augmentation: using standard documents to simulate camera-capture conditions. The method not only augments data for document scenarios such as OCR detection and recognition, it can also synthesize paired training images for tasks such as document shadow removal, moiré removal, and document dewarping.
2. Results
First, here is a PDF screenshot together with its annotations (the red boxes are the annotation boxes):
Next, we add shadows, moiré, and warping to the standard image; the results are shown below:
Moiré + warping, with the annotation points mapped onto the warped image, as shown below:
Shadow + warping, with the annotation points mapped onto the warped image, as shown below:
3. Algorithm and Code Implementation
Principle: use a rendering tool (Blender is recommended) to render various warps, shadows, and moiré patterns, then composite them onto the PDF image.
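Blender gives the most realistic renders, but as a rough illustration of the compositing idea, here is a minimal procedural sketch that fakes a moiré pattern by overlaying two slightly rotated sinusoidal gratings on the document image. The function `add_moire` and all of its parameters are illustrative assumptions, not part of the rendering pipeline described above.

```python
# Minimal sketch: procedural moire overlay (illustration only; the actual pipeline
# renders shadow/moire/warp layers in Blender and composites them onto the PDF image).
import cv2
import numpy as np

def add_moire(img, period=4.0, angle_deg=5.0, strength=0.15):
    """Overlay two slightly rotated sinusoidal gratings; their interference
    produces a coarse moire-like pattern on top of the document image."""
    h, w = img.shape[:2]
    ys, xs = np.mgrid[0:h, 0:w].astype(np.float32)
    theta = np.deg2rad(angle_deg)
    g1 = np.sin(2 * np.pi * xs / period)
    g2 = np.sin(2 * np.pi * (xs * np.cos(theta) + ys * np.sin(theta)) / period)
    pattern = (g1 + g2) / 2.0                      # interference of the two gratings
    overlay = 1.0 + strength * pattern[..., None]  # modulate brightness around 1.0
    out = np.clip(img.astype(np.float32) * overlay, 0, 255)
    return out.astype(np.uint8)

if __name__ == "__main__":
    doc = cv2.imread("test.png")  # the flat document image used throughout this post
    cv2.imwrite("moire_demo.jpg", add_moire(doc))
```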
Finally, here is the code (only a basic version; the full version is considerably more complex):
```python
import os
import cv2
import json
import random
import numpy as np
from scipy.interpolate import LinearNDInterpolator as linterp
from scipy.interpolate import NearestNDInterpolator as nearest


class LinearNDInterpolatorExt(object):
    """Linear interpolation that falls back to nearest-neighbor outside the convex hull."""
    def __init__(self, points, values):
        self.funcinterp = linterp(points, values)
        self.funcnearest = nearest(points, values)

    def __call__(self, *args):
        z = self.funcinterp(*args)
        chk = np.isnan(z)
        if chk.any():
            # linear interpolation returns NaN outside the convex hull; use nearest-neighbor there
            return np.where(chk, self.funcnearest(*args), z)
        else:
            return z
def crop_flow_from_nan(flow):
    """Crop the flow to its valid (non-NaN) bounding box, then trim border rows/columns
    that are still mostly invalid."""
    mask = ~np.any(np.isnan(flow), -1)
    x, y, w, h = cv2.boundingRect(mask.astype(np.uint8))
    flow = flow[y: y + h, x: x + w]
    mask = mask[y: y + h, x: x + w]
    max_nonzero_ratio = 0.9
    max_crop_size = 20
    mask_h, mask_w = mask.shape[0], mask.shape[1]
    # trim from the top/bottom until a row is at least 90% valid (at most 20 rows per side)
    y0 = max_crop_size
    for i in range(0, max_crop_size):
        if np.count_nonzero(mask[i]) / mask_w > max_nonzero_ratio:
            y0 = i
            break
    y1 = mask_h - 1 - max_crop_size
    for i in range(mask_h - 1, y1, -1):
        if np.count_nonzero(mask[i]) / mask_w > max_nonzero_ratio:
            y1 = i
            break
    crop_mask = mask[y0:y1]
    mask_h, mask_w = crop_mask.shape[0], crop_mask.shape[1]
    # trim from the left/right in the same way, using the row-cropped mask
    x0 = max_crop_size
    for i in range(0, max_crop_size):
        if np.count_nonzero(crop_mask[:, i]) / mask_h > max_nonzero_ratio:
            x0 = i
            break
    x1 = mask_w - 1 - max_crop_size
    for i in range(mask_w - 1, x1, -1):
        if np.count_nonzero(crop_mask[:, i]) / mask_h > max_nonzero_ratio:
            x1 = i
            break
    flow = flow[y0:y1, x0:x1]
    return flow
def flow_2_points(flow, pts):
    """
    Map points through the flow field by inverse lookup.
    :param flow: forward or backward mapping field, values in range (-1, 1)
    :param pts: points in the target (or source) image, normalized to range (0, 1), shape (n, 2)
    :return: corresponding points in the source (or target) image, normalized to range (0, 1), shape (n, 2)
    """
    mask = ~np.any(np.isnan(flow), -1)
    flow_masked = flow[mask]
    flow_w, flow_h = flow.shape[1], flow.shape[0]
    flow_xrange = np.arange(flow_w, dtype=np.float32)
    flow_yrange = np.arange(flow_h, dtype=np.float32)
    flow_xgrid, flow_ygrid = np.meshgrid(flow_xrange, flow_yrange)
    flow_xgrid_masked = flow_xgrid[mask]
    flow_ygrid_masked = flow_ygrid[mask]
    src_pts = (pts - 0.5) * 2  # (0, 1) -> (-1, 1), same range as the flow values
    # interpolate from flow values (normalized source coordinates) back to grid positions
    interpX = LinearNDInterpolatorExt(np.reshape(flow_masked, [-1, 2]), flow_xgrid_masked.reshape(-1))
    interpY = LinearNDInterpolatorExt(np.reshape(flow_masked, [-1, 2]), flow_ygrid_masked.reshape(-1))
    fm_x = interpX(src_pts)
    fm_y = interpY(src_pts)
    # fm_x, fm_y are in (0, flow_w-1) and (0, flow_h-1); normalize to (0, 1)
    fm_x = fm_x / (flow_w - 1)
    fm_y = fm_y / (flow_h - 1)
    return np.stack((fm_x, fm_y), axis=-1)
def warp_img(img, flow, points_list):
    h, w, _ = img.shape
    flow = crop_flow_from_nan(flow)
    flow = flow.astype(np.float32)
    flow = cv2.resize(flow, (256, 256))
    # map the annotation points onto the warped image
    points_list_warp = []
    for points in points_list:
        points = points.astype(np.float64)
        points[:, 0] /= w * 1.0
        points[:, 1] /= h * 1.0
        points_warp = flow_2_points(flow, points)
        points_warp[:, 0] *= w
        points_warp[:, 1] *= h
        points_list_warp.append(points_warp)
    # build the backward map for cv2.remap: (-1, 1) -> pixel coordinates
    bm_flow = flow / 2 + 0.5
    bm_flow[..., 0] = bm_flow[..., 0] * w
    bm_flow[..., 1] = bm_flow[..., 1] * h
    bm_flow = np.nan_to_num(bm_flow, nan=-1)
    if bm_flow.shape[0] != h or bm_flow.shape[1] != w:
        bm_flow = cv2.resize(bm_flow, (w, h))
    warp_img = cv2.remap(img, bm_flow.astype(np.float32), None, cv2.INTER_LINEAR, borderValue=(255, 255, 255))
    return warp_img, points_list_warp
def json_2_points(json_path):
    """Read annotation boxes (center x/y, width, height) and convert them to 4-point polygons."""
    with open(json_path, "r") as f:
        data = json.load(f)
    obj_list = []
    for obj in data[0]['annotations']:
        obj = obj['coordinates']
        cx, cy, w, h = obj['x'], obj['y'], obj['width'], obj['height']
        x1 = cx - 0.5 * w
        x2 = cx + 0.5 * w
        y1 = cy - 0.5 * h
        y2 = cy + 0.5 * h
        points = np.array([[x1, y1], [x2, y1], [x2, y2], [x1, y2]], np.int32)
        obj_list.append(points)
    return obj_list
def add_background(img, img_background):
    """Blend a shadow / moire background layer onto the document at 50% opacity."""
    height, width, _ = img.shape
    background = cv2.resize(img_background, (width, height))
    img_res = img * 0.5 + background * 0.5
    img_res = np.clip(img_res, 0, 255)
    # cast back to uint8 so the blended image can be passed to cv2.remap / cv2.imwrite
    return img_res.astype(np.uint8)
if __name__ == "__main__":
    # inputs: flat document image, rendered shadow layer, annotation json,
    # and the flow field (.npy) describing the warp
    img = cv2.imread("test.png")
    shadow = cv2.imread("./background/shadow.jpg")
    img = add_background(img, shadow)
    obj_list = json_2_points("test.json")
    flow = np.load("test.npy")
    img_warp, points_list_warp = warp_img(img, flow, obj_list)
    cv2.imwrite("warp_shadow.jpg", img_warp)
    # draw the remapped annotation boxes on the warped image
    for points in points_list_warp:
        cv2.polylines(img_warp, [points.astype(np.int32)], isClosed=True, color=(0, 0, 255), thickness=1)
    cv2.imwrite("warp_shadow_draw.jpg", img_warp)
```
Acknowledgements: while writing this code I was inspired by 鑫哥, many thanks again!
Everyone is welcome to reach out for technical exchange~