当前位置:   article > 正文

python ocr text detection 数据增强_ocr.detecttext

ocr.detecttext
  1. def augmentation(im: np.ndarray, text_polys: np.ndarray, scales: np.ndarray, degrees: int, input_size: int) -> tuple:
  2. # the images are rescaled with ratio {0.5, 1.0, 2.0, 3.0} randomly
  3. im, text_polys = data_aug.random_scale(im, text_polys, scales)
  4. # the images are horizontally fliped and rotated in range [−10◦, 10◦] randomly
  5. if random.random() < 0.5:
  6. im, text_polys = data_aug.horizontal_flip(im, text_polys)
  7. if random.random() < 0.5:
  8. im, text_polys = data_aug.random_rotate_img_bbox(im, text_polys, degrees)
  9. # 640 × 640 random samples are cropped from the transformed images
  10. # im, text_polys = data_aug.random_crop_img_bboxes(im, text_polys)
  11. # im, text_polys = data_aug.resize(im, text_polys, input_size, keep_ratio=False)
  12. # im, text_polys = data_aug.random_crop_image_pse(im, text_polys, input_size)
  13. return im, text_polys
  1. # -*- coding: utf-8 -*-
  2. # @Time : 2019/1/12 13:06
  3. import cv2
  4. import numbers
  5. import math
  6. import random
  7. import numpy as np
  8. from skimage.util import random_noise
  9. def show_pic(img, bboxes=None, name='pic'):
  10. '''
  11. 输入:
  12. img:图像array
  13. bboxes:图像的所有boudning box list, 格式为[[x_min, y_min, x_max, y_max]....]
  14. names:每个box对应的名称
  15. '''
  16. show_img = img.copy()
  17. if not isinstance(bboxes, np.ndarray):
  18. bboxes = np.array(bboxes)
  19. for point in bboxes.astype(np.int):
  20. cv2.line(show_img, tuple(point[0]), tuple(point[1]), (255, 0, 0), 2)
  21. cv2.line(show_img, tuple(point[1]), tuple(point[2]), (255, 0, 0), 2)
  22. cv2.line(show_img, tuple(point[2]), tuple(point[3]), (255, 0, 0), 2)
  23. cv2.line(show_img, tuple(point[3]), tuple(point[0]), (255, 0, 0), 2)
  24. # cv2.namedWindow(name, 0) # 1表示原图
  25. # cv2.moveWindow(name, 0, 0)
  26. # cv2.resizeWindow(name, 1200, 800) # 可视化的图片大小
  27. cv2.imshow(name, show_img)
  28. # 图像均为cv2读取
  29. class DataAugment():
  30. def __init__(self):
  31. pass
  32. def add_noise(self, im: np.ndarray):
  33. """
  34. 对图片加噪声
  35. :param img: 图像array
  36. :return: 加噪声后的图像array,由于输出的像素是在[0,1]之间,所以得乘以255
  37. """
  38. return (random_noise(im, mode='gaussian', clip=True) * 255).astype(im.dtype)
  39. def random_scale(self, im: np.ndarray, text_polys: np.ndarray, scales: np.ndarray or list) -> tuple:
  40. """
  41. 从scales中随机选择一个尺度,对图片和文本框进行缩放
  42. :param im: 原图
  43. :param text_polys: 文本框
  44. :param scales: 尺度
  45. :return: 经过缩放的图片和文本
  46. """
  47. tmp_text_polys = text_polys.copy()
  48. rd_scale = float(np.random.choice(scales))
  49. im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
  50. tmp_text_polys *= rd_scale
  51. return im, tmp_text_polys
  52. def random_rotate_img_bbox(self, img, text_polys, degrees: numbers.Number or list or tuple or np.ndarray,
  53. same_size=False):
  54. """
  55. 从给定的角度中选择一个角度,对图片和文本框进行旋转
  56. :param img: 图片
  57. :param text_polys: 文本框
  58. :param degrees: 角度,可以是一个数值或者list
  59. :param same_size: 是否保持和原图一样大
  60. :return: 旋转后的图片和角度
  61. """
  62. if isinstance(degrees, numbers.Number):
  63. if degrees < 0:
  64. raise ValueError("If degrees is a single number, it must be positive.")
  65. degrees = (-degrees, degrees)
  66. elif isinstance(degrees, list) or isinstance(degrees, tuple) or isinstance(degrees, np.ndarray):
  67. if len(degrees) != 2:
  68. raise ValueError("If degrees is a sequence, it must be of len 2.")
  69. degrees = degrees
  70. else:
  71. raise Exception('degrees must in Number or list or tuple or np.ndarray')
  72. # ---------------------- 旋转图像 ----------------------
  73. w = img.shape[1]
  74. h = img.shape[0]
  75. angle = np.random.uniform(degrees[0], degrees[1])
  76. if same_size:
  77. nw = w
  78. nh = h
  79. else:
  80. # 角度变弧度
  81. rangle = np.deg2rad(angle)
  82. # 计算旋转之后图像的w, h
  83. nw = (abs(np.sin(rangle) * h) + abs(np.cos(rangle) * w))
  84. nh = (abs(np.cos(rangle) * h) + abs(np.sin(rangle) * w))
  85. # 构造仿射矩阵
  86. rot_mat = cv2.getRotationMatrix2D((nw * 0.5, nh * 0.5), angle, 1)
  87. # 计算原图中心点到新图中心点的偏移量
  88. rot_move = np.dot(rot_mat, np.array([(nw - w) * 0.5, (nh - h) * 0.5, 0]))
  89. # 更新仿射矩阵
  90. rot_mat[0, 2] += rot_move[0]
  91. rot_mat[1, 2] += rot_move[1]
  92. # 仿射变换
  93. rot_img = cv2.warpAffine(img, rot_mat, (int(math.ceil(nw)), int(math.ceil(nh))), flags=cv2.INTER_LANCZOS4)
  94. # ---------------------- 矫正bbox坐标 ----------------------
  95. # rot_mat是最终的旋转矩阵
  96. # 获取原始bbox的四个中点,然后将这四个点转换到旋转后的坐标系下
  97. rot_text_polys = list()
  98. for bbox in text_polys:
  99. point1 = np.dot(rot_mat, np.array([bbox[0, 0], bbox[0, 1], 1]))
  100. point2 = np.dot(rot_mat, np.array([bbox[1, 0], bbox[1, 1], 1]))
  101. point3 = np.dot(rot_mat, np.array([bbox[2, 0], bbox[2, 1], 1]))
  102. point4 = np.dot(rot_mat, np.array([bbox[3, 0], bbox[3, 1], 1]))
  103. rot_text_polys.append([point1, point2, point3, point4])
  104. return rot_img, np.array(rot_text_polys, dtype=np.float32)
  105. def random_crop_img_bboxes(self, im: np.ndarray, text_polys: np.ndarray, max_tries=50) -> tuple:
  106. """
  107. 从图片中裁剪出 cropsize大小的图片和对应区域的文本框
  108. :param im: 图片
  109. :param text_polys: 文本框
  110. :param max_tries: 最大尝试次数
  111. :return: 裁剪后的图片和文本框
  112. """
  113. h, w, _ = im.shape
  114. pad_h = h // 10
  115. pad_w = w // 10
  116. h_array = np.zeros((h + pad_h * 2), dtype=np.int32)
  117. w_array = np.zeros((w + pad_w * 2), dtype=np.int32)
  118. for poly in text_polys:
  119. poly = np.round(poly, decimals=0).astype(np.int32) # 四舍五入取整
  120. minx = np.min(poly[:, 0])
  121. maxx = np.max(poly[:, 0])
  122. w_array[minx + pad_w:maxx + pad_w] = 1 # 将文本区域的在w_array上设为1,表示x轴方向上这部分位置有文本
  123. miny = np.min(poly[:, 1])
  124. maxy = np.max(poly[:, 1])
  125. h_array[miny + pad_h:maxy + pad_h] = 1 # 将文本区域的在h_array上设为1,表示y轴方向上这部分位置有文本
  126. # 在两个轴上 拿出背景位置去进行随机的位置选择,避免选择的区域穿过文本
  127. h_axis = np.where(h_array == 0)[0]
  128. w_axis = np.where(w_array == 0)[0]
  129. if len(h_axis) == 0 or len(w_axis) == 0:
  130. # 整张图全是文本的情况下,直接返回
  131. return im, text_polys
  132. for i in range(max_tries):
  133. xx = np.random.choice(w_axis, size=2)
  134. # 对选择区域进行边界控制
  135. xmin = np.min(xx) - pad_w
  136. xmax = np.max(xx) - pad_w
  137. xmin = np.clip(xmin, 0, w - 1)
  138. xmax = np.clip(xmax, 0, w - 1)
  139. yy = np.random.choice(h_axis, size=2)
  140. ymin = np.min(yy) - pad_h
  141. ymax = np.max(yy) - pad_h
  142. ymin = np.clip(ymin, 0, h - 1)
  143. ymax = np.clip(ymax, 0, h - 1)
  144. if xmax - xmin < 0.1 * w or ymax - ymin < 0.1 * h:
  145. # 选择的区域过小
  146. # area too small
  147. continue
  148. if text_polys.shape[0] != 0: # 这个判断不知道干啥的
  149. poly_axis_in_area = (text_polys[:, :, 0] >= xmin) & (text_polys[:, :, 0] <= xmax) \
  150. & (text_polys[:, :, 1] >= ymin) & (text_polys[:, :, 1] <= ymax)
  151. selected_polys = np.where(np.sum(poly_axis_in_area, axis=1) == 4)[0]
  152. else:
  153. selected_polys = []
  154. if len(selected_polys) == 0:
  155. # 区域内没有文本
  156. continue
  157. im = im[ymin:ymax + 1, xmin:xmax + 1, :]
  158. polys = text_polys[selected_polys]
  159. # 坐标调整到裁剪图片上
  160. polys[:, :, 0] -= xmin
  161. polys[:, :, 1] -= ymin
  162. return im, polys
  163. return im, text_polys
  164. def random_crop_image_pse(self, im: np.ndarray, text_polys: np.ndarray, input_size) -> tuple:
  165. """
  166. 从图片中裁剪出 cropsize大小的图片和对应区域的文本框
  167. :param im: 图片
  168. :param text_polys: 文本框
  169. :param input_size: 输出图像大小
  170. :return: 裁剪后的图片和文本框
  171. """
  172. h, w, _ = im.shape
  173. short_edge = min(h, w)
  174. if short_edge < input_size:
  175. # 保证短边 >= inputsize
  176. scale = input_size / short_edge
  177. im = cv2.resize(im, dsize=None, fx=scale, fy=scale)
  178. text_polys *= scale
  179. h, w, _ = im.shape
  180. # 计算随机范围
  181. w_range = w - input_size
  182. h_range = h - input_size
  183. for _ in range(50):
  184. xmin = random.randint(0, w_range)
  185. ymin = random.randint(0, h_range)
  186. xmax = xmin + input_size
  187. ymax = ymin + input_size
  188. if text_polys.shape[0] != 0:
  189. selected_polys = []
  190. for poly in text_polys:
  191. if poly[:, 0].max() < xmin or poly[:, 0].min() > xmax or \
  192. poly[:, 1].max() < ymin or poly[:, 1].min() > ymax:
  193. continue
  194. # area_p = cv2.contourArea(poly)
  195. poly[:, 0] -= xmin
  196. poly[:, 1] -= ymin
  197. poly[:, 0] = np.clip(poly[:, 0], 0, input_size)
  198. poly[:, 1] = np.clip(poly[:, 1], 0, input_size)
  199. # rect = cv2.minAreaRect(poly)
  200. # area_n = cv2.contourArea(poly)
  201. # h1, w1 = rect[1]
  202. # if w1 < 10 or h1 < 10 or area_n / area_p < 0.5:
  203. # continue
  204. selected_polys.append(poly)
  205. else:
  206. selected_polys = []
  207. # if len(selected_polys) == 0:
  208. # 区域内没有文本
  209. # continue
  210. im = im[ymin:ymax, xmin:xmax, :]
  211. polys = np.array(selected_polys)
  212. return im, polys
  213. return im, text_polys
  214. def random_crop_author(self,imgs, img_size):
  215. h, w = imgs[0].shape[0:2]
  216. th, tw = img_size
  217. if w == tw and h == th:
  218. return imgs
  219. ttt=random.random()
  220. # print(ttt)
  221. # label中存在文本实例,并且按照概率进行裁剪
  222. if np.max(imgs[1][:,:,-1]) > 0 and random.random() > 3.0 / 8.0:
  223. # 文本实例的top left
  224. tl = np.min(np.where(imgs[1][:,:,-1] > 0), axis=1) - img_size
  225. tl[tl < 0] = 0
  226. # 文本实例的 bottom right
  227. br = np.max(np.where(imgs[1][:,:,-1] > 0), axis=1) - img_size
  228. br[br < 0] = 0
  229. # 保证选到右下角点是,有足够的距离进行crop
  230. br[0] = min(br[0], h - th)
  231. br[1] = min(br[1], w - tw)
  232. for _ in range(50000):
  233. i = random.randint(tl[0], br[0])
  234. j = random.randint(tl[1], br[1])
  235. # 保证最小的图有文本
  236. if imgs[1][:,:,0][i:i + th, j:j + tw].sum() <= 0:
  237. continue
  238. else:
  239. break
  240. else:
  241. i = random.randint(0, h - th)
  242. j = random.randint(0, w - tw)
  243. # return i, j, th, tw
  244. for idx in range(len(imgs)):
  245. if len(imgs[idx].shape) == 3:
  246. imgs[idx] = imgs[idx][i:i + th, j:j + tw, :]
  247. else:
  248. imgs[idx] = imgs[idx][i:i + th, j:j + tw]
  249. return imgs
  250. def resize(self, im: np.ndarray, text_polys: np.ndarray,
  251. input_size: numbers.Number or list or tuple or np.ndarray, keep_ratio: bool = False) -> tuple:
  252. """
  253. 对图片和文本框进行resize
  254. :param im: 图片
  255. :param text_polys: 文本框
  256. :param input_size: resize尺寸,数字或者list的形式,如果为list形式,就是[w,h]
  257. :param keep_ratio: 是否保持长宽比
  258. :return: resize后的图片和文本框
  259. """
  260. if isinstance(input_size, numbers.Number):
  261. if input_size < 0:
  262. raise ValueError("If input_size is a single number, it must be positive.")
  263. input_size = (input_size, input_size)
  264. elif isinstance(input_size, list) or isinstance(input_size, tuple) or isinstance(input_size, np.ndarray):
  265. if len(input_size) != 2:
  266. raise ValueError("If input_size is a sequence, it must be of len 2.")
  267. input_size = (input_size[0], input_size[1])
  268. else:
  269. raise Exception('input_size must in Number or list or tuple or np.ndarray')
  270. if keep_ratio:
  271. # 将图片短边pad到和长边一样
  272. h, w, c = im.shape
  273. max_h = max(h, input_size[0])
  274. max_w = max(w, input_size[1])
  275. im_padded = np.zeros((max_h, max_w, c), dtype=np.uint8)
  276. im_padded[:h, :w] = im.copy()
  277. im = im_padded
  278. text_polys = text_polys.astype(np.float32)
  279. h, w, _ = im.shape
  280. im = cv2.resize(im, input_size)
  281. w_scale = input_size[0] / float(w)
  282. h_scale = input_size[1] / float(h)
  283. text_polys[:, :, 0] *= w_scale
  284. text_polys[:, :, 1] *= h_scale
  285. return im, text_polys
  286. def horizontal_flip(self, im: np.ndarray, text_polys: np.ndarray) -> tuple:
  287. """
  288. 对图片和文本框进行水平翻转
  289. :param im: 图片
  290. :param text_polys: 文本框
  291. :return: 水平翻转之后的图片和文本框
  292. """
  293. flip_text_polys = text_polys.copy()
  294. flip_im = cv2.flip(im, 1)
  295. h, w, _ = flip_im.shape
  296. flip_text_polys[:, :, 0] = w - flip_text_polys[:, :, 0]
  297. return flip_im, flip_text_polys
  298. def vertical_flip(self, im: np.ndarray, text_polys: np.ndarray) -> tuple:
  299. """
  300. 对图片和文本框进行竖直翻转
  301. :param im: 图片
  302. :param text_polys: 文本框
  303. :return: 竖直翻转之后的图片和文本框
  304. """
  305. flip_text_polys = text_polys.copy()
  306. flip_im = cv2.flip(im, 0)
  307. h, w, _ = flip_im.shape
  308. flip_text_polys[:, :, 1] = h - flip_text_polys[:, :, 1]
  309. return flip_im, flip_text_polys
  310. def test(self, im: np.ndarray, text_polys: np.ndarray):
  311. print('随机尺度缩放')
  312. t_im, t_text_polys = self.random_scale(im, text_polys, [0.5, 1, 2, 3])
  313. print(t_im.shape, t_text_polys.dtype)
  314. show_pic(t_im, t_text_polys, 'random_scale')
  315. print('随机旋转')
  316. t_im, t_text_polys = self.random_rotate_img_bbox(im, text_polys, 10)
  317. print(t_im.shape, t_text_polys.dtype)
  318. show_pic(t_im, t_text_polys, 'random_rotate_img_bbox')
  319. print('随机裁剪')
  320. t_im, t_text_polys = self.random_crop_img_bboxes(im, text_polys)
  321. print(t_im.shape, t_text_polys.dtype)
  322. show_pic(t_im, t_text_polys, 'random_crop_img_bboxes')
  323. print('水平翻转')
  324. t_im, t_text_polys = self.horizontal_flip(im, text_polys)
  325. print(t_im.shape, t_text_polys.dtype)
  326. show_pic(t_im, t_text_polys, 'horizontal_flip')
  327. print('竖直翻转')
  328. t_im, t_text_polys = self.vertical_flip(im, text_polys)
  329. print(t_im.shape, t_text_polys.dtype)
  330. show_pic(t_im, t_text_polys, 'vertical_flip')
  331. show_pic(im, text_polys, 'vertical_flip_ori')
  332. print('加噪声')
  333. t_im = self.add_noise(im)
  334. print(t_im.shape)
  335. show_pic(t_im, text_polys, 'add_noise')
  336. show_pic(im, text_polys, 'add_noise_ori')

 

声明:本文内容由网友自发贡献,转载请注明出处:【wpsshop博客】
推荐阅读
相关标签
  

闽ICP备14008679号