1. Dataset: the competition dataset contains 290,000 images in total, of which 210,000 form the training set (train_img) and 80,000 form the test set (test_img); the training annotations are given in train.list. All images have been preprocessed: each text region is mapped with an affine transform and rescaled, keeping its aspect ratio, into an image 48 pixels high.
2. Task: contestants must use PaddlePaddle to recognize the text line in each image region and return its content.
3. Scoring: the task is evaluated by line-level accuracy. A predicted text line counts as correct only when it exactly matches the annotated text line:

accuracy = hit_count / gt_count

where hit_count is the number of predicted text lines that exactly match an annotated line, and gt_count is the total number of annotated lines. The metric is evaluated on the test set only.
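A minimal sketch of this metric in Python (the prediction and ground-truth dictionaries, keyed by file name, are hypothetical):

def line_accuracy(predictions, ground_truths):
    # line-level accuracy: a prediction counts only if it matches the annotation exactly
    hit_count = sum(1 for name, gt in ground_truths.items() if predictions.get(name) == gt)
    gt_count = len(ground_truths)
    return hit_count / gt_count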
Network architecture diagram:
# Paddle implementation of the convolutional layer
paddle.fluid.layers.conv2d(input, num_filters, filter_size, stride=1, padding=0, dilation=1, groups=None, param_attr=None, bias_attr=None, use_cudnn=True, act=None, name=None, data_format="NCHW")
# Paddle implementation of the max-pooling layer (max downsampling)
paddle.fluid.layers.pool2d(input, pool_size=-1, pool_type='max', pool_stride=1, pool_padding=0, global_pooling=False, use_cudnn=True, ceil_mode=False, name=None, exclusive=True, data_format="NCHW")
# Paddle implementation of the fully connected layer
paddle.fluid.layers.fc(input, size, num_flatten_dims=1, param_attr=None, bias_attr=None, act=None, name=None)
# GRU unit
paddle.fluid.layers.dynamic_gru(input, size, param_attr=None, bias_attr=None, is_reverse=False, gate_activation='sigmoid', candidate_activation='tanh', h_0=None, origin_mode=False)
# Paddle 1.6 also provides an LSTM implementation
paddle.fluid.layers.dynamic_lstm(input, size, h_0=None, c_0=None, param_attr=None, bias_attr=None, use_peepholes=True, is_reverse=False, gate_activation='sigmoid', cell_activation='tanh', candidate_activation='tanh', dtype='float32', name=None)
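As a rough illustration (not the competition network, which is defined later), the layers listed above can be chained like this under Paddle Fluid 1.x; the filter counts and sizes are placeholder values:

import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[1, 48, 256], dtype='float32')
x = fluid.layers.conv2d(image, num_filters=64, filter_size=3, stride=1, padding=1)  # conv layer
x = fluid.layers.batch_norm(x, act='relu')                                          # BN + ReLU
x = fluid.layers.pool2d(x, pool_size=2, pool_type='max', pool_stride=2)             # max pooling
seq = fluid.layers.im2sequence(x, filter_size=(x.shape[2], 1), stride=(1, 1))       # map -> sequence
fc_1 = fluid.layers.fc(seq, size=256 * 3)                       # GRU input width must be 3 * hidden size
gru = fluid.layers.dynamic_gru(fc_1, size=256)                  # single-direction GRU over the sequence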
Original paper: Transcription is the process of converting the per-frame predictions made by RNN into a label sequence. Mathematically, transcription is to find the label sequence with the highest probability conditioned on the per-frame predictions. In practice, there exists two modes of transcription, namely the lexicon-free and lexicon-based transcriptions. A lexicon is a set of label sequences that prediction is constraint to, e.g. a spell checking dictionary. In lexicon-free mode, predictions are made without any lexicon. In lexicon-based mode, predictions are made by choosing the label sequence that has the highest probability.
# Paddle 1.6 provides an implementation
paddle.fluid.layers.ctc_greedy_decoder(input, blank, name=None)
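For example, a lexicon-free greedy decode can be built on top of the per-frame class scores like this (a sketch under Paddle Fluid 1.x; the feature width and class count are placeholder values):

import paddle.fluid as fluid

num_classes = 3828  # placeholder: size of the label dictionary
# per-frame features as a variable-length sequence (LoD level 1)
feat = fluid.layers.data(name='feat', shape=[1500], dtype='float32', lod_level=1)
fc_out = fluid.layers.fc(feat, size=num_classes + 1)                 # +1 output for the CTC blank
# arg-max per frame, then merge repeated labels and remove blanks
decoded = fluid.layers.ctc_greedy_decoder(input=fc_out, blank=num_classes)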
Specific network layers:
# Paddle implementation of batch_norm
paddle.fluid.layers.batch_norm(input, act=None, is_test=False, momentum=0.9, epsilon=1e-05, param_attr=None, bias_attr=None, data_layout='NCHW', in_place=False, name=None, moving_mean_name=None, moving_variance_name=None, do_model_average_for_mean_and_var=False, use_global_stats=False)
Original paper: Back-Propagation Through Time (BPTT). At the bottom of the recurrent layers, the sequence of propagated differentials are concatenated into maps, inverting the operation of converting feature maps into feature sequences, and fed back to the convolutional layers. In practice, we create a custom network layer, called "Map-to-Sequence", as the bridge between convolutional layers and recurrent layers.
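A minimal sketch of this "Map-to-Sequence" bridge, using the same im2sequence call that the CRNN code below relies on (the feature-map shape is a placeholder):

import paddle.fluid as fluid

# final convolutional feature map, shape [N, C, H, W] (placeholder values)
conv_feat = fluid.layers.data(name='conv_feat', shape=[1024, 3, 64], dtype='float32')
# take every full-height, 1-pixel-wide column as one timestep of the feature sequence
seq = fluid.layers.im2sequence(conv_feat, filter_size=(conv_feat.shape[2], 1), stride=(1, 1))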
import paddle.fluid as fluid
from paddle.fluid import ParamAttr
from paddle.fluid.clip import GradientClipByNorm
from paddle.fluid.regularizer import L2Decay
from paddle.fluid.initializer import MSRA, Normal
from paddle.fluid.layers import conv2d, conv2d_transpose, batch_norm, fc, dynamic_gru, im2sequence, \
    elementwise_mul, pool2d, dropout, concat


class CRNN(object):
    def __init__(self, num_classes, label_dict):
        self.outputs = None
        self.label_dict = label_dict
        self.num_classes = num_classes  # number of character classes

    def name(self):
        return 'crnn'

    def conv_bn_pool(self, x, n_filters, n_ConvBN, pool_stride, w_conv, is_test):
        # L2 regularization and weight initialization for the batch-norm parameters
        w_bn = ParamAttr(regularizer=L2Decay(0.0005))
        b_bn = ParamAttr(regularizer=L2Decay(0.0005), initializer=Normal(0.0, 0.0))
        for _ in range(n_ConvBN):
            # convolutional layer
            x = conv2d(x, n_filters, 3, 1, 1, param_attr=w_conv)
            # batch normalization
            x = batch_norm(x, act='relu', param_attr=w_bn, bias_attr=b_bn, is_test=is_test)
        assert pool_stride in [2, (2, 1), (3, 1)]  # only these pooling strides are supported
        if pool_stride == 2:
            # max-pooling layer
            x = pool2d(x, 2, 'max', pool_stride, 0, ceil_mode=True)
        elif pool_stride == (2, 1):
            x = pool2d(x, (2, 1), 'max', pool_stride, 0, ceil_mode=True)
        elif pool_stride == (3, 1):
            x = pool2d(x, (3, 1), 'max', pool_stride, 0, ceil_mode=True)
        return x

    def ocr_convs(self, x, is_test):
        w_conv1 = ParamAttr(regularizer=L2Decay(0.0005))
        w_conv2 = ParamAttr(regularizer=L2Decay(0.0005))
        w_conv3 = ParamAttr(regularizer=L2Decay(0.0005))
        x = self.conv_bn_pool(x, 128, 1, 2, w_conv1, is_test)
        x = self.conv_bn_pool(x, 256, 1, 2, w_conv2, is_test)
        x = self.conv_bn_pool(x, 512, 2, 2, w_conv2, is_test)
        x = self.conv_bn_pool(x, 1024, 2, (2, 1), w_conv3, is_test)
        return x

    def net(self, images, rnn_hidden_size=750, is_test=False):
        w_fc = ParamAttr(regularizer=L2Decay(0.0005))
        b_fc1 = ParamAttr(regularizer=L2Decay(0.0005), initializer=Normal(0.0, 0.0))
        b_fc2 = ParamAttr(regularizer=L2Decay(0.0005), initializer=Normal(0.0, 0.0), learning_rate=2.0)
        b_fc3 = ParamAttr(regularizer=L2Decay(0.0005), initializer=Normal(0.0, 0.0))
        x = self.ocr_convs(images, is_test)
        # scan the feature map with a filter and convert it into a sequence ("Map-to-Sequence")
        x = im2sequence(x, (x.shape[2], 1), (1, 1))
        # fully connected layers that project the CNN sequence before the RNN
        fc_1 = fc(x, rnn_hidden_size * 3, param_attr=w_fc, bias_attr=b_fc1)
        fc_2 = fc(x, rnn_hidden_size * 3, param_attr=w_fc, bias_attr=b_fc1)
        # single-layer GRU computed time step by time step over the whole sequence
        gru_forward = dynamic_gru(fc_1, rnn_hidden_size, param_attr=w_fc, bias_attr=b_fc2,
                                  candidate_activation='relu')
        # reversed direction, forming the second half of the bidirectional structure
        gru_backward = dynamic_gru(fc_2, rnn_hidden_size, param_attr=w_fc, bias_attr=b_fc2,
                                   candidate_activation='relu', is_reverse=True)
        bigru = gru_forward + gru_backward
        # dropout regularization (randomly drops units)
        bigru = dropout(bigru, 0.5, is_test)
        # final fully connected layer over the classes plus the CTC blank
        fc_out = fc(bigru, self.num_classes + 1, param_attr=w_fc, bias_attr=b_fc3)
        self.outputs = fc_out
        return fc_out

    def get_infer(self, images):
        # CTC transcription layer (greedy decoding)
        return fluid.layers.ctc_greedy_decoder(input=self.outputs, blank=self.num_classes)
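A short sketch of how this class is used (it mirrors the freeze and inference code further down; the class count here is a placeholder):

import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[1, 48, 512], dtype='float32')
model = CRNN(num_classes=3828, label_dict={})      # 3828 is a placeholder class count
fc_out = model.net(image, is_test=True)            # per-frame class scores fed to CTC
decoded = model.get_infer(image)                   # greedy CTC decoding of those scores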
class Converter(object):
    def __init__(self, to_encoding):
        self.to_encoding = to_encoding
        self.map = MAPS[to_encoding]
        self.start()

    def feed(self, char):
        branches = []
        for fsm in self.machines:
            new = fsm.feed(char, self.map)
            if new:
                branches.append(new)
        if branches:
            self.machines.extend(branches)
        self.machines = [fsm for fsm in self.machines if fsm.state != FAIL]
        all_ok = True
        for fsm in self.machines:
            if fsm.state != END:
                all_ok = False
        if all_ok:
            self._clean()
            return self.get_result()

    def _clean(self):
        if len(self.machines):
            self.machines.sort(key=lambda x: len(x))
            # self.machines.sort(cmp=lambda x,y: cmp(len(x), len(y)))
            self.final += self.machines[0].final
        self.machines = [StatesMachine()]

    def start(self):
        self.machines = [StatesMachine()]
        self.final = UEMPTY

    def end(self):
        self.machines = [fsm for fsm in self.machines
                         if fsm.state == FAIL or fsm.state == END]
        self._clean()

    def convert(self, string):
        self.start()
        for char in string:
            self.feed(char)
        self.end()
        return self.get_result()

    def get_result(self):
        return self.final
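The Converter comes from the langconv module imported later (work/langconv.py) and depends on helpers defined there (MAPS, StatesMachine, and the FAIL/END/UEMPTY constants). A quick usage sketch with a made-up string:

converter = Converter('zh-hans')
print(converter.convert('漢字識別'))   # should print the simplified form: 汉字识别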
import codecs
import random
import sys
from os.path import join as pjoin


# read_ims_list: read the train.list file and build a dict of per-image info
def read_ims_list(path_ims_list):
    """Read the train.list file."""
    ims_info_dic = {}
    with open(path_ims_list, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(maxsplit=3)
            w, h, file, label = parts[0], parts[1], parts[2], parts[3]
            ims_info_dic[file] = {'label': label, 'w': int(w)}
    return ims_info_dic


# modify_ch: clean the label in four steps: traditional -> simplified Chinese,
# upper -> lower case, remove spaces, remove symbols
def modify_ch(label):
    # traditional -> simplified
    label = Converter("zh-hans").convert(label)
    # upper -> lower case
    label = label.lower()
    # remove spaces
    label = label.replace(' ', '')
    # remove symbols (keep CJK characters and alphanumerics)
    for ch in label:
        if (not '\u4e00' <= ch <= '\u9fff') and (not ch.isalnum()):
            label = label.replace(ch, '')
    return label


# pipeline: run the functions above to do the initial preprocessing of the training data
def pipeline(dataset_dir):
    path_ims = pjoin(dataset_dir, "train_images")
    path_ims_list = pjoin(dataset_dir, "train.list")
    path_train_list = pjoin(dataset_dir, "train.txt")
    path_label_list = pjoin(dataset_dir, "label_list.txt")

    # read the image info
    file_info_dic = read_ims_list(path_ims_list)

    # create train.txt
    class_set = set()
    with codecs.open(path_train_list, 'w', encoding='utf-8') as f:
        for file, info in file_info_dic.items():
            label = info['label']
            label = modify_ch(label)
            # skip samples whose label becomes empty
            if label == '':
                continue
            for e in label:
                class_set.add(e)
            f.write("{0}\t{1}\n".format(pjoin(path_ims, file), label))

    # create label_list.txt
    class_list = list(class_set)
    class_list.sort()
    print("class num: {0}".format(len(class_list)))
    with codecs.open(path_label_list, "w", encoding='utf-8') as label_list:
        for id, c in enumerate(class_list):
            label_list.write("{0}\t{1}\n".format(c, id))


random.seed(0)
pipeline(dataset_dir="data/data10879")
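To make the formats concrete, here is a made-up example (not a real entry from train.list): each train.list line has the form "<width> <height> <file name> <label>", and modify_ch cleans the label before it is written to train.txt:

sample_line = '256 48 img_000001.jpg 深度學習 OCR!'
w, h, name, label = sample_line.strip().split(maxsplit=3)
print(modify_ch(label))   # -> 深度学习ocr  (simplified, lower-cased, spaces and symbols removed)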
train_opt = {
    "input_size": [1, 48, 256],          # input image size
    "data_dir": "data/data10879",        # dataset directory
    "train_dir": "train_images",         # training image folder
    "train_list": "train.txt",           # training list file with image paths and labels
    "label_list": "label_list.txt",      # character-to-id mapping file
    "class_dim": -1,                     # number of classes (filled in later)
    "label_dic": {},
    "n_im": -1,
    "continue_train": True,              # whether to load a previously trained model
    # "continue_train": False,
    "save_model_dir": "work/crnn_model", # where to save the model
    "num_epochs": 80,                    # number of training epochs
    "train_batch_size": 256,             # batch size
    "mean_color": 127.0,
    "multi_data_reader_count": 8,
    "apply_distort": True,
    # data augmentation parameters
    "image_distort_strategy": {
        "expand_prob": 0.3,
        "expand_max_ratio": 2.0,
        "hue_prob": 0.5,
        "hue_delta": 48,
        "contrast_prob": 0.5,
        "contrast_delta": 0.5,
        "saturation_prob": 0.5,
        "saturation_delta": 0.5,
        "brightness_prob": 0.5,
        "brightness_delta": 0.5,
    },
    # optimizer configuration
    "optimizer_strategy": {
        "learning_rate": 0.001,          # base learning rate
        "momentum": 0.9,
        # "lr_epochs": [40, 80],         # split the schedule by epochs: 0-40, 40-80, 80-120
        # "lr_decay": [1, 0.5, 0.1],     # multiply the learning rate by these factors per segment
        "lr_epochs": [20, 40],           # epochs at which the learning rate decays
        "lr_decay": [1, 0.5, 0.1],       # decay factor for each segment
    },
}
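The lr_epochs / lr_decay entries are later turned into a piecewise-constant learning-rate schedule in optimizer_setting; a small sketch of that computation with example numbers (210k training images, batch size 256):

import paddle.fluid as fluid

lr = 0.001
iters_per_epoch = 210000 // 256                        # example image count and batch size
boundaries = [e * iters_per_epoch for e in [20, 40]]   # iterations at which the rate drops
values = [d * lr for d in [1, 0.5, 0.1]]               # learning rate used in each segment
decayed_lr = fluid.layers.piecewise_decay(boundaries, values)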
import numpy as np
import random
import cv2 as cv
from PIL import Image, ImageEnhance, ImageDraw


def resize_img(img, input_size):
    # resize the image while keeping its aspect ratio, then paste it on a grey canvas
    target_size = input_size
    percent_h = float(target_size[1]) / img.size[1]
    percent_w = float(target_size[2]) / img.size[0]
    percent = min(percent_h, percent_w)
    resized_width = int(round(img.size[0] * percent))
    resized_height = int(round(img.size[1] * percent))
    w_off = (target_size[2] - resized_width) / 2
    h_off = (target_size[1] - resized_height) / 2
    img = img.resize((resized_width, resized_height), Image.ANTIALIAS)
    array = np.ndarray((target_size[1], target_size[2], 3), np.uint8)
    array[:, :, 0] = 127
    array[:, :, 1] = 127
    array[:, :, 2] = 127
    ret = Image.fromarray(array)
    ret.paste(img, (np.random.randint(0, w_off + 1), int(h_off)))
    return ret


def random_brightness(img):
    # randomly adjust brightness (data augmentation)
    prob = np.random.uniform(0, 1)
    if prob < train_opt['image_distort_strategy']['brightness_prob']:
        brightness_delta = train_opt['image_distort_strategy']['brightness_delta']
        delta = np.random.uniform(-brightness_delta, brightness_delta) + 1
        img = ImageEnhance.Brightness(img).enhance(delta)
    return img


def random_contrast(img):
    # randomly adjust contrast (data augmentation)
    prob = np.random.uniform(0, 1)
    if prob < train_opt['image_distort_strategy']['contrast_prob']:
        contrast_delta = train_opt['image_distort_strategy']['contrast_delta']
        delta = np.random.uniform(-contrast_delta, contrast_delta) + 1
        img = ImageEnhance.Contrast(img).enhance(delta)
    return img


def random_saturation(img):
    # randomly adjust saturation (data augmentation)
    prob = np.random.uniform(0, 1)
    if prob < train_opt['image_distort_strategy']['saturation_prob']:
        saturation_delta = train_opt['image_distort_strategy']['saturation_delta']
        delta = np.random.uniform(-saturation_delta, saturation_delta) + 1
        img = ImageEnhance.Color(img).enhance(delta)
    return img


def random_hue(img):
    # randomly adjust hue (data augmentation)
    prob = np.random.uniform(0, 1)
    if prob < train_opt['image_distort_strategy']['hue_prob']:
        hue_delta = train_opt['image_distort_strategy']['hue_delta']
        delta = np.random.uniform(-hue_delta, hue_delta)
        img_hsv = np.array(img.convert('HSV'))
        img_hsv[:, :, 0] = img_hsv[:, :, 0] + delta
        img = Image.fromarray(img_hsv, mode='HSV').convert('RGB')
    return img


def distort_image(img):
    # apply the augmentations above to a training sample
    prob = np.random.uniform(0, 1)
    # apply them in a different order depending on prob
    if prob > 0.5:
        img = random_brightness(img)
        img = random_contrast(img)
        img = random_saturation(img)
        img = random_hue(img)
    else:
        img = random_brightness(img)
        img = random_saturation(img)
        img = random_hue(img)
        img = random_contrast(img)
    return img


def rotate_image(img):
    """Data augmentation: rotate the image by a small random angle."""
    prob = np.random.uniform(0, 1)
    if prob > 0.:
        angle = np.random.randint(-8, 8)
        img = img.convert('RGBA')
        img = img.rotate(angle, resample=Image.BILINEAR, expand=0)
        fff = Image.new('RGBA', img.size, (127, 127, 127, 127))
        img = Image.composite(img, fff, mask=img).convert('RGB')
    return img


def rotate_image_0(img):
    """Data augmentation: rotate the image by a small random angle."""
    prob = np.random.uniform(0, 1)
    if prob > 0.:
        angle = np.random.randint(-10, 10)
        img = img.convert('RGBA')
        img = img.rotate(angle, resample=Image.BILINEAR, expand=0)
        fff = Image.new('RGBA', img.size, (127, 127, 127, 127))
        img = Image.composite(img, fff, mask=img).convert('RGB')
    return img


def random_expand(img, keep_ratio=True):
    # randomly expand the canvas around the image (data augmentation)
    if np.random.uniform(0, 1) < train_opt['image_distort_strategy']['expand_prob']:
        return img
    max_ratio = 1.3  # train_opt['image_distort_strategy']['expand_max_ratio']
    w, h = img.size  # image size
    c = 3
    ratio_x = random.uniform(1, max_ratio)
    if keep_ratio:
        ratio_y = ratio_x
    else:
        ratio_y = random.uniform(1, max_ratio)
    oh = int(h * ratio_y)
    ow = int(w * ratio_x)
    off_x = random.randint(0, ow - w)
    off_y = random.randint(0, oh - h)
    out_img = np.zeros((oh, ow, c), np.uint8)
    for i in range(c):
        out_img[:, :, i] = train_opt['mean_color']
    out_img[off_y: off_y + h, off_x: off_x + w, :] = img
    return Image.fromarray(out_img)


def random_expand_0(img, keep_ratio=True):
    if np.random.uniform(0, 1) < 0:  # train_opt['image_distort_strategy']['expand_prob']:
        return img
    # max_ratio = train_opt['image_distort_strategy']['expand_max_ratio']
    w, h = img.size
    c = 3
    ratio_x = random.uniform(1, 2)
    if keep_ratio:
        ratio_y = ratio_x
    else:
        ratio_y = random.uniform(1, 2)
    oh = int(h * ratio_y)
    ow = int(w * ratio_x)
    off_x = random.randint(0, ow - w)
    off_y = random.randint(0, oh - h)
    out_img = np.zeros((oh, ow, c), np.uint8)
    for i in range(c):
        out_img[:, :, i] = train_opt['mean_color']
    out_img[off_y: off_y + h, off_x: off_x + w, :] = img
    return Image.fromarray(out_img)


def preprocess(img, input_size):
    img_width, img_height = img.size
    if train_opt['apply_distort']:
        img = distort_image(img)
    img_m = np.mean(img.convert('L'))
    img_std = max(np.std(img.convert('L')), 1e-2)
    img = resize_img(img, input_size)
    img = img.convert('L')
    img = (np.array(img).astype('float32') - img_m) / img_std
    return img


def preprocess_0(img, input_size):
    img_width, img_height = img.size
    if train_opt['apply_distort']:
        img = distort_image(img)
    img_m = np.mean(img.convert('L'))
    img_std = max(np.std(img.convert('L')), 1e-2)
    if train_opt['apply_distort']:
        img = random_expand_0(img)
        img = rotate_image_0(img)
    img = resize_img(img, input_size)
    img = img.convert('L')
    img = (np.array(img).astype('float32') - img_m) / img_std
    return img
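A brief usage sketch of preprocess (the file path is a placeholder); the result is a normalized grayscale array matching the configured input size:

from PIL import Image

img = Image.open('data/data10879/train_images/img_000001.jpg').convert('RGB')  # placeholder path
arr = preprocess(img, train_opt['input_size'])
print(arr.shape)   # expected: (48, 256)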
import math
import os
import paddle
import numpy as np
from PIL import Image, ImageEnhance, ImageDraw


# define the data reader
def custom_reader(file_list, input_size, mode):
    def reader():
        # iterate the data twice (once with each preprocessing variant), doubling the dataset
        for i in [1, 2]:
            np.random.shuffle(file_list)
            for line in file_list:
                parts = line.split()
                image_path = parts[0]
                img = Image.open(image_path)
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                label = [int(train_opt['label_dic'][c]) for c in parts[-1]]
                if len(label) == 0:
                    continue
                if i == 1:
                    img = preprocess(img, input_size)
                else:
                    img = preprocess_0(img, input_size)
                img = img[np.newaxis, ...]
                yield img, label
    return reader


# wrap custom_reader into a multi-process reader to speed up data loading
def multi_process_custom_reader(file_path, data_dir, num_workers, input_size, mode):
    file_path = os.path.join(data_dir, file_path)
    readers = []
    images = [line.strip() for line in open(file_path, encoding='utf-8')]
    np.random.shuffle(images)  # shuffle the sample list
    n = int(math.ceil(len(images) / num_workers))  # images per worker, rounded up
    image_lists = [images[i: i + n] for i in range(0, len(images), n)]  # split into num_workers shards
    for l in image_lists:
        # group samples into batches of train_batch_size
        reader = paddle.batch(custom_reader(l, input_size, mode),
                              batch_size=train_opt['train_batch_size'])
        readers.append(paddle.reader.shuffle(reader, train_opt['train_batch_size']))
    return paddle.reader.multiprocess_reader(readers, False)
import os
import numpy as np
import time
import math
import random
import paddle.fluid as fluid
import logging
import codecs
import sys
from os.path import join as pjoin
from paddle.fluid.layers import piecewise_decay, ctc_greedy_decoder, cast, edit_distance, warpctc, \
    reduce_sum, create_py_reader_by_data
from paddle.fluid.regularizer import L2Decay
from paddle.fluid.optimizer import ModelAverage, Momentum, Adam

logger = None


def init_log_config():
    global logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_path = pjoin('work', 'logs')
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    log_name = pjoin(log_path, 'train.log')
    sh = logging.StreamHandler()
    fh = logging.FileHandler(log_name, mode='w')
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
    fh.setFormatter(formatter)
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    logger.addHandler(fh)


def init_train_parameters():
    """Initialize training parameters: the image count and the label dictionary."""
    path_train_list = pjoin(train_opt['data_dir'], train_opt['train_list'])
    path_label_list = pjoin(train_opt['data_dir'], train_opt['label_list'])
    with codecs.open(path_train_list, encoding='utf-8') as f:
        lines = [line.strip() for line in f]
        train_opt['n_im'] = len(lines)  # number of training images
    with codecs.open(path_label_list, encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            train_opt['label_dic'][parts[0]] = int(parts[1])  # character -> id mapping
        train_opt['class_dim'] = len(train_opt['label_dic'])  # number of classes


def optimizer_setting():
    batch_size = train_opt["train_batch_size"]
    iters = train_opt["n_im"] // batch_size  # iterations per epoch
    learning_strategy = train_opt['optimizer_strategy']
    lr = learning_strategy['learning_rate']
    boundaries = [i * iters for i in learning_strategy["lr_epochs"]]
    values = [i * lr for i in learning_strategy["lr_decay"]]
    optimizer = Adam(
        learning_rate=piecewise_decay(boundaries, values),  # piecewise learning-rate decay
        regularization=L2Decay(0.1),  # L2 regularization to reduce overfitting (originally 0.1)
        beta1=0.9
    )
    return optimizer


def build_train_program_with_async_reader(main_prog, startup_prog):
    with fluid.program_guard(main_prog, startup_prog):
        img = fluid.layers.data(name='img', shape=train_opt['input_size'], dtype='float32')
        gt_label = fluid.layers.data(name='gt_label', shape=[1], dtype='int32', lod_level=1)
        data_reader = create_py_reader_by_data(capacity=train_opt['train_batch_size'],
                                               feed_list=[img, gt_label],
                                               name='train')
        multi_reader = multi_process_custom_reader(train_opt['train_list'],
                                                   train_opt['data_dir'],
                                                   train_opt['multi_data_reader_count'],
                                                   train_opt['input_size'],
                                                   'train')
        data_reader.decorate_paddle_reader(multi_reader)
        with fluid.unique_name.guard():
            img, gt_label = fluid.layers.read_file(data_reader)
            model = CRNN(train_opt['class_dim'], train_opt['label_dic'])
            fc_out = model.net(img)
            cost = warpctc(fc_out, gt_label, blank=train_opt['class_dim'], norm_by_times=True)
            loss = reduce_sum(cost)
            optimizer = optimizer_setting()
            optimizer.minimize(loss)
            model_average = ModelAverage(0.15, 10000, 12500)
            decoded_out = ctc_greedy_decoder(fc_out, blank=train_opt['class_dim'])
            casted_label = cast(gt_label, dtype='int64')
            distances, seq_num = edit_distance(decoded_out, casted_label)
    return data_reader, loss, model_average, distances, seq_num, decoded_out


def load_pretrained_params(exe, program):
    if train_opt['continue_train']:
        logger.info('load param from retrain model')
        # load the parameters (w, b) of a previously trained model so training can resume
        # fluid.io.load_persistables(executor=exe, dirname=train_opt['save_model_dir'], main_program=program)
        # fluid.io.load_persistables(executor=exe, dirname='data/data10879/crnn_model', main_program=program)
        fluid.io.load_persistables(executor=exe,
                                   dirname='data/data42037/home/aistudio/work/crnn_model',
                                   main_program=program)


def train():
    logger.info("build network and program")
    train_program = fluid.Program()
    start_program = fluid.Program()
    train_reader, loss, model_avg, distances, seq_num, decoded_out = \
        build_train_program_with_async_reader(train_program, start_program)
    logger.info("build executor and init params")
    exe = fluid.Executor(fluid.CUDAPlace(0))
    exe.run(start_program)
    train_fetch_list = [loss.name, distances.name, seq_num.name, decoded_out.name]
    load_pretrained_params(exe, train_program)
    total_batch_count = 0
    current_best_accuracy = 0.10
    distance_evaluator = fluid.metrics.EditDistance("edit-distance")
    for epoch in range(train_opt["num_epochs"]):
        logger.info("current epoch: %d, start read image", epoch)
        batch_id = 0
        train_reader.start()
        distance_evaluator.reset()
        loss_mean = 0
        try:
            while True:
                t1 = time.time()
                loss, distances, seq_num, decoded_out = exe.run(train_program,
                                                                fetch_list=train_fetch_list,
                                                                return_numpy=False)
                loss = np.mean(np.array(loss))
                distances = np.array(distances)
                seq_num = np.array(seq_num)
                distance_evaluator.update(distances, seq_num)
                period = time.time() - t1
                batch_id += 1
                total_batch_count += 1
                loss_mean = loss_mean + loss  # accumulate loss for the epoch mean
                if batch_id % 200 == 0:  # log training progress
                    with model_avg.apply(exe):
                        distance, instance_error = distance_evaluator.eval()
                        logger.info("Epoch {0}, trainbatch {1}, loss {2} distance {3} instance error {4} time {5}"
                                    .format(epoch, batch_id, loss, distance, instance_error,
                                            "%2.2f sec" % period))
        except fluid.core.EOFException:
            train_reader.reset()
        with model_avg.apply(exe):
            logger.info("loss_mean:{0}".format(loss_mean / batch_id))
            distance, instance_error = distance_evaluator.eval()
            logger.info("Epoch {0} distance {1} instance error {2}".format(epoch, distance, instance_error))
        current_accuracy = 1.0 - instance_error
        if current_accuracy >= current_best_accuracy:
            with model_avg.apply(exe):
                logger.info("temp save pass {0} train result, current best accuracy {1}"
                            .format(epoch, 1.0 - instance_error))
                current_best_accuracy = current_accuracy
                # save the best checkpoint so far
                fluid.io.save_persistables(dirname=train_opt['save_model_dir'],
                                           main_program=train_program,
                                           executor=exe)
    logger.info("training till last, end training")


init_log_config()
init_train_parameters()
train()
import os
import codecs
import paddle.fluid as fluid

# read label_list.txt to get the number of classes
class_dim = -1
all_file_dir = "data/data10879"
with codecs.open(os.path.join(all_file_dir, "label_list.txt")) as label_list:
    class_dim = len(label_list.readlines())
target_size = [1, 48, 1024]
save_freeze_dir = "work/crnn_model"  # directory holding the trained model parameters


def freeze_model():
    exe = fluid.Executor(fluid.CPUPlace())
    image = fluid.layers.data(name='image', shape=target_size, dtype='float32')
    model = CRNN(class_dim, {})
    pred = model.net(image)
    out = model.get_infer(image)
    freeze_program = fluid.default_main_program()
    fluid.io.load_persistables(exe, save_freeze_dir, freeze_program)
    freeze_program = freeze_program.clone(for_test=True)
    fluid.io.save_inference_model("work/freeze_model", ['image'], out, exe, freeze_program)


freeze_model()
import os
from os.path import join as pjoin
import numpy as np
import time
import codecs
import shutil
import math
import cv2 as cv
import paddle.fluid as fluid
from functools import reduce
from tqdm import tqdm
from PIL import Image, ImageEnhance
from work.langconv import Converter

target_size = [1, 48, 512]
mean_rgb = 127.0
data_dir = 'data/data10879'
label_list = "label_list.txt"
use_gpu = True
label_dict = {}
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
save_freeze_dir = "work/freeze_model"
[inference_program, feed_target_names, fetch_targets] = \
    fluid.io.load_inference_model(dirname=save_freeze_dir, executor=exe)


def init_eval_parameters():
    """Initialize inference parameters: the id -> character mapping."""
    label_list_path = pjoin(data_dir, label_list)
    with codecs.open(label_list_path, encoding='utf-8') as flist:
        lines = [line.strip() for line in flist]
        for line in lines:
            parts = line.split()
            label_dict[int(parts[1])] = parts[0]


def resize_img(img):
    w, h = img.size
    target_size[2] = math.ceil(w / 16) * 16
    percent_h = float(target_size[1]) / img.size[1]
    percent_w = float(target_size[2]) / img.size[0]
    percent = min(percent_h, percent_w)
    resized_width = int(round(img.size[0] * percent))
    resized_height = int(round(img.size[1] * percent))
    w_off = (target_size[2] - resized_width) / 2
    h_off = (target_size[1] - resized_height) / 2
    img = img.resize((resized_width, resized_height), Image.ANTIALIAS)
    array = np.ndarray((target_size[1], target_size[2]), np.uint8)
    array[:, :] = 127
    ret = Image.fromarray(array)
    ret.paste(img, (int(w_off), int(h_off)))
    return ret


def read_image(img_path):
    img = Image.open(img_path)
    img = img.convert('L')
    img_m = np.mean(img.convert('L'))
    img_std = max(np.std(img.convert('L')), 1e-2)
    img = resize_img(img)
    img = (np.array(img).astype('float32') - img_m) / img_std
    img = img[..., np.newaxis]
    img = img.transpose((2, 0, 1))
    img = img[np.newaxis, :]
    return img


def infer(image_path):
    tensor_img = read_image(image_path)
    label = exe.run(inference_program,
                    feed={feed_target_names[0]: tensor_img},
                    fetch_list=fetch_targets,
                    return_numpy=False)
    label = np.array(label[0])
    ret = ""
    if label[0] != -1:
        ret = ret.join([label_dict[int(c[0])] for c in label])
    return ret


def eval_all():
    predict = codecs.open(pjoin(data_dir, 'predict.txt'), 'w')
    files = [file for file in os.listdir(pjoin(data_dir, 'test_images')) if file.endswith('.jpg')]
    files = sorted(files)
    for file in tqdm(files):
        path_file = pjoin(data_dir, 'test_images', file)
        result = infer(path_file)
        predict.write('{0}\t{1}\n'.format(file, result))


init_eval_parameters()
eval_all()