赞
踩
个人来讲,弄明白了训练部分的代码后才完整理解了faster-rcnn的整个过程。结合这篇博客记录对代码的理解。
一.首先来看trainer.py。
trainer.py中有很多函数在train.py中调用,所以先记录trainer.py。我分块记录,方便查看。
1.__init__函数主要进行参数初始化。
from __future__ import absolute_import import os from collections import namedtuple import time from torch.nn import functional as F from model.utils.creator_tool import AnchorTargetCreator, ProposalTargetCreator from torch import nn import torch as t from utils import array_tool as at from utils.vis_tool import Visualizer from utils.config import opt from torchnet.meter import ConfusionMeter, AverageValueMeter LossTuple = namedtuple('LossTuple', ['rpn_loc_loss', 'rpn_cls_loss', 'roi_loc_loss', 'roi_cls_loss', 'total_loss' ]) class FasterRCNNTrainer(nn.Module): def __init__(self, faster_rcnn): super(FasterRCNNTrainer, self).__init__() # 初始化函数,用来初始化一些变量 self.faster_rcnn = faster_rcnn self.rpn_sigma = opt.rpn_sigma # 用来计算位置损失函数所要用到的超参数 self.roi_sigma = opt.roi_sigma # 用来计算位置损失函数所要用到的超参数 # target creator create gt_bbox gt_label etc as training targets. 将真实的bbox和真实的label作为训练目标 self.anchor_target_creator = AnchorTargetCreator() # AnchorTargetCreator服务于RPN网络,为从20000个候选anchor中产生256个anchor进行二分类预测和位置回归预测提供真值 self.proposal_target_creator = ProposalTargetCreator() # 服务于ROIHearder(真正产生ROI__loc和ROI_cls的网络),从2000个筛选出的ROIS中选出128个用于rpn自我训练 self.loc_normalize_mean = faster_rcnn.loc_normalize_mean # 位置均值 (为进行归一化处理) self.loc_normalize_std = faster_rcnn.loc_normalize_std # 位置方差 self.optimizer = self.faster_rcnn.get_optimizer() # 优化器,决定使用Adam还是SGD,本代码使用SGD # visdom wrapper self.vis = Visualizer(env=opt.env) # 可视化 # indicators for training status self.rpn_cm = ConfusionMeter(2) # rpn_cm是混淆矩阵,用来验证预测值与真实值精确度,括号里的2指的是类别数(前景和背景) self.roi_cm = ConfusionMeter(21) # roi_cm =21(20个object类+1个background) self.meters = {k: AverageValueMeter() for k in LossTuple._fields} # average loss
2.forward函数是trainer.py的最重要的部分,进行了求损失之前的训练过程和计算了两部分的损失。
def forward(self, imgs, bboxes, labels, scale):
    """Run the detector on one image and compute all training losses.

    Args:
        imgs: image batch; its shape is unpacked as four dimensions with
            height and width last.
        bboxes: ground-truth boxes for the batch; only element 0 is used.
        labels: ground-truth labels for the batch; only element 0 is used.
        scale: forwarded unchanged to the RPN.

    Returns:
        LossTuple: RPN loc/cls losses, RoI loc/cls losses and their sum.

    Raises:
        ValueError: if ``bboxes.shape[0]`` is not 1.
    """
    batch_size = bboxes.shape[0]
    if batch_size != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, height, width = imgs.shape
    img_size = (height, width)

    features = self.faster_rcnn.extractor(imgs)

    rpn_outputs = self.faster_rcnn.rpn(features, img_size, scale)
    rpn_locs, rpn_scores, rois, roi_indices, anchor = rpn_outputs

    # Since batch size is one, convert variables to singular form.
    bbox, label = bboxes[0], labels[0]
    rpn_score, rpn_loc = rpn_scores[0], rpn_locs[0]
    roi = rois

    # Sample RoIs and forward.
    # It's fine to break the computation graph of rois;
    # consider them as constant input.
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        roi,
        at.tonumpy(bbox),
        at.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std,
    )
    # NOTE: all zeros because only batch size 1 is supported for now.
    sample_roi_index = t.zeros(len(sample_roi))
    roi_cls_loc, roi_score = self.faster_rcnn.head(
        features, sample_roi, sample_roi_index)

    # ------------------ RPN losses -------------------#
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox), anchor, img_size)
    gt_rpn_label = at.totensor(gt_rpn_label).long()
    gt_rpn_loc = at.totensor(gt_rpn_loc)
    # The labels are passed along so the loss can weight locations by label.
    rpn_loc_loss = _fast_rcnn_loc_loss(
        rpn_loc, gt_rpn_loc, gt_rpn_label.data, self.rpn_sigma)

    # NOTE: default value of ignore_index is -100; entries labelled -1 are
    # excluded from the classification loss here.
    rpn_cls_loss = F.cross_entropy(
        rpn_score, gt_rpn_label.cuda(), ignore_index=-1)

    # Feed only the entries whose label is > -1 to the RPN confusion meter.
    _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
    _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
    self.rpn_cm.add(at.totensor(_rpn_score, False),
                    _gt_rpn_label.data.long())

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    n_sample = roi_cls_loc.shape[0]
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    # For each sampled RoI pick the 4 offsets at the index of its gt label.
    row_index = t.arange(0, n_sample).long().cuda()
    class_index = at.totensor(gt_roi_label).long()
    roi_loc = roi_cls_loc[row_index, class_index]
    gt_roi_label = at.totensor(gt_roi_label).long()
    gt_roi_loc = at.totensor(gt_roi_loc)

    roi_loc_loss = _fast_rcnn_loc_loss(
        roi_loc.contiguous(), gt_roi_loc, gt_roi_label.data, self.roi_sigma)

    roi_criterion = nn.CrossEntropyLoss()
    roi_cls_loss = roi_criterion(roi_score, gt_roi_label.cuda())

    self.roi_cm.add(at.totensor(roi_score, False),
                    gt_roi_label.data.long())

    # The total loss is the plain sum of the four parts.
    parts = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    return LossTuple(*(parts + [sum(parts)]))
3.train_step函数就是进行参数优化。
def train_step(self, imgs, bboxes, labels, scale):
    """Run one optimisation step and return the losses it produced.

    The arguments are forwarded unchanged to ``self.forward``.

    Returns:
        The loss object returned by ``self.forward``.
    """
    optimizer = self.optimizer

    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    loss_tuple = self.forward(imgs, bboxes, labels, scale)
    loss_tuple.total_loss.backward()
    optimizer.step()

    # Record this step's losses in the running meters.
    self.update_meters(loss_tuple)
    return loss_tuple
4.save和load部分。
def save(self, save_optimizer=False, save_path=None, **kwargs):
    """Serialize the model, config and extra info to disk.

    Args:
        save_optimizer (bool): whether to also save
            ``optimizer.state_dict()``.
        save_path (string): where to save the checkpoint. If None, a path
            under ``checkpoints/`` is generated from the current time and
            the values in ``kwargs``.
        **kwargs: stored under ``'other_info'``; the values are also
            appended to the auto-generated file name.

    Returns:
        save_path (str): the path the checkpoint was written to.
    """
    save_dict = dict()

    save_dict['model'] = self.faster_rcnn.state_dict()
    save_dict['config'] = opt._state_dict()
    save_dict['other_info'] = kwargs
    save_dict['vis_info'] = self.vis.state_dict()

    if save_optimizer:
        save_dict['optimizer'] = self.optimizer.state_dict()

    if save_path is None:
        timestr = time.strftime('%m%d%H%M')
        save_path = 'checkpoints/fasterrcnn_%s' % timestr
        for v_ in kwargs.values():
            save_path += '_%s' % v_

    # dirname() is '' for a bare file name; os.makedirs('') would raise,
    # so only create a directory when the path actually names one.
    save_dir = os.path.dirname(save_path)
    if save_dir and not os.path.exists(save_dir):
        os.makedirs(save_dir)

    t.save(save_dict, save_path)
    self.vis.save([self.vis.env])
    return save_path


def load(self, path, load_optimizer=True, parse_opt=False, ):
    """Restore model (and optionally optimizer/config) state from a file.

    Args:
        path: checkpoint file to read.
        load_optimizer (bool): restore the optimizer state when the
            checkpoint contains one.
        parse_opt (bool): feed the stored ``'config'`` back into ``opt``.

    Returns:
        self, so calls can be chained.
    """
    # NOTE(security): t.load unpickles the file; only load checkpoints
    # that come from a trusted source.
    state_dict = t.load(path)

    if 'model' not in state_dict:
        # Legacy way, for backward compatibility: the file holds the bare
        # model state dict and nothing else.
        self.faster_rcnn.load_state_dict(state_dict)
        return self

    self.faster_rcnn.load_state_dict(state_dict['model'])
    if parse_opt:
        opt._parse(state_dict['config'])
    if 'optimizer' in state_dict and load_optimizer:
        self.optimizer.load_state_dict(state_dict['optimizer'])
    return self
5.update_meters,reset_meters以及get_meter_data函数。
def update_meters(self, losses):
    """Add each loss value to the meter stored under the same field name.

    Args:
        losses: a namedtuple-like object providing ``_asdict()``; every key
            in ``self.meters`` must be one of its fields.
    """
    # Convert every loss first, then update the meters.
    scalars = {}
    for name, value in losses._asdict().items():
        scalars[name] = at.scalar(value)

    for name in self.meters:
        self.meters[name].add(scalars[name])
def reset_meters(self):
    """Reset every loss meter and both confusion meters."""
    for meter in self.meters.values():
        meter.reset()
    self.roi_cm.reset()
    self.rpn_cm.reset()
def get_meter_data(self):
    """Return a dict mapping each meter name to element 0 of its ``value()``."""
    snapshot = {}
    for name, meter in self.meters.items():
        snapshot[name] = meter.value()[0]
    return snapshot
6._smooth_l1_loss函数就是计算smooth_l1损失。
def _smooth_l1_loss(x, t, in_weight, sigma): # x代表预测,t代表真值,in_weight代表权重
sigma2 = sigma ** 2
diff = in_weight * (x - t) # 被标定为背景的类的权重设置为0(忽略背景类),这也就是为什么计算位置的损失函数还要传入真实label作为参数的原因
abs_diff = diff.abs()
flag = (abs_diff.data < (1. / sigma2)).float()
y = (flag * (sigma2 / 2.) * (diff ** 2) +
(1 - flag) * (abs_diff - 0.5 / sigma2))
return y.sum()
7._fast_rcnn_loc_loss(pred_loc,gt_loc,gt_label,sigma)函数用于计算位置损失。
def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    """Localisation loss over the entries whose label is positive.

    Args:
        pred_loc: predicted location offsets.
        gt_loc: target location offsets, same shape as ``pred_loc``.
        gt_label: one label per row of ``gt_loc``. Rows with label > 0
            contribute to the loss; rows with label >= 0 count towards the
            normaliser.
        sigma: forwarded to ``_smooth_l1_loss``.

    Returns:
        The smooth-L1 loss summed over positive rows, divided by the number
        of rows with label >= 0.
    """
    # Build the weights on the same device as the targets instead of
    # hard-coding CUDA, so the loss also works on CPU-only machines.
    device = gt_loc.device
    in_weight = t.zeros(gt_loc.shape, device=device)

    # Localization loss is calculated only for positive rois.
    # NOTE: unlike the original implementation, inside_weight and
    # outside_weight are not needed; they can be derived from gt_label.
    positive = (gt_label > 0).view(-1, 1).expand_as(in_weight).to(device)
    in_weight[positive] = 1

    loc_loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight.detach(), sigma)

    # Normalize by the total number of negative and positive rois
    # (label >= 0). There is no guard here for that count being zero.
    loc_loss /= ((gt_label >= 0).sum().float())
    return loc_loss
二.再来看train.py。
1.eval函数是用来评估预测结果好坏的函数。
from __future__ import absolute_import import os import ipdb import matplotlib from tqdm import tqdm from utils.config import opt from data.dataset import Dataset, TestDataset, inverse_normalize from model import FasterRCNNVGG16 from torch.utils import data as data_ from trainer import FasterRCNNTrainer from utils import array_tool as at from utils.vis_tool import visdom_bbox from utils.eval_tool import eval_detection_voc # fix for ulimit # https://github.com/pytorch/pytorch/issues/973#issuecomment-346405667 import resource rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) resource.setrlimit(resource.RLIMIT_NOFILE, (20480, rlimit[1])) matplotlib.use('agg') def eval(dataloader, faster_rcnn, test_num=10000): pred_bboxes, pred_labels, pred_scores = list(), list(), list() gt_bboxes, gt_labels, gt_difficults = list(), list(), list() # 定义了预测和真实的框的位置,类别和分数的列表 for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in tqdm(enumerate(dataloader)) #读取数据 sizes = [sizes[0][0].item(), sizes[1][0].item()] pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes]) # 利用faster_rcnn.predict预测bbox的位置,label以及分数。 gt_bboxes += list(gt_bboxes_.numpy()) # 添加预测值和真实值到列表中 gt_labels += list(gt_labels_.numpy()) gt_difficults += list(gt_difficults_.numpy()) pred_bboxes += pred_bboxes_ pred_labels += pred_labels_ pred_scores += pred_scores_ if ii == test_num: break # 迭代次=test_num就跳出循环 result = eval_detection_voc( pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, gt_difficults, use_07_metric=True) # 接收列表参数,得到预测结果 return result
2.train(**kwargs)函数是整个网络的训练部分,这部分一定要弄明白。
def train(**kwargs):
    """Train Faster R-CNN, evaluating and checkpointing once per epoch.

    Args:
        **kwargs: option overrides, parsed into the global ``opt``.
    """
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True
                                       )
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)
    trainer.vis.text(dataset.db.label_names, win='labels')

    best_map = 0
    # Stays None until a checkpoint is written, so the reload at epoch 9
    # can be skipped instead of failing on an unbound name.
    best_path = None
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                # Drop into the debugger when the debug file exists.
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())

        # Evaluate on the test set once the epoch's training loop is done.
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
                                                  str(eval_result['map']),
                                                  str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        # Keep a checkpoint of the best-scoring model so far.
        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)

        # After the tenth epoch (index 9): reload the best checkpoint, if
        # one exists, and scale the learning rate by opt.lr_decay.
        if epoch == 9:
            if best_path is not None:
                trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay

        # Stop after the fourteenth epoch (index 13).
        if epoch == 13:
            break
至此,对Faster RCNN的理解结束了,作为一个刚开始研究目标检测的学生,在结合别人对Faster RCNN的理解后终于自己基本看明白了代码,希望我和大家的科研道路一切顺利~~
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。