赞
踩
论文地址:CFPNet: Channel-wise Feature Pyramid for Real-Time Semantic Segmentation
代码地址: https://github.com/chukai123/CFPNet
目录
本文为了在性能、模型尺寸和推断速度之间取得更好的平衡,提出了channel-wise feature pyramid(CFP)模块。并且基于CFP模块,构建了CFPNet用于实时语义分割,其中采用了一系列dilated(空洞)卷积通道来提取有效特征。
在Cityscapes数据集中,CFPNet取得了70.1%的class-wise mIoU,并且只有0.55 M(约55万)参数和2.5 MB内存。对于1024×2048像素的图像,推断速度可以在单个RTX 2080Ti GPU上达到30 FPS。
class-wise mIoU:在cityscapes中,class表示19个小类别,而mIoU表示先计算每个类别的IoU,然后对所有类别的IoU取平均即可;
category-wise (mIoU):一般是指大类别;
mIoU:针对语义分割的一个评估指标,平均交并比Mean Intersection over Union:
从上图可以看出,IoU就是预测区域与真实区域的交集(重叠的蓝色部分)与二者并集(两块区域之和减去重叠部分)的比值,然后对每个类别的IoU取平均即可得到mIoU。公式如下:$$\mathrm{mIoU}=\frac{1}{K}\sum_{i=1}^{K}\frac{|P_i\cap G_i|}{|P_i\cup G_i|}$$
其中K表示类别个数,P表示预测集,G表示真实集;
mIoU代码实现:
(1)可以看作是分类任务,借助混淆矩阵计算mIoU
- from sklearn.metrics import confusion_matrix
- import numpy as np
-
def miou(y_true, y_pred):
    """Compute mean IoU by treating segmentation as a classification task.

    Args:
        y_true: ground-truth class labels, passed to ``confusion_matrix``.
        y_pred: predicted class labels, passed to ``confusion_matrix``.

    Returns:
        The mean over classes of TP / (TP + FP + FN).
    """
    cm = confusion_matrix(y_true, y_pred)
    # The diagonal holds the number of correctly predicted samples per class.
    tp = np.diag(cm)
    # Column sums minus the diagonal: predicted as the class, labelled
    # otherwise (sklearn puts true labels on rows, predictions on columns).
    fp = cm.sum(axis=0) - tp
    # Row sums minus the diagonal: labelled as the class, predicted otherwise.
    fn = cm.sum(axis=1) - tp
    # eps keeps the denominator non-zero for a class with no samples at all.
    return np.mean(tp / (fn + fp + tp + np.finfo(float).eps))
(2)numpy计算版本:可以参考https://github.com/dilligencer-zrj/code_zoo/blob/master/compute_mIOU
- #设标签宽W,长H
def fast_hist(a, b, n):
    """Build the n x n confusion histogram for one label/prediction pair.

    Args:
        a: ground-truth labels flattened to 1-D, shape (H*W,).
        b: predicted labels flattened to 1-D, shape (H*W,).
        n: number of classes.

    Returns:
        An (n, n) array whose entry [i, j] counts the pixels labelled ``i``
        and predicted ``j``, restricted to the pixels selected by the mask.
    """
    # Keep only labels in 1..n-1: label 0 is assumed to be background, and a
    # label >= n would produce a flat index >= n**2 and break the reshape.
    k = (a > 0) & (a < n)
    # Encode each (label, prediction) pair as a single flat index and count.
    flat_index = n * a[k].astype(int) + b[k]
    return np.bincount(flat_index, minlength=n ** 2).reshape(n, n)
-
-
def per_class_iu(hist):
    """Compute the IoU of every class from a confusion histogram.

    Args:
        hist: (n, n) confusion histogram.

    Returns:
        Array of shape (n,) holding diag / (row sum + column sum - diag) for
        each class. A class with an all-zero row and column yields NaN.
    """
    intersection = np.diag(hist)
    union = hist.sum(1) + hist.sum(0) - intersection
    # 0 / 0 for an absent class is deliberately left as NaN; only the
    # accompanying RuntimeWarning is silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        return intersection / union
-
def compute_mIoU(pred, label, n_classes=None):
    """Print per-class IoU and the mean IoU for one prediction/label pair.

    Args:
        pred: predicted label map; flattened before use.
        label: ground-truth label map; flattened before use.
        n_classes: number of classes. When omitted, ``args.num_class`` is
            read from the module-level ``args`` at call time.

    Returns:
        The per-class IoU array returned by ``per_class_iu``.
    """
    if n_classes is None:
        # Resolved lazily so ``args`` only has to exist when the function is
        # called, not when it is defined.
        n_classes = args.num_class

    # The histogram starts at zero with shape (n_classes, n_classes).
    hist = np.zeros((n_classes, n_classes))
    # Accumulate the n_classes x n_classes histogram of this image.
    hist += fast_hist(label.flatten(), pred.flatten(), n_classes)

    mIoUs = per_class_iu(hist)
    # Report every class individually, as a percentage.
    for ind_class in range(n_classes):
        print(str(round(mIoUs[ind_class] * 100, 2)))
    # nanmean skips the NaN entries when averaging over classes.
    print('===> mIoU: ' + str(round(np.nanmean(mIoUs) * 100, 2)))
    return mIoUs
data:image/s3,"s3://crabby-images/deb9d/deb9d52e6c78f73fbfaadc6e519fd00d286664e1" alt=""
FPS:frames per second,即检测器每秒能处理的图像张数,也就是算法每秒钟能给出多少张图片的结果。一般fps>=30就认为具有实时性。fps越高表示效率越高,省机器,省钱。
- # Copyright (c) OpenMMLab. All rights reserved.
- import argparse
- import copy
- import os
- import time
-
- import torch
- from mmcv import Config, DictAction
- from mmcv.cnn import fuse_conv_bn
- from mmcv.parallel import MMDistributedDataParallel
- from mmcv.runner import init_dist, load_checkpoint, wrap_fp16_model
-
- from mmdet.datasets import (build_dataloader, build_dataset,
- replace_ImageToTensor)
- from mmdet.models import build_detector
- from mmdet.utils import update_data_root
-
-
def parse_args():
    """Parse the benchmark command line and export the local rank.

    Returns:
        The parsed ``argparse.Namespace`` with ``config``, ``checkpoint``,
        ``repeat_num``, ``max_iter``, ``log_interval``, ``fuse_conv_bn``,
        ``cfg_options``, ``launcher`` and ``local_rank``.
    """
    parser = argparse.ArgumentParser(description='MMDet benchmark a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--repeat-num',
        type=int,
        default=1,
        help='number of repeat times of measurement for averaging the results')
    parser.add_argument(
        '--max-iter', type=int, default=2000, help='num of max iter')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        # Trailing space added: the two fragments used to join as
        # "increasethe".
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    # Only fill in LOCAL_RANK when the environment has not already set it.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args
-
-
def measure_inference_speed(cfg, checkpoint, max_iter, log_interval,
                            is_fuse_conv_bn):
    """Time single-image inference of a detector and return the FPS.

    Args:
        cfg: loaded config; ``cfg.model`` and ``cfg.data.test`` are modified
            in place.
        checkpoint: checkpoint path handed to ``load_checkpoint``.
        max_iter: iteration count at which the overall result is printed and
            the loop stops.
        log_interval: an intermediate result is printed every this many
            iterations, once past the warm-up.
        is_fuse_conv_bn: when true, the model goes through ``fuse_conv_bn``.

    Returns:
        The most recently computed FPS value; 0 if none was computed.
    """
    # Honour the cudnn_benchmark switch from the config.
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.model.pretrained = None
    cfg.data.test.test_mode = True

    # Data: one sample per step, loaded in the main process.
    configured_batch = cfg.data.test.pop('samples_per_gpu', 1)
    if configured_batch > 1:
        # Replace 'ImageToTensor' to 'DefaultFormatBundle'
        cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
    test_set = build_dataset(cfg.data.test)
    # workers_per_gpu stays at 0: extra worker processes compete for CPU and
    # make the FPS statistics less stable.
    loader = build_dataloader(
        test_set,
        samples_per_gpu=1,
        workers_per_gpu=0,
        dist=True,
        shuffle=False)

    # Model: build, optionally wrap for fp16, load weights, optionally fuse.
    cfg.model.train_cfg = None
    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
    if cfg.get('fp16', None) is not None:
        wrap_fp16_model(model)
    load_checkpoint(model, checkpoint, map_location='cpu')
    if is_fuse_conv_bn:
        model = fuse_conv_bn(model)

    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False)
    model.eval()

    # The first few iterations may be very slow, so they are not timed.
    warmup_iters = 5
    timed_seconds = 0
    fps = 0

    for i, batch in enumerate(loader):

        # Synchronise on both sides of the forward pass so the measured
        # interval covers the GPU work.
        torch.cuda.synchronize()
        tic = time.perf_counter()

        with torch.no_grad():
            model(return_loss=False, rescale=True, **batch)

        torch.cuda.synchronize()
        took = time.perf_counter() - tic

        if i >= warmup_iters:
            timed_seconds += took
            if (i + 1) % log_interval == 0:
                fps = (i + 1 - warmup_iters) / timed_seconds
                print(
                    f'Done image [{i + 1:<3}/ {max_iter}], '
                    f'fps: {fps:.1f} img / s, '
                    f'times per image: {1000 / fps:.1f} ms / img',
                    flush=True)

        if (i + 1) == max_iter:
            fps = (i + 1 - warmup_iters) / timed_seconds
            print(
                f'Overall fps: {fps:.1f} img / s, '
                f'times per image: {1000 / fps:.1f} ms / img',
                flush=True)
            break
    return fps
-
-
def repeat_measure_inference_speed(cfg,
                                   checkpoint,
                                   max_iter,
                                   log_interval,
                                   is_fuse_conv_bn,
                                   repeat_num=1):
    """Run the speed measurement ``repeat_num`` times and summarise it.

    Args:
        cfg: loaded config; each run receives its own deep copy.
        checkpoint: checkpoint path forwarded to each run.
        max_iter: forwarded to each run.
        log_interval: forwarded to each run.
        is_fuse_conv_bn: forwarded to each run.
        repeat_num: number of runs; must be at least 1.

    Returns:
        The single FPS value when ``repeat_num`` is 1, otherwise the list of
        FPS values from all runs.
    """
    assert repeat_num >= 1

    # Every run works on a fresh copy because the measurement modifies the
    # config it is given.
    fps_list = [
        measure_inference_speed(
            copy.deepcopy(cfg), checkpoint, max_iter, log_interval,
            is_fuse_conv_bn) for _ in range(repeat_num)
    ]

    if repeat_num <= 1:
        return fps_list[0]

    # Round first, then average the rounded values.
    fps_list_ = [round(fps, 1) for fps in fps_list]
    times_pre_image_list_ = [round(1000 / fps, 1) for fps in fps_list]
    mean_fps_ = sum(fps_list_) / len(fps_list_)
    mean_times_pre_image_ = (
        sum(times_pre_image_list_) / len(times_pre_image_list_))
    print(
        f'Overall fps: {fps_list_}[{mean_fps_:.1f}] img / s, '
        f'times per image: '
        f'{times_pre_image_list_}[{mean_times_pre_image_:.1f}] ms / img',
        flush=True)
    return fps_list
-
-
def main():
    """Load the config, initialise distributed mode and run the benchmark.

    Raises:
        NotImplementedError: when the launcher is ``'none'``.
    """
    args = parse_args()
    cfg = Config.fromfile(args.config)

    # update data root according to MMDET_DATASETS
    update_data_root(cfg)

    # Command-line overrides are merged into the config when given.
    overrides = args.cfg_options
    if overrides is not None:
        cfg.merge_from_dict(overrides)

    # Only distributed launchers are accepted.
    if args.launcher == 'none':
        raise NotImplementedError('Only supports distributed mode')
    init_dist(args.launcher, **cfg.dist_params)

    repeat_measure_inference_speed(
        cfg,
        args.checkpoint,
        args.max_iter,
        args.log_interval,
        args.fuse_conv_bn,
        args.repeat_num)


if __name__ == '__main__':
    main()
data:image/s3,"s3://crabby-images/deb9d/deb9d52e6c78f73fbfaadc6e519fd00d286664e1" alt=""
多尺度卷积:
多尺度卷积层就是用不同大小的卷积核对某一时刻所得到的特征图进行卷积操作,得到新的大小不同的特征图,之后针对不同大小的特征图上采样到输入特征图的大小。也就是说,多尺度卷积层不会改变原有特征图的大小,只是通过不同卷积核的卷积操作,丰富了图像的特征,从全局的视角对图像中的感兴趣的特征信息进行编码解码,进而提高图像的分割性能。
参考:https://zhuanlan.zhihu.com/p/451122397
dilated convolution:空洞卷积
具有单一扩张率的空洞卷积可以提取全局信息,但可能会丢失局部特征。许多模型都采用空洞卷积来构建空间特征金字塔来提取多尺度特征。本文在CFP模块的每个通道中都采用了空洞卷积。
具有dilation rate为r的n×n的空洞卷积核的有效大小为:$[r(n-1)+1]\times[r(n-1)+1]$。
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/不正经/article/detail/346049
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。