论文地址:CFPNet: Channel-wise Feature Pyramid for Real-Time Semantic Segmentation
代码地址: https://github.com/chukai123/CFPNet
本文为了实现更好的性能、模型尺寸和推断速度,提出了channel-wise feature pyramid(CFP)模块。并且基于CFP模块,构建了CFPNet用于实时语义分割,其中采用了一系列dilated卷积通道来提取有效特征。
在Cityscapes数据集中,CFPNet取得了70.1%的class-wise mIoU,并且只有0.55 M(约55万)参数和2.5 MB内存。推断速度可以在单个RTX 2080Ti GPU上达到30 FPS,图像为1024×2048像素。
class-wise mIoU:在cityscapes中,class表示19个小类别,而mIoU表示先计算每个类别的IoU,然后对所有类别的IoU取平均即可;
category-wise (mIoU):一般是指大类别;
mIoU:针对语义分割的一个评估指标,平均交并比Mean Intersection over Union:
- from sklearn.metrics import confusion_matrix
- import numpy as np
def miou(y_true, y_pred):
    """Return the mean intersection-over-union across classes.

    Args:
        y_true: Ground-truth class labels, passed to ``confusion_matrix``.
        y_pred: Predicted class labels, passed to ``confusion_matrix``.

    Returns:
        The mean over classes of ``TP / (TP + FP + FN)``.
    """
    conf_mat = confusion_matrix(y_true, y_pred)
    # Diagonal entries are the correctly predicted samples of each class.
    tp = np.diag(conf_mat)
    # Column sums minus the diagonal / row sums minus the diagonal.
    # NOTE(review): assumes rows are true labels and columns are predictions,
    # as in scikit-learn's confusion_matrix -- confirm against its docs.
    fp = conf_mat.sum(axis=0) - tp
    fn = conf_mat.sum(axis=1) - tp
    # The machine epsilon keeps the division finite for a class that has no
    # true positives, false positives or false negatives.
    return np.mean(tp / (fn + fp + tp + np.finfo(float).eps))
# Let the label map have width W and height H.
def fast_hist(a, b, n):
    """Build an ``n x n`` confusion histogram from flat labels and predictions.

    Args:
        a: 1-D array of ground-truth labels, shape ``(H * W,)``.
        b: 1-D array of predicted labels, same shape as ``a``.
        n: Number of classes.

    Returns:
        An ``(n, n)`` integer array whose entry ``[i, j]`` counts the pixels
        with label ``i`` and prediction ``j``.
    """
    # Keep only pixels whose label is a class to be scored. Label 0 is treated
    # as background and skipped. Labels must stay below ``n``: a label equal
    # to ``n`` would produce a flat index >= n ** 2 and break the reshape.
    k = (a > 0) & (a < n)
    # Encode each (label, prediction) pair as one flat index, count the
    # occurrences, then fold the counts back into an n x n matrix.
    return np.bincount(
        n * a[k].astype(int) + b[k], minlength=n ** 2).reshape(n, n)
def per_class_iu(hist):
    """Compute the IoU of every class from a confusion histogram.

    Args:
        hist: Square array of shape ``(n, n)``.

    Returns:
        Array of shape ``(n,)``: the diagonal divided by
        (row sum + column sum - diagonal) for each class.
    """
    # Diagonal = pixels where the two class indices agree.
    intersection = np.diag(hist)
    # Row total plus column total counts the diagonal twice, so remove it once.
    union = hist.sum(1) + hist.sum(0) - intersection
    return intersection / union
def compute_mIoU(pred, label, n_classes=args.num_class):
    """Print per-class IoU and the mean IoU for one prediction/label pair.

    Args:
        pred: Predicted label map; flattened before use.
        label: Ground-truth label map; flattened before use.
        n_classes: Number of classes. Defaults to ``args.num_class``, read
            when this function is defined.

    Returns:
        Array of per-class IoU values as returned by ``per_class_iu``.
    """
    # Start from an all-zero n_classes x n_classes histogram and accumulate
    # the counts of this image into it.
    hist = np.zeros((n_classes, n_classes))
    hist += fast_hist(label.flatten(), pred.flatten(), n_classes)
    # Per-class IoU values.
    mIoUs = per_class_iu(hist)
    # Report every class as a percentage rounded to two decimals.
    for ind_class in range(n_classes):
        print(str(round(mIoUs[ind_class] * 100, 2)))
    # Mean over classes; NaN entries are ignored by nanmean.
    print('===> mIoU: ' + str(round(np.nanmean(mIoUs) * 100, 2)))
    return mIoUs

FPS:frames per second,检测器每秒能处理的图像张数。就是跟踪算法每秒钟给出多少张图片的跟踪结果。实时性一般fps>=30就表示具有实时性了。fps越高表示效率越高。省机器,省钱。
- # Copyright (c) OpenMMLab. All rights reserved.
- import argparse
- import copy
- import os
- import time
- import torch
- from mmcv import Config, DictAction
- from mmcv.cnn import fuse_conv_bn
- from mmcv.parallel import MMDistributedDataParallel
- from mmcv.runner import init_dist, load_checkpoint, wrap_fp16_model
- from mmdet.datasets import (build_dataloader, build_dataset,
- replace_ImageToTensor)
- from mmdet.models import build_detector
- from mmdet.utils import update_data_root
def parse_args():
    """Parse the benchmark command-line arguments.

    Besides parsing, this copies ``--local_rank`` into the ``LOCAL_RANK``
    environment variable when that variable is not already set.

    Returns:
        The ``argparse.Namespace`` produced by the parser.
    """
    parser = argparse.ArgumentParser(description='MMDet benchmark a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--repeat-num',
        type=int,
        default=1,
        help='number of repeat times of measurement for averaging the results')
    parser.add_argument(
        '--max-iter', type=int, default=2000, help='num of max iter')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        # The trailing space keeps the two concatenated literals from
        # running together as "increasethe".
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    # Export the local rank only when the environment has not set it already.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args
def measure_inference_speed(cfg, checkpoint, max_iter, log_interval,
                            is_fuse_conv_bn):
    """Time single-image inference of a detector and return its FPS.

    Args:
        cfg: Loaded config; its ``model`` and ``data.test`` entries are
            modified in place before the dataset and the model are built.
        checkpoint: Checkpoint passed to ``load_checkpoint``.
        max_iter: Iteration count at which the measurement stops.
        log_interval: An intermediate FPS line is printed every this many
            iterations once the warm-up iterations are over.
        is_fuse_conv_bn: When truthy, the model is passed through
            ``fuse_conv_bn`` after the checkpoint is loaded.

    Returns:
        The most recently computed FPS value; it remains 0 if the loader is
        exhausted before any FPS value has been computed.
    """
    # Honour the cudnn_benchmark switch from the config.
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.model.pretrained = None
    cfg.data.test.test_mode = True

    # Build the test dataloader. A configured batch size above 1 requires
    # swapping 'ImageToTensor' for 'DefaultFormatBundle' in the pipeline.
    if cfg.data.test.pop('samples_per_gpu', 1) > 1:
        cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
    loader = build_dataloader(
        build_dataset(cfg.data.test),
        samples_per_gpu=1,
        # Extra worker processes compete for CPU and make the FPS numbers
        # less stable, so data loading uses 0 workers.
        workers_per_gpu=0,
        dist=True,
        shuffle=False)

    # Build the model and load the checkpoint.
    cfg.model.train_cfg = None
    detector = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
    if cfg.get('fp16', None) is not None:
        wrap_fp16_model(detector)
    load_checkpoint(detector, checkpoint, map_location='cpu')
    if is_fuse_conv_bn:
        detector = fuse_conv_bn(detector)

    detector = MMDistributedDataParallel(
        detector.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False)
    detector.eval()

    # The first few iterations may be very slow, so they are not timed.
    warmup_iters = 5
    timed_seconds = 0
    fps = 0

    # Run up to max_iter images and average the timed ones.
    for step, batch in enumerate(loader, start=1):
        # Synchronise before and after the forward pass so that pending GPU
        # work is not attributed to the wrong iteration.
        torch.cuda.synchronize()
        tick = time.perf_counter()

        with torch.no_grad():
            detector(return_loss=False, rescale=True, **batch)

        torch.cuda.synchronize()
        step_seconds = time.perf_counter() - tick

        if step > warmup_iters:
            timed_seconds += step_seconds
            if step % log_interval == 0:
                fps = (step - warmup_iters) / timed_seconds
                print(
                    f'Done image [{step:<3}/ {max_iter}], '
                    f'fps: {fps:.1f} img / s, '
                    f'times per image: {1000 / fps:.1f} ms / img',
                    flush=True)

        if step == max_iter:
            fps = (step - warmup_iters) / timed_seconds
            print(
                f'Overall fps: {fps:.1f} img / s, '
                f'times per image: {1000 / fps:.1f} ms / img',
                flush=True)
            break
    return fps
def repeat_measure_inference_speed(cfg,
                                   checkpoint,
                                   max_iter,
                                   log_interval,
                                   is_fuse_conv_bn,
                                   repeat_num=1):
    """Run the speed measurement ``repeat_num`` times and summarise it.

    Args:
        cfg: Config object; each run receives its own deep copy.
        checkpoint: Forwarded to ``measure_inference_speed``.
        max_iter: Forwarded to ``measure_inference_speed``.
        log_interval: Forwarded to ``measure_inference_speed``.
        is_fuse_conv_bn: Forwarded to ``measure_inference_speed``.
        repeat_num: Number of runs; must be at least 1.

    Returns:
        The single FPS value when ``repeat_num`` is 1, otherwise the list of
        FPS values from all runs.
    """
    assert repeat_num >= 1

    # Every run works on a fresh deep copy of the config, because the
    # measurement modifies the config it is given.
    runs = [
        measure_inference_speed(
            copy.deepcopy(cfg), checkpoint, max_iter, log_interval,
            is_fuse_conv_bn) for _ in range(repeat_num)
    ]

    if repeat_num <= 1:
        return runs[0]

    # Round each run first; the printed means are taken over rounded values.
    rounded_fps = [round(value, 1) for value in runs]
    rounded_ms = [round(1000 / value, 1) for value in runs]
    avg_fps = sum(rounded_fps) / len(rounded_fps)
    avg_ms = sum(rounded_ms) / len(rounded_ms)
    print(
        f'Overall fps: {rounded_fps}[{avg_fps:.1f}] img / s, '
        f'times per image: '
        f'{rounded_ms}[{avg_ms:.1f}] ms / img',
        flush=True)
    return runs
def main():
    """Entry point: load the config, set up distributed mode, run the benchmark.

    Raises:
        NotImplementedError: If the launcher is ``'none'``.
    """
    cli = parse_args()

    config = Config.fromfile(cli.config)
    # Update the data root according to MMDET_DATASETS.
    update_data_root(config)

    overrides = cli.cfg_options
    if overrides is not None:
        config.merge_from_dict(overrides)

    # Only distributed launchers are accepted.
    if cli.launcher == 'none':
        raise NotImplementedError('Only supports distributed mode')
    init_dist(cli.launcher, **config.dist_params)

    repeat_measure_inference_speed(config, cli.checkpoint, cli.max_iter,
                                   cli.log_interval, cli.fuse_conv_bn,
                                   cli.repeat_num)


if __name__ == '__main__':
    main()

dilated convolution:空洞卷积
具有dilation rate为r的n×n的空洞卷积核的有效大小为:[
