mmdetection supports multi-GPU training in two modes: distributed and non-distributed. The distributed mode is the officially recommended one.
Let's start with the distributed mode. mmdetection launches it through tools/dist_train.sh, which is used as follows:
./tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments]
# CONFIG_FILE is the model config file, e.g. ./configs/faster_rcnn_r50_fpn_1x.py
# GPU_NUM is the number of GPUs to use
# optional arguments include "--validate", which evaluates on the val dataset during training
By default, --validate runs validation once per epoch. To change that, add the following to the config file and set interval to the desired number of epochs between validations:
# e.g. add the following to ./configs/faster_rcnn_r50_fpn_1x.py
evaluation = dict(interval=1)
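Since tools/train.py (shown further below) loads the config with Config.fromfile, a quick way to confirm the key is picked up is to load it the same way. This is a minimal sketch, assuming mmcv is installed and the script is run from the mmdetection root:

from mmcv import Config

cfg = Config.fromfile('./configs/faster_rcnn_r50_fpn_1x.py')
# falls back to the default if the 'evaluation' key was not added to the config
print(cfg.get('evaluation', dict(interval=1)))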
Opening dist_train.sh shows that it still just calls tools/train.py.
However, dist_train.sh kept hanging on my machine and I never figured out why, so I tried the non-distributed mode instead.
The non-distributed mode simply calls tools/train.py directly, in the following form:
python tools/train.py ${CONFIG_FILE}
# CONFIG_FILE is the model config file, e.g. ./configs/faster_rcnn_r50_fpn_1x.py
A few things are worth noting about what happens under the hood. The walkthrough below follows the call chain tools/train.py → mmdet/apis/train.py → mmcv/runner/runner.py. To understand the training process, we first need to look carefully at tools/train.py.
# tools/train.py
def main():
    args = parse_args()

    cfg = Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    cfg.gpus = args.gpus

    if args.autoscale_lr:
        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
        cfg.optimizer['lr'] = cfg.optimizer['lr'] * cfg.gpus / 8

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # init logger before other steps
    logger = get_root_logger(cfg.log_level)
    logger.info('Distributed training: {}'.format(distributed))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)

    # build the model; cfg.model holds the model parameters loaded from CONFIG_FILE
    model = build_detector(
        cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)

    # build the training dataset(s)
    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) == 2:
        datasets.append(build_dataset(cfg.data.val))
    if cfg.checkpoint_config is not None:
        # save mmdet version, config file content and class names in
        # checkpoints as meta data
        cfg.checkpoint_config.meta = dict(
            mmdet_version=__version__,
            config=cfg.text,
            CLASSES=datasets[0].CLASSES)
    # add an attribute for visualization convenience
    model.CLASSES = datasets[0].CLASSES
    # start training
    train_detector(
        model,
        datasets,
        cfg,
        distributed=distributed,
        validate=args.validate,
        logger=logger)
So tools/train.py takes care of building the model and loading the datasets, and then hands everything to train_detector for the actual training.
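Those building blocks can also be exercised interactively. Below is a minimal sketch (assuming an mmdet v1.x-era install and that the dataset paths in the config exist locally; the config path is just the example used above) that builds the detector and training dataset the same way main() does:

from mmcv import Config
from mmdet.models import build_detector
from mmdet.datasets import build_dataset

cfg = Config.fromfile('./configs/faster_rcnn_r50_fpn_1x.py')
# build the detector from the 'model' section of the config
model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
# build the training dataset from the 'data.train' section
dataset = build_dataset(cfg.data.train)
print(type(model).__name__, len(dataset))

With that overview in place, let's look at what train_detector does next.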
#/mmdet/apis/train.py
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   logger=None):
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # start training
    if distributed:
        _dist_train(model, dataset, cfg, validate=validate)
    else:
        _non_dist_train(model, dataset, cfg, validate=validate)
As you can see, train_detector merely dispatches between distributed and non-distributed training. Since we chose the non-distributed mode, we go straight to _non_dist_train.
# /mmdet/apis/train.py
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    # start training
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
As shown, _non_dist_train builds the data loaders, the optimizer and the Runner; the Runner exists to train the network, and training is kicked off with runner.run. Next let's look at runner.run, which lives in the mmcv dependency.
# mmcv/runner/runner.py
class Runner(object):

    def run(self, data_loaders, workflow, max_epochs, **kwargs):
        """Start running.

        Args:
            data_loaders (list[:obj:`DataLoader`]): Dataloaders for training
                and validation.
            workflow (list[tuple]): A list of (phase, epochs) to specify the
                running order and epochs. E.g, [('train', 2), ('val', 1)]
                means running 2 epochs for training and 1 epoch for
                validation, iteratively.
            max_epochs (int): Total training epochs.
        """
        ...  # some code omitted
        while self.epoch < max_epochs:
            for i, flow in enumerate(workflow):
                mode, epochs = flow
                if isinstance(mode, str):  # self.train()
                    if not hasattr(self, mode):
                        raise ValueError(
                            'runner has no method named "{}" to run an epoch'.
                            format(mode))
                    # mode is either 'train' or 'val'; point epoch_runner at
                    # the corresponding method
                    epoch_runner = getattr(self, mode)
                elif callable(mode):  # custom train()
                    epoch_runner = mode
                else:
                    raise TypeError('mode in workflow must be a str or '
                                    'callable function, not {}'.format(
                                        type(mode)))
                for _ in range(epochs):
                    if mode == 'train' and self.epoch >= max_epochs:
                        return
                    epoch_runner(data_loaders[i], **kwargs)

        time.sleep(1)  # wait for some hooks like loggers to finish
        self.call_hook('after_run')

    def train(self, data_loader, **kwargs):
        self.model.train()
        self.mode = 'train'
        self.data_loader = data_loader
        self._max_iters = self._max_epochs * len(data_loader)
        self.call_hook('before_train_epoch')
        for i, data_batch in enumerate(data_loader):
            self._inner_iter = i
            self.call_hook('before_train_iter')
            # call batch_processor to run the forward pass
            outputs = self.batch_processor(
                self.model, data_batch, train_mode=True, **kwargs)
            if not isinstance(outputs, dict):
                raise TypeError('batch_processor() must return a dict')
            if 'log_vars' in outputs:
                self.log_buffer.update(outputs['log_vars'],
                                       outputs['num_samples'])
            self.outputs = outputs
            self.call_hook('after_train_iter')
            self._iter += 1

        self.call_hook('after_train_epoch')
        self._epoch += 1
Having read the Runner code, two questions deserve attention: first, what is batch_processor and where does it come from; second, what does workflow mean and where is it defined.
Let's tackle the first question.
batch_processor is passed to Runner as a constructor argument, so we search ./mmdet/apis/train.py, where the Runner is created. It turns out batch_processor is just a function defined in mmdet/apis/train.py:
def batch_processor(model, data, train_mode):
    losses = model(**data)
    loss, log_vars = parse_losses(losses)

    outputs = dict(
        loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))

    return outputs
As you can see, this function feeds data into the forward pass via model(**data); anyone familiar with PyTorch will recognize this pattern immediately.
TODO: dig into the code behind model(**data)
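For reference, parse_losses (defined in the same mmdet/apis/train.py) does roughly the following. This is a paraphrased sketch of its logic, not a verbatim copy; consult the file itself for the exact code:

from collections import OrderedDict
import torch

def parse_losses(losses):
    # 'losses' is the dict returned by model(**data); each value is either a
    # tensor or a list of tensors (one per feature level)
    log_vars = OrderedDict()
    for loss_name, loss_value in losses.items():
        if isinstance(loss_value, torch.Tensor):
            log_vars[loss_name] = loss_value.mean()
        elif isinstance(loss_value, list):
            log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
        else:
            raise TypeError(
                '{} is not a tensor or list of tensors'.format(loss_name))

    # the total loss is the sum of every entry whose name contains 'loss'
    loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key)

    log_vars['loss'] = loss
    for name in log_vars:
        log_vars[name] = log_vars[name].item()

    return loss, log_vars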
Now for the second question:
workflow is defined in CONFIG_FILE; look for the entry:
workflow = [('train', 1)]
This means each cycle consists of one training epoch. It can be changed, for example:
workflow = [('train', 2), ('val', 1)]  # each cycle runs 2 train epochs followed by 1 val epoch
Note from tools/train.py above that the val dataset is only built when len(cfg.workflow) == 2, so adding a 'val' phase requires cfg.data.val to be set in the config.
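To tie Runner, batch_processor and workflow together, here is a self-contained toy sketch of the old-style mmcv Runner pattern used above. The linear model, random data, toy batch processor and './toy_work_dir' path are my own stand-ins (not mmdetection code), and an mmcv version with the Runner signature shown in _non_dist_train (model, batch_processor, optimizer, work_dir, log_level) is assumed:

import logging
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from mmcv.runner import Runner

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

def toy_batch_processor(model, data, train_mode):
    # same contract as mmdet's batch_processor: return a dict with 'loss'
    x, y = data
    loss = nn.functional.mse_loss(model(x), y)
    return dict(loss=loss, log_vars=dict(loss=loss.item()), num_samples=x.size(0))

loader = DataLoader(
    TensorDataset(torch.randn(32, 10), torch.randn(32, 1)), batch_size=8)

runner = Runner(model, toy_batch_processor, optimizer, './toy_work_dir',
                log_level=logging.INFO)
# a fixed learning rate plus a plain OptimizerHook, which calls loss.backward()
# and optimizer.step() after every training iteration
runner.register_training_hooks(
    lr_config=dict(policy='fixed'), optimizer_config=dict(grad_clip=None))
# run 2 epochs with the same [('train', 1)] workflow as the config above
runner.run([loader], [('train', 1)], 2)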
Testing involves much less code, so I will cover it right here.
Testing also comes in two modes, distributed and non-distributed, and is used much like training, except that one extra argument is needed: the checkpoint with the network weights to load. The two invocations are:
# single-gpu testing
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show]
# multi-gpu testing
./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}]
Next, let's look at what tools/test.py contains:
def single_gpu_test(model, data_loader, show=False):
    model.eval()
    results = []
    dataset = data_loader.dataset
    prog_bar = mmcv.ProgressBar(len(dataset))
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            result = model(return_loss=False, rescale=not show, **data)
        results.append(result)

        if show:
            model.module.show_result(data, result)

        batch_size = data['img'][0].size(0)
        for _ in range(batch_size):
            prog_bar.update()
    return results


def main():
    ...  # argument parsing and config loading omitted
    # build the dataloader
    # TODO: support multiple images per gpu (only minor changes are needed)
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        imgs_per_gpu=1,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # build the model and load checkpoint
    model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
    # old versions did not save class info in checkpoints, this walkaround is
    # for backward compatibility
    if 'CLASSES' in checkpoint['meta']:
        model.CLASSES = checkpoint['meta']['CLASSES']
    else:
        model.CLASSES = dataset.CLASSES

    if not distributed:
        model = MMDataParallel(model, device_ids=[0])
        outputs = single_gpu_test(model, data_loader, args.show)
    else:
        model = MMDistributedDataParallel(model.cuda())
        outputs = multi_gpu_test(model, data_loader, args.tmpdir)
    ...  # saving outputs and evaluation code omitted
As you can see, main uses single_gpu_test to run the forward pass, and single_gpu_test calls model(return_loss=False, rescale=not show, **data), which mirrors the batch_processor function in mmdet/apis/train.py.
One thing to note: during testing, the current code only supports forwarding one image per GPU. (Honestly this puzzles me; training supports multiple images per GPU, and I don't think it would be hard to do here as well.)
TODO: see whether testing multiple images per GPU can be supported.
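Related to this, for quick single-image inference outside of tools/test.py, mmdet v1.x also ships a high-level API. A minimal sketch, assuming that API is available and using placeholder checkpoint and image paths:

from mmdet.apis import init_detector, inference_detector

config_file = './configs/faster_rcnn_r50_fpn_1x.py'
checkpoint_file = './checkpoints/faster_rcnn_r50_fpn_1x.pth'  # placeholder path
model = init_detector(config_file, checkpoint_file, device='cuda:0')
# per-image result in the same per-class bbox format that single_gpu_test collects
result = inference_detector(model, 'demo.jpg')  # placeholder image path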