Hi everyone! I've been reading the DiMP code lately, but I keep forgetting the earlier parts by the time I reach the later ones. As my mom always tells me, "a good memory is no match for a worn pen," so today I've decided to write down some of my thoughts on the code. There will surely be plenty of mistakes, so please bear with me and feel free to discuss. Comments and private messages are very welcome!

Last updated: 2020-12-14

This post mainly focuses on the target-state decision logic in DiMP. I recently re-downloaded the code and found it differs slightly from the version I had before.

In pytracking/pytracking/tracker/dimp/dimp.py:
def track(self, image, info: dict = None) -> dict:
    self.debug_info = {}

    # ...... several lines of code omitted ......

    # ------- LOCALIZATION ------- #

    # Sample image patches and extract their backbone features (see the function below)
    backbone_feat, sample_coords, im_patches = self.extract_backbone_features(
        im, self.get_centered_sample_pos(),
        self.target_scale * self.params.scale_factors,
        self.img_sample_sz)

    # Extract features for the classifier: a conv layer followed by L2 normalization
    test_x = self.get_classification_features(backbone_feat)

    # Location of sample: get the center position and sampling scale (details below)
    sample_pos, sample_scales = self.get_sample_location(sample_coords)

    # Compute the classification confidence scores
    scores_raw = self.classify_target(test_x)

    # Localize the target (enters localize_target, see below)
    translation_vec, scale_ind, s, flag = self.localize_target(scores_raw, sample_pos, sample_scales)
    new_pos = sample_pos[scale_ind,:] + translation_vec  # new target center

    # Update position and scale if the target was not lost
    if flag != 'not_found':
        if self.params.get('use_iou_net', True):  # this branch is taken by default
            update_scale_flag = self.params.get('update_scale_when_uncertain', True) or flag != 'uncertain'
            if self.params.get('use_classifier', True):
                # Update the bbox center self.pos
                self.update_state(new_pos)
            # Refine the classifier output and update the state; see self.refine_target_box
            # below (for the theory, see the ATOM paper)
            self.refine_target_box(backbone_feat, sample_pos[scale_ind,:], sample_scales[scale_ind], scale_ind, update_scale_flag)
        elif self.params.get('use_classifier', True):
            self.update_state(new_pos, sample_scales[scale_ind])

    # ------- UPDATE ------- #

    # update_flag is True unless the state is 'not_found' or 'uncertain'
    update_flag = flag not in ['not_found', 'uncertain']
    hard_negative = (flag == 'hard_negative')
    learning_rate = self.params.get('hard_negative_learning_rate', None) if hard_negative else None

    # Update the classifier unless the state is 'not_found' or 'uncertain'
    if update_flag and self.params.get('update_classifier', False):
        # Get train sample: add the current sample to the training set
        train_x = test_x[scale_ind:scale_ind+1, ...]

        # Create target_box and label for spatial sample
        target_box = self.get_iounet_box(self.pos, self.target_sz, sample_pos[scale_ind,:], sample_scales[scale_ind])

        # Update the classifier model and plot the loss curve; see self.update_classifier below
        self.update_classifier(train_x, target_box, learning_rate, s[scale_ind,...])

    # Set the pos of the tracker to the IoU-Net pos. Note that self.pos is updated
    # again here, even though it was already updated inside self.refine_target_box.
    if self.params.get('use_iou_net', True) and flag != 'not_found' and hasattr(self, 'pos_iounet'):
        self.pos = self.pos_iounet.clone()

    score_map = s[scale_ind, ...]             # confidence score map
    max_score = torch.max(score_map).item()   # maximum confidence score

    # Visualize and set debug info
    self.search_area_box = torch.cat((sample_coords[scale_ind,[1,0]],
                                      sample_coords[scale_ind,[3,2]] - sample_coords[scale_ind,[1,0]] - 1))  # position and size of the search area
    self.debug_info['flag' + self.id_str] = flag            # target state flag for the current frame
    self.debug_info['max_score' + self.id_str] = max_score  # maximum confidence score

    if self.visdom is not None:
        self.visdom.register(score_map, 'heatmap', 2, 'Score Map' + self.id_str)
        self.visdom.register(self.debug_info, 'info_dict', 1, 'Status')
    elif self.params.debug >= 2:
        show_tensor(score_map, 5, title='Max score = {:.2f}'.format(max_score))

    # Compute the output bounding box for the current frame
    new_state = torch.cat((self.pos[[1,0]] - (self.target_sz[[1,0]]-1)/2, self.target_sz[[1,0]]))

    if self.params.get('output_not_found_box', False) and flag == 'not_found':
        output_state = [-1, -1, -1, -1]
    else:
        output_state = new_state.tolist()

    out = {'target_bbox': output_state}
    return out
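One detail in track() that is easy to miss: self.pos and self.target_sz are stored in [y, x] / [height, width] order, while the returned target_bbox is [x, y, w, h]. Here is a minimal standalone sketch of that conversion, with made-up values in place of the tracker state:

import torch

# Made-up stand-ins for tracker state: [y, x] center and [h, w] size
pos = torch.tensor([120.0, 200.0])       # center (row, col)
target_sz = torch.tensor([50.0, 80.0])   # size (height, width)

# Same indexing trick as in track(): [[1, 0]] swaps to (x, y) / (w, h) order
new_state = torch.cat((pos[[1, 0]] - (target_sz[[1, 0]] - 1) / 2, target_sz[[1, 0]]))
print(new_state.tolist())  # [160.5, 95.5, 80.0, 50.0] -> [x, y, w, h]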
Purpose: extract backbone features from the sampled image patches.
def extract_backbone_features(self, im: torch.Tensor, pos: torch.Tensor, scales, sz: torch.Tensor):
    """
    input:
        im: frame t
        pos: center position of the sample
        scales: sampling scales
        sz: 288 (img_sample_sz)
    output:
        im_patches: image patches sampled from frame t
        patch_coords: top-left and bottom-right coordinates of each patch
    """
    # sample_patch_multiscale lives in preprocessing.py (see below)
    im_patches, patch_coords = sample_patch_multiscale(im, pos, scales, sz,
                                                       mode=self.params.get('border_mode', 'replicate'),
                                                       max_scale_change=self.params.get('patch_max_scale_change', None))
    with torch.no_grad():
        backbone_feat = self.net.extract_backbone(im_patches)
    return backbone_feat, patch_coords, im_patches
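For intuition about the scales argument: DiMP crops a region of side target_scale * scale_factors * img_sample_sz from the frame, then resizes it back to img_sample_sz for the network. A quick standalone sketch with made-up numbers (target_scale is a stand-in value, not real tracker state):

import torch

img_sample_sz = torch.tensor([288.0, 288.0])
target_scale = 1.5                   # made-up stand-in for self.target_scale
scale_factors = torch.tensor([1.0])  # DiMP's default: a single scale factor

sample_sz = target_scale * scale_factors[0] * img_sample_sz
print(sample_sz.tolist())  # [432.0, 432.0] -> region cropped from the frame, resized to 288x288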
Purpose: sample image patches of a given size and scale around a center position.
def sample_patch_multiscale(im, pos, scales, image_sz, mode: str='replicate', max_scale_change=None):
    """Extract image patches at multiple scales.
    args:
        im: Image.
        pos: Center position for extraction.
        scales: Image scales to extract image patches from.
        image_sz: Size to resize the image samples to.
        mode: how to treat image borders: 'replicate' (default), 'inside' or 'inside_major'
        max_scale_change: maximum allowed scale change when using 'inside' and 'inside_major' mode
    """
    if isinstance(scales, (int, float)):
        scales = [scales]

    # Get image patches and their coordinates (see sample_patch below)
    patch_iter, coord_iter = zip(*(sample_patch(im, pos, s*image_sz, image_sz, mode=mode,
                                                max_scale_change=max_scale_change) for s in scales))
    im_patches = torch.cat(list(patch_iter))
    patch_coords = torch.cat(list(coord_iter))

    return im_patches, patch_coords  # returned to extract_backbone_features


def sample_patch(im: torch.Tensor, pos: torch.Tensor, sample_sz: torch.Tensor, output_sz: torch.Tensor = None,
                 mode: str = 'replicate', max_scale_change=None, is_mask=False):
    """Sample an image patch.
    args:
        im: Image.
        pos: center position of crop.
        sample_sz: size to crop.
        output_sz: size to resize to.
        mode: how to treat image borders: 'replicate' (default), 'inside' or 'inside_major'
        max_scale_change: maximum allowed scale change when using 'inside' and 'inside_major' mode
    """
    # copy and convert
    posl = pos.long().clone()

    pad_mode = mode

    # Get new sample size if forced inside the image
    if mode == 'inside' or mode == 'inside_major':
        pad_mode = 'replicate'
        im_sz = torch.Tensor([im.shape[2], im.shape[3]])
        shrink_factor = (sample_sz.float() / im_sz)
        if mode == 'inside':
            shrink_factor = shrink_factor.max()
        elif mode == 'inside_major':
            shrink_factor = shrink_factor.min()
        shrink_factor.clamp_(min=1, max=max_scale_change)
        sample_sz = (sample_sz.float() / shrink_factor).long()

    # Compute pre-downsampling factor
    if output_sz is not None:
        # ratio of the sample size to the output size
        resize_factor = torch.min(sample_sz.float() / output_sz.float()).item()
        df = int(max(int(resize_factor - 0.1), 1))  # round down
    else:
        df = int(1)

    sz = sample_sz.float() / df  # new output size

    # Do downsampling
    if df > 1:
        os = posl % df            # offset
        posl = (posl - os) // df  # new position
        im2 = im[..., os[0].item()::df, os[1].item()::df]  # downsample
    else:
        im2 = im

    # compute size to crop (rounded to the nearest integer)
    szl = torch.max(sz.round(), torch.Tensor([2])).long()

    # Derive the top-left and bottom-right coordinates from the center and size
    tl = posl - (szl - 1) // 2
    br = posl + szl // 2 + 1

    # Shift the crop to inside
    if mode == 'inside' or mode == 'inside_major':
        im2_sz = torch.LongTensor([im2.shape[2], im2.shape[3]])
        shift = (-tl).clamp(0) - (br - im2_sz).clamp(0)
        tl += shift
        br += shift

        outside = ((-tl).clamp(0) + (br - im2_sz).clamp(0)) // 2
        shift = (-tl - outside) * (outside > 0).long()
        tl += shift
        br += shift

    # Get image patch. F.pad is used as the cropping function here: when the crop lies
    # inside the image, the padding arguments (-tl[1], br[1]-W, -tl[0], br[0]-H) are all
    # negative, so the call actually "cuts into" the image rather than padding it.
    if not is_mask:
        im_patch = F.pad(im2, (-tl[1].item(), br[1].item() - im2.shape[3], -tl[0].item(), br[0].item() - im2.shape[2]), pad_mode)
    else:
        im_patch = F.pad(im2, (-tl[1].item(), br[1].item() - im2.shape[3], -tl[0].item(), br[0].item() - im2.shape[2]))

    # Get the image coordinates of the patch
    patch_coord = df * torch.cat((tl, br)).view(1, 4)

    if output_sz is None or (im_patch.shape[-2] == output_sz[0] and im_patch.shape[-1] == output_sz[1]):
        return im_patch.clone(), patch_coord

    # Resample: interpolate the patch to output_sz
    if not is_mask:
        im_patch = F.interpolate(im_patch, output_sz.long().tolist(), mode='bilinear')
    else:
        im_patch = F.interpolate(im_patch, output_sz.long().tolist(), mode='nearest')

    return im_patch, patch_coord  # the sampled patch and its top-left/bottom-right coordinates
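The pre-downsampling factor df is worth a numeric check: it strides the image cheaply before the more expensive bilinear resize. A small standalone sketch with made-up sizes:

import torch

sample_sz = torch.tensor([900.0, 900.0])  # crop size in the frame
output_sz = torch.tensor([288.0, 288.0])

resize_factor = torch.min(sample_sz / output_sz).item()  # 3.125
df = int(max(int(resize_factor - 0.1), 1))               # int(3.025) = 3
print(df, (sample_sz / df).tolist())  # 3 [300.0, 300.0] -> cheap 3x stride, then bilinear resize to 288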
Purpose: get the sample's center position and sampling scale (the ratio of the sampled region to [288, 288]).
def get_sample_location(self, sample_coord):
"""Get the location of the extracted sample."""
sample_coord = sample_coord.float()
sample_pos = 0.5*(sample_coord[:,:2] + sample_coord[:,2:] - 1)  # center of the sample: (tl + br - 1) / 2
sample_scales = ((sample_coord[:,2:] - sample_coord[:,:2]) / self.img_sample_sz).prod(dim=1).sqrt()
return sample_pos, sample_scales
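As a quick sanity check on the scale formula (made-up coordinates, not tracker output): a 432x432 region resized to 288x288 gives a per-axis ratio of 1.5, and the geometric mean over both axes is again 1.5.

import torch

# Made-up patch coordinates (y0, x0, y1, x1) for one sample, as returned by sample_patch
sample_coord = torch.tensor([[10.0, 20.0, 442.0, 452.0]])
img_sample_sz = torch.tensor([288.0, 288.0])

sample_pos = 0.5 * (sample_coord[:, :2] + sample_coord[:, 2:] - 1)  # [[225.5, 235.5]]
sample_scales = ((sample_coord[:, 2:] - sample_coord[:, :2]) / img_sample_sz).prod(dim=1).sqrt()
print(sample_pos.tolist(), sample_scales.tolist())  # 432/288 = 1.5 on both axes -> scale 1.5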
def localize_target(self, scores, sample_pos, sample_scales):
    """Run the target localization.
    input:
        scores: confidence scores output by the classifier
        sample_pos: center position of the sample
        sample_scales: sampling scales
    """
    # score dimensions: (images_in_sequence, sequences, yH, yW)
    # or (images_in_sequence, sequences, filters, yH, yW)
    scores = scores.squeeze(1)  # remove the sequence dimension

    preprocess_method = self.params.get('score_preprocess', 'none')  # default: 'none'
    if preprocess_method == 'none':
        pass
    elif preprocess_method == 'exp':
        scores = scores.exp()
    elif preprocess_method == 'softmax':
        reg_val = getattr(self.net.classifier.filter_optimizer, 'softmax_reg', None)
        scores_view = scores.view(scores.shape[0], -1)
        scores_softmax = activation.softmax_reg(scores_view, dim=-1, reg=reg_val)
        scores = scores_softmax.view(scores.shape)
    else:
        raise Exception('Unknown score_preprocess in params.')

    score_filter_ksz = self.params.get('score_filter_ksz', 1)  # default: 1
    if score_filter_ksz > 1:
        assert score_filter_ksz % 2 == 1
        kernel = scores.new_ones(1, 1, score_filter_ksz, score_filter_ksz)
        scores = F.conv2d(scores.view(-1, 1, *scores.shape[-2:]), kernel,
                          padding=score_filter_ksz//2).view(scores.shape)

    if self.params.get('advanced_localization', False):
        # enter localize_advanced
        return self.localize_advanced(scores, sample_pos, sample_scales)

    # Get maximum
    score_sz = torch.Tensor(list(scores.shape[-2:]))
    score_center = (score_sz - 1)/2
    max_score, max_disp = dcf.max2d(scores)
    _, scale_ind = torch.max(max_score, dim=0)
    max_disp = max_disp[scale_ind,...].float().cpu().view(-1)
    target_disp = max_disp - score_center

    # Compute translation vector and scale change factor
    output_sz = score_sz - (self.kernel_size + 1) % 2
    translation_vec = target_disp * (self.img_support_sz / output_sz) * sample_scales[scale_ind]

    return translation_vec, scale_ind, scores, None
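To make the final step concrete, here is a small numeric check (all values made up) of how a peak displacement in the score map is mapped back to a pixel translation in the original frame. It assumes an even filter size, so that (kernel_size + 1) % 2 == 1:

import torch

score_sz = torch.tensor([19.0, 19.0])         # made-up 19x19 score map
kernel_size = torch.tensor([4.0, 4.0])        # even-sized filter
img_support_sz = torch.tensor([288.0, 288.0])
sample_scale = 1.5                            # made-up sampling scale

score_center = (score_sz - 1) / 2             # (9, 9)
max_disp = torch.tensor([12.0, 7.0])          # pretend peak location (row, col)
target_disp = max_disp - score_center         # (3, -2) cells from the center

output_sz = score_sz - (kernel_size + 1) % 2  # 19 - 1 = 18
translation_vec = target_disp * (img_support_sz / output_sz) * sample_scale
print(translation_vec.tolist())  # [72.0, -48.0] pixels in the original frame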